From: Jan Hubicka Date: Wed, 1 Oct 2025 15:58:00 +0000 (+0200) Subject: Add --param auto-profile-bbs X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=16980f46939cde58b22fae372252b449f0158216;p=thirdparty%2Fgcc.git Add --param auto-profile-bbs This patch adds a parameter that controls whether BB profile is read from auto-profile or we just scale guessed profile according to known counts. This is mostly useful as a first aid when auto-profile goes wrong. Once we fix enough bugs I think it may be removed but so far it is quite useful, so I decided to push it. gcc/ChangeLog: * auto-profile.cc (determine_scale): Break out from ... (afdo_adjust_guessed_profile): ... here. (scale_bb_profile): New function. (afdo_annotate_cfg): Use it. * params.opt (auto-profile-bbs): New parameter. * doc/invoke.texi (auto-profile-bbs): Document. --- diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc index 1da729e4666..a2be7556c3f 100644 --- a/gcc/auto-profile.cc +++ b/gcc/auto-profile.cc @@ -3514,6 +3514,109 @@ scale_bbs (const vec &bbs, sreal scale) } } +/* Determine scaling factor by taking robust average of SCALES + and taking into account limits. + MAX_COUNT is maximal guessed count to be scaled while MAX_COUNT_IN_FN + is maximal count in function determined by auto-fdo. 
*/ + +sreal +determine_scale (vec *scales, profile_count max_count, + profile_count max_count_in_fn) +{ + scales->qsort (cmp); + + uint64_t overall_weight = 0; + for (scale &e : *scales) + overall_weight += e.weight; + + uint64_t cummulated = 0, weight_sum = 0; + sreal scale_sum = 0; + for (scale &e : *scales) + { + uint64_t prev = cummulated; + cummulated += e.weight; + if (cummulated >= overall_weight / 4 + && prev <= 3 * overall_weight / 4) + { + scale_sum += e.scale * e.weight; + weight_sum += e.weight; + if (dump_file) + fprintf (dump_file, " accounting scale %.16f, weight %" PRId64 "\n", + e.scale.to_double (), e.weight); + } + else if (dump_file) + fprintf (dump_file, " ignoring scale %.16f, weight %" PRId64 "\n", + e.scale.to_double (), e.weight); + } + sreal scale = scale_sum / (sreal)weight_sum; + + /* Avoid scaled regions to have very large counts. + Otherwise they may dominate ipa-profile's histogram computing cutoff + of hot basic blocks. */ + if (max_count * scale > max_count_in_fn.guessed_local ().apply_scale (128, 1)) + { + if (dump_file) + { + fprintf (dump_file, "Scaling by %.16f produces max count ", + scale.to_double ()); + (max_count * scale).dump (dump_file); + fprintf (dump_file, " that exceeds max count in fn "); + max_count_in_fn.dump (dump_file); + fprintf (dump_file, "; capping\n"); + } + scale = max_count_in_fn.guessed_local ().to_sreal_scale (max_count); + } + return scale; +} + +/* Scale profile of the whole function to approximately match auto-profile. */ + +bool +scale_bb_profile () +{ + const function_instance *s + = afdo_source_profile->get_function_instance_by_decl + (current_function_decl); + + /* In the first pass only store non-zero counts. 
*/ + gcov_type head_count = s->head_count () * autofdo::afdo_count_scale; + hash_set zero_bbs; + auto_vec bbs (n_basic_blocks_for_fn (cfun)); + auto_vec scales; + basic_block bb; + profile_count max_count = profile_count::zero (); + profile_count max_count_in_fn = profile_count::zero (); + bbs.quick_push (ENTRY_BLOCK_PTR_FOR_FN (cfun)); + bbs.quick_push (EXIT_BLOCK_PTR_FOR_FN (cfun)); + if (head_count > 0) + { + profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count; + max_count = entry_count; + update_count_by_afdo_count (&entry_count, head_count); + max_count_in_fn = entry_count; + add_scale (&scales, entry_count, ENTRY_BLOCK_PTR_FOR_FN (cfun)->count); + } + FOR_EACH_BB_FN (bb, cfun) + { + profile_count cnt = bb->count; + bbs.safe_push (bb); + max_count = max_count.max (bb->count); + if (afdo_set_bb_count (bb, zero_bbs)) + { + std::swap (cnt, bb->count); + max_count_in_fn = max_count_in_fn.max (cnt); + add_scale (&scales, cnt, bb->count); + } + } + if (scales.length ()) + { + sreal scale = determine_scale (&scales, max_count, max_count_in_fn); + scale_bbs (bbs, scale); + return true; + } + return false; +} + /* In case given basic block was fully optimized out, AutoFDO will have no data about it. In this case try to preserve static profile. 
Identify connected components (in undirected form of CFG) which has @@ -3723,47 +3826,7 @@ afdo_adjust_guessed_profile (bb_set *annotated_bb) continue; } gcc_checking_assert (scales.length ()); - scales.qsort (cmp); - - uint64_t overall_weight = 0; - for (scale &e : scales) - overall_weight += e.weight; - - uint64_t cummulated = 0, weight_sum = 0; - sreal scale_sum = 0; - for (scale &e : scales) - { - uint64_t prev = cummulated; - cummulated += e.weight; - if (cummulated >= overall_weight / 4 - && prev <= 3 * overall_weight / 4) - { - scale_sum += e.scale * e.weight; - weight_sum += e.weight; - if (dump_file) - fprintf (dump_file, " accounting scale %.16f, weight %" PRId64 "\n", - e.scale.to_double (), e.weight); - } - else if (dump_file) - fprintf (dump_file, " ignoring scale %.16f, weight %" PRId64 "\n", - e.scale.to_double (), e.weight); - } - sreal scale = scale_sum / (sreal)weight_sum; - - /* Avoid scaled regions to have very large counts. - Otherwise they may dominate ipa-profile's histogram computing cutoff - of hot basic blocks. */ - if (max_count * scale > max_count_in_fn.guessed_local ()) - { - if (dump_file) - { - fprintf (dump_file, "Scaling by %.16f produces max count ", - scale.to_double ()); - (max_count * scale).dump (dump_file); - fprintf (dump_file, " that exceeds max count in fn; capping\n"); - } - scale = max_count_in_fn.guessed_local ().to_sreal_scale (max_count); - } + sreal scale = determine_scale (&scales, max_count, max_count_in_fn); scale_bbs (bbs, scale); } } @@ -3913,21 +3976,30 @@ afdo_annotate_cfg (void) s->dump (dump_file); fprintf (dump_file, "\n"); } - - /* In the first pass only store non-zero counts. 
*/ - gcov_type head_count = s->head_count () * autofdo::afdo_count_scale; - bool profile_found = head_count > 0; + bool profile_found = false; hash_set zero_bbs; - FOR_EACH_BB_FN (bb, cfun) + gcov_type head_count = s->head_count () * autofdo::afdo_count_scale; + + if (!param_auto_profile_bbs) { - if (afdo_set_bb_count (bb, zero_bbs)) + if (scale_bb_profile ()) + return; + } + else + { + /* In the first pass only store non-zero counts. */ + profile_found = head_count > 0; + FOR_EACH_BB_FN (bb, cfun) { - if (bb->count.quality () == AFDO) + if (afdo_set_bb_count (bb, zero_bbs)) { - gcc_assert (bb->count.nonzero_p ()); - profile_found = true; + if (bb->count.quality () == AFDO) + { + gcc_assert (bb->count.nonzero_p ()); + profile_found = true; + } + set_bb_annotated (bb, &annotated_bb); } - set_bb_annotated (bb, &annotated_bb); } } /* Exit without clobbering static profile if there was no diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 99607a09b89..492ca291432 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -16245,6 +16245,11 @@ In each case, the @var{value} is an integer. The following choices of @var{name} are recognized for all targets: @table @gcctabopt +@item auto-profile-bbs +If non-zero and used together with @option{-fauto-profile}, the auto-profile +will be used to determine basic block profile. If zero, then only function +level profile will be read. + @item phiopt-factor-max-stmts-live When factoring statements out of if/then/else, this is the max # of statements after the defining statement to be allow to extend the lifetime of a name diff --git a/gcc/params.opt b/gcc/params.opt index ae617094db6..1f6297de163 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -66,6 +66,10 @@ Enable asan stack protection. Common Joined UInteger Var(param_asan_use_after_return) Init(1) IntegerRange(0, 1) Param Optimization Enable asan detection of use-after-return bugs. 
+-param=auto-profile-bbs= +Common Joined UInteger Var(param_auto_profile_bbs) Init(1) IntegerRange(0, 1) Param Optimization +Build basic block profile using auto profile. + -param=cycle-accurate-model= Common Joined UInteger Var(param_cycle_accurate_model) Init(1) IntegerRange(0, 1) Param Optimization Whether the scheduling description is mostly a cycle-accurate model of the target processor and is likely to be spill aggressively to fill any pipeline bubbles.