The main difference between normal profile feedback and auto-fdo is that with profile
feedback every basic block with a non-zero profile has an incoming edge with a non-zero
profile. With auto-profile it is possible that none of the predecessors was sampled,
and the tool also has a cutoff parameter which makes it ignore small counts.
This becomes a problem when one tries to specialize code and scale the profile.
For example, if an inline function happens to have a hot loop with non-zero counts
but its entry count is zero and we want to inline it into a call with a non-zero
count X, we want to scale the body by X/0, which we currently turn into X/1.
This is a problem since I added logic to scale up the auto-profiles (to get
some extra bits of precision), so X is often a large value and multiplying by X
is not the right answer at all. The multiplication factor should be <= 1.
Iterating this a few times will make the counts hit the cap and we will lose any
useful info.
The original implementation avoided this by doing all inlining before AFDO readback,
but this is not possible with LTO (unless we move AFDO readback to WPA or add
support for context-sensitive profiles). I think I can get the scaling to work
reasonably well, and then we can look into possible benefits of context-sensitive
profiling, which can be implemented atop both AFDO and FDO.
This patch adds a cutoff value to profile_info which is initialized by profile
feedback to 1 and by auto-profile to the scale factor (since we do not know the
cutoff create_gcov used; llvm's tool streams it and we probably should too).
Then force_nonzero forces every value smaller than cutoff/2 + 1 to cutoff/2 + 1,
which should keep scaling factors in reasonable ranges.
gcc/ChangeLog:
* auto-profile.cc
(autofdo_source_profile::read): Scale cutoff.
(read_autofdo_file): Initialize cutoff.
* coverage.cc (read_counts_file): Initialize cutoff to 1.
* gcov-io.h (struct gcov_summary): Add cutoff field.
* ipa-inline.cc (inline_small_functions): max_count can be non-zero
also with auto_profile.
* lto-cgraph.cc (output_profile_summary): Write cutoff
and sum_max.
(input_profile_summary): Read cutoff and sum max.
(merge_profile_summaries): Initialize and scale global cutoffs
and sum max.
* profile-count.cc: Include profile.h
(profile_count::force_nonzero): move here from ...; use cutoff.
* profile-count.h: (profile_count::force_nonzero): ... here.
gcc/testsuite/ChangeLog:
* gcc.dg/tree-prof/clone-merge-1.c: Update scanned dump message.
afdo_count_scale
= MAX (((gcov_type)1 << (profile_count::n_bits / 2))
/ afdo_profile_info->sum_max, 1);
+ afdo_profile_info->cutoff *= afdo_count_scale;
afdo_hot_bb_threshod
= hot_frac
? afdo_profile_info->sum_max * afdo_count_scale / hot_frac
fprintf (dump_file, "Max count in profile %" PRIu64 "\n"
"Setting scale %" PRIu64 "\n"
"Scaled max count %" PRIu64 "\n"
+ "Cutoff %" PRIu64 "\n"
"Hot count threshold %" PRIu64 "\n\n",
(int64_t)afdo_profile_info->sum_max,
(int64_t)afdo_count_scale,
(int64_t)(afdo_profile_info->sum_max * afdo_count_scale),
+ (int64_t)afdo_profile_info->cutoff,
(int64_t)afdo_hot_bb_threshod);
afdo_profile_info->sum_max *= afdo_count_scale;
return true;
autofdo::afdo_profile_info = XNEW (gcov_summary);
autofdo::afdo_profile_info->runs = 1;
autofdo::afdo_profile_info->sum_max = 0;
+ autofdo::afdo_profile_info->cutoff = 1;
/* Read the profile from the profile file. */
autofdo::read_profile ();
gcov_profile_info = profile_info = XCNEW (gcov_summary);
profile_info->runs = gcov_read_unsigned ();
profile_info->sum_max = gcov_read_unsigned ();
+ profile_info->cutoff = 1;
}
else if (GCOV_TAG_IS_COUNTER (tag) && fn_ident)
{
{
gcov_unsigned_t runs; /* Number of program runs. */
gcov_type sum_max; /* Sum of individual run max values. */
+ gcov_type cutoff; /* Values smaller than this value are not
+ reliable (0 may mean non-zero).
+ For read profile cutoff is typically 1
+ however when we scale up or use auto-fdo
+ it may become bigger value. */
};
#if !defined(inhibit_libc)
gcc_assert (in_lto_p
|| !(max_count > 0)
+ || flag_auto_profile
|| (profile_info && flag_branch_probabilities));
while (!edge_heap.empty ())
{
if (profile_info)
{
- /* We do not output num and run_max, they are not used by
- GCC profile feedback and they are difficult to merge from multiple
- units. */
unsigned runs = (profile_info->runs);
streamer_write_uhwi_stream (ob->main_stream, runs);
+ streamer_write_gcov_count_stream (ob->main_stream,
+ profile_info->sum_max);
+ streamer_write_gcov_count_stream (ob->main_stream,
+ profile_info->cutoff);
/* IPA-profile computes hot bb threshold based on cumulated
whole program profile. We need to stream it down to ltrans. */
if (runs)
{
file_data->profile_info.runs = runs;
+ file_data->profile_info.sum_max = streamer_read_gcov_count (ib);
+ file_data->profile_info.cutoff = streamer_read_gcov_count (ib);
/* IPA-profile computes hot bb threshold based on cumulated
whole program profile. We need to stream it down to ltrans. */
profile_info = XCNEW (gcov_summary);
profile_info->runs = max_runs;
+ profile_info->sum_max = 0;
+ profile_info->cutoff = 0;
/* If merging already happent at WPA time, we are done. */
if (flag_ltrans)
scale = RDIV (node->count_materialization_scale * max_runs,
node->lto_file_data->profile_info.runs);
+ gcov_type sum_max = RDIV (node->lto_file_data->profile_info.sum_max * max_runs,
+ node->lto_file_data->profile_info.runs);
+ gcov_type cutoff = RDIV (node->lto_file_data->profile_info.cutoff * max_runs,
+ node->lto_file_data->profile_info.runs);
+ if (sum_max > profile_info->sum_max)
+ profile_info->sum_max = sum_max;
+ if (cutoff > profile_info->cutoff)
+ profile_info->cutoff = cutoff;
node->count_materialization_scale = scale;
if (scale < 0)
fatal_error (input_location, "Profile information in %s corrupted",
#include "cgraph.h"
#include "wide-int.h"
#include "sreal.h"
+#include "profile.h"
/* Names from profile_quality enum values. */
{
return *this * num;
}
+
+/* Return a copy of this counter that is forcibly nonzero.
+   An uninitialized counter is returned unchanged.  Otherwise, if the value
+   is below the minimum computed from profile_info->cutoff (or below 1 when
+   profile_info is null), it is raised to that minimum and the quality of
+   the result becomes MIN (m_quality, ADJUSTED).  */
+profile_count
+profile_count::force_nonzero () const
+{
+ if (!initialized_p ())
+ return *this;
+ profile_count ret = *this;
+ /* Generally values are forced non-zero to handle an inconsistent profile
+ where count 0 needs to be scaled up to non-zero.
+
+ Use the cutoff value here to avoid the situation where the profile has a
+ large cutoff and we perform count = count * num / den where num is
+ non-zero and den is 0. If the profile was scaled by a large factor,
+ forcing the value to 1 would lead to a large scale factor.
+
+ The minimum used is cutoff / 2 + 1, so it is at least 1 even when the
+ cutoff is 0 or 1.
+
+ NOTE(review): cutoff is declared as gcov_type in struct gcov_summary
+ while SMALL is gcov_unsigned_t; confirm that this conversion cannot
+ truncate a cutoff that has been scaled up. */
+ gcov_unsigned_t small = profile_info ? profile_info->cutoff / 2 + 1
+ : 1;
+ if (ret.m_val < small)
+ {
+ ret.m_val = small;
+ ret.m_quality = MIN (m_quality, ADJUSTED);
+ }
+ return ret;
+}
}
/* Make counter forcibly nonzero. */
- profile_count force_nonzero () const
- {
- if (!initialized_p ())
- return *this;
- profile_count ret = *this;
- if (ret.m_val == 0)
- {
- ret.m_val = 1;
- ret.m_quality = MIN (m_quality, ADJUSTED);
- }
- return ret;
- }
+ profile_count force_nonzero () const;
profile_count max (profile_count other) const
{
}
/* We will have profiles for test2 and test2.constprop.0 that will have to be
merged, */
-/* { dg-final-use-autofdo { scan-ipa-dump "Merging duplicate symbol test2" "afdo_offline"} } */
+/* { dg-final-use-autofdo { scan-ipa-dump "Merging duplicate instance: test2" "afdo_offline"} } */