]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
perf record --off-cpu: Add --off-cpu-thresh option
authorHoward Chu <howardchu95@gmail.com>
Thu, 1 May 2025 02:28:07 +0000 (19:28 -0700)
committerArnaldo Carvalho de Melo <acme@redhat.com>
Tue, 6 May 2025 00:51:54 +0000 (21:51 -0300)
Specify the threshold for dumping offcpu samples with --off-cpu-thresh,
the unit is milliseconds. Default value is 500ms.

Example:

  perf record --off-cpu --off-cpu-thresh 824

The example above collects direct off-cpu samples where the off-cpu time
is longer than 824ms.

Committer testing:

After commenting out the end off-cpu dump to have just the ones that are
added right after the task is scheduled back, and using a threshold of
1000ms, we see some periods (the 5th column, just before "offcpu-time"
in the 'perf script' output) that are over 1,000,000,000 nanoseconds:

  root@number:~# perf record --off-cpu --off-cpu-thresh 10000
  ^C[ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 3.902 MB perf.data (34335 samples) ]
  root@number:~# perf script
<SNIP>
  Isolated Web Co   59932 [028] 63839.594437: 1000049427 offcpu-time:
             7fe63c7976c2 __syscall_cancel_arch_end+0x0 (/usr/lib64/libc.so.6)
             7fe63c78c04c __futex_abstimed_wait_common+0x7c (/usr/lib64/libc.so.6)
             7fe63c78e928 pthread_cond_timedwait@@GLIBC_2.3.2+0x178 (/usr/lib64/libc.so.6)
             5599974a9fe7 mozilla::detail::ConditionVariableImpl::wait_for(mozilla::detail::MutexImpl&, mozilla::BaseTimeDuration<mozilla::TimeDurationValueCalculator> const&)+0xe7 (/usr/lib64/fir>
                100000000 [unknown] ([unknown])

          swapper       0 [025] 63839.594459:     195724    cycles:P:  ffffffffac328270 read_tsc+0x0 ([kernel.kallsyms])
  Isolated Web Co   59932 [010] 63839.594466: 1000055278 offcpu-time:
             7fe63c7976c2 __syscall_cancel_arch_end+0x0 (/usr/lib64/libc.so.6)
             7fe63c78ba24 __syscall_cancel+0x14 (/usr/lib64/libc.so.6)
             7fe63c804c4e __poll+0x1e (/usr/lib64/libc.so.6)
             7fe633b0d1b8 PollWrapper(_GPollFD*, unsigned int, int) [clone .lto_priv.0]+0xf8 (/usr/lib64/firefox/libxul.so)
                10000002c [unknown] ([unknown])

          swapper       0 [027] 63839.594475:     134433    cycles:P:  ffffffffad4c45d9 irqentry_enter+0x19 ([kernel.kallsyms])
          swapper       0 [028] 63839.594499:     215838    cycles:P:  ffffffffac39199a switch_mm_irqs_off+0x10a ([kernel.kallsyms])
  MediaPD~oder #1 1407676 [027] 63839.594514:     134433    cycles:P:      7f982ef5e69f dct_IV(int*, int, int*)+0x24f (/usr/lib64/libfdk-aac.so.2.0.0)
          swapper       0 [024] 63839.594524:     267411    cycles:P:  ffffffffad4c6ee6 poll_idle+0x56 ([kernel.kallsyms])
  MediaSu~sor #75 1093827 [026] 63839.594555:     332652    cycles:P:      55be753ad030 moz_xmalloc+0x200 (/usr/lib64/firefox/firefox)
          swapper       0 [027] 63839.594616:     160548    cycles:P:  ffffffffad144840 menu_select+0x570 ([kernel.kallsyms])
  Isolated Web Co   14019 [027] 63839.595120: 1000050178 offcpu-time:
             7fc9537cc6c2 __syscall_cancel_arch_end+0x0 (/usr/lib64/libc.so.6)
             7fc9537c104c __futex_abstimed_wait_common+0x7c (/usr/lib64/libc.so.6)
             7fc9537c3928 pthread_cond_timedwait@@GLIBC_2.3.2+0x178 (/usr/lib64/libc.so.6)
             7fc95372a3c8 pt_TimedWait+0xb8 (/usr/lib64/libnspr4.so)
             7fc95372a8d8 PR_WaitCondVar+0x68 (/usr/lib64/libnspr4.so)
             7fc94afb1f7c WatchdogMain(void*)+0xac (/usr/lib64/firefox/libxul.so)
             7fc947498660 [unknown] ([unknown])
             7fc9535fce88 [unknown] ([unknown])
             7fc94b620e60 WatchdogManager::~WatchdogManager()+0x0 (/usr/lib64/firefox/libxul.so)
          fff8548387f8b48 [unknown] ([unknown])

          swapper       0 [003] 63839.595712:     212948    cycles:P:  ffffffffacd5b865 acpi_os_read_port+0x55 ([kernel.kallsyms])
<SNIP>

Suggested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Suggested-by: Ian Rogers <irogers@google.com>
Suggested-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Howard Chu <howardchu95@gmail.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Tested-by: Gautam Menghani <gautam@linux.ibm.com>
Tested-by: Ian Rogers <irogers@google.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241108204137.2444151-2-howardchu95@gmail.com
Link: https://lore.kernel.org/r/20250501022809.449767-10-howardchu95@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
tools/perf/Documentation/perf-record.txt
tools/perf/builtin-record.c
tools/perf/util/bpf_off_cpu.c
tools/perf/util/bpf_skel/off_cpu.bpf.c
tools/perf/util/off_cpu.h
tools/perf/util/record.h

index c59f1e79f2b4a6f84f3f02c58d0d086277b41150..612612fa2d8041b94860035ed9cb01557a20b6b7 100644 (file)
@@ -842,6 +842,15 @@ filtered through the mask provided by -C option.
        only, as of now.  So the applications built without the frame
        pointer might see bogus addresses.
 
+       off-cpu profiling consists of two types of samples: direct samples, which
+       share the same behavior as regular samples, and the accumulated
+       samples, stored in a BPF stack trace map, presented after all the regular
+       samples.
+
+--off-cpu-thresh::
+       Once a task's off-cpu time reaches this threshold (in milliseconds), it
+       generates a direct off-cpu sample. The default is 500ms.
+
 --setup-filter=<action>::
        Prepare BPF filter to be used by regular users.  The action should be
        either "pin" or "unpin".  The filter can be used after it's pinned.
index 4194ea5ac72999a14b94a861184a7a5a0ccda6ad..8898357325cf413217325878b45954b4eb372628 100644 (file)
@@ -3162,6 +3162,28 @@ out_free:
        return ret;
 }
 
+/* Parse the --off-cpu-thresh argument (ms) into opts->off_cpu_thresh_ns. */
+static int record__parse_off_cpu_thresh(const struct option *opt,
+                                       const char *str,
+                                       int unset __maybe_unused)
+{
+       struct record_opts *opts = opt->value;
+       char *endptr;
+       u64 off_cpu_thresh_ms;
+
+       /* strtoull() silently negates "-N" into a huge value, so refuse any sign */
+       if (!str || strchr(str, '-'))
+               return -EINVAL;
+       errno = 0;
+       off_cpu_thresh_ms = strtoull(str, &endptr, 10);
+       /* no digits, trailing junk, out of range, or the ms -> ns product would wrap */
+       if (endptr == str || *endptr || errno == ERANGE ||
+           off_cpu_thresh_ms > ~0ULL / NSEC_PER_MSEC)
+               return -EINVAL;
+
+       opts->off_cpu_thresh_ns = off_cpu_thresh_ms * NSEC_PER_MSEC;
+       return 0;
+}
+
 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
 {
 }
@@ -3355,6 +3377,7 @@ static struct record record = {
                .ctl_fd              = -1,
                .ctl_fd_ack          = -1,
                .synth               = PERF_SYNTH_ALL,
+               .off_cpu_thresh_ns   = OFFCPU_THRESH,
        },
 };
 
@@ -3582,6 +3605,9 @@ static struct option __record_options[] = {
        OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
        OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin",
                   "BPF filter action"),
+       OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms",
+                    "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)",
+                    record__parse_off_cpu_thresh),
        OPT_END()
 };
 
index c7fde66bb8f951294a85b1eb822eab0fb236cf7f..c367fefe6ecbb5807f34598b02f5426723040f4c 100644 (file)
@@ -14,6 +14,7 @@
 #include "util/strlist.h"
 #include <bpf/bpf.h>
 #include <internal/xyarray.h>
+#include <linux/time64.h>
 
 #include "bpf_skel/off_cpu.skel.h"
 
@@ -292,6 +293,8 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target,
                }
        }
 
+       skel->bss->offcpu_thresh_ns = opts->off_cpu_thresh_ns;
+
        err = off_cpu_bpf__attach(skel);
        if (err) {
                pr_err("Failed to attach off-cpu BPF skeleton\n");
index 14cd8881f8bb23c6d627f5f285fb6c3f91f3d6de..72763bb8d1de57fb788fd85e1f0be452de6d5ceb 100644 (file)
@@ -124,7 +124,7 @@ const volatile bool uses_cgroup_v1 = false;
 
 int perf_subsys_id = -1;
 
-__u64 offcpu_thresh_ns = 500000000ull;
+__u64 offcpu_thresh_ns;
 
 /*
  * Old kernel used to call it task_struct->state and now it's '__state'.
index 2a4b7f9b2c4cbd7c224f9a5cee2cafb6ae8fae90..64bf763ddf5075e9cc8c71437d3564d6bc2544f8 100644 (file)
@@ -16,6 +16,7 @@ struct record_opts;
                              PERF_SAMPLE_PERIOD | PERF_SAMPLE_RAW | \
                              PERF_SAMPLE_CGROUP)
 
+#define OFFCPU_THRESH 500000000ULL
 
 #ifdef HAVE_BPF_SKEL
 int off_cpu_prepare(struct evlist *evlist, struct target *target,
index f1956c4db3195070f4b6047e486e9e75c0b82209..ea3a6c4657eefb743dc3d54b0b791ea39117cc10 100644 (file)
@@ -80,6 +80,7 @@ struct record_opts {
        int           synth;
        int           threads_spec;
        const char    *threads_user_spec;
+       u64           off_cpu_thresh_ns;
 };
 
 extern const char * const *record_usage;