]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
perf sched stats: Add record and rawdump support
authorSwapnil Sapkal <swapnil.sapkal@amd.com>
Mon, 19 Jan 2026 17:58:25 +0000 (17:58 +0000)
committerArnaldo Carvalho de Melo <acme@redhat.com>
Thu, 22 Jan 2026 15:29:28 +0000 (12:29 -0300)
Define new, perf tool only, sample types and their layouts. Add logic
to parse /proc/schedstat, convert it to perf sample format and save
samples to perf.data file with `perf sched stats record` command.

Also add logic to read perf.data file, interpret schedstat samples and
print rawdump of samples with `perf script -D`.

Note that the /proc/schedstat output format is versioned. This patch
supports v15, but older or newer versions can be added easily.

Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: James Clark <james.clark@linaro.org>
Acked-by: Ian Rogers <irogers@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Ben Gainey <ben.gainey@arm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: David Vernet <void@manifault.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Graham Woodward <graham.woodward@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Yang Jihong <yangjihong@bytedance.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Cc: Zhongqiu Han <quic_zhonhan@quicinc.com>
[ PRIu64 needs uint64_t, not 'unsigned long' to work on both 32-bit and 64-bit ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
13 files changed:
tools/lib/perf/Documentation/libperf.txt
tools/lib/perf/Makefile
tools/lib/perf/include/perf/event.h
tools/lib/perf/include/perf/schedstat-v15.h [new file with mode: 0644]
tools/perf/builtin-inject.c
tools/perf/builtin-sched.c
tools/perf/util/event.c
tools/perf/util/event.h
tools/perf/util/session.c
tools/perf/util/synthetic-events.c
tools/perf/util/synthetic-events.h
tools/perf/util/tool.c
tools/perf/util/tool.h

index 4072bc9b7670d2131585b8cb54101bfc487d0807..576ecc5fc31253c01aaaecd2545ef0f3c5881ee1 100644 (file)
@@ -211,6 +211,8 @@ SYNOPSIS
   struct perf_record_header_feature;
   struct perf_record_compressed;
   struct perf_record_compressed2;
+  struct perf_record_schedstat_cpu;
+  struct perf_record_schedstat_domain;
 --
 
 DESCRIPTION
index 7fbb50b74c00b3b99ac538821124e04bfbc6da6a..9fa28e512ca8e8b24e7a1ca3364f6ee11e6af585 100644 (file)
@@ -179,6 +179,7 @@ install_lib: libs
                cp -fpR $(LIBPERF_ALL) $(DESTDIR)$(libdir_SQ)
 
 HDRS := bpf_perf.h core.h cpumap.h threadmap.h evlist.h evsel.h event.h mmap.h
+HDRS += schedstat-v15.h
 INTERNAL_HDRS := cpumap.h evlist.h evsel.h lib.h mmap.h rc_check.h threadmap.h xyarray.h
 
 INSTALL_HDRS_PFX := $(DESTDIR)$(prefix)/include/perf
index 43a8cb04994fa033f9ae8d74753a984ffb15ea9e..ce04fed7cefc8a62e9ac97d47e7abff746eb412d 100644 (file)
@@ -496,6 +496,43 @@ struct perf_record_bpf_metadata {
        struct perf_record_bpf_metadata_entry entries[];
 };
 
+/*
+ * Per-cpu counters of /proc/schedstat version 15. The members are generated
+ * from the CPU_FIELD() entries of schedstat-v15.h: every entry expands to one
+ * "_type _name" member, in the order in which the entries are listed.
+ */
+struct perf_record_schedstat_cpu_v15 {
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)                _type _name
+#include "schedstat-v15.h"
+#undef CPU_FIELD
+};
+
+/*
+ * PERF_RECORD_SCHEDSTAT_CPU: the counters of one "cpu<N>" line of
+ * /proc/schedstat.
+ *
+ * @timestamp: value taken from the "timestamp" line of /proc/schedstat
+ * @cpu:       cpu number parsed from the "cpu<N>" line
+ * @version:   /proc/schedstat version; selects the valid member of the union
+ */
+struct perf_record_schedstat_cpu {
+       struct perf_event_header header;
+       __u64                    timestamp;
+       __u32                    cpu;
+       __u16                    version;
+       /* Padding */
+       char                     __pad[2];
+       union {
+               struct perf_record_schedstat_cpu_v15 v15;
+       };
+};
+
+/*
+ * Per-domain counters of /proc/schedstat version 15. The members are
+ * generated from the DOMAIN_FIELD() entries of schedstat-v15.h: every entry
+ * expands to one "_type _name" member, in the order in which the entries are
+ * listed.
+ */
+struct perf_record_schedstat_domain_v15 {
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)          _type _name
+#include "schedstat-v15.h"
+#undef DOMAIN_FIELD
+};
+
+#define DOMAIN_NAME_LEN                16
+
+/*
+ * PERF_RECORD_SCHEDSTAT_DOMAIN: the counters of one "domain<N>" line of
+ * /proc/schedstat.
+ *
+ * @timestamp: value taken from the "timestamp" line of /proc/schedstat
+ * @cpu:       cpu number of the most recent "cpu<N>" line read before this
+ *             domain line
+ * @version:   /proc/schedstat version; selects the valid member of the union
+ * @domain:    domain number parsed from the "domain<N>" line
+ */
+struct perf_record_schedstat_domain {
+       struct perf_event_header header;
+       __u64                    timestamp;
+       __u32                    cpu;
+       __u16                    version;
+       __u16                    domain;
+       union {
+               struct perf_record_schedstat_domain_v15 v15;
+       };
+};
+
 enum perf_user_event_type { /* above any possible kernel type */
        PERF_RECORD_USER_TYPE_START             = 64,
        PERF_RECORD_HEADER_ATTR                 = 64,
@@ -519,6 +556,8 @@ enum perf_user_event_type { /* above any possible kernel type */
        PERF_RECORD_FINISHED_INIT               = 82,
        PERF_RECORD_COMPRESSED2                 = 83,
        PERF_RECORD_BPF_METADATA                = 84,
+       PERF_RECORD_SCHEDSTAT_CPU               = 85,
+       PERF_RECORD_SCHEDSTAT_DOMAIN            = 86,
        PERF_RECORD_HEADER_MAX
 };
 
@@ -562,6 +601,8 @@ union perf_event {
        struct perf_record_compressed           pack;
        struct perf_record_compressed2          pack2;
        struct perf_record_bpf_metadata         bpf_metadata;
+       struct perf_record_schedstat_cpu        schedstat_cpu;
+       struct perf_record_schedstat_domain     schedstat_domain;
 };
 
 #endif /* __LIBPERF_EVENT_H */
diff --git a/tools/lib/perf/include/perf/schedstat-v15.h b/tools/lib/perf/include/perf/schedstat-v15.h
new file mode 100644 (file)
index 0000000..639458d
--- /dev/null
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * X-macro list of the per-cpu fields of /proc/schedstat version 15.
+ *
+ * The including file defines
+ *   CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)
+ * before including this header and #undefs it afterwards. The entries are
+ * expanded in the order listed here, which is therefore both the member order
+ * of the generated struct and the order in which the values of a "cpu<N>"
+ * line are parsed.
+ */
+#ifdef CPU_FIELD
+CPU_FIELD(__u32, yld_count, "sched_yield() count",
+         "%11u", false, yld_count, v15);
+CPU_FIELD(__u32, array_exp, "Legacy counter can be ignored",
+         "%11u", false, array_exp, v15);
+CPU_FIELD(__u32, sched_count, "schedule() called",
+         "%11u", false, sched_count, v15);
+CPU_FIELD(__u32, sched_goidle, "schedule() left the processor idle",
+         "%11u", true, sched_count, v15);
+CPU_FIELD(__u32, ttwu_count, "try_to_wake_up() was called",
+         "%11u", false, ttwu_count, v15);
+CPU_FIELD(__u32, ttwu_local, "try_to_wake_up() was called to wake up the local cpu",
+         "%11u", true, ttwu_count, v15);
+CPU_FIELD(__u64, rq_cpu_time, "total runtime by tasks on this processor (in jiffies)",
+         "%11llu", false, rq_cpu_time, v15);
+CPU_FIELD(__u64, run_delay, "total waittime by tasks on this processor (in jiffies)",
+         "%11llu", true, rq_cpu_time, v15);
+CPU_FIELD(__u64, pcount, "total timeslices run on this cpu",
+         "%11llu", false, pcount, v15);
+#endif
+
+/*
+ * X-macro list of the per-domain fields of /proc/schedstat version 15.
+ *
+ * The including file defines
+ *   DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)
+ * before including this header and #undefs it afterwards. The entries are
+ * expanded in the order listed here, which is therefore both the member order
+ * of the generated struct and the order in which the values of a "domain<N>"
+ * line are parsed.
+ *
+ * DOMAIN_CATEGORY(), DERIVED_CNT_FIELD() and DERIVED_AVG_FIELD() are optional:
+ * their entries are only emitted when the including file defines them.
+ */
+#ifdef DOMAIN_FIELD
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category idle> ");
+#endif
+DOMAIN_FIELD(__u32, idle_lb_count,
+            "load_balance() count on cpu idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, idle_lb_balanced,
+            "load_balance() found balanced on cpu idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, idle_lb_failed,
+            "load_balance() move task failed on cpu idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, idle_lb_imbalance,
+            "imbalance sum on cpu idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, idle_lb_gained,
+            "pull_task() count on cpu idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, idle_lb_hot_gained,
+            "pull_task() when target task was cache-hot on cpu idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, idle_lb_nobusyq,
+            "load_balance() failed to find busier queue on cpu idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, idle_lb_nobusyg,
+            "load_balance() failed to find busier group on cpu idle", "%11u", true, v15);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(idle_lb_success_count, "load_balance() success count on cpu idle", "%11u",
+                 idle_lb_count, idle_lb_balanced, idle_lb_failed, v15);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(idle_lb_avg_pulled,
+                 "avg task pulled per successful lb attempt (cpu idle)", "%11.2Lf",
+                 idle_lb_count, idle_lb_balanced, idle_lb_failed, idle_lb_gained, v15);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category busy> ");
+#endif
+DOMAIN_FIELD(__u32, busy_lb_count,
+            "load_balance() count on cpu busy", "%11u", true, v15);
+DOMAIN_FIELD(__u32, busy_lb_balanced,
+            "load_balance() found balanced on cpu busy", "%11u", true, v15);
+DOMAIN_FIELD(__u32, busy_lb_failed,
+            "load_balance() move task failed on cpu busy", "%11u", true, v15);
+DOMAIN_FIELD(__u32, busy_lb_imbalance,
+            "imbalance sum on cpu busy", "%11u", false, v15);
+DOMAIN_FIELD(__u32, busy_lb_gained,
+            "pull_task() count on cpu busy", "%11u", false, v15);
+DOMAIN_FIELD(__u32, busy_lb_hot_gained,
+            "pull_task() when target task was cache-hot on cpu busy", "%11u", false, v15);
+DOMAIN_FIELD(__u32, busy_lb_nobusyq,
+            "load_balance() failed to find busier queue on cpu busy", "%11u", true, v15);
+DOMAIN_FIELD(__u32, busy_lb_nobusyg,
+            "load_balance() failed to find busier group on cpu busy", "%11u", true, v15);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(busy_lb_success_count, "load_balance() success count on cpu busy", "%11u",
+                 busy_lb_count, busy_lb_balanced, busy_lb_failed, v15);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(busy_lb_avg_pulled,
+                 "avg task pulled per successful lb attempt (cpu busy)", "%11.2Lf",
+                 busy_lb_count, busy_lb_balanced, busy_lb_failed, busy_lb_gained, v15);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category newidle> ");
+#endif
+DOMAIN_FIELD(__u32, newidle_lb_count,
+            "load_balance() count on cpu newly idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, newidle_lb_balanced,
+            "load_balance() found balanced on cpu newly idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, newidle_lb_failed,
+            "load_balance() move task failed on cpu newly idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, newidle_lb_imbalance,
+            "imbalance sum on cpu newly idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, newidle_lb_gained,
+            "pull_task() count on cpu newly idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, newidle_lb_hot_gained,
+            "pull_task() when target task was cache-hot on cpu newly idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, newidle_lb_nobusyq,
+            "load_balance() failed to find busier queue on cpu newly idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, newidle_lb_nobusyg,
+            "load_balance() failed to find busier group on cpu newly idle", "%11u", true, v15);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(newidle_lb_success_count,
+                 "load_balance() success count on cpu newly idle", "%11u",
+                 newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, v15);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(newidle_lb_avg_pulled,
+                 "avg task pulled per successful lb attempt (cpu newly idle)", "%11.2Lf",
+                 newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, newidle_lb_gained, v15);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category active_load_balance()> ");
+#endif
+DOMAIN_FIELD(__u32, alb_count,
+            "active_load_balance() count", "%11u", false, v15);
+DOMAIN_FIELD(__u32, alb_failed,
+            "active_load_balance() move task failed", "%11u", false, v15);
+DOMAIN_FIELD(__u32, alb_pushed,
+            "active_load_balance() successfully moved a task", "%11u", false, v15);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category sched_balance_exec()> ");
+#endif
+DOMAIN_FIELD(__u32, sbe_count,
+            "sbe_count is not used", "%11u", false, v15);
+DOMAIN_FIELD(__u32, sbe_balanced,
+            "sbe_balanced is not used", "%11u", false, v15);
+DOMAIN_FIELD(__u32, sbe_pushed,
+            "sbe_pushed is not used", "%11u", false, v15);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category sched_balance_fork()> ");
+#endif
+DOMAIN_FIELD(__u32, sbf_count,
+            "sbf_count is not used", "%11u", false, v15);
+DOMAIN_FIELD(__u32, sbf_balanced,
+            "sbf_balanced is not used", "%11u", false, v15);
+DOMAIN_FIELD(__u32, sbf_pushed,
+            "sbf_pushed is not used", "%11u", false, v15);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Wakeup Info> ");
+#endif
+DOMAIN_FIELD(__u32, ttwu_wake_remote,
+            "try_to_wake_up() awoke a task that last ran on a diff cpu", "%11u", false, v15);
+DOMAIN_FIELD(__u32, ttwu_move_affine,
+            "try_to_wake_up() moved task because cache-cold on own cpu", "%11u", false, v15);
+DOMAIN_FIELD(__u32, ttwu_move_balance,
+            "try_to_wake_up() started passive balancing", "%11u", false, v15);
+#endif /* DOMAIN_FIELD */
index c89ac85ec112cd987047f97ef50118b9e1c82012..2c9456614cde21d1e357073281e0ec96ee4acf4f 100644 (file)
@@ -2657,6 +2657,8 @@ int cmd_inject(int argc, const char **argv)
        inject.tool.compressed          = perf_event__repipe_op4_synth;
        inject.tool.auxtrace            = perf_event__repipe_auxtrace;
        inject.tool.bpf_metadata        = perf_event__repipe_op2_synth;
+       inject.tool.schedstat_cpu       = perf_event__repipe_op2_synth;
+       inject.tool.schedstat_domain    = perf_event__repipe_op2_synth;
        inject.tool.dont_split_sample_group = true;
        inject.tool.merge_deferred_callchains = false;
        inject.session = __perf_session__new(&data, &inject.tool,
index eca3b1c58c4bb2aa2e6f923d0ab94030eea98fa7..ee3b4e42156e27b9eaee3d2961415a55db4fe2a4 100644 (file)
@@ -28,6 +28,8 @@
 #include "util/debug.h"
 #include "util/event.h"
 #include "util/util.h"
+#include "util/synthetic-events.h"
+#include "util/target.h"
 
 #include <linux/kernel.h>
 #include <linux/log2.h>
@@ -55,6 +57,7 @@
 #define MAX_PRIO               140
 
 static const char *cpu_list;
+static struct perf_cpu_map *user_requested_cpus;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 
 struct sched_atom;
@@ -236,6 +239,9 @@ struct perf_sched {
        volatile bool   thread_funcs_exit;
        const char      *prio_str;
        DECLARE_BITMAP(prio_bitmap, MAX_PRIO);
+
+       struct perf_session *session;
+       struct perf_data *data;
 };
 
 /* per thread run time data */
@@ -3734,6 +3740,195 @@ static void setup_sorting(struct perf_sched *sched, const struct option *options
        sort_dimension__add("pid", &sched->cmp_pid);
 }
 
+/*
+ * Synthesis callback: append one schedstat record to the output file and add
+ * its size to the data size kept in the session header.
+ *
+ * Returns 0 on success and -1 if the record could not be written.
+ */
+static int process_synthesized_schedstat_event(const struct perf_tool *tool,
+                                              union perf_event *event,
+                                              struct perf_sample *sample __maybe_unused,
+                                              struct machine *machine __maybe_unused)
+{
+       struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
+       size_t nbytes = event->header.size;
+
+       if (perf_data__write(sched->data, event, nbytes) > 0) {
+               sched->session->header.data_size += nbytes;
+               return 0;
+       }
+
+       pr_err("failed to write perf data, error: %m\n");
+       return -1;
+}
+
+/*
+ * Deliberately empty. perf_sched__schedstat_record() installs it for SIGINT,
+ * SIGCHLD and SIGTERM and then blocks in pause(); with a handler installed,
+ * any of those signals ends the wait so that the final snapshot can be taken.
+ */
+static void sighandler(int sig __maybe_unused)
+{
+}
+
+/*
+ * Turn on schedstat accounting through <procfs>/sys/kernel/sched_schedstats.
+ *
+ * If the current value is '0', '1' is written and *reset is set to 1 so that
+ * the caller knows it has to switch accounting off again afterwards. In every
+ * other case the file and *reset are left untouched.
+ *
+ * Returns 0 on success and -1 if the file cannot be opened or updated.
+ */
+static int enable_sched_schedstats(int *reset)
+{
+       char path[PATH_MAX];
+       int failed;
+       FILE *fp;
+       int ch;         /* int, not char: getc() can return EOF */
+
+       snprintf(path, PATH_MAX, "%s/sys/kernel/sched_schedstats", procfs__mountpoint());
+       fp = fopen(path, "w+");
+       if (!fp) {
+               pr_err("Failed to open %s\n", path);
+               return -1;
+       }
+
+       ch = getc(fp);
+       if (ch != '0') {
+               /* Nothing to change, but the stream still has to be closed. */
+               fclose(fp);
+               return 0;
+       }
+
+       /* A reposition is required between reading and writing the stream. */
+       rewind(fp);
+       failed = putc('1', fp) == EOF;
+       /* The buffered write only reaches the file when the stream is closed. */
+       failed |= fclose(fp) == EOF;
+       if (failed) {
+               pr_err("Failed to write %s\n", path);
+               return -1;
+       }
+
+       *reset = 1;
+       return 0;
+}
+
+/*
+ * Write '0' to <procfs>/sys/kernel/sched_schedstats.
+ *
+ * Returns 0, or -1 if the file cannot be opened for writing.
+ */
+static int disable_sched_schedstat(void)
+{
+       char sysctl_path[PATH_MAX];
+       FILE *out;
+
+       snprintf(sysctl_path, sizeof(sysctl_path), "%s/sys/kernel/sched_schedstats",
+                procfs__mountpoint());
+
+       out = fopen(sysctl_path, "w");
+       if (out) {
+               putc('0', out);
+               fclose(out);
+               return 0;
+       }
+
+       pr_err("Failed to open %s\n", sysctl_path);
+       return -1;
+}
+
+/*
+ * perf.data or any other output file name used by stats subcommand (only).
+ * Set by the -o/--output option; only used inside this file, hence static.
+ */
+static const char *output_name;
+
+/*
+ * Implementation of `perf sched stats record`.
+ *
+ * Writes the perf.data header, synthesizes a first /proc/schedstat snapshot,
+ * enables schedstat accounting, optionally starts the workload given in argv,
+ * waits for a signal, switches accounting off again if it was switched on
+ * here, synthesizes a second snapshot and finally writes the header a second
+ * time.
+ *
+ * Returns 0 on success and a non-zero error value otherwise.
+ */
+static int perf_sched__schedstat_record(struct perf_sched *sched,
+                                       int argc, const char **argv)
+{
+       struct perf_session *session;
+       struct target target = {};
+       struct evlist *evlist;
+       int reset = 0;
+       int err = 0;
+       int fd;
+       struct perf_data data = {
+               .path  = output_name,
+               .mode  = PERF_DATA_MODE_WRITE,
+       };
+
+       /* No-op handlers, so that these signals make pause() below return. */
+       signal(SIGINT, sighandler);
+       signal(SIGCHLD, sighandler);
+       signal(SIGTERM, sighandler);
+
+       evlist = evlist__new();
+       if (!evlist)
+               return -ENOMEM;
+
+       /*
+        * NOTE(review): the session created here is not deleted anywhere in
+        * this function - confirm whether that is intentional.
+        */
+       session = perf_session__new(&data, &sched->tool);
+       if (IS_ERR(session)) {
+               pr_err("Perf session creation failed.\n");
+               evlist__delete(evlist);
+               return PTR_ERR(session);
+       }
+
+       session->evlist = evlist;
+
+       /* Read back by process_synthesized_schedstat_event(). */
+       sched->session = session;
+       sched->data = &data;
+
+       fd = perf_data__fd(&data);
+
+       /*
+        * Capture all important metadata about the system. Although they are
+        * not used by `perf sched stats` tool directly, they provide useful
+        * information about profiled environment.
+        */
+       perf_header__set_feat(&session->header, HEADER_HOSTNAME);
+       perf_header__set_feat(&session->header, HEADER_OSRELEASE);
+       perf_header__set_feat(&session->header, HEADER_VERSION);
+       perf_header__set_feat(&session->header, HEADER_ARCH);
+       perf_header__set_feat(&session->header, HEADER_NRCPUS);
+       perf_header__set_feat(&session->header, HEADER_CPUDESC);
+       perf_header__set_feat(&session->header, HEADER_CPUID);
+       perf_header__set_feat(&session->header, HEADER_TOTAL_MEM);
+       perf_header__set_feat(&session->header, HEADER_CMDLINE);
+       perf_header__set_feat(&session->header, HEADER_CPU_TOPOLOGY);
+       perf_header__set_feat(&session->header, HEADER_NUMA_TOPOLOGY);
+       perf_header__set_feat(&session->header, HEADER_CACHE);
+       perf_header__set_feat(&session->header, HEADER_MEM_TOPOLOGY);
+       perf_header__set_feat(&session->header, HEADER_HYBRID_TOPOLOGY);
+       perf_header__set_feat(&session->header, HEADER_CPU_DOMAIN_INFO);
+
+       err = perf_session__write_header(session, evlist, fd, false);
+       if (err < 0)
+               goto out;
+
+       /*
+        * `perf sched stats` does not support workload profiling (-p pid)
+        * since /proc/schedstat file contains cpu specific data only. Hence, a
+        * profile target is either set of cpus or systemwide, never a process.
+        * Note that, although `-- <workload>` is supported, profile data are
+        * still cpu/systemwide.
+        */
+       if (cpu_list)
+               target.cpu_list = cpu_list;
+       else
+               target.system_wide = true;
+
+       if (argc) {
+               err = evlist__prepare_workload(evlist, &target, argv, false, NULL);
+               if (err)
+                       goto out;
+       }
+
+       err = evlist__create_maps(evlist, &target);
+       if (err < 0)
+               goto out;
+
+       user_requested_cpus = evlist->core.user_requested_cpus;
+
+       /* First snapshot, taken before the workload is started. */
+       err = perf_event__synthesize_schedstat(&(sched->tool),
+                                              process_synthesized_schedstat_event,
+                                              user_requested_cpus);
+       if (err < 0)
+               goto out;
+
+       err = enable_sched_schedstats(&reset);
+       if (err < 0)
+               goto out;
+
+       if (argc)
+               evlist__start_workload(evlist);
+
+       /* wait for signal */
+       pause();
+
+       /* Only switch schedstats off again if they were switched on above. */
+       if (reset) {
+               err = disable_sched_schedstat();
+               if (err < 0)
+                       goto out;
+       }
+
+       /* Second snapshot, taken once the wait is over. */
+       err = perf_event__synthesize_schedstat(&(sched->tool),
+                                              process_synthesized_schedstat_event,
+                                              user_requested_cpus);
+       if (err < 0)
+               goto out;
+
+       err = perf_session__write_header(session, evlist, fd, true);
+
+out:
+       if (!err)
+               fprintf(stderr, "[ perf sched stats: Wrote samples to %s ]\n", data.path);
+       else
+               fprintf(stderr, "[ perf sched stats: Failed !! ]\n");
+
+       evlist__delete(evlist);
+       close(fd);
+       return err;
+}
+
 static bool schedstat_events_exposed(void)
 {
        /*
@@ -3910,6 +4105,12 @@ int cmd_sched(int argc, const char **argv)
        OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"),
        OPT_PARENT(sched_options)
        };
+       const struct option stats_options[] = {
+       OPT_STRING('o', "output", &output_name, "file",
+                  "`stats record` with output filename"),
+       OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
+       OPT_END()
+       };
 
        const char * const latency_usage[] = {
                "perf sched latency [<options>]",
@@ -3927,9 +4128,13 @@ int cmd_sched(int argc, const char **argv)
                "perf sched timehist [<options>]",
                NULL
        };
+       const char *stats_usage[] = {
+               "perf sched stats {record} [<options>]",
+               NULL
+       };
        const char *const sched_subcommands[] = { "record", "latency", "map",
                                                  "replay", "script",
-                                                 "timehist", NULL };
+                                                 "timehist", "stats", NULL };
        const char *sched_usage[] = {
                NULL,
                NULL
@@ -4027,6 +4232,21 @@ int cmd_sched(int argc, const char **argv)
                ret = symbol__validate_sym_arguments();
                if (!ret)
                        ret = perf_sched__timehist(&sched);
+       } else if (!strcmp(argv[0], "stats")) {
+               const char *const stats_subcommands[] = {"record", NULL};
+
+               argc = parse_options_subcommand(argc, argv, stats_options,
+                                               stats_subcommands,
+                                               stats_usage,
+                                               PARSE_OPT_STOP_AT_NON_OPTION);
+
+               if (argv[0] && !strcmp(argv[0], "record")) {
+                       if (argc)
+                               argc = parse_options(argc, argv, stats_options,
+                                                    stats_usage, 0);
+                       return perf_sched__schedstat_record(&sched, argc, argv);
+               }
+               usage_with_options(stats_usage, stats_options);
        } else {
                usage_with_options(sched_usage, sched_options);
        }
index 4c92cc1a952c1d9f3ef9e6be9161e1ea0a8ae20b..5a98c16e10923d576e446955ee0ddca5b3cfd880 100644 (file)
@@ -83,6 +83,8 @@ static const char *perf_event__names[] = {
        [PERF_RECORD_FINISHED_INIT]             = "FINISHED_INIT",
        [PERF_RECORD_COMPRESSED2]               = "COMPRESSED2",
        [PERF_RECORD_BPF_METADATA]              = "BPF_METADATA",
+       [PERF_RECORD_SCHEDSTAT_CPU]             = "SCHEDSTAT_CPU",
+       [PERF_RECORD_SCHEDSTAT_DOMAIN]          = "SCHEDSTAT_DOMAIN",
 };
 
 const char *perf_event__name(unsigned int id)
@@ -571,6 +573,44 @@ size_t perf_event__fprintf_text_poke(union perf_event *event, struct machine *ma
        return ret;
 }
 
+/*
+ * Dump a PERF_RECORD_SCHEDSTAT_CPU record as "cpu<N> <value> <value> ...",
+ * with the values in the order of the CPU_FIELD() list of schedstat-v15.h.
+ *
+ * Returns the total number of characters written, including the "cpu<N> "
+ * prefix, for supported and unsupported versions alike.
+ */
+size_t perf_event__fprintf_schedstat_cpu(union perf_event *event, FILE *fp)
+{
+       struct perf_record_schedstat_cpu *cs = &event->schedstat_cpu;
+       size_t size = fprintf(fp, "\ncpu%u ", cs->cpu);
+       __u16 version = cs->version;
+
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)                \
+       size += fprintf(fp, "%" PRIu64 " ", (uint64_t)cs->_ver._name)
+
+       if (version == 15) {
+#include <perf/schedstat-v15.h>
+               return size;
+       }
+#undef CPU_FIELD
+
+       /* The prefix has already been written, so keep it in the total. */
+       size += fprintf(fp, "Unsupported /proc/schedstat version %d.\n", version);
+       return size;
+}
+
+/*
+ * Dump a PERF_RECORD_SCHEDSTAT_DOMAIN record as "domain<N> <value> ...",
+ * with the values in the order of the DOMAIN_FIELD() list of schedstat-v15.h.
+ *
+ * Returns the total number of characters written, including the "domain<N> "
+ * prefix, for supported and unsupported versions alike.
+ */
+size_t perf_event__fprintf_schedstat_domain(union perf_event *event, FILE *fp)
+{
+       struct perf_record_schedstat_domain *ds = &event->schedstat_domain;
+       __u16 version = ds->version;
+       size_t size = fprintf(fp, "\ndomain%u ", ds->domain);
+
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)          \
+       size += fprintf(fp, "%" PRIu64 " ", (uint64_t)ds->_ver._name)
+
+       if (version == 15) {
+#include <perf/schedstat-v15.h>
+               return size;
+       }
+#undef DOMAIN_FIELD
+
+       /* The prefix has already been written, so keep it in the total. */
+       size += fprintf(fp, "Unsupported /proc/schedstat version %d.\n", version);
+       return size;
+}
+
 size_t perf_event__fprintf(union perf_event *event, struct machine *machine, FILE *fp)
 {
        size_t ret = fprintf(fp, "PERF_RECORD_%s",
index 64c63b59d6172204af8eed3a798666fc12bc3693..2ea83fdf8a039a50838a6ec21764f0ca5bf14b63 100644 (file)
@@ -392,6 +392,8 @@ size_t perf_event__fprintf_ksymbol(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_bpf(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_bpf_metadata(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_text_poke(union perf_event *event, struct machine *machine,FILE *fp);
+size_t perf_event__fprintf_schedstat_cpu(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_schedstat_domain(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf(union perf_event *event, struct machine *machine, FILE *fp);
 
 int kallsyms__get_function_start(const char *kallsyms_filename,
index d7b28cb4e6722d8177f6bdf10b9c09c182c0aa4a..c0231bc000e74697da01dafdc6d4bbdf9ee8eb6c 100644 (file)
@@ -698,6 +698,20 @@ static void perf_event__time_conv_swap(union perf_event *event,
        }
 }
 
+/*
+ * Byte-swap hook for PERF_RECORD_SCHEDSTAT_CPU, registered in
+ * perf_event__swap_ops[]. Currently a no-op.
+ */
+static void
+perf_event__schedstat_cpu_swap(union perf_event *event __maybe_unused,
+                              bool sample_id_all __maybe_unused)
+{
+       /* FIXME: swapping is not implemented, the record is left untouched. */
+}
+
+/*
+ * Byte-swap hook for PERF_RECORD_SCHEDSTAT_DOMAIN, registered in
+ * perf_event__swap_ops[]. Currently a no-op.
+ */
+static void
+perf_event__schedstat_domain_swap(union perf_event *event __maybe_unused,
+                                 bool sample_id_all __maybe_unused)
+{
+       /* FIXME: swapping is not implemented, the record is left untouched. */
+}
+
 typedef void (*perf_event__swap_op)(union perf_event *event,
                                    bool sample_id_all);
 
@@ -737,6 +751,8 @@ static perf_event__swap_op perf_event__swap_ops[] = {
        [PERF_RECORD_STAT_ROUND]          = perf_event__stat_round_swap,
        [PERF_RECORD_EVENT_UPDATE]        = perf_event__event_update_swap,
        [PERF_RECORD_TIME_CONV]           = perf_event__time_conv_swap,
+       [PERF_RECORD_SCHEDSTAT_CPU]       = perf_event__schedstat_cpu_swap,
+       [PERF_RECORD_SCHEDSTAT_DOMAIN]    = perf_event__schedstat_domain_swap,
        [PERF_RECORD_HEADER_MAX]          = NULL,
 };
 
@@ -1667,6 +1683,12 @@ static s64 perf_session__process_user_event(struct perf_session *session,
        case PERF_RECORD_BPF_METADATA:
                err = tool->bpf_metadata(tool, session, event);
                break;
+       case PERF_RECORD_SCHEDSTAT_CPU:
+               err = tool->schedstat_cpu(tool, session, event);
+               break;
+       case PERF_RECORD_SCHEDSTAT_DOMAIN:
+               err = tool->schedstat_domain(tool, session, event);
+               break;
        default:
                err = -EINVAL;
                break;
index 2ba9fa25e00a68016757550ed4b3da7bd5f9ea29..5366ea921e70a431728f689027fa1ef008ae155e 100644 (file)
@@ -2529,3 +2529,182 @@ int parse_synth_opt(char *synth)
 
        return ret;
 }
+
+static union perf_event *__synthesize_schedstat_cpu(struct io *io, __u16 version,
+                                                   __u64 *cpu, __u64 timestamp)
+{
+       struct perf_record_schedstat_cpu *cs;
+       union perf_event *event;
+       size_t size;
+       char ch;
+
+       size = sizeof(*cs);
+       size = PERF_ALIGN(size, sizeof(u64));
+       event = zalloc(size);
+
+       if (!event)
+               return NULL;
+
+       cs = &event->schedstat_cpu;
+       cs->header.type = PERF_RECORD_SCHEDSTAT_CPU;
+       cs->header.size = size;
+       cs->timestamp = timestamp;
+
+       if (io__get_char(io) != 'p' || io__get_char(io) != 'u')
+               goto out_cpu;
+
+       if (io__get_dec(io, (__u64 *)cpu) != ' ')
+               goto out_cpu;
+
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)        \
+       do {                                                            \
+               __u64 _tmp;                                             \
+               ch = io__get_dec(io, &_tmp);                            \
+               if (ch != ' ' && ch != '\n')                            \
+                       goto out_cpu;                                   \
+               cs->_ver._name = _tmp;                                  \
+       } while (0)
+
+       if (version == 15) {
+#include <perf/schedstat-v15.h>
+       }
+#undef CPU_FIELD
+
+       cs->cpu = *cpu;
+       cs->version = version;
+
+       return event;
+out_cpu:
+       free(event);
+       return NULL;
+}
+
+static union perf_event *__synthesize_schedstat_domain(struct io *io, __u16 version,
+                                                      __u64 cpu, __u64 timestamp)
+{
+       struct perf_record_schedstat_domain *ds;
+       union perf_event *event = NULL;
+       __u64 d_num;
+       size_t size;
+       char ch;
+
+       if (io__get_char(io) != 'o' || io__get_char(io) != 'm' || io__get_char(io) != 'a' ||
+           io__get_char(io) != 'i' || io__get_char(io) != 'n')
+               return NULL;
+
+       ch = io__get_dec(io, &d_num);
+
+       /* Skip cpumask as it can be extracted from perf header */
+       while (io__get_char(io) != ' ')
+               continue;
+
+       size = sizeof(*ds);
+       size = PERF_ALIGN(size, sizeof(u64));
+       event = zalloc(size);
+
+       ds = &event->schedstat_domain;
+       ds->header.type = PERF_RECORD_SCHEDSTAT_DOMAIN;
+       ds->header.size = size;
+       ds->version = version;
+       ds->timestamp = timestamp;
+       ds->domain = d_num;
+
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)  \
+       do {                                                            \
+               __u64 _tmp;                                             \
+               ch = io__get_dec(io, &_tmp);                            \
+               if (ch != ' ' && ch != '\n')                            \
+                       goto out_domain;                                \
+               ds->_ver._name = _tmp;                                  \
+       } while (0)
+
+       if (version == 15) {
+#include <perf/schedstat-v15.h>
+       }
+#undef DOMAIN_FIELD
+
+       ds->cpu = cpu;
+       goto out;
+
+out_domain:
+       free(event);
+       event = NULL;
+out:
+       return event;
+}
+
+/*
+ * Read <procfs>/schedstat and hand one PERF_RECORD_SCHEDSTAT_CPU or
+ * PERF_RECORD_SCHEDSTAT_DOMAIN event per "cpu<N>"/"domain<N>" line to
+ * @process.
+ *
+ * @tool:                passed through to @process
+ * @process:             callback invoked for every synthesized event
+ * @user_requested_cpus: if not NULL, events of cpus outside this map are
+ *                       dropped
+ *
+ * Only version 15 of the file is supported. Returns 0 on success and -1 if
+ * the file cannot be opened, has an unsupported or unexpected format, or
+ * @process fails.
+ */
+int perf_event__synthesize_schedstat(const struct perf_tool *tool,
+                                    perf_event__handler_t process,
+                                    struct perf_cpu_map *user_requested_cpus)
+{
+       char *line = NULL, path[PATH_MAX];
+       size_t line_len = 0;
+       char bf[BUFSIZ];
+       __u64 timestamp;
+       __u64 cpu = -1;
+       __u16 version;
+       struct io io;
+       int ret = -1;
+       int ch;
+
+       snprintf(path, PATH_MAX, "%s/schedstat", procfs__mountpoint());
+       io.fd = open(path, O_RDONLY, 0);
+       if (io.fd < 0) {
+               pr_err("Failed to open %s. Possibly CONFIG_SCHEDSTAT is disabled.\n", path);
+               return -1;
+       }
+       io__init(&io, io.fd, bf, sizeof(bf));
+
+       if (io__getline(&io, &line, &line_len) < 0 || !line_len)
+               goto out_free_line;
+
+       if (!strcmp(line, "version 15\n")) {
+               version = 15;
+       } else {
+               /* Only skip the "version " prefix if it is really there. */
+               pr_err("Unsupported %s version: %s", path,
+                      strncmp(line, "version ", 8) ? line : line + 8);
+               goto out_free_line;
+       }
+
+       if (io__getline(&io, &line, &line_len) < 0 || !line_len)
+               goto out_free_line;
+
+       /* Make sure that line + 10 below points into the line. */
+       if (strncmp(line, "timestamp ", 10)) {
+               pr_err("Unexpected line in %s: %s", path, line);
+               goto out_free_line;
+       }
+       /* The timestamp is a __u64; atol() could truncate it where long is 32-bit. */
+       timestamp = strtoull(line + 10, NULL, 10);
+
+       /*
+        * FIXME: Can be optimized a bit by not synthesizing domain samples
+        * for filtered out cpus.
+        */
+       for (ch = io__get_char(&io); !io.eof; ch = io__get_char(&io)) {
+               /* Reset per line so that a freed event is never looked at again. */
+               union perf_event *event = NULL;
+               struct perf_cpu this_cpu;
+               int err;
+
+               if (ch == 'c') {
+                       event = __synthesize_schedstat_cpu(&io, version,
+                                                          &cpu, timestamp);
+               } else if (ch == 'd') {
+                       event = __synthesize_schedstat_domain(&io, version,
+                                                             cpu, timestamp);
+               }
+               /* Parse/allocation failure or a line that is neither cpu nor domain. */
+               if (!event)
+                       goto out_free_line;
+
+               this_cpu.cpu = cpu;
+
+               if (user_requested_cpus && !perf_cpu_map__has(user_requested_cpus, this_cpu)) {
+                       /* Filtered out: the event still has to be released. */
+                       free(event);
+                       continue;
+               }
+
+               err = process(tool, event, NULL, NULL);
+               free(event);
+               if (err < 0)
+                       goto out_free_line;
+       }
+
+       ret = 0;
+
+out_free_line:
+       free(line);
+       close(io.fd);
+       return ret;
+}
index f8588b6cf11a091a9d238b4a55ed23cd1921a592..b0edad0c310010b7029ae1a6e396106969645fd7 100644 (file)
@@ -128,4 +128,7 @@ int perf_event__synthesize_for_pipe(const struct perf_tool *tool,
                                    struct perf_data *data,
                                    perf_event__handler_t process);
 
+int perf_event__synthesize_schedstat(const struct perf_tool *tool, /* passed through unchanged as the first argument of @process */
+                                    perf_event__handler_t process, /* called once per synthesized schedstat cpu/domain event */
+                                    struct perf_cpu_map *user_requested_cpu); /* NULL means no filtering; otherwise events for cpus outside the map are not passed to @process */
 #endif // __PERF_SYNTHETIC_EVENTS_H
index 27ba5849c74a2e7d5208fee0ddca644dc9824628..013c7839e2cfd29da64d0ca429022aa57797e34d 100644 (file)
@@ -253,7 +253,25 @@ static int perf_event__process_bpf_metadata_stub(const struct perf_tool *tool __
 {
        if (dump_trace)
                perf_event__fprintf_bpf_metadata(event, stdout);
+       dump_printf(": unhandled!\n");
+       return 0;
+}
+static int process_schedstat_cpu_stub(const struct perf_tool *tool __maybe_unused,
+                                     struct perf_session *perf_session __maybe_unused,
+                                     union perf_event *event)
+{ /* Stub handler for schedstat cpu events; it is what gets assigned to tool->schedstat_cpu. */
+       if (dump_trace) /* only when dump_trace is set: print the event to stdout */
+               perf_event__fprintf_schedstat_cpu(event, stdout);
+       dump_printf(": unhandled!\n"); /* called unconditionally; whether anything is printed is up to dump_printf() */
+       return 0; /* the stub never reports an error */
+}
 
+static int process_schedstat_domain_stub(const struct perf_tool *tool __maybe_unused,
+                                        struct perf_session *perf_session __maybe_unused,
+                                        union perf_event *event)
+{ /* Stub handler for schedstat domain events; it is what gets assigned to tool->schedstat_domain. */
+       if (dump_trace) /* only when dump_trace is set: print the event to stdout */
+               perf_event__fprintf_schedstat_domain(event, stdout);
        dump_printf(": unhandled!\n");
        return 0;
 }
@@ -317,6 +335,8 @@ void perf_tool__init(struct perf_tool *tool, bool ordered_events)
 #endif
        tool->finished_init = process_event_op2_stub;
        tool->bpf_metadata = perf_event__process_bpf_metadata_stub;
+       tool->schedstat_cpu = process_schedstat_cpu_stub;
+       tool->schedstat_domain = process_schedstat_domain_stub;
 }
 
 bool perf_tool__compressed_is_stub(const struct perf_tool *tool)
index e96b69d25a5b737de2c830ec5b6a4b6f53bff4ef..2d9a4b1ca9d0fad5359ebc42f36ff572c69b0bc3 100644 (file)
@@ -81,7 +81,9 @@ struct perf_tool {
                        stat_round,
                        feature,
                        finished_init,
-                       bpf_metadata;
+                       bpf_metadata,
+                       schedstat_cpu,
+                       schedstat_domain;
        event_op4       compressed;
        event_op3       auxtrace;
        bool            ordered_events;