#include "util/thread_map.h"
#include "util/stat.h"
#include "util/tool.h"
+#include "util/trace.h"
#include "util/util.h"
#include "trace/beauty/beauty.h"
#include "trace-event.h"
bool hexret;
};
-enum summary_mode {
- SUMMARY__NONE = 0,
- SUMMARY__BY_TOTAL,
- SUMMARY__BY_THREAD,
-};
-
struct trace {
struct perf_tool tool;
struct {
} stats;
unsigned int max_stack;
unsigned int min_stack;
- enum summary_mode summary_mode;
+ enum trace_summary_mode summary_mode;
int raw_augmented_syscalls_args_size;
bool raw_augmented_syscalls;
bool fd_path_disabled;
bool force;
bool vfs_getname;
bool force_btf;
+ bool summary_bpf;
int trace_pgfaults;
char *perfconfig_events;
struct {
struct syscall_stats *stats = NULL;
u64 duration = 0;
+ if (trace->summary_bpf)
+ return;
+
if (trace->summary_mode == SUMMARY__BY_TOTAL)
syscall_stats = trace->syscall_stats;
trace->live = true;
+ if (trace->summary_bpf) {
+ if (trace_prepare_bpf_summary(trace->summary_mode) < 0)
+ goto out_delete_evlist;
+
+ if (trace->summary_only)
+ goto create_maps;
+ }
+
if (!trace->raw_augmented_syscalls) {
if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
goto out_error_raw_syscalls;
if (trace->cgroup)
evlist__set_default_cgroup(trace->evlist, trace->cgroup);
+create_maps:
err = evlist__create_maps(evlist, &trace->opts.target);
if (err < 0) {
fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
goto out_delete_evlist;
}
- if (trace->summary_mode == SUMMARY__BY_TOTAL) {
+ if (trace->summary_mode == SUMMARY__BY_TOTAL && !trace->summary_bpf) {
trace->syscall_stats = alloc_syscall_stats();
if (trace->syscall_stats == NULL)
goto out_delete_evlist;
if (err < 0)
goto out_error_apply_filters;
- err = evlist__mmap(evlist, trace->opts.mmap_pages);
- if (err < 0)
- goto out_error_mmap;
+ if (!trace->summary_only || !trace->summary_bpf) {
+ err = evlist__mmap(evlist, trace->opts.mmap_pages);
+ if (err < 0)
+ goto out_error_mmap;
+ }
if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay)
evlist__enable(evlist);
evlist__enable(evlist);
}
+ if (trace->summary_bpf)
+ trace_start_bpf_summary();
+
trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
perf_thread_map__nr(evlist->core.threads) > 1 ||
evlist__first(evlist)->core.attr.inherit;
evlist__disable(evlist);
+ if (trace->summary_bpf)
+ trace_end_bpf_summary();
+
if (trace->sort_events)
ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
if (!err) {
if (trace->summary) {
- if (trace->summary_mode == SUMMARY__BY_TOTAL)
+ if (trace->summary_bpf)
+ trace_print_bpf_summary(trace->output);
+ else if (trace->summary_mode == SUMMARY__BY_TOTAL)
trace__fprintf_total_summary(trace, trace->output);
else
trace__fprintf_thread_summary(trace, trace->output);
}
out_delete_evlist:
+ trace_cleanup_bpf_summary();
delete_syscall_stats(trace->syscall_stats);
trace__symbols__exit(trace);
evlist__free_syscall_tp_fields(evlist);
"start"),
OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
"to customized ones"),
+ OPT_BOOLEAN(0, "bpf-summary", &trace.summary_bpf, "Summary syscall stats in BPF"),
OPTS_EVSWITCH(&trace.evswitch),
OPT_END()
};
goto skip_augmentation;
}
+ if (trace.summary_bpf) {
+ if (!trace.opts.target.system_wide) {
+ /* TODO: Add filters in the BPF to support other targets. */
+ pr_err("Error: --bpf-summary only works for system-wide mode.\n");
+ goto out;
+ }
+ if (trace.summary_only)
+ goto skip_augmentation;
+ }
+
trace.skel = augmented_raw_syscalls_bpf__open();
if (!trace.skel) {
pr_debug("Failed to open augmented syscalls BPF skeleton");
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "dwarf-regs.h" /* for EM_HOST */
+#include "syscalltbl.h"
+#include "util/hashmap.h"
+#include "util/trace.h"
+#include "util/util.h"
+#include <bpf/bpf.h>
+#include <linux/time64.h>
+#include <tools/libc_compat.h> /* reallocarray */
+
+#include "bpf_skel/syscall_summary.h"
+#include "bpf_skel/syscall_summary.skel.h"
+
+
+static struct syscall_summary_bpf *skel;
+
+int trace_prepare_bpf_summary(enum trace_summary_mode mode)
+{
+ skel = syscall_summary_bpf__open();
+ if (skel == NULL) {
+ fprintf(stderr, "failed to open syscall summary bpf skeleton\n");
+ return -1;
+ }
+
+ if (mode == SUMMARY__BY_THREAD)
+ skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
+ else
+ skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
+
+ if (syscall_summary_bpf__load(skel) < 0) {
+ fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
+ return -1;
+ }
+
+ if (syscall_summary_bpf__attach(skel) < 0) {
+ fprintf(stderr, "failed to attach syscall summary bpf skeleton\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+void trace_start_bpf_summary(void)
+{
+ skel->bss->enabled = 1;
+}
+
+void trace_end_bpf_summary(void)
+{
+ skel->bss->enabled = 0;
+}
+
+struct syscall_node {
+ int syscall_nr;
+ struct syscall_stats stats;
+};
+
+static double rel_stddev(struct syscall_stats *stat)
+{
+ double variance, average;
+
+ if (stat->count < 2)
+ return 0;
+
+ average = (double)stat->total_time / stat->count;
+
+ variance = stat->squared_sum;
+ variance -= (stat->total_time * stat->total_time) / stat->count;
+ variance /= stat->count - 1;
+
+ return 100 * sqrt(variance / stat->count) / average;
+}
+
+/*
+ * The syscall_data is to maintain syscall stats ordered by total time.
+ * It supports different summary modes like per-thread or global.
+ *
+ * For per-thread stats, it uses two-level data strurcture -
+ * syscall_data is keyed by TID and has an array of nodes which
+ * represents each syscall for the thread.
+ *
+ * For global stats, it's still two-level technically but we don't need
+ * per-cpu analysis so it's keyed by the syscall number to combine stats
+ * from different CPUs. And syscall_data always has a syscall_node so
+ * it can effectively work as flat hierarchy.
+ */
+struct syscall_data {
+ int key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU */
+ int nr_events;
+ int nr_nodes;
+ u64 total_time;
+ struct syscall_node *nodes;
+};
+
+static int datacmp(const void *a, const void *b)
+{
+ const struct syscall_data * const *sa = a;
+ const struct syscall_data * const *sb = b;
+
+ return (*sa)->total_time > (*sb)->total_time ? -1 : 1;
+}
+
+static int nodecmp(const void *a, const void *b)
+{
+ const struct syscall_node *na = a;
+ const struct syscall_node *nb = b;
+
+ return na->stats.total_time > nb->stats.total_time ? -1 : 1;
+}
+
+static size_t sc_node_hash(long key, void *ctx __maybe_unused)
+{
+ return key;
+}
+
+static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused)
+{
+ return key1 == key2;
+}
+
+static int print_common_stats(struct syscall_data *data, FILE *fp)
+{
+ int printed = 0;
+
+ for (int i = 0; i < data->nr_nodes; i++) {
+ struct syscall_node *node = &data->nodes[i];
+ struct syscall_stats *stat = &node->stats;
+ double total = (double)(stat->total_time) / NSEC_PER_MSEC;
+ double min = (double)(stat->min_time) / NSEC_PER_MSEC;
+ double max = (double)(stat->max_time) / NSEC_PER_MSEC;
+ double avg = total / stat->count;
+ const char *name;
+
+ /* TODO: support other ABIs */
+ name = syscalltbl__name(EM_HOST, node->syscall_nr);
+ if (name)
+ printed += fprintf(fp, " %-15s", name);
+ else
+ printed += fprintf(fp, " syscall:%-7d", node->syscall_nr);
+
+ printed += fprintf(fp, " %8u %6u %9.3f %9.3f %9.3f %9.3f %9.2f%%\n",
+ stat->count, stat->error, total, min, avg, max,
+ rel_stddev(stat));
+ }
+ return printed;
+}
+
+static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key,
+ struct syscall_stats *map_data)
+{
+ struct syscall_data *data;
+ struct syscall_node *nodes;
+
+ if (!hashmap__find(hash, map_key->cpu_or_tid, &data)) {
+ data = zalloc(sizeof(*data));
+ if (data == NULL)
+ return -ENOMEM;
+
+ data->key = map_key->cpu_or_tid;
+ if (hashmap__add(hash, data->key, data) < 0) {
+ free(data);
+ return -ENOMEM;
+ }
+ }
+
+ /* update thread total stats */
+ data->nr_events += map_data->count;
+ data->total_time += map_data->total_time;
+
+ nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
+ if (nodes == NULL)
+ return -ENOMEM;
+
+ data->nodes = nodes;
+ nodes = &data->nodes[data->nr_nodes++];
+ nodes->syscall_nr = map_key->nr;
+
+ /* each thread has an entry for each syscall, just use the stat */
+ memcpy(&nodes->stats, map_data, sizeof(*map_data));
+ return 0;
+}
+
+static int print_thread_stat(struct syscall_data *data, FILE *fp)
+{
+ int printed = 0;
+
+ qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
+
+ printed += fprintf(fp, " thread (%d), ", data->key);
+ printed += fprintf(fp, "%d events\n\n", data->nr_events);
+
+ printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
+ printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
+ printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
+
+ printed += print_common_stats(data, fp);
+ printed += fprintf(fp, "\n\n");
+
+ return printed;
+}
+
+static int print_thread_stats(struct syscall_data **data, int nr_data, FILE *fp)
+{
+ int printed = 0;
+
+ for (int i = 0; i < nr_data; i++)
+ printed += print_thread_stat(data[i], fp);
+
+ return printed;
+}
+
+static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
+ struct syscall_stats *map_data)
+{
+ struct syscall_data *data;
+ struct syscall_stats *stat;
+
+ if (!hashmap__find(hash, map_key->nr, &data)) {
+ data = zalloc(sizeof(*data));
+ if (data == NULL)
+ return -ENOMEM;
+
+ data->nodes = zalloc(sizeof(*data->nodes));
+ if (data->nodes == NULL) {
+ free(data);
+ return -ENOMEM;
+ }
+
+ data->nr_nodes = 1;
+ data->key = map_key->nr;
+ data->nodes->syscall_nr = data->key;
+
+ if (hashmap__add(hash, data->key, data) < 0) {
+ free(data->nodes);
+ free(data);
+ return -ENOMEM;
+ }
+ }
+
+ /* update total stats for this syscall */
+ data->nr_events += map_data->count;
+ data->total_time += map_data->total_time;
+
+ /* This is sum of the same syscall from different CPUs */
+ stat = &data->nodes->stats;
+
+ stat->total_time += map_data->total_time;
+ stat->squared_sum += map_data->squared_sum;
+ stat->count += map_data->count;
+ stat->error += map_data->error;
+
+ if (stat->max_time < map_data->max_time)
+ stat->max_time = map_data->max_time;
+ if (stat->min_time > map_data->min_time || stat->min_time == 0)
+ stat->min_time = map_data->min_time;
+
+ return 0;
+}
+
+static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
+{
+ int printed = 0;
+ int nr_events = 0;
+
+ for (int i = 0; i < nr_data; i++)
+ nr_events += data[i]->nr_events;
+
+ printed += fprintf(fp, " total, %d events\n\n", nr_events);
+
+ printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
+ printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
+ printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
+
+ for (int i = 0; i < nr_data; i++)
+ printed += print_common_stats(data[i], fp);
+
+ printed += fprintf(fp, "\n\n");
+ return printed;
+}
+
+int trace_print_bpf_summary(FILE *fp)
+{
+ struct bpf_map *map = skel->maps.syscall_stats_map;
+ struct syscall_key *prev_key, key;
+ struct syscall_data **data = NULL;
+ struct hashmap schash;
+ struct hashmap_entry *entry;
+ int nr_data = 0;
+ int printed = 0;
+ int i;
+ size_t bkt;
+
+ hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);
+
+ printed = fprintf(fp, "\n Summary of events:\n\n");
+
+ /* get stats from the bpf map */
+ prev_key = NULL;
+ while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
+ struct syscall_stats stat;
+
+ if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
+ if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
+ update_thread_stats(&schash, &key, &stat);
+ else
+ update_total_stats(&schash, &key, &stat);
+ }
+
+ prev_key = &key;
+ }
+
+ nr_data = hashmap__size(&schash);
+ data = calloc(nr_data, sizeof(*data));
+ if (data == NULL)
+ goto out;
+
+ i = 0;
+ hashmap__for_each_entry(&schash, entry, bkt)
+ data[i++] = entry->pvalue;
+
+ qsort(data, nr_data, sizeof(*data), datacmp);
+
+ if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
+ printed += print_thread_stats(data, nr_data, fp);
+ else
+ printed += print_total_stats(data, nr_data, fp);
+
+ for (i = 0; i < nr_data && data; i++) {
+ free(data[i]->nodes);
+ free(data[i]);
+ }
+ free(data);
+
+out:
+ hashmap__clear(&schash);
+ return printed;
+}
+
+void trace_cleanup_bpf_summary(void)
+{
+ syscall_summary_bpf__destroy(skel);
+}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trace raw_syscalls tracepoints to collect system call statistics.
+ */
+
+#include "vmlinux.h"
+#include "syscall_summary.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+/* This is to calculate a delta between sys-enter and sys-exit for each thread */
+struct syscall_trace {
+ int nr; /* syscall number is only available at sys-enter */
+ int unused;
+ u64 timestamp;
+};
+
+#define MAX_ENTRIES (128 * 1024)
+
+struct syscall_trace_map {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, int); /* tid */
+ __type(value, struct syscall_trace);
+ __uint(max_entries, MAX_ENTRIES);
+} syscall_trace_map SEC(".maps");
+
+struct syscall_stats_map {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct syscall_key);
+ __type(value, struct syscall_stats);
+ __uint(max_entries, MAX_ENTRIES);
+} syscall_stats_map SEC(".maps");
+
+int enabled; /* controlled from userspace */
+
+const volatile enum syscall_aggr_mode aggr_mode;
+
+static void update_stats(int cpu_or_tid, int nr, s64 duration, long ret)
+{
+ struct syscall_key key = { .cpu_or_tid = cpu_or_tid, .nr = nr, };
+ struct syscall_stats *stats;
+
+ stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
+ if (stats == NULL) {
+ struct syscall_stats zero = {};
+
+ bpf_map_update_elem(&syscall_stats_map, &key, &zero, BPF_NOEXIST);
+ stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
+ if (stats == NULL)
+ return;
+ }
+
+ __sync_fetch_and_add(&stats->count, 1);
+ if (ret < 0)
+ __sync_fetch_and_add(&stats->error, 1);
+
+ if (duration > 0) {
+ __sync_fetch_and_add(&stats->total_time, duration);
+ __sync_fetch_and_add(&stats->squared_sum, duration * duration);
+ if (stats->max_time < duration)
+ stats->max_time = duration;
+ if (stats->min_time > duration || stats->min_time == 0)
+ stats->min_time = duration;
+ }
+
+ return;
+}
+
+SEC("tp_btf/sys_enter")
+int sys_enter(u64 *ctx)
+{
+ int tid;
+ struct syscall_trace st;
+
+ if (!enabled)
+ return 0;
+
+ st.nr = ctx[1]; /* syscall number */
+ st.unused = 0;
+ st.timestamp = bpf_ktime_get_ns();
+
+ tid = bpf_get_current_pid_tgid();
+ bpf_map_update_elem(&syscall_trace_map, &tid, &st, BPF_ANY);
+
+ return 0;
+}
+
+SEC("tp_btf/sys_exit")
+int sys_exit(u64 *ctx)
+{
+ int tid;
+ int key;
+ long ret = ctx[1]; /* return value of the syscall */
+ struct syscall_trace *st;
+ s64 delta;
+
+ if (!enabled)
+ return 0;
+
+ tid = bpf_get_current_pid_tgid();
+ st = bpf_map_lookup_elem(&syscall_trace_map, &tid);
+ if (st == NULL)
+ return 0;
+
+ if (aggr_mode == SYSCALL_AGGR_THREAD)
+ key = tid;
+ else
+ key = bpf_get_smp_processor_id();
+
+ delta = bpf_ktime_get_ns() - st->timestamp;
+ update_stats(key, st->nr, delta, ret);
+
+ bpf_map_delete_elem(&syscall_trace_map, &tid);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";