perf lock contention: Symbolize zone->lock using BTF
author     Namhyung Kim <namhyung@kernel.org>
           Tue, 1 Apr 2025 06:30:55 +0000 (23:30 -0700)
committer  Arnaldo Carvalho de Melo <acme@redhat.com>
           Tue, 29 Apr 2025 15:23:53 +0000 (12:23 -0300)
The struct zone is embedded in struct pglist_data, which is allocated
for each NUMA node early in the boot process.  As it's neither a slab
object nor a global lock, it was not symbolized.

Since zone->lock is often contended, it'd be nice if we could
symbolize it.  On NUMA systems, the node_data array holds a pointer to
the struct pglist_data of each node.  By following those pointers, the
BPF program can calculate the address of each zone and its lock using
the struct size and member offsets from BTF.  On UMA systems, it can
just use contig_page_data and its zones.
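
Concretely, each zone's spinlock address is plain pointer arithmetic
from the pglist_data base address.  A minimal sketch of the
calculation (illustrative only, with placeholder variable names; the
actual BPF code resolves the struct size and offsets from BTF/CO-RE at
load time rather than using compile-time macros):

  /* pgdat_addr is contig_page_data on UMA, or node_data[node] on NUMA */
  lock_addr = pgdat_addr
            + offsetof(struct pglist_data, node_zones)  /* start of zone array */
            + zone_idx * sizeof(struct zone)            /* the zone_idx-th zone */
            + offsetof(struct zone, lock);              /* its spinlock */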

The following example shows the zone->lock contention, symbolized as
zone_lock in the last line of the output.

  $ sudo ./perf lock con -abl -E 5 -- ./perf bench sched messaging
  # Running 'sched/messaging' benchmark:
  # 20 sender and receiver processes per group
  # 10 groups == 400 processes run

       Total time: 0.038 [sec]
   contended   total wait     max wait     avg wait            address   symbol

        5167     18.17 ms     10.27 us      3.52 us   ffff953340052d00   &kmem_cache_node (spinlock)
          38     11.75 ms    465.49 us    309.13 us   ffff95334060c480   &sock_inode_cache (spinlock)
        3916     10.13 ms     10.43 us      2.59 us   ffff953342aecb40   &kmem_cache_node (spinlock)
        2963     10.02 ms     13.75 us      3.38 us   ffff9533d2344098   &kmalloc-rnd-08-2k (spinlock)
         216      5.05 ms     99.49 us     23.39 us   ffff9542bf7d65d0   zone_lock (spinlock)

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <song@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: bpf@vger.kernel.org
Cc: linux-mm@kvack.org
Link: https://lore.kernel.org/r/20250401063055.7431-1-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
tools/perf/util/bpf_lock_contention.c
tools/perf/util/bpf_skel/lock_contention.bpf.c
tools/perf/util/bpf_skel/lock_data.h
tools/perf/util/bpf_skel/vmlinux/vmlinux.h
tools/perf/util/lock-contention.h

diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index 5af8f6d1bc952613055b283d83ba0b238edea77a..98395667220e58ee2d33edcc0163a3a756d83965 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -12,6 +12,7 @@
 #include "util/lock-contention.h"
 #include <linux/zalloc.h>
 #include <linux/string.h>
+#include <api/fs/fs.h>
 #include <bpf/bpf.h>
 #include <bpf/btf.h>
 #include <inttypes.h>
@@ -35,28 +36,26 @@ static bool slab_cache_equal(long key1, long key2, void *ctx __maybe_unused)
 
 static void check_slab_cache_iter(struct lock_contention *con)
 {
-       struct btf *btf = btf__load_vmlinux_btf();
        s32 ret;
 
        hashmap__init(&slab_hash, slab_cache_hash, slab_cache_equal, /*ctx=*/NULL);
 
-       if (btf == NULL) {
+       con->btf = btf__load_vmlinux_btf();
+       if (con->btf == NULL) {
                pr_debug("BTF loading failed: %s\n", strerror(errno));
                return;
        }
 
-       ret = btf__find_by_name_kind(btf, "bpf_iter__kmem_cache", BTF_KIND_STRUCT);
+       ret = btf__find_by_name_kind(con->btf, "bpf_iter__kmem_cache", BTF_KIND_STRUCT);
        if (ret < 0) {
                bpf_program__set_autoload(skel->progs.slab_cache_iter, false);
                pr_debug("slab cache iterator is not available: %d\n", ret);
-               goto out;
+               return;
        }
 
        has_slab_iter = true;
 
        bpf_map__set_max_entries(skel->maps.slab_caches, con->map_nr_entries);
-out:
-       btf__free(btf);
 }
 
 static void run_slab_cache_iter(void)
@@ -109,6 +108,75 @@ static void exit_slab_cache_iter(void)
        hashmap__clear(&slab_hash);
 }
 
+static void init_numa_data(struct lock_contention *con)
+{
+       struct symbol *sym;
+       struct map *kmap;
+       char *buf = NULL, *p;
+       size_t len;
+       long last = -1;
+       int ret;
+
+       /*
+        * 'struct zone' is embedded in 'struct pglist_data' as an array.
+        * As we may not have full information of the struct zone in the
+        * (fake) vmlinux.h, let's get the actual size from BTF.
+        */
+       ret = btf__find_by_name_kind(con->btf, "zone", BTF_KIND_STRUCT);
+       if (ret < 0) {
+               pr_debug("cannot get type of struct zone: %d\n", ret);
+               return;
+       }
+
+       ret = btf__resolve_size(con->btf, ret);
+       if (ret < 0) {
+               pr_debug("cannot get size of struct zone: %d\n", ret);
+               return;
+       }
+       skel->rodata->sizeof_zone = ret;
+
+       /* UMA system doesn't have 'node_data[]' - just use contig_page_data. */
+       sym = machine__find_kernel_symbol_by_name(con->machine,
+                                                 "contig_page_data",
+                                                 &kmap);
+       if (sym) {
+               skel->rodata->contig_page_data_addr = map__unmap_ip(kmap, sym->start);
+               map__put(kmap);
+               return;
+       }
+
+       /*
+        * The 'node_data' is an array of pointers to struct pglist_data.
+        * It needs to follow the pointer for each node in BPF to get the
+        * address of struct pglist_data and its zones.
+        */
+       sym = machine__find_kernel_symbol_by_name(con->machine,
+                                                 "node_data",
+                                                 &kmap);
+       if (sym == NULL)
+               return;
+
+       skel->rodata->node_data_addr = map__unmap_ip(kmap, sym->start);
+       map__put(kmap);
+
+       /* get the number of online nodes using the last node number + 1 */
+       ret = sysfs__read_str("devices/system/node/online", &buf, &len);
+       if (ret < 0) {
+               pr_debug("failed to read online node: %d\n", ret);
+               return;
+       }
+
+       p = buf;
+       while (p && *p) {
+               last = strtol(p, &p, 0);
+
+               if (p && (*p == ',' || *p == '-' || *p == '\n'))
+                       p++;
+       }
+       skel->rodata->nr_nodes = last + 1;
+       free(buf);
+}
+
 int lock_contention_prepare(struct lock_contention *con)
 {
        int i, fd;
@@ -218,6 +286,8 @@ int lock_contention_prepare(struct lock_contention *con)
 
        bpf_map__set_max_entries(skel->maps.slab_filter, nslabs);
 
+       init_numa_data(con);
+
        if (lock_contention_bpf__load(skel) < 0) {
                pr_err("Failed to load lock-contention BPF skeleton\n");
                return -1;
@@ -505,6 +575,11 @@ static const char *lock_contention_get_name(struct lock_contention *con,
                                return "rq_lock";
                }
 
+               if (!bpf_map_lookup_elem(lock_fd, &key->lock_addr_or_cgroup, &flags)) {
+                       if (flags == LOCK_CLASS_ZONE_LOCK)
+                               return "zone_lock";
+               }
+
                /* look slab_hash for dynamic locks in a slab object */
                if (hashmap__find(&slab_hash, flags & LCB_F_SLAB_ID_MASK, &slab_data)) {
                        snprintf(name_buf, sizeof(name_buf), "&%s", slab_data->name);
@@ -743,6 +818,7 @@ int lock_contention_finish(struct lock_contention *con)
        }
 
        exit_slab_cache_iter();
+       btf__free(con->btf);
 
        return 0;
 }
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index 69be7a4234e076e86f0891a4958ee2015b134571..6f12c7d978a2e01537620df3ea9913dcf0ea5ef3 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -11,6 +11,9 @@
 /* for collect_lock_syms().  4096 was rejected by the verifier */
 #define MAX_CPUS  1024
 
+/* for collect_zone_lock().  It should be larger than the actual number of zones. */
+#define MAX_ZONES  10
+
 /* lock contention flags from include/trace/events/lock.h */
 #define LCB_F_SPIN     (1U << 0)
 #define LCB_F_READ     (1U << 1)
@@ -801,6 +804,11 @@ out:
 
 extern struct rq runqueues __ksym;
 
+const volatile __u64 contig_page_data_addr;
+const volatile __u64 node_data_addr;
+const volatile int nr_nodes;
+const volatile int sizeof_zone;
+
 struct rq___old {
        raw_spinlock_t lock;
 } __attribute__((preserve_access_index));
@@ -809,6 +817,59 @@ struct rq___new {
        raw_spinlock_t __lock;
 } __attribute__((preserve_access_index));
 
+static void collect_zone_lock(void)
+{
+       __u64 nr_zones, zone_off;
+       __u64 lock_addr, lock_off;
+       __u32 lock_flag = LOCK_CLASS_ZONE_LOCK;
+
+       zone_off = offsetof(struct pglist_data, node_zones);
+       lock_off = offsetof(struct zone, lock);
+
+       if (contig_page_data_addr) {
+               struct pglist_data *contig_page_data;
+
+               contig_page_data = (void *)(long)contig_page_data_addr;
+               nr_zones = BPF_CORE_READ(contig_page_data, nr_zones);
+
+               for (int i = 0; i < MAX_ZONES; i++) {
+                       __u64 zone_addr;
+
+                       if (i >= nr_zones)
+                               break;
+
+                       zone_addr = contig_page_data_addr + (sizeof_zone * i) + zone_off;
+                       lock_addr = zone_addr + lock_off;
+
+                       bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
+               }
+       } else if (nr_nodes > 0) {
+               struct pglist_data **node_data = (void *)(long)node_data_addr;
+
+               for (int i = 0; i < nr_nodes; i++) {
+                       struct pglist_data *pgdat = NULL;
+                       int err;
+
+                       err = bpf_core_read(&pgdat, sizeof(pgdat), &node_data[i]);
+                       if (err < 0 || pgdat == NULL)
+                               break;
+
+                       nr_zones = BPF_CORE_READ(pgdat, nr_zones);
+                       for (int k = 0; k < MAX_ZONES; k++) {
+                               __u64 zone_addr;
+
+                               if (k >= nr_zones)
+                                       break;
+
+                               zone_addr = (__u64)(void *)pgdat + (sizeof_zone * k) + zone_off;
+                               lock_addr = zone_addr + lock_off;
+
+                               bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
+                       }
+               }
+       }
+}
+
 SEC("raw_tp/bpf_test_finish")
 int BPF_PROG(collect_lock_syms)
 {
@@ -830,6 +891,9 @@ int BPF_PROG(collect_lock_syms)
                lock_flag = LOCK_CLASS_RQLOCK;
                bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
        }
+
+       collect_zone_lock();
+
        return 0;
 }
 
diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
index 15f5743bd409f2f93e3c9137d6efeea8811b0dd0..28c5e5aced7fcc9104550f2b846763277580a32e 100644
--- a/tools/perf/util/bpf_skel/lock_data.h
+++ b/tools/perf/util/bpf_skel/lock_data.h
@@ -67,6 +67,7 @@ enum lock_aggr_mode {
 enum lock_class_sym {
        LOCK_CLASS_NONE,
        LOCK_CLASS_RQLOCK,
+       LOCK_CLASS_ZONE_LOCK,
 };
 
 struct slab_cache_data {
diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
index 7b81d3173917fdb558d9acb24b7bb2ef5e4e6565..a59ce912be18cd0f155dfddf75a28c22f7b6b6a2 100644
--- a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
+++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
@@ -203,4 +203,13 @@ struct bpf_iter__kmem_cache {
        struct kmem_cache *s;
 } __attribute__((preserve_access_index));
 
+struct zone {
+       spinlock_t lock;
+} __attribute__((preserve_access_index));
+
+struct pglist_data {
+       struct zone node_zones[6]; /* value for all possible config */
+       int nr_zones;
+} __attribute__((preserve_access_index));
+
 #endif // __VMLINUX_H
diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h
index b5d916aa49df6424dda9c0c06dd834dab667c27b..d331ce8e3caad4cb95e2e8f1c3ae2faf1720dc04 100644
--- a/tools/perf/util/lock-contention.h
+++ b/tools/perf/util/lock-contention.h
@@ -142,6 +142,7 @@ struct lock_contention {
        struct lock_filter *filters;
        struct lock_contention_fails fails;
        struct rb_root cgroups;
+       void *btf;
        unsigned long map_nr_entries;
        int max_stack;
        int stack_skip;