1 From 8051496261c8fef0fbc63d14c4640e1cd5d8ceaf Mon Sep 17 00:00:00 2001
2 From: Sasha Levin <sashal@kernel.org>
3 Date: Tue, 9 Jan 2024 11:48:04 -1000
4 Subject: kernfs: RCU protect kernfs_nodes and avoid kernfs_idr_lock in
5 kernfs_find_and_get_node_by_id()
7 From: Tejun Heo <tj@kernel.org>
9 [ Upstream commit 4207b556e62f0a8915afc5da4c5d5ad915a253a5 ]
11 The BPF helper bpf_cgroup_from_id() calls kernfs_find_and_get_node_by_id()
12 which acquires kernfs_idr_lock, which is a non-raw non-IRQ-safe lock. This
13 can lead to deadlocks as bpf_cgroup_from_id() can be called from any BPF
14 programs including e.g. the ones that attach to functions which are holding
15 the scheduler rq lock.
17 Consider the following BPF program:
19 SEC("fentry/__set_cpus_allowed_ptr_locked")
20 int BPF_PROG(__set_cpus_allowed_ptr_locked, struct task_struct *p,
21 struct affinity_context *affn_ctx, struct rq *rq, struct rq_flags *rf)
23 struct cgroup *cgrp = bpf_cgroup_from_id(p->cgroups->dfl_cgrp->kn->id);
26 bpf_printk("%d[%s] in %s", p->pid, p->comm, cgrp->kn->name);
27 bpf_cgroup_release(cgrp);
32 __set_cpus_allowed_ptr_locked() is called with rq lock held and the above
33 BPF program calls bpf_cgroup_from_id() within leading to the following lockdep warning:
36 =====================================================
37 WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected
38 6.7.0-rc3-work-00053-g07124366a1d7-dirty #147 Not tainted
39 -----------------------------------------------------
40 repro/1620 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
41 ffffffff833b3688 (kernfs_idr_lock){+.+.}-{2:2}, at: kernfs_find_and_get_node_by_id+0x1e/0x70
43 and this task is already holding:
44 ffff888237ced698 (&rq->__lock){-.-.}-{2:2}, at: task_rq_lock+0x4e/0xf0
45 which would create a new lock dependency:
46 (&rq->__lock){-.-.}-{2:2} -> (kernfs_idr_lock){+.+.}-{2:2}
48 Possible interrupt unsafe locking scenario:
52 lock(kernfs_idr_lock);
55 lock(kernfs_idr_lock);
62 dump_stack_lvl+0x55/0x70
64 __lock_acquire+0x781/0x2a40
65 lock_acquire+0xbf/0x1f0
66 _raw_spin_lock+0x2f/0x40
67 kernfs_find_and_get_node_by_id+0x1e/0x70
68 cgroup_get_from_id+0x21/0x240
69 bpf_cgroup_from_id+0xe/0x20
70 bpf_prog_98652316e9337a5a___set_cpus_allowed_ptr_locked+0x96/0x11a
71 bpf_trampoline_6442545632+0x4f/0x1000
72 __set_cpus_allowed_ptr_locked+0x5/0x5a0
73 sched_setaffinity+0x1b3/0x290
74 __x64_sys_sched_setaffinity+0x4f/0x60
75 do_syscall_64+0x40/0xe0
76 entry_SYSCALL_64_after_hwframe+0x46/0x4e
78 Let's fix it by protecting kernfs_node and kernfs_root with RCU and making
79 kernfs_find_and_get_node_by_id() acquire rcu_read_lock() instead of kernfs_idr_lock.
82 This adds an rcu_head to kernfs_node making it larger by 16 bytes on 64bit.
83 Combined with the preceding rearrange patch, the net increase is 8 bytes.
85 Signed-off-by: Tejun Heo <tj@kernel.org>
86 Cc: Andrea Righi <andrea.righi@canonical.com>
87 Cc: Geert Uytterhoeven <geert@linux-m68k.org>
88 Link: https://lore.kernel.org/r/20240109214828.252092-4-tj@kernel.org
89 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
90 Signed-off-by: Sasha Levin <sashal@kernel.org>
92 fs/kernfs/dir.c | 31 ++++++++++++++++++++-----------
93 fs/kernfs/kernfs-internal.h | 2 ++
94 include/linux/kernfs.h | 2 ++
95 3 files changed, 24 insertions(+), 11 deletions(-)
97 diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
98 index bce1d7ac95caa..458519e416fe7 100644
100 +++ b/fs/kernfs/dir.c
101 @@ -529,6 +529,20 @@ void kernfs_get(struct kernfs_node *kn)
103 EXPORT_SYMBOL_GPL(kernfs_get);
105 +static void kernfs_free_rcu(struct rcu_head *rcu)
107 + struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu);
109 + kfree_const(kn->name);
112 + simple_xattrs_free(&kn->iattr->xattrs, NULL);
113 + kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
116 + kmem_cache_free(kernfs_node_cache, kn);
120 * kernfs_put - put a reference count on a kernfs_node
121 * @kn: the target kernfs_node
122 @@ -557,16 +571,11 @@ void kernfs_put(struct kernfs_node *kn)
123 if (kernfs_type(kn) == KERNFS_LINK)
124 kernfs_put(kn->symlink.target_kn);
126 - kfree_const(kn->name);
129 - simple_xattrs_free(&kn->iattr->xattrs, NULL);
130 - kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
132 spin_lock(&kernfs_idr_lock);
133 idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
134 spin_unlock(&kernfs_idr_lock);
135 - kmem_cache_free(kernfs_node_cache, kn);
137 + call_rcu(&kn->rcu, kernfs_free_rcu);
141 @@ -575,7 +584,7 @@ void kernfs_put(struct kernfs_node *kn)
143 /* just released the root kn, free @root too */
144 idr_destroy(&root->ino_idr);
146 + kfree_rcu(root, rcu);
149 EXPORT_SYMBOL_GPL(kernfs_put);
150 @@ -715,7 +724,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
151 ino_t ino = kernfs_id_ino(id);
152 u32 gen = kernfs_id_gen(id);
154 - spin_lock(&kernfs_idr_lock);
157 kn = idr_find(&root->ino_idr, (u32)ino);
159 @@ -739,10 +748,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
160 if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count)))
163 - spin_unlock(&kernfs_idr_lock);
167 - spin_unlock(&kernfs_idr_lock);
172 diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
173 index 237f2764b9412..b42ee6547cdc1 100644
174 --- a/fs/kernfs/kernfs-internal.h
175 +++ b/fs/kernfs/kernfs-internal.h
176 @@ -49,6 +49,8 @@ struct kernfs_root {
177 struct rw_semaphore kernfs_rwsem;
178 struct rw_semaphore kernfs_iattr_rwsem;
179 struct rw_semaphore kernfs_supers_rwsem;
181 + struct rcu_head rcu;
184 /* +1 to avoid triggering overflow warning when negating it */
185 diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
186 index 99aaa050ccb76..e857a150babc6 100644
187 --- a/include/linux/kernfs.h
188 +++ b/include/linux/kernfs.h
189 @@ -223,6 +223,8 @@ struct kernfs_node {
190 unsigned short flags;
192 struct kernfs_iattrs *iattr;
194 + struct rcu_head rcu;