]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
kernfs: RCU protect kernfs_nodes and avoid kernfs_idr_lock in kernfs_find_and_get_nod...
authorTejun Heo <tj@kernel.org>
Tue, 9 Jan 2024 21:48:04 +0000 (11:48 -1000)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 30 Jan 2024 23:54:25 +0000 (15:54 -0800)
The BPF helper bpf_cgroup_from_id() calls kernfs_find_and_get_node_by_id()
which acquires kernfs_idr_lock, which is an non-raw non-IRQ-safe lock. This
can lead to deadlocks as bpf_cgroup_from_id() can be called from any BPF
programs including e.g. the ones that attach to functions which are holding
the scheduler rq lock.

Consider the following BPF program:

  SEC("fentry/__set_cpus_allowed_ptr_locked")
  int BPF_PROG(__set_cpus_allowed_ptr_locked, struct task_struct *p,
       struct affinity_context *affn_ctx, struct rq *rq, struct rq_flags *rf)
  {
  struct cgroup *cgrp = bpf_cgroup_from_id(p->cgroups->dfl_cgrp->kn->id);

  if (cgrp) {
  bpf_printk("%d[%s] in %s", p->pid, p->comm, cgrp->kn->name);
  bpf_cgroup_release(cgrp);
  }
  return 0;
  }

__set_cpus_allowed_ptr_locked() is called with rq lock held and the above
BPF program calls bpf_cgroup_from_id() within leading to the following
lockdep warning:

  =====================================================
  WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected
  6.7.0-rc3-work-00053-g07124366a1d7-dirty #147 Not tainted
  -----------------------------------------------------
  repro/1620 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
  ffffffff833b3688 (kernfs_idr_lock){+.+.}-{2:2}, at: kernfs_find_and_get_node_by_id+0x1e/0x70

and this task is already holding:
  ffff888237ced698 (&rq->__lock){-.-.}-{2:2}, at: task_rq_lock+0x4e/0xf0
  which would create a new lock dependency:
   (&rq->__lock){-.-.}-{2:2} -> (kernfs_idr_lock){+.+.}-{2:2}
  ...
   Possible interrupt unsafe locking scenario:

 CPU0                    CPU1
 ----                    ----
    lock(kernfs_idr_lock);
 local_irq_disable();
 lock(&rq->__lock);
 lock(kernfs_idr_lock);
    <Interrupt>
      lock(&rq->__lock);

 *** DEADLOCK ***
  ...
  Call Trace:
   dump_stack_lvl+0x55/0x70
   dump_stack+0x10/0x20
   __lock_acquire+0x781/0x2a40
   lock_acquire+0xbf/0x1f0
   _raw_spin_lock+0x2f/0x40
   kernfs_find_and_get_node_by_id+0x1e/0x70
   cgroup_get_from_id+0x21/0x240
   bpf_cgroup_from_id+0xe/0x20
   bpf_prog_98652316e9337a5a___set_cpus_allowed_ptr_locked+0x96/0x11a
   bpf_trampoline_6442545632+0x4f/0x1000
   __set_cpus_allowed_ptr_locked+0x5/0x5a0
   sched_setaffinity+0x1b3/0x290
   __x64_sys_sched_setaffinity+0x4f/0x60
   do_syscall_64+0x40/0xe0
   entry_SYSCALL_64_after_hwframe+0x46/0x4e

Let's fix it by protecting kernfs_node and kernfs_root with RCU and making
kernfs_find_and_get_node_by_id() acquire rcu_read_lock() instead of
kernfs_idr_lock.

This adds an rcu_head to kernfs_node making it larger by 16 bytes on 64bit.
Combined with the preceding rearrange patch, the net increase is 8 bytes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andrea Righi <andrea.righi@canonical.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20240109214828.252092-4-tj@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
fs/kernfs/dir.c
fs/kernfs/kernfs-internal.h
include/linux/kernfs.h

index bce1d7ac95caaa6ae48ba62c094d43c9da27298e..458519e416fe75e97cc454a716d29c06b3586b56 100644 (file)
@@ -529,6 +529,20 @@ void kernfs_get(struct kernfs_node *kn)
 }
 EXPORT_SYMBOL_GPL(kernfs_get);
 
+static void kernfs_free_rcu(struct rcu_head *rcu)
+{
+       struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu);
+
+       kfree_const(kn->name);
+
+       if (kn->iattr) {
+               simple_xattrs_free(&kn->iattr->xattrs, NULL);
+               kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
+       }
+
+       kmem_cache_free(kernfs_node_cache, kn);
+}
+
 /**
  * kernfs_put - put a reference count on a kernfs_node
  * @kn: the target kernfs_node
@@ -557,16 +571,11 @@ void kernfs_put(struct kernfs_node *kn)
        if (kernfs_type(kn) == KERNFS_LINK)
                kernfs_put(kn->symlink.target_kn);
 
-       kfree_const(kn->name);
-
-       if (kn->iattr) {
-               simple_xattrs_free(&kn->iattr->xattrs, NULL);
-               kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
-       }
        spin_lock(&kernfs_idr_lock);
        idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
        spin_unlock(&kernfs_idr_lock);
-       kmem_cache_free(kernfs_node_cache, kn);
+
+       call_rcu(&kn->rcu, kernfs_free_rcu);
 
        kn = parent;
        if (kn) {
@@ -575,7 +584,7 @@ void kernfs_put(struct kernfs_node *kn)
        } else {
                /* just released the root kn, free @root too */
                idr_destroy(&root->ino_idr);
-               kfree(root);
+               kfree_rcu(root, rcu);
        }
 }
 EXPORT_SYMBOL_GPL(kernfs_put);
@@ -715,7 +724,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
        ino_t ino = kernfs_id_ino(id);
        u32 gen = kernfs_id_gen(id);
 
-       spin_lock(&kernfs_idr_lock);
+       rcu_read_lock();
 
        kn = idr_find(&root->ino_idr, (u32)ino);
        if (!kn)
@@ -739,10 +748,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
        if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count)))
                goto err_unlock;
 
-       spin_unlock(&kernfs_idr_lock);
+       rcu_read_unlock();
        return kn;
 err_unlock:
-       spin_unlock(&kernfs_idr_lock);
+       rcu_read_unlock();
        return NULL;
 }
 
index 237f2764b9412d65f2db6be01329d5565169a327..b42ee6547cdc1cfb1be7bb30ff890ce2d91e0777 100644 (file)
@@ -49,6 +49,8 @@ struct kernfs_root {
        struct rw_semaphore     kernfs_rwsem;
        struct rw_semaphore     kernfs_iattr_rwsem;
        struct rw_semaphore     kernfs_supers_rwsem;
+
+       struct rcu_head         rcu;
 };
 
 /* +1 to avoid triggering overflow warning when negating it */
index 82e1ce79a70c3a58877a40c8d2f4d53570bedfbe..87c79d076d6d7396c6da4efc5873a49a5feac873 100644 (file)
@@ -223,6 +223,8 @@ struct kernfs_node {
 
        void                    *priv;
        struct kernfs_iattrs    *iattr;
+
+       struct rcu_head         rcu;
 };
 
 /*