git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
fhandle: fix UAF due to unlocked ->mnt_ns read in may_decode_fh()
author: Jann Horn <jannh@google.com>
Wed, 3 Jun 2026 19:31:57 +0000 (21:31 +0200)
committer: Christian Brauner <brauner@kernel.org>
Thu, 4 Jun 2026 07:39:50 +0000 (09:39 +0200)
may_decode_fh() accesses mount::mnt_ns without holding any locks; that
means the mount can concurrently be unmounted, and the mnt_namespace can
concurrently be freed after an RCU grace period.

This race can happen as follows, assuming that the mount point was
created by open_tree(..., OPEN_TREE_CLONE):

thread 1            thread 2            RCU
                    __do_sys_open_by_handle_at
                      do_handle_open
                        handle_to_path
                          may_decode_fh
                            is_mounted
                              [mount::mnt_ns access]
                            [mount::mnt_ns access]
__do_sys_close
  fput_close_sync
    __fput
      dissolve_on_fput
        umount_tree
        class_namespace_excl_destructor
          namespace_unlock
            free_mnt_ns
              mnt_ns_tree_remove
                call_rcu(mnt_ns_release_rcu)
                                        mnt_ns_release_rcu
                                          mnt_ns_release
                                            kfree
                            [mnt_namespace::user_ns access] **UAF**

Fix it by taking rcu_read_lock() around the mount::mnt_ns access, like
in __prepend_path().
Additionally, document the semantics of mount::mnt_ns, and use WRITE_ONCE()
for writers that can race with lockless readers.

This bug is unreachable unless one of the following is set:

 - CONFIG_PREEMPTION
 - CONFIG_RCU_STRICT_GRACE_PERIOD

because it requires an RCU grace period to happen during a syscall without
an explicit preemption.

This doesn't seem to have interesting security impact; worst-case, it could
leak the result of an integer comparison to userspace (from the level
check in cap_capable()), cause an endless loop, or crash the kernel by
dereferencing an invalid address.

Fixes: 620c266f3949 ("fhandle: relax open_by_handle_at() permission checks")
Cc: stable@vger.kernel.org
Signed-off-by: Jann Horn <jannh@google.com>
Link: https://patch.msgid.link/20260603-vfs-fhandle-uaf-fix-v2-1-d05db76a5084@google.com
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
fs/fhandle.c
fs/mount.h
fs/namespace.c

index 642e3d5694972aa5d2a51ac500da55b650dca843..1ca7eb3a6cb51655d4198e0b931ba55cdf7d819f 100644 (file)
@@ -285,6 +285,19 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path,
        return 0;
 }
 
+static bool capable_wrt_mount(struct mount *mount)
+{
+       struct mnt_namespace *mnt_ns;
+
+       /*
+        * For ->mnt_ns access.
+        * The following READ_ONCE() is semantically rcu_dereference().
+        */
+       guard(rcu)();
+       mnt_ns = READ_ONCE(mount->mnt_ns);
+       return ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN);
+}
+
 static inline int may_decode_fh(struct handle_to_path_ctx *ctx,
                                unsigned int o_flags)
 {
@@ -320,8 +333,7 @@ static inline int may_decode_fh(struct handle_to_path_ctx *ctx,
        if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
                ctx->flags = HANDLE_CHECK_PERMS;
        else if (is_mounted(root->mnt) &&
-                ns_capable(real_mount(root->mnt)->mnt_ns->user_ns,
-                           CAP_SYS_ADMIN) &&
+                capable_wrt_mount(real_mount(root->mnt)) &&
                 !has_locked_children(real_mount(root->mnt), root->dentry))
                ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE;
        else
index e0816c11a1989fdd2aa1c31056032e9e46e3e528..5c120f8361bd80a89722b656283664a75d4f7c1f 100644 (file)
@@ -71,7 +71,15 @@ struct mount {
        struct hlist_head mnt_slave_list;/* list of slave mounts */
        struct hlist_node mnt_slave;    /* slave list entry */
        struct mount *mnt_master;       /* slave is on master->mnt_slave_list */
-       struct mnt_namespace *mnt_ns;   /* containing namespace */
+       /*
+        * Containing namespace (active or deactivating, non-refcounted).
+        * Normally protected by namespace_sem.
+        * Can also be accessed locklessly under RCU. RCU readers can't rely on
+        * the namespace still being active, but implicitly hold a passive
+        * reference (because an RCU delay happens between a namespace being
+        * deactivated and the corresponding passive refcount drop).
+        */
+       struct mnt_namespace *mnt_ns;
        struct mountpoint *mnt_mp;      /* where is it mounted */
        union {
                struct hlist_node mnt_mp_list;  /* list mounts with the same mountpoint */
index fe919abd2f0118594a4ab7e593648c7c629a6544..f5905f4ec56068b1f8f1d4a062897a69f75d95c3 100644 (file)
@@ -1079,7 +1079,7 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
        bool mnt_first_node = true, mnt_last_node = true;
 
        WARN_ON(mnt_ns_attached(mnt));
-       mnt->mnt_ns = ns;
+       WRITE_ONCE(mnt->mnt_ns, ns);
        while (*link) {
                parent = *link;
                if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) {
@@ -1434,7 +1434,7 @@ EXPORT_SYMBOL(mntget);
 void mnt_make_shortterm(struct vfsmount *mnt)
 {
        if (mnt)
-               real_mount(mnt)->mnt_ns = NULL;
+               WRITE_ONCE(real_mount(mnt)->mnt_ns, NULL);
 }
 
 /**
@@ -1806,7 +1806,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
                        ns->nr_mounts--;
                        __touch_mnt_namespace(ns);
                }
-               p->mnt_ns = NULL;
+               WRITE_ONCE(p->mnt_ns, NULL);
                if (how & UMOUNT_SYNC)
                        p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;