may_decode_fh() accesses mount::mnt_ns without holding any locks; that
means the mount can concurrently be unmounted, and the mnt_namespace can
concurrently be freed after an RCU grace period.
This race can happen as follows, assuming that the mount point was
created by open_tree(..., OPEN_TREE_CLONE):
thread 1                    thread 2                         RCU
__do_sys_open_by_handle_at
do_handle_open
handle_to_path
may_decode_fh
is_mounted
[mount::mnt_ns access]
[mount::mnt_ns access]
                            __do_sys_close
                            fput_close_sync
                            __fput
                            dissolve_on_fput
                            umount_tree
                            class_namespace_excl_destructor
                            namespace_unlock
                            free_mnt_ns
                            mnt_ns_tree_remove
                            call_rcu(mnt_ns_release_rcu)
                                                             mnt_ns_release_rcu
                                                             mnt_ns_release
                                                             kfree
[mnt_namespace::user_ns access] **UAF**
Fix it by taking rcu_read_lock() around the mount::mnt_ns access, like
in __prepend_path().
Additionally, document the semantics of mount::mnt_ns, and use WRITE_ONCE()
for writers that can race with lockless readers.
This bug is unreachable unless one of the following is set:
- CONFIG_PREEMPTION
- CONFIG_RCU_STRICT_GRACE_PERIOD
because it requires an RCU grace period to happen during a syscall without
an explicit preemption.
This doesn't seem to have interesting security impact; worst-case, it could
leak the result of an integer comparison to userspace (from the level
check in cap_capable()), cause an endless loop, or crash the kernel by
dereferencing an invalid address.
Fixes: 620c266f3949 ("fhandle: relax open_by_handle_at() permission checks")
Cc: stable@vger.kernel.org
Signed-off-by: Jann Horn <jannh@google.com>
Link: https://patch.msgid.link/20260603-vfs-fhandle-uaf-fix-v2-1-d05db76a5084@google.com
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
return 0;
}
+static bool capable_wrt_mount(struct mount *mount)
+{
+ struct mnt_namespace *mnt_ns;
+
+ /*
+ * For ->mnt_ns access.
+ * The following READ_ONCE() is semantically rcu_dereference().
+ */
+ guard(rcu)();
+ mnt_ns = READ_ONCE(mount->mnt_ns);
+ return ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN);
+}
+
static inline int may_decode_fh(struct handle_to_path_ctx *ctx,
unsigned int o_flags)
{
if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
ctx->flags = HANDLE_CHECK_PERMS;
else if (is_mounted(root->mnt) &&
- ns_capable(real_mount(root->mnt)->mnt_ns->user_ns,
- CAP_SYS_ADMIN) &&
+ capable_wrt_mount(real_mount(root->mnt)) &&
!has_locked_children(real_mount(root->mnt), root->dentry))
ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE;
else
struct hlist_head mnt_slave_list;/* list of slave mounts */
struct hlist_node mnt_slave; /* slave list entry */
struct mount *mnt_master; /* slave is on master->mnt_slave_list */
- struct mnt_namespace *mnt_ns; /* containing namespace */
+ /*
+ * Containing namespace (active or deactivating, non-refcounted).
+ * Normally protected by namespace_sem.
+ * Can also be accessed locklessly under RCU. RCU readers can't rely on
+ * the namespace still being active, but implicitly hold a passive
+ * reference (because an RCU delay happens between a namespace being
+ * deactivated and the corresponding passive refcount drop).
+ */
+ struct mnt_namespace *mnt_ns;
struct mountpoint *mnt_mp; /* where is it mounted */
union {
struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
bool mnt_first_node = true, mnt_last_node = true;
WARN_ON(mnt_ns_attached(mnt));
- mnt->mnt_ns = ns;
+ WRITE_ONCE(mnt->mnt_ns, ns);
while (*link) {
parent = *link;
if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) {
void mnt_make_shortterm(struct vfsmount *mnt)
{
if (mnt)
- real_mount(mnt)->mnt_ns = NULL;
+ WRITE_ONCE(real_mount(mnt)->mnt_ns, NULL);
}
/**
ns->nr_mounts--;
__touch_mnt_namespace(ns);
}
- p->mnt_ns = NULL;
+ WRITE_ONCE(p->mnt_ns, NULL);
if (how & UMOUNT_SYNC)
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;