From: Al Viro Date: Tue, 6 May 2025 22:48:05 +0000 (-0400) Subject: don't set MNT_LOCKED on parentless mounts X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d08fa7f44ae7f5ad5c842e15f2412790736a6144;p=thirdparty%2Fkernel%2Fstable.git don't set MNT_LOCKED on parentless mounts Originally MNT_LOCKED meant only one thing - "don't let this mount to be peeled off its parent, we don't want to have its mountpoint exposed". Accordingly, it had only been set on mounts that *do* have a parent. Later it got overloaded with another use - setting it on the absolute root had given free protection against umount(2) of absolute root (was possible to trigger, oopsed). Not a bad trick, but it ended up costing more than it bought us. Unfortunately, the cost included both hard-to-reason-about logics and a subtle race between mount -o remount,ro and mount --[r]bind - lockless &= ~MNT_LOCKED in the end of __do_loopback() could race with sb_prepare_remount_readonly() setting and clearing MNT_HOLD_WRITE (under mount_lock, as it should be). The race wouldn't be much of a problem (there are other ways to deal with it), but the subtlety is. Turns out that nobody except umount(2) had ever made use of having MNT_LOCKED set on absolute root. So let's give up on that trick, clever as it had been, add an explicit check in do_umount() and return to using MNT_LOCKED only for mounts that have a parent. It means that * clone_mnt() no longer copies MNT_LOCKED * copy_tree() sets it on submounts if their counterparts had been marked such, and does that right next to attach_mnt() in there, in the same mount_lock scope. * __do_loopback() no longer needs to strip MNT_LOCKED off the root of subtree it's about to return; no store, no race. * init_mount_tree() doesn't bother setting MNT_LOCKED on absolute root. * lock_mnt_tree() does not set MNT_LOCKED on the subtree's root; accordingly, its caller (loop in attach_recursive_mnt()) does not need to bother stripping that MNT_LOCKED on root. Note that lock_mnt_tree() setting MNT_LOCKED on submounts happens in the same mount_lock scope as __attach_mnt() (from commit_tree()) that makes them reachable. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- diff --git a/fs/namespace.c b/fs/namespace.c index 75d45d0b615c7..791904128f1e9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1313,7 +1313,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags; - mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_LOCKED); atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); @@ -1988,6 +1988,9 @@ static int do_umount(struct mount *mnt, int flags) if (mnt->mnt.mnt_flags & MNT_LOCKED) goto out; + if (!mnt_has_parent(mnt)) /* not the absolute root */ + goto out; + event++; if (flags & MNT_DETACH) { if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) @@ -2257,6 +2260,8 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, if (IS_ERR(dst_mnt)) goto out; lock_mount_hash(); + if (src_mnt->mnt.mnt_flags & MNT_LOCKED) + dst_mnt->mnt.mnt_flags |= MNT_LOCKED; list_add_tail(&dst_mnt->mnt_list, &res->mnt_list); attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp); unlock_mount_hash(); @@ -2489,7 +2494,7 @@ static void lock_mnt_tree(struct mount *mnt) if (flags & MNT_NOEXEC) flags |= MNT_LOCK_NOEXEC; /* Don't allow unprivileged users to reveal what is under a mount */ - if (list_empty(&p->mnt_expire)) + if (list_empty(&p->mnt_expire) && p != mnt) flags |= MNT_LOCKED; p->mnt.mnt_flags = flags; } @@ -2704,7 +2709,6 @@ static int attach_recursive_mnt(struct mount *source_mnt, /* Notice when we are propagating across user namespaces */ if (child->mnt_parent->mnt_ns->user_ns != user_ns) lock_mnt_tree(child); - child->mnt.mnt_flags &= ~MNT_LOCKED; q = __lookup_mnt(&child->mnt_parent->mnt, child->mnt_mountpoint); if (q) { @@ -2985,26 +2989,21 @@ static inline bool may_copy_tree(struct path *path) static struct mount *__do_loopback(struct path *old_path, int recurse) { - struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); + struct mount *old = real_mount(old_path->mnt); if (IS_MNT_UNBINDABLE(old)) - return mnt; + return ERR_PTR(-EINVAL); if (!may_copy_tree(old_path)) - return mnt; + return ERR_PTR(-EINVAL); if (!recurse && __has_locked_children(old, old_path->dentry)) - return mnt; + return ERR_PTR(-EINVAL); if (recurse) - mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); + return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); else - mnt = clone_mnt(old, old_path->dentry, 0); - - if (!IS_ERR(mnt)) - mnt->mnt.mnt_flags &= ~MNT_LOCKED; - - return mnt; + return clone_mnt(old, old_path->dentry, 0); } /* @@ -4749,11 +4748,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, if (!path_mounted(&root)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) - goto out4; /* not attached */ + goto out4; /* absolute root */ if (!path_mounted(&new)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) - goto out4; /* not attached */ + goto out4; /* absolute root */ /* make sure we can reach put_old from new_root */ if (!is_path_reachable(old_mnt, old.dentry, &new)) goto out4; @@ -6154,7 +6153,6 @@ static void __init init_mount_tree(void) root.mnt = mnt; root.dentry = mnt->mnt_root; - mnt->mnt_flags |= MNT_LOCKED; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root);