__unlock_mount(m);
}
+static void lock_mount_exact(const struct path *path,
+ struct pinned_mountpoint *mp);
+
#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
do_lock_mount((path), &mp, (beneath))
return check_anonymous_mnt(mnt);
}
-
-static struct mount *__do_loopback(const struct path *old_path, int recurse)
+/*
+ * Clone the mount (sub)tree rooted at @old_path. @flags carries the caller's
+ * AT_RECURSIVE and OPEN_TREE_NAMESPACE bits; @copy_flags is passed through to
+ * copy_tree()/clone_mnt() (e.g. CL_SLAVE supplied by the caller).
+ * Returns the cloned tree or an ERR_PTR().
+ */
+static struct mount *__do_loopback(const struct path *old_path,
+ unsigned int flags, unsigned int copy_flags)
{
struct mount *old = real_mount(old_path->mnt);
+ bool recurse = flags & AT_RECURSIVE;
if (IS_MNT_UNBINDABLE(old))
return ERR_PTR(-EINVAL);
if (!recurse && __has_locked_children(old, old_path->dentry))
return ERR_PTR(-EINVAL);
+ /*
+ * When creating a new mount namespace we don't want to copy over
+ * mounts of mount namespaces to avoid the risk of cycles and also to
+ * minimize the default complex interdependencies between mount
+ * namespaces.
+ *
+ * We could of course just check whether all mount namespace files aren't
+ * creating cycles but really let's keep this simple.
+ */
+ if (!(flags & OPEN_TREE_NAMESPACE))
+ copy_flags |= CL_COPY_MNT_NS_FILE;
+
if (recurse)
- return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
- else
- return clone_mnt(old, old_path->dentry, 0);
+ return copy_tree(old, old_path->dentry, copy_flags);
+
+ return clone_mnt(old, old_path->dentry, copy_flags);
}
/*
{
struct path old_path __free(path_put) = {};
struct mount *mnt = NULL;
+ unsigned int flags = recurse ? AT_RECURSIVE : 0;
int err;
+
if (!old_name || !*old_name)
return -EINVAL;
err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
if (!check_mnt(mp.parent))
return -EINVAL;
- mnt = __do_loopback(&old_path, recurse);
+ mnt = __do_loopback(&old_path, flags, 0);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
return err;
}
-static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive)
+static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned int flags)
{
struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
struct user_namespace *user_ns = mnt_ns->user_ns;
ns->seq_origin = src_mnt_ns->ns.ns_id;
}
- mnt = __do_loopback(path, recursive);
+ mnt = __do_loopback(path, flags, 0);
if (IS_ERR(mnt)) {
emptied_ns = ns;
return ERR_CAST(mnt);
return ns;
}
-static struct file *open_detached_copy(struct path *path, bool recursive)
+static struct file *open_detached_copy(struct path *path, unsigned int flags)
{
- struct mnt_namespace *ns = get_detached_copy(path, recursive);
+ struct mnt_namespace *ns = get_detached_copy(path, flags);
struct file *file;
if (IS_ERR(ns))
return file;
}
+/* Cleanup helper: free a not-yet-published mount namespace on error paths. */
+DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *,
+ if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T))
+
+/*
+ * Implement OPEN_TREE_NAMESPACE: clone the mount tree at @path onto a fresh
+ * copy of the caller's rootfs and return a new, fully populated mount
+ * namespace owned by the caller's user namespace, or an ERR_PTR().
+ */
+static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
+{
+ struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL;
+ struct path to_path __free(path_put) = {};
+ struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+ struct user_namespace *user_ns = current_user_ns();
+ struct mount *new_ns_root;
+ struct mount *mnt;
+ unsigned int copy_flags = 0;
+ bool locked = false;
+
+ /* New ns owned by a different userns: make the copy a slave of the source. */
+ if (user_ns != ns->user_ns)
+ copy_flags |= CL_SLAVE;
+
+ new_ns = alloc_mnt_ns(user_ns, false);
+ if (IS_ERR(new_ns))
+ return ERR_CAST(new_ns);
+
+ scoped_guard(namespace_excl) {
+ new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
+ if (IS_ERR(new_ns_root))
+ return ERR_CAST(new_ns_root);
+
+ /*
+ * If the real rootfs had a locked mount on top of it somewhere
+ * in the stack, lock the new mount tree as well so it can't be
+ * exposed.
+ */
+ mnt = ns->root;
+ while (mnt->overmount) {
+ mnt = mnt->overmount;
+ if (mnt->mnt.mnt_flags & MNT_LOCKED)
+ locked = true;
+ }
+ }
+
+ /*
+ * We dropped the namespace semaphore so we can actually lock
+ * the copy for mounting. The copied mount isn't attached to any
+ * mount namespace and it is thus excluded from any propagation.
+ * So realistically we're isolated and the mount can't be
+ * overmounted.
+ */
+
+ /* Borrow the reference from clone_mnt(). */
+ to_path.mnt = &new_ns_root->mnt;
+ to_path.dentry = dget(new_ns_root->mnt.mnt_root);
+
+ /* Now lock for actual mounting. */
+ LOCK_MOUNT_EXACT(mp, &to_path);
+ if (unlikely(IS_ERR(mp.parent)))
+ return ERR_CAST(mp.parent);
+
+ /*
+ * We don't emulate unshare()ing a mount namespace. We stick to the
+ * restrictions of creating detached bind-mounts. It has much
+ * saner and simpler semantics.
+ */
+ mnt = __do_loopback(path, flags, copy_flags);
+ if (IS_ERR(mnt))
+ return ERR_CAST(mnt);
+
+ scoped_guard(mount_writer) {
+ if (locked)
+ mnt->mnt.mnt_flags |= MNT_LOCKED;
+ /*
+ * Now mount the detached tree on top of the copy of the
+ * real rootfs we created.
+ */
+ attach_mnt(mnt, new_ns_root, mp.mp);
+ if (user_ns != ns->user_ns)
+ lock_mnt_tree(new_ns_root);
+ }
+
+ /* Add all mounts to the new namespace. */
+ for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) {
+ mnt_add_to_ns(new_ns, p);
+ new_ns->nr_mounts++;
+ }
+
+ /* Transfer the root reference out of to_path so __free() won't drop it. */
+ new_ns->root = real_mount(no_free_ptr(to_path.mnt));
+ ns_tree_add_raw(new_ns);
+ return no_free_ptr(new_ns);
+}
+
+/* Build the OPEN_TREE_NAMESPACE mount namespace and return it as an nsfs file. */
+static struct file *open_new_namespace(struct path *path, unsigned int flags)
+{
+ struct mnt_namespace *new_ns;
+
+ new_ns = create_new_namespace(path, flags);
+ if (IS_ERR(new_ns))
+ return ERR_CAST(new_ns);
+ return open_namespace_file(to_ns_common(new_ns));
+}
+
static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
{
int ret;
struct path path __free(path_put) = {};
int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
- bool detached = flags & OPEN_TREE_CLONE;
BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
- OPEN_TREE_CLOEXEC))
+ OPEN_TREE_CLOEXEC | OPEN_TREE_NAMESPACE))
return ERR_PTR(-EINVAL);
- if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
+ /* AT_RECURSIVE is only meaningful when some form of copy is requested. */
+ if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) ==
+ AT_RECURSIVE)
+ return ERR_PTR(-EINVAL);
+
+ /* OPEN_TREE_CLONE and OPEN_TREE_NAMESPACE are mutually exclusive. */
+ if (hweight32(flags & (OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) > 1)
return ERR_PTR(-EINVAL);
if (flags & AT_NO_AUTOMOUNT)
if (flags & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
- if (detached && !may_mount())
+ /*
+ * If we create a new mount namespace with the cloned mount tree we
+ * just care about being privileged over our current user namespace.
+ * The new mount namespace will be owned by it.
+ */
+ if ((flags & OPEN_TREE_NAMESPACE) &&
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ if ((flags & OPEN_TREE_CLONE) && !may_mount())
return ERR_PTR(-EPERM);
ret = user_path_at(dfd, filename, lookup_flags, &path);
if (unlikely(ret))
return ERR_PTR(ret);
- if (detached)
- return open_detached_copy(&path, flags & AT_RECURSIVE);
+ if (flags & OPEN_TREE_NAMESPACE)
+ return open_new_namespace(&path, flags);
+
+ if (flags & OPEN_TREE_CLONE)
+ return open_detached_copy(&path, flags);
return dentry_open(&path, O_PATH, current_cred());
}