mount: add OPEN_TREE_NAMESPACE

author Christian Brauner <brauner@kernel.org>

Mon, 29 Dec 2025 13:03:24 +0000 (14:03 +0100)

committer Christian Brauner <brauner@kernel.org>

Fri, 16 Jan 2026 18:21:40 +0000 (19:21 +0100)
author Christian Brauner <brauner@kernel.org>
Mon, 29 Dec 2025 13:03:24 +0000 (14:03 +0100)
committer Christian Brauner <brauner@kernel.org>
Fri, 16 Jan 2026 18:21:40 +0000 (19:21 +0100)
diff --git a/fs/internal.h b/fs/internal.h

index e333b105337a80c31ccab3c3ac989340adb1b61b..f6932e639f3694d881ab0c947f5a845d44ba39c9 100644 (file)
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -246,6 +246,7 @@ extern void mnt_pin_kill(struct mount *m);
   */
  extern const struct dentry_operations ns_dentry_operations;
  int open_namespace(struct ns_common *ns);
+struct file *open_namespace_file(struct ns_common *ns);
  
  /*
   * fs/stat.c:
diff --git a/fs/namespace.c b/fs/namespace.c

index ec3b16fedd9f29f9d15d14c4eee92df2da5e7430..59557019e4225f65ec6157b20e3dbe25eff8b793 100644 (file)
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2796,6 +2796,9 @@ static inline void unlock_mount(struct pinned_mountpoint *m)
                 __unlock_mount(m);
  }
  
+static void lock_mount_exact(const struct path *path,
+                            struct pinned_mountpoint *mp);
+
  #define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
         struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
         do_lock_mount((path), &mp, (beneath))
@@ -2946,10 +2949,11 @@ static inline bool may_copy_tree(const struct path *path)
         return check_anonymous_mnt(mnt);
  }
  
-
-static struct mount *__do_loopback(const struct path *old_path, int recurse)
+static struct mount *__do_loopback(const struct path *old_path,
+                                  unsigned int flags, unsigned int copy_flags)
  {
         struct mount *old = real_mount(old_path->mnt);
+       bool recurse = flags & AT_RECURSIVE;
  
         if (IS_MNT_UNBINDABLE(old))
                 return ERR_PTR(-EINVAL);
@@ -2960,10 +2964,22 @@ static struct mount *__do_loopback(const struct path *old_path, int recurse)
         if (!recurse && __has_locked_children(old, old_path->dentry))
                 return ERR_PTR(-EINVAL);
  
+       /*
+        * When creating a new mount namespace we don't want to copy over
+        * mounts of mount namespaces to avoid the risk of cycles and also to
+        * minimize the default complex interdependencies between mount
+        * namespaces.
+        *
+        * We could of course just check whether all mount namespace files
+        * aren't creating cycles but really let's keep this simple.
+        */
+       if (!(flags & OPEN_TREE_NAMESPACE))
+               copy_flags |= CL_COPY_MNT_NS_FILE;
+
         if (recurse)
-               return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
-       else
-               return clone_mnt(old, old_path->dentry, 0);
+               return copy_tree(old, old_path->dentry, copy_flags);
+
+       return clone_mnt(old, old_path->dentry, copy_flags);
  }
  
  /*
@@ -2974,7 +2990,9 @@ static int do_loopback(const struct path *path, const char *old_name,
  {
         struct path old_path __free(path_put) = {};
         struct mount *mnt = NULL;
+       unsigned int flags = recurse ? AT_RECURSIVE : 0;
         int err;
+
         if (!old_name || !*old_name)
                 return -EINVAL;
         err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
@@ -2991,7 +3009,7 @@ static int do_loopback(const struct path *path, const char *old_name,
         if (!check_mnt(mp.parent))
                 return -EINVAL;
  
-       mnt = __do_loopback(&old_path, recurse);
+       mnt = __do_loopback(&old_path, flags, 0);
         if (IS_ERR(mnt))
                 return PTR_ERR(mnt);
  
@@ -3004,7 +3022,7 @@ static int do_loopback(const struct path *path, const char *old_name,
         return err;
  }
  
-static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive)
+static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned int flags)
  {
         struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
         struct user_namespace *user_ns = mnt_ns->user_ns;
@@ -3029,7 +3047,7 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec
                         ns->seq_origin = src_mnt_ns->ns.ns_id;
         }
  
-       mnt = __do_loopback(path, recursive);
+       mnt = __do_loopback(path, flags, 0);
         if (IS_ERR(mnt)) {
                 emptied_ns = ns;
                 return ERR_CAST(mnt);
@@ -3043,9 +3061,9 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec
         return ns;
  }
  
-static struct file *open_detached_copy(struct path *path, bool recursive)
+static struct file *open_detached_copy(struct path *path, unsigned int flags)
  {
-       struct mnt_namespace *ns = get_detached_copy(path, recursive);
+       struct mnt_namespace *ns = get_detached_copy(path, flags);
         struct file *file;
  
         if (IS_ERR(ns))
@@ -3061,21 +3079,122 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
         return file;
  }
  
+DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *,
+           if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T))
+
+static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
+{
+       struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL;
+       struct path to_path __free(path_put) = {};
+       struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+       struct user_namespace *user_ns = current_user_ns();
+       struct mount *new_ns_root;
+       struct mount *mnt;
+       unsigned int copy_flags = 0;
+       bool locked = false;
+
+       if (user_ns != ns->user_ns)
+               copy_flags |= CL_SLAVE;
+
+       new_ns = alloc_mnt_ns(user_ns, false);
+       if (IS_ERR(new_ns))
+               return ERR_CAST(new_ns);
+
+       scoped_guard(namespace_excl) {
+               new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
+               if (IS_ERR(new_ns_root))
+                       return ERR_CAST(new_ns_root);
+
+               /*
+                * If the real rootfs had a locked mount on top of it somewhere
+                * in the stack, lock the new mount tree as well so it can't be
+                * exposed.
+                */
+               mnt = ns->root;
+               while (mnt->overmount) {
+                       mnt = mnt->overmount;
+                       if (mnt->mnt.mnt_flags & MNT_LOCKED)
+                               locked = true;
+               }
+       }
+
+       /*
+        * We dropped the namespace semaphore so we can actually lock
+        * the copy for mounting. The copied mount isn't attached to any
+        * mount namespace and it is thus excluded from any propagation.
+        * So realistically we're isolated and the mount can't be
+        * overmounted.
+        */
+
+       /* Borrow the reference from clone_mnt(). */
+       to_path.mnt = &new_ns_root->mnt;
+       to_path.dentry = dget(new_ns_root->mnt.mnt_root);
+
+       /* Now lock for actual mounting. */
+       LOCK_MOUNT_EXACT(mp, &to_path);
+       if (unlikely(IS_ERR(mp.parent)))
+               return ERR_CAST(mp.parent);
+
+       /*
+        * We don't emulate unshare()ing a mount namespace. We stick to the
+        * restrictions of creating detached bind-mounts. It has a lot
+        * saner and simpler semantics.
+        */
+       mnt = __do_loopback(path, flags, copy_flags);
+       if (IS_ERR(mnt))
+               return ERR_CAST(mnt);
+
+       scoped_guard(mount_writer) {
+               if (locked)
+                       mnt->mnt.mnt_flags |= MNT_LOCKED;
+               /*
+                * Now mount the detached tree on top of the copy of the
+                * real rootfs we created.
+                */
+               attach_mnt(mnt, new_ns_root, mp.mp);
+               if (user_ns != ns->user_ns)
+                       lock_mnt_tree(new_ns_root);
+       }
+
+       /* Add all mounts to the new namespace. */
+       for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) {
+               mnt_add_to_ns(new_ns, p);
+               new_ns->nr_mounts++;
+       }
+
+       new_ns->root = real_mount(no_free_ptr(to_path.mnt));
+       ns_tree_add_raw(new_ns);
+       return no_free_ptr(new_ns);
+}
+
+static struct file *open_new_namespace(struct path *path, unsigned int flags)
+{
+       struct mnt_namespace *new_ns;
+
+       new_ns = create_new_namespace(path, flags);
+       if (IS_ERR(new_ns))
+               return ERR_CAST(new_ns);
+       return open_namespace_file(to_ns_common(new_ns));
+}
+
  static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
  {
         int ret;
         struct path path __free(path_put) = {};
         int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
-       bool detached = flags & OPEN_TREE_CLONE;
  
         BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
  
         if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
                       AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
-                     OPEN_TREE_CLOEXEC))
+                     OPEN_TREE_CLOEXEC | OPEN_TREE_NAMESPACE))
                 return ERR_PTR(-EINVAL);
  
-       if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
+       if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) ==
+           AT_RECURSIVE)
+               return ERR_PTR(-EINVAL);
+
+       if (hweight32(flags & (OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) > 1)
                 return ERR_PTR(-EINVAL);
  
         if (flags & AT_NO_AUTOMOUNT)
@@ -3085,15 +3204,27 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
         if (flags & AT_EMPTY_PATH)
                 lookup_flags |= LOOKUP_EMPTY;
  
-       if (detached && !may_mount())
+       /*
+        * If we create a new mount namespace with the cloned mount tree we
+        * just care about being privileged over our current user namespace.
+        * The new mount namespace will be owned by it.
+        */
+       if ((flags & OPEN_TREE_NAMESPACE) &&
+           !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
+
+       if ((flags & OPEN_TREE_CLONE) && !may_mount())
                 return ERR_PTR(-EPERM);
  
         ret = user_path_at(dfd, filename, lookup_flags, &path);
         if (unlikely(ret))
                 return ERR_PTR(ret);
  
-       if (detached)
-               return open_detached_copy(&path, flags & AT_RECURSIVE);
+       if (flags & OPEN_TREE_NAMESPACE)
+               return open_new_namespace(&path, flags);
+
+       if (flags & OPEN_TREE_CLONE)
+               return open_detached_copy(&path, flags);
  
         return dentry_open(&path, O_PATH, current_cred());
  }
diff --git a/fs/nsfs.c b/fs/nsfs.c

index bf27d5da91f1e71b7df6f9193afd65d594e3385c..db91de2086456bb9fe0547be7dbbd2de87c97ca7 100644 (file)
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -99,6 +99,19 @@ int ns_get_path(struct path *path, struct task_struct *task,
         return ns_get_path_cb(path, ns_get_path_task, &args);
  }
  
+struct file *open_namespace_file(struct ns_common *ns)
+{
+       struct path path __free(path_put) = {};
+       int err;
+
+       /* call first to consume reference */
+       err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
+       if (err < 0)
+               return ERR_PTR(err);
+
+       return dentry_open(&path, O_RDONLY, current_cred());
+}
+
  /**
   * open_namespace - open a namespace
   * @ns: the namespace to open
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h

index 18c62440526888b86a43018027b827e5755e2ec7..d9d86598d100cb328a7854a0551897d23dc76247 100644 (file)
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -61,7 +61,8 @@
  /*
   * open_tree() flags.
   */
-#define OPEN_TREE_CLONE                1               /* Clone the target tree and attach the clone */
+#define OPEN_TREE_CLONE                (1 << 0)        /* Clone the target tree and attach the clone */
+#define OPEN_TREE_NAMESPACE    (1 << 1)        /* Clone the target tree into a new mount namespace */
  #define OPEN_TREE_CLOEXEC      O_CLOEXEC       /* Close the file on execve() */
  
  /*
author	Christian Brauner <brauner@kernel.org>
	Mon, 29 Dec 2025 13:03:24 +0000 (14:03 +0100)
committer	Christian Brauner <brauner@kernel.org>
	Fri, 16 Jan 2026 18:21:40 +0000 (19:21 +0100)
fs/internal.h		patch \| blob \| blame \| history
fs/namespace.c		patch \| blob \| blame \| history
fs/nsfs.c		patch \| blob \| blame \| history
include/uapi/linux/mount.h		patch \| blob \| blame \| history