namespace: allow creating empty mount namespaces

author Christian Brauner <brauner@kernel.org>
Fri, 6 Mar 2026 16:28:37 +0000 (17:28 +0100)

committer Christian Brauner <brauner@kernel.org>
Thu, 12 Mar 2026 12:33:55 +0000 (13:33 +0100)
diff --git a/fs/namespace.c b/fs/namespace.c

index 702e932435055419cea6ed432e44987b743e1ccb..555f0a10de9aaab9feedbc61688b86630c5ea694 100644 (file)
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4233,8 +4233,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
                 struct user_namespace *user_ns, struct fs_struct *new_fs)
  {
         struct mnt_namespace *new_ns;
-       struct vfsmount *rootmnt __free(mntput) = NULL;
-       struct vfsmount *pwdmnt __free(mntput) = NULL;
+       struct path old_root __free(path_put) = {};
+       struct path old_pwd __free(path_put) = {};
         struct mount *p, *q;
         struct mount *old;
         struct mount *new;
@@ -4254,11 +4254,18 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
                 return new_ns;
  
         guard(namespace_excl)();
-       /* First pass: copy the tree topology */
-       copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
+
+       if (flags & CLONE_EMPTY_MNTNS)
+               copy_flags = 0;
+       else
+               copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
         if (user_ns != ns->user_ns)
                 copy_flags |= CL_SLAVE;
-       new = copy_tree(old, old->mnt.mnt_root, copy_flags);
+
+       if (flags & CLONE_EMPTY_MNTNS)
+               new = clone_mnt(old, old->mnt.mnt_root, copy_flags);
+       else
+               new = copy_tree(old, old->mnt.mnt_root, copy_flags);
         if (IS_ERR(new)) {
                 emptied_ns = new_ns;
                 return ERR_CAST(new);
@@ -4269,33 +4276,53 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
         }
         new_ns->root = new;
  
-       /*
-        * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
-        * as belonging to new namespace.  We have already acquired a private
-        * fs_struct, so tsk->fs->lock is not needed.
-        */
-       p = old;
-       q = new;
-       while (p) {
-               mnt_add_to_ns(new_ns, q);
-               new_ns->nr_mounts++;
+       if (flags & CLONE_EMPTY_MNTNS) {
+               /*
+                * Empty mount namespace: only the root mount exists.
+                * Reset root and pwd to the cloned mount's root dentry.
+                */
                 if (new_fs) {
-                       if (&p->mnt == new_fs->root.mnt) {
-                               new_fs->root.mnt = mntget(&q->mnt);
-                               rootmnt = &p->mnt;
-                       }
-                       if (&p->mnt == new_fs->pwd.mnt) {
-                               new_fs->pwd.mnt = mntget(&q->mnt);
-                               pwdmnt = &p->mnt;
+                       old_root = new_fs->root;
+                       old_pwd = new_fs->pwd;
+
+                       new_fs->root.mnt = mntget(&new->mnt);
+                       new_fs->root.dentry = dget(new->mnt.mnt_root);
+
+                       new_fs->pwd.mnt = mntget(&new->mnt);
+                       new_fs->pwd.dentry = dget(new->mnt.mnt_root);
+               }
+               mnt_add_to_ns(new_ns, new);
+               new_ns->nr_mounts++;
+       } else {
+               /*
+                * Full copy: walk old and new trees in parallel, switching
+                * the tsk->fs->* elements and marking new vfsmounts as
+                * belonging to new namespace.  We have already acquired a
+                * private fs_struct, so tsk->fs->lock is not needed.
+                */
+               p = old;
+               q = new;
+               while (p) {
+                       mnt_add_to_ns(new_ns, q);
+                       new_ns->nr_mounts++;
+                       if (new_fs) {
+                               if (&p->mnt == new_fs->root.mnt) {
+                                       old_root.mnt = new_fs->root.mnt;
+                                       new_fs->root.mnt = mntget(&q->mnt);
+                               }
+                               if (&p->mnt == new_fs->pwd.mnt) {
+                                       old_pwd.mnt = new_fs->pwd.mnt;
+                                       new_fs->pwd.mnt = mntget(&q->mnt);
+                               }
                         }
+                       p = next_mnt(p, old);
+                       q = next_mnt(q, new);
+                       if (!q)
+                               break;
+                       // an mntns binding we'd skipped?
+                       while (p->mnt.mnt_root != q->mnt.mnt_root)
+                               p = next_mnt(skip_mnt_tree(p), old);
                 }
-               p = next_mnt(p, old);
-               q = next_mnt(q, new);
-               if (!q)
-                       break;
-               // an mntns binding we'd skipped?
-               while (p->mnt.mnt_root != q->mnt.mnt_root)
-                       p = next_mnt(skip_mnt_tree(p), old);
         }
         ns_tree_add_raw(new_ns);
         return new_ns;
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h

index 359a14cc76a4038aeacef14b2915d5ce60d0cf44..4e76fce9f7770d79b1e7fdb3f7fadf919d95d4cd 100644 (file)
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -36,6 +36,7 @@
  /* Flags for the clone3() syscall. */
  #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
  #define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
+#define CLONE_EMPTY_MNTNS      (1ULL << 37) /* Create an empty mount namespace. */
  
  /*
   * cloning flags intersect with CSIGNAL so can be used with unshare and clone3
@@ -43,6 +44,12 @@
   */
  #define CLONE_NEWTIME  0x00000080      /* New time namespace */
  
+/*
+ * unshare flags share the bit space with clone flags but only apply to the
+ * unshare syscall:
+ */
+#define UNSHARE_EMPTY_MNTNS 0x00100000 /* Unshare an empty mount namespace. */
+
  #ifndef __ASSEMBLY__
  /**
   * struct clone_args - arguments for the clone3 syscall
diff --git a/kernel/fork.c b/kernel/fork.c

index 65113a304518ae73590704034513ba06e708928a..dea6b34544472330391b534635b9c57397b7b7b4 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2619,6 +2619,16 @@ pid_t kernel_clone(struct kernel_clone_args *args)
         int trace = 0;
         pid_t nr;
  
+       /*
+        * Creating an empty mount namespace implies creating a new mount
+        * namespace.  Set this before copy_process() so that the
+        * CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly.
+        */
+       if (clone_flags & CLONE_EMPTY_MNTNS) {
+               clone_flags |= CLONE_NEWNS;
+               args->flags = clone_flags;
+       }
+
         /*
          * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
          * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
@@ -2897,7 +2907,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
  {
         /* Verify that no unknown flags are passed along. */
         if (kargs->flags &
-           ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
+           ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND |
+             CLONE_INTO_CGROUP | CLONE_EMPTY_MNTNS))
                 return false;
  
         /*
@@ -3050,7 +3061,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
                                 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
                                 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
                                 CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
-                               CLONE_NEWTIME))
+                               CLONE_NEWTIME | UNSHARE_EMPTY_MNTNS))
                 return -EINVAL;
         /*
          * Not implemented, but pretend it works if there is nothing
@@ -3149,6 +3160,8 @@ int ksys_unshare(unsigned long unshare_flags)
         /*
          * If unsharing namespace, must also unshare filesystem information.
          */
+       if (unshare_flags & UNSHARE_EMPTY_MNTNS)
+               unshare_flags |= CLONE_NEWNS;
         if (unshare_flags & CLONE_NEWNS)
                 unshare_flags |= CLONE_FS;
  
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c

index 259c4b4f1eeb96288669854b816235dd01d9e5e0..1bdc5be2dd202151077638d37996608ba9e4b370 100644 (file)
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -95,7 +95,8 @@ static struct nsproxy *create_new_namespaces(u64 flags,
         if (!new_nsp)
                 return ERR_PTR(-ENOMEM);
  
-       new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
+       new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns,
+                                     user_ns, new_fs);
         if (IS_ERR(new_nsp->mnt_ns)) {
                 err = PTR_ERR(new_nsp->mnt_ns);
                 goto out_ns;
@@ -212,18 +213,28 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
         struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
  {
         struct user_namespace *user_ns;
+       u64 flags = unshare_flags;
         int err = 0;
  
-       if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
-                              CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
-                              CLONE_NEWTIME)))
+       if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+                      CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
+                      CLONE_NEWTIME)))
                 return 0;
  
         user_ns = new_cred ? new_cred->user_ns : current_user_ns();
         if (!ns_capable(user_ns, CAP_SYS_ADMIN))
                 return -EPERM;
  
-       *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
+       /*
+        * Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases
+        * CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS.
+        */
+       if (flags & UNSHARE_EMPTY_MNTNS) {
+               flags &= ~(u64)UNSHARE_EMPTY_MNTNS;
+               flags |= CLONE_EMPTY_MNTNS;
+       }
+
+       *new_nsp = create_new_namespaces(flags, current, user_ns,
                                          new_fs ? new_fs : current->fs);
         if (IS_ERR(*new_nsp)) {
                 err = PTR_ERR(*new_nsp);
author	Christian Brauner <brauner@kernel.org>
	Fri, 6 Mar 2026 16:28:37 +0000 (17:28 +0100)
committer	Christian Brauner <brauner@kernel.org>
	Thu, 12 Mar 2026 12:33:55 +0000 (13:33 +0100)
fs/namespace.c		patch \| blob \| blame \| history
include/uapi/linux/sched.h		patch \| blob \| blame \| history
kernel/fork.c		patch \| blob \| blame \| history
kernel/nsproxy.c		patch \| blob \| blame \| history