]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
selftests/filesystems: add clone3 tests for empty mount namespaces
author: Christian Brauner <brauner@kernel.org>
Fri, 6 Mar 2026 16:28:39 +0000 (17:28 +0100)
committer: Christian Brauner <brauner@kernel.org>
Thu, 12 Mar 2026 12:33:55 +0000 (13:33 +0100)
Add a test suite for the CLONE_EMPTY_MNTNS flag exercising the empty
mount namespace functionality through the clone3() syscall.

The clone3() code path is distinct from the unshare() path already
tested in empty_mntns_test.c.  With clone3(), CLONE_EMPTY_MNTNS
(0x400000000ULL) is a 64-bit flag that implies CLONE_NEWNS.  The
implication happens in kernel_clone() before copy_process(), unlike
unshare() where it goes through UNSHARE_EMPTY_MNTNS to
CLONE_EMPTY_MNTNS conversion in unshare_nsproxy_namespaces().

The tests cover:

- basic functionality: clone3 child gets empty mount namespace with
  exactly one mount, root and cwd point to the same mount
- CLONE_NEWNS implication: CLONE_EMPTY_MNTNS works without explicit
  CLONE_NEWNS, also works with redundant CLONE_NEWNS
- flag interactions: combines correctly with CLONE_NEWUSER,
  CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_PIDFD
- mutual exclusion: CLONE_EMPTY_MNTNS | CLONE_FS returns EINVAL
  because the implied CLONE_NEWNS conflicts with CLONE_FS
- error paths: EPERM without capabilities, unknown 64-bit flags
  rejected
- parent isolation: parent mount namespace is unchanged after clone
- many parent mounts: child still gets exactly one mount
- mount properties: root mount is nullfs, is its own parent, is the
  only listmount entry
- overmount workflow: child can mount tmpfs over nullfs root to build
  a writable filesystem from scratch
- repeated clone3: each child gets a distinct mount namespace
- setns: parent can join child's empty mount namespace via setns()
- regression: plain CLONE_NEWNS via clone3 still copies the full
  mount tree

Link: https://patch.msgid.link/20260306-work-empty-mntns-consolidated-v1-3-6eb30529bbb0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
tools/testing/selftests/filesystems/empty_mntns/.gitignore
tools/testing/selftests/filesystems/empty_mntns/Makefile
tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c [new file with mode: 0644]

index 48054440b7e1ca357ea840f8aa435d6090de044e..99f89d329db2405f0902366cf3e765543207f6ef 100644 (file)
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
+clone3_empty_mntns_test
 empty_mntns_test
 overmount_chroot_test
index 5d4cffa4c4ae6fb1fe2d2112cf3162d353b64995..22e3fb915e812d195d0dc72c557ec33e8f9c9846 100644 (file)
@@ -3,9 +3,10 @@
 CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
 LDLIBS += -lcap
 
-TEST_GEN_PROGS := empty_mntns_test overmount_chroot_test
+TEST_GEN_PROGS := empty_mntns_test overmount_chroot_test clone3_empty_mntns_test
 
 include ../../lib.mk
 
 $(OUTPUT)/empty_mntns_test: ../utils.c
 $(OUTPUT)/overmount_chroot_test: ../utils.c
+$(OUTPUT)/clone3_empty_mntns_test: ../utils.c
diff --git a/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c b/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c
new file mode 100644 (file)
index 0000000..130cc1a
--- /dev/null
@@ -0,0 +1,938 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Tests for empty mount namespace creation via clone3() CLONE_EMPTY_MNTNS
+ *
+ * These tests exercise the clone3() code path for creating empty mount
+ * namespaces, which is distinct from the unshare() path tested in
+ * empty_mntns_test.c.  With clone3(), CLONE_EMPTY_MNTNS (0x400000000ULL)
+ * is a 64-bit flag that implies CLONE_NEWNS.  The implication happens in
+ * kernel_clone() before copy_process(), unlike unshare() where it goes
+ * through UNSHARE_EMPTY_MNTNS -> CLONE_EMPTY_MNTNS conversion in
+ * unshare_nsproxy_namespaces().
+ *
+ * Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/mount.h>
+#include <linux/stat.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../utils.h"
+#include "../wrappers.h"
+#include "clone3/clone3_selftests.h"
+#include "empty_mntns.h"
+#include "kselftest_harness.h"
+
+/*
+ * Issue clone3() requesting CLONE_EMPTY_MNTNS plus any caller-supplied
+ * @extra_flags, with SIGCHLD as the exit signal.  The sys_clone3() return
+ * value is handed back unmodified.
+ */
+static pid_t clone3_empty_mntns(uint64_t extra_flags)
+{
+       struct __clone_args clone_args = { 0 };
+
+       clone_args.flags = CLONE_EMPTY_MNTNS | extra_flags;
+       clone_args.exit_signal = SIGCHLD;
+
+       return sys_clone3(&clone_args, sizeof(clone_args));
+}
+
+/*
+ * Probe whether clone3() accepts CLONE_EMPTY_MNTNS.
+ *
+ * A forked helper calls enter_userns() and then clone3_empty_mntns(0).
+ * Returns true only if that helper exits normally with status 0, i.e.
+ * enter_userns() and clone3() both succeeded and wait_for_pid() on the
+ * clone3 child returned 0.  Any fork()/waitpid() problem yields false.
+ */
+static bool clone3_empty_mntns_supported(void)
+{
+       int wstatus;
+       pid_t helper;
+
+       helper = fork();
+       if (helper < 0)
+               return false;
+
+       if (helper == 0) {
+               pid_t probe;
+
+               if (enter_userns())
+                       _exit(1);
+
+               probe = clone3_empty_mntns(0);
+               if (probe < 0)
+                       _exit(1);
+               if (probe == 0)
+                       _exit(0);
+
+               _exit(wait_for_pid(probe) ? 1 : 0);
+       }
+
+       if (waitpid(helper, &wstatus, 0) != helper)
+               return false;
+
+       return WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0;
+}
+
+/* The fixture carries no per-test state. */
+FIXTURE(clone3_empty_mntns) {};
+
+/* Skip each fixture test when the clone3(CLONE_EMPTY_MNTNS) probe fails. */
+FIXTURE_SETUP(clone3_empty_mntns)
+{
+       bool supported = clone3_empty_mntns_supported();
+
+       if (!supported)
+               SKIP(return, "CLONE_EMPTY_MNTNS via clone3 not supported");
+}
+
+/* Nothing to tear down. */
+FIXTURE_TEARDOWN(clone3_empty_mntns) {}
+
+/*
+ * Smoke test: a clone3(CLONE_EMPTY_MNTNS) child must see exactly one mount,
+ * and "/" and "." must resolve to the same unique mount ID.
+ */
+TEST_F(clone3_empty_mntns, basic)
+{
+       pid_t helper;
+
+       helper = fork();
+       ASSERT_GE(helper, 0);
+
+       if (helper == 0) {
+               uint64_t root_mnt, cwd_mnt;
+               pid_t child;
+
+               if (enter_userns())
+                       _exit(1);
+
+               child = clone3_empty_mntns(0);
+               if (child < 0)
+                       _exit(2);
+
+               /* Intermediate process: forward the child's result. */
+               if (child > 0)
+                       _exit(wait_for_pid(child));
+
+               /* From here on we are the clone3 child. */
+               if (count_mounts() != 1)
+                       _exit(3);
+
+               root_mnt = get_unique_mnt_id("/");
+               cwd_mnt = get_unique_mnt_id(".");
+               if (!root_mnt || !cwd_mnt)
+                       _exit(4);
+
+               _exit(root_mnt == cwd_mnt ? 0 : 5);
+       }
+
+       ASSERT_EQ(wait_for_pid(helper), 0);
+}
+
+/*
+ * CLONE_EMPTY_MNTNS on its own must be sufficient: it implies CLONE_NEWNS,
+ * so no explicit CLONE_NEWNS is passed here (tests fork.c:2627-2630).
+ */
+TEST_F(clone3_empty_mntns, implies_newns)
+{
+       pid_t helper;
+
+       helper = fork();
+       ASSERT_GE(helper, 0);
+
+       if (helper == 0) {
+               ssize_t nr_before;
+               pid_t child;
+
+               if (enter_userns())
+                       _exit(1);
+
+               /* The calling namespace must have mounts to begin with. */
+               nr_before = count_mounts();
+               if (nr_before < 1)
+                       _exit(2);
+
+               /* Deliberately no CLONE_NEWNS among the extra flags. */
+               child = clone3_empty_mntns(0);
+               if (child < 0)
+                       _exit(3);
+
+               if (child == 0)
+                       _exit(count_mounts() == 1 ? 0 : 4);
+
+               /* The caller's own mount count must not have changed. */
+               if (count_mounts() != nr_before)
+                       _exit(5);
+
+               _exit(wait_for_pid(child));
+       }
+
+       ASSERT_EQ(wait_for_pid(helper), 0);
+}
+
+/*
+ * Helper macro: generate a test that clones with CLONE_EMPTY_MNTNS |
+ * @extra_flags and verifies the child has exactly one mount.
+ *
+ * @test_name becomes the TEST_F() name within the clone3_empty_mntns fixture.
+ * The forked intermediate process exits with 1 if enter_userns() fails, with
+ * 2 if clone3() fails, and otherwise with the wait_for_pid() result for the
+ * clone3 child; that child exits with 3 when count_mounts() is not 1.
+ */
+#define TEST_CLONE3_FLAGS(test_name, extra_flags)                      \
+TEST_F(clone3_empty_mntns, test_name)                                  \
+{                                                                      \
+       pid_t pid, inner;                                               \
+                                                                       \
+       pid = fork();                                                   \
+       ASSERT_GE(pid, 0);                                              \
+                                                                       \
+       if (pid == 0) {                                                 \
+               if (enter_userns())                                     \
+                       _exit(1);                                       \
+                                                                       \
+               inner = clone3_empty_mntns(extra_flags);                \
+               if (inner < 0)                                          \
+                       _exit(2);                                       \
+                                                                       \
+               if (inner == 0) {                                       \
+                       if (count_mounts() != 1)                        \
+                               _exit(3);                               \
+                       _exit(0);                                       \
+               }                                                       \
+                                                                       \
+               _exit(wait_for_pid(inner));                             \
+       }                                                               \
+                                                                       \
+       ASSERT_EQ(wait_for_pid(pid), 0);                                \
+}
+
+/*
+ * Redundant CLONE_NEWNS | CLONE_EMPTY_MNTNS should succeed; the child must
+ * still see exactly one mount.
+ */
+TEST_CLONE3_FLAGS(with_explicit_newns, CLONE_NEWNS)
+
+/*
+ * CLONE_EMPTY_MNTNS combined with CLONE_NEWUSER; the child must see exactly
+ * one mount.
+ */
+TEST_CLONE3_FLAGS(with_newuser, CLONE_NEWUSER)
+
+/*
+ * CLONE_EMPTY_MNTNS combined with other namespace flags (CLONE_NEWUTS and
+ * CLONE_NEWIPC); the child must see exactly one mount.
+ */
+TEST_CLONE3_FLAGS(with_other_ns_flags, CLONE_NEWUTS | CLONE_NEWIPC)
+
+/*
+ * CLONE_EMPTY_MNTNS together with CLONE_NEWPID: the child must see exactly
+ * one mount and must observe itself as PID 1.
+ */
+TEST_F(clone3_empty_mntns, with_newpid)
+{
+       pid_t helper;
+
+       helper = fork();
+       ASSERT_GE(helper, 0);
+
+       if (helper == 0) {
+               pid_t child;
+
+               if (enter_userns())
+                       _exit(1);
+
+               child = clone3_empty_mntns(CLONE_NEWPID);
+               if (child < 0)
+                       _exit(2);
+
+               /* Intermediate process: forward the child's result. */
+               if (child > 0)
+                       _exit(wait_for_pid(child));
+
+               if (count_mounts() != 1)
+                       _exit(3);
+
+               /* In a new PID namespace, getpid() returns 1. */
+               _exit(getpid() == 1 ? 0 : 4);
+       }
+
+       ASSERT_EQ(wait_for_pid(helper), 0);
+}
+
+/*
+ * CLONE_EMPTY_MNTNS | CLONE_FS must be refused with EINVAL: the implied
+ * CLONE_NEWNS and CLONE_FS are mutually exclusive (fork.c:1981).
+ */
+TEST_F(clone3_empty_mntns, with_clone_fs_fails)
+{
+       pid_t helper;
+
+       helper = fork();
+       ASSERT_GE(helper, 0);
+
+       if (helper == 0) {
+               struct __clone_args args = { 0 };
+               pid_t ret;
+
+               args.flags = CLONE_EMPTY_MNTNS | CLONE_FS;
+               args.exit_signal = SIGCHLD;
+
+               if (enter_userns())
+                       _exit(1);
+
+               ret = sys_clone3(&args, sizeof(args));
+
+               /* An unexpectedly created child just goes away quietly. */
+               if (ret == 0)
+                       _exit(0);
+
+               /* clone3() succeeding is the failure case: reap and report. */
+               if (ret > 0) {
+                       wait_for_pid(ret);
+                       _exit(2);
+               }
+
+               _exit(errno == EINVAL ? 0 : 3);
+       }
+
+       ASSERT_EQ(wait_for_pid(helper), 0);
+}
+
+/*
+ * CLONE_EMPTY_MNTNS together with CLONE_PIDFD: the caller must receive a
+ * non-negative pidfd and the child must see exactly one mount.
+ */
+TEST_F(clone3_empty_mntns, with_pidfd)
+{
+       pid_t helper;
+
+       helper = fork();
+       ASSERT_GE(helper, 0);
+
+       if (helper == 0) {
+               int child_pidfd = -1;
+               struct __clone_args args = {
+                       .flags          = CLONE_EMPTY_MNTNS | CLONE_PIDFD,
+                       .pidfd          = (uintptr_t)&child_pidfd,
+                       .exit_signal    = SIGCHLD,
+               };
+               pid_t child;
+
+               if (enter_userns())
+                       _exit(1);
+
+               child = sys_clone3(&args, sizeof(args));
+               if (child < 0)
+                       _exit(2);
+
+               if (child == 0)
+                       _exit(count_mounts() == 1 ? 0 : 3);
+
+               /* clone3() must have stored a pidfd through args.pidfd. */
+               if (child_pidfd < 0)
+                       _exit(4);
+               close(child_pidfd);
+
+               _exit(wait_for_pid(child));
+       }
+
+       ASSERT_EQ(wait_for_pid(helper), 0);
+}
+
+/*
+ * clone3 without CAP_SYS_ADMIN must fail with EPERM.
+ *
+ * The check is only meaningful for an unprivileged caller, so when run as
+ * root the test is reported as skipped instead of silently passing without
+ * having exercised anything.
+ *
+ * Child exit codes: 1 if clone3() unexpectedly succeeded, 2 if clone3()
+ * failed with an errno other than EPERM.
+ */
+TEST_F(clone3_empty_mntns, eperm_without_caps)
+{
+       pid_t pid;
+
+       if (getuid() == 0)
+               SKIP(return, "test requires an unprivileged user");
+
+       pid = fork();
+       ASSERT_GE(pid, 0);
+
+       if (pid == 0) {
+               pid_t ret;
+
+               /* No enter_userns() here: the call is made unprivileged. */
+               ret = clone3_empty_mntns(0);
+               if (ret >= 0) {
+                       if (ret == 0)
+                               _exit(0);
+                       wait_for_pid(ret);
+                       _exit(1);
+               }
+
+               if (errno != EPERM)
+                       _exit(2);
+
+               _exit(0);
+       }
+
+       ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Creating a child with CLONE_EMPTY_MNTNS must leave the caller's own mount
+ * namespace alone: its mount count is the same before and after.
+ */
+TEST_F(clone3_empty_mntns, parent_unchanged)
+{
+       pid_t helper;
+
+       helper = fork();
+       ASSERT_GE(helper, 0);
+
+       if (helper == 0) {
+               ssize_t mounts_before;
+               pid_t child;
+
+               if (enter_userns())
+                       _exit(1);
+
+               mounts_before = count_mounts();
+               if (mounts_before < 1)
+                       _exit(2);
+
+               child = clone3_empty_mntns(0);
+               if (child < 0)
+                       _exit(3);
+
+               /* The child has nothing to check; it only needs to exist. */
+               if (child == 0)
+                       _exit(0);
+
+               if (wait_for_pid(child))
+                       _exit(4);
+
+               _exit(count_mounts() == mounts_before ? 0 : 5);
+       }
+
+       ASSERT_EQ(wait_for_pid(helper), 0);
+}
+
+/*
+ * Parent with many mounts: child still gets exactly 1 mount.
+ *
+ * The scratch directory created under /tmp outlives the private mount
+ * namespace, so once it exists every exit path of the intermediate process
+ * unmounts whatever was mounted on it and removes it again.
+ *
+ * Exit codes: 1 enter_userns(), 2 unshare(), 3 making "/" private,
+ * 4 mkdtemp(), 5 tmpfs mount, 6 mkdir(), 7 bind mount, 8 fewer than five
+ * mounts in the parent, 9 clone3(), 10 child does not have exactly one mount.
+ */
+TEST_F(clone3_empty_mntns, many_parent_mounts)
+{
+       pid_t pid;
+
+       pid = fork();
+       ASSERT_GE(pid, 0);
+
+       if (pid == 0) {
+               char tmpdir[] = "/tmp/clone3_mntns_test.XXXXXX";
+               pid_t inner;
+               int ret;
+               int i;
+
+               if (enter_userns())
+                       _exit(1);
+
+               if (unshare(CLONE_NEWNS))
+                       _exit(2);
+
+               if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+                       _exit(3);
+
+               if (!mkdtemp(tmpdir))
+                       _exit(4);
+
+               if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M")) {
+                       ret = 5;
+                       goto out_rmdir;
+               }
+
+               for (i = 0; i < 5; i++) {
+                       char subdir[256];
+
+                       snprintf(subdir, sizeof(subdir), "%s/sub%d", tmpdir, i);
+                       if (mkdir(subdir, 0755) && errno != EEXIST) {
+                               ret = 6;
+                               goto out_umount;
+                       }
+                       if (mount(subdir, subdir, NULL, MS_BIND, NULL)) {
+                               ret = 7;
+                               goto out_umount;
+                       }
+               }
+
+               if (count_mounts() < 5) {
+                       ret = 8;
+                       goto out_umount;
+               }
+
+               inner = clone3_empty_mntns(0);
+               if (inner < 0) {
+                       ret = 9;
+                       goto out_umount;
+               }
+
+               if (inner == 0) {
+                       /* The child never cleans up: tmpdir is the parent's. */
+                       if (count_mounts() != 1)
+                               _exit(10);
+
+                       _exit(0);
+               }
+
+               ret = wait_for_pid(inner);
+
+out_umount:
+               /* Lazily detach the tmpfs together with its bind mounts. */
+               umount2(tmpdir, MNT_DETACH);
+out_rmdir:
+               /* Best effort: the test result is already decided in ret. */
+               rmdir(tmpdir);
+               _exit(ret);
+       }
+
+       ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Verify the child's root mount is nullfs with expected statmount properties.
+ *
+ * Exit codes of the forked processes identify the failing step:
+ *   1     enter_userns() failed
+ *   2     clone3() failed
+ *   3     get_unique_mnt_id("/") returned 0
+ *   4     statmount_alloc() returned NULL
+ *   5-6   mount point missing from the reply mask / not "/"
+ *   7-8   filesystem type missing from the reply mask / not "nullfs"
+ *   9-10  basic info missing from the reply mask / mnt_parent_id != mnt_id
+ */
+TEST_F(clone3_empty_mntns, mount_properties)
+{
+       pid_t pid;
+
+       pid = fork();
+       ASSERT_GE(pid, 0);
+
+       if (pid == 0) {
+               pid_t inner;
+
+               if (enter_userns())
+                       _exit(1);
+
+               inner = clone3_empty_mntns(0);
+               if (inner < 0)
+                       _exit(2);
+
+               if (inner == 0) {
+                       struct statmount *sm;
+                       uint64_t root_id;
+
+                       root_id = get_unique_mnt_id("/");
+                       if (!root_id)
+                               _exit(3);
+
+                       sm = statmount_alloc(root_id, 0,
+                                            STATMOUNT_MNT_BASIC |
+                                            STATMOUNT_MNT_POINT |
+                                            STATMOUNT_FS_TYPE);
+                       if (!sm)
+                               _exit(4);
+
+                       /* Root mount point is "/". */
+                       if (!(sm->mask & STATMOUNT_MNT_POINT))
+                               _exit(5);
+                       if (strcmp(sm->str + sm->mnt_point, "/") != 0)
+                               _exit(6);
+
+                       /* Filesystem type is nullfs. */
+                       if (!(sm->mask & STATMOUNT_FS_TYPE))
+                               _exit(7);
+                       if (strcmp(sm->str + sm->fs_type, "nullfs") != 0)
+                               _exit(8);
+
+                       /* Root mount is its own parent. */
+                       if (!(sm->mask & STATMOUNT_MNT_BASIC))
+                               _exit(9);
+                       if (sm->mnt_parent_id != sm->mnt_id)
+                               _exit(10);
+
+                       free(sm);
+                       _exit(0);
+               }
+
+               _exit(wait_for_pid(inner));
+       }
+
+       ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * In the child's empty namespace listmount() must report exactly one mount,
+ * and that entry must be the unique mount ID of "/".
+ */
+TEST_F(clone3_empty_mntns, listmount_single_entry)
+{
+       pid_t helper;
+
+       helper = fork();
+       ASSERT_GE(helper, 0);
+
+       if (helper == 0) {
+               uint64_t mnt_ids[16];
+               uint64_t root_mnt;
+               pid_t child;
+
+               if (enter_userns())
+                       _exit(1);
+
+               child = clone3_empty_mntns(0);
+               if (child < 0)
+                       _exit(2);
+
+               /* Intermediate process: forward the child's result. */
+               if (child > 0)
+                       _exit(wait_for_pid(child));
+
+               if (listmount(LSMT_ROOT, 0, 0, mnt_ids,
+                             sizeof(mnt_ids) / sizeof(mnt_ids[0]), 0) != 1)
+                       _exit(3);
+
+               root_mnt = get_unique_mnt_id("/");
+               if (root_mnt == 0)
+                       _exit(4);
+
+               _exit(mnt_ids[0] == root_mnt ? 0 : 5);
+       }
+
+       ASSERT_EQ(wait_for_pid(helper), 0);
+}
+
+/*
+ * Child can mount tmpfs over nullfs root (the primary container use case).
+ *
+ * Uses the new mount API (fsopen/fsmount/move_mount) because resolving
+ * "/" returns the process root directly without following overmounts.
+ * The mount fd from fsmount lets us fchdir + chroot into the new tmpfs.
+ *
+ * The intermediate process exits with 1 if enter_userns() fails and 2 if
+ * clone3() fails.  Exit codes of the clone3 child identify the failing step:
+ *   3      initial mount count is not 1
+ *   4-7    looking up the root mount and confirming it is nullfs
+ *   8-11   fsopen / fsconfig "size" / fsconfig create / fsmount
+ *   12     move_mount onto "/"
+ *   13     mount count after attaching is not 2
+ *   14-15  fchdir / chroot into the new mount
+ *   16-19  looking up the new root mount and confirming it is tmpfs
+ *   20-22  creating, writing and re-checking /testfile
+ */
+TEST_F(clone3_empty_mntns, child_overmount_tmpfs)
+{
+       pid_t pid;
+
+       pid = fork();
+       ASSERT_GE(pid, 0);
+
+       if (pid == 0) {
+               pid_t inner;
+
+               if (enter_userns())
+                       _exit(1);
+
+               inner = clone3_empty_mntns(0);
+               if (inner < 0)
+                       _exit(2);
+
+               if (inner == 0) {
+                       struct statmount *sm;
+                       uint64_t root_id;
+                       int fd, fsfd, mntfd;
+
+                       if (count_mounts() != 1)
+                               _exit(3);
+
+                       /* Verify root is nullfs. */
+                       root_id = get_unique_mnt_id("/");
+                       if (!root_id)
+                               _exit(4);
+
+                       sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE);
+                       if (!sm)
+                               _exit(5);
+                       if (!(sm->mask & STATMOUNT_FS_TYPE))
+                               _exit(6);
+                       if (strcmp(sm->str + sm->fs_type, "nullfs") != 0)
+                               _exit(7);
+                       free(sm);
+
+                       /* Create tmpfs via the new mount API. */
+                       fsfd = sys_fsopen("tmpfs", 0);
+                       if (fsfd < 0)
+                               _exit(8);
+
+                       if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING,
+                                        "size", "1M", 0)) {
+                               close(fsfd);
+                               _exit(9);
+                       }
+
+                       if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE,
+                                        NULL, NULL, 0)) {
+                               close(fsfd);
+                               _exit(10);
+                       }
+
+                       mntfd = sys_fsmount(fsfd, 0, 0);
+                       close(fsfd);
+                       if (mntfd < 0)
+                               _exit(11);
+
+                       /* Attach tmpfs to "/". */
+                       if (sys_move_mount(mntfd, "", AT_FDCWD, "/",
+                                          MOVE_MOUNT_F_EMPTY_PATH)) {
+                               close(mntfd);
+                               _exit(12);
+                       }
+
+                       if (count_mounts() != 2) {
+                               close(mntfd);
+                               _exit(13);
+                       }
+
+                       /* Enter the tmpfs. */
+                       if (fchdir(mntfd)) {
+                               close(mntfd);
+                               _exit(14);
+                       }
+
+                       if (chroot(".")) {
+                               close(mntfd);
+                               _exit(15);
+                       }
+
+                       close(mntfd);
+
+                       /* Verify "/" is now tmpfs. */
+                       root_id = get_unique_mnt_id("/");
+                       if (!root_id)
+                               _exit(16);
+
+                       sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE);
+                       if (!sm)
+                               _exit(17);
+                       if (!(sm->mask & STATMOUNT_FS_TYPE))
+                               _exit(18);
+                       if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0)
+                               _exit(19);
+                       free(sm);
+
+                       /* Verify tmpfs is writable. */
+                       fd = open("/testfile", O_CREAT | O_RDWR, 0644);
+                       if (fd < 0)
+                               _exit(20);
+
+                       if (write(fd, "test", 4) != 4) {
+                               close(fd);
+                               _exit(21);
+                       }
+                       close(fd);
+
+                       if (access("/testfile", F_OK))
+                               _exit(22);
+
+                       _exit(0);
+               }
+
+               _exit(wait_for_pid(inner));
+       }
+
+       ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Multiple clone3 calls with CLONE_EMPTY_MNTNS produce children with
+ * distinct mount namespace root mount IDs.
+ *
+ * Each child writes the unique mount ID of its "/" into its own pipe (pipe1
+ * for the first child, pipe2 for the second) and exits with 1 if that write
+ * is short.  The parent closes both write ends before reading.
+ *
+ * Parent exit codes: 1 enter_userns(), 2 pipe(), 3/4 first/second clone3(),
+ * 5/6 short read from pipe1/pipe2, 7 a child did not exit cleanly, 8 an ID
+ * was 0, 9 both IDs are equal.
+ */
+TEST_F(clone3_empty_mntns, repeated)
+{
+       pid_t pid;
+
+       pid = fork();
+       ASSERT_GE(pid, 0);
+
+       if (pid == 0) {
+               int pipe1[2], pipe2[2];
+               uint64_t id1 = 0, id2 = 0;
+               pid_t inner1, inner2;
+
+               if (enter_userns())
+                       _exit(1);
+
+               if (pipe(pipe1) || pipe(pipe2))
+                       _exit(2);
+
+               inner1 = clone3_empty_mntns(0);
+               if (inner1 < 0)
+                       _exit(3);
+
+               if (inner1 == 0) {
+                       uint64_t root_id;
+
+                       close(pipe1[0]);
+                       root_id = get_unique_mnt_id("/");
+                       if (write(pipe1[1], &root_id, sizeof(root_id)) != sizeof(root_id))
+                               _exit(1);
+                       close(pipe1[1]);
+                       _exit(0);
+               }
+
+               inner2 = clone3_empty_mntns(0);
+               if (inner2 < 0)
+                       _exit(4);
+
+               if (inner2 == 0) {
+                       uint64_t root_id;
+
+                       close(pipe2[0]);
+                       root_id = get_unique_mnt_id("/");
+                       if (write(pipe2[1], &root_id, sizeof(root_id)) != sizeof(root_id))
+                               _exit(1);
+                       close(pipe2[1]);
+                       _exit(0);
+               }
+
+               close(pipe1[1]);
+               close(pipe2[1]);
+
+               if (read(pipe1[0], &id1, sizeof(id1)) != sizeof(id1))
+                       _exit(5);
+               if (read(pipe2[0], &id2, sizeof(id2)) != sizeof(id2))
+                       _exit(6);
+
+               close(pipe1[0]);
+               close(pipe2[0]);
+
+               if (wait_for_pid(inner1) || wait_for_pid(inner2))
+                       _exit(7);
+
+               /* Each child must have a distinct root mount ID. */
+               if (id1 == 0 || id2 == 0)
+                       _exit(8);
+               if (id1 == id2)
+                       _exit(9);
+
+               _exit(0);
+       }
+
+       ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Verify setns() into a child's empty mount namespace works.
+ *
+ * Two pipes synchronise the processes.  The child reports readiness on
+ * ready_pipe and then blocks reading done_pipe, which only returns (with
+ * EOF) once every write end of done_pipe has been closed.  That keeps the
+ * child alive while the parent opens /proc/<pid>/ns/mnt.  A single pipe
+ * cannot provide this barrier: read() on a pipe's write end fails
+ * immediately with EBADF instead of blocking.
+ *
+ * Parent exit codes: 1 enter_userns(), 2 pipe(), 3 clone3(), 4 no readiness
+ * byte from the child, 5 opening the namespace file, 6 setns(), 7 mount
+ * count after setns() is not 1.
+ */
+TEST_F(clone3_empty_mntns, setns_into_child_mntns)
+{
+       pid_t pid;
+
+       pid = fork();
+       ASSERT_GE(pid, 0);
+
+       if (pid == 0) {
+               int ready_pipe[2], done_pipe[2];
+               char path[64];
+               int mntns_fd;
+               pid_t inner;
+               char c;
+
+               if (enter_userns())
+                       _exit(1);
+
+               if (pipe(ready_pipe) || pipe(done_pipe))
+                       _exit(2);
+
+               inner = clone3_empty_mntns(0);
+               if (inner < 0)
+                       _exit(3);
+
+               if (inner == 0) {
+                       /*
+                        * Drop the ends we do not use.  Our copy of the
+                        * done_pipe write end must go, otherwise the read
+                        * below would never see EOF.
+                        */
+                       close(ready_pipe[0]);
+                       close(done_pipe[1]);
+
+                       /* Signal parent we're ready. */
+                       if (write(ready_pipe[1], "r", 1) != 1)
+                               _exit(1);
+                       close(ready_pipe[1]);
+
+                       /*
+                        * Wait for parent to finish: this blocks until the
+                        * parent closes (or exits holding) the write end.
+                        * The result is deliberately ignored.
+                        */
+                       if (read(done_pipe[0], &c, 1) < 0)
+                               ;
+                       close(done_pipe[0]);
+                       _exit(0);
+               }
+
+               close(ready_pipe[1]);
+               close(done_pipe[0]);
+
+               /* Wait for child to be ready. */
+               if (read(ready_pipe[0], &c, 1) != 1)
+                       _exit(4);
+               close(ready_pipe[0]);
+
+               /* Open child's mount namespace while the child is blocked. */
+               snprintf(path, sizeof(path), "/proc/%d/ns/mnt", inner);
+               mntns_fd = open(path, O_RDONLY);
+               if (mntns_fd < 0)
+                       _exit(5);
+
+               if (setns(mntns_fd, CLONE_NEWNS))
+                       _exit(6);
+
+               close(mntns_fd);
+
+               /* Now we should be in the child's empty mntns. */
+               if (count_mounts() != 1)
+                       _exit(7);
+
+               /* Release the child, then collect its exit status. */
+               close(done_pipe[1]);
+               _exit(wait_for_pid(inner));
+       }
+
+       ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Tests below do not require CLONE_EMPTY_MNTNS support.
+ */
+
+/*
+ * clone3() must reject a 64-bit flag beyond the known set with EINVAL.
+ */
+TEST(unknown_flags_rejected)
+{
+       pid_t helper;
+
+       helper = fork();
+       ASSERT_GE(helper, 0);
+
+       if (helper == 0) {
+               struct __clone_args args = { 0 };
+               pid_t ret;
+
+               args.flags = 0x800000000ULL;
+               args.exit_signal = SIGCHLD;
+
+               ret = sys_clone3(&args, sizeof(args));
+
+               /* An unexpectedly created child just goes away quietly. */
+               if (ret == 0)
+                       _exit(0);
+
+               /* clone3() succeeding is the failure case: reap and report. */
+               if (ret > 0) {
+                       wait_for_pid(ret);
+                       _exit(1);
+               }
+
+               _exit(errno == EINVAL ? 0 : 2);
+       }
+
+       ASSERT_EQ(wait_for_pid(helper), 0);
+}
+
+/*
+ * Regression check: clone3() with plain CLONE_NEWNS (no CLONE_EMPTY_MNTNS)
+ * must still give the child a copy of the full mount tree.
+ */
+TEST(clone3_newns_full_copy)
+{
+       pid_t helper;
+
+       helper = fork();
+       ASSERT_GE(helper, 0);
+
+       if (helper == 0) {
+               struct __clone_args args = { 0 };
+               ssize_t nr_parent;
+               pid_t child;
+
+               args.flags = CLONE_NEWNS;
+               args.exit_signal = SIGCHLD;
+
+               if (enter_userns())
+                       _exit(1);
+
+               nr_parent = count_mounts();
+               if (nr_parent < 1)
+                       _exit(2);
+
+               child = sys_clone3(&args, sizeof(args));
+               if (child < 0)
+                       _exit(3);
+
+               /* Full copy should have at least as many mounts. */
+               if (child == 0)
+                       _exit(count_mounts() < nr_parent ? 1 : 0);
+
+               _exit(wait_for_pid(child));
+       }
+
+       ASSERT_EQ(wait_for_pid(helper), 0);
+}
+
+TEST_HARNESS_MAIN