]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
selftests/namespaces: test for efault
authorChristian Brauner <brauner@kernel.org>
Sun, 9 Nov 2025 21:11:29 +0000 (22:11 +0100)
committerChristian Brauner <brauner@kernel.org>
Mon, 10 Nov 2025 14:53:56 +0000 (15:53 +0100)
Ensure that put_user() can fail and that namespace cleanup works
correctly.

Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-8-ae8a4ad5a3b3@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
tools/testing/selftests/namespaces/.gitignore
tools/testing/selftests/namespaces/Makefile
tools/testing/selftests/namespaces/listns_efault_test.c [new file with mode: 0644]

index 4cb428d77659b83c894853bb3d69e657bcd75292..0989e80da4571f4983198fc3bc3c08c9d2b84723 100644 (file)
@@ -4,6 +4,7 @@ init_ino_test
 ns_active_ref_test
 listns_test
 listns_permissions_test
+listns_efault_test
 siocgskns_test
 cred_change_test
 stress_test
index 1f36c7bf7728821ad9e53e0231f7ddca8d121ad7..fbb821652c17874952647856555954e5f95fba86 100644 (file)
@@ -8,6 +8,7 @@ TEST_GEN_PROGS := nsid_test \
                  ns_active_ref_test \
                  listns_test \
                  listns_permissions_test \
+                 listns_efault_test \
                  siocgskns_test \
                  cred_change_test \
                  stress_test \
@@ -19,6 +20,7 @@ include ../lib.mk
 $(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c
 $(OUTPUT)/listns_test: ../filesystems/utils.c
 $(OUTPUT)/listns_permissions_test: ../filesystems/utils.c
+$(OUTPUT)/listns_efault_test: ../filesystems/utils.c
 $(OUTPUT)/siocgskns_test: ../filesystems/utils.c
 $(OUTPUT)/cred_change_test: ../filesystems/utils.c
 $(OUTPUT)/stress_test: ../filesystems/utils.c
diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c
new file mode 100644 (file)
index 0000000..c7ed402
--- /dev/null
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "../pidfd/pidfd.h"
+#include "wrappers.h"
+
+/*
+ * Test listns() error handling with invalid buffer addresses.
+ *
+ * The output buffer is placed so that only its first u64 lies in mapped
+ * memory; the next slot falls into an unmapped page. listns() is
+ * expected to fail with EFAULT when it tries to store an ID there.
+ *
+ * This test also creates mount namespaces that get destroyed during
+ * iteration, testing that namespace cleanup happens outside the RCU
+ * read lock.
+ *
+ * If the iterator child reports that listns() is unavailable (ENOSYS)
+ * the test is skipped rather than failed.
+ */
+TEST(listns_partial_fault_with_ns_cleanup)
+{
+       enum { NR_CHILDREN = 5 };
+       void *map;
+       __u64 *ns_ids;
+       ssize_t ret;
+       long page_size;
+       pid_t iter_pid;
+       pid_t pids[NR_CHILDREN];
+       int pidfds[NR_CHILDREN];
+       int sv[NR_CHILDREN][2];
+       int iter_pidfd;
+       int i, status;
+       char c;
+
+       page_size = sysconf(_SC_PAGESIZE);
+       ASSERT_GT(page_size, 0);
+
+       /*
+        * Map two pages:
+        * - First page: readable and writable
+        * - Second page: will be unmapped to trigger EFAULT
+        */
+       map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       ASSERT_NE(map, MAP_FAILED);
+
+       /* Unmap the second page */
+       ret = munmap((char *)map + page_size, page_size);
+       ASSERT_EQ(ret, 0);
+
+       /*
+        * Position the buffer pointer so there's room for exactly one u64
+        * before the page boundary. The second u64 would fall into the
+        * unmapped page.
+        */
+       ns_ids = ((__u64 *)((char *)map + page_size)) - 1;
+
+       /*
+        * Create a separate process to run listns() in a loop concurrently
+        * with namespace creation and destruction.
+        */
+       iter_pid = create_child(&iter_pidfd, 0);
+       ASSERT_NE(iter_pid, -1);
+
+       if (iter_pid == 0) {
+               struct ns_id_req req = {
+                       .size = sizeof(req),
+                       .spare = 0,
+                       .ns_id = 0,
+                       .ns_type = 0,  /* All types */
+                       .spare2 = 0,
+                       .user_ns_id = 0,  /* Global listing */
+               };
+               int iter_ret;
+
+               /*
+                * Loop calling listns() until killed.
+                * The kernel should:
+                * 1. Successfully write the first namespace ID (within valid page)
+                * 2. Fail with EFAULT when trying to write the second ID (unmapped page)
+                * 3. Handle concurrent namespace destruction without deadlock
+                *
+                * The only way out of the loop other than SIGKILL is
+                * ENOSYS, which is reported to the parent via the exit
+                * code.
+                */
+               while (1) {
+                       iter_ret = sys_listns(&req, ns_ids, 2, 0);
+
+                       if (iter_ret == -1 && errno == ENOSYS)
+                               _exit(PIDFD_SKIP);
+               }
+       }
+
+       /* Small delay to let iterator start looping */
+       usleep(50000);
+
+       /*
+        * Create several child processes, each in its own mount namespace.
+        * These will be destroyed while the iterator is running listns().
+        */
+       for (i = 0; i < NR_CHILDREN; i++) {
+               /* Create socketpair for synchronization */
+               ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+               pids[i] = create_child(&pidfds[i], CLONE_NEWNS);
+               ASSERT_NE(pids[i], -1);
+
+               if (pids[i] == 0) {
+                       close(sv[i][0]); /* Close parent end */
+
+                       if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+                               _exit(1);
+
+                       /* Child: create a couple of tmpfs mounts */
+                       if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+                               _exit(1);
+                       if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+                               _exit(1);
+
+                       if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+                               _exit(1);
+                       if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+                               _exit(1);
+
+                       /* Signal parent that setup is complete */
+                       if (write_nointr(sv[i][1], "R", 1) != 1)
+                               _exit(1);
+
+                       /* Wait for parent to signal us to exit */
+                       if (read_nointr(sv[i][1], &c, 1) != 1)
+                               _exit(1);
+
+                       close(sv[i][1]);
+                       _exit(0);
+               }
+
+               close(sv[i][1]); /* Close child end */
+       }
+
+       /* Wait for all children to finish setup */
+       for (i = 0; i < NR_CHILDREN; i++) {
+               ret = read_nointr(sv[i][0], &c, 1);
+               ASSERT_EQ(ret, 1);
+               ASSERT_EQ(c, 'R');
+       }
+
+       /*
+        * Signal children to exit. This will destroy their mount namespaces
+        * while listns() is iterating the namespace tree.
+        * This tests that cleanup happens outside the RCU read lock.
+        */
+       for (i = 0; i < NR_CHILDREN; i++)
+               EXPECT_EQ(write_nointr(sv[i][0], "X", 1), 1);
+
+       /*
+        * Reap each mount namespace child by pid. A wildcard wait could
+        * instead collect the iterator if it already exited on ENOSYS,
+        * which would break the iterator wait below.
+        */
+       for (i = 0; i < NR_CHILDREN; i++) {
+               EXPECT_EQ(waitpid(pids[i], NULL, 0), pids[i]);
+               close(sv[i][0]);
+               close(pidfds[i]);
+       }
+
+       /* Kill iterator and wait for it */
+       sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+       ret = waitpid(iter_pid, &status, 0);
+       ASSERT_EQ(ret, iter_pid);
+       close(iter_pidfd);
+
+       /* Clean up before evaluating how the iterator terminated */
+       munmap(map, page_size);
+
+       /* The iterator bails out on its own when listns() is missing */
+       if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP)
+               SKIP(return, "listns() not supported");
+
+       /* Should have been killed */
+       ASSERT_TRUE(WIFSIGNALED(status));
+       ASSERT_EQ(WTERMSIG(status), SIGKILL);
+}
+
+/*
+ * Sanity check for basic invalid pointer detection: hand listns() an
+ * output buffer that is bogus in its entirety and expect EFAULT.
+ */
+TEST(listns_complete_fault)
+{
+       /* Arbitrary address that is not expected to be mapped */
+       __u64 *bogus_buf = (__u64 *)0xdeadbeef;
+       /* Members not named here are zero-initialized */
+       struct ns_id_req req = {
+               .size = sizeof(req),
+       };
+       ssize_t ret;
+
+       ret = sys_listns(&req, bogus_buf, 10, 0);
+       if (ret == -1) {
+               if (errno == ENOSYS)
+                       SKIP(return, "listns() not supported");
+       }
+
+       /* The call must fail, and the failure must be EFAULT */
+       ASSERT_EQ(ret, -1);
+       ASSERT_EQ(errno, EFAULT);
+}
+
+/*
+ * A NULL output buffer combined with a non-zero ID count must make
+ * listns() fail with EFAULT.
+ */
+TEST(listns_null_buffer)
+{
+       /* Members not named here are zero-initialized */
+       struct ns_id_req req = {
+               .size = sizeof(req),
+       };
+       ssize_t ret = sys_listns(&req, NULL, 10, 0);
+
+       if (ret == -1) {
+               if (errno == ENOSYS)
+                       SKIP(return, "listns() not supported");
+       }
+
+       /* The call must fail, and the failure must be EFAULT */
+       ASSERT_EQ(ret, -1);
+       ASSERT_EQ(errno, EFAULT);
+}
+
+/*
+ * Test listns() with a buffer that becomes invalid mid-iteration
+ * (after several successful writes), combined with mount namespace
+ * destruction to test RCU cleanup logic.
+ *
+ * If the iterator child reports that listns() is unavailable (ENOSYS)
+ * the test is skipped rather than failed.
+ */
+TEST(listns_late_fault_with_ns_cleanup)
+{
+       enum { NR_CHILDREN = 10, NR_FIRST_BATCH = 5 };
+       void *map;
+       __u64 *ns_ids;
+       ssize_t ret;
+       long page_size;
+       pid_t iter_pid;
+       pid_t pids[NR_CHILDREN];
+       int pidfds[NR_CHILDREN];
+       int sv[NR_CHILDREN][2];
+       int iter_pidfd;
+       int i, status;
+       char c;
+
+       page_size = sysconf(_SC_PAGESIZE);
+       ASSERT_GT(page_size, 0);
+
+       /* Map two pages */
+       map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       ASSERT_NE(map, MAP_FAILED);
+
+       /* Unmap the second page */
+       ret = munmap((char *)map + page_size, page_size);
+       ASSERT_EQ(ret, 0);
+
+       /*
+        * Position buffer so we can write several u64s successfully
+        * before hitting the page boundary.
+        */
+       ns_ids = ((__u64 *)((char *)map + page_size)) - 5;
+
+       /*
+        * Create a separate process to run listns() concurrently.
+        */
+       iter_pid = create_child(&iter_pidfd, 0);
+       ASSERT_NE(iter_pid, -1);
+
+       if (iter_pid == 0) {
+               struct ns_id_req req = {
+                       .size = sizeof(req),
+                       .spare = 0,
+                       .ns_id = 0,
+                       .ns_type = 0,
+                       .spare2 = 0,
+                       .user_ns_id = 0,
+               };
+               int iter_ret;
+
+               /*
+                * Loop calling listns() until killed.
+                * Request 10 namespace IDs while namespaces are being destroyed.
+                * This tests:
+                * 1. EFAULT handling when buffer becomes invalid
+                * 2. Namespace cleanup outside RCU read lock during iteration
+                *
+                * The only way out of the loop other than SIGKILL is
+                * ENOSYS, which is reported to the parent via the exit
+                * code.
+                */
+               while (1) {
+                       iter_ret = sys_listns(&req, ns_ids, 10, 0);
+
+                       if (iter_ret == -1 && errno == ENOSYS)
+                               _exit(PIDFD_SKIP);
+               }
+       }
+
+       /* Small delay to let iterator start looping */
+       usleep(50000);
+
+       /*
+        * Create more children with mount namespaces to increase the
+        * likelihood that namespace cleanup happens during iteration.
+        */
+       for (i = 0; i < NR_CHILDREN; i++) {
+               /* Create socketpair for synchronization */
+               ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+               pids[i] = create_child(&pidfds[i], CLONE_NEWNS);
+               ASSERT_NE(pids[i], -1);
+
+               if (pids[i] == 0) {
+                       close(sv[i][0]); /* Close parent end */
+
+                       if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+                               _exit(1);
+
+                       /* Child: create tmpfs mounts */
+                       if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+                               _exit(1);
+                       if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+                               _exit(1);
+
+                       if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+                               _exit(1);
+                       if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+                               _exit(1);
+
+                       /* Signal parent that setup is complete */
+                       if (write_nointr(sv[i][1], "R", 1) != 1)
+                               _exit(1);
+
+                       /* Wait for parent to signal us to exit */
+                       if (read_nointr(sv[i][1], &c, 1) != 1)
+                               _exit(1);
+
+                       close(sv[i][1]);
+                       _exit(0);
+               }
+
+               close(sv[i][1]); /* Close child end */
+       }
+
+       /* Wait for all children to finish setup */
+       for (i = 0; i < NR_CHILDREN; i++) {
+               ret = read_nointr(sv[i][0], &c, 1);
+               ASSERT_EQ(ret, 1);
+               ASSERT_EQ(c, 'R');
+       }
+
+       /* Tell the first batch of children to exit */
+       for (i = 0; i < NR_FIRST_BATCH; i++)
+               EXPECT_EQ(write_nointr(sv[i][0], "X", 1), 1);
+
+       /* Small delay to let some exit */
+       usleep(10000);
+
+       /* Tell the remaining children to exit */
+       for (i = NR_FIRST_BATCH; i < NR_CHILDREN; i++)
+               EXPECT_EQ(write_nointr(sv[i][0], "X", 1), 1);
+
+       /*
+        * Reap each mount namespace child by pid. A wildcard wait could
+        * instead collect the iterator if it already exited on ENOSYS,
+        * which would break the iterator wait below.
+        */
+       for (i = 0; i < NR_CHILDREN; i++) {
+               EXPECT_EQ(waitpid(pids[i], NULL, 0), pids[i]);
+               close(sv[i][0]);
+               close(pidfds[i]);
+       }
+
+       /* Kill iterator and wait for it */
+       sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+       ret = waitpid(iter_pid, &status, 0);
+       ASSERT_EQ(ret, iter_pid);
+       close(iter_pidfd);
+
+       /* Clean up before evaluating how the iterator terminated */
+       munmap(map, page_size);
+
+       /* The iterator bails out on its own when listns() is missing */
+       if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP)
+               SKIP(return, "listns() not supported");
+
+       /* Should have been killed */
+       ASSERT_TRUE(WIFSIGNALED(status));
+       ASSERT_EQ(WTERMSIG(status), SIGKILL);
+}
+
+/*
+ * Test specifically focused on mount namespace cleanup during EFAULT.
+ * Filter for mount namespaces only.
+ *
+ * If the iterator child reports that listns() is unavailable (ENOSYS)
+ * the test is skipped rather than failed.
+ */
+TEST(listns_mnt_ns_cleanup_on_fault)
+{
+       enum { NR_CHILDREN = 8 };
+       void *map;
+       __u64 *ns_ids;
+       ssize_t ret;
+       long page_size;
+       pid_t iter_pid;
+       pid_t pids[NR_CHILDREN];
+       int pidfds[NR_CHILDREN];
+       int sv[NR_CHILDREN][2];
+       int iter_pidfd;
+       int i, status;
+       char c;
+
+       page_size = sysconf(_SC_PAGESIZE);
+       ASSERT_GT(page_size, 0);
+
+       /* Set up partial fault buffer */
+       map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       ASSERT_NE(map, MAP_FAILED);
+
+       ret = munmap((char *)map + page_size, page_size);
+       ASSERT_EQ(ret, 0);
+
+       /* Position for 3 successful writes, then fault */
+       ns_ids = ((__u64 *)((char *)map + page_size)) - 3;
+
+       /*
+        * Create a separate process to run listns() concurrently.
+        */
+       iter_pid = create_child(&iter_pidfd, 0);
+       ASSERT_NE(iter_pid, -1);
+
+       if (iter_pid == 0) {
+               struct ns_id_req req = {
+                       .size = sizeof(req),
+                       .spare = 0,
+                       .ns_id = 0,
+                       .ns_type = CLONE_NEWNS,  /* Only mount namespaces */
+                       .spare2 = 0,
+                       .user_ns_id = 0,
+               };
+               int iter_ret;
+
+               /*
+                * Loop calling listns() until killed.
+                * Call listns() to race with namespace destruction.
+                *
+                * The only way out of the loop other than SIGKILL is
+                * ENOSYS, which is reported to the parent via the exit
+                * code.
+                */
+               while (1) {
+                       iter_ret = sys_listns(&req, ns_ids, 10, 0);
+
+                       if (iter_ret == -1 && errno == ENOSYS)
+                               _exit(PIDFD_SKIP);
+               }
+       }
+
+       /* Small delay to let iterator start looping */
+       usleep(50000);
+
+       /* Create children with mount namespaces */
+       for (i = 0; i < NR_CHILDREN; i++) {
+               /* Create socketpair for synchronization */
+               ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+               pids[i] = create_child(&pidfds[i], CLONE_NEWNS);
+               ASSERT_NE(pids[i], -1);
+
+               if (pids[i] == 0) {
+                       close(sv[i][0]); /* Close parent end */
+
+                       if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+                               _exit(1);
+
+                       /* Do some mount operations to make cleanup more interesting */
+                       if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+                               _exit(1);
+                       if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+                               _exit(1);
+
+                       if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+                               _exit(1);
+                       if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+                               _exit(1);
+
+                       /* Signal parent that setup is complete */
+                       if (write_nointr(sv[i][1], "R", 1) != 1)
+                               _exit(1);
+
+                       /* Wait for parent to signal us to exit */
+                       if (read_nointr(sv[i][1], &c, 1) != 1)
+                               _exit(1);
+
+                       close(sv[i][1]);
+                       _exit(0);
+               }
+
+               close(sv[i][1]); /* Close child end */
+       }
+
+       /* Wait for all children to finish setup */
+       for (i = 0; i < NR_CHILDREN; i++) {
+               ret = read_nointr(sv[i][0], &c, 1);
+               ASSERT_EQ(ret, 1);
+               ASSERT_EQ(c, 'R');
+       }
+
+       /* Tell children to exit to trigger namespace destruction during iteration */
+       for (i = 0; i < NR_CHILDREN; i++)
+               EXPECT_EQ(write_nointr(sv[i][0], "X", 1), 1);
+
+       /*
+        * Reap each mount namespace child by pid. A wildcard wait could
+        * instead collect the iterator if it already exited on ENOSYS,
+        * which would break the iterator wait below.
+        */
+       for (i = 0; i < NR_CHILDREN; i++) {
+               EXPECT_EQ(waitpid(pids[i], NULL, 0), pids[i]);
+               close(sv[i][0]);
+               close(pidfds[i]);
+       }
+
+       /* Kill iterator and wait for it */
+       sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+       ret = waitpid(iter_pid, &status, 0);
+       ASSERT_EQ(ret, iter_pid);
+       close(iter_pidfd);
+
+       /* Clean up before evaluating how the iterator terminated */
+       munmap(map, page_size);
+
+       /* The iterator bails out on its own when listns() is missing */
+       if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP)
+               SKIP(return, "listns() not supported");
+
+       /* Should have been killed */
+       ASSERT_TRUE(WIFSIGNALED(status));
+       ASSERT_EQ(WTERMSIG(status), SIGKILL);
+}
+
+/*
+ * Macro from the included kselftest harness header; presumably expands to
+ * the program entry point that runs the TEST() cases above - confirm
+ * against kselftest_harness.h.
+ */
+TEST_HARNESS_MAIN