From d35fa257eb109395ea6532eeeb9fc0327a946002 Mon Sep 17 00:00:00 2001 From: Malformed C Date: Wed, 27 May 2026 22:43:35 +0300 Subject: [PATCH] nspawn: join network namespace before cloning user namespace When both --private-users and --network-namespace-path are specified, systemd-nspawn fails to start with "Operation not permitted" during the setns() call. This occurs because of the following execution sequence: 1. The outer child calls raw_clone() with CLONE_NEWUSER to create the new user namespace. 2. The inner child is spawned inside this new user namespace. 3. The inner child then attempts to call setns() to join the external network namespace. Because the inner child is already running inside the restricted user namespace, the kernel rejects the setns() call to join a network namespace owned by a different (host/more privileged) user namespace. Fix this by moving the setns() call to the outer child, executing it just before the raw_clone() call. This ensures the network namespace is joined while the process still has the necessary privileges, which also aligns with the inner child's expectation that the network namespace is already set up upon entry. --- src/nspawn/nspawn.c | 10 +++++++--- test/units/TEST-13-NSPAWN.nspawn.sh | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 7f7ceac3183..cb78f1ccfb3 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -4449,6 +4449,13 @@ static int outer_child( if (notify_fd < 0) return notify_fd; + /* Join the external network namespace first, while we are still in the parent's + * user namespace and have CAP_SYS_ADMIN there. Once we clone with CLONE_NEWUSER, + * the child will be in a new user namespace, lacking the capabilities in the + * parent user namespace required to join its network namespace. */ + if (arg_network_namespace_path && setns(netns_fd, CLONE_NEWNET) < 0) + return log_error_errno(errno, "Failed to join network namespace: %m"); + pid_t pid = raw_clone(SIGCHLD|CLONE_NEWNS| arg_clone_ns_flags | (IN_SET(arg_userns_mode, USER_NAMESPACE_FIXED, USER_NAMESPACE_PICK) ? CLONE_NEWUSER : 0) | @@ -4464,9 +4471,6 @@ static int outer_child( /* The inner child has all namespaces that are requested, so that we all are owned by the * user if user namespaces are turned on. */ - if (arg_network_namespace_path && setns(netns_fd, CLONE_NEWNET) < 0) - return log_error_errno(errno, "Failed to join network namespace: %m"); - if (arg_userns_mode == USER_NAMESPACE_MANAGED) { /* In managed usernamespace operation, sysfs + procfs are special, we'll have to * mount them inside the inner namespaces, but before we switch root. Hence do so diff --git a/test/units/TEST-13-NSPAWN.nspawn.sh b/test/units/TEST-13-NSPAWN.nspawn.sh index 822a0087329..a6f5b32f626 100755 --- a/test/units/TEST-13-NSPAWN.nspawn.sh +++ b/test/units/TEST-13-NSPAWN.nspawn.sh @@ -1185,6 +1185,21 @@ matrix_run_one() { ip a | grep -v -E '^1: lo.*UP' ip netns del nspawn_test + # test --network-namespace-path works when combined with --private-users=pick + ip netns add nspawn_test + ip netns exec nspawn_test ip link add foo type dummy + + if [[ "$IS_USERNS_SUPPORTED" == "yes" && "$api_vfs_writable" == "no" ]]; then + SYSTEMD_NSPAWN_USE_CGNS="$use_cgns" SYSTEMD_NSPAWN_API_VFS_WRITABLE="$api_vfs_writable" \ + systemd-nspawn --register=no \ + --directory="$root" \ + --private-users=pick \ + --network-namespace-path=/run/netns/nspawn_test \ + ip link show dev foo + fi + + ip netns del nspawn_test + rm -fr "$root" return 0 -- 2.47.3