From: Mike Yuan Date: Wed, 27 Nov 2024 15:35:11 +0000 (+0100) Subject: namespace-util: modernize fd_is_namespace() and is_our_namespace() X-Git-Tag: v258-rc1~1704^2~11 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=07610cafcf60d1dddd8a59d508129fdca91857d7;p=thirdparty%2Fsystemd.git namespace-util: modernize fd_is_namespace() and is_our_namespace() - Make fd_is_namespace() take NamespaceType - Drop support for kernel without NS_GET_NSTYPE (< 4.11) - Port is_our_namespace() to namespace_open_by_type() (preparation for later commits, where the latter would go by pidfd if available, avoiding procfs) --- diff --git a/README b/README index 4dabdaee062..7501b6b9434 100644 --- a/README +++ b/README @@ -35,6 +35,7 @@ REQUIREMENTS: ≥ 4.9 for RENAME_NOREPLACE support in vfat ≥ 4.10 for cgroup-bpf egress and ingress hooks ≥ 4.11 for nsfs + # FIXME: drop compat glue and remove entries above before v258 ≥ 4.15 for cgroup-bpf device hook and cpu controller in cgroup v2 ≥ 4.17 for cgroup-bpf socket address hooks and /sys/power/resume_offset ≥ 4.20 for PSI (used by systemd-oomd) @@ -43,16 +44,17 @@ REQUIREMENTS: ≥ 5.4 for pidfd, new mount API, and signed Verity images ≥ 5.6 for getrandom() GRND_INSECURE ≥ 5.7 for CLONE_INTO_CGROUP, BPF links and the BPF LSM hook - ≥ 5.9 for close_range() ≥ 5.8 for LOOP_CONFIGURE and STATX_ATTR_MOUNT_ROOT + ≥ 5.9 for close_range() ≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option ≥ 6.5 for name_to_handle_at() AT_HANDLE_FID, SO_PEERPIDFD/SO_PASSPIDFD, and MOVE_MOUNT_BENEATH ≥ 6.9 for pidfs - ⛔ Kernel versions below 4.3 ("minimum baseline") are not supported at + ⛔ Kernel versions below 4.11 ("minimum baseline") are not supported at all, and are missing required functionality (e.g. CLOCK_BOOTTIME support - for timerfd_create(), getrandom(), ambient capabilities, or memfd_create()). + for timerfd_create(), getrandom(), ambient capabilities, memfd_create(), + or nsfs (NS_GET_NSTYPE)). ⚠️ Kernel versions below 5.4 ("recommended baseline") have significant gaps in functionality and are not recommended for use with this version diff --git a/src/basic/namespace-util.c b/src/basic/namespace-util.c index f4b29a49ac9..945916495a1 100644 --- a/src/basic/namespace-util.c +++ b/src/basic/namespace-util.c @@ -40,7 +40,7 @@ const struct namespace_info namespace_info[_NAMESPACE_TYPE_MAX + 1] = { #define pid_namespace_path(pid, type) procfs_file_alloca(pid, namespace_info[type].proc_path) -static NamespaceType clone_flag_to_namespace_type(unsigned long clone_flag) { +NamespaceType clone_flag_to_namespace_type(unsigned long clone_flag) { for (NamespaceType t = 0; t < _NAMESPACE_TYPE_MAX; t++) if (((namespace_info[t].clone_flag ^ clone_flag) & (CLONE_NEWCGROUP|CLONE_NEWIPC|CLONE_NEWNET|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUSER|CLONE_NEWUTS|CLONE_NEWTIME)) == 0) return t; @@ -157,10 +157,10 @@ int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int /* Can't setns to your own userns, since then you could escalate from non-root to root in * your own namespace, so check if namespaces are equal before attempting to enter. */ - r = inode_same_at(userns_fd, "", AT_FDCWD, "/proc/self/ns/user", AT_EMPTY_PATH); + r = is_our_namespace(userns_fd, NAMESPACE_USER); if (r < 0) return r; - if (r) + if (r > 0) userns_fd = -EBADF; } @@ -191,50 +191,49 @@ int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int return reset_uid_gid(); } -int fd_is_ns(int fd, unsigned long nsflag) { - struct statfs s; +int fd_is_namespace(int fd, NamespaceType type) { int r; - /* Checks whether the specified file descriptor refers to a namespace created by specifying nsflag in clone(). - * On old kernels there's no nice way to detect that, hence on those we'll return a recognizable error (EUCLEAN), - * so that callers can handle this somewhat nicely. - * - * This function returns > 0 if the fd definitely refers to a network namespace, 0 if it definitely does not - * refer to a network namespace, -EUCLEAN if we can't determine, and other negative error codes on error. */ + /* Checks whether the specified file descriptor refers to a namespace (of type if type != _NAMESPACE_INVALID). */ - if (fstatfs(fd, &s) < 0) - return -errno; + assert(fd >= 0); + assert(type < _NAMESPACE_TYPE_MAX); - if (!is_fs_type(&s, NSFS_MAGIC)) { - /* On really old kernels, there was no "nsfs", and network namespace sockets belonged to procfs - * instead. Handle that in a somewhat smart way. */ + r = fd_is_fs_type(fd, NSFS_MAGIC); + if (r <= 0) + return r; - if (is_fs_type(&s, PROC_SUPER_MAGIC)) { - struct statfs t; + if (type < 0) + return true; - /* OK, so it is procfs. Let's see if our own network namespace is procfs, too. If so, then the - * passed fd might refer to a network namespace, but we can't know for sure. In that case, - * return a recognizable error. */ + int clone_flag = ioctl(fd, NS_GET_NSTYPE); + if (clone_flag < 0) + return -errno; - if (statfs("/proc/self/ns/net", &t) < 0) - return -errno; + NamespaceType found_type = clone_flag_to_namespace_type(clone_flag); + if (found_type < 0) + return -EBADF; /* Uh? Unknown namespace type? */ - if (s.f_type == t.f_type) - return -EUCLEAN; /* It's possible, we simply don't know */ - } + return found_type == type; +} - return 0; /* No! */ - } +int is_our_namespace(int fd, NamespaceType type) { + int r; - r = ioctl(fd, NS_GET_NSTYPE); - if (r < 0) { - if (errno == ENOTTY) /* Old kernels didn't know this ioctl, let's also return a recognizable error in that case */ - return -EUCLEAN; + assert(fd >= 0); + assert(type < _NAMESPACE_TYPE_MAX); - return -errno; - } + r = fd_is_namespace(fd, type); + if (r < 0) + return r; + if (r == 0) /* Not a namespace or not of the right type? */ + return -EUCLEAN; + + _cleanup_close_ int our_ns = namespace_open_by_type(type); + if (our_ns < 0) + return our_ns; - return (unsigned long) r == nsflag; + return fd_inode_same(fd, our_ns); } int detach_mount_namespace(void) { @@ -505,37 +504,6 @@ int namespace_is_init(NamespaceType type) { return st.st_ino == namespace_info[type].root_inode; } -int is_our_namespace(int fd, NamespaceType request_type) { - int clone_flag; - - assert(fd >= 0); - - clone_flag = ioctl(fd, NS_GET_NSTYPE); - if (clone_flag < 0) - return -errno; - - NamespaceType found_type = clone_flag_to_namespace_type(clone_flag); - if (found_type < 0) - return -EBADF; /* Uh? Unknown namespace type? */ - - if (request_type >= 0 && request_type != found_type) /* It's a namespace, but not of the right type? */ - return -EUCLEAN; - - struct stat st_fd, st_ours; - if (fstat(fd, &st_fd) < 0) - return -errno; - - const char *p = pid_namespace_path(0, found_type); - if (stat(p, &st_ours) < 0) { - if (errno == ENOENT) - return proc_mounted() == 0 ? -ENOSYS : -ENOENT; - - return -errno; - } - - return stat_inode_same(&st_ours, &st_fd); -} - int is_idmapping_supported(const char *path) { _cleanup_close_ int mount_fd = -EBADF, userns_fd = -EBADF, dir_fd = -EBADF; _cleanup_free_ char *uid_map = NULL, *gid_map = NULL; diff --git a/src/basic/namespace-util.h b/src/basic/namespace-util.h index e92d4078640..2a3e99b936a 100644 --- a/src/basic/namespace-util.h +++ b/src/basic/namespace-util.h @@ -27,6 +27,8 @@ extern const struct namespace_info { ino_t root_inode; } namespace_info[_NAMESPACE_TYPE_MAX + 1]; +NamespaceType clone_flag_to_namespace_type(unsigned long clone_flag); + int pidref_namespace_open( const PidRef *pidref, int *ret_pidns_fd, @@ -41,9 +43,11 @@ int namespace_open( int *ret_netns_fd, int *ret_userns_fd, int *ret_root_fd); + int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int root_fd); -int fd_is_ns(int fd, unsigned long nsflag); +int fd_is_namespace(int fd, NamespaceType type); +int is_our_namespace(int fd, NamespaceType type); int detach_mount_namespace(void); int detach_mount_namespace_harder(uid_t target_uid, gid_t target_gid); @@ -77,6 +81,4 @@ int namespace_open_by_type(NamespaceType type); int namespace_is_init(NamespaceType type); -int is_our_namespace(int fd, NamespaceType type); - int is_idmapping_supported(const char *path); diff --git a/src/core/namespace.c b/src/core/namespace.c index 72e9ca48b48..1a0c51fe72f 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -3253,6 +3253,7 @@ int setup_shareable_ns(int ns_storage_socket[static 2], unsigned long nsflag) { int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, unsigned long nsflag) { _cleanup_close_ int ns = -EBADF; + NamespaceType type; int r; assert(ns_storage_socket); @@ -3264,6 +3265,9 @@ int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, un * it. This is supposed to be called ahead of time, i.e. before setup_shareable_ns() which will * allocate a new anonymous ns if needed. */ + type = clone_flag_to_namespace_type(nsflag); + assert(type >= 0); + r = posix_lock(ns_storage_socket[0], LOCK_EX); if (r < 0) return r; @@ -3282,11 +3286,11 @@ int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, un if (ns < 0) return -errno; - r = fd_is_ns(ns, nsflag); + r = fd_is_namespace(ns, type); + if (r < 0) + return r; if (r == 0) return -EINVAL; - if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */ - return r; r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT); if (r < 0) diff --git a/src/mountfsd/mountwork.c b/src/mountfsd/mountwork.c index af46287fcf8..cd2f034832e 100644 --- a/src/mountfsd/mountwork.c +++ b/src/mountfsd/mountwork.c @@ -227,7 +227,7 @@ static int validate_userns(sd_varlink *link, int *userns_fd) { if (r < 0) return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m"); - r = fd_is_ns(*userns_fd, CLONE_NEWUSER); + r = fd_is_namespace(*userns_fd, NAMESPACE_USER); if (r < 0) return r; if (r == 0) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index c2f232ae790..69fd347ad7a 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -5249,12 +5249,10 @@ static int run_container( if (child_netns_fd < 0) return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path); - r = fd_is_ns(child_netns_fd, CLONE_NEWNET); - if (r == -EUCLEAN) - log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path); - else if (r < 0) + r = fd_is_namespace(child_netns_fd, NAMESPACE_NET); + if (r < 0) return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path); - else if (r == 0) + if (r == 0) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path); } diff --git a/src/nsresourced/nsresourcework.c b/src/nsresourced/nsresourcework.c index 21fc2d9088c..1cce13d4421 100644 --- a/src/nsresourced/nsresourcework.c +++ b/src/nsresourced/nsresourcework.c @@ -666,7 +666,7 @@ static int validate_userns(sd_varlink *link, int userns_fd) { return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m"); /* Validate this is actually a valid user namespace fd */ - r = fd_is_ns(userns_fd, CLONE_NEWUSER); + r = fd_is_namespace(userns_fd, NAMESPACE_USER); if (r < 0) return log_debug_errno(r, "Failed to check if user namespace fd is actually a user namespace: %m"); if (r == 0) @@ -1455,7 +1455,7 @@ static int validate_netns(sd_varlink *link, int userns_fd, int netns_fd) { return log_debug_errno(r, "Network namespace file descriptor has unsafe flags set: %m"); /* Validate this is actually a valid network namespace fd */ - r = fd_is_ns(netns_fd, CLONE_NEWNET); + r = fd_is_namespace(netns_fd, NAMESPACE_NET); if (r < 0) return r; if (r == 0) diff --git a/src/nsresourced/userns-restrict.c b/src/nsresourced/userns-restrict.c index be33f49f55f..aed2e135cb7 100644 --- a/src/nsresourced/userns-restrict.c +++ b/src/nsresourced/userns-restrict.c @@ -280,7 +280,7 @@ int userns_restrict_put_by_fd( assert(userns_fd >= 0); assert(n_mount_fds == 0 || mount_fds); - r = fd_is_ns(userns_fd, CLONE_NEWUSER); + r = fd_is_namespace(userns_fd, NAMESPACE_USER); if (r < 0) return log_debug_errno(r, "Failed to determine if file descriptor is user namespace: %m"); if (r == 0) diff --git a/src/shared/mount-util.c b/src/shared/mount-util.c index 756fb74c354..0d8a1f68ebc 100644 --- a/src/shared/mount-util.c +++ b/src/shared/mount-util.c @@ -1097,7 +1097,7 @@ static int mount_in_namespace( if (r < 0) return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m"); - r = inode_same_at(mntns_fd, "", AT_FDCWD, "/proc/self/ns/mnt", AT_EMPTY_PATH); + r = is_our_namespace(mntns_fd, NAMESPACE_MOUNT); if (r < 0) return log_debug_errno(r, "Failed to determine if mount namespaces are equal: %m"); /* We can't add new mounts at runtime if the process wasn't started in a namespace */ diff --git a/src/test/test-namespace.c b/src/test/test-namespace.c index 66a8bdf4270..d646306acfb 100644 --- a/src/test/test-namespace.c +++ b/src/test/test-namespace.c @@ -164,6 +164,31 @@ TEST(ipcns) { test_shareable_ns(CLONE_NEWIPC); } +TEST(fd_is_namespace) { + _cleanup_close_ int fd = -EBADF; + + ASSERT_OK_ZERO(fd_is_namespace(STDIN_FILENO, NAMESPACE_NET)); + ASSERT_OK_ZERO(fd_is_namespace(STDOUT_FILENO, NAMESPACE_NET)); + ASSERT_OK_ZERO(fd_is_namespace(STDERR_FILENO, NAMESPACE_NET)); + + fd = namespace_open_by_type(NAMESPACE_MOUNT); + if (IN_SET(fd, -ENOSYS, -ENOENT)) { + log_notice("Path %s not found, skipping test", "/proc/self/ns/mnt"); + return; + } + ASSERT_OK(fd); + ASSERT_OK_POSITIVE(fd_is_namespace(fd, NAMESPACE_MOUNT)); + ASSERT_OK_ZERO(fd_is_namespace(fd, NAMESPACE_NET)); + fd = safe_close(fd); + + ASSERT_OK(fd = namespace_open_by_type(NAMESPACE_IPC)); + ASSERT_OK_POSITIVE(fd_is_namespace(fd, NAMESPACE_IPC)); + fd = safe_close(fd); + + ASSERT_OK(fd = namespace_open_by_type(NAMESPACE_NET)); + ASSERT_OK_POSITIVE(fd_is_namespace(fd, NAMESPACE_NET)); +} + TEST(protect_kernel_logs) { static const NamespaceParameters p = { .runtime_scope = RUNTIME_SCOPE_SYSTEM, diff --git a/src/test/test-stat-util.c b/src/test/test-stat-util.c index 4687b561fc3..3ab177d1ce2 100644 --- a/src/test/test-stat-util.c +++ b/src/test/test-stat-util.c @@ -165,31 +165,6 @@ TEST(path_is_read_only_fs) { assert_se(path_is_read_only_fs("/i-dont-exist") == -ENOENT); } -TEST(fd_is_ns) { - _cleanup_close_ int fd = -EBADF; - - assert_se(fd_is_ns(STDIN_FILENO, CLONE_NEWNET) == 0); - assert_se(fd_is_ns(STDERR_FILENO, CLONE_NEWNET) == 0); - assert_se(fd_is_ns(STDOUT_FILENO, CLONE_NEWNET) == 0); - - fd = open("/proc/self/ns/mnt", O_CLOEXEC|O_RDONLY); - if (fd < 0) { - assert_se(errno == ENOENT); - log_notice("Path %s not found, skipping test", "/proc/self/ns/mnt"); - return; - } - assert_se(fd >= 0); - assert_se(IN_SET(fd_is_ns(fd, CLONE_NEWNET), 0, -EUCLEAN)); - fd = safe_close(fd); - - assert_se((fd = open("/proc/self/ns/ipc", O_CLOEXEC|O_RDONLY)) >= 0); - assert_se(IN_SET(fd_is_ns(fd, CLONE_NEWIPC), 1, -EUCLEAN)); - fd = safe_close(fd); - - assert_se((fd = open("/proc/self/ns/net", O_CLOEXEC|O_RDONLY)) >= 0); - assert_se(IN_SET(fd_is_ns(fd, CLONE_NEWNET), 1, -EUCLEAN)); -} - TEST(dir_is_empty) { _cleanup_(rm_rf_physical_and_freep) char *empty_dir = NULL; _cleanup_free_ char *j = NULL, *jj = NULL, *jjj = NULL;