≥ 4.9 for RENAME_NOREPLACE support in vfat
≥ 4.10 for cgroup-bpf egress and ingress hooks
≥ 4.11 for nsfs
+ # FIXME: drop compat glue and remove entries above before v258
≥ 4.15 for cgroup-bpf device hook and cpu controller in cgroup v2
≥ 4.17 for cgroup-bpf socket address hooks and /sys/power/resume_offset
≥ 4.20 for PSI (used by systemd-oomd)
≥ 5.4 for pidfd, new mount API, and signed Verity images
≥ 5.6 for getrandom() GRND_INSECURE
≥ 5.7 for CLONE_INTO_CGROUP, BPF links and the BPF LSM hook
- ≥ 5.9 for close_range()
≥ 5.8 for LOOP_CONFIGURE and STATX_ATTR_MOUNT_ROOT
+ ≥ 5.9 for close_range()
≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option
≥ 6.5 for name_to_handle_at() AT_HANDLE_FID, SO_PEERPIDFD/SO_PASSPIDFD,
and MOVE_MOUNT_BENEATH
≥ 6.9 for pidfs
- ⛔ Kernel versions below 4.3 ("minimum baseline") are not supported at
+ ⛔ Kernel versions below 4.11 ("minimum baseline") are not supported at
all, and are missing required functionality (e.g. CLOCK_BOOTTIME support
- for timerfd_create(), getrandom(), ambient capabilities, or memfd_create()).
+ for timerfd_create(), getrandom(), ambient capabilities, memfd_create(),
+ or nsfs (NS_GET_NSTYPE)).
⚠️ Kernel versions below 5.4 ("recommended baseline") have significant
gaps in functionality and are not recommended for use with this version
#define pid_namespace_path(pid, type) procfs_file_alloca(pid, namespace_info[type].proc_path)
-static NamespaceType clone_flag_to_namespace_type(unsigned long clone_flag) {
+NamespaceType clone_flag_to_namespace_type(unsigned long clone_flag) {
for (NamespaceType t = 0; t < _NAMESPACE_TYPE_MAX; t++)
if (((namespace_info[t].clone_flag ^ clone_flag) & (CLONE_NEWCGROUP|CLONE_NEWIPC|CLONE_NEWNET|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUSER|CLONE_NEWUTS|CLONE_NEWTIME)) == 0)
return t;
/* Can't setns to your own userns, since then you could escalate from non-root to root in
* your own namespace, so check if namespaces are equal before attempting to enter. */
- r = inode_same_at(userns_fd, "", AT_FDCWD, "/proc/self/ns/user", AT_EMPTY_PATH);
+ r = is_our_namespace(userns_fd, NAMESPACE_USER);
if (r < 0)
return r;
- if (r)
+ if (r > 0)
userns_fd = -EBADF;
}
return reset_uid_gid();
}
-int fd_is_ns(int fd, unsigned long nsflag) {
- struct statfs s;
+int fd_is_namespace(int fd, NamespaceType type) {
int r;
- /* Checks whether the specified file descriptor refers to a namespace created by specifying nsflag in clone().
- * On old kernels there's no nice way to detect that, hence on those we'll return a recognizable error (EUCLEAN),
- * so that callers can handle this somewhat nicely.
- *
- * This function returns > 0 if the fd definitely refers to a network namespace, 0 if it definitely does not
- * refer to a network namespace, -EUCLEAN if we can't determine, and other negative error codes on error. */
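+ /* Checks whether the specified file descriptor refers to a namespace and, if type != _NAMESPACE_INVALID,
+ * whether that namespace is of the specified type. Returns > 0 if so, 0 if not, and a negative
+ * errno-style error code on failure. */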
- if (fstatfs(fd, &s) < 0)
- return -errno;
+ assert(fd >= 0);
+ assert(type < _NAMESPACE_TYPE_MAX);
- if (!is_fs_type(&s, NSFS_MAGIC)) {
- /* On really old kernels, there was no "nsfs", and network namespace sockets belonged to procfs
- * instead. Handle that in a somewhat smart way. */
+ r = fd_is_fs_type(fd, NSFS_MAGIC);
+ if (r <= 0)
+ return r;
- if (is_fs_type(&s, PROC_SUPER_MAGIC)) {
- struct statfs t;
+ if (type < 0)
+ return true;
- /* OK, so it is procfs. Let's see if our own network namespace is procfs, too. If so, then the
- * passed fd might refer to a network namespace, but we can't know for sure. In that case,
- * return a recognizable error. */
+ int clone_flag = ioctl(fd, NS_GET_NSTYPE);
+ if (clone_flag < 0)
+ return -errno;
- if (statfs("/proc/self/ns/net", &t) < 0)
- return -errno;
+ NamespaceType found_type = clone_flag_to_namespace_type(clone_flag);
+ if (found_type < 0)
+ return -EBADF; /* Uh? Unknown namespace type? */
- if (s.f_type == t.f_type)
- return -EUCLEAN; /* It's possible, we simply don't know */
- }
+ return found_type == type;
+}
- return 0; /* No! */
- }
+int is_our_namespace(int fd, NamespaceType type) {
+ int r;
- r = ioctl(fd, NS_GET_NSTYPE);
- if (r < 0) {
- if (errno == ENOTTY) /* Old kernels didn't know this ioctl, let's also return a recognizable error in that case */
- return -EUCLEAN;
+ assert(fd >= 0);
+ assert(type < _NAMESPACE_TYPE_MAX);
- return -errno;
- }
+ r = fd_is_namespace(fd, type);
+ if (r < 0)
+ return r;
+ if (r == 0) /* Not a namespace or not of the right type? */
+ return -EUCLEAN;
+
+ _cleanup_close_ int our_ns = namespace_open_by_type(type);
+ if (our_ns < 0)
+ return our_ns;
- return (unsigned long) r == nsflag;
+ return fd_inode_same(fd, our_ns);
}
int detach_mount_namespace(void) {
return st.st_ino == namespace_info[type].root_inode;
}
-int is_our_namespace(int fd, NamespaceType request_type) {
- int clone_flag;
-
- assert(fd >= 0);
-
- clone_flag = ioctl(fd, NS_GET_NSTYPE);
- if (clone_flag < 0)
- return -errno;
-
- NamespaceType found_type = clone_flag_to_namespace_type(clone_flag);
- if (found_type < 0)
- return -EBADF; /* Uh? Unknown namespace type? */
-
- if (request_type >= 0 && request_type != found_type) /* It's a namespace, but not of the right type? */
- return -EUCLEAN;
-
- struct stat st_fd, st_ours;
- if (fstat(fd, &st_fd) < 0)
- return -errno;
-
- const char *p = pid_namespace_path(0, found_type);
- if (stat(p, &st_ours) < 0) {
- if (errno == ENOENT)
- return proc_mounted() == 0 ? -ENOSYS : -ENOENT;
-
- return -errno;
- }
-
- return stat_inode_same(&st_ours, &st_fd);
-}
-
int is_idmapping_supported(const char *path) {
_cleanup_close_ int mount_fd = -EBADF, userns_fd = -EBADF, dir_fd = -EBADF;
_cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
ino_t root_inode;
} namespace_info[_NAMESPACE_TYPE_MAX + 1];
+NamespaceType clone_flag_to_namespace_type(unsigned long clone_flag);
+
int pidref_namespace_open(
const PidRef *pidref,
int *ret_pidns_fd,
int *ret_netns_fd,
int *ret_userns_fd,
int *ret_root_fd);
+
int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int root_fd);
-int fd_is_ns(int fd, unsigned long nsflag);
+int fd_is_namespace(int fd, NamespaceType type);
+int is_our_namespace(int fd, NamespaceType type);
int detach_mount_namespace(void);
int detach_mount_namespace_harder(uid_t target_uid, gid_t target_gid);
int namespace_is_init(NamespaceType type);
-int is_our_namespace(int fd, NamespaceType type);
-
int is_idmapping_supported(const char *path);
int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, unsigned long nsflag) {
_cleanup_close_ int ns = -EBADF;
+ NamespaceType type;
int r;
assert(ns_storage_socket);
* it. This is supposed to be called ahead of time, i.e. before setup_shareable_ns() which will
* allocate a new anonymous ns if needed. */
+ type = clone_flag_to_namespace_type(nsflag);
+ assert(type >= 0);
+
r = posix_lock(ns_storage_socket[0], LOCK_EX);
if (r < 0)
return r;
if (ns < 0)
return -errno;
- r = fd_is_ns(ns, nsflag);
+ r = fd_is_namespace(ns, type);
+ if (r < 0)
+ return r;
if (r == 0)
return -EINVAL;
- if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */
- return r;
r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
if (r < 0)
if (r < 0)
return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m");
- r = fd_is_ns(*userns_fd, CLONE_NEWUSER);
+ r = fd_is_namespace(*userns_fd, NAMESPACE_USER);
if (r < 0)
return r;
if (r == 0)
if (child_netns_fd < 0)
return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
- r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
- if (r == -EUCLEAN)
- log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
- else if (r < 0)
+ r = fd_is_namespace(child_netns_fd, NAMESPACE_NET);
+ if (r < 0)
return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
- else if (r == 0)
+ if (r == 0)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
"Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
}
return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m");
/* Validate this is actually a valid user namespace fd */
- r = fd_is_ns(userns_fd, CLONE_NEWUSER);
+ r = fd_is_namespace(userns_fd, NAMESPACE_USER);
if (r < 0)
return log_debug_errno(r, "Failed to check if user namespace fd is actually a user namespace: %m");
if (r == 0)
return log_debug_errno(r, "Network namespace file descriptor has unsafe flags set: %m");
/* Validate this is actually a valid network namespace fd */
- r = fd_is_ns(netns_fd, CLONE_NEWNET);
+ r = fd_is_namespace(netns_fd, NAMESPACE_NET);
if (r < 0)
return r;
if (r == 0)
assert(userns_fd >= 0);
assert(n_mount_fds == 0 || mount_fds);
- r = fd_is_ns(userns_fd, CLONE_NEWUSER);
+ r = fd_is_namespace(userns_fd, NAMESPACE_USER);
if (r < 0)
return log_debug_errno(r, "Failed to determine if file descriptor is user namespace: %m");
if (r == 0)
if (r < 0)
return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m");
- r = inode_same_at(mntns_fd, "", AT_FDCWD, "/proc/self/ns/mnt", AT_EMPTY_PATH);
+ r = is_our_namespace(mntns_fd, NAMESPACE_MOUNT);
if (r < 0)
return log_debug_errno(r, "Failed to determine if mount namespaces are equal: %m");
/* We can't add new mounts at runtime if the process wasn't started in a namespace */
test_shareable_ns(CLONE_NEWIPC);
}
+TEST(fd_is_namespace) {
+ _cleanup_close_ int fd = -EBADF;
+
+ ASSERT_OK_ZERO(fd_is_namespace(STDIN_FILENO, NAMESPACE_NET));
+ ASSERT_OK_ZERO(fd_is_namespace(STDOUT_FILENO, NAMESPACE_NET));
+ ASSERT_OK_ZERO(fd_is_namespace(STDERR_FILENO, NAMESPACE_NET));
+
+ fd = namespace_open_by_type(NAMESPACE_MOUNT);
+ if (IN_SET(fd, -ENOSYS, -ENOENT)) {
+ log_notice("Path %s not found, skipping test", "/proc/self/ns/mnt");
+ return;
+ }
+ ASSERT_OK(fd);
+ ASSERT_OK_POSITIVE(fd_is_namespace(fd, NAMESPACE_MOUNT));
+ ASSERT_OK_ZERO(fd_is_namespace(fd, NAMESPACE_NET));
+ fd = safe_close(fd);
+
+ ASSERT_OK(fd = namespace_open_by_type(NAMESPACE_IPC));
+ ASSERT_OK_POSITIVE(fd_is_namespace(fd, NAMESPACE_IPC));
+ fd = safe_close(fd);
+
+ ASSERT_OK(fd = namespace_open_by_type(NAMESPACE_NET));
+ ASSERT_OK_POSITIVE(fd_is_namespace(fd, NAMESPACE_NET));
+}
+
TEST(protect_kernel_logs) {
static const NamespaceParameters p = {
.runtime_scope = RUNTIME_SCOPE_SYSTEM,
assert_se(path_is_read_only_fs("/i-dont-exist") == -ENOENT);
}
-TEST(fd_is_ns) {
- _cleanup_close_ int fd = -EBADF;
-
- assert_se(fd_is_ns(STDIN_FILENO, CLONE_NEWNET) == 0);
- assert_se(fd_is_ns(STDERR_FILENO, CLONE_NEWNET) == 0);
- assert_se(fd_is_ns(STDOUT_FILENO, CLONE_NEWNET) == 0);
-
- fd = open("/proc/self/ns/mnt", O_CLOEXEC|O_RDONLY);
- if (fd < 0) {
- assert_se(errno == ENOENT);
- log_notice("Path %s not found, skipping test", "/proc/self/ns/mnt");
- return;
- }
- assert_se(fd >= 0);
- assert_se(IN_SET(fd_is_ns(fd, CLONE_NEWNET), 0, -EUCLEAN));
- fd = safe_close(fd);
-
- assert_se((fd = open("/proc/self/ns/ipc", O_CLOEXEC|O_RDONLY)) >= 0);
- assert_se(IN_SET(fd_is_ns(fd, CLONE_NEWIPC), 1, -EUCLEAN));
- fd = safe_close(fd);
-
- assert_se((fd = open("/proc/self/ns/net", O_CLOEXEC|O_RDONLY)) >= 0);
- assert_se(IN_SET(fd_is_ns(fd, CLONE_NEWNET), 1, -EUCLEAN));
-}
-
TEST(dir_is_empty) {
_cleanup_(rm_rf_physical_and_freep) char *empty_dir = NULL;
_cleanup_free_ char *j = NULL, *jj = NULL, *jjj = NULL;