#include "stat-util.h"
#include "stdio-util.h"
#include "uid-range.h"
+#include "unaligned.h"
#include "user-util.h"
const struct namespace_info namespace_info[_NAMESPACE_TYPE_MAX + 1] = {
}
}
+int namespace_open_by_id(uint64_t ns_id) {
+        int q;
+
+        /* Opens the namespace that carries the kernel-assigned unique identifier 'ns_id' and hands
+         * back an O_PATH file descriptor for it (negative errno-style value on failure).
+         *
+         * Kernel requirement: documented as kernel ≥ 6.13. NOTE(review): the nsfs file handle
+         * uapi referenced further down is described as v6.18 — confirm the real minimum version.
+         *
+         * Error contract:
+         *   -EINVAL  'ns_id' is zero.
+         *   -EPERM   we are not in the initial user namespace, not in the initial pid namespace,
+         *            or lack effective CAP_SYS_ADMIN. This is decided here, before the kernel is
+         *            asked, so that callers can skip their check.
+         *   -ESTALE  the namespace is gone, or the kernel refused the lookup for permission
+         *            reasons.
+         * Failures of the namespace/capability probes are passed through unchanged.
+         *
+         * Why the early -EPERM: the kernel only allows open_by_handle_at() on nsfs for callers in
+         * the initial user and pid namespaces that hold CAP_SYS_ADMIN (with a narrow exception for
+         * the caller's own user namespace and its ancestors). Everyone else gets -ESTALE, which
+         * would be indistinguishable from "namespace is dead". */
+
+        if (ns_id == 0)
+                return -EINVAL;
+
+        q = namespace_is_init(NAMESPACE_USER);
+        if (q <= 0)
+                return q < 0 ? q : -EPERM;
+
+        q = namespace_is_init(NAMESPACE_PID);
+        if (q <= 0)
+                return q < 0 ? q : -EPERM;
+
+        q = have_effective_cap(CAP_SYS_ADMIN);
+        if (q <= 0)
+                return q < 0 ? q : -EPERM;
+
+        /* The handle is deliberately NOT set up with a designated initializer such as
+         *
+         *     union { ... } handle = {
+         *             .file_handle.handle_bytes = sizeof(struct nsfs_file_handle),
+         *             .file_handle.handle_type = FILEID_NSFS,
+         *     };
+         *
+         * because that only covers the named members of struct file_handle. The payload lives in
+         * the trailing flexible array `f_handle[]`, whose backing storage is the overlapping
+         * `space[]` member; a partial struct initializer would leave those bytes as stack garbage.
+         * Hence: zero the whole union, then assign the individual fields. */
+
+        union {
+                struct file_handle file_handle;
+                uint8_t space[offsetof(struct file_handle, f_handle) + sizeof(struct nsfs_file_handle)];
+        } handle = {};
+
+        handle.file_handle.handle_type = FILEID_NSFS;
+        handle.file_handle.handle_bytes = sizeof(struct nsfs_file_handle);
+
+        /* struct nsfs_file_handle (see <linux/nsfs.h>, uapi since kernel v6.18) begins with the
+         * 8 byte __u64 ns_id. The ns_type/ns_inum fields behind it are left at zero, so that the
+         * kernel looks the namespace up by id alone; that id-only lookup became an explicit ABI
+         * guarantee in v6.19 with commit 04173501a69e ("nstree: allow lookup solely based on
+         * inode"). */
+        unaligned_write_ne64(handle.file_handle.f_handle, ns_id);
+
+        int fd = open_by_handle_at(FD_NSFS_ROOT, &handle.file_handle, O_PATH|O_CLOEXEC);
+        return RET_NERRNO(fd);
+}
+
int is_idmapping_supported(const char *path) {
_cleanup_close_ int mount_fd = -EBADF, userns_fd = -EBADF, dir_fd = -EBADF;
int r;
int userns_get_base_uid(int userns_fd, uid_t *ret_uid, gid_t *ret_gid);
+int namespace_open_by_id(uint64_t ns_id);
+
int process_is_owned_by_uid(const PidRef *pidref, uid_t uid);
int is_idmapping_supported(const char *path);
#define AT_HANDLE_MNT_ID_UNIQUE 0x001 /* Return the u64 unique mount ID. */
#endif
+#ifndef FD_NSFS_ROOT
+#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem; fallback used only when no included header defines it */
+#endif
+
/* Defined since glibc-2.42.
* Supported since kernel v5.6 (fddb5d430ad9fa91b49b1d34d0202ffe2fa0e179). */
int openat2_shim(int dfd, const char *filename, const struct open_how *how, size_t usize);
#define PROC_PID_INIT_INO ((ino_t) UINT32_C(0xEFFFFFFC))
#define PROC_CGROUP_INIT_INO ((ino_t) UINT32_C(0xEFFFFFFB))
#define PROC_TIME_INIT_INO ((ino_t) UINT32_C(0xEFFFFFFA))
+
+/* From kernel-internal include/linux/exportfs.h, not part of uapi. Only defined here when no
+ * included header provides it already. */
+#ifndef FILEID_NSFS
+#define FILEID_NSFS 0xf1
+#endif
#include "fs-util.h"
#include "log.h"
#include "mkdir.h"
+#include "namespace-util.h"
#include "nsresourced-manager.h"
#include "parse-util.h"
#include "pidfd-util.h"
manager_release_userns_by_inode(m, inode);
}
+ /* Look for registry entries whose user namespace has died without us getting a BPF
+ * notification — e.g. because the BPF ring buffer overflowed, the kprobe is missing, or
+ * something else dropped the fd store entry without going through our cleanup path. Each
+ * registry entry stores the kernel's unique namespace identifier; ask the kernel to open
+ * the namespace by that identifier and release the entry if the lookup fails. Entries
+ * written by older versions don't carry the identifier, and old kernels (or running
+ * outside the initial user namespace) don't support lookup by it — in those cases we leave
+ * the entry alone. */
+
+ SET_FOREACH(p, registry_inodes) {
+ uint64_t inode = PTR_TO_UINT32(p);
+
+ _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+ r = userns_registry_load_by_userns_inode(m->registry_fd, inode, &userns_info);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to load registry entry for user namespace %" PRIu64 ", ignoring: %m", inode);
+ continue;
+ }
+
+ if (userns_info->userns_id == 0)
+ continue; /* Entry predates ns_id tracking, can't probe authoritatively */
+
+ _cleanup_close_ int probe_fd = namespace_open_by_id(userns_info->userns_id);
+ if (probe_fd >= 0)
+ continue; /* User namespace is still alive */
+ /* EPERM/EACCES means we're not in the initial user/pid namespace or missing
+ * CAP_SYS_ADMIN; ENOTSUP/ENOSYS means the kernel is too old for
+ * open_by_handle_at() on nsfs. Either way the sweep can't proceed for any
+ * entry, so bail out rather than logging once per entry. */
+ if (ERRNO_IS_NEG_PRIVILEGE(probe_fd) || ERRNO_IS_NEG_NOT_SUPPORTED(probe_fd)) {
+ log_debug_errno(probe_fd, "Cannot detect stale registry entries, skipping: %m");
+ break;
+ }
+ /* Anything else except ESTALE is unexpected — log it but skip just this one. */
+ if (probe_fd != -ESTALE) {
+ log_debug_errno(probe_fd, "Failed to probe liveness of user namespace %" PRIu64 " (id %" PRIu64 "), ignoring: %m",
+ inode, userns_info->userns_id);
+ continue;
+ }
+
+ log_debug("Registry entry for user namespace %" PRIu64 " (id %" PRIu64 ") refers to a dead namespace, removing.",
+ inode, userns_info->userns_id);
+ manager_release_userns_by_inode(m, inode);
+ }
+
r = manager_make_listen_socket(m);
if (r < 0)
return r;
userns_info->owner = peer_uid;
userns_info->userns_inode = userns_st.st_ino;
+ if (ioctl(userns_fd, NS_GET_ID, &userns_info->userns_id) < 0)
+ log_debug_errno(errno, "Failed to query userns ID, ignoring: %m");
userns_info->size = p.size;
userns_info->target_uid = p.target;
userns_info->target_gid = (gid_t) p.target;
userns_info->owner = peer_uid;
userns_info->userns_inode = userns_st.st_ino;
+ if (ioctl(userns_fd, NS_GET_ID, &userns_info->userns_id) < 0)
+ log_debug_errno(errno, "Failed to query userns ID, ignoring: %m");
r = userns_registry_store(registry_dir_fd, userns_info);
if (r < 0)
{ "owner", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(UserNamespaceInfo, owner), SD_JSON_MANDATORY },
{ "name", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(UserNamespaceInfo, name), SD_JSON_MANDATORY },
{ "userns", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint64, offsetof(UserNamespaceInfo, userns_inode), SD_JSON_MANDATORY },
+ { "usernsId", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint64, offsetof(UserNamespaceInfo, userns_id), 0 },
{ "size", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint32, offsetof(UserNamespaceInfo, size), 0 },
{ "start", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(UserNamespaceInfo, start_uid), 0 },
{ "target", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(UserNamespaceInfo, target_uid), 0 },
SD_JSON_BUILD_PAIR_UNSIGNED("owner", info->owner),
SD_JSON_BUILD_PAIR_STRING("name", info->name),
SD_JSON_BUILD_PAIR_UNSIGNED("userns", info->userns_inode),
+ SD_JSON_BUILD_PAIR_CONDITION(info->userns_id != 0, "usernsId", SD_JSON_BUILD_UNSIGNED(info->userns_id)),
SD_JSON_BUILD_PAIR_CONDITION(info->size > 0, "size", SD_JSON_BUILD_UNSIGNED(info->size)),
SD_JSON_BUILD_PAIR_CONDITION(uid_is_valid(info->start_uid), "start", SD_JSON_BUILD_UNSIGNED(info->start_uid)),
SD_JSON_BUILD_PAIR_CONDITION(uid_is_valid(info->target_uid), "target", SD_JSON_BUILD_UNSIGNED(info->target_uid)),
uid_t owner;
char *name;
uint64_t userns_inode;
+ uint64_t userns_id; /* Unique namespace identifier from NS_GET_ID, 0 if unavailable */
uint32_t size;
uid_t start_uid;
uid_t target_uid;
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <fcntl.h>
+#include <linux/nsfs.h>
#include <sched.h>
#include <stdlib.h>
+#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
ASSERT_ERROR(userns_get_base_uid(fd, &base_uid, &base_gid), ENOMSG);
}
+TEST(namespace_open_by_id) {
+        /* Step 1: query the identifier of the user namespace we are running in. If the ioctl is
+         * not available there is nothing to test. */
+        _cleanup_close_ int self_fd = ASSERT_OK_ERRNO(open("/proc/self/ns/user", O_RDONLY|O_CLOEXEC));
+
+        uint64_t self_id;
+        int q = RET_NERRNO(ioctl(self_fd, NS_GET_ID, &self_id));
+        if (ERRNO_IS_NEG_NOT_SUPPORTED(q))
+                return (void) log_tests_skipped("NS_GET_ID is not supported by this kernel");
+        ASSERT_OK(q);
+
+        /* Step 2: look our own namespace up again by that identifier. -EPERM is what
+         * namespace_open_by_id() reports when we are outside the initial user/pid namespace or
+         * lack CAP_SYS_ADMIN; -EOPNOTSUPP/-EINVAL are treated as "kernel cannot do this". All of
+         * these are grounds for skipping rather than failing. */
+        _cleanup_close_ int reopened = namespace_open_by_id(self_id);
+        if (reopened == -EPERM)
+                return (void) log_tests_skipped("not in initial user namespace or missing CAP_SYS_ADMIN");
+        if (IN_SET(reopened, -EOPNOTSUPP, -EINVAL))
+                return (void) log_tests_skipped("nsfs lookup by ns_id is not supported by this kernel");
+        ASSERT_OK(reopened);
+
+        /* Both descriptors must refer to the same nsfs inode. */
+        struct stat st_expected, st_actual;
+        ASSERT_OK_ERRNO(fstat(self_fd, &st_expected));
+        ASSERT_OK_ERRNO(fstat(reopened, &st_actual));
+        ASSERT_EQ(st_expected.st_ino, st_actual.st_ino);
+
+        reopened = safe_close(reopened);
+
+        /* Step 3: the zero identifier is rejected outright. */
+        ASSERT_ERROR(namespace_open_by_id(0), EINVAL);
+
+        /* Step 4: repeat the lookup with a freshly created user namespace that is kept alive
+         * solely by the fd we hold. */
+        _cleanup_close_ int fresh_fd = userns_acquire_empty();
+        if (ERRNO_IS_NEG_NOT_SUPPORTED(fresh_fd) || ERRNO_IS_NEG_PRIVILEGE(fresh_fd))
+                return (void) log_tests_skipped("cannot acquire userns for transient lookup test");
+        ASSERT_OK(fresh_fd);
+
+        uint64_t fresh_id;
+        ASSERT_OK_ERRNO(ioctl(fresh_fd, NS_GET_ID, &fresh_id));
+        ASSERT_NE(fresh_id, self_id);
+
+        reopened = ASSERT_OK(namespace_open_by_id(fresh_id));
+
+        ASSERT_OK_ERRNO(fstat(fresh_fd, &st_expected));
+        ASSERT_OK_ERRNO(fstat(reopened, &st_actual));
+        ASSERT_EQ(st_expected.st_ino, st_actual.st_ino);
+        reopened = safe_close(reopened);
+
+        /* Step 5: drop the last reference. With the namespace gone the lookup has to fail. */
+        fresh_fd = safe_close(fresh_fd);
+        ASSERT_ERROR(namespace_open_by_id(fresh_id), ESTALE);
+}
+
+
TEST(process_is_owned_by_uid) {
int r;