From: Daan De Meyer Date: Thu, 21 May 2026 11:34:44 +0000 (+0000) Subject: nsresourced: Verify user namespace identity on registry lookup X-Git-Tag: v261-rc1~8^2~1 X-Git-Url: http://git.ipfire.org/gitweb/index.cgi?a=commitdiff_plain;h=01e6465b558570d2e33ebeb514b39cedde96014f;p=thirdparty%2Fsystemd.git nsresourced: Verify user namespace identity on registry lookup When a user namespace dies, the kernel can recycle its inode number for a freshly-created namespace before the old registry entry has been torn down. A subsequent registration or operation request can therefore find a stale registry entry keyed by the same inode that actually belongs to a different, now-dead user namespace. Use NS_GET_ID to compare the kernel-assigned namespace identifier against the stored one whenever we look up the registry from a live userns fd (AddMount/AddControlGroup/AddNetworkInterface, plus the two registration paths). Extract the release logic into userns_registry_release_by_info()/userns_registry_release_by_userns_inode() in userns-registry.c so nsresourcework can fully clean up stale entries (BPF allowlist, fdstore fd, cgroups, netifs, on-disk record) before reusing the slot, and remove the now-unused userns_registry_inode_exists(). 
Co-developed-by: Claude Opus 4.7 --- diff --git a/src/nsresourced/nsresourced-manager.c b/src/nsresourced/nsresourced-manager.c index c2a6cd6eab0..098bc18f8b7 100644 --- a/src/nsresourced/nsresourced-manager.c +++ b/src/nsresourced/nsresourced-manager.c @@ -13,6 +13,7 @@ #include "build-path.h" #include "common-signal.h" #include "env-util.h" +#include "errno-util.h" #include "event-util.h" #include "fd-util.h" #include "format-util.h" @@ -34,7 +35,6 @@ #include "time-util.h" #include "umask-util.h" #include "unaligned.h" -#include "user-util.h" #include "userns-registry.h" #include "userns-restrict.h" @@ -313,82 +313,61 @@ static int start_workers(Manager *m, bool explicit_request) { return 0; } -static void manager_release_userns_bpf(Manager *m, uint64_t inode) { -#if HAVE_VMLINUX_H - int r; - +static struct userns_restrict_bpf *manager_bpf(Manager *m) { assert(m); - if (inode == 0) - return; - - assert(m->userns_restrict_bpf); - - r = userns_restrict_reset_by_inode(m->userns_restrict_bpf, inode); - if (r < 0) - return (void) log_warning_errno(r, "Failed to remove namespace inode from BPF map, ignoring: %m"); +#if HAVE_VMLINUX_H + return m->userns_restrict_bpf; +#else + return NULL; #endif } -static void manager_release_userns_fds(Manager *m, uint64_t inode) { - int r; - +/* Releases the resources tied to a user namespace described by info. The caller must hold the + * registry lock if there is any chance of a concurrent writer (i.e. workers — true once the listen + * socket is open; not true during manager_startup() before that point). */ +static void manager_release_userns_by_info(Manager *m, UserNamespaceInfo *info) { assert(m); - assert(inode != 0); + assert(info); + assert(info->userns_inode != 0); + + /* Before tearing anything down, confirm by namespace id that the namespace we're releasing is + * actually dead. The kernel may have recycled this inode for a freshly created live namespace + * (e.g. 
between a BPF death event firing and us getting here); proceeding in that case would + * clobber the new namespace's BPF allowlist, fdstore fd and registry entry. */ + if (info->userns_id != 0) { + _cleanup_close_ int probe_fd = namespace_open_by_id(info->userns_id); + if (probe_fd >= 0) { + log_warning("Refusing to release user namespace %" PRIu64 " (id %" PRIu64 "): the namespace is still alive.", + info->userns_inode, info->userns_id); + return; + } + if (probe_fd != -ESTALE && + !ERRNO_IS_NEG_PRIVILEGE(probe_fd) && + !ERRNO_IS_NEG_NOT_SUPPORTED(probe_fd)) + log_warning_errno(probe_fd, + "Failed to probe liveness of user namespace %" PRIu64 " (id %" PRIu64 "), proceeding with release: %m", + info->userns_inode, info->userns_id); + } - r = sd_notifyf(/* unset_environment= */ false, - "FDSTOREREMOVE=1\n" - "FDNAME=userns-%" PRIu64 "\n", inode); - if (r < 0) - log_warning_errno(r, "Failed to send fd store removal message, ignoring: %m"); + userns_registry_release_by_info(manager_bpf(m), m->registry_fd, info); } static void manager_release_userns_by_inode(Manager *m, uint64_t inode) { _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; - _cleanup_close_ int lock_fd = -EBADF; int r; assert(m); assert(inode != 0); - lock_fd = userns_registry_lock(m->registry_fd); - if (lock_fd < 0) - return (void) log_error_errno(lock_fd, "Failed to lock registry: %m"); - r = userns_registry_load_by_userns_inode(m->registry_fd, inode, &userns_info); - if (r < 0) - log_full_errno(r == -ENOENT ? 
LOG_DEBUG : LOG_WARNING, r, - "Failed to find userns for inode %" PRIu64 ", ignoring: %m", inode); - - if (DEBUG_LOGGING) { - if (userns_info && uid_is_valid(userns_info->start_uid)) - log_debug("Removing user namespace mapping %" PRIu64 " for UID " UID_FMT ".", inode, userns_info->start_uid); - else - log_debug("Removing user namespace mapping %" PRIu64 ".", inode); - } - - /* Remove the BPF rules */ - manager_release_userns_bpf(m, inode); - - /* Remove the resources from the fdstore */ - manager_release_userns_fds(m, inode); - - /* And finally remove the resources file from disk */ - if (userns_info) { - /* Remove the cgroups of this userns */ - r = userns_info_remove_cgroups(userns_info); - if (r < 0) - log_warning_errno(r, "Failed to remove cgroups of user namespace, ignoring: %m"); + if (r >= 0) + return manager_release_userns_by_info(m, userns_info); - /* Remove the netifs of this userns */ - r = userns_info_remove_netifs(userns_info); - if (r < 0) - log_warning_errno(r, "Failed to remove netifs of user namespace, ignoring: %m"); - - r = userns_registry_remove(m->registry_fd, userns_info); - if (r < 0) - log_warning_errno(r, "Failed to remove user namespace '%s', ignoring.", userns_info->name); - } + /* No registry entry to consult — fall through to inode-only cleanup of kernel resources. */ + log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, + "Failed to load registry entry for user namespace %" PRIu64 ", proceeding with inode-only cleanup: %m", inode); + userns_registry_release_by_userns_inode(manager_bpf(m), m->registry_fd, inode); } static int manager_scan_registry(Manager *m, Set **registry_inodes) { @@ -540,6 +519,13 @@ static int ringbuf_event(void *userdata, void *data, size_t size) { if ((size % sizeof(unsigned)) != 0) /* Not multiples of "unsigned"? */ return -EIO; + /* Workers are active alongside us once we're processing BPF events, so we have to serialize + * registry mutations against them. 
The startup-time release callers run before any worker + * exists and skip the lock. */ + _cleanup_close_ int lock_fd = userns_registry_lock(m->registry_fd); + if (lock_fd < 0) + return log_error_errno(lock_fd, "Failed to lock registry: %m"); + n = size / sizeof(unsigned); for (size_t i = 0; i < n; i++) { const void *d; @@ -691,7 +677,7 @@ int manager_startup(Manager *m) { log_debug("Registry entry for user namespace %" PRIu64 " (id %" PRIu64 ") refers to a dead namespace, removing.", inode, userns_info->userns_id); - manager_release_userns_by_inode(m, inode); + manager_release_userns_by_info(m, userns_info); } r = manager_make_listen_socket(m); diff --git a/src/nsresourced/nsresourcework.c b/src/nsresourced/nsresourcework.c index a366e9421ab..3eeb88ad206 100644 --- a/src/nsresourced/nsresourcework.c +++ b/src/nsresourced/nsresourcework.c @@ -501,6 +501,41 @@ static int name_is_available( return true; } +static int inode_slot_is_available( + int registry_dir_fd, + int userns_fd, + struct userns_restrict_bpf *bpf, + uint64_t inode) { + + _cleanup_(userns_info_freep) UserNamespaceInfo *existing = NULL; + int r; + + assert(registry_dir_fd >= 0); + assert(userns_fd >= 0); + assert(inode != 0); + + /* Returns true if the registry has no entry for this inode (after cleaning up any stale + * leftover from a previously-registered namespace whose inode was recycled by the kernel), + * false if a live registration already occupies the slot, negative on error. 
*/ + + r = userns_registry_load_by_userns_inode(registry_dir_fd, inode, &existing); + if (r == -ENOENT) + return true; + if (r < 0) + return log_debug_errno(r, "Failed to load existing registry entry: %m"); + + r = userns_info_verify_fd(userns_fd, existing); + if (r >= 0) + return false; + if (r != -ESTALE) + return log_debug_errno(r, "Failed to verify user namespace identity: %m"); + + log_debug("Inode %" PRIu64 " was reused by the kernel; cleaning up stale registry entry for namespace id %" PRIu64 ".", + inode, existing->userns_id); + userns_registry_release_by_info(bpf, registry_dir_fd, existing); + return true; +} + static int allocate_one( int registry_dir_fd, const char *name, @@ -604,6 +639,7 @@ static int allocate_now( int registry_dir_fd, int userns_fd, int parent_userns_fd, + struct userns_restrict_bpf *bpf, UserNamespaceInfo *info, int *ret_lock_fd) { @@ -638,10 +674,10 @@ static int allocate_now( if (r >= USERNS_PER_UID) return log_debug_errno(SYNTHETIC_ERRNO(EUSERS), "User already registered %i user namespaces, refusing.", r); - r = userns_registry_inode_exists(registry_dir_fd, info->userns_inode); + r = inode_slot_is_available(registry_dir_fd, userns_fd, bpf, info->userns_inode); if (r < 0) return r; - if (r > 0) + if (r == 0) return -EDEADLK; r = name_is_available(registry_dir_fd, info->name); @@ -1342,7 +1378,7 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para userns_info->n_delegates = p.delegate_container_ranges; } - r = allocate_now(registry_dir_fd, userns_fd, parent_userns_fd, userns_info, &lock_fd); + r = allocate_now(registry_dir_fd, userns_fd, parent_userns_fd, c->bpf, userns_info, &lock_fd); if (r == -EHOSTDOWN) /* The needed UID range is not delegated to us */ return sd_varlink_error(link, "io.systemd.NamespaceResource.DynamicRangeUnavailable", NULL); if (r == -EBUSY) /* All used up */ @@ -1555,10 +1591,10 @@ static int vl_method_register_user_namespace(sd_varlink *link, sd_json_variant * if (lock_fd < 0) 
return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m"); - r = userns_registry_inode_exists(registry_dir_fd, userns_st.st_ino); + r = inode_slot_is_available(registry_dir_fd, userns_fd, c->bpf, userns_st.st_ino); if (r < 0) return r; - if (r > 0) + if (r == 0) return sd_varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceExists", NULL); r = name_is_available(registry_dir_fd, userns_name); @@ -1706,6 +1742,12 @@ static int vl_method_add_mount_to_user_namespace(sd_varlink *link, sd_json_varia if (r < 0) return r; + r = userns_info_verify_fd(userns_fd, userns_info); + if (r == -ESTALE) + return sd_varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL); + if (r < 0) + return log_debug_errno(r, "Failed to verify user namespace identity: %m"); + if (!c->bpf) { r = userns_restrict_install(/* pin= */ true, &c->bpf); if (r < 0) @@ -1858,6 +1900,12 @@ static int vl_method_add_cgroup_to_user_namespace(sd_varlink *link, sd_json_vari if (r < 0) return r; + r = userns_info_verify_fd(userns_fd, userns_info); + if (r == -ESTALE) + return sd_varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL); + if (r < 0) + return log_debug_errno(r, "Failed to verify user namespace identity: %m"); + /* The user namespace must have a user assigned */ if (userns_info->size == 0) return sd_varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceWithoutUserRange", NULL); @@ -2251,6 +2299,12 @@ static int vl_method_add_netif_to_user_namespace(sd_varlink *link, sd_json_varia if (r < 0) return r; + r = userns_info_verify_fd(userns_fd, userns_info); + if (r == -ESTALE) + return sd_varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL); + if (r < 0) + return log_debug_errno(r, "Failed to verify user namespace identity: %m"); + if (strv_length(userns_info->netifs) > USER_NAMESPACE_NETIFS_DELEGATE_MAX) return sd_varlink_error(link, 
"io.systemd.NamespaceResource.TooManyNetworkInterfaces", NULL); diff --git a/src/nsresourced/userns-registry.c b/src/nsresourced/userns-registry.c index 3a0dace7ca3..fa8fbcc9b33 100644 --- a/src/nsresourced/userns-registry.c +++ b/src/nsresourced/userns-registry.c @@ -1,8 +1,11 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ #include +#include +#include #include +#include "sd-daemon.h" #include "sd-json.h" #include "sd-netlink.h" @@ -23,6 +26,7 @@ #include "uid-classification.h" #include "user-util.h" #include "userns-registry.h" +#include "userns-restrict.h" int userns_registry_open_fd(void) { int fd; @@ -368,23 +372,6 @@ int userns_registry_name_exists(int dir_fd, const char *name) { return true; } -int userns_registry_inode_exists(int dir_fd, uint64_t inode) { - _cleanup_free_ char *fn = NULL; - - assert(dir_fd >= 0); - - if (inode <= 0) - return -EINVAL; - - if (asprintf(&fn, "i%" PRIu64 ".userns", inode) < 0) - return -ENOMEM; - - if (faccessat(dir_fd, fn, F_OK, AT_SYMLINK_NOFOLLOW) < 0) - return errno == ENOENT ? 
false : -errno; - - return true; -} - int userns_registry_load_by_start_uid(int dir_fd, uid_t start, UserNamespaceInfo **ret) { _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; _cleanup_close_ int registry_fd = -EBADF; @@ -484,6 +471,97 @@ int userns_registry_load_by_userns_inode(int dir_fd, uint64_t inode, UserNamespa return 0; } +static void release_userns_inode_resources(struct userns_restrict_bpf *bpf, uint64_t inode) { + int r; + + assert(inode != 0); + + if (bpf) { + r = userns_restrict_reset_by_inode(bpf, inode); + if (r < 0) + log_warning_errno(r, "Failed to remove namespace inode from BPF map, ignoring: %m"); + } + + r = sd_notifyf(/* unset_environment= */ false, + "FDSTOREREMOVE=1\n" + "FDNAME=userns-%" PRIu64 "\n", inode); + if (r < 0) + log_warning_errno(r, "Failed to send fd store removal message, ignoring: %m"); +} + +void userns_registry_release_by_info(struct userns_restrict_bpf *bpf, int dir_fd, UserNamespaceInfo *info) { + int r; + + assert(dir_fd >= 0); + assert(info); + assert(info->userns_inode != 0); + + if (DEBUG_LOGGING) { + if (uid_is_valid(info->start_uid)) + log_debug("Removing user namespace mapping %" PRIu64 " for UID " UID_FMT ".", info->userns_inode, info->start_uid); + else + log_debug("Removing user namespace mapping %" PRIu64 ".", info->userns_inode); + } + + release_userns_inode_resources(bpf, info->userns_inode); + + r = userns_info_remove_cgroups(info); + if (r < 0) + log_warning_errno(r, "Failed to remove cgroups of user namespace, ignoring: %m"); + + r = userns_info_remove_netifs(info); + if (r < 0) + log_warning_errno(r, "Failed to remove netifs of user namespace, ignoring: %m"); + + r = userns_registry_remove(dir_fd, info); + if (r < 0) + log_warning_errno(r, "Failed to remove user namespace '%s', ignoring.", info->name); +} + +void userns_registry_release_by_userns_inode(struct userns_restrict_bpf *bpf, int dir_fd, uint64_t inode) { + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + 
int r; + + assert(dir_fd >= 0); + assert(inode != 0); + + r = userns_registry_load_by_userns_inode(dir_fd, inode, &userns_info); + if (r >= 0) + return userns_registry_release_by_info(bpf, dir_fd, userns_info); + + log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, + "Failed to find userns for inode %" PRIu64 ", ignoring: %m", inode); + log_debug("Removing user namespace mapping %" PRIu64 ".", inode); + + /* No registry entry — still clean up the inode-keyed kernel resources (BPF map allowlist and + * fdstore fd), which can outlive a missing registry record. */ + release_userns_inode_resources(bpf, inode); +} + +int userns_info_verify_fd(int userns_fd, const UserNamespaceInfo *info) { + uint64_t live_id; + + assert(userns_fd >= 0); + assert(info); + + /* Verifies that userns_fd refers to the same user namespace described by info, distinguishing a + * live namespace from a different one that happens to have inherited the same inode after the + * original was destroyed. Returns 0 on match (also when the check cannot be performed because + * the stored or live id is unavailable on older kernels), -ESTALE on mismatch, or another + * negative errno on unexpected failure. */ + + if (info->userns_id == 0) + return 0; + + if (ioctl(userns_fd, NS_GET_ID, &live_id) < 0) { + if (ERRNO_IS_IOCTL_NOT_SUPPORTED(errno)) + return 0; + return -errno; + } + + return live_id == info->userns_id ? 
0 : -ESTALE; +} + int userns_registry_load_by_name(int dir_fd, const char *name, UserNamespaceInfo **ret) { _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; _cleanup_close_ int registry_fd = -EBADF; diff --git a/src/nsresourced/userns-registry.h b/src/nsresourced/userns-registry.h index 028d57e48cc..8bc0adee29a 100644 --- a/src/nsresourced/userns-registry.h +++ b/src/nsresourced/userns-registry.h @@ -3,6 +3,8 @@ #include "shared-forward.h" +struct userns_restrict_bpf; + #define USER_NAMESPACE_CGROUPS_DELEGATE_MAX 16U #define USER_NAMESPACE_NETIFS_DELEGATE_MAX 16U #define USER_NAMESPACE_DELEGATIONS_MAX 16U @@ -64,10 +66,19 @@ int userns_registry_load_by_start_gid(int dir_fd, gid_t start, UserNamespaceInfo int userns_registry_load_by_userns_inode(int dir_fd, uint64_t inode, UserNamespaceInfo **ret); int userns_registry_load_by_name(int dir_fd, const char *name, UserNamespaceInfo **ret); +int userns_info_verify_fd(int userns_fd, const UserNamespaceInfo *info); + +/* Releases all resources tied to a user namespace: removes BPF allowlist entries (if a bpf handle is + * given), drops the corresponding fd from systemd's fdstore, removes cgroups and netifs recorded for + * it, and unlinks the registry entry. The caller must already hold the registry lock (e.g. via + * userns_registry_lock()). The _by_inode variant loads the registry entry; prefer the _by_info + * variant where the caller already has it. 
*/ +void userns_registry_release_by_info(struct userns_restrict_bpf *bpf, int dir_fd, UserNamespaceInfo *info); +void userns_registry_release_by_userns_inode(struct userns_restrict_bpf *bpf, int dir_fd, uint64_t inode); + int userns_registry_store(int dir_fd, UserNamespaceInfo *info); int userns_registry_remove(int dir_fd, UserNamespaceInfo *info); -int userns_registry_inode_exists(int dir_fd, uint64_t inode); int userns_registry_name_exists(int dir_fd, const char *name); int userns_registry_uid_exists(int dir_fd, uid_t start); int userns_registry_gid_exists(int dir_fd, gid_t start);