#include "build-path.h"
#include "common-signal.h"
#include "env-util.h"
+#include "errno-util.h"
#include "event-util.h"
#include "fd-util.h"
#include "format-util.h"
#include "time-util.h"
#include "umask-util.h"
#include "unaligned.h"
-#include "user-util.h"
#include "userns-registry.h"
#include "userns-restrict.h"
return 0;
}
-static void manager_release_userns_bpf(Manager *m, uint64_t inode) {
-#if HAVE_VMLINUX_H
- int r;
-
+static struct userns_restrict_bpf *manager_bpf(Manager *m) { /* Returns the BPF handle to hand to the
+ * registry release helpers: m->userns_restrict_bpf when built with HAVE_VMLINUX_H, NULL otherwise. */
assert(m);
- if (inode == 0)
- return;
-
- assert(m->userns_restrict_bpf);
-
- r = userns_restrict_reset_by_inode(m->userns_restrict_bpf, inode);
- if (r < 0)
- return (void) log_warning_errno(r, "Failed to remove namespace inode from BPF map, ignoring: %m");
+#if HAVE_VMLINUX_H
+ return m->userns_restrict_bpf;
+#else
+ return NULL; /* HAVE_VMLINUX_H is unset: there is no handle to hand out. */
#endif
}
-static void manager_release_userns_fds(Manager *m, uint64_t inode) {
- int r;
-
+/* Releases the resources tied to a user namespace described by info. The caller must hold the
+ * registry lock if there is any chance of a concurrent writer (i.e. workers — true once the listen
+ * socket is open; not true during manager_startup() before that point).
+ *
+ * info must carry a non-zero userns_inode. When info->userns_id is set, the namespace is first probed
+ * by id, and nothing is released if it can still be opened. In every other case the release is
+ * delegated to userns_registry_release_by_info(), using the manager's BPF handle and registry fd. */
+static void manager_release_userns_by_info(Manager *m, UserNamespaceInfo *info) {
assert(m);
- assert(inode != 0);
+ assert(info);
+ assert(info->userns_inode != 0);
+
+ /* Before tearing anything down, confirm by namespace id that the namespace we're releasing is
+ * actually dead. The kernel may have recycled this inode for a freshly created live namespace
+ * (e.g. between a BPF death event firing and us getting here); proceeding in that case would
+ * clobber the new namespace's BPF allowlist, fdstore fd and registry entry.
+ *
+ * A failed probe means we go ahead with the release. -ESTALE, privilege errors and "not
+ * supported" errors are treated as expected and stay silent; any other probe error is logged
+ * as a warning before proceeding. */
+ if (info->userns_id != 0) {
+ _cleanup_close_ int probe_fd = namespace_open_by_id(info->userns_id);
+ if (probe_fd >= 0) {
+ log_warning("Refusing to release user namespace %" PRIu64 " (id %" PRIu64 "): the namespace is still alive.",
+ info->userns_inode, info->userns_id);
+ return;
+ }
+ if (probe_fd != -ESTALE &&
+ !ERRNO_IS_NEG_PRIVILEGE(probe_fd) &&
+ !ERRNO_IS_NEG_NOT_SUPPORTED(probe_fd))
+ log_warning_errno(probe_fd,
+ "Failed to probe liveness of user namespace %" PRIu64 " (id %" PRIu64 "), proceeding with release: %m",
+ info->userns_inode, info->userns_id);
+ }
- r = sd_notifyf(/* unset_environment= */ false,
- "FDSTOREREMOVE=1\n"
- "FDNAME=userns-%" PRIu64 "\n", inode);
- if (r < 0)
- log_warning_errno(r, "Failed to send fd store removal message, ignoring: %m");
+ userns_registry_release_by_info(manager_bpf(m), m->registry_fd, info);
}
static void manager_release_userns_by_inode(Manager *m, uint64_t inode) {
_cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
- _cleanup_close_ int lock_fd = -EBADF;
int r;
assert(m);
assert(inode != 0);
- lock_fd = userns_registry_lock(m->registry_fd);
- if (lock_fd < 0)
- return (void) log_error_errno(lock_fd, "Failed to lock registry: %m");
-
r = userns_registry_load_by_userns_inode(m->registry_fd, inode, &userns_info);
- if (r < 0)
- log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
- "Failed to find userns for inode %" PRIu64 ", ignoring: %m", inode);
-
- if (DEBUG_LOGGING) {
- if (userns_info && uid_is_valid(userns_info->start_uid))
- log_debug("Removing user namespace mapping %" PRIu64 " for UID " UID_FMT ".", inode, userns_info->start_uid);
- else
- log_debug("Removing user namespace mapping %" PRIu64 ".", inode);
- }
-
- /* Remove the BPF rules */
- manager_release_userns_bpf(m, inode);
-
- /* Remove the resources from the fdstore */
- manager_release_userns_fds(m, inode);
-
- /* And finally remove the resources file from disk */
- if (userns_info) {
- /* Remove the cgroups of this userns */
- r = userns_info_remove_cgroups(userns_info);
- if (r < 0)
- log_warning_errno(r, "Failed to remove cgroups of user namespace, ignoring: %m");
+ if (r >= 0)
+ return manager_release_userns_by_info(m, userns_info); /* entry loaded: release it via the liveness-checked path */
- /* Remove the netifs of this userns */
- r = userns_info_remove_netifs(userns_info);
- if (r < 0)
- log_warning_errno(r, "Failed to remove netifs of user namespace, ignoring: %m");
-
- r = userns_registry_remove(m->registry_fd, userns_info);
- if (r < 0)
- log_warning_errno(r, "Failed to remove user namespace '%s', ignoring.", userns_info->name);
- }
+ /* No registry entry to consult — fall through to inode-only cleanup of kernel resources.
+ * -ENOENT is logged at debug level, any other load error as a warning.
+ *
+ * NOTE(review): userns_registry_release_by_userns_inode() loads the registry entry a second
+ * time. If that second load succeeds, the entry is released without the liveness probe that
+ * manager_release_userns_by_info() performs — confirm this is acceptable. */
+ log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to load registry entry for user namespace %" PRIu64 ", proceeding with inode-only cleanup: %m", inode);
+ userns_registry_release_by_userns_inode(manager_bpf(m), m->registry_fd, inode);
}
static int manager_scan_registry(Manager *m, Set **registry_inodes) {
if ((size % sizeof(unsigned)) != 0) /* Not multiples of "unsigned"? */
return -EIO;
+ /* Workers are active alongside us once we're processing BPF events, so we have to serialize
+ * registry mutations against them. The startup-time release callers run before any worker
+ * exists and skip the lock. */
+ _cleanup_close_ int lock_fd = userns_registry_lock(m->registry_fd);
+ if (lock_fd < 0)
+ return log_error_errno(lock_fd, "Failed to lock registry: %m");
+
n = size / sizeof(unsigned);
for (size_t i = 0; i < n; i++) {
const void *d;
log_debug("Registry entry for user namespace %" PRIu64 " (id %" PRIu64 ") refers to a dead namespace, removing.",
inode, userns_info->userns_id);
- manager_release_userns_by_inode(m, inode);
+ manager_release_userns_by_info(m, userns_info);
}
r = manager_make_listen_socket(m);
return true;
}
+static int inode_slot_is_available(
+                int registry_dir_fd,
+                int userns_fd,
+                struct userns_restrict_bpf *bpf,
+                uint64_t inode) {
+
+        _cleanup_(userns_info_freep) UserNamespaceInfo *occupant = NULL;
+        int r;
+
+        assert(registry_dir_fd >= 0);
+        assert(userns_fd >= 0);
+        assert(inode != 0);
+
+        /* Decides whether the registry slot keyed by 'inode' may be claimed for userns_fd:
+         *
+         *   true  → no entry exists, or the entry belonged to a dead namespace whose inode the
+         *           kernel handed out again (that leftover is released here first)
+         *   false → the entry matches userns_fd, i.e. a live registration owns the slot
+         *   < 0   → loading or verifying the entry failed */
+
+        r = userns_registry_load_by_userns_inode(registry_dir_fd, inode, &occupant);
+        if (r < 0) {
+                if (r == -ENOENT)
+                        return true;
+
+                return log_debug_errno(r, "Failed to load existing registry entry: %m");
+        }
+
+        r = userns_info_verify_fd(userns_fd, occupant);
+        if (r == -ESTALE) {
+                /* Same inode, different namespace: the recorded one is gone, so drop its leftovers. */
+                log_debug("Inode %" PRIu64 " was reused by the kernel; cleaning up stale registry entry for namespace id %" PRIu64 ".",
+                          inode, occupant->userns_id);
+                userns_registry_release_by_info(bpf, registry_dir_fd, occupant);
+                return true;
+        }
+        if (r < 0)
+                return log_debug_errno(r, "Failed to verify user namespace identity: %m");
+
+        return false;
+}
+
static int allocate_one(
int registry_dir_fd,
const char *name,
int registry_dir_fd,
int userns_fd,
int parent_userns_fd,
+ struct userns_restrict_bpf *bpf,
UserNamespaceInfo *info,
int *ret_lock_fd) {
if (r >= USERNS_PER_UID)
return log_debug_errno(SYNTHETIC_ERRNO(EUSERS), "User already registered %i user namespaces, refusing.", r);
- r = userns_registry_inode_exists(registry_dir_fd, info->userns_inode);
+ r = inode_slot_is_available(registry_dir_fd, userns_fd, bpf, info->userns_inode);
if (r < 0)
return r;
- if (r > 0)
+ if (r == 0)
return -EDEADLK;
r = name_is_available(registry_dir_fd, info->name);
userns_info->n_delegates = p.delegate_container_ranges;
}
- r = allocate_now(registry_dir_fd, userns_fd, parent_userns_fd, userns_info, &lock_fd);
+ r = allocate_now(registry_dir_fd, userns_fd, parent_userns_fd, c->bpf, userns_info, &lock_fd);
if (r == -EHOSTDOWN) /* The needed UID range is not delegated to us */
return sd_varlink_error(link, "io.systemd.NamespaceResource.DynamicRangeUnavailable", NULL);
if (r == -EBUSY) /* All used up */
if (lock_fd < 0)
return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m");
- r = userns_registry_inode_exists(registry_dir_fd, userns_st.st_ino);
+ r = inode_slot_is_available(registry_dir_fd, userns_fd, c->bpf, userns_st.st_ino);
if (r < 0)
return r;
- if (r > 0)
+ if (r == 0)
return sd_varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceExists", NULL);
r = name_is_available(registry_dir_fd, userns_name);
if (r < 0)
return r;
+ r = userns_info_verify_fd(userns_fd, userns_info);
+ if (r == -ESTALE)
+ return sd_varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to verify user namespace identity: %m");
+
if (!c->bpf) {
r = userns_restrict_install(/* pin= */ true, &c->bpf);
if (r < 0)
if (r < 0)
return r;
+ r = userns_info_verify_fd(userns_fd, userns_info);
+ if (r == -ESTALE)
+ return sd_varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to verify user namespace identity: %m");
+
/* The user namespace must have a user assigned */
if (userns_info->size == 0)
return sd_varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceWithoutUserRange", NULL);
if (r < 0)
return r;
+ r = userns_info_verify_fd(userns_fd, userns_info);
+ if (r == -ESTALE)
+ return sd_varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to verify user namespace identity: %m");
+
if (strv_length(userns_info->netifs) > USER_NAMESPACE_NETIFS_DELEGATE_MAX)
return sd_varlink_error(link, "io.systemd.NamespaceResource.TooManyNetworkInterfaces", NULL);
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <linux/magic.h>
+#include <linux/nsfs.h>
+#include <sys/ioctl.h>
#include <unistd.h>
+#include "sd-daemon.h"
#include "sd-json.h"
#include "sd-netlink.h"
#include "uid-classification.h"
#include "user-util.h"
#include "userns-registry.h"
+#include "userns-restrict.h"
int userns_registry_open_fd(void) {
int fd;
return true;
}
-int userns_registry_inode_exists(int dir_fd, uint64_t inode) {
- _cleanup_free_ char *fn = NULL;
-
- assert(dir_fd >= 0);
-
- if (inode <= 0)
- return -EINVAL;
-
- if (asprintf(&fn, "i%" PRIu64 ".userns", inode) < 0)
- return -ENOMEM;
-
- if (faccessat(dir_fd, fn, F_OK, AT_SYMLINK_NOFOLLOW) < 0)
- return errno == ENOENT ? false : -errno;
-
- return true;
-}
-
int userns_registry_load_by_start_uid(int dir_fd, uid_t start, UserNamespaceInfo **ret) {
_cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
_cleanup_close_ int registry_fd = -EBADF;
return 0;
}
+/* Drops the resources that are keyed purely by namespace inode: the BPF map entry (skipped when no
+ * bpf handle is passed) and the "userns-<inode>" fd in the fdstore. Failures are only logged. */
+static void release_userns_inode_resources(struct userns_restrict_bpf *bpf, uint64_t inode) {
+        assert(inode != 0);
+
+        if (bpf) {
+                int reset = userns_restrict_reset_by_inode(bpf, inode);
+                if (reset < 0)
+                        log_warning_errno(reset, "Failed to remove namespace inode from BPF map, ignoring: %m");
+        }
+
+        int notified = sd_notifyf(/* unset_environment= */ false,
+                                  "FDSTOREREMOVE=1\n"
+                                  "FDNAME=userns-%" PRIu64 "\n", inode);
+        if (notified < 0)
+                log_warning_errno(notified, "Failed to send fd store removal message, ignoring: %m");
+}
+
+/* Releases everything recorded for the user namespace described by info: the inode-keyed BPF map
+ * entry (only when bpf is non-NULL) and fdstore fd, the namespace's cgroups and netifs, and finally
+ * the registry entry below dir_fd. Every step is best-effort: a failure is logged as a warning and
+ * the remaining steps still run. info must carry a non-zero userns_inode. */
+void userns_registry_release_by_info(struct userns_restrict_bpf *bpf, int dir_fd, UserNamespaceInfo *info) {
+        int r;
+
+        assert(dir_fd >= 0);
+        assert(info);
+        assert(info->userns_inode != 0);
+
+        if (DEBUG_LOGGING) {
+                if (uid_is_valid(info->start_uid))
+                        log_debug("Removing user namespace mapping %" PRIu64 " for UID " UID_FMT ".", info->userns_inode, info->start_uid);
+                else
+                        log_debug("Removing user namespace mapping %" PRIu64 ".", info->userns_inode);
+        }
+
+        /* Kernel-side, inode-keyed state goes first. */
+        release_userns_inode_resources(bpf, info->userns_inode);
+
+        r = userns_info_remove_cgroups(info);
+        if (r < 0)
+                log_warning_errno(r, "Failed to remove cgroups of user namespace, ignoring: %m");
+
+        r = userns_info_remove_netifs(info);
+        if (r < 0)
+                log_warning_errno(r, "Failed to remove netifs of user namespace, ignoring: %m");
+
+        /* The registry entry is dropped last. Include %m so the failure reason is actually reported,
+         * like in the messages above. */
+        r = userns_registry_remove(dir_fd, info);
+        if (r < 0)
+                log_warning_errno(r, "Failed to remove user namespace '%s', ignoring: %m", info->name);
+}
+
+/* Inode-keyed front end to userns_registry_release_by_info(): looks up the registry entry for
+ * 'inode' below dir_fd and releases it. When the entry cannot be loaded, only the inode-keyed
+ * resources are dropped. */
+void userns_registry_release_by_userns_inode(struct userns_restrict_bpf *bpf, int dir_fd, uint64_t inode) {
+        _cleanup_(userns_info_freep) UserNamespaceInfo *entry = NULL;
+        int r;
+
+        assert(dir_fd >= 0);
+        assert(inode != 0);
+
+        r = userns_registry_load_by_userns_inode(dir_fd, inode, &entry);
+        if (r < 0) {
+                log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+                               "Failed to find userns for inode %" PRIu64 ", ignoring: %m", inode);
+                log_debug("Removing user namespace mapping %" PRIu64 ".", inode);
+
+                /* Without a registry record the BPF map entry and the fdstore fd may still exist,
+                 * since both are keyed by inode alone; clean those up regardless. */
+                release_userns_inode_resources(bpf, inode);
+                return;
+        }
+
+        userns_registry_release_by_info(bpf, dir_fd, entry);
+}
+
+int userns_info_verify_fd(int userns_fd, const UserNamespaceInfo *info) {
+        uint64_t fd_id;
+
+        assert(userns_fd >= 0);
+        assert(info);
+
+        /* Checks that userns_fd is the very namespace recorded in info, not a later one that was
+         * merely handed the same inode after the recorded one went away.
+         *
+         *   0        → ids match, or the comparison is impossible (no id stored in info, or the
+         *              NS_GET_ID ioctl is not supported)
+         *   -ESTALE  → the ids differ
+         *   < 0      → any other ioctl failure, as negative errno */
+
+        if (info->userns_id == 0)
+                return 0;
+
+        if (ioctl(userns_fd, NS_GET_ID, &fd_id) < 0)
+                return ERRNO_IS_IOCTL_NOT_SUPPORTED(errno) ? 0 : -errno;
+
+        if (fd_id != info->userns_id)
+                return -ESTALE;
+
+        return 0;
+}
+
int userns_registry_load_by_name(int dir_fd, const char *name, UserNamespaceInfo **ret) {
_cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
_cleanup_close_ int registry_fd = -EBADF;
#include "shared-forward.h"
+struct userns_restrict_bpf;
+
#define USER_NAMESPACE_CGROUPS_DELEGATE_MAX 16U
#define USER_NAMESPACE_NETIFS_DELEGATE_MAX 16U
#define USER_NAMESPACE_DELEGATIONS_MAX 16U
int userns_registry_load_by_userns_inode(int dir_fd, uint64_t inode, UserNamespaceInfo **ret);
int userns_registry_load_by_name(int dir_fd, const char *name, UserNamespaceInfo **ret);
+int userns_info_verify_fd(int userns_fd, const UserNamespaceInfo *info);
+
+/* Releases all resources tied to a user namespace: removes BPF allowlist entries (if a bpf handle is
+ * given), drops the corresponding fd from systemd's fdstore, removes cgroups and netifs recorded for
+ * it, and unlinks the registry entry. The caller must already hold the registry lock (e.g. via
+ * userns_registry_lock()). The _by_userns_inode variant loads the registry entry itself; prefer the
+ * _by_info variant where the caller already has it. */
+void userns_registry_release_by_info(struct userns_restrict_bpf *bpf, int dir_fd, UserNamespaceInfo *info);
+void userns_registry_release_by_userns_inode(struct userns_restrict_bpf *bpf, int dir_fd, uint64_t inode);
+
int userns_registry_store(int dir_fd, UserNamespaceInfo *info);
int userns_registry_remove(int dir_fd, UserNamespaceInfo *info);
-int userns_registry_inode_exists(int dir_fd, uint64_t inode);
int userns_registry_name_exists(int dir_fd, const char *name);
int userns_registry_uid_exists(int dir_fd, uid_t start);
int userns_registry_gid_exists(int dir_fd, gid_t start);