nsresourced: detect and clean up registry entries for dead user namespaces

author Daan De Meyer <daan@amutable.com>

Wed, 13 May 2026 10:54:02 +0000 (12:54 +0200)

committer Daan De Meyer <daan@amutable.com>

Fri, 15 May 2026 18:05:21 +0000 (18:05 +0000)
author Daan De Meyer <daan@amutable.com>
Wed, 13 May 2026 10:54:02 +0000 (12:54 +0200)
committer Daan De Meyer <daan@amutable.com>
Fri, 15 May 2026 18:05:21 +0000 (18:05 +0000)
diff --git a/src/basic/namespace-util.c b/src/basic/namespace-util.c

index 3f355e082f7595079043eed032bcba80e6eefb64..fff518ef96957efe88def641732a1daf63fe8b11 100644 (file)
--- a/src/basic/namespace-util.c
+++ b/src/basic/namespace-util.c
@@ -23,6 +23,7 @@
  #include "stat-util.h"
  #include "stdio-util.h"
  #include "uid-range.h"
+#include "unaligned.h"
  #include "user-util.h"
  
  const struct namespace_info namespace_info[_NAMESPACE_TYPE_MAX + 1] = {
@@ -860,6 +861,70 @@ int process_is_owned_by_uid(const PidRef *pidref, uid_t uid) {
          }
  }
  
+int namespace_open_by_id(uint64_t ns_id) {
+        int r;
+
+        /* Opens the namespace that carries the kernel's unique namespace identifier 'ns_id' and
+         * returns an O_PATH file descriptor for it, or a negative errno-style value on failure.
+         *
+         * Requires kernel ≥ 6.13 (NOTE(review): the handle layout used below is described as uapi
+         * only since v6.18 — confirm the actual minimum version).
+         *
+         * Error contract:
+         *
+         *   -EINVAL  'ns_id' is 0.
+         *
+         *   -EPERM   We are not in the initial user namespace, not in the initial PID namespace, or
+         *            lack CAP_SYS_ADMIN. The kernel permits open_by_handle_at() on nsfs only under
+         *            these conditions (with a narrow exception for the caller's own user namespace
+         *            and its ancestors) and otherwise answers with -ESTALE — the very same error it
+         *            uses for a namespace that is gone. Refusing up front keeps "namespace is dead"
+         *            and "kernel refused us" apart, so that callers can simply skip their check.
+         *
+         *   -ESTALE  The namespace no longer exists.
+         *
+         * Failures of the precondition checks themselves are passed through unmodified. */
+
+        if (ns_id == 0)
+                return -EINVAL;
+
+        /* Each check returns > 0 if satisfied, 0 if not, < 0 if it could not be determined. */
+        r = namespace_is_init(NAMESPACE_USER);
+        if (r <= 0)
+                return r < 0 ? r : -EPERM;
+
+        r = namespace_is_init(NAMESPACE_PID);
+        if (r <= 0)
+                return r < 0 ? r : -EPERM;
+
+        r = have_effective_cap(CAP_SYS_ADMIN);
+        if (r <= 0)
+                return r < 0 ? r : -EPERM;
+
+        /* struct file_handle ends in a flexible array member (`unsigned char f_handle[]`), so the
+         * payload bytes live in the overlapping `space[]` member of the union. A designated
+         * initializer naming only .file_handle.handle_bytes/.handle_type would zero just the named
+         * struct members and leave the payload bytes as stack garbage. Hence: zero the whole union
+         * with an empty initializer first, and assign the header fields afterwards. */
+        union {
+                struct file_handle file_handle;
+                uint8_t space[offsetof(struct file_handle, f_handle) + sizeof(struct nsfs_file_handle)];
+        } handle_buf = {};
+
+        handle_buf.file_handle.handle_type = FILEID_NSFS;
+        handle_buf.file_handle.handle_bytes = sizeof(struct nsfs_file_handle);
+
+        /* struct nsfs_file_handle (see <linux/nsfs.h>, uapi since kernel v6.18) begins with a __u64
+         * ns_id, i.e. its first 8 bytes. We fill in only that and leave ns_type/ns_inum zeroed, so
+         * that the kernel looks the namespace up by id alone. Lookup by id only became an explicit
+         * ABI guarantee in v6.19 via commit 04173501a69e ("nstree: allow lookup solely based on
+         * inode"). */
+        unaligned_write_ne64(handle_buf.file_handle.f_handle, ns_id);
+
+        int fd = open_by_handle_at(FD_NSFS_ROOT, &handle_buf.file_handle, O_PATH|O_CLOEXEC);
+        return RET_NERRNO(fd);
+}
+
  int is_idmapping_supported(const char *path) {
          _cleanup_close_ int mount_fd = -EBADF, userns_fd = -EBADF, dir_fd = -EBADF;
          int r;
diff --git a/src/basic/namespace-util.h b/src/basic/namespace-util.h

index cd2ea786927c3cc1ca30430570025fa626a8cf04..3bfa34371c0d22b30f85f22ebccf735648555a1c 100644 (file)
--- a/src/basic/namespace-util.h
+++ b/src/basic/namespace-util.h
@@ -88,6 +88,8 @@ bool userns_supported(void);
  
  int userns_get_base_uid(int userns_fd, uid_t *ret_uid, gid_t *ret_gid);
  
+int namespace_open_by_id(uint64_t ns_id);
+
  int process_is_owned_by_uid(const PidRef *pidref, uid_t uid);
  
  int is_idmapping_supported(const char *path);
diff --git a/src/include/override/fcntl.h b/src/include/override/fcntl.h

index 875f112b009d1d2739a01d85fbf2086ed581c7aa..bf42009022db04541892bf98c782df34fe5b1d13 100644 (file)
--- a/src/include/override/fcntl.h
+++ b/src/include/override/fcntl.h
@@ -25,6 +25,10 @@
  #define AT_HANDLE_MNT_ID_UNIQUE 0x001  /* Return the u64 unique mount ID. */
  #endif
  
+#ifndef FD_NSFS_ROOT
+#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */
+#endif
+
  /* Defined since glibc-2.42.
   * Supported since kernel v5.6 (fddb5d430ad9fa91b49b1d34d0202ffe2fa0e179). */
  int openat2_shim(int dfd, const char *filename, const struct open_how *how, size_t usize);
diff --git a/src/include/override/linux/nsfs.h b/src/include/override/linux/nsfs.h

index a256df1c6f9fa801ccc0fa642ab079ff95df6c98..163333d362843d543115a1fa4abdb076751f6884 100644 (file)
--- a/src/include/override/linux/nsfs.h
+++ b/src/include/override/linux/nsfs.h
@@ -12,3 +12,8 @@
  #define PROC_PID_INIT_INO    ((ino_t) UINT32_C(0xEFFFFFFC))
  #define PROC_CGROUP_INIT_INO ((ino_t) UINT32_C(0xEFFFFFFB))
  #define PROC_TIME_INIT_INO   ((ino_t) UINT32_C(0xEFFFFFFA))
+
+/* From kernel-internal include/linux/exportfs.h, not part of uapi. */
+#ifndef FILEID_NSFS
+#define FILEID_NSFS 0xf1
+#endif
diff --git a/src/nsresourced/nsresourced-manager.c b/src/nsresourced/nsresourced-manager.c

index cceaa9c378e7449016641d21c2668cde5434d32b..406db72e7d72a30396c3b407064ecfd0c90acdd4 100644 (file)
--- a/src/nsresourced/nsresourced-manager.c
+++ b/src/nsresourced/nsresourced-manager.c
@@ -19,6 +19,7 @@
  #include "fs-util.h"
  #include "log.h"
  #include "mkdir.h"
+#include "namespace-util.h"
  #include "nsresourced-manager.h"
  #include "parse-util.h"
  #include "pidfd-util.h"
@@ -648,6 +649,51 @@ int manager_startup(Manager *m) {
                  manager_release_userns_by_inode(m, inode);
          }
  
+        /* Look for registry entries whose user namespace has died without us getting a BPF
+         * notification — e.g. because the BPF ring buffer overflowed, the kprobe is missing, or
+         * something else dropped the fd store entry without going through our cleanup path. Each
+         * registry entry stores the kernel's unique namespace identifier; ask the kernel to open
+         * the namespace by that identifier and release the entry if the lookup fails. Entries
+         * written by older versions don't carry the identifier, and old kernels (or running
+         * outside the initial user namespace) don't support lookup by it — in those cases we leave
+         * the entry alone. */
+
+        SET_FOREACH(p, registry_inodes) {
+                uint64_t inode = PTR_TO_UINT32(p);
+
+                _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+                r = userns_registry_load_by_userns_inode(m->registry_fd, inode, &userns_info);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to load registry entry for user namespace %" PRIu64 ", ignoring: %m", inode);
+                        continue;
+                }
+
+                if (userns_info->userns_id == 0)
+                        continue; /* Entry predates ns_id tracking, can't probe authoritatively */
+
+                _cleanup_close_ int probe_fd = namespace_open_by_id(userns_info->userns_id);
+                if (probe_fd >= 0)
+                        continue; /* User namespace is still alive */
+                /* EPERM/EACCES means we're not in the initial user/pid namespace or missing
+                 * CAP_SYS_ADMIN; ENOTSUP/ENOSYS means the kernel is too old for
+                 * open_by_handle_at() on nsfs. Either way the sweep can't proceed for any
+                 * entry, so bail out rather than logging once per entry. */
+                if (ERRNO_IS_NEG_PRIVILEGE(probe_fd) || ERRNO_IS_NEG_NOT_SUPPORTED(probe_fd)) {
+                        log_debug_errno(probe_fd, "Cannot detect stale registry entries, skipping: %m");
+                        break;
+                }
+                /* Anything else except ESTALE is unexpected — log it but skip just this one. */
+                if (probe_fd != -ESTALE) {
+                        log_debug_errno(probe_fd, "Failed to probe liveness of user namespace %" PRIu64 " (id %" PRIu64 "), ignoring: %m",
+                                        inode, userns_info->userns_id);
+                        continue;
+                }
+
+                log_debug("Registry entry for user namespace %" PRIu64 " (id %" PRIu64 ") refers to a dead namespace, removing.",
+                          inode, userns_info->userns_id);
+                manager_release_userns_by_inode(m, inode);
+        }
+
          r = manager_make_listen_socket(m);
          if (r < 0)
                  return r;
diff --git a/src/nsresourced/nsresourcework.c b/src/nsresourced/nsresourcework.c

index 91b3645809a9242f3016d9e4eb009c43c836e7d3..a366e9421ab938915812b2bfde1b38ee40676b4a 100644 (file)
--- a/src/nsresourced/nsresourcework.c
+++ b/src/nsresourced/nsresourcework.c
@@ -1295,6 +1295,8 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para
  
          userns_info->owner = peer_uid;
          userns_info->userns_inode = userns_st.st_ino;
+        if (ioctl(userns_fd, NS_GET_ID, &userns_info->userns_id) < 0)
+                log_debug_errno(errno, "Failed to query userns ID, ignoring: %m");
          userns_info->size = p.size;
          userns_info->target_uid = p.target;
          userns_info->target_gid = (gid_t) p.target;
@@ -1575,6 +1577,8 @@ static int vl_method_register_user_namespace(sd_varlink *link, sd_json_variant *
  
          userns_info->owner = peer_uid;
          userns_info->userns_inode = userns_st.st_ino;
+        if (ioctl(userns_fd, NS_GET_ID, &userns_info->userns_id) < 0)
+                log_debug_errno(errno, "Failed to query userns ID, ignoring: %m");
  
          r = userns_registry_store(registry_dir_fd, userns_info);
          if (r < 0)
diff --git a/src/nsresourced/userns-registry.c b/src/nsresourced/userns-registry.c

index a9e3f82e59c22643bfd8c696d6a9c27ecd1b4629..3a0dace7ca3da5e9e948d7e45c2c0ff8df8c4772 100644 (file)
--- a/src/nsresourced/userns-registry.c
+++ b/src/nsresourced/userns-registry.c
@@ -239,6 +239,7 @@ static int userns_registry_load(int dir_fd, const char *fn, UserNamespaceInfo **
                  { "owner",     SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid,  offsetof(UserNamespaceInfo, owner),        SD_JSON_MANDATORY },
                  { "name",      SD_JSON_VARIANT_STRING,   sd_json_dispatch_string,   offsetof(UserNamespaceInfo, name),         SD_JSON_MANDATORY },
                  { "userns",    SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint64,   offsetof(UserNamespaceInfo, userns_inode), SD_JSON_MANDATORY },
+                { "usernsId",  SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint64,   offsetof(UserNamespaceInfo, userns_id),    0                 },
                  { "size",      SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint32,   offsetof(UserNamespaceInfo, size),         0                 },
                  { "start",     SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid,  offsetof(UserNamespaceInfo, start_uid),    0                 },
                  { "target",    SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid,  offsetof(UserNamespaceInfo, target_uid),   0                 },
@@ -565,6 +566,7 @@ int userns_registry_store(int dir_fd, UserNamespaceInfo *info) {
                          SD_JSON_BUILD_PAIR_UNSIGNED("owner", info->owner),
                          SD_JSON_BUILD_PAIR_STRING("name", info->name),
                          SD_JSON_BUILD_PAIR_UNSIGNED("userns", info->userns_inode),
+                        SD_JSON_BUILD_PAIR_CONDITION(info->userns_id != 0, "usernsId", SD_JSON_BUILD_UNSIGNED(info->userns_id)),
                          SD_JSON_BUILD_PAIR_CONDITION(info->size > 0, "size", SD_JSON_BUILD_UNSIGNED(info->size)),
                          SD_JSON_BUILD_PAIR_CONDITION(uid_is_valid(info->start_uid), "start", SD_JSON_BUILD_UNSIGNED(info->start_uid)),
                          SD_JSON_BUILD_PAIR_CONDITION(uid_is_valid(info->target_uid), "target", SD_JSON_BUILD_UNSIGNED(info->target_uid)),
diff --git a/src/nsresourced/userns-registry.h b/src/nsresourced/userns-registry.h

index 77ff2d6d20760ecfa26aae2678e141b1df4f5060..028d57e48ccca7431fd8667e0d3db0a0b058065e 100644 (file)
--- a/src/nsresourced/userns-registry.h
+++ b/src/nsresourced/userns-registry.h
@@ -29,6 +29,7 @@ typedef struct UserNamespaceInfo {
          uid_t owner;
          char *name;
          uint64_t userns_inode;
+        uint64_t userns_id; /* Unique namespace identifier from NS_GET_ID, 0 if unavailable */
          uint32_t size;
          uid_t start_uid;
          uid_t target_uid;
diff --git a/src/test/test-namespace.c b/src/test/test-namespace.c

index 7497596cc56fc0c86912866a71f5e6bfaa143234..96272bc7623d4051bba593be30c175e16a6ec3b5 100644 (file)
--- a/src/test/test-namespace.c
+++ b/src/test/test-namespace.c
@@ -1,8 +1,10 @@
  /* SPDX-License-Identifier: LGPL-2.1-or-later */
  
  #include <fcntl.h>
+#include <linux/nsfs.h>
  #include <sched.h>
  #include <stdlib.h>
+#include <sys/ioctl.h>
  #include <sys/prctl.h>
  #include <sys/socket.h>
  #include <sys/stat.h>
@@ -282,6 +284,57 @@ TEST(userns_get_base_uid) {
          ASSERT_ERROR(userns_get_base_uid(fd, &base_uid, &base_gid), ENOMSG);
  }
  
+TEST(namespace_open_by_id) {
+        int r;
+
+        /* Step 1: use our own user namespace to find out whether the kernel hands out namespace
+         * identifiers at all. If not, there is nothing to test. */
+        _cleanup_close_ int self_fd = ASSERT_OK_ERRNO(open("/proc/self/ns/user", O_RDONLY|O_CLOEXEC));
+
+        uint64_t self_id;
+        r = RET_NERRNO(ioctl(self_fd, NS_GET_ID, &self_id));
+        if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                return (void) log_tests_skipped("NS_GET_ID is not supported by this kernel");
+        ASSERT_OK(r);
+
+        /* Step 2: look ourselves up by identifier. namespace_open_by_id() answers -EPERM when we are
+         * outside the initial user/pid namespace or lack CAP_SYS_ADMIN, because the kernel limits
+         * open_by_handle_at() on nsfs to exactly that situation. */
+        _cleanup_close_ int lookup_fd = namespace_open_by_id(self_id);
+        if (lookup_fd == -EPERM)
+                return (void) log_tests_skipped("not in initial user namespace or missing CAP_SYS_ADMIN");
+        if (IN_SET(lookup_fd, -EOPNOTSUPP, -EINVAL))
+                return (void) log_tests_skipped("nsfs lookup by ns_id is not supported by this kernel");
+        ASSERT_OK(lookup_fd);
+
+        /* The fd obtained via the identifier must refer to the same nsfs inode as the one obtained
+         * via /proc/. */
+        struct stat expected_st, actual_st;
+        ASSERT_OK_ERRNO(fstat(self_fd, &expected_st));
+        ASSERT_OK_ERRNO(fstat(lookup_fd, &actual_st));
+        ASSERT_EQ(expected_st.st_ino, actual_st.st_ino);
+
+        lookup_fd = safe_close(lookup_fd);
+
+        /* Step 3: the zero identifier is rejected outright. */
+        ASSERT_ERROR(namespace_open_by_id(0), EINVAL);
+
+        /* Step 4: repeat the lookup for a freshly acquired user namespace that we hold the only
+         * reference to, then drop that reference and verify the lookup starts failing. */
+        _cleanup_close_ int scratch_fd = userns_acquire_empty();
+        if (ERRNO_IS_NEG_NOT_SUPPORTED(scratch_fd) || ERRNO_IS_NEG_PRIVILEGE(scratch_fd))
+                return (void) log_tests_skipped("cannot acquire userns for transient lookup test");
+        ASSERT_OK(scratch_fd);
+
+        uint64_t scratch_id;
+        ASSERT_OK_ERRNO(ioctl(scratch_fd, NS_GET_ID, &scratch_id));
+        ASSERT_NE(scratch_id, self_id);
+
+        lookup_fd = ASSERT_OK(namespace_open_by_id(scratch_id));
+
+        ASSERT_OK_ERRNO(fstat(scratch_fd, &expected_st));
+        ASSERT_OK_ERRNO(fstat(lookup_fd, &actual_st));
+        ASSERT_EQ(expected_st.st_ino, actual_st.st_ino);
+        lookup_fd = safe_close(lookup_fd);
+
+        /* With the last reference gone the namespace is dead, and the lookup has to report that. */
+        scratch_fd = safe_close(scratch_fd);
+        ASSERT_ERROR(namespace_open_by_id(scratch_id), ESTALE);
+}
+
  TEST(process_is_owned_by_uid) {
          int r;
author	Daan De Meyer <daan@amutable.com>
	Wed, 13 May 2026 10:54:02 +0000 (12:54 +0200)
committer	Daan De Meyer <daan@amutable.com>
	Fri, 15 May 2026 18:05:21 +0000 (18:05 +0000)
src/basic/namespace-util.c		patch \| blob \| blame \| history
src/basic/namespace-util.h		patch \| blob \| blame \| history
src/include/override/fcntl.h		patch \| blob \| blame \| history
src/include/override/linux/nsfs.h		patch \| blob \| blame \| history
src/nsresourced/nsresourced-manager.c		patch \| blob \| blame \| history
src/nsresourced/nsresourcework.c		patch \| blob \| blame \| history
src/nsresourced/userns-registry.c		patch \| blob \| blame \| history
src/nsresourced/userns-registry.h		patch \| blob \| blame \| history
src/test/test-namespace.c		patch \| blob \| blame \| history