nsresourced: Add support for self mappings with delegations

author Daan De Meyer <daan.j.demeyer@gmail.com>

Sat, 24 Jan 2026 00:03:35 +0000 (01:03 +0100)

committer Daan De Meyer <daan@amutable.com>

Tue, 24 Feb 2026 17:29:37 +0000 (18:29 +0100)
author Daan De Meyer <daan.j.demeyer@gmail.com>
Sat, 24 Jan 2026 00:03:35 +0000 (01:03 +0100)
committer Daan De Meyer <daan@amutable.com>
Tue, 24 Feb 2026 17:29:37 +0000 (18:29 +0100)
diff --git a/man/systemd-nsresourced.service.xml b/man/systemd-nsresourced.service.xml

index 853fe09fbc97f96843803efd65424007edb201fe..e9b661271642e35eab96efb8986cc0bfb0063d61 100644 (file)
--- a/man/systemd-nsresourced.service.xml
+++ b/man/systemd-nsresourced.service.xml
@@ -62,6 +62,13 @@
      namespace, each of size 65536. The ranges are allocated from the container UID ranges as per
      <ulink url="https://systemd.io/UIDS-GIDS">Users, Groups, UIDs and GIDs on systemd Systems</ulink>.</para>
  
+    <para>The allocation API also supports <emphasis>identity mappings</emphasis>: instead of allocating a
+    transient UID/GID range, the user namespace can be configured to map the caller's UID/GID to root (UID
+    0) inside the namespace, or to itself. Identity mappings can be combined with delegated ranges: the
+    caller first enters a privileged user namespace from which the container can be set up, after which the
+    container can run in one of the delegated ranges. Unlike the transient ranges, identity mapped users
+    are not subject to BPF-LSM write restrictions.</para>
+
      <para>The service provides API calls to allowlist mounts (referenced via their mount file descriptors as
      per Linux <function>fsmount()</function> API), to pass ownership of a cgroup subtree to the user
      namespace and to delegate a virtual Ethernet device pair to the user namespace. When used in combination
diff --git a/src/basic/uid-classification.h b/src/basic/uid-classification.h

index 6735e06b322184e3afed7f166b8585d0e2fa06e6..a4d8f916e9f3b240cd67a82ac3f131e047d395c6 100644 (file)
--- a/src/basic/uid-classification.h
+++ b/src/basic/uid-classification.h
@@ -46,6 +46,14 @@ static inline bool gid_is_foreign(gid_t gid) {
          return uid_is_foreign((uid_t) gid);
  }
  
+/* Returns true for UIDs of the transient kind, i.e. those accepted by uid_is_container() or by
+ * uid_is_dynamic(). */
+static inline bool uid_is_transient(uid_t uid) {
+        if (uid_is_container(uid))
+                return true;
+
+        return uid_is_dynamic(uid);
+}
+
+/* GID counterpart of uid_is_transient(): the very same classification is applied to the GID, hence
+ * simply cast and delegate (like gid_is_foreign() does for uid_is_foreign()), so that the two checks can
+ * never drift apart. */
+static inline bool gid_is_transient(gid_t gid) {
+        return uid_is_transient((uid_t) gid);
+}
+
  typedef struct UGIDAllocationRange {
          uid_t system_alloc_uid_min;
          uid_t system_uid_max;
diff --git a/src/basic/uid-range.c b/src/basic/uid-range.c

index 763c421e91edb510cb3d8b7727a983be784e6dbc..31305952ba43cf564e319ccc0124d46751fef20b 100644 (file)
--- a/src/basic/uid-range.c
+++ b/src/basic/uid-range.c
@@ -532,6 +532,27 @@ int uid_range_translate(const UIDRange *outside, const UIDRange *inside, uid_t u
          return -ESRCH;
  }
  
+int uid_range_translate_userns_fd(int userns_fd, UIDRangeUsernsMode mode, uid_t uid, uid_t *ret) {
+        _cleanup_(uid_range_freep) UIDRange *outside = NULL, *inside = NULL;
+        UIDRangeUsernsMode inside_mode;
+        int r;
+
+        /* Translates 'uid' through the ID map of the user namespace referenced by userns_fd. 'mode' picks
+         * the UID or the GID map and must be one of the two *_OUTSIDE modes; the matching *_INSIDE mode is
+         * derived from it. Both halves of the map are loaded uncoalesced and handed to
+         * uid_range_translate(), whose result is returned. */
+
+        assert(userns_fd >= 0);
+        assert(IN_SET(mode, UID_RANGE_USERNS_OUTSIDE, GID_RANGE_USERNS_OUTSIDE));
+
+        if (mode == UID_RANGE_USERNS_OUTSIDE)
+                inside_mode = UID_RANGE_USERNS_INSIDE;
+        else
+                inside_mode = GID_RANGE_USERNS_INSIDE;
+
+        r = uid_range_load_userns_by_fd_full(userns_fd, mode, /* coalesce= */ false, &outside);
+        if (r < 0)
+                return r;
+
+        r = uid_range_load_userns_by_fd_full(userns_fd, inside_mode, /* coalesce= */ false, &inside);
+        if (r < 0)
+                return r;
+
+        return uid_range_translate(outside, inside, uid, ret);
+}
+
  bool uid_range_equal(const UIDRange *a, const UIDRange *b) {
          if (a == b)
                  return true;
diff --git a/src/basic/uid-range.h b/src/basic/uid-range.h

index a15a2a8e4f969d49eecacd55255cd668af3c7c9d..08d707ae25968b6aa5cf4afd4c4dbd258566b470 100644 (file)
--- a/src/basic/uid-range.h
+++ b/src/basic/uid-range.h
@@ -67,6 +67,7 @@ int uid_range_partition(UIDRange *range, uid_t size);
  int uid_range_copy(const UIDRange *range, UIDRange **ret);
  int uid_range_remove(UIDRange *range, uid_t start, uid_t size);
  int uid_range_translate(const UIDRange *outside, const UIDRange *inside, uid_t uid, uid_t *ret);
+int uid_range_translate_userns_fd(int userns_fd, UIDRangeUsernsMode mode, uid_t uid, uid_t *ret);
  
  int uid_map_search_root(pid_t pid, UIDRangeUsernsMode mode, uid_t *ret);
  
diff --git a/src/nsresourced/bpf/userns-restrict/userns-restrict.bpf.c b/src/nsresourced/bpf/userns-restrict/userns-restrict.bpf.c

index dbfcf59b28671eabb9d6c693aed5901037245f3c..25d609bf38fc83bc7c4c4f3ad5b9f923124d0454 100644 (file)
--- a/src/nsresourced/bpf/userns-restrict/userns-restrict.bpf.c
+++ b/src/nsresourced/bpf/userns-restrict/userns-restrict.bpf.c
@@ -62,6 +62,13 @@ struct {
          __array(values, struct mnt_id_map);
  } userns_mnt_id_hash SEC(".maps");
  
+/* Set of user namespaces, keyed by namespace inode number, for which setgroups() shall be refused. Only
+ * the presence of a key matters, the value is never interpreted: the task_fix_setgroups LSM program
+ * returns -EPERM as soon as it finds the inode of the task's user namespace (or of one of its ancestors)
+ * in this map. */
+struct {
+        __uint(type, BPF_MAP_TYPE_HASH);
+        __uint(max_entries, 1);        /* placeholder, configured otherwise by nsresourced */
+        __type(key, unsigned);         /* userns inode */
+        __type(value, int);            /* dummy value */
+} userns_setgroups_deny SEC(".maps");
+
  struct {
          __uint(type, BPF_MAP_TYPE_RINGBUF);
          __uint(max_entries, 4096);
@@ -229,20 +236,59 @@ int BPF_PROG(userns_restrict_path_link, struct dentry *old_dentry, const struct
          return validate_mount(new_dir->mnt, ret);
  }
  
+/* LSM program for the task_fix_setgroups hook. Returns -EPERM if the closest user namespace of the task
+ * that nsresourced knows about is listed in userns_setgroups_deny, and 0 otherwise. On each level the deny
+ * map is consulted before the mount ID hash, hence a namespace present in both maps is denied. A non-zero
+ * 'ret' handed in is passed through unmodified. At most USER_NAMESPACE_DEPTH_MAX levels are inspected. */
+SEC("lsm/task_fix_setgroups")
+int BPF_PROG(userns_restrict_task_fix_setgroups, struct cred *new_cred, const struct cred *old, int ret) {
+        struct user_namespace *p;
+        unsigned inode;
+
+        if (ret != 0) /* propagate earlier error */
+                return ret;
+
+        /* Walk the task's user namespace and its ancestors to find the first one managed by nsresourced
+         * (i.e. present in either the setgroups deny map or the mount ID hash map). This is necessary
+         * because a task could otherwise trivially bypass the setgroups() restriction by unsharing the user
+         * namespace and mapping the same users and groups. */
+        p = new_cred->user_ns;
+        for (unsigned i = 0; i < USER_NAMESPACE_DEPTH_MAX; i++) {
+                if (!p)
+                        break;
+
+                inode = p->ns.inum;
+
+                /* Listed for denial: refuse, regardless of what the mount ID hash says. */
+                if (bpf_map_lookup_elem(&userns_setgroups_deny, &inode))
+                        return -EPERM;
+
+                /* Known to nsresourced but not listed for denial: stop walking upwards and allow. */
+                if (bpf_map_lookup_elem(&userns_mnt_id_hash, &inode))
+                        return 0;
+
+                p = p->parent;
+        }
+
+        /* No nsresourced-managed ancestor found, allow. */
+        return 0;
+}
+
  SEC("kprobe/retire_userns_sysctls")
  int BPF_KPROBE(userns_restrict_retire_userns_sysctls, struct user_namespace *userns) {
          unsigned inode;
-        void *mnt_id_map;
  
          /* Inform userspace that a user namespace just went away. I wish there was a nicer way to hook into
           * user namespaces being deleted than using kprobes, but couldn't find any. */
          userns = bpf_rdonly_cast(userns, bpf_core_type_id_kernel(struct user_namespace));
          inode = userns->ns.inum;
  
-        mnt_id_map = bpf_map_lookup_elem(&userns_mnt_id_hash, &inode);
-        if (!mnt_id_map) /* No rules installed for this userns? Then send no notification. */
-                return 0;
+        /* Check each map separately to avoid the compiler merging the two lookups into a pointer OR
+         * operation, which the BPF verifier rejects. */
+        if (bpf_map_lookup_elem(&userns_mnt_id_hash, &inode))
+                goto notify;
+
+        if (bpf_map_lookup_elem(&userns_setgroups_deny, &inode))
+                goto notify;
+
+        /* No rules installed for this userns? Then send no notification. */
+        return 0;
  
+notify:
          bpf_ringbuf_output(&userns_ringbuf, &inode, sizeof(inode), 0);
          return 0;
  }
diff --git a/src/nsresourced/nsresourcework.c b/src/nsresourced/nsresourcework.c

index abb50955081cf6ad3164d48afddb51671b5db6cb..82640fac4f76dcb1423f33b469f9c21d1431314a 100644 (file)
--- a/src/nsresourced/nsresourcework.c
+++ b/src/nsresourced/nsresourcework.c
@@ -41,6 +41,7 @@
  #include "siphash24.h"
  #include "socket-util.h"
  #include "stat-util.h"
+#include "string-table.h"
  #include "string-util.h"
  #include "strv.h"
  #include "time-util.h"
@@ -76,6 +77,21 @@ typedef struct LookupParameters {
          const char *service;
  } LookupParameters;
  
+/* How the UID/GID range of a new user namespace is chosen. Selected via the optional "type" field of the
+ * AllocateUserRange request; ALLOCATE_USER_RANGE_MANAGED is used if the field is absent. */
+typedef enum AllocateUserRangeType {
+        ALLOCATE_USER_RANGE_MANAGED, /* a transient UID/GID range is allocated for the namespace */
+        ALLOCATE_USER_RANGE_SELF,    /* the peer's own UID/GID is mapped, no transient range is allocated */
+        _ALLOCATE_USER_RANGE_TYPE_MAX,
+        _ALLOCATE_USER_RANGE_TYPE_INVALID = -EINVAL,
+} AllocateUserRangeType;
+
+/* Names of the types as they appear in the "type" field of the request. */
+static const char *const allocate_user_range_type_table[_ALLOCATE_USER_RANGE_TYPE_MAX] = {
+        [ALLOCATE_USER_RANGE_MANAGED] = "managed",
+        [ALLOCATE_USER_RANGE_SELF]    = "self",
+};
+
+/* Provide allocate_user_range_type_from_string() and, on top of it, the JSON dispatch callback
+ * dispatch_allocate_user_range_type() referenced from the dispatch table of the allocation method. */
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(allocate_user_range_type, AllocateUserRangeType);
+static JSON_DISPATCH_ENUM_DEFINE(dispatch_allocate_user_range_type, AllocateUserRangeType, allocate_user_range_type_from_string);
+
  static int build_user_json(UserNamespaceInfo *userns_info, uid_t offset, sd_json_variant **ret) {
          _cleanup_free_ char *name = NULL, *realname = NULL;
          UserDisposition disposition;
@@ -634,17 +650,21 @@ static int allocate_now(
          if (r == 0)
                  return -EEXIST;
  
-        r = allocate_one(
-                        registry_dir_fd,
-                        info->name, info->size,
-                        parent_userns_fd,
-                        candidates,
-                        &candidate);
-        if (r < 0)
-                return r;
+        /* If the source UID/GID are already set we're doing a "self" user namespace and don't need to
+         * allocate a transient range. */
+        if (!uid_is_valid(info->start_uid) && !gid_is_valid(info->start_gid)) {
+                r = allocate_one(
+                                registry_dir_fd,
+                                info->name, info->size,
+                                parent_userns_fd,
+                                candidates,
+                                &candidate);
+                if (r < 0)
+                        return r;
  
-        info->start_uid = candidate;
-        info->start_gid = (gid_t) candidate;
+                info->start_uid = candidate;
+                info->start_gid = (gid_t) candidate;
+        }
  
          /* Now allocate delegated ranges if requested */
          if (info->n_delegates > 0) {
@@ -761,7 +781,7 @@ static int write_userns(int userns_fd, int parent_userns_fd, const UserNamespace
          /* Let's enforce that the transient UID/GID ranges are mapped 1:1 in the parent user namespace, to
           * avoid any weird mapping shenanigans that might happen otherwise. */
  
-        if (start_uid != userns_info->start_uid)
+        if (uid_is_transient(userns_info->start_uid) && start_uid != userns_info->start_uid)
                  return log_debug_errno(
                          SYNTHETIC_ERRNO(ERANGE),
                          "Transient UID range not mapped 1:1 in parent userns ("UID_FMT" -> "UID_FMT")",
@@ -814,7 +834,7 @@ static int write_userns(int userns_fd, int parent_userns_fd, const UserNamespace
          if (r < 0)
                  return log_debug_errno(r, "Failed to translate GID "GID_FMT" to parent userns: %m", userns_info->start_gid);
  
-        if (start_gid != userns_info->start_gid)
+        if (gid_is_transient(userns_info->start_gid) && start_gid != userns_info->start_gid)
                  return log_debug_errno(
                          SYNTHETIC_ERRNO(ERANGE),
                          "Transient GID range not mapped 1:1 in parent userns ("GID_FMT" -> "GID_FMT")",
@@ -1023,14 +1043,23 @@ static int validate_name(sd_varlink *link, const char *name, bool mangle, char *
          return 0;
  }
  
-static int validate_target_and_size(sd_varlink *link, uid_t target, uint32_t size) {
+static int validate_target_and_size(sd_varlink *link, uid_t target, uint32_t size, AllocateUserRangeType type) {
          assert(link);
  
-        if (!IN_SET(size, 1U, 0x10000))
-                return sd_varlink_error_invalid_parameter_name(link, "size");
+        if (type == ALLOCATE_USER_RANGE_SELF) {
+                /* Self userns must have size 1 and target must be 0 or unset */
+                if (size != 1)
+                        return sd_varlink_error_invalid_parameter_name(link, "size");
+
+                if (!IN_SET(target, UID_INVALID, 0))
+                        return sd_varlink_error_invalid_parameter_name(link, "target");
+        } else {
+                if (!IN_SET(size, 1U, 0x10000))
+                        return sd_varlink_error_invalid_parameter_name(link, "size");
  
-        if (!uid_is_valid(target) || target > UINT32_MAX - size)
-                return sd_varlink_error_invalid_parameter_name(link, "target");
+                if (!uid_is_valid(target) || target > UINT32_MAX - size)
+                        return sd_varlink_error_invalid_parameter_name(link, "target");
+        }
  
          return 0;
  }
@@ -1104,6 +1133,7 @@ static int validate_userns_is_empty(sd_varlink *link, int userns_fd) {
  
  typedef struct AllocateParameters {
          const char *name;
+        AllocateUserRangeType type;
          uint32_t size;
          uid_t target;
          unsigned userns_fd_idx;
@@ -1114,12 +1144,13 @@ typedef struct AllocateParameters {
  static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) {
  
          static const sd_json_dispatch_field dispatch_table[] = {
-                { "name",                        SD_JSON_VARIANT_STRING,        sd_json_dispatch_const_string, offsetof(AllocateParameters, name),                      SD_JSON_MANDATORY },
-                { "size",                        _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32,       offsetof(AllocateParameters, size),                      SD_JSON_MANDATORY },
-                { "target",                      _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uid_gid,      offsetof(AllocateParameters, target),                    0                 },
-                { "userNamespaceFileDescriptor", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint,         offsetof(AllocateParameters, userns_fd_idx),             SD_JSON_MANDATORY },
-                { "mangleName",                  SD_JSON_VARIANT_BOOLEAN,       sd_json_dispatch_stdbool,      offsetof(AllocateParameters, mangle_name),               0                 },
-                { "delegateContainerRanges",     _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32,       offsetof(AllocateParameters, delegate_container_ranges), 0                 },
+                { "name",                        SD_JSON_VARIANT_STRING,        sd_json_dispatch_const_string,     offsetof(AllocateParameters, name),                      SD_JSON_MANDATORY },
+                { "size",                        _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32,           offsetof(AllocateParameters, size),                      SD_JSON_MANDATORY },
+                { "target",                      _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uid_gid,          offsetof(AllocateParameters, target),                    0                 },
+                { "userNamespaceFileDescriptor", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint,             offsetof(AllocateParameters, userns_fd_idx),             SD_JSON_MANDATORY },
+                { "mangleName",                  SD_JSON_VARIANT_BOOLEAN,       sd_json_dispatch_stdbool,          offsetof(AllocateParameters, mangle_name),               0                 },
+                { "type",                        SD_JSON_VARIANT_STRING,        dispatch_allocate_user_range_type, offsetof(AllocateParameters, type),                      0                 },
+                { "delegateContainerRanges",     _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32,           offsetof(AllocateParameters, delegate_container_ranges), 0                 },
                  {}
          };
  
@@ -1127,9 +1158,12 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para
          _cleanup_free_ char *userns_name = NULL;
          Context *c = ASSERT_PTR(userdata);
          uid_t peer_uid;
+        gid_t peer_gid;
          struct stat userns_st;
          AllocateParameters p = {
+                .type = ALLOCATE_USER_RANGE_MANAGED,
                  .size = UINT32_MAX,
+                .target = UID_INVALID,
                  .userns_fd_idx = UINT_MAX,
          };
          int r;
@@ -1145,11 +1179,14 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para
          if (r != 0)
                  return r;
  
+        if (p.type != ALLOCATE_USER_RANGE_SELF && p.target == UID_INVALID)
+                p.target = 0;
+
          r = validate_name(link, p.name, p.mangle_name, &userns_name);
          if (r != 0)
                  return r;
  
-        r = validate_target_and_size(link, p.target, p.size);
+        r = validate_target_and_size(link, p.target, p.size, p.type);
          if (r != 0)
                  return r;
  
@@ -1179,6 +1216,10 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para
          if (r < 0)
                  return r;
  
+        r = sd_varlink_get_peer_gid(link, &peer_gid);
+        if (r < 0)
+                return r;
+
          const char *polkit_details[] = {
                  "name", userns_name,
                  NULL,
@@ -1219,6 +1260,33 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para
          userns_info->target_uid = p.target;
          userns_info->target_gid = (gid_t) p.target;
  
+        if (p.type == ALLOCATE_USER_RANGE_SELF) {
+                /* The start UID/GID will be mapped to the parent userns in write_userns(). If a self
+                 * mapping to the peer UID/GID is requested, we have to map the target UID/GID ourselves here
+                 * as write_userns() doesn't take care of that. */
+
+                userns_info->start_uid = peer_uid;
+                userns_info->start_gid = peer_gid;
+
+                if (p.target == UID_INVALID) {
+                        r = uid_range_translate_userns_fd(
+                                        parent_userns_fd,
+                                        UID_RANGE_USERNS_OUTSIDE,
+                                        peer_uid,
+                                        &userns_info->target_uid);
+                        if (r < 0)
+                                return log_debug_errno(r, "Failed to translate UID "UID_FMT" to parent user namespace: %m", peer_uid);
+
+                        r = uid_range_translate_userns_fd(
+                                        parent_userns_fd,
+                                        GID_RANGE_USERNS_OUTSIDE,
+                                        peer_gid,
+                                        &userns_info->target_gid);
+                        if (r < 0)
+                                return log_debug_errno(r, "Failed to translate GID "GID_FMT" to parent user namespace: %m", peer_gid);
+                }
+        }
+
          /* Set up delegation arrays if requested */
          if (p.delegate_container_ranges > 0) {
                  userns_info->delegates = new0(DelegatedUserNamespaceInfo, p.delegate_container_ranges);
@@ -1259,6 +1327,15 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para
          if (r < 0)
                  goto fail;
  
+        if (p.type == ALLOCATE_USER_RANGE_SELF) {
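+                /* For "self" allocations we deny setgroups() via the BPF LSM. We can't use
+                 * /proc/self/setgroups for this as that is transitive and also applies to all child user
+                 * namespaces. The BPF LSM hook walks up the ancestors too, but stops at the first user
+                 * namespace that has its own entry in the mount ID hash map, so that child user namespaces
+                 * registered with nsresourced are not affected by the denial on this one. */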
+                r = userns_restrict_setgroups_deny_by_fd(c->bpf, userns_fd);
+                if (r < 0)
+                        goto fail;
+        }
+
          r = write_userns(userns_fd, parent_userns_fd, userns_info);
          if (r < 0)
                  goto fail;
diff --git a/src/nsresourced/test-userns-restrict.c b/src/nsresourced/test-userns-restrict.c

index 853fc1441f6d7c95874862244d486103d03f01f6..29125a84a8d300347c05cf1ad70c9438e28aea0d 100644 (file)
--- a/src/nsresourced/test-userns-restrict.c
+++ b/src/nsresourced/test-userns-restrict.c
@@ -1,5 +1,6 @@
  /* SPDX-License-Identifier: LGPL-2.1-or-later */
  
+#include <grp.h>
  #include <sched.h>
  #include <sys/eventfd.h>
  #include <sys/mount.h>
@@ -8,6 +9,7 @@
  
  #include "errno-util.h"
  #include "fd-util.h"
+#include "fileio.h"
  #include "namespace-util.h"
  #include "pidref.h"
  #include "process-util.h"
@@ -147,4 +149,137 @@ TEST(userns_restrict) {
          ASSERT_OK(pidref_wait_for_terminate_and_check("(test)", &pidref, WAIT_LOG));
  }
  
+static void write_child_mappings(PidRef *child, int parent_userns_fd) {
+        char map_path[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(pid_t) + 1];
+        int forked;
+
+        /* uid_map/gid_map of a user namespace have to be written from its parent user namespace (kernel
+         * requirement). Hence fork off a short-lived helper that joins parent_userns_fd and writes a
+         * trivial "0 0 1" mapping for the process 'child' from in there, similar to what write_userns() in
+         * nsresourcework.c does. */
+
+        forked = ASSERT_OK(pidref_safe_fork("(sd-write-map)", FORK_DEATHSIG_SIGKILL|FORK_WAIT|FORK_LOG, NULL));
+        if (forked != 0)
+                return; /* Parent: FORK_WAIT was requested, nothing left to do for us. */
+
+        /* Helper process from here on. */
+        ASSERT_OK_ERRNO(setns(parent_userns_fd, CLONE_NEWUSER));
+
+        xsprintf(map_path, "/proc/" PID_FMT "/uid_map", child->pid);
+        ASSERT_OK(write_string_file(map_path, "0 0 1\n", WRITE_STRING_FILE_DISABLE_BUFFER));
+
+        xsprintf(map_path, "/proc/" PID_FMT "/gid_map", child->pid);
+        ASSERT_OK(write_string_file(map_path, "0 0 1\n", WRITE_STRING_FILE_DISABLE_BUFFER));
+
+        _exit(EXIT_SUCCESS);
+}
+
+/* Verifies the BPF-LSM based setgroups() denial: a user namespace that is present in both the mount ID
+ * hash and the setgroups deny map must refuse setgroups() (also from a child user namespace unshared from
+ * it), while a user namespace that is only present in the mount ID hash must permit it. */
+TEST(setgroups_deny) {
+        _cleanup_close_ int deny_userns_fd = -EBADF, allow_userns_fd = -EBADF,
+                             afd = -EBADF, bfd = -EBADF;
+        int r;
+
+        _cleanup_free_ char *idmap = NULL;
+        ASSERT_OK(asprintf(&idmap, "0 "UID_FMT" 1", CONTAINER_UID_MIN));
+
+        /* Create a userns that will have setgroups() denied via BPF. We don't set setgroups_deny here
+         * because that uses /proc/self/setgroups which is transitive and we want to test the BPF-LSM
+         * denial specifically. */
+        deny_userns_fd = ASSERT_OK(userns_acquire(idmap, idmap, /* setgroups_deny= */ false));
+
+        ASSERT_OK(userns_restrict_put_by_fd(
+                        bpf_obj,
+                        deny_userns_fd,
+                        /* replace= */ true,
+                        /* mount_fds= */ NULL,
+                        /* n_mount_fds= */ 0));
+        ASSERT_OK(userns_restrict_setgroups_deny_by_fd(bpf_obj, deny_userns_fd));
+
+        /* Create a userns that is managed (in mount ID hash) but does NOT have setgroups() denied */
+        allow_userns_fd = ASSERT_OK(userns_acquire(idmap, idmap, /* setgroups_deny= */ false));
+
+        ASSERT_OK(userns_restrict_put_by_fd(
+                        bpf_obj,
+                        allow_userns_fd,
+                        /* replace= */ true,
+                        /* mount_fds= */ NULL,
+                        /* n_mount_fds= */ 0));
+
+        /* Handshake with the forked children below: the child signals via afd once it has unshared its
+         * child userns, the parent then writes the ID maps of that userns and signals back via bfd. */
+        afd = ASSERT_OK_ERRNO(eventfd(0, EFD_CLOEXEC));
+        bfd = ASSERT_OK_ERRNO(eventfd(0, EFD_CLOEXEC));
+
+        /* Test 1: setgroups() should be denied in the deny userns, including after unsharing into a child
+         * user namespace (the ancestor walk should find the deny entry). */
+        {
+                _cleanup_(pidref_done_sigkill_wait) PidRef pidref = PIDREF_NULL;
+
+                r = ASSERT_OK(pidref_safe_fork("(test-deny)", FORK_LOG|FORK_DEATHSIG_SIGKILL, &pidref));
+                if (r == 0) {
+                        /* Enter the userns manually without going through namespace_enter(), because
+                         * that calls reset_uid_gid() which calls setgroups() internally. Since the
+                         * BPF LSM denies setgroups(), reset_uid_gid() would fail before calling
+                         * setresuid()/setresgid(), leaving us as the overflow UID without
+                         * capabilities. */
+                        ASSERT_OK_ERRNO(setns(deny_userns_fd, CLONE_NEWUSER));
+                        ASSERT_OK_ERRNO(setresgid(0, 0, 0));
+                        ASSERT_OK_ERRNO(setresuid(0, 0, 0));
+
+                        /* setgroups() should be denied by BPF LSM */
+                        ASSERT_ERROR_ERRNO(setgroups(0, NULL), EPERM);
+
+                        /* Unshare into a child user namespace. The parent will write the mappings
+                         * for us since writing /proc/self/uid_map from inside the userns fails
+                         * because the proc mount belongs to the init user namespace. */
+                        ASSERT_OK_ERRNO(unshare(CLONE_NEWUSER));
+                        ASSERT_OK_ERRNO(eventfd_write(afd, 1));
+                        uint64_t x;
+                        ASSERT_OK_ERRNO(eventfd_read(bfd, &x));
+
+                        /* setgroups() should still be denied because the ancestor walk finds the
+                         * deny entry on the parent user namespace */
+                        ASSERT_ERROR_ERRNO(setgroups(0, NULL), EPERM);
+
+                        _exit(EXIT_SUCCESS);
+                }
+
+                /* Parent: wait until the child has unshared, write its maps, then let it continue. */
+                uint64_t x;
+                ASSERT_OK_ERRNO(eventfd_read(afd, &x));
+                write_child_mappings(&pidref, deny_userns_fd);
+                ASSERT_OK_ERRNO(eventfd_write(bfd, 1));
+
+                ASSERT_OK(pidref_wait_for_terminate_and_check("(test-deny)", &pidref, WAIT_LOG));
+        }
+
+        /* Test 2: setgroups() should be allowed in the managed-only userns (mount ID hash but no setgroups
+         * deny entry), including in a child user namespace. */
+        {
+                _cleanup_(pidref_done_sigkill_wait) PidRef pidref = PIDREF_NULL;
+
+                r = ASSERT_OK(pidref_safe_fork("(test-allow)", FORK_LOG|FORK_DEATHSIG_SIGKILL, &pidref));
+                if (r == 0) {
+                        ASSERT_OK_ERRNO(setns(allow_userns_fd, CLONE_NEWUSER));
+                        ASSERT_OK_ERRNO(setresgid(0, 0, 0));
+                        ASSERT_OK_ERRNO(setresuid(0, 0, 0));
+
+                        /* setgroups() should succeed since this userns is only in the mount ID hash */
+                        ASSERT_OK_ERRNO(setgroups(0, NULL));
+
+                        /* Also should work in a child userns since the ancestor walk finds the
+                         * mount ID hash entry (not the setgroups deny entry) */
+                        ASSERT_OK_ERRNO(unshare(CLONE_NEWUSER));
+                        ASSERT_OK_ERRNO(eventfd_write(afd, 1));
+                        uint64_t x;
+                        ASSERT_OK_ERRNO(eventfd_read(bfd, &x));
+
+                        ASSERT_OK_ERRNO(setgroups(0, NULL));
+
+                        _exit(EXIT_SUCCESS);
+                }
+
+                /* Parent: same handshake as in test 1, but against the allow userns. */
+                uint64_t x;
+                ASSERT_OK_ERRNO(eventfd_read(afd, &x));
+                write_child_mappings(&pidref, allow_userns_fd);
+                ASSERT_OK_ERRNO(eventfd_write(bfd, 1));
+
+                ASSERT_OK(pidref_wait_for_terminate_and_check("(test-allow)", &pidref, WAIT_LOG));
+        }
+}
+
  DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro);
diff --git a/src/nsresourced/userns-registry.c b/src/nsresourced/userns-registry.c

index a728c7fde9f8030390921867cc1ea57a3e0270e0..371a35086f4248416a5962db4e860a21526534e7 100644 (file)
--- a/src/nsresourced/userns-registry.c
+++ b/src/nsresourced/userns-registry.c
@@ -20,6 +20,7 @@
  #include "stat-util.h"
  #include "string-util.h"
  #include "strv.h"
+#include "uid-classification.h"
  #include "user-util.h"
  #include "userns-registry.h"
  
@@ -285,8 +286,6 @@ static int userns_registry_load(int dir_fd, const char *fn, UserNamespaceInfo **
  
          if (userns_info->userns_inode == 0)
                  return -EBADMSG;
-        if (userns_info->start_uid == 0 || userns_info->start_gid == 0)
-                return -EBADMSG;
  
          if (userns_info->size == 0) {
                  if (uid_is_valid(userns_info->start_uid) || uid_is_valid(userns_info->target_uid))
@@ -611,7 +610,7 @@ int userns_registry_store(int dir_fd, UserNamespaceInfo *info) {
                  goto fail;
          }
  
-        if (uid_is_valid(info->start_uid)) {
+        if (uid_is_transient(info->start_uid)) {
                  if (asprintf(&link2_fn, "u" UID_FMT ".userns", info->start_uid) < 0) {
                          r = log_oom_debug();
                          goto fail;
@@ -624,7 +623,7 @@ int userns_registry_store(int dir_fd, UserNamespaceInfo *info) {
                  }
          }
  
-        if (gid_is_valid(info->start_gid)) {
+        if (gid_is_transient(info->start_gid)) {
                  if (asprintf(&link3_fn, "g" GID_FMT ".userns", info->start_gid) < 0) {
                          r = log_oom_debug();
                          goto fail;
@@ -795,7 +794,7 @@ int userns_registry_remove(int dir_fd, UserNamespaceInfo *info) {
          if (r < 0)
                  RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", link1_fn));
  
-        if (uid_is_valid(info->start_uid)) {
+        if (uid_is_transient(info->start_uid)) {
                  _cleanup_free_ char *link2_fn = NULL;
  
                  if (asprintf(&link2_fn, "u" UID_FMT ".userns", info->start_uid) < 0)
@@ -806,7 +805,7 @@ int userns_registry_remove(int dir_fd, UserNamespaceInfo *info) {
                          RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", link2_fn));
          }
  
-        if (uid_is_valid(info->start_gid)) {
+        if (gid_is_transient(info->start_gid)) {
                  _cleanup_free_ char *link3_fn = NULL;
  
                  if (asprintf(&link3_fn, "g" GID_FMT ".userns", info->start_gid) < 0)
diff --git a/src/nsresourced/userns-restrict.c b/src/nsresourced/userns-restrict.c

index 6a8306de66a6843218b64df545a0516c412d3843..c0d7f8a82daec561c9fae3c3c25d712acb3c7104 100644 (file)
--- a/src/nsresourced/userns-restrict.c
+++ b/src/nsresourced/userns-restrict.c
@@ -111,6 +111,10 @@ int userns_restrict_install(
          if (r < 0)
                  return log_error_errno(r, "Failed to size userns ring buffer: %m");
  
+        r = sym_bpf_map__set_max_entries(obj->maps.userns_setgroups_deny, USERNS_MAX);
+        if (r < 0)
+                return log_error_errno(r, "Failed to size userns setgroups deny hash table: %m");
+
          /* Dummy map to satisfy the verifier */
          dummy_mnt_id_hash_fd = make_inner_hash_map();
          if (dummy_mnt_id_hash_fd < 0)
@@ -320,7 +324,7 @@ int userns_restrict_put_by_fd(
  int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode) {
  
  #if HAVE_VMLINUX_H
-        int r, outer_map_fd;
+        int r, outer_map_fd, setgroups_deny_fd;
          unsigned u;
  
          assert(obj);
@@ -339,8 +343,77 @@ int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t use
          if (r < 0)
                  return log_debug_errno(r, "Failed to remove entry for inode %" PRIu64 " from outer map: %m", userns_inode);
  
+        setgroups_deny_fd = sym_bpf_map__fd(obj->maps.userns_setgroups_deny);
+        if (setgroups_deny_fd < 0)
+                return log_debug_errno(setgroups_deny_fd, "Failed to get setgroups deny BPF map fd: %m");
+
+        r = sym_bpf_map_delete_elem(setgroups_deny_fd, &u);
+        if (r < 0 && r != -ENOENT)
+                return log_debug_errno(r, "Failed to remove entry for inode %" PRIu64 " from setgroups deny map: %m", userns_inode);
+
          return 0;
  #else
          return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
  #endif
  }
+
+int userns_restrict_setgroups_deny_by_inode(
+                struct userns_restrict_bpf *obj,
+                uint64_t userns_inode) {
+        /* Inserts the given user namespace inode number as a key into the userns_setgroups_deny BPF map.
+         * Judging by the log message below this is meant to deny setgroups() for that user namespace;
+         * the BPF program that consumes the map is not part of this function. Returns 0 on success,
+         * -EINVAL if the inode number does not fit into 32 bits, or the error of the failing call (as
+         * returned by log_debug_errno()) if the map fd cannot be acquired or the update fails. When built
+         * without HAVE_VMLINUX_H this always fails with EOPNOTSUPP. */
+#if HAVE_VMLINUX_H
+        int map_fd, r;
+        uint32_t dummy = 1; /* Map value; NOTE(review): presumably only the key's presence matters - confirm */
+        unsigned ino;
+
+        assert(obj);
+        assert(userns_inode != 0);
+
+        /* The BPF map only supports 32bit keys, and user namespace inode numbers are 32bit too, even though
+         * ino_t is 64bit these days. Should we ever run into a 64bit inode let's refuse early. */
+        if (userns_inode > UINT32_MAX)
+                return -EINVAL; /* Unlike the other failure paths, this one is not logged */
+
+        ino = (unsigned) userns_inode;
+
+        map_fd = sym_bpf_map__fd(obj->maps.userns_setgroups_deny);
+        if (map_fd < 0)
+                return log_debug_errno(map_fd, "Failed to get setgroups deny BPF map fd: %m");
+
+        r = sym_bpf_map_update_elem(map_fd, &ino, &dummy, BPF_ANY); /* BPF_ANY: create or overwrite the entry */
+        if (r < 0)
+                return log_debug_errno(r, "Failed to add userns inode to setgroups deny map: %m");
+
+        log_debug("Denying setgroups() on userns inode %" PRIu64, userns_inode);
+
+        return 0;
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+#endif
+}
+
+int userns_restrict_setgroups_deny_by_fd(
+                struct userns_restrict_bpf *obj,
+                int userns_fd) {
+        /* Variant of userns_restrict_setgroups_deny_by_inode() that takes a user namespace file
+         * descriptor: checks that the fd really refers to a user namespace, looks up its inode number
+         * via fstat() and passes that on. Fails with EBADF if the fd is not a user namespace fd, with the
+         * error of fd_is_namespace() or fstat() if those fail, and otherwise returns the result of the
+         * by-inode call. When built without HAVE_VMLINUX_H this always fails with EOPNOTSUPP. */
+#if HAVE_VMLINUX_H
+        struct stat st;
+        int r;
+
+        assert(obj);
+        assert(userns_fd >= 0);
+
+        r = fd_is_namespace(userns_fd, NAMESPACE_USER);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to determine if file descriptor is user namespace: %m");
+        if (r == 0) /* Valid fd, but not a user namespace */
+                return log_debug_errno(SYNTHETIC_ERRNO(EBADF), "User namespace fd is not actually a user namespace fd.");
+
+        if (fstat(userns_fd, &st) < 0)
+                return log_debug_errno(errno, "Failed to fstat() user namespace: %m");
+
+        return userns_restrict_setgroups_deny_by_inode(obj, st.st_ino); /* The inode number is the BPF map key */
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+#endif
+}
diff --git a/src/nsresourced/userns-restrict.h b/src/nsresourced/userns-restrict.h

index 21a81feaff4e5aa9fe22f9f3b295aeae8f3a46d9..f0673d159446e173955e1972330d61f8aa6d7469 100644 (file)
--- a/src/nsresourced/userns-restrict.h
+++ b/src/nsresourced/userns-restrict.h
@@ -13,4 +13,7 @@ int userns_restrict_put_by_inode(struct userns_restrict_bpf *obj, uint64_t usern
  
  int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode);
  
+int userns_restrict_setgroups_deny_by_fd(struct userns_restrict_bpf *obj, int userns_fd);
+int userns_restrict_setgroups_deny_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode);
+
  DEFINE_TRIVIAL_CLEANUP_FUNC(struct userns_restrict_bpf*, userns_restrict_bpf_free);
diff --git a/src/shared/varlink-io.systemd.NamespaceResource.c b/src/shared/varlink-io.systemd.NamespaceResource.c

index 7d5f5093224f480b107a318d544b86109c18ebf1..79fff1b592b2b1a0f43f8bd9cfd2cfa06813ea32 100644 (file)
--- a/src/shared/varlink-io.systemd.NamespaceResource.c
+++ b/src/shared/varlink-io.systemd.NamespaceResource.c
@@ -2,18 +2,27 @@
  
  #include "varlink-io.systemd.NamespaceResource.h"
  
+static SD_VARLINK_DEFINE_ENUM_TYPE(
+                AllocateUserRangeType, /* Type of the nullable 'type' input of the AllocateUserRange method */
+                SD_VARLINK_FIELD_COMMENT("Allocate a transient UID/GID range from the dynamic range pool. This is the default."),
+                SD_VARLINK_DEFINE_ENUM_VALUE(managed),
+                SD_VARLINK_FIELD_COMMENT("Create a user namespace that maps the peer UID/GID to itself instead of allocating a transient UID range."),
+                SD_VARLINK_DEFINE_ENUM_VALUE(self));
+
  static SD_VARLINK_DEFINE_METHOD(
                  AllocateUserRange,
                  SD_VARLINK_FIELD_COMMENT("The name for the user namespace, a short string that must be fit to be included in a file name and in a user name. This name is included in the user records announced via NSS and is otherwise useful for debugging."),
                  SD_VARLINK_DEFINE_INPUT(name, SD_VARLINK_STRING, 0),
                  SD_VARLINK_FIELD_COMMENT("Controls whether to mangle the provided name if needed so that it is suitable for naming a user namespace. If true this will shorten the name as necessary or randomize it if that's not sufficient. If null defaults to false."),
                  SD_VARLINK_DEFINE_INPUT(mangleName, SD_VARLINK_BOOL, SD_VARLINK_NULLABLE),
-                SD_VARLINK_FIELD_COMMENT("The number of UIDs to assign. Must be 1 or 65536."),
+                SD_VARLINK_FIELD_COMMENT("The number of UIDs to assign. Must be 1 or 65536. If type is 'self', must be 1."),
                  SD_VARLINK_DEFINE_INPUT(size, SD_VARLINK_INT, 0),
-                SD_VARLINK_FIELD_COMMENT("The target UID inside the user namespace. If not specified defaults to 0."),
+                SD_VARLINK_FIELD_COMMENT("The target UID inside the user namespace. If not specified defaults to 0. If type is 'self', must be 0 or unset in which case the peer UID is mapped to itself."),
                  SD_VARLINK_DEFINE_INPUT(target, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
                  SD_VARLINK_FIELD_COMMENT("A file descriptor to an allocated userns with no current UID range assignments"),
                  SD_VARLINK_DEFINE_INPUT(userNamespaceFileDescriptor, SD_VARLINK_INT, 0),
+                SD_VARLINK_FIELD_COMMENT("The type of allocation to perform. If 'managed' (the default), a transient UID/GID range is allocated from the dynamic range pool. If 'self', the peer UID/GID is mapped to itself. Defaults to 'managed'."),
+                SD_VARLINK_DEFINE_INPUT_BY_TYPE(type, AllocateUserRangeType, SD_VARLINK_NULLABLE),
                  SD_VARLINK_FIELD_COMMENT("Number of transient 64K container UID/GID ranges to delegate. These are mapped 1:1 into the user namespace and can be used by nested user namespaces for container workloads. Must be between 0 and 16. Defaults to 0."),
                  SD_VARLINK_DEFINE_INPUT(delegateContainerRanges, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
                  SD_VARLINK_FIELD_COMMENT("The name assigned to the user namespace. (This is particularly interesting in case mangleName was enabled)."),
@@ -77,6 +86,8 @@ SD_VARLINK_DEFINE_INTERFACE(
                  io_systemd_NamespaceResource,
                  "io.systemd.NamespaceResource",
                  SD_VARLINK_INTERFACE_COMMENT("Allocate transient UID ranges for user namespace, and assign mounts, cgroups and networking devices to them"),
+                SD_VARLINK_SYMBOL_COMMENT("The type of user range allocation to perform."),
+                &vl_type_AllocateUserRangeType,
                  SD_VARLINK_SYMBOL_COMMENT("Assigns a UID range to a client-allocated user namespace that has no UID range assigned so far, and registers it for assignment of other resources."),
                  &vl_method_AllocateUserRange,
                  SD_VARLINK_SYMBOL_COMMENT("Registers an already initialized user namespace for assignment of resources."),
diff --git a/test/units/TEST-50-DISSECT.mountfsd.sh b/test/units/TEST-50-DISSECT.mountfsd.sh

index e5092b56868f35b9c2fe15b6b268c151baa2f815..c468e3b8f89b0549485f8a70c361e8684485800b 100755 (executable)
--- a/test/units/TEST-50-DISSECT.mountfsd.sh
+++ b/test/units/TEST-50-DISSECT.mountfsd.sh
@@ -77,6 +77,28 @@ test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \
          '{"name":"test-fail","size":65536,"userNamespaceFileDescriptor":0,"delegateContainerRanges":17}') |&
              grep "io.systemd.NamespaceResource.TooManyDelegations" >/dev/null
  
+# Test self mapping
+# Verify that a self mapping with target 0 maps the peer UID to root: uid_map should read
+# "0 <peer_uid> 1". Only the first and third columns are compared below.
+test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \
+        --push-fd=/proc/self/ns/user \
+        /run/systemd/userdb/io.systemd.NamespaceResource \
+        io.systemd.NamespaceResource.AllocateUserRange \
+        '{"name":"test-id","target":0,"size":1,"userNamespaceFileDescriptor":0,"type":"self"}' \
+        -- cat /proc/self/uid_map | awk '{print $1, $3}')" = "0 1"
+
+# Test nested delegation with self mapping: the inner namespace's uid_map is expected to have 3 lines
+test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \
+        --push-fd=/proc/self/ns/user \
+        /run/systemd/userdb/io.systemd.NamespaceResource \
+        io.systemd.NamespaceResource.AllocateUserRange \
+        '{"name":"test-delegate2","type":"self","size":1,"userNamespaceFileDescriptor":0,"delegateContainerRanges":3}' \
+        -- unshare --user varlinkctl --exec call \
+            --push-fd=/proc/self/ns/user \
+            /run/systemd/userdb/io.systemd.NamespaceResource \
+            io.systemd.NamespaceResource.AllocateUserRange \
+            '{"name":"test-delegate3","size":65536,"userNamespaceFileDescriptor":0,"delegateContainerRanges":2}' \
+            -- cat /proc/self/uid_map | wc -l)" -eq 3
+
  # This should work without the key
  systemd-dissect --image-policy='root=verity:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null
  systemd-dissect --image-policy='root=verity+signed:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null
diff --git a/units/systemd-nsresourced.service.in b/units/systemd-nsresourced.service.in

index 0e2d6b3628c3535dba62b58230768a0cef5a3e01..143f9a9a3d296fc32ad6cbc6b00648f622c7d76f 100644 (file)
--- a/units/systemd-nsresourced.service.in
+++ b/units/systemd-nsresourced.service.in
@@ -18,7 +18,7 @@ After=modprobe@tun.service
  DefaultDependencies=no
  
  [Service]
-CapabilityBoundingSet=CAP_DAC_READ_SEARCH CAP_SYS_RESOURCE CAP_BPF CAP_PERFMON CAP_SETGID CAP_SETUID CAP_SYS_ADMIN CAP_CHOWN CAP_FOWNER CAP_NET_ADMIN
+CapabilityBoundingSet=CAP_DAC_READ_SEARCH CAP_SYS_RESOURCE CAP_BPF CAP_PERFMON CAP_SETGID CAP_SETUID CAP_SYS_ADMIN CAP_CHOWN CAP_FOWNER CAP_NET_ADMIN CAP_SETFCAP
  ExecStart={{LIBEXECDIR}}/systemd-nsresourced
  IPAddressDeny=any
  LimitNOFILE={{HIGH_RLIMIT_NOFILE}}
author	Daan De Meyer <daan.j.demeyer@gmail.com>
	Sat, 24 Jan 2026 00:03:35 +0000 (01:03 +0100)
committer	Daan De Meyer <daan@amutable.com>
	Tue, 24 Feb 2026 17:29:37 +0000 (18:29 +0100)
man/systemd-nsresourced.service.xml		patch \| blob \| blame \| history
src/basic/uid-classification.h		patch \| blob \| blame \| history
src/basic/uid-range.c		patch \| blob \| blame \| history
src/basic/uid-range.h		patch \| blob \| blame \| history
src/nsresourced/bpf/userns-restrict/userns-restrict.bpf.c		patch \| blob \| blame \| history
src/nsresourced/nsresourcework.c		patch \| blob \| blame \| history
src/nsresourced/test-userns-restrict.c		patch \| blob \| blame \| history
src/nsresourced/userns-registry.c		patch \| blob \| blame \| history
src/nsresourced/userns-restrict.c		patch \| blob \| blame \| history
src/nsresourced/userns-restrict.h		patch \| blob \| blame \| history
src/shared/varlink-io.systemd.NamespaceResource.c		patch \| blob \| blame \| history
test/units/TEST-50-DISSECT.mountfsd.sh		patch \| blob \| blame \| history
units/systemd-nsresourced.service.in		patch \| blob \| blame \| history