]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
nsresourced: Add support for self mappings with delegations
author: Daan De Meyer <daan.j.demeyer@gmail.com>
Sat, 24 Jan 2026 00:03:35 +0000 (01:03 +0100)
committer: Daan De Meyer <daan@amutable.com>
Tue, 24 Feb 2026 17:29:37 +0000 (18:29 +0100)
mkosi does all of its environment setup in an unprivileged user
namespace with an identity mapping. When it invokes nspawn and nspawn
tries to get a transient userns from nsresourced, this fails as no
transient ranges are mapped into mkosi's unprivileged userns (as doing
so would require privileges).

To fix this problem, let's allow allocating unprivileged self user
namespaces in nsresourced, similar to what the kernel allows, except that
we also support delegations for these. This means that mkosi can get its
unprivileged userns as before from nsresourced, but it can also request a
delegated 64K range inside that userns as well, which nsresourced can then
allocate to nspawn later when it asks for one.

Similar to the kernel, we disallow setgroups for self mappings. However,
rather than doing this via /proc/self/setgroups, which applies to the current
user namespace and all its child user namespaces, we use the BPF LSM to deny
setgroups, so that it can still be allowed for child user namespaces.
We need this because as soon as a container launches in a child user namespace
using one of the delegated mappings, it must be able to call setgroups() in
order to function properly.

To allow mapping the root user, we need to add the CAP_SETFCAP capability to
nsresourced.

13 files changed:
man/systemd-nsresourced.service.xml
src/basic/uid-classification.h
src/basic/uid-range.c
src/basic/uid-range.h
src/nsresourced/bpf/userns-restrict/userns-restrict.bpf.c
src/nsresourced/nsresourcework.c
src/nsresourced/test-userns-restrict.c
src/nsresourced/userns-registry.c
src/nsresourced/userns-restrict.c
src/nsresourced/userns-restrict.h
src/shared/varlink-io.systemd.NamespaceResource.c
test/units/TEST-50-DISSECT.mountfsd.sh
units/systemd-nsresourced.service.in

index 853fe09fbc97f96843803efd65424007edb201fe..e9b661271642e35eab96efb8986cc0bfb0063d61 100644 (file)
     namespace, each of size 65536. The ranges are allocated from the container UID ranges as per
     <ulink url="https://systemd.io/UIDS-GIDS">Users, Groups, UIDs and GIDs on systemd Systems</ulink>.</para>
 
+    <para>The allocation API also supports <emphasis>identity mappings</emphasis>: instead of allocating a
+    transient UID/GID range, the user namespace can be configured to map the caller's UID/GID to root (UID
+    0) inside the namespace, or to itself. Identity mappings can be combined with delegated ranges: the
+    caller first enters a privileged user namespace from which the container is set up, after which the
+    container runs in one of the delegated ranges. Unlike the transient ranges, identity-mapped users are
+    not subject to the BPF-LSM write restrictions.</para>
+
     <para>The service provides API calls to allowlist mounts (referenced via their mount file descriptors as
     per Linux <function>fsmount()</function> API), to pass ownership of a cgroup subtree to the user
     namespace and to delegate a virtual Ethernet device pair to the user namespace. When used in combination
index 6735e06b322184e3afed7f166b8585d0e2fa06e6..a4d8f916e9f3b240cd67a82ac3f131e047d395c6 100644 (file)
@@ -46,6 +46,14 @@ static inline bool gid_is_foreign(gid_t gid) {
         return uid_is_foreign((uid_t) gid);
 }
 
+static inline bool uid_is_transient(uid_t uid) {
+        return uid_is_container(uid) || uid_is_dynamic(uid);
+}
+
+static inline bool gid_is_transient(gid_t gid) {
+        return uid_is_container((uid_t) gid) || uid_is_dynamic((uid_t) gid);
+}
+
 typedef struct UGIDAllocationRange {
         uid_t system_alloc_uid_min;
         uid_t system_uid_max;
index 763c421e91edb510cb3d8b7727a983be784e6dbc..31305952ba43cf564e319ccc0124d46751fef20b 100644 (file)
@@ -532,6 +532,27 @@ int uid_range_translate(const UIDRange *outside, const UIDRange *inside, uid_t u
         return -ESRCH;
 }
 
+int uid_range_translate_userns_fd(int userns_fd, UIDRangeUsernsMode mode, uid_t uid, uid_t *ret) {
+        int r;
+
+        assert(userns_fd >= 0);
+        assert(IN_SET(mode, UID_RANGE_USERNS_OUTSIDE, GID_RANGE_USERNS_OUTSIDE));
+
+        _cleanup_(uid_range_freep) UIDRange *outside_range = NULL;
+        r = uid_range_load_userns_by_fd_full(userns_fd, mode, /* coalesce= */ false, &outside_range);
+        if (r < 0)
+                return r;
+
+        mode = mode == UID_RANGE_USERNS_OUTSIDE ? UID_RANGE_USERNS_INSIDE : GID_RANGE_USERNS_INSIDE;
+
+        _cleanup_(uid_range_freep) UIDRange *inside_range = NULL;
+        r = uid_range_load_userns_by_fd_full(userns_fd, mode, /* coalesce= */ false, &inside_range);
+        if (r < 0)
+                return r;
+
+        return uid_range_translate(outside_range, inside_range, uid, ret);
+}
+
 bool uid_range_equal(const UIDRange *a, const UIDRange *b) {
         if (a == b)
                 return true;
index a15a2a8e4f969d49eecacd55255cd668af3c7c9d..08d707ae25968b6aa5cf4afd4c4dbd258566b470 100644 (file)
@@ -67,6 +67,7 @@ int uid_range_partition(UIDRange *range, uid_t size);
 int uid_range_copy(const UIDRange *range, UIDRange **ret);
 int uid_range_remove(UIDRange *range, uid_t start, uid_t size);
 int uid_range_translate(const UIDRange *outside, const UIDRange *inside, uid_t uid, uid_t *ret);
+int uid_range_translate_userns_fd(int userns_fd, UIDRangeUsernsMode mode, uid_t uid, uid_t *ret);
 
 int uid_map_search_root(pid_t pid, UIDRangeUsernsMode mode, uid_t *ret);
 
index dbfcf59b28671eabb9d6c693aed5901037245f3c..25d609bf38fc83bc7c4c4f3ad5b9f923124d0454 100644 (file)
@@ -62,6 +62,13 @@ struct {
         __array(values, struct mnt_id_map);
 } userns_mnt_id_hash SEC(".maps");
 
+struct {
+        __uint(type, BPF_MAP_TYPE_HASH);
+        __uint(max_entries, 1);        /* placeholder, configured otherwise by nsresourced */
+        __type(key, unsigned);         /* userns inode */
+        __type(value, int);            /* dummy value */
+} userns_setgroups_deny SEC(".maps");
+
 struct {
         __uint(type, BPF_MAP_TYPE_RINGBUF);
         __uint(max_entries, 4096);
@@ -229,20 +236,59 @@ int BPF_PROG(userns_restrict_path_link, struct dentry *old_dentry, const struct
         return validate_mount(new_dir->mnt, ret);
 }
 
+SEC("lsm/task_fix_setgroups")
+int BPF_PROG(userns_restrict_task_fix_setgroups, struct cred *new_cred, const struct cred *old, int ret) {
+        struct user_namespace *p;
+        unsigned inode;
+
+        if (ret != 0) /* propagate earlier error */
+                return ret;
+
+        /* Walk the task's user namespace and its ancestors to find the first one managed by nsresourced
+         * (i.e. present in either the setgroups deny map or the mount ID hash map). This is necessary
+         * because a task could otherwise trivially bypass the setgroups() restriction by unsharing the user
+         * namespace and mapping the same users and groups. */
+        p = new_cred->user_ns;
+        for (unsigned i = 0; i < USER_NAMESPACE_DEPTH_MAX; i++) {
+                if (!p)
+                        break;
+
+                inode = p->ns.inum;
+
+                if (bpf_map_lookup_elem(&userns_setgroups_deny, &inode))
+                        return -EPERM;
+
+                if (bpf_map_lookup_elem(&userns_mnt_id_hash, &inode))
+                        return 0;
+
+                p = p->parent;
+        }
+
+        /* No nsresourced-managed ancestor found, allow. */
+        return 0;
+}
+
 SEC("kprobe/retire_userns_sysctls")
 int BPF_KPROBE(userns_restrict_retire_userns_sysctls, struct user_namespace *userns) {
         unsigned inode;
-        void *mnt_id_map;
 
         /* Inform userspace that a user namespace just went away. I wish there was a nicer way to hook into
          * user namespaces being deleted than using kprobes, but couldn't find any. */
         userns = bpf_rdonly_cast(userns, bpf_core_type_id_kernel(struct user_namespace));
         inode = userns->ns.inum;
 
-        mnt_id_map = bpf_map_lookup_elem(&userns_mnt_id_hash, &inode);
-        if (!mnt_id_map) /* No rules installed for this userns? Then send no notification. */
-                return 0;
+        /* Check each map separately to avoid the compiler merging the two lookups into a pointer OR
+         * operation, which the BPF verifier rejects. */
+        if (bpf_map_lookup_elem(&userns_mnt_id_hash, &inode))
+                goto notify;
+
+        if (bpf_map_lookup_elem(&userns_setgroups_deny, &inode))
+                goto notify;
+
+        /* No rules installed for this userns? Then send no notification. */
+        return 0;
 
+notify:
         bpf_ringbuf_output(&userns_ringbuf, &inode, sizeof(inode), 0);
         return 0;
 }
index abb50955081cf6ad3164d48afddb51671b5db6cb..82640fac4f76dcb1423f33b469f9c21d1431314a 100644 (file)
@@ -41,6 +41,7 @@
 #include "siphash24.h"
 #include "socket-util.h"
 #include "stat-util.h"
+#include "string-table.h"
 #include "string-util.h"
 #include "strv.h"
 #include "time-util.h"
@@ -76,6 +77,21 @@ typedef struct LookupParameters {
         const char *service;
 } LookupParameters;
 
+typedef enum AllocateUserRangeType {
+        ALLOCATE_USER_RANGE_MANAGED,
+        ALLOCATE_USER_RANGE_SELF,
+        _ALLOCATE_USER_RANGE_TYPE_MAX,
+        _ALLOCATE_USER_RANGE_TYPE_INVALID = -EINVAL,
+} AllocateUserRangeType;
+
+static const char *const allocate_user_range_type_table[_ALLOCATE_USER_RANGE_TYPE_MAX] = {
+        [ALLOCATE_USER_RANGE_MANAGED] = "managed",
+        [ALLOCATE_USER_RANGE_SELF]    = "self",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(allocate_user_range_type, AllocateUserRangeType);
+static JSON_DISPATCH_ENUM_DEFINE(dispatch_allocate_user_range_type, AllocateUserRangeType, allocate_user_range_type_from_string);
+
 static int build_user_json(UserNamespaceInfo *userns_info, uid_t offset, sd_json_variant **ret) {
         _cleanup_free_ char *name = NULL, *realname = NULL;
         UserDisposition disposition;
@@ -634,17 +650,21 @@ static int allocate_now(
         if (r == 0)
                 return -EEXIST;
 
-        r = allocate_one(
-                        registry_dir_fd,
-                        info->name, info->size,
-                        parent_userns_fd,
-                        candidates,
-                        &candidate);
-        if (r < 0)
-                return r;
+        /* If the source UID/GID are already set we're doing a "self" user namespace and don't need to
+         * allocate a transient range. */
+        if (!uid_is_valid(info->start_uid) && !gid_is_valid(info->start_gid)) {
+                r = allocate_one(
+                                registry_dir_fd,
+                                info->name, info->size,
+                                parent_userns_fd,
+                                candidates,
+                                &candidate);
+                if (r < 0)
+                        return r;
 
-        info->start_uid = candidate;
-        info->start_gid = (gid_t) candidate;
+                info->start_uid = candidate;
+                info->start_gid = (gid_t) candidate;
+        }
 
         /* Now allocate delegated ranges if requested */
         if (info->n_delegates > 0) {
@@ -761,7 +781,7 @@ static int write_userns(int userns_fd, int parent_userns_fd, const UserNamespace
         /* Let's enforce that the transient UID/GID ranges are mapped 1:1 in the parent user namespace, to
          * avoid any weird mapping shenanigans that might happen otherwise. */
 
-        if (start_uid != userns_info->start_uid)
+        if (uid_is_transient(userns_info->start_uid) && start_uid != userns_info->start_uid)
                 return log_debug_errno(
                         SYNTHETIC_ERRNO(ERANGE),
                         "Transient UID range not mapped 1:1 in parent userns ("UID_FMT" -> "UID_FMT")",
@@ -814,7 +834,7 @@ static int write_userns(int userns_fd, int parent_userns_fd, const UserNamespace
         if (r < 0)
                 return log_debug_errno(r, "Failed to translate GID "GID_FMT" to parent userns: %m", userns_info->start_gid);
 
-        if (start_gid != userns_info->start_gid)
+        if (gid_is_transient(userns_info->start_gid) && start_gid != userns_info->start_gid)
                 return log_debug_errno(
                         SYNTHETIC_ERRNO(ERANGE),
                         "Transient GID range not mapped 1:1 in parent userns ("GID_FMT" -> "GID_FMT")",
@@ -1023,14 +1043,23 @@ static int validate_name(sd_varlink *link, const char *name, bool mangle, char *
         return 0;
 }
 
-static int validate_target_and_size(sd_varlink *link, uid_t target, uint32_t size) {
+static int validate_target_and_size(sd_varlink *link, uid_t target, uint32_t size, AllocateUserRangeType type) {
         assert(link);
 
-        if (!IN_SET(size, 1U, 0x10000))
-                return sd_varlink_error_invalid_parameter_name(link, "size");
+        if (type == ALLOCATE_USER_RANGE_SELF) {
+                /* Self userns must have size 1 and target must be 0 or unset */
+                if (size != 1)
+                        return sd_varlink_error_invalid_parameter_name(link, "size");
+
+                if (!IN_SET(target, UID_INVALID, 0))
+                        return sd_varlink_error_invalid_parameter_name(link, "target");
+        } else {
+                if (!IN_SET(size, 1U, 0x10000))
+                        return sd_varlink_error_invalid_parameter_name(link, "size");
 
-        if (!uid_is_valid(target) || target > UINT32_MAX - size)
-                return sd_varlink_error_invalid_parameter_name(link, "target");
+                if (!uid_is_valid(target) || target > UINT32_MAX - size)
+                        return sd_varlink_error_invalid_parameter_name(link, "target");
+        }
 
         return 0;
 }
@@ -1104,6 +1133,7 @@ static int validate_userns_is_empty(sd_varlink *link, int userns_fd) {
 
 typedef struct AllocateParameters {
         const char *name;
+        AllocateUserRangeType type;
         uint32_t size;
         uid_t target;
         unsigned userns_fd_idx;
@@ -1114,12 +1144,13 @@ typedef struct AllocateParameters {
 static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) {
 
         static const sd_json_dispatch_field dispatch_table[] = {
-                { "name",                        SD_JSON_VARIANT_STRING,        sd_json_dispatch_const_string, offsetof(AllocateParameters, name),                      SD_JSON_MANDATORY },
-                { "size",                        _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32,       offsetof(AllocateParameters, size),                      SD_JSON_MANDATORY },
-                { "target",                      _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uid_gid,      offsetof(AllocateParameters, target),                    0                 },
-                { "userNamespaceFileDescriptor", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint,         offsetof(AllocateParameters, userns_fd_idx),             SD_JSON_MANDATORY },
-                { "mangleName",                  SD_JSON_VARIANT_BOOLEAN,       sd_json_dispatch_stdbool,      offsetof(AllocateParameters, mangle_name),               0                 },
-                { "delegateContainerRanges",     _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32,       offsetof(AllocateParameters, delegate_container_ranges), 0                 },
+                { "name",                        SD_JSON_VARIANT_STRING,        sd_json_dispatch_const_string,     offsetof(AllocateParameters, name),                      SD_JSON_MANDATORY },
+                { "size",                        _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32,           offsetof(AllocateParameters, size),                      SD_JSON_MANDATORY },
+                { "target",                      _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uid_gid,          offsetof(AllocateParameters, target),                    0                 },
+                { "userNamespaceFileDescriptor", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint,             offsetof(AllocateParameters, userns_fd_idx),             SD_JSON_MANDATORY },
+                { "mangleName",                  SD_JSON_VARIANT_BOOLEAN,       sd_json_dispatch_stdbool,          offsetof(AllocateParameters, mangle_name),               0                 },
+                { "type",                        SD_JSON_VARIANT_STRING,        dispatch_allocate_user_range_type, offsetof(AllocateParameters, type),                      0                 },
+                { "delegateContainerRanges",     _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32,           offsetof(AllocateParameters, delegate_container_ranges), 0                 },
                 {}
         };
 
@@ -1127,9 +1158,12 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para
         _cleanup_free_ char *userns_name = NULL;
         Context *c = ASSERT_PTR(userdata);
         uid_t peer_uid;
+        gid_t peer_gid;
         struct stat userns_st;
         AllocateParameters p = {
+                .type = ALLOCATE_USER_RANGE_MANAGED,
                 .size = UINT32_MAX,
+                .target = UID_INVALID,
                 .userns_fd_idx = UINT_MAX,
         };
         int r;
@@ -1145,11 +1179,14 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para
         if (r != 0)
                 return r;
 
+        if (p.type != ALLOCATE_USER_RANGE_SELF && p.target == UID_INVALID)
+                p.target = 0;
+
         r = validate_name(link, p.name, p.mangle_name, &userns_name);
         if (r != 0)
                 return r;
 
-        r = validate_target_and_size(link, p.target, p.size);
+        r = validate_target_and_size(link, p.target, p.size, p.type);
         if (r != 0)
                 return r;
 
@@ -1179,6 +1216,10 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para
         if (r < 0)
                 return r;
 
+        r = sd_varlink_get_peer_gid(link, &peer_gid);
+        if (r < 0)
+                return r;
+
         const char *polkit_details[] = {
                 "name", userns_name,
                 NULL,
@@ -1219,6 +1260,33 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para
         userns_info->target_uid = p.target;
         userns_info->target_gid = (gid_t) p.target;
 
+        if (p.type == ALLOCATE_USER_RANGE_SELF) {
+                /* The start UID/GID will be mapped to the parent userns in write_userns(). If a self
+                 * mapping to the peer UID/GID is requested, we have to map the target UID/GID ourselves here
+                 * as write_userns() doesn't take care of that. */
+
+                userns_info->start_uid = peer_uid;
+                userns_info->start_gid = peer_gid;
+
+                if (p.target == UID_INVALID) {
+                        r = uid_range_translate_userns_fd(
+                                        parent_userns_fd,
+                                        UID_RANGE_USERNS_OUTSIDE,
+                                        peer_uid,
+                                        &userns_info->target_uid);
+                        if (r < 0)
+                                return log_debug_errno(r, "Failed to translate UID "UID_FMT" to parent user namespace: %m", peer_uid);
+
+                        r = uid_range_translate_userns_fd(
+                                        parent_userns_fd,
+                                        GID_RANGE_USERNS_OUTSIDE,
+                                        peer_gid,
+                                        &userns_info->target_gid);
+                        if (r < 0)
+                                return log_debug_errno(r, "Failed to translate GID "GID_FMT" to parent user namespace: %m", peer_gid);
+                }
+        }
+
         /* Set up delegation arrays if requested */
         if (p.delegate_container_ranges > 0) {
                 userns_info->delegates = new0(DelegatedUserNamespaceInfo, p.delegate_container_ranges);
@@ -1259,6 +1327,15 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para
         if (r < 0)
                 goto fail;
 
+        if (p.type == ALLOCATE_USER_RANGE_SELF) {
+                /* For "self" allocations we deny setgroups() via the BPF LSM. We can't use
+                 * /proc/self/setgroups for this as that is transitive and applies to all child user
+                 * namespaces too. The BPF LSM hook only looks at the nearest nsresourced-managed userns. */
+                r = userns_restrict_setgroups_deny_by_fd(c->bpf, userns_fd);
+                if (r < 0)
+                        goto fail;
+        }
+
         r = write_userns(userns_fd, parent_userns_fd, userns_info);
         if (r < 0)
                 goto fail;
index 853fc1441f6d7c95874862244d486103d03f01f6..29125a84a8d300347c05cf1ad70c9438e28aea0d 100644 (file)
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
 
+#include <grp.h>
 #include <sched.h>
 #include <sys/eventfd.h>
 #include <sys/mount.h>
@@ -8,6 +9,7 @@
 
 #include "errno-util.h"
 #include "fd-util.h"
+#include "fileio.h"
 #include "namespace-util.h"
 #include "pidref.h"
 #include "process-util.h"
@@ -147,4 +149,137 @@ TEST(userns_restrict) {
         ASSERT_OK(pidref_wait_for_terminate_and_check("(test)", &pidref, WAIT_LOG));
 }
 
+static void write_child_mappings(PidRef *child, int parent_userns_fd) {
+        /* The kernel requires uid_map/gid_map to be written from the parent user namespace of the
+         * target namespace. Fork a helper that joins the parent userns and writes the mappings from
+         * there, mirroring what write_userns() does in nsresourcework.c. */
+        int r;
+
+        r = ASSERT_OK(pidref_safe_fork("(sd-write-map)", FORK_DEATHSIG_SIGKILL|FORK_WAIT|FORK_LOG, NULL));
+        if (r == 0) {
+                char path[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(pid_t) + 1];
+
+                ASSERT_OK_ERRNO(setns(parent_userns_fd, CLONE_NEWUSER));
+
+                xsprintf(path, "/proc/" PID_FMT "/uid_map", child->pid);
+                ASSERT_OK(write_string_file(path, "0 0 1\n", WRITE_STRING_FILE_DISABLE_BUFFER));
+
+                xsprintf(path, "/proc/" PID_FMT "/gid_map", child->pid);
+                ASSERT_OK(write_string_file(path, "0 0 1\n", WRITE_STRING_FILE_DISABLE_BUFFER));
+
+                _exit(EXIT_SUCCESS);
+        }
+}
+
+TEST(setgroups_deny) {
+        _cleanup_close_ int deny_userns_fd = -EBADF, allow_userns_fd = -EBADF,
+                             afd = -EBADF, bfd = -EBADF;
+        int r;
+
+        _cleanup_free_ char *idmap = NULL;
+        ASSERT_OK(asprintf(&idmap, "0 "UID_FMT" 1", CONTAINER_UID_MIN));
+
+        /* Create a userns that will have setgroups() denied via BPF. We don't set setgroups_deny here
+         * because that uses /proc/self/setgroups which is transitive and we want to test the BPF-LSM
+         * denial specifically. */
+        deny_userns_fd = ASSERT_OK(userns_acquire(idmap, idmap, /* setgroups_deny= */ false));
+
+        ASSERT_OK(userns_restrict_put_by_fd(
+                        bpf_obj,
+                        deny_userns_fd,
+                        /* replace= */ true,
+                        /* mount_fds= */ NULL,
+                        /* n_mount_fds= */ 0));
+        ASSERT_OK(userns_restrict_setgroups_deny_by_fd(bpf_obj, deny_userns_fd));
+
+        /* Create a userns that is managed (in mount ID hash) but does NOT have setgroups() denied */
+        allow_userns_fd = ASSERT_OK(userns_acquire(idmap, idmap, /* setgroups_deny= */ false));
+
+        ASSERT_OK(userns_restrict_put_by_fd(
+                        bpf_obj,
+                        allow_userns_fd,
+                        /* replace= */ true,
+                        /* mount_fds= */ NULL,
+                        /* n_mount_fds= */ 0));
+
+        afd = ASSERT_OK_ERRNO(eventfd(0, EFD_CLOEXEC));
+        bfd = ASSERT_OK_ERRNO(eventfd(0, EFD_CLOEXEC));
+
+        /* Test 1: setgroups() should be denied in the deny userns, including after unsharing into a child
+         * user namespace (the ancestor walk should find the deny entry). */
+        {
+                _cleanup_(pidref_done_sigkill_wait) PidRef pidref = PIDREF_NULL;
+
+                r = ASSERT_OK(pidref_safe_fork("(test-deny)", FORK_LOG|FORK_DEATHSIG_SIGKILL, &pidref));
+                if (r == 0) {
+                        /* Enter the userns manually without going through namespace_enter(), because
+                         * that calls reset_uid_gid() which calls setgroups() internally. Since the
+                         * BPF LSM denies setgroups(), reset_uid_gid() would fail before calling
+                         * setresuid()/setresgid(), leaving us as the overflow UID without
+                         * capabilities. */
+                        ASSERT_OK_ERRNO(setns(deny_userns_fd, CLONE_NEWUSER));
+                        ASSERT_OK_ERRNO(setresgid(0, 0, 0));
+                        ASSERT_OK_ERRNO(setresuid(0, 0, 0));
+
+                        /* setgroups() should be denied by BPF LSM */
+                        ASSERT_ERROR_ERRNO(setgroups(0, NULL), EPERM);
+
+                        /* Unshare into a child user namespace. The parent will write the mappings
+                         * for us since writing /proc/self/uid_map from inside the userns fails
+                         * because the proc mount belongs to the init user namespace. */
+                        ASSERT_OK_ERRNO(unshare(CLONE_NEWUSER));
+                        ASSERT_OK_ERRNO(eventfd_write(afd, 1));
+                        uint64_t x;
+                        ASSERT_OK_ERRNO(eventfd_read(bfd, &x));
+
+                        /* setgroups() should still be denied because the ancestor walk finds the
+                         * deny entry on the parent user namespace */
+                        ASSERT_ERROR_ERRNO(setgroups(0, NULL), EPERM);
+
+                        _exit(EXIT_SUCCESS);
+                }
+
+                uint64_t x;
+                ASSERT_OK_ERRNO(eventfd_read(afd, &x));
+                write_child_mappings(&pidref, deny_userns_fd);
+                ASSERT_OK_ERRNO(eventfd_write(bfd, 1));
+
+                ASSERT_OK(pidref_wait_for_terminate_and_check("(test-deny)", &pidref, WAIT_LOG));
+        }
+
+        /* Test 2: setgroups() should be allowed in the managed-only userns (mount ID hash but no setgroups
+         * deny entry), including in a child user namespace. */
+        {
+                _cleanup_(pidref_done_sigkill_wait) PidRef pidref = PIDREF_NULL;
+
+                r = ASSERT_OK(pidref_safe_fork("(test-allow)", FORK_LOG|FORK_DEATHSIG_SIGKILL, &pidref));
+                if (r == 0) {
+                        ASSERT_OK_ERRNO(setns(allow_userns_fd, CLONE_NEWUSER));
+                        ASSERT_OK_ERRNO(setresgid(0, 0, 0));
+                        ASSERT_OK_ERRNO(setresuid(0, 0, 0));
+
+                        /* setgroups() should succeed since this userns is only in the mount ID hash */
+                        ASSERT_OK_ERRNO(setgroups(0, NULL));
+
+                        /* Also should work in a child userns since the ancestor walk finds the
+                         * mount ID hash entry (not the setgroups deny entry) */
+                        ASSERT_OK_ERRNO(unshare(CLONE_NEWUSER));
+                        ASSERT_OK_ERRNO(eventfd_write(afd, 1));
+                        uint64_t x;
+                        ASSERT_OK_ERRNO(eventfd_read(bfd, &x));
+
+                        ASSERT_OK_ERRNO(setgroups(0, NULL));
+
+                        _exit(EXIT_SUCCESS);
+                }
+
+                uint64_t x;
+                ASSERT_OK_ERRNO(eventfd_read(afd, &x));
+                write_child_mappings(&pidref, allow_userns_fd);
+                ASSERT_OK_ERRNO(eventfd_write(bfd, 1));
+
+                ASSERT_OK(pidref_wait_for_terminate_and_check("(test-allow)", &pidref, WAIT_LOG));
+        }
+}
+
 DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro);
index a728c7fde9f8030390921867cc1ea57a3e0270e0..371a35086f4248416a5962db4e860a21526534e7 100644 (file)
@@ -20,6 +20,7 @@
 #include "stat-util.h"
 #include "string-util.h"
 #include "strv.h"
+#include "uid-classification.h"
 #include "user-util.h"
 #include "userns-registry.h"
 
@@ -285,8 +286,6 @@ static int userns_registry_load(int dir_fd, const char *fn, UserNamespaceInfo **
 
         if (userns_info->userns_inode == 0)
                 return -EBADMSG;
-        if (userns_info->start_uid == 0 || userns_info->start_gid == 0)
-                return -EBADMSG;
 
         if (userns_info->size == 0) {
                 if (uid_is_valid(userns_info->start_uid) || uid_is_valid(userns_info->target_uid))
@@ -611,7 +610,7 @@ int userns_registry_store(int dir_fd, UserNamespaceInfo *info) {
                 goto fail;
         }
 
-        if (uid_is_valid(info->start_uid)) {
+        if (uid_is_transient(info->start_uid)) {
                 if (asprintf(&link2_fn, "u" UID_FMT ".userns", info->start_uid) < 0) {
                         r = log_oom_debug();
                         goto fail;
@@ -624,7 +623,7 @@ int userns_registry_store(int dir_fd, UserNamespaceInfo *info) {
                 }
         }
 
-        if (gid_is_valid(info->start_gid)) {
+        if (gid_is_transient(info->start_gid)) {
                 if (asprintf(&link3_fn, "g" GID_FMT ".userns", info->start_gid) < 0) {
                         r = log_oom_debug();
                         goto fail;
@@ -795,7 +794,7 @@ int userns_registry_remove(int dir_fd, UserNamespaceInfo *info) {
         if (r < 0)
                 RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", link1_fn));
 
-        if (uid_is_valid(info->start_uid)) {
+        if (uid_is_transient(info->start_uid)) {
                 _cleanup_free_ char *link2_fn = NULL;
 
                 if (asprintf(&link2_fn, "u" UID_FMT ".userns", info->start_uid) < 0)
@@ -806,7 +805,7 @@ int userns_registry_remove(int dir_fd, UserNamespaceInfo *info) {
                         RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", link2_fn));
         }
 
-        if (uid_is_valid(info->start_gid)) {
+        if (gid_is_transient(info->start_gid)) {
                 _cleanup_free_ char *link3_fn = NULL;
 
                 if (asprintf(&link3_fn, "g" GID_FMT ".userns", info->start_gid) < 0)
index 6a8306de66a6843218b64df545a0516c412d3843..c0d7f8a82daec561c9fae3c3c25d712acb3c7104 100644 (file)
@@ -111,6 +111,10 @@ int userns_restrict_install(
         if (r < 0)
                 return log_error_errno(r, "Failed to size userns ring buffer: %m");
 
+        r = sym_bpf_map__set_max_entries(obj->maps.userns_setgroups_deny, USERNS_MAX);
+        if (r < 0)
+                return log_error_errno(r, "Failed to size userns setgroups deny hash table: %m");
+
         /* Dummy map to satisfy the verifier */
         dummy_mnt_id_hash_fd = make_inner_hash_map();
         if (dummy_mnt_id_hash_fd < 0)
@@ -320,7 +324,7 @@ int userns_restrict_put_by_fd(
 int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode) {
 
 #if HAVE_VMLINUX_H
-        int r, outer_map_fd;
+        int r, outer_map_fd, setgroups_deny_fd;
         unsigned u;
 
         assert(obj);
@@ -339,8 +343,77 @@ int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t use
         if (r < 0)
                 return log_debug_errno(r, "Failed to remove entry for inode %" PRIu64 " from outer map: %m", userns_inode);
 
+        setgroups_deny_fd = sym_bpf_map__fd(obj->maps.userns_setgroups_deny);
+        if (setgroups_deny_fd < 0)
+                return log_debug_errno(setgroups_deny_fd, "Failed to get setgroups deny BPF map fd: %m");
+
+        r = sym_bpf_map_delete_elem(setgroups_deny_fd, &u);
+        if (r < 0 && r != -ENOENT)
+                return log_debug_errno(r, "Failed to remove entry for inode %" PRIu64 " from setgroups deny map: %m", userns_inode);
+
         return 0;
 #else
         return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
 #endif
 }
+
+int userns_restrict_setgroups_deny_by_inode(
+                struct userns_restrict_bpf *obj,
+                uint64_t userns_inode) {
+        /* Registers the user namespace with the given inode number in the userns_setgroups_deny BPF map. */
+#if HAVE_VMLINUX_H
+        int map_fd, r;
+        uint32_t dummy = 1; /* Map value; always 1 here. Presumably only the key's presence matters (TODO confirm in BPF program) */
+        unsigned ino; /* The inode number narrowed to the type used as map key, after the range check below */
+
+        assert(obj);
+        assert(userns_inode != 0);
+
+        /* The BPF map only supports 32bit keys, and user namespace inode numbers are 32bit too, even though
+         * ino_t is 64bit these days. Should we ever run into a 64bit inode let's refuse early. */
+        if (userns_inode > UINT32_MAX)
+                return -EINVAL; /* Note: unlike the failure paths below, this one returns without logging */
+
+        ino = (unsigned) userns_inode;
+
+        map_fd = sym_bpf_map__fd(obj->maps.userns_setgroups_deny);
+        if (map_fd < 0)
+                return log_debug_errno(map_fd, "Failed to get setgroups deny BPF map fd: %m");
+
+        r = sym_bpf_map_update_elem(map_fd, &ino, &dummy, BPF_ANY); /* BPF_ANY: fine whether or not the key already exists */
+        if (r < 0)
+                return log_debug_errno(r, "Failed to add userns inode to setgroups deny map: %m");
+
+        log_debug("Denying setgroups() on userns inode %" PRIu64, userns_inode);
+
+        return 0;
+#else /* !HAVE_VMLINUX_H: no BPF support compiled in, so the request always fails */
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+#endif
+}
+
+int userns_restrict_setgroups_deny_by_fd(
+                struct userns_restrict_bpf *obj,
+                int userns_fd) {
+        /* Same as userns_restrict_setgroups_deny_by_inode(), but identifies the user namespace by an fd. */
+#if HAVE_VMLINUX_H
+        struct stat st;
+        int r;
+
+        assert(obj);
+        assert(userns_fd >= 0);
+
+        r = fd_is_namespace(userns_fd, NAMESPACE_USER); /* Validate the fd type before trusting its inode number */
+        if (r < 0)
+                return log_debug_errno(r, "Failed to determine if file descriptor is user namespace: %m");
+        if (r == 0)
+                return log_debug_errno(SYNTHETIC_ERRNO(EBADF), "User namespace fd is not actually a user namespace fd.");
+
+        if (fstat(userns_fd, &st) < 0) /* st.st_ino is what gets passed on as the namespace's inode number */
+                return log_debug_errno(errno, "Failed to fstat() user namespace: %m");
+
+        return userns_restrict_setgroups_deny_by_inode(obj, st.st_ino);
+#else /* !HAVE_VMLINUX_H: no BPF support compiled in, so the request always fails */
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+#endif
+}
index 21a81feaff4e5aa9fe22f9f3b295aeae8f3a46d9..f0673d159446e173955e1972330d61f8aa6d7469 100644 (file)
@@ -13,4 +13,7 @@ int userns_restrict_put_by_inode(struct userns_restrict_bpf *obj, uint64_t usern
 
 int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode);
 
+int userns_restrict_setgroups_deny_by_fd(struct userns_restrict_bpf *obj, int userns_fd);
+int userns_restrict_setgroups_deny_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode);
+
 DEFINE_TRIVIAL_CLEANUP_FUNC(struct userns_restrict_bpf*, userns_restrict_bpf_free);
index 7d5f5093224f480b107a318d544b86109c18ebf1..79fff1b592b2b1a0f43f8bd9cfd2cfa06813ea32 100644 (file)
@@ -2,18 +2,27 @@
 
 #include "varlink-io.systemd.NamespaceResource.h"
 
+static SD_VARLINK_DEFINE_ENUM_TYPE(
+                AllocateUserRangeType, /* Type of the "type" input of AllocateUserRange; also listed in the interface definition */
+                SD_VARLINK_FIELD_COMMENT("Allocate a transient UID/GID range from the dynamic range pool. This is the default."),
+                SD_VARLINK_DEFINE_ENUM_VALUE(managed),
+                SD_VARLINK_FIELD_COMMENT("Create a user namespace that maps the peer UID/GID to itself instead of allocating a transient UID range."),
+                SD_VARLINK_DEFINE_ENUM_VALUE(self));
+
 static SD_VARLINK_DEFINE_METHOD(
                 AllocateUserRange,
                 SD_VARLINK_FIELD_COMMENT("The name for the user namespace, a short string that must be fit to be included in a file name and in a user name. This name is included in the user records announced via NSS and is otherwise useful for debugging."),
                 SD_VARLINK_DEFINE_INPUT(name, SD_VARLINK_STRING, 0),
                 SD_VARLINK_FIELD_COMMENT("Controls whether to mangle the provided name if needed so that it is suitable for naming a user namespace. If true this will shorten the name as necessary or randomize it if that's not sufficient. If null defaults to false."),
                 SD_VARLINK_DEFINE_INPUT(mangleName, SD_VARLINK_BOOL, SD_VARLINK_NULLABLE),
-                SD_VARLINK_FIELD_COMMENT("The number of UIDs to assign. Must be 1 or 65536."),
+                SD_VARLINK_FIELD_COMMENT("The number of UIDs to assign. Must be 1 or 65536. If type is 'self', must be 1."),
                 SD_VARLINK_DEFINE_INPUT(size, SD_VARLINK_INT, 0),
-                SD_VARLINK_FIELD_COMMENT("The target UID inside the user namespace. If not specified defaults to 0."),
+                SD_VARLINK_FIELD_COMMENT("The target UID inside the user namespace. If not specified defaults to 0. If type is 'self', must be 0 or unset in which case the peer UID is mapped to itself."),
                 SD_VARLINK_DEFINE_INPUT(target, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
                 SD_VARLINK_FIELD_COMMENT("A file descriptor to an allocated userns with no current UID range assignments"),
                 SD_VARLINK_DEFINE_INPUT(userNamespaceFileDescriptor, SD_VARLINK_INT, 0),
+                SD_VARLINK_FIELD_COMMENT("The type of allocation to perform. If 'managed' (the default), a transient UID/GID range is allocated from the dynamic range pool. If 'self', the peer UID/GID is mapped to itself. Defaults to 'managed'."),
+                SD_VARLINK_DEFINE_INPUT_BY_TYPE(type, AllocateUserRangeType, SD_VARLINK_NULLABLE),
                 SD_VARLINK_FIELD_COMMENT("Number of transient 64K container UID/GID ranges to delegate. These are mapped 1:1 into the user namespace and can be used by nested user namespaces for container workloads. Must be between 0 and 16. Defaults to 0."),
                 SD_VARLINK_DEFINE_INPUT(delegateContainerRanges, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
                 SD_VARLINK_FIELD_COMMENT("The name assigned to the user namespace. (This is particularly interesting in case mangleName was enabled)."),
@@ -77,6 +86,8 @@ SD_VARLINK_DEFINE_INTERFACE(
                 io_systemd_NamespaceResource,
                 "io.systemd.NamespaceResource",
                 SD_VARLINK_INTERFACE_COMMENT("Allocate transient UID ranges for user namespace, and assign mounts, cgroups and networking devices to them"),
+                SD_VARLINK_SYMBOL_COMMENT("The type of user range allocation to perform."),
+                &vl_type_AllocateUserRangeType,
                 SD_VARLINK_SYMBOL_COMMENT("Assigns a UID range to a client-allocated user namespace that has no UID range assigned so far, and registers it for assignment of other resources."),
                 &vl_method_AllocateUserRange,
                 SD_VARLINK_SYMBOL_COMMENT("Registers an already initialized user namespace for assignment of resources."),
index e5092b56868f35b9c2fe15b6b268c151baa2f815..c468e3b8f89b0549485f8a70c361e8684485800b 100755 (executable)
@@ -77,6 +77,28 @@ test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \
         '{"name":"test-fail","size":65536,"userNamespaceFileDescriptor":0,"delegateContainerRanges":17}') |&
             grep "io.systemd.NamespaceResource.TooManyDelegations" >/dev/null
 
+# Test self mapping
+# Verify that self mapping maps the peer UID to root: uid_map should show "0 <peer_uid> 1", of which only columns 1 and 3 ("0 1") are compared
+test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \
+        --push-fd=/proc/self/ns/user \
+        /run/systemd/userdb/io.systemd.NamespaceResource \
+        io.systemd.NamespaceResource.AllocateUserRange \
+        '{"name":"test-id","target":0,"size":1,"userNamespaceFileDescriptor":0,"type":"self"}' \
+        -- cat /proc/self/uid_map | awk '{print $1, $3}')" = "0 1"
+
+# Test nested delegation with self mapping: the inner uid_map should have 3 lines (presumably the 64K range plus the 2 delegated ranges)
+test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \
+        --push-fd=/proc/self/ns/user \
+        /run/systemd/userdb/io.systemd.NamespaceResource \
+        io.systemd.NamespaceResource.AllocateUserRange \
+        '{"name":"test-delegate2","type":"self","size":1,"userNamespaceFileDescriptor":0,"delegateContainerRanges":3}' \
+        -- unshare --user varlinkctl --exec call \
+            --push-fd=/proc/self/ns/user \
+            /run/systemd/userdb/io.systemd.NamespaceResource \
+            io.systemd.NamespaceResource.AllocateUserRange \
+            '{"name":"test-delegate3","size":65536,"userNamespaceFileDescriptor":0,"delegateContainerRanges":2}' \
+            -- cat /proc/self/uid_map | wc -l)" -eq 3
+
 # This should work without the key
 systemd-dissect --image-policy='root=verity:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null
 systemd-dissect --image-policy='root=verity+signed:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null
index 0e2d6b3628c3535dba62b58230768a0cef5a3e01..143f9a9a3d296fc32ad6cbc6b00648f622c7d76f 100644 (file)
@@ -18,7 +18,7 @@ After=modprobe@tun.service
 DefaultDependencies=no
 
 [Service]
-CapabilityBoundingSet=CAP_DAC_READ_SEARCH CAP_SYS_RESOURCE CAP_BPF CAP_PERFMON CAP_SETGID CAP_SETUID CAP_SYS_ADMIN CAP_CHOWN CAP_FOWNER CAP_NET_ADMIN
+CapabilityBoundingSet=CAP_DAC_READ_SEARCH CAP_SYS_RESOURCE CAP_BPF CAP_PERFMON CAP_SETGID CAP_SETUID CAP_SYS_ADMIN CAP_CHOWN CAP_FOWNER CAP_NET_ADMIN CAP_SETFCAP
 ExecStart={{LIBEXECDIR}}/systemd-nsresourced
 IPAddressDeny=any
 LimitNOFILE={{HIGH_RLIMIT_NOFILE}}