namespace, each of size 65536. The ranges are allocated from the container UID ranges as per
<ulink url="https://systemd.io/UIDS-GIDS">Users, Groups, UIDs and GIDs on systemd Systems</ulink>.</para>
+ <para>The allocation API also supports <emphasis>identity mappings</emphasis>: instead of allocating a
+ transient UID/GID range, the user namespace can be configured to map the caller's UID/GID to root (UID
+ 0) inside the namespace, or to itself. Identity mappings can be combined with delegated ranges: this
+ allows entering a privileged user namespace from which the container can be set up, after which the
+ container can run in one of the delegated ranges. Unlike the transient ranges, identity-mapped users
+ are not subject to BPF-LSM write restrictions.</para>
+
<para>The service provides API calls to allowlist mounts (referenced via their mount file descriptors as
per Linux <function>fsmount()</function> API), to pass ownership of a cgroup subtree to the user
namespace and to delegate a virtual Ethernet device pair to the user namespace. When used in combination
return uid_is_foreign((uid_t) gid);
}
+/* Returns true if 'uid' is classified as transient, i.e. if either uid_is_container() or
+ * uid_is_dynamic() holds for it. */
+static inline bool uid_is_transient(uid_t uid) {
+        if (uid_is_container(uid))
+                return true;
+
+        return uid_is_dynamic(uid);
+}
+
+/* GID counterpart of uid_is_transient(): the same classification is applied to the numeric value of
+ * 'gid'. */
+static inline bool gid_is_transient(gid_t gid) {
+        return uid_is_transient((uid_t) gid);
+}
+
typedef struct UGIDAllocationRange {
uid_t system_alloc_uid_min;
uid_t system_uid_max;
return -ESRCH;
}
+/* Translates 'uid' via the UID (or GID) map of the user namespace referenced by 'userns_fd'.
+ *
+ * 'mode' must be UID_RANGE_USERNS_OUTSIDE or GID_RANGE_USERNS_OUTSIDE and selects whether the UID or
+ * the GID map is used. The outside ranges and the matching inside ranges are both loaded with
+ * coalescing disabled and then handed to uid_range_translate(), whose result is returned as-is. A
+ * failure to load either range is propagated to the caller. */
+int uid_range_translate_userns_fd(int userns_fd, UIDRangeUsernsMode mode, uid_t uid, uid_t *ret) {
+        _cleanup_(uid_range_freep) UIDRange *outside = NULL, *inside = NULL;
+        UIDRangeUsernsMode inside_mode;
+        int r;
+
+        assert(userns_fd >= 0);
+        assert(IN_SET(mode, UID_RANGE_USERNS_OUTSIDE, GID_RANGE_USERNS_OUTSIDE));
+
+        r = uid_range_load_userns_by_fd_full(userns_fd, mode, /* coalesce= */ false, &outside);
+        if (r < 0)
+                return r;
+
+        /* Pick the inside counterpart of whichever map (UID or GID) the caller asked for. */
+        if (mode == UID_RANGE_USERNS_OUTSIDE)
+                inside_mode = UID_RANGE_USERNS_INSIDE;
+        else
+                inside_mode = GID_RANGE_USERNS_INSIDE;
+
+        r = uid_range_load_userns_by_fd_full(userns_fd, inside_mode, /* coalesce= */ false, &inside);
+        if (r < 0)
+                return r;
+
+        return uid_range_translate(outside, inside, uid, ret);
+}
+
bool uid_range_equal(const UIDRange *a, const UIDRange *b) {
if (a == b)
return true;
int uid_range_copy(const UIDRange *range, UIDRange **ret);
int uid_range_remove(UIDRange *range, uid_t start, uid_t size);
int uid_range_translate(const UIDRange *outside, const UIDRange *inside, uid_t uid, uid_t *ret);
+int uid_range_translate_userns_fd(int userns_fd, UIDRangeUsernsMode mode, uid_t uid, uid_t *ret);
int uid_map_search_root(pid_t pid, UIDRangeUsernsMode mode, uid_t *ret);
__array(values, struct mnt_id_map);
} userns_mnt_id_hash SEC(".maps");
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1); /* placeholder, configured otherwise by nsresourced */
+ __type(key, unsigned); /* userns inode */
+ __type(value, int); /* dummy value, the programs in this file only test whether the key is present */
+} userns_setgroups_deny SEC(".maps"); /* set of user namespaces for which setgroups() is refused */
+
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 4096);
return validate_mount(new_dir->mnt, ret);
}
+SEC("lsm/task_fix_setgroups")
+int BPF_PROG(userns_restrict_task_fix_setgroups, struct cred *new_cred, const struct cred *old, int ret) {
+ struct user_namespace *p;
+ unsigned inode;
+
+ if (ret != 0) /* propagate earlier error */
+ return ret;
+
+ /* Walk the task's user namespace and its ancestors to find the first one managed by nsresourced
+ * (i.e. present in either the setgroups deny map or the mount ID hash map). This is necessary
+ * because a task could otherwise trivially bypass the setgroups() restriction by unsharing the user
+ * namespace and mapping the same users and groups.
+ *
+ * The nearest managed namespace decides the outcome: a hit in the setgroups deny map refuses the
+ * call with -EPERM, a hit in the mount ID hash map allows it. The deny map is consulted first, so
+ * a namespace present in both maps is denied. The walk is bounded by USER_NAMESPACE_DEPTH_MAX
+ * iterations and stops early once the parent pointer is NULL. */
+ p = new_cred->user_ns;
+ for (unsigned i = 0; i < USER_NAMESPACE_DEPTH_MAX; i++) {
+ if (!p)
+ break;
+
+ inode = p->ns.inum;
+
+ if (bpf_map_lookup_elem(&userns_setgroups_deny, &inode))
+ return -EPERM;
+
+ if (bpf_map_lookup_elem(&userns_mnt_id_hash, &inode))
+ return 0;
+
+ p = p->parent;
+ }
+
+ /* No nsresourced-managed ancestor found (or the depth limit was reached), allow. */
+ return 0;
+}
+
SEC("kprobe/retire_userns_sysctls")
int BPF_KPROBE(userns_restrict_retire_userns_sysctls, struct user_namespace *userns) {
unsigned inode;
- void *mnt_id_map;
/* Inform userspace that a user namespace just went away. I wish there was a nicer way to hook into
* user namespaces being deleted than using kprobes, but couldn't find any. */
userns = bpf_rdonly_cast(userns, bpf_core_type_id_kernel(struct user_namespace));
inode = userns->ns.inum;
- mnt_id_map = bpf_map_lookup_elem(&userns_mnt_id_hash, &inode);
- if (!mnt_id_map) /* No rules installed for this userns? Then send no notification. */
- return 0;
+ /* A notification is sent if the user namespace is present in at least one of the two maps (mount
+ * ID hash or setgroups deny). Check each map separately to avoid the compiler merging the two
+ * lookups into a pointer OR operation, which the BPF verifier rejects. */
+ if (bpf_map_lookup_elem(&userns_mnt_id_hash, &inode))
+ goto notify;
+
+ if (bpf_map_lookup_elem(&userns_setgroups_deny, &inode))
+ goto notify;
+
+ /* No rules installed for this userns in either map? Then send no notification. */
+ return 0;
+notify:
bpf_ringbuf_output(&userns_ringbuf, &inode, sizeof(inode), 0);
return 0;
}
#include "siphash24.h"
#include "socket-util.h"
#include "stat-util.h"
+#include "string-table.h"
#include "string-util.h"
#include "strv.h"
#include "time-util.h"
const char *service;
} LookupParameters;
+typedef enum AllocateUserRangeType {
+ ALLOCATE_USER_RANGE_MANAGED, /* allocate a transient UID/GID range; used when "type" is not specified */
+ ALLOCATE_USER_RANGE_SELF, /* use the peer's own UID/GID as source instead of allocating a transient range */
+ _ALLOCATE_USER_RANGE_TYPE_MAX,
+ _ALLOCATE_USER_RANGE_TYPE_INVALID = -EINVAL,
+} AllocateUserRangeType;
+
+static const char *const allocate_user_range_type_table[_ALLOCATE_USER_RANGE_TYPE_MAX] = { /* string form of each type */
+ [ALLOCATE_USER_RANGE_MANAGED] = "managed",
+ [ALLOCATE_USER_RANGE_SELF] = "self",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(allocate_user_range_type, AllocateUserRangeType);
+static JSON_DISPATCH_ENUM_DEFINE(dispatch_allocate_user_range_type, AllocateUserRangeType, allocate_user_range_type_from_string); /* JSON dispatcher for the "type" field of AllocateUserRange */
+
static int build_user_json(UserNamespaceInfo *userns_info, uid_t offset, sd_json_variant **ret) {
_cleanup_free_ char *name = NULL, *realname = NULL;
UserDisposition disposition;
if (r == 0)
return -EEXIST;
- r = allocate_one(
- registry_dir_fd,
- info->name, info->size,
- parent_userns_fd,
- candidates,
- &candidate);
- if (r < 0)
- return r;
+ /* If the source UID/GID are already set we're doing a "self" user namespace and don't need to
+ * allocate a transient range. */
+ if (!uid_is_valid(info->start_uid) && !gid_is_valid(info->start_gid)) {
+ r = allocate_one(
+ registry_dir_fd,
+ info->name, info->size,
+ parent_userns_fd,
+ candidates,
+ &candidate);
+ if (r < 0)
+ return r;
- info->start_uid = candidate;
- info->start_gid = (gid_t) candidate;
+ info->start_uid = candidate;
+ info->start_gid = (gid_t) candidate;
+ }
/* Now allocate delegated ranges if requested */
if (info->n_delegates > 0) {
/* Let's enforce that the transient UID/GID ranges are mapped 1:1 in the parent user namespace, to
* avoid any weird mapping shenanigans that might happen otherwise. */
- if (start_uid != userns_info->start_uid)
+ if (uid_is_transient(userns_info->start_uid) && start_uid != userns_info->start_uid)
return log_debug_errno(
SYNTHETIC_ERRNO(ERANGE),
"Transient UID range not mapped 1:1 in parent userns ("UID_FMT" -> "UID_FMT")",
if (r < 0)
return log_debug_errno(r, "Failed to translate GID "GID_FMT" to parent userns: %m", userns_info->start_gid);
- if (start_gid != userns_info->start_gid)
+ if (gid_is_transient(userns_info->start_gid) && start_gid != userns_info->start_gid)
return log_debug_errno(
SYNTHETIC_ERRNO(ERANGE),
"Transient GID range not mapped 1:1 in parent userns ("GID_FMT" -> "GID_FMT")",
return 0;
}
-static int validate_target_and_size(sd_varlink *link, uid_t target, uint32_t size) {
+static int validate_target_and_size(sd_varlink *link, uid_t target, uint32_t size, AllocateUserRangeType type) {
assert(link);
- if (!IN_SET(size, 1U, 0x10000))
- return sd_varlink_error_invalid_parameter_name(link, "size");
+ if (type == ALLOCATE_USER_RANGE_SELF) {
+ /* Self userns must have size 1 and target must be 0 or unset. Unset is represented by
+ * UID_INVALID here, in which case the caller maps the peer UID/GID to itself. */
+ if (size != 1)
+ return sd_varlink_error_invalid_parameter_name(link, "size");
+
+ if (!IN_SET(target, UID_INVALID, 0))
+ return sd_varlink_error_invalid_parameter_name(link, "target");
+ } else { /* all other types: size must be 1 or 64K, and target + size must not overflow 32 bit */
+ if (!IN_SET(size, 1U, 0x10000))
+ return sd_varlink_error_invalid_parameter_name(link, "size");
- if (!uid_is_valid(target) || target > UINT32_MAX - size)
- return sd_varlink_error_invalid_parameter_name(link, "target");
+ if (!uid_is_valid(target) || target > UINT32_MAX - size)
+ return sd_varlink_error_invalid_parameter_name(link, "target");
+ }
return 0;
}
typedef struct AllocateParameters {
const char *name;
+ AllocateUserRangeType type;
uint32_t size;
uid_t target;
unsigned userns_fd_idx;
static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) {
static const sd_json_dispatch_field dispatch_table[] = {
- { "name", SD_JSON_VARIANT_STRING, sd_json_dispatch_const_string, offsetof(AllocateParameters, name), SD_JSON_MANDATORY },
- { "size", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32, offsetof(AllocateParameters, size), SD_JSON_MANDATORY },
- { "target", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uid_gid, offsetof(AllocateParameters, target), 0 },
- { "userNamespaceFileDescriptor", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint, offsetof(AllocateParameters, userns_fd_idx), SD_JSON_MANDATORY },
- { "mangleName", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_stdbool, offsetof(AllocateParameters, mangle_name), 0 },
- { "delegateContainerRanges", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32, offsetof(AllocateParameters, delegate_container_ranges), 0 },
+ { "name", SD_JSON_VARIANT_STRING, sd_json_dispatch_const_string, offsetof(AllocateParameters, name), SD_JSON_MANDATORY },
+ { "size", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32, offsetof(AllocateParameters, size), SD_JSON_MANDATORY },
+ { "target", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uid_gid, offsetof(AllocateParameters, target), 0 },
+ { "userNamespaceFileDescriptor", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint, offsetof(AllocateParameters, userns_fd_idx), SD_JSON_MANDATORY },
+ { "mangleName", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_stdbool, offsetof(AllocateParameters, mangle_name), 0 },
+ { "type", SD_JSON_VARIANT_STRING, dispatch_allocate_user_range_type, offsetof(AllocateParameters, type), 0 },
+ { "delegateContainerRanges", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32, offsetof(AllocateParameters, delegate_container_ranges), 0 },
{}
};
_cleanup_free_ char *userns_name = NULL;
Context *c = ASSERT_PTR(userdata);
uid_t peer_uid;
+ gid_t peer_gid;
struct stat userns_st;
AllocateParameters p = {
+ .type = ALLOCATE_USER_RANGE_MANAGED,
.size = UINT32_MAX,
+ .target = UID_INVALID,
.userns_fd_idx = UINT_MAX,
};
int r;
if (r != 0)
return r;
+ if (p.type != ALLOCATE_USER_RANGE_SELF && p.target == UID_INVALID)
+ p.target = 0;
+
r = validate_name(link, p.name, p.mangle_name, &userns_name);
if (r != 0)
return r;
- r = validate_target_and_size(link, p.target, p.size);
+ r = validate_target_and_size(link, p.target, p.size, p.type);
if (r != 0)
return r;
if (r < 0)
return r;
+ r = sd_varlink_get_peer_gid(link, &peer_gid);
+ if (r < 0)
+ return r;
+
const char *polkit_details[] = {
"name", userns_name,
NULL,
userns_info->target_uid = p.target;
userns_info->target_gid = (gid_t) p.target;
+ if (p.type == ALLOCATE_USER_RANGE_SELF) {
+ /* The start UID/GID will be mapped to the parent userns in write_userns(). If a self
+ * mapping to the peer UID/GID is requested, we have to map the target UID/GID ourselves here
+ * as write_userns() doesn't take care of that. */
+
+ userns_info->start_uid = peer_uid;
+ userns_info->start_gid = peer_gid;
+
+ if (p.target == UID_INVALID) {
+ r = uid_range_translate_userns_fd(
+ parent_userns_fd,
+ UID_RANGE_USERNS_OUTSIDE,
+ peer_uid,
+ &userns_info->target_uid);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to translate UID "UID_FMT" to parent user namespace: %m", peer_uid);
+
+ r = uid_range_translate_userns_fd(
+ parent_userns_fd,
+ GID_RANGE_USERNS_OUTSIDE,
+ peer_gid,
+ &userns_info->target_gid);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to translate GID "GID_FMT" to parent user namespace: %m", peer_gid);
+ }
+ }
+
/* Set up delegation arrays if requested */
if (p.delegate_container_ranges > 0) {
userns_info->delegates = new0(DelegatedUserNamespaceInfo, p.delegate_container_ranges);
if (r < 0)
goto fail;
+ if (p.type == ALLOCATE_USER_RANGE_SELF) {
+ /* For "self" allocations we deny setgroups() via the BPF LSM. We can't use
+ * /proc/self/setgroups for this as that is transitive and also applies to child user
+ * namespaces. The BPF LSM hook only applies to the specific user namespace. */
+ r = userns_restrict_setgroups_deny_by_fd(c->bpf, userns_fd);
+ if (r < 0)
+ goto fail;
+ }
+
r = write_userns(userns_fd, parent_userns_fd, userns_info);
if (r < 0)
goto fail;
/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#include <grp.h>
#include <sched.h>
#include <sys/eventfd.h>
#include <sys/mount.h>
#include "errno-util.h"
#include "fd-util.h"
+#include "fileio.h"
#include "namespace-util.h"
#include "pidref.h"
#include "process-util.h"
ASSERT_OK(pidref_wait_for_terminate_and_check("(test)", &pidref, WAIT_LOG));
}
+/* Writes a single-entry "0 0 1" mapping to both /proc/<pid>/uid_map and /proc/<pid>/gid_map of the
+ * process referenced by 'child'.
+ *
+ * The kernel requires uid_map/gid_map to be written from the parent user namespace of the target
+ * namespace, hence a helper process is forked off that joins 'parent_userns_fd' and writes both files
+ * from there, mirroring what write_userns() does in nsresourcework.c. */
+static void write_child_mappings(PidRef *child, int parent_userns_fd) {
+        char uid_map_path[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(pid_t) + 1],
+             gid_map_path[STRLEN("/proc//gid_map") + DECIMAL_STR_MAX(pid_t) + 1];
+        int r;
+
+        r = ASSERT_OK(pidref_safe_fork("(sd-write-map)", FORK_DEATHSIG_SIGKILL|FORK_WAIT|FORK_LOG, NULL));
+        if (r != 0)
+                return; /* Parent: nothing left to do. */
+
+        /* From here on we are the forked helper. */
+        xsprintf(uid_map_path, "/proc/" PID_FMT "/uid_map", child->pid);
+        xsprintf(gid_map_path, "/proc/" PID_FMT "/gid_map", child->pid);
+
+        ASSERT_OK_ERRNO(setns(parent_userns_fd, CLONE_NEWUSER));
+
+        ASSERT_OK(write_string_file(uid_map_path, "0 0 1\n", WRITE_STRING_FILE_DISABLE_BUFFER));
+        ASSERT_OK(write_string_file(gid_map_path, "0 0 1\n", WRITE_STRING_FILE_DISABLE_BUFFER));
+
+        _exit(EXIT_SUCCESS);
+}
+
+TEST(setgroups_deny) {
+ _cleanup_close_ int deny_userns_fd = -EBADF, allow_userns_fd = -EBADF,
+ afd = -EBADF, bfd = -EBADF;
+ int r;
+
+ _cleanup_free_ char *idmap = NULL; /* single-entry mapping string, used for both the UID and the GID map below */
+ ASSERT_OK(asprintf(&idmap, "0 "UID_FMT" 1", CONTAINER_UID_MIN));
+
+ /* Create a userns that will have setgroups() denied via BPF. We don't set setgroups_deny here
+ * because that uses /proc/self/setgroups which is transitive and we want to test the BPF-LSM
+ * denial specifically. */
+ deny_userns_fd = ASSERT_OK(userns_acquire(idmap, idmap, /* setgroups_deny= */ false));
+
+ ASSERT_OK(userns_restrict_put_by_fd(
+ bpf_obj,
+ deny_userns_fd,
+ /* replace= */ true,
+ /* mount_fds= */ NULL,
+ /* n_mount_fds= */ 0));
+ ASSERT_OK(userns_restrict_setgroups_deny_by_fd(bpf_obj, deny_userns_fd));
+
+ /* Create a userns that is managed (in mount ID hash) but does NOT have setgroups() denied */
+ allow_userns_fd = ASSERT_OK(userns_acquire(idmap, idmap, /* setgroups_deny= */ false));
+
+ ASSERT_OK(userns_restrict_put_by_fd(
+ bpf_obj,
+ allow_userns_fd,
+ /* replace= */ true,
+ /* mount_fds= */ NULL,
+ /* n_mount_fds= */ 0));
+
+ afd = ASSERT_OK_ERRNO(eventfd(0, EFD_CLOEXEC)); /* child -> parent: "I unshared a new userns, please write my mappings" */
+ bfd = ASSERT_OK_ERRNO(eventfd(0, EFD_CLOEXEC)); /* parent -> child: "mappings written, continue" */
+
+ /* Test 1: setgroups() should be denied in the deny userns, including after unsharing into a child
+ * user namespace (the ancestor walk should find the deny entry). */
+ {
+ _cleanup_(pidref_done_sigkill_wait) PidRef pidref = PIDREF_NULL;
+
+ r = ASSERT_OK(pidref_safe_fork("(test-deny)", FORK_LOG|FORK_DEATHSIG_SIGKILL, &pidref));
+ if (r == 0) {
+ /* Enter the userns manually without going through namespace_enter(), because
+ * that calls reset_uid_gid() which calls setgroups() internally. Since the
+ * BPF LSM denies setgroups(), reset_uid_gid() would fail before calling
+ * setresuid()/setresgid(), leaving us as the overflow UID without
+ * capabilities. */
+ ASSERT_OK_ERRNO(setns(deny_userns_fd, CLONE_NEWUSER));
+ ASSERT_OK_ERRNO(setresgid(0, 0, 0));
+ ASSERT_OK_ERRNO(setresuid(0, 0, 0));
+
+ /* setgroups() should be denied by BPF LSM */
+ ASSERT_ERROR_ERRNO(setgroups(0, NULL), EPERM);
+
+ /* Unshare into a child user namespace. The parent will write the mappings
+ * for us since writing /proc/self/uid_map from inside the userns fails
+ * because the proc mount belongs to the init user namespace. */
+ ASSERT_OK_ERRNO(unshare(CLONE_NEWUSER));
+ ASSERT_OK_ERRNO(eventfd_write(afd, 1));
+ uint64_t x;
+ ASSERT_OK_ERRNO(eventfd_read(bfd, &x));
+
+ /* setgroups() should still be denied because the ancestor walk finds the
+ * deny entry on the parent user namespace */
+ ASSERT_ERROR_ERRNO(setgroups(0, NULL), EPERM);
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ /* Parent side of the handshake: wait for the child's unshare(), write its mappings from
+ * within the deny userns (the parent of the child's new userns), then let it continue. */
+ uint64_t x;
+ ASSERT_OK_ERRNO(eventfd_read(afd, &x));
+ write_child_mappings(&pidref, deny_userns_fd);
+ ASSERT_OK_ERRNO(eventfd_write(bfd, 1));
+
+ ASSERT_OK(pidref_wait_for_terminate_and_check("(test-deny)", &pidref, WAIT_LOG));
+ }
+
+ /* Test 2: setgroups() should be allowed in the managed-only userns (mount ID hash but no setgroups
+ * deny entry), including in a child user namespace. */
+ {
+ _cleanup_(pidref_done_sigkill_wait) PidRef pidref = PIDREF_NULL;
+
+ r = ASSERT_OK(pidref_safe_fork("(test-allow)", FORK_LOG|FORK_DEATHSIG_SIGKILL, &pidref));
+ if (r == 0) {
+ ASSERT_OK_ERRNO(setns(allow_userns_fd, CLONE_NEWUSER));
+ ASSERT_OK_ERRNO(setresgid(0, 0, 0));
+ ASSERT_OK_ERRNO(setresuid(0, 0, 0));
+
+ /* setgroups() should succeed since this userns is only in the mount ID hash */
+ ASSERT_OK_ERRNO(setgroups(0, NULL));
+
+ /* Also should work in a child userns since the ancestor walk finds the
+ * mount ID hash entry (not the setgroups deny entry) */
+ ASSERT_OK_ERRNO(unshare(CLONE_NEWUSER));
+ ASSERT_OK_ERRNO(eventfd_write(afd, 1));
+ uint64_t x;
+ ASSERT_OK_ERRNO(eventfd_read(bfd, &x));
+
+ ASSERT_OK_ERRNO(setgroups(0, NULL));
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ /* Same handshake as in test 1, but the mappings are written from within the allow userns. */
+ uint64_t x;
+ ASSERT_OK_ERRNO(eventfd_read(afd, &x));
+ write_child_mappings(&pidref, allow_userns_fd);
+ ASSERT_OK_ERRNO(eventfd_write(bfd, 1));
+
+ ASSERT_OK(pidref_wait_for_terminate_and_check("(test-allow)", &pidref, WAIT_LOG));
+ }
+}
+
DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro);
#include "stat-util.h"
#include "string-util.h"
#include "strv.h"
+#include "uid-classification.h"
#include "user-util.h"
#include "userns-registry.h"
if (userns_info->userns_inode == 0)
return -EBADMSG;
- if (userns_info->start_uid == 0 || userns_info->start_gid == 0)
- return -EBADMSG;
if (userns_info->size == 0) {
if (uid_is_valid(userns_info->start_uid) || uid_is_valid(userns_info->target_uid))
goto fail;
}
- if (uid_is_valid(info->start_uid)) {
+ if (uid_is_transient(info->start_uid)) {
if (asprintf(&link2_fn, "u" UID_FMT ".userns", info->start_uid) < 0) {
r = log_oom_debug();
goto fail;
}
}
- if (gid_is_valid(info->start_gid)) {
+ if (gid_is_transient(info->start_gid)) {
if (asprintf(&link3_fn, "g" GID_FMT ".userns", info->start_gid) < 0) {
r = log_oom_debug();
goto fail;
if (r < 0)
RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", link1_fn));
- if (uid_is_valid(info->start_uid)) {
+ if (uid_is_transient(info->start_uid)) {
_cleanup_free_ char *link2_fn = NULL;
if (asprintf(&link2_fn, "u" UID_FMT ".userns", info->start_uid) < 0)
RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", link2_fn));
}
- if (uid_is_valid(info->start_gid)) {
+ if (gid_is_transient(info->start_gid)) {
_cleanup_free_ char *link3_fn = NULL;
if (asprintf(&link3_fn, "g" GID_FMT ".userns", info->start_gid) < 0)
if (r < 0)
return log_error_errno(r, "Failed to size userns ring buffer: %m");
+ r = sym_bpf_map__set_max_entries(obj->maps.userns_setgroups_deny, USERNS_MAX);
+ if (r < 0)
+ return log_error_errno(r, "Failed to size userns setgroups deny hash table: %m");
+
/* Dummy map to satisfy the verifier */
dummy_mnt_id_hash_fd = make_inner_hash_map();
if (dummy_mnt_id_hash_fd < 0)
int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode) {
#if HAVE_VMLINUX_H
- int r, outer_map_fd;
+ int r, outer_map_fd, setgroups_deny_fd;
unsigned u;
assert(obj);
if (r < 0)
return log_debug_errno(r, "Failed to remove entry for inode %" PRIu64 " from outer map: %m", userns_inode);
+ setgroups_deny_fd = sym_bpf_map__fd(obj->maps.userns_setgroups_deny);
+ if (setgroups_deny_fd < 0)
+ return log_debug_errno(setgroups_deny_fd, "Failed to get setgroups deny BPF map fd: %m");
+
+ r = sym_bpf_map_delete_elem(setgroups_deny_fd, &u);
+ if (r < 0 && r != -ENOENT)
+ return log_debug_errno(r, "Failed to remove entry for inode %" PRIu64 " from setgroups deny map: %m", userns_inode);
+
return 0;
#else
return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
#endif
}
+
+/* Registers the user namespace with inode number 'userns_inode' in the userns_setgroups_deny BPF map.
+ *
+ * Returns 0 on success and -EINVAL if the inode number does not fit into the map's 32bit key. Failures
+ * to acquire the map fd or to update the map are logged at debug level and propagated. When built
+ * without HAVE_VMLINUX_H an EOPNOTSUPP error is logged and returned instead. */
+int userns_restrict_setgroups_deny_by_inode(
+                struct userns_restrict_bpf *obj,
+                uint64_t userns_inode) {
+
+#if HAVE_VMLINUX_H
+        uint32_t value = 1; /* dummy, only the presence of the key matters */
+        unsigned key;
+        int fd, r;
+
+        assert(obj);
+        assert(userns_inode != 0);
+
+        /* The BPF map only supports 32bit keys, and user namespace inode numbers are 32bit too, even though
+         * ino_t is 64bit these days. Should we ever run into a 64bit inode let's refuse early. */
+        if (userns_inode > UINT32_MAX)
+                return -EINVAL;
+
+        fd = sym_bpf_map__fd(obj->maps.userns_setgroups_deny);
+        if (fd < 0)
+                return log_debug_errno(fd, "Failed to get setgroups deny BPF map fd: %m");
+
+        key = (unsigned) userns_inode;
+
+        r = sym_bpf_map_update_elem(fd, &key, &value, BPF_ANY);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to add userns inode to setgroups deny map: %m");
+
+        log_debug("Denying setgroups() on userns inode %" PRIu64, userns_inode);
+
+        return 0;
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+#endif
+}
+
+/* Same as userns_restrict_setgroups_deny_by_inode(), but takes a user namespace file descriptor.
+ *
+ * The fd is first verified to refer to a user namespace (EBADF is logged and returned if it does
+ * not), then its inode number is obtained via fstat() and passed on. When built without
+ * HAVE_VMLINUX_H an EOPNOTSUPP error is logged and returned instead. */
+int userns_restrict_setgroups_deny_by_fd(
+                struct userns_restrict_bpf *obj,
+                int userns_fd) {
+
+#if HAVE_VMLINUX_H
+        struct stat userns_st;
+        int is_userns;
+
+        assert(obj);
+        assert(userns_fd >= 0);
+
+        is_userns = fd_is_namespace(userns_fd, NAMESPACE_USER);
+        if (is_userns < 0)
+                return log_debug_errno(is_userns, "Failed to determine if file descriptor is user namespace: %m");
+        if (!is_userns)
+                return log_debug_errno(SYNTHETIC_ERRNO(EBADF), "User namespace fd is not actually a user namespace fd.");
+
+        if (fstat(userns_fd, &userns_st) < 0)
+                return log_debug_errno(errno, "Failed to fstat() user namespace: %m");
+
+        return userns_restrict_setgroups_deny_by_inode(obj, userns_st.st_ino);
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+#endif
+}
int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode);
+int userns_restrict_setgroups_deny_by_fd(struct userns_restrict_bpf *obj, int userns_fd);
+int userns_restrict_setgroups_deny_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode);
+
DEFINE_TRIVIAL_CLEANUP_FUNC(struct userns_restrict_bpf*, userns_restrict_bpf_free);
#include "varlink-io.systemd.NamespaceResource.h"
+/* Varlink IDL counterpart of the "type" parameter of AllocateUserRange. The field comments below are
+ * part of the interface description. */
+static SD_VARLINK_DEFINE_ENUM_TYPE(
+                AllocateUserRangeType,
+                SD_VARLINK_FIELD_COMMENT("Allocate a transient UID/GID range from the dynamic range pool. This is the default."),
+                SD_VARLINK_DEFINE_ENUM_VALUE(managed),
+                SD_VARLINK_FIELD_COMMENT("Create a user namespace that maps the peer UID/GID to root (if target is 0) or to itself (if target is unset) instead of allocating a transient UID range."),
+                SD_VARLINK_DEFINE_ENUM_VALUE(self));
+
static SD_VARLINK_DEFINE_METHOD(
AllocateUserRange,
SD_VARLINK_FIELD_COMMENT("The name for the user namespace, a short string that must be fit to be included in a file name and in a user name. This name is included in the user records announced via NSS and is otherwise useful for debugging."),
SD_VARLINK_DEFINE_INPUT(name, SD_VARLINK_STRING, 0),
SD_VARLINK_FIELD_COMMENT("Controls whether to mangle the provided name if needed so that it is suitable for naming a user namespace. If true this will shorten the name as necessary or randomize it if that's not sufficient. If null defaults to false."),
SD_VARLINK_DEFINE_INPUT(mangleName, SD_VARLINK_BOOL, SD_VARLINK_NULLABLE),
- SD_VARLINK_FIELD_COMMENT("The number of UIDs to assign. Must be 1 or 65536."),
+ SD_VARLINK_FIELD_COMMENT("The number of UIDs to assign. Must be 1 or 65536. If type is 'self', must be 1."),
SD_VARLINK_DEFINE_INPUT(size, SD_VARLINK_INT, 0),
- SD_VARLINK_FIELD_COMMENT("The target UID inside the user namespace. If not specified defaults to 0."),
+ SD_VARLINK_FIELD_COMMENT("The target UID inside the user namespace. If not specified defaults to 0. If type is 'self', must be 0 or unset in which case the peer UID is mapped to itself."),
SD_VARLINK_DEFINE_INPUT(target, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("A file descriptor to an allocated userns with no current UID range assignments"),
SD_VARLINK_DEFINE_INPUT(userNamespaceFileDescriptor, SD_VARLINK_INT, 0),
+ SD_VARLINK_FIELD_COMMENT("The type of allocation to perform. If 'managed' (the default), a transient UID/GID range is allocated from the dynamic range pool. If 'self', the peer UID/GID is mapped to itself. Defaults to 'managed'."),
+ SD_VARLINK_DEFINE_INPUT_BY_TYPE(type, AllocateUserRangeType, SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("Number of transient 64K container UID/GID ranges to delegate. These are mapped 1:1 into the user namespace and can be used by nested user namespaces for container workloads. Must be between 0 and 16. Defaults to 0."),
SD_VARLINK_DEFINE_INPUT(delegateContainerRanges, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("The name assigned to the user namespace. (This is particularly interesting in case mangleName was enabled)."),
io_systemd_NamespaceResource,
"io.systemd.NamespaceResource",
SD_VARLINK_INTERFACE_COMMENT("Allocate transient UID ranges for user namespace, and assign mounts, cgroups and networking devices to them"),
+ SD_VARLINK_SYMBOL_COMMENT("The type of user range allocation to perform."),
+ &vl_type_AllocateUserRangeType,
SD_VARLINK_SYMBOL_COMMENT("Assigns a UID range to a client-allocated user namespace that has no UID range assigned so far, and registers it for assignment of other resources."),
&vl_method_AllocateUserRange,
SD_VARLINK_SYMBOL_COMMENT("Registers an already initialized user namespace for assignment of resources."),
'{"name":"test-fail","size":65536,"userNamespaceFileDescriptor":0,"delegateContainerRanges":17}') |&
grep "io.systemd.NamespaceResource.TooManyDelegations" >/dev/null
+# Test self mapping
+# Verify that self mapping maps the peer UID to root (uid_map should show "0 <peer_uid> 1")
+test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \
+ --push-fd=/proc/self/ns/user \
+ /run/systemd/userdb/io.systemd.NamespaceResource \
+ io.systemd.NamespaceResource.AllocateUserRange \
+ '{"name":"test-id","target":0,"size":1,"userNamespaceFileDescriptor":0,"type":"self"}' \
+ -- cat /proc/self/uid_map | awk '{print $1, $3}')" = "0 1"
+
+# Test nested delegation with self mapping
+test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \
+ --push-fd=/proc/self/ns/user \
+ /run/systemd/userdb/io.systemd.NamespaceResource \
+ io.systemd.NamespaceResource.AllocateUserRange \
+ '{"name":"test-delegate2","type":"self","size":1,"userNamespaceFileDescriptor":0,"delegateContainerRanges":3}' \
+ -- unshare --user varlinkctl --exec call \
+ --push-fd=/proc/self/ns/user \
+ /run/systemd/userdb/io.systemd.NamespaceResource \
+ io.systemd.NamespaceResource.AllocateUserRange \
+ '{"name":"test-delegate3","size":65536,"userNamespaceFileDescriptor":0,"delegateContainerRanges":2}' \
+ -- cat /proc/self/uid_map | wc -l)" -eq 3
+
# This should work without the key
systemd-dissect --image-policy='root=verity:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null
systemd-dissect --image-policy='root=verity+signed:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null
DefaultDependencies=no
[Service]
-CapabilityBoundingSet=CAP_DAC_READ_SEARCH CAP_SYS_RESOURCE CAP_BPF CAP_PERFMON CAP_SETGID CAP_SETUID CAP_SYS_ADMIN CAP_CHOWN CAP_FOWNER CAP_NET_ADMIN
+CapabilityBoundingSet=CAP_DAC_READ_SEARCH CAP_SYS_RESOURCE CAP_BPF CAP_PERFMON CAP_SETGID CAP_SETUID CAP_SYS_ADMIN CAP_CHOWN CAP_FOWNER CAP_NET_ADMIN CAP_SETFCAP
ExecStart={{LIBEXECDIR}}/systemd-nsresourced
IPAddressDeny=any
LimitNOFILE={{HIGH_RLIMIT_NOFILE}}