registered with this service. Moreover, UIDs and GIDs are always allocated together, and
symmetrically.</para>
+ <para>The allocation API supports <emphasis>delegated ranges</emphasis>: additional UID/GID ranges that
+ are mapped 1:1 into the user namespace rather than being translated to a target UID/GID. These delegated
+ ranges enable nested user namespace scenarios where a container needs to create child user namespaces
+ with their own transient UID ranges. Normally, the kernel restricts which UIDs can be mapped into a user
+ namespace to those that are also mapped in the parent. Delegated ranges solve this by pre-allocating
+ additional ranges that are visible inside the user namespace and can be used by nested
+ <function>AllocateUserRange()</function> calls. Up to 16 delegated ranges can be requested per user
+ namespace, each of size 65536. The ranges are allocated from the container UID ranges as per
+ <ulink url="https://systemd.io/UIDS-GIDS">Users, Groups, UIDs and GIDs on systemd Systems</ulink>.</para>
+
<para>The service provides API calls to allowlist mounts (referenced via their mount file descriptors as
per Linux <function>fsmount()</function> API), to pass ownership of a cgroup subtree to the user
namespace and to delegate a virtual Ethernet device pair to the user namespace. When used in combination
/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#include <sched.h>
#include <string.h>
#include "alloc-util.h"
return 0;
}
-int uid_range_add_str(UIDRange **range, const char *s) {
+int uid_range_add_str_full(UIDRange **range, const char *s, bool coalesce) {
uid_t start, end;
int r;
if (r < 0)
return r;
- return uid_range_add_internal(range, start, end - start + 1, /* coalesce= */ true);
+ return uid_range_add_internal(range, start, end - start + 1, coalesce);
}
int uid_range_next_lower(const UIDRange *range, uid_t *uid) {
return true;
}
-int uid_range_load_userns(const char *path, UIDRangeUsernsMode mode, UIDRange **ret) {
+int uid_range_load_userns_full(const char *path, UIDRangeUsernsMode mode, bool coalesce, UIDRange **ret) {
_cleanup_(uid_range_freep) UIDRange *range = NULL;
_cleanup_fclose_ FILE *f = NULL;
int r;
return r;
}
- uid_range_coalesce(range);
+ if (coalesce)
+ uid_range_coalesce(range);
*ret = TAKE_PTR(range);
return 0;
}
-int uid_range_load_userns_by_fd(int userns_fd, UIDRangeUsernsMode mode, UIDRange **ret) {
+int uid_range_load_userns_by_fd_full(int userns_fd, UIDRangeUsernsMode mode, bool coalesce, UIDRange **ret) {
_cleanup_(pidref_done_sigkill_wait) PidRef pidref = PIDREF_NULL;
int r;
if (r < 0)
return r;
if (r > 0)
- return uid_range_load_userns(/* path= */ NULL, mode, ret);
+ return uid_range_load_userns_full(/* path= */ NULL, mode, coalesce, ret);
r = userns_enter_and_pin(userns_fd, &pidref);
if (r < 0)
pidref.pid,
IN_SET(mode, UID_RANGE_USERNS_INSIDE, UID_RANGE_USERNS_OUTSIDE) ? "uid_map" : "gid_map");
- return uid_range_load_userns(p, mode, ret);
+ return uid_range_load_userns_full(p, mode, coalesce, ret);
}
bool uid_range_overlaps(const UIDRange *range, uid_t start, uid_t nr) {
return false;
}
+int uid_range_clip(UIDRange *range, uid_t min, uid_t max) {
+ assert(range);
+
+ if (min > max)
+ return -EINVAL;
+
+ size_t t = 0;
+ FOREACH_ARRAY(e, range->entries, range->n_entries) {
+ uid_t entry_end = e->start + e->nr; /* one past the last UID in entry */
+
+ /* Skip entries completely outside [min, max] */
+ if (entry_end <= min || e->start > max)
+ continue;
+
+ /* Trim the entry to fit within [min, max] */
+ uid_t new_start = MAX(e->start, min);
+ /* entry_end is exclusive, avoid overflow when max == UINT32_MAX */
+ uid_t new_end = entry_end <= max ? entry_end : max + 1;
+ assert(new_end > new_start);
+
+ range->entries[t++] = (UIDRangeEntry) {
+ .start = new_start,
+ .nr = new_end - new_start,
+ };
+ }
+
+ range->n_entries = t;
+
+ return 0;
+}
+
+int uid_range_partition(UIDRange *range, uid_t size) {
+ assert(range);
+ assert(size > 0);
+
+ /* Partitions the UID range entries into buckets of the given size. Any entry larger than the given
+ * size will be partitioned into multiple entries, each of the given size. Any leftover UIDs in the
+ * entry are dropped. Any entries smaller than the given size are also dropped. */
+
+ /* Count how many entries we'll need after partitioning */
+ size_t n_new_entries = 0;
+ FOREACH_ARRAY(e, range->entries, range->n_entries)
+ n_new_entries += e->nr / size;
+
+ if (n_new_entries == 0) {
+ range->n_entries = 0;
+ return 0;
+ }
+
+ if (n_new_entries > range->n_entries && !GREEDY_REALLOC(range->entries, n_new_entries))
+ return -ENOMEM;
+
+ /* Work backwards to avoid overwriting entries we still need to read */
+ size_t t = n_new_entries;
+ for (size_t i = range->n_entries; i > 0; i--) {
+ UIDRangeEntry *e = range->entries + i - 1;
+ unsigned n_parts = e->nr / size;
+
+ for (unsigned j = n_parts; j > 0; j--)
+ range->entries[--t] = (UIDRangeEntry) {
+ .start = e->start + (j - 1) * size,
+ .nr = size,
+ };
+ }
+
+ range->n_entries = n_new_entries;
+
+ return 0;
+}
+
+int uid_range_copy(const UIDRange *range, UIDRange **ret) {
+ assert(ret);
+
+ if (!range) {
+ *ret = NULL;
+ return 0;
+ }
+
+ _cleanup_(uid_range_freep) UIDRange *copy = new0(UIDRange, 1);
+ if (!copy)
+ return -ENOMEM;
+
+ if (range->n_entries > 0) {
+ copy->entries = newdup(UIDRangeEntry, range->entries, range->n_entries);
+ if (!copy->entries)
+ return -ENOMEM;
+
+ copy->n_entries = range->n_entries;
+ }
+
+ *ret = TAKE_PTR(copy);
+ return 0;
+}
+
+int uid_range_remove(UIDRange *range, uid_t start, uid_t size) {
+ assert(range);
+
+ if (size == 0)
+ return 0;
+
+ uid_t end = start + size; /* one past the last UID to remove */
+
+ for (size_t i = 0; i < range->n_entries; i++) {
+ UIDRangeEntry *e = range->entries + i;
+ uid_t entry_end = e->start + e->nr;
+
+ /* No overlap */
+ if (entry_end <= start || e->start >= end)
+ continue;
+
+ /* Check if this removal splits the entry into two parts */
+ if (e->start < start && entry_end > end) {
+ /* Need to split: grow the array first */
+ if (!GREEDY_REALLOC(range->entries, range->n_entries + 1))
+ return -ENOMEM;
+
+ /* Re-fetch pointer after potential realloc */
+ e = range->entries + i;
+ entry_end = e->start + e->nr;
+
+ /* Shift everything after this entry to make room */
+ memmove(range->entries + i + 2, range->entries + i + 1,
+ (range->n_entries - i - 1) * sizeof(UIDRangeEntry));
+ range->n_entries++;
+
+ /* First part: before the removed range */
+ range->entries[i] = (UIDRangeEntry) {
+ .start = e->start,
+ .nr = start - e->start,
+ };
+
+ /* Second part: after the removed range */
+ range->entries[i + 1] = (UIDRangeEntry) {
+ .start = end,
+ .nr = entry_end - end,
+ };
+
+ /* Skip the newly inserted entry */
+ i++;
+ continue;
+ }
+
+ /* Removal covers the entire entry */
+ if (start <= e->start && end >= entry_end) {
+ memmove(e, e + 1, (range->n_entries - i - 1) * sizeof(UIDRangeEntry));
+ range->n_entries--;
+ i--;
+ continue;
+ }
+
+ /* Removal trims the start of the entry */
+ if (start <= e->start && end > e->start) {
+ e->nr = entry_end - end;
+ e->start = end;
+ continue;
+ }
+
+ /* Removal trims the end of the entry */
+ if (start < entry_end && end >= entry_end) {
+ e->nr = start - e->start;
+ continue;
+ }
+ }
+
+ return 0;
+}
+
+int uid_range_translate(const UIDRange *outside, const UIDRange *inside, uid_t uid, uid_t *ret) {
+ assert(uid_range_entries(outside) == uid_range_entries(inside));
+ assert(ret);
+
+ /* Given two UID ranges that represent the outside UID range of a user namespace (the 2nd and 3rd
+ * columns in /proc/xxx/uid_map) and the inside UID range of a user namespace (the 1st and 3rd
+ * columns in /proc/xxx/uid_map), translates the given UID from the outside range to the inside
+ * range. For example, given the following UID range:
+ *
+ * 0 1000 1
+ *
+ * calling uid_range_translate(outside, inside, 1000) will return 0 as the output UID. Alternatively,
+ * calling uid_range_translate(inside, outside, 0) will return 1000 as the output UID.
+ */
+
+ for (size_t i = 0; i < uid_range_entries(outside); i++)
+ assert(outside->entries[i].nr == inside->entries[i].nr);
+
+ for (size_t i = 0; i < uid_range_entries(outside); i++) {
+ const UIDRangeEntry *e = outside->entries + i;
+
+ if (uid < e->start || uid >= e->start + e->nr)
+ continue;
+
+ *ret = inside->entries[i].start + uid - e->start;
+ return 0;
+ }
+
+ return -ESRCH;
+}
+
bool uid_range_equal(const UIDRange *a, const UIDRange *b) {
if (a == b)
return true;
static inline int uid_range_add(UIDRange **range, uid_t start, uid_t nr) {
return uid_range_add_internal(range, start, nr, true);
}
-int uid_range_add_str(UIDRange **range, const char *s);
+int uid_range_add_str_full(UIDRange **range, const char *s, bool coalesce);
+static inline int uid_range_add_str(UIDRange **range, const char *s) {
+ return uid_range_add_str_full(range, s, true);
+}
int uid_range_next_lower(const UIDRange *range, uid_t *uid);
_UID_RANGE_USERNS_MODE_INVALID = -EINVAL,
} UIDRangeUsernsMode;
-int uid_range_load_userns(const char *path, UIDRangeUsernsMode mode, UIDRange **ret);
-int uid_range_load_userns_by_fd(int userns_fd, UIDRangeUsernsMode mode, UIDRange **ret);
+int uid_range_load_userns_full(const char *path, UIDRangeUsernsMode mode, bool coalesce, UIDRange **ret);
+static inline int uid_range_load_userns(const char *path, UIDRangeUsernsMode mode, UIDRange **ret) {
+ return uid_range_load_userns_full(path, mode, true, ret);
+}
+int uid_range_load_userns_by_fd_full(int userns_fd, UIDRangeUsernsMode mode, bool coalesce, UIDRange **ret);
+static inline int uid_range_load_userns_by_fd(int userns_fd, UIDRangeUsernsMode mode, UIDRange **ret) {
+ return uid_range_load_userns_by_fd_full(userns_fd, mode, true, ret);
+}
bool uid_range_overlaps(const UIDRange *range, uid_t start, uid_t nr);
+int uid_range_clip(UIDRange *range, uid_t min, uid_t max);
+int uid_range_partition(UIDRange *range, uid_t size);
+int uid_range_copy(const UIDRange *range, UIDRange **ret);
+int uid_range_remove(UIDRange *range, uid_t start, uid_t size);
+int uid_range_translate(const UIDRange *outside, const UIDRange *inside, uid_t uid, uid_t *ret);
+
int uid_map_search_root(pid_t pid, UIDRangeUsernsMode mode, uid_t *ret);
uid_t uid_range_base(const UIDRange *range);
#include "mountpoint-util.h"
#include "namespace-util.h"
#include "netlink-util.h"
+#include "nsresource.h"
#include "pidref.h"
#include "process-util.h"
#include "random-util.h"
return sd_varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
}
-static int uid_is_available(
- int registry_dir_fd,
- uid_t candidate) {
-
+static int uid_is_available(int registry_dir_fd, uid_t candidate, int parent_userns_fd) {
int r;
assert(registry_dir_fd >= 0);
log_debug("Checking if UID " UID_FMT " is available.", candidate);
+ uint64_t parent_userns_inode = 0;
+ struct stat parent_st;
+ if (fstat(parent_userns_fd, &parent_st) < 0)
+ return log_debug_errno(errno, "Failed to fstat parent user namespace: %m");
+ parent_userns_inode = parent_st.st_ino;
+
r = userns_registry_uid_exists(registry_dir_fd, candidate);
if (r < 0)
return r;
if (r > 0)
return false;
- r = userdb_by_uid(candidate, /* match= */ NULL, USERDB_AVOID_MULTIPLEXER, /* ret= */ NULL);
- if (r >= 0)
- return false;
- if (r != -ESRCH)
+ /* Also check delegation files. If parent_userns_inode is set and matches the delegation's userns
+ * inode, the UID is available because the parent owns that delegation. */
+ r = userns_registry_delegation_uid_exists(registry_dir_fd, candidate);
+ if (r < 0)
return r;
+ if (r > 0) {
+ _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo delegation = DELEGATED_USER_NAMESPACE_INFO_NULL;
+ r = userns_registry_load_delegation_by_uid(registry_dir_fd, candidate, &delegation);
+ if (r < 0)
+ return r;
- r = groupdb_by_gid(candidate, /* match= */ NULL, USERDB_AVOID_MULTIPLEXER, /* ret= */ NULL);
- if (r >= 0)
- return false;
- if (r != -ESRCH)
+ if (delegation.userns_inode != parent_userns_inode)
+ return false;
+
+ /* The parent userns owns this delegation, so the UID is available for nested allocation */
+ log_debug("UID " UID_FMT " is delegated by parent userns inode %" PRIu64 ", available for nested allocation.",
+ candidate, parent_userns_inode);
+ }
+
+ r = userns_registry_delegation_gid_exists(registry_dir_fd, (gid_t) candidate);
+ if (r < 0)
return r;
+ if (r > 0) {
+ _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo delegation = DELEGATED_USER_NAMESPACE_INFO_NULL;
+ r = userns_registry_load_delegation_by_gid(registry_dir_fd, candidate, &delegation);
+ if (r < 0)
+ return r;
+
+ if (delegation.userns_inode != parent_userns_inode)
+ return false;
+
+ /* The parent userns owns this delegation, so the UID is available for nested allocation */
+ log_debug("UID " UID_FMT " is delegated by parent userns inode %" PRIu64 ", available for nested allocation.",
+ candidate, parent_userns_inode);
+ }
+
+ r = is_our_namespace(parent_userns_fd, NAMESPACE_USER);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to check if parent user namespace is our user namespace: %m");
+
+ if (r > 0) {
+ /* Only check userdb if we're allocating from our current user namespace. userdb won't be
+ * able to tell us anything on whether UIDs/GIDs in another user namespace are in use or not. On
+ * top of that, for nspawn containers registered with machined's userdb implementation, it
+ * would tell us that any ranges delegated to the container are in use (which is true in the
+ * nsresourced user namespace, but not in the nspawn user namespace). */
+
+ r = userdb_by_uid(candidate, /* match= */ NULL, USERDB_AVOID_MULTIPLEXER, /* ret= */ NULL);
+ if (r >= 0)
+ return false;
+ if (r != -ESRCH)
+ return r;
+
+ r = groupdb_by_gid(candidate, /* match= */ NULL, USERDB_AVOID_MULTIPLEXER, /* ret= */ NULL);
+ if (r >= 0)
+ return false;
+ if (r != -ESRCH)
+ return r;
+ }
log_debug("UID " UID_FMT " is available.", candidate);
return true;
}
-static int allocate_now(
+static int allocate_one(
int registry_dir_fd,
- UserNamespaceInfo *info,
- int *ret_lock_fd) {
+ const char *name,
+ uint32_t size,
+ int parent_userns_fd,
+ UIDRange *candidates,
+ uid_t *ret_candidate) {
static const uint8_t hash_key[16] = {
0xd4, 0xd7, 0x33, 0xa7, 0x4d, 0xd3, 0x42, 0xcd,
0xaa, 0xe9, 0x45, 0xd0, 0xfb, 0xec, 0x79, 0xee,
};
-
- _cleanup_(uid_range_freep) UIDRange *valid_range = NULL;
- uid_t candidate, uidmin, uidmax, uidmask;
+ _cleanup_(uid_range_freep) UIDRange *copy = NULL;
+ uid_t candidate, uidmin, uidmax;
unsigned n_tries = 100;
+ size_t idx;
int r;
- /* Returns the following error codes:
- *
- * EBUSY → all UID candidates we checked are already taken
- * EEXIST → the name for the userns already exists
- * EDEADLK → the userns is already registered in the registry
- */
-
assert(registry_dir_fd >= 0);
- assert(info);
+ assert(candidates);
+ assert(ret_candidate);
- switch (info->size) {
+ switch (size) {
- case 0x10000U:
+ case NSRESOURCE_UIDS_64K:
uidmin = CONTAINER_UID_BASE_MIN;
uidmax = CONTAINER_UID_BASE_MAX;
- uidmask = (uid_t) UINT32_C(0xFFFF0000);
break;
- case 1U:
+ case NSRESOURCE_UIDS_1:
uidmin = DYNAMIC_UID_MIN;
uidmax = DYNAMIC_UID_MAX;
- uidmask = (uid_t) UINT32_C(0xFFFFFFFF);
break;
default:
assert_not_reached();
}
- r = uid_range_load_userns(/* path= */ NULL, UID_RANGE_USERNS_INSIDE, &valid_range);
+ /* Make a copy of candidates that we can modify for the selection algorithm */
+ r = uid_range_copy(candidates, ©);
if (r < 0)
- return r;
+ return log_debug_errno(r, "Failed to copy UID range: %m");
- /* Check early whether we have any chance at all given our own uid range */
- if (!uid_range_overlaps(valid_range, uidmin, uidmax))
+ /* Clip the copy with the valid UID range for this allocation size */
+ r = uid_range_clip(copy, uidmin, uidmax);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to intersect UID range: %m");
+
+ /* Partition entries into entries of exactly the right size */
+ r = uid_range_partition(copy, size);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to partition UID ranges: %m");
+
+ if (uid_range_is_empty(copy))
return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN), "Relevant UID range not delegated, can't allocate.");
+ log_debug("Partitioned UID range into %zu entries of size %" PRIu32, copy->n_entries, size);
+
+ /* Start from a hash of the input name if we have one, use random values afterwards. */
+ idx = name ? siphash24_string(name, hash_key) : random_u32();
+ for (;; idx = random_u32()) {
+ if (uid_range_is_empty(copy))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "All candidate UIDs already taken.");
+
+ if (--n_tries <= 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "Try limit hit, no UIDs available.");
+
+ idx %= copy->n_entries;
+
+ candidate = copy->entries[idx].start;
+
+ /* We only check the base UID for each range. Pass the parent userns inode so that
+ * allocating from a delegated range owned by the parent is allowed. */
+ r = uid_is_available(registry_dir_fd, candidate, parent_userns_fd);
+ if (r < 0)
+ return log_debug_errno(r, "Can't determine if UID range " UID_FMT " is available: %m", candidate);
+ if (r > 0)
+ break;
+
+ log_debug("UID range " UID_FMT " already taken.", candidate);
+
+ /* Remove this unavailable range from candidates so we don't try it again */
+ r = uid_range_remove(copy, candidate, size);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to remove unavailable range from candidates: %m");
+ }
+
+ /* Remove the allocated range from the original candidates */
+ r = uid_range_remove(candidates, candidate, size);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to remove allocated range from candidates: %m");
+
+ *ret_candidate = candidate;
+
+ log_debug("Allocating UID range " UID_FMT "…" UID_FMT, candidate, candidate + size - 1);
+
+ return 0;
+}
+
+static int allocate_now(
+ int registry_dir_fd,
+ int userns_fd,
+ int parent_userns_fd,
+ UserNamespaceInfo *info,
+ int *ret_lock_fd) {
+
+ _cleanup_(uid_range_freep) UIDRange *candidates = NULL;
+ uid_t candidate;
+ int r;
+
+ /* Returns the following error codes:
+ *
+ * EBUSY → all UID candidates we checked are already taken
+ * EEXIST → the name for the userns already exists
+ * EDEADLK → the userns is already registered in the registry
+ */
+
+ assert(registry_dir_fd >= 0);
+ assert(userns_fd >= 0);
+ assert(info);
+
+ r = uid_range_load_userns_by_fd(parent_userns_fd, UID_RANGE_USERNS_INSIDE, &candidates);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to read userns UID range: %m");
+
_cleanup_close_ int lock_fd = -EBADF;
lock_fd = userns_registry_lock(registry_dir_fd);
if (lock_fd < 0)
if (r == 0)
return -EEXIST;
- for (candidate = siphash24_string(info->name, hash_key) & UINT32_MAX;; /* Start from a hash of the input name */
- candidate = random_u32()) { /* Use random values afterwards */
+ r = allocate_one(
+ registry_dir_fd,
+ info->name, info->size,
+ parent_userns_fd,
+ candidates,
+ &candidate);
+ if (r < 0)
+ return r;
- if (--n_tries <= 0)
- return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "Try limit hit, no UIDs available.");
+ info->start_uid = candidate;
+ info->start_gid = (gid_t) candidate;
- candidate = (candidate % (uidmax - uidmin)) + uidmin;
- candidate &= uidmask;
+ /* Now allocate delegated ranges if requested */
+ if (info->n_delegates > 0) {
+ assert(info->delegates);
- if (!uid_range_covers(valid_range, candidate, info->size))
- continue;
+ FOREACH_ARRAY(delegate, info->delegates, info->n_delegates) {
+ r = allocate_one(
+ registry_dir_fd,
+ /* name= */ NULL,
+ delegate->size,
+ parent_userns_fd,
+ candidates,
+ &candidate);
+ if (r < 0)
+ return r;
- /* We only check the base UID for each range (!) */
- r = uid_is_available(registry_dir_fd, candidate);
- if (r < 0)
- return log_debug_errno(r, "Can't determine if UID range " UID_FMT " is available: %m", candidate);
- if (r > 0) {
- info->start_uid = candidate;
- info->start_gid = (gid_t) candidate;
+ delegate->userns_inode = info->userns_inode;
+ delegate->start_uid = candidate;
+ delegate->start_gid = (gid_t) candidate;
+ }
+ }
- log_debug("Allocating UID range " UID_FMT "…" UID_FMT, candidate, candidate + info->size - 1);
+ if (ret_lock_fd)
+ *ret_lock_fd = TAKE_FD(lock_fd);
- if (ret_lock_fd)
- *ret_lock_fd = TAKE_FD(lock_fd);
+ return 0;
+}
- return 0;
- }
+static int write_userns_mappings(PidRef *pidref, const char *uidmap, const char *gidmap) {
+ const char *pmap;
+ int r;
- log_debug("UID range " UID_FMT " already taken.", candidate);
- }
+ assert(pidref);
+ assert(uidmap);
+ assert(gidmap);
+
+ pmap = procfs_file_alloca(pidref->pid, "uid_map");
+ r = write_string_file(pmap, uidmap, /* flags= */ 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to write 'uid_map' file of user namespace: %m");
+
+ pmap = procfs_file_alloca(pidref->pid, "gid_map");
+ r = write_string_file(pmap, gidmap, /* flags= */ 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to write 'gid_map' file of user namespace: %m");
+
+ return 0;
}
-static int write_userns(int usernsfd, const UserNamespaceInfo *userns_info) {
+static int write_userns(int userns_fd, int parent_userns_fd, const UserNamespaceInfo *userns_info) {
_cleanup_(pidref_done_sigkill_wait) PidRef pidref = PIDREF_NULL;
_cleanup_close_ int efd = -EBADF;
uint64_t u;
int r;
- assert(usernsfd >= 0);
+ assert(userns_fd >= 0);
+ assert(parent_userns_fd >= 0);
assert(userns_info);
assert(uid_is_valid(userns_info->target_uid));
assert(uid_is_valid(userns_info->start_uid));
if (r == 0) {
/* child */
- if (setns(usernsfd, CLONE_NEWUSER) < 0) {
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
log_error_errno(errno, "Failed to join user namespace: %m");
goto child_fail;
}
/* Now write mapping */
- _cleanup_free_ char *pmap = NULL;
+ _cleanup_(uid_range_freep) UIDRange *outside_range = NULL;
+ r = uid_range_load_userns_by_fd_full(parent_userns_fd, UID_RANGE_USERNS_OUTSIDE, /* coalesce= */ false, &outside_range);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to read userns UID range: %m");
+
+ _cleanup_(uid_range_freep) UIDRange *inside_range = NULL;
+ r = uid_range_load_userns_by_fd_full(parent_userns_fd, UID_RANGE_USERNS_INSIDE, /* coalesce= */ false, &inside_range);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to read userns UID range: %m");
+
+ uid_t start_uid;
+ r = uid_range_translate(outside_range, inside_range, userns_info->start_uid, &start_uid);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to translate UID "UID_FMT" to parent userns: %m", userns_info->start_uid);
- if (asprintf(&pmap, "/proc/" PID_FMT "/uid_map", pidref.pid) < 0)
+ /* Let's enforce that the transient UID/GID ranges are mapped 1:1 in the parent user namespace, to
+ * avoid any weird mapping shenanigans that might happen otherwise. */
+
+ if (start_uid != userns_info->start_uid)
+ return log_debug_errno(
+ SYNTHETIC_ERRNO(ERANGE),
+ "Transient UID range not mapped 1:1 in parent userns ("UID_FMT" -> "UID_FMT")",
+ userns_info->start_uid, start_uid);
+
+ /* Build uid_map content: primary mapping + delegated mappings (1:1) */
+ _cleanup_free_ char *uidmap = NULL;
+ if (asprintf(&uidmap, UID_FMT " " UID_FMT " %" PRIu32 "\n",
+ userns_info->target_uid, start_uid, userns_info->size) < 0)
return log_oom();
- r = write_string_filef(pmap, 0, UID_FMT " " UID_FMT " %" PRIu32 "\n", userns_info->target_uid, userns_info->start_uid, userns_info->size);
+ log_debug("UID mapping: " UID_FMT " " UID_FMT " %" PRIu32,
+ userns_info->target_uid, userns_info->start_uid, userns_info->size);
+
+ FOREACH_ARRAY(delegate, userns_info->delegates, userns_info->n_delegates) {
+ r = uid_range_translate(outside_range, inside_range, delegate->start_uid, &start_uid);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to translate UID "UID_FMT" to parent userns: %m", userns_info->start_uid);
+
+ if (start_uid != delegate->start_uid)
+ return log_debug_errno(
+ SYNTHETIC_ERRNO(ERANGE),
+ "Delegated transient UID range not mapped 1:1 in parent userns ("UID_FMT" -> "UID_FMT")",
+ delegate->start_uid, start_uid);
+
+ if (strextendf(&uidmap,
+ UID_FMT " " UID_FMT " %" PRIu32 "\n",
+ delegate->start_uid,
+ start_uid,
+ delegate->size) < 0)
+ return log_oom();
+
+ log_debug("UID mapping: " UID_FMT " " UID_FMT " %" PRIu32,
+ delegate->start_uid, start_uid, delegate->size);
+ }
+
+ outside_range = uid_range_free(outside_range);
+ inside_range = uid_range_free(inside_range);
+
+ r = uid_range_load_userns_by_fd_full(parent_userns_fd, GID_RANGE_USERNS_OUTSIDE, /* coalesce= */ false, &outside_range);
if (r < 0)
- return log_error_errno(r, "Failed to write 'uid_map' file of user namespace: %m");
+ return log_debug_errno(r, "Failed to read userns GID range: %m");
- pmap = mfree(pmap);
- if (asprintf(&pmap, "/proc/" PID_FMT "/gid_map", pidref.pid) < 0)
+ r = uid_range_load_userns_by_fd_full(parent_userns_fd, GID_RANGE_USERNS_INSIDE, /* coalesce= */ false, &inside_range);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to read userns GID range: %m");
+
+ gid_t start_gid;
+ r = uid_range_translate(outside_range, inside_range, userns_info->start_gid, &start_gid);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to translate GID "GID_FMT" to parent userns: %m", userns_info->start_gid);
+
+ if (start_gid != userns_info->start_gid)
+ return log_debug_errno(
+ SYNTHETIC_ERRNO(ERANGE),
+ "Transient GID range not mapped 1:1 in parent userns ("GID_FMT" -> "GID_FMT")",
+ userns_info->start_gid, start_gid);
+
+ _cleanup_free_ char *gidmap = NULL;
+ if (asprintf(&gidmap, GID_FMT " " GID_FMT " %" PRIu32 "\n",
+ userns_info->target_gid, start_gid, userns_info->size) < 0)
return log_oom();
- r = write_string_filef(pmap, 0, GID_FMT " " GID_FMT " %" PRIu32 "\n", userns_info->target_gid, userns_info->start_gid, userns_info->size);
+ log_debug("GID mapping: " GID_FMT " " GID_FMT " %" PRIu32,
+ userns_info->target_gid, userns_info->start_gid, userns_info->size);
+
+ FOREACH_ARRAY(delegate, userns_info->delegates, userns_info->n_delegates) {
+ r = uid_range_translate(outside_range, inside_range, delegate->start_gid, &start_gid);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to translate GID "GID_FMT" to parent userns: %m", userns_info->start_gid);
+
+ if (start_gid != delegate->start_gid)
+ return log_debug_errno(
+ SYNTHETIC_ERRNO(ERANGE),
+ "Delegated transient GID range not mapped 1:1 in parent userns ("GID_FMT" -> "GID_FMT")",
+ delegate->start_gid, start_gid);
+
+ /* Delegated ranges are mapped 1:1 (inside GID == outside GID) */
+ if (strextendf(&gidmap, GID_FMT " " GID_FMT " %" PRIu32 "\n",
+ delegate->start_gid,
+ start_gid,
+ delegate->size) < 0)
+ return log_oom();
+
+ log_debug("GID mapping: " GID_FMT " " GID_FMT " %" PRIu32,
+ delegate->start_gid, start_gid, delegate->size);
+ }
+
+ r = is_our_namespace(parent_userns_fd, NAMESPACE_USER);
if (r < 0)
- return log_error_errno(r, "Failed to write 'gid_map' file of user namespace: %m");
+ return log_debug_errno(r, "Failed to check if parent user namespace refers to our own user namespace: %m");
+ if (r > 0)
+ return write_userns_mappings(&pidref, uidmap, gidmap);
+
+ /* The kernel is paranoid that the uid_map and gid_map files are written either from the user
+ * namespace itself or its parent user namespace, so we have to join the parent user namespace to
+ * write the files. */
+
+ r = pidref_safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_LOG|FORK_WAIT, /* ret= */ NULL);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ if (setns(parent_userns_fd, CLONE_NEWUSER) < 0) {
+ log_error_errno(errno, "Failed to join parent user namespace: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ r = write_userns_mappings(&pidref, uidmap, gidmap);
+ _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
+ }
/* We are done! */
uid_t target;
unsigned userns_fd_idx;
bool mangle_name;
+ uint32_t delegate_container_ranges;
} AllocateParameters;
static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) {
static const sd_json_dispatch_field dispatch_table[] = {
- { "name", SD_JSON_VARIANT_STRING, sd_json_dispatch_const_string, offsetof(AllocateParameters, name), SD_JSON_MANDATORY },
- { "size", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32, offsetof(AllocateParameters, size), SD_JSON_MANDATORY },
- { "target", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uid_gid, offsetof(AllocateParameters, target), 0 },
- { "userNamespaceFileDescriptor", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint, offsetof(AllocateParameters, userns_fd_idx), SD_JSON_MANDATORY },
- { "mangleName", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_stdbool, offsetof(AllocateParameters, mangle_name), 0 },
+ { "name", SD_JSON_VARIANT_STRING, sd_json_dispatch_const_string, offsetof(AllocateParameters, name), SD_JSON_MANDATORY },
+ { "size", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32, offsetof(AllocateParameters, size), SD_JSON_MANDATORY },
+ { "target", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uid_gid, offsetof(AllocateParameters, target), 0 },
+ { "userNamespaceFileDescriptor", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint, offsetof(AllocateParameters, userns_fd_idx), SD_JSON_MANDATORY },
+ { "mangleName", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_stdbool, offsetof(AllocateParameters, mangle_name), 0 },
+ { "delegateContainerRanges", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32, offsetof(AllocateParameters, delegate_container_ranges), 0 },
{}
};
if (r != 0)
return r;
+ if (p.delegate_container_ranges > USER_NAMESPACE_DELEGATIONS_MAX)
+ return sd_varlink_error(link, "io.systemd.NamespaceResource.TooManyDelegations", NULL);
+
userns_fd = sd_varlink_peek_dup_fd(link, p.userns_fd_idx);
if (userns_fd < 0)
return log_debug_errno(userns_fd, "Failed to take user namespace fd from Varlink connection: %m");
if (fstat(userns_fd, &userns_st) < 0)
return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m");
+ _cleanup_close_ int parent_userns_fd = ioctl(userns_fd, NS_GET_PARENT);
+ if (parent_userns_fd < 0)
+ return log_debug_errno(errno, "Failed to get parent user namespace: %m");
+
r = sd_varlink_get_peer_uid(link, &peer_uid);
if (r < 0)
return r;
userns_info->target_uid = p.target;
userns_info->target_gid = (gid_t) p.target;
- r = allocate_now(registry_dir_fd, userns_info, &lock_fd);
+ /* Set up delegation arrays if requested */
+ if (p.delegate_container_ranges > 0) {
+ userns_info->delegates = new0(DelegatedUserNamespaceInfo, p.delegate_container_ranges);
+ if (!userns_info->delegates)
+ return -ENOMEM;
+
+ FOREACH_ARRAY(delegate, userns_info->delegates, p.delegate_container_ranges) {
+ *delegate = DELEGATED_USER_NAMESPACE_INFO_NULL;
+ delegate->size = NSRESOURCE_UIDS_64K;
+ }
+
+ userns_info->n_delegates = p.delegate_container_ranges;
+ }
+
+ r = allocate_now(registry_dir_fd, userns_fd, parent_userns_fd, userns_info, &lock_fd);
if (r == -EHOSTDOWN) /* The needed UID range is not delegated to us */
return sd_varlink_error(link, "io.systemd.NamespaceResource.DynamicRangeUnavailable", NULL);
if (r == -EBUSY) /* All used up */
if (r < 0)
goto fail;
- r = write_userns(userns_fd, userns_info);
+ r = write_userns(userns_fd, parent_userns_fd, userns_info);
if (r < 0)
goto fail;
return TAKE_FD(lock_fd);
}
+void delegated_userns_info_done(DelegatedUserNamespaceInfo *info) {
+        /* Releases the resources owned by a DelegatedUserNamespaceInfo, but not the structure
+         * itself. Safe to call on NULL. */
+
+        if (!info)
+                return;
+
+        info->ancestor_userns = mfree(info->ancestor_userns);
+        info->n_ancestor_userns = 0;
+}
+
+void delegated_userns_info_done_many(DelegatedUserNamespaceInfo infos[], size_t n) {
+        /* Releases an array of DelegatedUserNamespaceInfo entries, then frees the array itself. */
+
+        assert(infos || n == 0);
+
+        for (size_t i = 0; i < n; i++)
+                delegated_userns_info_done(infos + i);
+
+        free(infos);
+}
+
UserNamespaceInfo* userns_info_new(void) {
UserNamespaceInfo *info = new(UserNamespaceInfo, 1);
if (!info)
free(userns->cgroups);
free(userns->name);
+ delegated_userns_info_done_many(userns->delegates, userns->n_delegates);
+
strv_free(userns->netifs);
return mfree(userns);
return 0;
}
+static int dispatch_delegates_array(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) {
+        /* JSON dispatch callback for the "delegates" field of a registry record: parses an array of
+         * delegation objects into info->delegates/info->n_delegates, replacing any previously stored
+         * array. Accepts null (clears the list) or an array of at most USER_NAMESPACE_DELEGATIONS_MAX
+         * elements. */
+
+        UserNamespaceInfo *info = ASSERT_PTR(userdata);
+        DelegatedUserNamespaceInfo *delegates = NULL;
+        size_t n = 0;
+        int r;
+
+        /* On any early return, free the partially built array (first n entries) */
+        CLEANUP_ARRAY(delegates, n, delegated_userns_info_done_many);
+
+        if (sd_json_variant_is_null(variant)) {
+                delegated_userns_info_done_many(info->delegates, info->n_delegates);
+                info->delegates = NULL;
+                info->n_delegates = 0;
+                return 0;
+        }
+
+        if (!sd_json_variant_is_array(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name));
+
+        size_t elements = sd_json_variant_elements(variant);
+        if (elements > USER_NAMESPACE_DELEGATIONS_MAX)
+                return json_log(variant, flags, SYNTHETIC_ERRNO(E2BIG), "Too many delegations.");
+
+        delegates = new(DelegatedUserNamespaceInfo, elements);
+        if (!delegates)
+                return json_log_oom(variant, flags);
+
+        sd_json_variant *e;
+        JSON_VARIANT_ARRAY_FOREACH(e, variant) {
+                /* Note: "userns" is optional here (0 when absent); the stricter validation of standalone
+                 * delegation files happens in userns_registry_load_delegation(). */
+                static const sd_json_dispatch_field delegate_dispatch_table[] = {
+                        { "userns",   SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint64,  offsetof(DelegatedUserNamespaceInfo, userns_inode), 0                 },
+                        { "start",    SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(DelegatedUserNamespaceInfo, start_uid),    SD_JSON_MANDATORY },
+                        { "startGid", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(DelegatedUserNamespaceInfo, start_gid),    SD_JSON_MANDATORY },
+                        { "size",     SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint32,  offsetof(DelegatedUserNamespaceInfo, size),         SD_JSON_MANDATORY },
+                        {}
+                };
+
+                /* Initialize before dispatch so cleanup is safe even on partial parse */
+                delegates[n] = DELEGATED_USER_NAMESPACE_INFO_NULL;
+
+                r = sd_json_dispatch(e, delegate_dispatch_table, flags, &delegates[n]);
+                if (r < 0)
+                        return r;
+
+                if (!uid_is_valid(delegates[n].start_uid) || !gid_is_valid(delegates[n].start_gid))
+                        return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "Invalid delegate UID/GID.");
+
+                if (delegates[n].size == 0)
+                        return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "Invalid delegate size.");
+
+                n++;
+        }
+
+        /* Only replace the previously stored list once the whole array parsed successfully */
+        delegated_userns_info_done_many(info->delegates, info->n_delegates);
+        info->delegates = TAKE_PTR(delegates);
+        info->n_delegates = n;
+
+        return 0;
+}
+
+static int dispatch_ancestor_userns_array(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) {
+        /* JSON dispatch callback for the "ancestorUserns" field of a delegation record: parses an array
+         * of userns inode numbers (the chain of previous owners of the delegated range) into
+         * info->ancestor_userns/info->n_ancestor_userns. Accepts null (clears the chain).
+         * NOTE(review): the element count is not bounded here, unlike the delegates array — confirm this
+         * is acceptable for self-generated registry data. */
+
+        DelegatedUserNamespaceInfo *info = ASSERT_PTR(userdata);
+        _cleanup_free_ uint64_t *ancestor_userns = NULL;
+        size_t n = 0;
+
+        if (sd_json_variant_is_null(variant)) {
+                info->ancestor_userns = mfree(info->ancestor_userns);
+                info->n_ancestor_userns = 0;
+                return 0;
+        }
+
+        if (!sd_json_variant_is_array(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name));
+
+        ancestor_userns = new(uint64_t, sd_json_variant_elements(variant));
+        if (!ancestor_userns)
+                return json_log_oom(variant, flags);
+
+        sd_json_variant *e;
+        JSON_VARIANT_ARRAY_FOREACH(e, variant) {
+                if (!sd_json_variant_is_unsigned(e))
+                        return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not an unsigned integer.");
+
+                /* Inode 0 cannot refer to a valid user namespace */
+                uint64_t v = sd_json_variant_unsigned(e);
+                if (v == 0)
+                        return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "Invalid ancestor userns inode 0.");
+
+                ancestor_userns[n++] = v;
+        }
+
+        free_and_replace(info->ancestor_userns, ancestor_userns);
+        info->n_ancestor_userns = n;
+
+        return 0;
+}
+
static int userns_registry_load(int dir_fd, const char *fn, UserNamespaceInfo **ret) {
static const sd_json_dispatch_field dispatch_table[] = {
{ "targetGid", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(UserNamespaceInfo, target_gid), 0 },
{ "cgroups", SD_JSON_VARIANT_ARRAY, dispatch_cgroups_array, 0, 0 },
{ "netifs", SD_JSON_VARIANT_ARRAY, sd_json_dispatch_strv, offsetof(UserNamespaceInfo, netifs), 0 },
+ { "delegates", SD_JSON_VARIANT_ARRAY, dispatch_delegates_array, 0, 0 },
{}
};
return r;
}
+ _cleanup_(sd_json_variant_unrefp) sd_json_variant *delegates_array = NULL;
+ FOREACH_ARRAY(delegate, info->delegates, info->n_delegates) {
+ r = sd_json_variant_append_arraybo(
+ &delegates_array,
+ SD_JSON_BUILD_PAIR_UNSIGNED("userns", delegate->userns_inode),
+ SD_JSON_BUILD_PAIR_UNSIGNED("start", delegate->start_uid),
+ SD_JSON_BUILD_PAIR_UNSIGNED("startGid", delegate->start_gid),
+ SD_JSON_BUILD_PAIR_UNSIGNED("size", delegate->size));
+ if (r < 0)
+ return r;
+ }
+
_cleanup_(sd_json_variant_unrefp) sd_json_variant *def = NULL;
r = sd_json_buildo(
&def,
SD_JSON_BUILD_PAIR_CONDITION(gid_is_valid(info->start_gid), "startGid", SD_JSON_BUILD_UNSIGNED(info->start_gid)),
SD_JSON_BUILD_PAIR_CONDITION(gid_is_valid(info->target_gid), "targetGid", SD_JSON_BUILD_UNSIGNED(info->target_gid)),
SD_JSON_BUILD_PAIR_CONDITION(!!cgroup_array, "cgroups", SD_JSON_BUILD_VARIANT(cgroup_array)),
- JSON_BUILD_PAIR_STRV_NON_EMPTY("netifs", info->netifs));
+ JSON_BUILD_PAIR_STRV_NON_EMPTY("netifs", info->netifs),
+ SD_JSON_BUILD_PAIR_CONDITION(!!delegates_array, "delegates", SD_JSON_BUILD_VARIANT(delegates_array)));
if (r < 0)
return r;
goto fail;
}
+ /* Store delegation files */
+ FOREACH_ARRAY(delegate, info->delegates, info->n_delegates) {
+ _cleanup_(sd_json_variant_unrefp) sd_json_variant *delegate_def = NULL, *ancestor_array = NULL;
+ _cleanup_free_ char *delegate_buf = NULL, *delegate_uid_fn = NULL, *delegate_gid_fn = NULL;
+
+ if (asprintf(&delegate_uid_fn, "u" UID_FMT ".delegate", delegate->start_uid) < 0) {
+ r = log_oom_debug();
+ goto fail;
+ }
+
+ /* Check if this delegation already exists. If so, this is a recursive
+ * subdelegation: we need to preserve the chain of previous owners so that
+ * ownership can be restored when the current owner goes away. */
+ _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo existing = DELEGATED_USER_NAMESPACE_INFO_NULL;
+
+ r = userns_registry_load_delegation_by_uid(dir_fd, delegate->start_uid, &existing);
+ if (r >= 0) {
+ /* Delegation file exists — append old owner to ancestor chain */
+ FOREACH_ARRAY(ancestor_userns, existing.ancestor_userns, existing.n_ancestor_userns) {
+ r = sd_json_variant_append_arrayb(
+ &ancestor_array,
+ SD_JSON_BUILD_UNSIGNED(*ancestor_userns));
+ if (r < 0)
+ goto fail;
+ }
+
+ /* userns_registry_store() is also called to update existing entries in the registry
+ * in which case we don't need to update the ownership of the delegated UID ranges. */
+ if (delegate->userns_inode != existing.userns_inode) {
+ r = sd_json_variant_append_arrayb(
+ &ancestor_array,
+ SD_JSON_BUILD_UNSIGNED(existing.userns_inode));
+ if (r < 0)
+ goto fail;
+ }
+
+ } else if (r != -ENOENT) {
+ log_debug_errno(r, "Failed to load existing delegation for UID " UID_FMT ": %m", delegate->start_uid);
+ goto fail;
+ }
+
+ r = sd_json_buildo(
+ &delegate_def,
+ SD_JSON_BUILD_PAIR_UNSIGNED("userns", delegate->userns_inode),
+ SD_JSON_BUILD_PAIR_UNSIGNED("start", delegate->start_uid),
+ SD_JSON_BUILD_PAIR_UNSIGNED("startGid", delegate->start_gid),
+ SD_JSON_BUILD_PAIR_UNSIGNED("size", delegate->size),
+ SD_JSON_BUILD_PAIR_CONDITION(!!ancestor_array, "ancestorUserns", SD_JSON_BUILD_VARIANT(ancestor_array)));
+ if (r < 0)
+ goto fail;
+
+ r = sd_json_variant_format(delegate_def, /* flags= */ 0, &delegate_buf);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to format delegation JSON object: %m");
+ goto fail;
+ }
+
+ r = write_string_file_at(dir_fd, delegate_uid_fn, delegate_buf, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to write delegation data to '%s' in registry: %m", delegate_uid_fn);
+ goto fail;
+ }
+
+ /* Create GID symlink pointing to the UID file */
+ if (asprintf(&delegate_gid_fn, "g" GID_FMT ".delegate", delegate->start_gid) < 0) {
+ r = log_oom_debug();
+ goto fail;
+ }
+
+ r = linkat_replace(dir_fd, delegate_uid_fn, dir_fd, delegate_gid_fn);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to link delegation data to '%s' in registry: %m", delegate_gid_fn);
+ goto fail;
+ }
+ }
+
return 0;
fail:
if (uid_fn)
(void) unlinkat(dir_fd, uid_fn, AT_REMOVEDIR);
+ /* Clean up any delegation files we created */
+ FOREACH_ARRAY(delegate, info->delegates, info->n_delegates) {
+ _cleanup_free_ char *delegate_uid_fn = NULL, *delegate_gid_fn = NULL;
+
+ if (asprintf(&delegate_uid_fn, "u" UID_FMT ".delegate", delegate->start_uid) >= 0)
+ (void) unlinkat(dir_fd, delegate_uid_fn, /* flags= */ 0);
+
+ if (asprintf(&delegate_gid_fn, "g" GID_FMT ".delegate", delegate->start_gid) >= 0)
+ (void) unlinkat(dir_fd, delegate_gid_fn, /* flags= */ 0);
+ }
+
return r;
}
if (asprintf(®_fn, "i%" PRIu64 ".userns", info->userns_inode) < 0)
return log_oom_debug();
- ret = RET_NERRNO(unlinkat(dir_fd, reg_fn, 0));
+ r = RET_NERRNO(unlinkat(dir_fd, reg_fn, 0));
+ if (r < 0)
+ RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", reg_fn));
_cleanup_free_ char *link1_fn = NULL;
link1_fn = strjoin("n", info->name, ".userns");
if (!link1_fn)
return log_oom_debug();
- RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, link1_fn, 0)));
+ r = RET_NERRNO(unlinkat(dir_fd, link1_fn, 0));
+ if (r < 0)
+ RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", link1_fn));
if (uid_is_valid(info->start_uid)) {
_cleanup_free_ char *link2_fn = NULL;
if (asprintf(&link2_fn, "u" UID_FMT ".userns", info->start_uid) < 0)
return log_oom_debug();
- RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, link2_fn, 0)));
+ r = RET_NERRNO(unlinkat(dir_fd, link2_fn, 0));
+ if (r < 0)
+ RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", link2_fn));
}
if (uid_is_valid(info->start_gid)) {
if (asprintf(&link3_fn, "g" GID_FMT ".userns", info->start_gid) < 0)
return log_oom_debug();
- RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, link3_fn, 0)));
+ r = RET_NERRNO(unlinkat(dir_fd, link3_fn, 0));
+ if (r < 0)
+ RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", link3_fn));
}
_cleanup_free_ char *uid_fn = NULL;
if (asprintf(&owner_fn, "%s/i%" PRIu64 ".userns", uid_fn, info->userns_inode) < 0)
return log_oom_debug();
- RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, owner_fn, 0)));
+ r = RET_NERRNO(unlinkat(dir_fd, owner_fn, 0));
+ if (r < 0)
+ RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", owner_fn));
r = RET_NERRNO(unlinkat(dir_fd, uid_fn, AT_REMOVEDIR));
- if (r != -ENOTEMPTY)
- RET_GATHER(ret, r);
+ if (r < 0 && r != -ENOTEMPTY)
+ RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", uid_fn));
+
+ /* Remove or restore delegation files */
+ FOREACH_ARRAY(delegate, info->delegates, info->n_delegates) {
+ /* Check if this delegation has ancestor user namespaces. If so, restore ownership to
+ * the last ancestor instead of removing the delegation file entirely. */
+ _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo existing = DELEGATED_USER_NAMESPACE_INFO_NULL;
+
+ r = userns_registry_load_delegation_by_uid(dir_fd, delegate->start_uid, &existing);
+ if (r < 0) {
+ log_debug_errno(r,
+ "Failed to load delegated UID range starting at "UID_FMT":"GID_FMT" for userns %"PRIu64": %m",
+ delegate->start_uid, delegate->start_gid, delegate->userns_inode);
+ RET_GATHER(ret, r);
+ continue;
+ }
+
+ _cleanup_free_ char *delegate_uid_fn = NULL;
+ if (asprintf(&delegate_uid_fn, "u" UID_FMT ".delegate", delegate->start_uid) < 0)
+ return log_oom_debug();
+
+ if (existing.n_ancestor_userns > 0) {
+ _cleanup_(sd_json_variant_unrefp) sd_json_variant *delegate_def = NULL, *ancestor_array = NULL;
+ _cleanup_free_ char *delegate_buf = NULL;
+
+ /* Pop the last ancestor userns inode to become the new owner */
+ uint64_t new_owner = existing.ancestor_userns[existing.n_ancestor_userns - 1];
+
+ log_debug("Moving ownership of delegated UID range from %"PRIu64" to %"PRIu64".",
+ delegate->userns_inode, new_owner);
+
+ /* Rebuild ancestor array without the last entry */
+ for (size_t j = 0; j + 1 < existing.n_ancestor_userns; j++) {
+ r = sd_json_variant_append_arrayb(
+ &ancestor_array,
+ SD_JSON_BUILD_UNSIGNED(existing.ancestor_userns[j]));
+ if (r < 0)
+ return log_debug_errno(r, "Failed to append to JSON array: %m");
+ }
+
+ r = sd_json_buildo(
+ &delegate_def,
+ SD_JSON_BUILD_PAIR_UNSIGNED("userns", new_owner),
+ SD_JSON_BUILD_PAIR_UNSIGNED("start", existing.start_uid),
+ SD_JSON_BUILD_PAIR_UNSIGNED("startGid", existing.start_gid),
+ SD_JSON_BUILD_PAIR_UNSIGNED("size", existing.size),
+ SD_JSON_BUILD_PAIR_CONDITION(!!ancestor_array, "ancestorUserns", SD_JSON_BUILD_VARIANT(ancestor_array)));
+ if (r < 0)
+ return log_debug_errno(r, "Failed to build delegate JSON object: %m");
+
+ r = sd_json_variant_format(delegate_def, /* flags= */ 0, &delegate_buf);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to format delegation JSON object: %m");
+
+ r = write_string_file_at(dir_fd, delegate_uid_fn, delegate_buf, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC);
+ if (r < 0)
+ RET_GATHER(ret, log_debug_errno(r, "Failed to write restored delegation data to '%s' in registry: %m", delegate_uid_fn));
+
+ /* GID link already points to the UID file, no need to update it */
+ continue;
+ }
+
+ log_debug("Removing delegated UID range starting at "UID_FMT":"GID_FMT" for userns %"PRIu64 ".",
+ delegate->start_uid, delegate->start_gid, delegate->userns_inode);
+
+ /* No ancestor chain — just remove the delegation files */
+ r = RET_NERRNO(unlinkat(dir_fd, delegate_uid_fn, 0));
+ if (r < 0)
+ RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", delegate_uid_fn));
+
+ _cleanup_free_ char *delegate_gid_fn = NULL;
+ if (asprintf(&delegate_gid_fn, "g" GID_FMT ".delegate", delegate->start_gid) < 0)
+ return log_oom_debug();
+
+ r = RET_NERRNO(unlinkat(dir_fd, delegate_gid_fn, 0));
+ if (r < 0)
+ RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", delegate_gid_fn));
+ }
return ret;
}
return n;
}
+
+int userns_registry_delegation_uid_exists(int dir_fd, uid_t start) {
+        _cleanup_free_ char *fn = NULL;
+
+        /* Checks whether a delegation registration exists for the UID range starting at @start.
+         * Returns > 0 if registered, 0 if not, negative errno on error. */
+
+        assert(dir_fd >= 0);
+
+        if (!uid_is_valid(start))
+                return -ENOENT;
+
+        /* UID 0 is never available for delegation, report it as always taken */
+        if (start == 0)
+                return true;
+
+        if (asprintf(&fn, "u" UID_FMT ".delegate", start) < 0)
+                return -ENOMEM;
+
+        if (faccessat(dir_fd, fn, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+                return true;
+
+        return errno == ENOENT ? false : -errno;
+}
+
+int userns_registry_delegation_gid_exists(int dir_fd, gid_t start) {
+        _cleanup_free_ char *fn = NULL;
+
+        /* Checks whether a delegation registration exists for the GID range starting at @start.
+         * Returns > 0 if registered, 0 if not, negative errno on error. */
+
+        assert(dir_fd >= 0);
+
+        if (!gid_is_valid(start))
+                return -ENOENT;
+
+        /* GID 0 is never available for delegation, report it as always taken */
+        if (start == 0)
+                return true;
+
+        if (asprintf(&fn, "g" GID_FMT ".delegate", start) < 0)
+                return -ENOMEM;
+
+        if (faccessat(dir_fd, fn, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+                return true;
+
+        return errno == ENOENT ? false : -errno;
+}
+
+static int userns_registry_load_delegation(int dir_fd, const char *filename, DelegatedUserNamespaceInfo *ret) {
+        /* Loads and validates one delegation registration file from the registry directory.
+         * If @dir_fd is negative, the default registry directory is opened. On success the parsed record
+         * is transferred to *ret (ownership of the ancestor array moves to the caller). */
+
+        static const sd_json_dispatch_field dispatch_table[] = {
+                { "userns",         SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint64,        offsetof(DelegatedUserNamespaceInfo, userns_inode), SD_JSON_MANDATORY },
+                { "start",          SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid,       offsetof(DelegatedUserNamespaceInfo, start_uid),    SD_JSON_MANDATORY },
+                { "startGid",       SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid,       offsetof(DelegatedUserNamespaceInfo, start_gid),    0                 },
+                { "size",           SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint32,        offsetof(DelegatedUserNamespaceInfo, size),         SD_JSON_MANDATORY },
+                { "ancestorUserns", SD_JSON_VARIANT_ARRAY,    dispatch_ancestor_userns_array, 0,                                                  0                 },
+                {}
+        };
+
+        _cleanup_(sd_json_variant_unrefp) sd_json_variant *v = NULL;
+        _cleanup_close_ int registry_fd = -EBADF;
+        int r;
+
+        if (dir_fd < 0) {
+                /* No directory fd supplied, open the default registry */
+                registry_fd = userns_registry_open_fd();
+                if (registry_fd < 0)
+                        return registry_fd;
+
+                dir_fd = registry_fd;
+        }
+
+        r = sd_json_parse_file_at(/* f= */ NULL, dir_fd, filename, /* flags= */ 0, &v, /* reterr_line= */ NULL, /* reterr_column= */ NULL);
+        if (r < 0)
+                return r;
+
+        _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo data = DELEGATED_USER_NAMESPACE_INFO_NULL;
+
+        r = sd_json_dispatch(v, dispatch_table, /* flags= */ 0, &data);
+        if (r < 0)
+                return r;
+
+        /* A valid record must name an owning userns inode and a non-empty range */
+        if (data.userns_inode == 0)
+                return -EBADMSG;
+        if (data.size == 0)
+                return -EBADMSG;
+
+        if (ret)
+                *ret = TAKE_GENERIC(data, DelegatedUserNamespaceInfo, DELEGATED_USER_NAMESPACE_INFO_NULL);
+
+        return 0;
+}
+
+int userns_registry_load_delegation_by_uid(int dir_fd, uid_t start, DelegatedUserNamespaceInfo *ret) {
+        int r;
+
+        /* Loads the delegation registration record whose UID range starts at @start. Returns -ENOENT if
+         * @start is invalid or no record exists, -EBADMSG if the record is inconsistent. */
+
+        if (!uid_is_valid(start))
+                return -ENOENT;
+
+        _cleanup_free_ char *fn = NULL;
+        if (asprintf(&fn, "u" UID_FMT ".delegate", start) < 0)
+                return -ENOMEM;
+
+        _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo data = DELEGATED_USER_NAMESPACE_INFO_NULL;
+        r = userns_registry_load_delegation(dir_fd, fn, &data);
+        if (r < 0)
+                return r;
+
+        /* Refuse records whose embedded start UID disagrees with the file name */
+        if (data.start_uid != start)
+                return -EBADMSG;
+
+        if (ret)
+                *ret = TAKE_GENERIC(data, DelegatedUserNamespaceInfo, DELEGATED_USER_NAMESPACE_INFO_NULL);
+
+        return 0;
+}
+
+int userns_registry_load_delegation_by_gid(int dir_fd, gid_t start, DelegatedUserNamespaceInfo *ret) {
+        _cleanup_free_ char *fn = NULL;
+        int r;
+
+        /* Loads the delegation registration record whose GID range starts at @start. Returns -ENOENT if
+         * @start is invalid or no record exists, -EBADMSG if the record is inconsistent. */
+
+        if (!gid_is_valid(start))
+                return -ENOENT;
+
+        /* Format with GID_FMT (not UID_FMT) since @start is a gid_t, matching how the
+         * "g<gid>.delegate" links are created by userns_registry_store() and checked by
+         * userns_registry_delegation_gid_exists(). */
+        if (asprintf(&fn, "g" GID_FMT ".delegate", start) < 0)
+                return -ENOMEM;
+
+        _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo data = DELEGATED_USER_NAMESPACE_INFO_NULL;
+        r = userns_registry_load_delegation(dir_fd, fn, &data);
+        if (r < 0)
+                return r;
+
+        /* Refuse records whose embedded start GID disagrees with the file name */
+        if (data.start_gid != start)
+                return -EBADMSG;
+
+        if (ret)
+                *ret = TAKE_GENERIC(data, DelegatedUserNamespaceInfo, DELEGATED_USER_NAMESPACE_INFO_NULL);
+
+        return 0;
+}
#define USER_NAMESPACE_CGROUPS_DELEGATE_MAX 16U
#define USER_NAMESPACE_NETIFS_DELEGATE_MAX 16U
+#define USER_NAMESPACE_DELEGATIONS_MAX 16U
+
+/* One delegated (1:1 mapped) UID/GID range handed to a user namespace for nested-userns use. */
+typedef struct DelegatedUserNamespaceInfo {
+        uint64_t userns_inode;      /* nsfs inode of the userns that currently owns this range */
+        uid_t start_uid;            /* first UID of the delegated range */
+        gid_t start_gid;            /* first GID of the delegated range */
+        uint32_t size;              /* number of UIDs/GIDs in the range */
+        /* We track all the previous owners of the delegation so we can restore the previous owner of each
+         * delegated range when a user namespace with delegated ranges is freed. */
+        uint64_t *ancestor_userns;
+        size_t n_ancestor_userns;
+} DelegatedUserNamespaceInfo;
+
+#define DELEGATED_USER_NAMESPACE_INFO_NULL (DelegatedUserNamespaceInfo) { \
+ .start_uid = UID_INVALID, \
+ .start_gid = GID_INVALID, \
+}
+
+void delegated_userns_info_done(DelegatedUserNamespaceInfo *info);
+void delegated_userns_info_done_many(DelegatedUserNamespaceInfo infos[], size_t n);
typedef struct UserNamespaceInfo {
uid_t owner;
uint64_t *cgroups;
size_t n_cgroups;
char **netifs;
+ DelegatedUserNamespaceInfo *delegates;
+ size_t n_delegates;
} UserNamespaceInfo;
UserNamespaceInfo* userns_info_new(void);
int userns_registry_gid_exists(int dir_fd, gid_t start);
int userns_registry_per_uid(int dir_fd, uid_t owner);
+
+int userns_registry_delegation_uid_exists(int dir_fd, uid_t start);
+int userns_registry_delegation_gid_exists(int dir_fd, gid_t start);
+int userns_registry_load_delegation_by_uid(int dir_fd, uid_t start, DelegatedUserNamespaceInfo *ret);
+int userns_registry_load_delegation_by_gid(int dir_fd, gid_t start, DelegatedUserNamespaceInfo *ret);
SD_VARLINK_DEFINE_INPUT(target, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("A file descriptor to an allocated userns with no current UID range assignments"),
SD_VARLINK_DEFINE_INPUT(userNamespaceFileDescriptor, SD_VARLINK_INT, 0),
+ SD_VARLINK_FIELD_COMMENT("Number of transient 64K container UID/GID ranges to delegate. These are mapped 1:1 into the user namespace and can be used by nested user namespaces for container workloads. Must be between 0 and 16. Defaults to 0."),
+ SD_VARLINK_DEFINE_INPUT(delegateContainerRanges, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("The name assigned to the user namespace. (This is particularly interesting in case mangleName was enabled)."),
SD_VARLINK_DEFINE_OUTPUT(name, SD_VARLINK_STRING, SD_VARLINK_NULLABLE));
static SD_VARLINK_DEFINE_ERROR(TooManyControlGroups);
static SD_VARLINK_DEFINE_ERROR(ControlGroupAlreadyAdded);
static SD_VARLINK_DEFINE_ERROR(TooManyNetworkInterfaces);
+static SD_VARLINK_DEFINE_ERROR(TooManyDelegations);
SD_VARLINK_DEFINE_INTERFACE(
io_systemd_NamespaceResource,
SD_VARLINK_SYMBOL_COMMENT("The specified cgroup has already been added to the user namespace."),
&vl_error_ControlGroupAlreadyAdded,
SD_VARLINK_SYMBOL_COMMENT("The per-user namespace limit of network interfaces has been reached."),
- &vl_error_TooManyNetworkInterfaces);
+ &vl_error_TooManyNetworkInterfaces,
+ SD_VARLINK_SYMBOL_COMMENT("The specified number of delegations exceeds the maximum allowed."),
+ &vl_error_TooManyDelegations);
ASSERT_EQ(p->entries[0].nr, 115U);
}
+/* Verifies uid_range_clip(): intersecting a UIDRange with [min, max] drops entries outside the
+ * window, trims partially overlapping ones, rejects min > max, and is overflow-safe at UINT32_MAX. */
+TEST(uid_range_clip) {
+        _cleanup_(uid_range_freep) UIDRange *p = NULL;
+
+        /* Build a range: 100-199, 300-399, 500-599 */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_add_str(&p, "300-399"));
+        ASSERT_OK(uid_range_add_str(&p, "500-599"));
+        ASSERT_EQ(uid_range_entries(p), 3U);
+
+        /* Intersect with range that covers all entries */
+        ASSERT_OK(uid_range_clip(p, 0, 1000));
+        ASSERT_EQ(uid_range_entries(p), 3U);
+        ASSERT_EQ(p->entries[0].start, 100U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+        ASSERT_EQ(p->entries[1].start, 300U);
+        ASSERT_EQ(p->entries[1].nr, 100U);
+        ASSERT_EQ(p->entries[2].start, 500U);
+        ASSERT_EQ(p->entries[2].nr, 100U);
+
+        /* Intersect with range that excludes first and last entries */
+        ASSERT_OK(uid_range_clip(p, 200, 499));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 300U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+
+        p = uid_range_free(p);
+
+        /* Test partial overlap - trimming from both sides (150..180 inclusive is 31 UIDs) */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_clip(p, 150, 180));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 150U);
+        ASSERT_EQ(p->entries[0].nr, 31U);
+
+        p = uid_range_free(p);
+
+        /* Test intersection that removes all entries */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_clip(p, 500, 600));
+        ASSERT_TRUE(uid_range_is_empty(p));
+
+        p = uid_range_free(p);
+
+        /* Test invalid min > max */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_ERROR(uid_range_clip(p, 200, 100), EINVAL);
+
+        p = uid_range_free(p);
+
+        /* Test with max == UINT32_MAX (should not overflow) */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_clip(p, 0, UINT32_MAX));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 100U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+
+        p = uid_range_free(p);
+
+        /* Test with both min and max at extremes */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_add_str(&p, "500-599"));
+        ASSERT_OK(uid_range_clip(p, 150, UINT32_MAX));
+        ASSERT_EQ(uid_range_entries(p), 2U);
+        ASSERT_EQ(p->entries[0].start, 150U);
+        ASSERT_EQ(p->entries[0].nr, 50U);
+        ASSERT_EQ(p->entries[1].start, 500U);
+        ASSERT_EQ(p->entries[1].nr, 100U);
+}
+
+/* Verifies uid_range_partition(): splitting each entry into fixed-size chunks, truncating
+ * remainders and dropping entries smaller than the chunk size. */
+TEST(uid_range_partition) {
+        _cleanup_(uid_range_freep) UIDRange *p = NULL;
+
+        /* Single entry that divides evenly */
+        ASSERT_OK(uid_range_add_str(&p, "0-299"));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_OK(uid_range_partition(p, 100));
+        ASSERT_EQ(uid_range_entries(p), 3U);
+        ASSERT_EQ(p->entries[0].start, 0U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+        ASSERT_EQ(p->entries[1].start, 100U);
+        ASSERT_EQ(p->entries[1].nr, 100U);
+        ASSERT_EQ(p->entries[2].start, 200U);
+        ASSERT_EQ(p->entries[2].nr, 100U);
+
+        p = uid_range_free(p);
+
+        /* Entry with remainder (gets truncated) */
+        ASSERT_OK(uid_range_add_str(&p, "0-249"));
+        ASSERT_OK(uid_range_partition(p, 100));
+        ASSERT_EQ(uid_range_entries(p), 2U);
+        ASSERT_EQ(p->entries[0].start, 0U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+        ASSERT_EQ(p->entries[1].start, 100U);
+        ASSERT_EQ(p->entries[1].nr, 100U);
+
+        p = uid_range_free(p);
+
+        /* Entry smaller than partition size - gets dropped */
+        ASSERT_OK(uid_range_add_str(&p, "0-49"));
+        ASSERT_OK(uid_range_partition(p, 100));
+        ASSERT_TRUE(uid_range_is_empty(p));
+
+        p = uid_range_free(p);
+
+        /* Multiple entries */
+        ASSERT_OK(uid_range_add_str(&p, "0-199"));
+        ASSERT_OK(uid_range_add_str(&p, "1000-1299"));
+        ASSERT_EQ(uid_range_entries(p), 2U);
+        ASSERT_OK(uid_range_partition(p, 100));
+        ASSERT_EQ(uid_range_entries(p), 5U);
+        ASSERT_EQ(p->entries[0].start, 0U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+        ASSERT_EQ(p->entries[1].start, 100U);
+        ASSERT_EQ(p->entries[1].nr, 100U);
+        ASSERT_EQ(p->entries[2].start, 1000U);
+        ASSERT_EQ(p->entries[2].nr, 100U);
+        ASSERT_EQ(p->entries[3].start, 1100U);
+        ASSERT_EQ(p->entries[3].nr, 100U);
+        ASSERT_EQ(p->entries[4].start, 1200U);
+        ASSERT_EQ(p->entries[4].nr, 100U);
+
+        p = uid_range_free(p);
+
+        /* Partition size of 1 */
+        ASSERT_OK(uid_range_add_str(&p, "100-102"));
+        ASSERT_OK(uid_range_partition(p, 1));
+        ASSERT_EQ(uid_range_entries(p), 3U);
+        ASSERT_EQ(p->entries[0].start, 100U);
+        ASSERT_EQ(p->entries[0].nr, 1U);
+        ASSERT_EQ(p->entries[1].start, 101U);
+        ASSERT_EQ(p->entries[1].nr, 1U);
+        ASSERT_EQ(p->entries[2].start, 102U);
+        ASSERT_EQ(p->entries[2].nr, 1U);
+}
+
+/* Verifies uid_range_copy(): NULL and empty sources yield empty results, and the copy is deep
+ * (independent of later mutation of the source). */
+TEST(uid_range_copy) {
+        _cleanup_(uid_range_freep) UIDRange *p = NULL, *copy = NULL;
+
+        /* Copy NULL range */
+        ASSERT_OK(uid_range_copy(NULL, &copy));
+        ASSERT_TRUE(uid_range_is_empty(copy));
+
+        copy = uid_range_free(copy);
+
+        /* Copy empty range */
+        p = new0(UIDRange, 1);
+        ASSERT_NOT_NULL(p);
+        ASSERT_OK(uid_range_copy(p, &copy));
+        ASSERT_NOT_NULL(copy);
+        ASSERT_TRUE(uid_range_is_empty(copy));
+
+        p = uid_range_free(p);
+        copy = uid_range_free(copy);
+
+        /* Copy range with entries */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_add_str(&p, "300-399"));
+        ASSERT_OK(uid_range_copy(p, &copy));
+        ASSERT_TRUE(uid_range_equal(p, copy));
+
+        /* Verify it's a deep copy - modifying original doesn't affect copy */
+        ASSERT_OK(uid_range_add_str(&p, "500-599"));
+        ASSERT_FALSE(uid_range_equal(p, copy));
+        ASSERT_EQ(uid_range_entries(copy), 2U);
+}
+
+/* Verifies uid_range_remove(): no-ops (size 0, disjoint ranges), head/tail trimming, mid-entry
+ * splits, whole-entry removal, and removals spanning multiple entries. */
+TEST(uid_range_remove) {
+        _cleanup_(uid_range_freep) UIDRange *p = NULL;
+
+        /* Build a range: 100-199 */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+
+        /* Remove with size 0 - no-op */
+        ASSERT_OK(uid_range_remove(p, 150, 0));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 100U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+
+        /* Remove range that doesn't overlap - no change */
+        ASSERT_OK(uid_range_remove(p, 0, 50));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 100U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+
+        ASSERT_OK(uid_range_remove(p, 300, 50));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 100U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+
+        /* Remove from the start of the entry */
+        ASSERT_OK(uid_range_remove(p, 100, 10));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 110U);
+        ASSERT_EQ(p->entries[0].nr, 90U);
+
+        /* Remove from the end of the entry */
+        ASSERT_OK(uid_range_remove(p, 190, 10));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 110U);
+        ASSERT_EQ(p->entries[0].nr, 80U);
+
+        /* Remove from the middle - splits the entry */
+        ASSERT_OK(uid_range_remove(p, 140, 20));
+        ASSERT_EQ(uid_range_entries(p), 2U);
+        ASSERT_EQ(p->entries[0].start, 110U);
+        ASSERT_EQ(p->entries[0].nr, 30U);
+        ASSERT_EQ(p->entries[1].start, 160U);
+        ASSERT_EQ(p->entries[1].nr, 30U);
+
+        p = uid_range_free(p);
+
+        /* Remove entire entry */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_remove(p, 100, 100));
+        ASSERT_TRUE(uid_range_is_empty(p));
+
+        p = uid_range_free(p);
+
+        /* Remove range larger than entry */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_remove(p, 50, 200));
+        ASSERT_TRUE(uid_range_is_empty(p));
+
+        p = uid_range_free(p);
+
+        /* Remove affecting multiple entries */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_add_str(&p, "300-399"));
+        ASSERT_OK(uid_range_add_str(&p, "500-599"));
+        ASSERT_EQ(uid_range_entries(p), 3U);
+
+        /* Remove range spanning the middle entry completely and trimming others */
+        ASSERT_OK(uid_range_remove(p, 150, 400));
+        ASSERT_EQ(uid_range_entries(p), 2U);
+        ASSERT_EQ(p->entries[0].start, 100U);
+        ASSERT_EQ(p->entries[0].nr, 50U);
+        ASSERT_EQ(p->entries[1].start, 550U);
+        ASSERT_EQ(p->entries[1].nr, 50U);
+}
+
+TEST(uid_range_translate) {
+        _cleanup_(uid_range_freep) UIDRange *outer = NULL, *inner = NULL;
+        uid_t mapped;
+
+        /* One entry on each side: translating applies the offset within the matching entry */
+        ASSERT_OK(uid_range_add_str_full(&outer, "200-299", /* coalesce= */ false));
+        ASSERT_OK(uid_range_add_str_full(&inner, "100-199", /* coalesce= */ false));
+        ASSERT_OK(uid_range_translate(outer, inner, 250, &mapped));
+        ASSERT_EQ(mapped, 150U);
+        ASSERT_OK(uid_range_translate(inner, outer, 150, &mapped));
+        ASSERT_EQ(mapped, 250U);
+
+        /* A second, non-coalesced entry on each side: translation is per-entry, by index */
+        ASSERT_OK(uid_range_add_str_full(&outer, "300-399", /* coalesce= */ false));
+        ASSERT_OK(uid_range_add_str_full(&inner, "350-449", /* coalesce= */ false));
+        ASSERT_OK(uid_range_translate(outer, inner, 350, &mapped));
+        ASSERT_EQ(mapped, 400U);
+        ASSERT_OK(uid_range_translate(inner, outer, 400, &mapped));
+        ASSERT_EQ(mapped, 350U);
+
+        /* The first and last UID of every entry must map cleanly */
+        ASSERT_OK(uid_range_translate(outer, inner, 200, &mapped));
+        ASSERT_EQ(mapped, 100U);
+        ASSERT_OK(uid_range_translate(outer, inner, 299, &mapped));
+        ASSERT_EQ(mapped, 199U);
+        ASSERT_OK(uid_range_translate(outer, inner, 300, &mapped));
+        ASSERT_EQ(mapped, 350U);
+        ASSERT_OK(uid_range_translate(outer, inner, 399, &mapped));
+        ASSERT_EQ(mapped, 449U);
+
+        /* Same boundary checks in the opposite direction */
+        ASSERT_OK(uid_range_translate(inner, outer, 100, &mapped));
+        ASSERT_EQ(mapped, 200U);
+        ASSERT_OK(uid_range_translate(inner, outer, 199, &mapped));
+        ASSERT_EQ(mapped, 299U);
+        ASSERT_OK(uid_range_translate(inner, outer, 350, &mapped));
+        ASSERT_EQ(mapped, 300U);
+        ASSERT_OK(uid_range_translate(inner, outer, 449, &mapped));
+        ASSERT_EQ(mapped, 399U);
+
+        /* A UID outside every source entry yields ESRCH */
+        ASSERT_ERROR(uid_range_translate(outer, inner, 0, &mapped), ESRCH);
+        ASSERT_ERROR(uid_range_translate(outer, inner, 199, &mapped), ESRCH);
+        ASSERT_ERROR(uid_range_translate(outer, inner, 400, &mapped), ESRCH);
+        ASSERT_ERROR(uid_range_translate(inner, outer, 0, &mapped), ESRCH);
+        ASSERT_ERROR(uid_range_translate(inner, outer, 99, &mapped), ESRCH);
+        ASSERT_ERROR(uid_range_translate(inner, outer, 200, &mapped), ESRCH);
+        ASSERT_ERROR(uid_range_translate(inner, outer, 349, &mapped), ESRCH);
+        ASSERT_ERROR(uid_range_translate(inner, outer, 450, &mapped), ESRCH);
+
+        outer = uid_range_free(outer);
+        inner = uid_range_free(inner);
+
+        /* Degenerate case: entries that contain exactly one UID */
+        ASSERT_OK(uid_range_add_str_full(&outer, "1000", /* coalesce= */ false));
+        ASSERT_OK(uid_range_add_str_full(&inner, "5000", /* coalesce= */ false));
+        ASSERT_OK(uid_range_translate(outer, inner, 1000, &mapped));
+        ASSERT_EQ(mapped, 5000U);
+        ASSERT_OK(uid_range_translate(inner, outer, 5000, &mapped));
+        ASSERT_EQ(mapped, 1000U);
+        ASSERT_ERROR(uid_range_translate(outer, inner, 999, &mapped), ESRCH);
+        ASSERT_ERROR(uid_range_translate(outer, inner, 1001, &mapped), ESRCH);
+}
+
DEFINE_TEST_MAIN(LOG_DEBUG);
exit 0
fi
+# Test delegated UID ranges
+# Verify that delegated ranges show up in uid_map (6 lines: 1 primary + 2 container ranges + 3 dynamic users)
+test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \
+ --push-fd=/proc/self/ns/user \
+ /run/systemd/userdb/io.systemd.NamespaceResource \
+ io.systemd.NamespaceResource.AllocateUserRange \
+ '{"name":"test-delegate","size":65536,"userNamespaceFileDescriptor":0,"delegateContainerRanges":2}' \
+ -- cat /proc/self/uid_map | wc -l)" -eq 3
+
+# Test that delegateContainerRanges > 16 fails with TooManyDelegations error
+(! run0 -u testuser --pipe unshare --user varlinkctl call \
+ --push-fd=/proc/self/ns/user \
+ /run/systemd/userdb/io.systemd.NamespaceResource \
+ io.systemd.NamespaceResource.AllocateUserRange \
+ '{"name":"test-fail","size":65536,"userNamespaceFileDescriptor":0,"delegateContainerRanges":17}') |&
+ grep "io.systemd.NamespaceResource.TooManyDelegations" >/dev/null
+
# This should work without the key
systemd-dissect --image-policy='root=verity:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null
systemd-dissect --image-policy='root=verity+signed:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null