]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
nsresourced: Add support for delegated ranges
authorDaan De Meyer <daan.j.demeyer@gmail.com>
Fri, 23 Jan 2026 11:31:47 +0000 (12:31 +0100)
committerDaan De Meyer <daan@amutable.com>
Tue, 24 Feb 2026 17:29:37 +0000 (18:29 +0100)
We want to support the scenario where we bind mount the nsresourced
varlink socket into a container to allow nested containers where the
outer container runs in its own transient range from nsresourced but
can still allocate transient ranges for its own nested containers.

To support this use case let's add support for delegation. Delegated
ranges are allocated when allocating the primary range and are propagated
1:1 to the user namespace. We track delegated ranges in ".delegate" files
in the userns registry so that they can't be used for other range allocations.

We make one exception for delegated ranges, though: if we get a request from
a user namespace that is a child of the user namespace that owns the delegated
ranges, we allow allocating from the delegated range. The parent userns already
has full ownership over the child userns, so it doesn't matter that the parent
userns and the child userns share the same range. This allows making use of
delegated ranges without having to run another copy of nsresourced inside the
parent userns to hand out from the delegated range.

To support recursive delegations, we keep track of the previous owners of the
delegated range and restore ownership to the last previous owner when the current
owner is freed.

man/systemd-nsresourced.service.xml
src/basic/uid-range.c
src/basic/uid-range.h
src/nsresourced/nsresourcework.c
src/nsresourced/userns-registry.c
src/nsresourced/userns-registry.h
src/shared/varlink-io.systemd.NamespaceResource.c
src/test/test-uid-range.c
test/units/TEST-50-DISSECT.mountfsd.sh

index 787312d858f993ad275603740c8b501f9d8faae7..853fe09fbc97f96843803efd65424007edb201fe 100644 (file)
     registered with this service. Moreover, UIDs and GIDs are always allocated together, and
     symmetrically.</para>
 
+    <para>The allocation API supports <emphasis>delegated ranges</emphasis>: additional UID/GID ranges that
+    are mapped 1:1 into the user namespace rather than being translated to a target UID/GID. These delegated
+    ranges enable nested user namespace scenarios where a container needs to create child user namespaces
+    with their own transient UID ranges. Normally, the kernel restricts which UIDs can be mapped into a user
+    namespace to those that are also mapped in the parent. Delegated ranges solve this by pre-allocating
+    additional ranges that are visible inside the user namespace and can be used by nested
+    <function>AllocateUserRange()</function> calls. Up to 16 delegated ranges can be requested per user
+    namespace, each of size 65536. The ranges are allocated from the container UID ranges as per
+    <ulink url="https://systemd.io/UIDS-GIDS">Users, Groups, UIDs and GIDs on systemd Systems</ulink>.</para>
+
     <para>The service provides API calls to allowlist mounts (referenced via their mount file descriptors as
     per Linux <function>fsmount()</function> API), to pass ownership of a cgroup subtree to the user
     namespace and to delegate a virtual Ethernet device pair to the user namespace. When used in combination
index 1aaf760468b5c5bb55a4d7c37ad984809a8a2e68..763c421e91edb510cb3d8b7727a983be784e6dbc 100644 (file)
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
 
+#include <sched.h>
 #include <string.h>
 
 #include "alloc-util.h"
@@ -115,7 +116,7 @@ int uid_range_add_internal(UIDRange **range, uid_t start, uid_t nr, bool coalesc
         return 0;
 }
 
-int uid_range_add_str(UIDRange **range, const char *s) {
+int uid_range_add_str_full(UIDRange **range, const char *s, bool coalesce) {
         uid_t start, end;
         int r;
 
@@ -126,7 +127,7 @@ int uid_range_add_str(UIDRange **range, const char *s) {
         if (r < 0)
                 return r;
 
-        return uid_range_add_internal(range, start, end - start + 1, /* coalesce= */ true);
+        return uid_range_add_internal(range, start, end - start + 1, coalesce);
 }
 
 int uid_range_next_lower(const UIDRange *range, uid_t *uid) {
@@ -230,7 +231,7 @@ bool uid_range_is_empty(const UIDRange *range) {
         return true;
 }
 
-int uid_range_load_userns(const char *path, UIDRangeUsernsMode mode, UIDRange **ret) {
+int uid_range_load_userns_full(const char *path, UIDRangeUsernsMode mode, bool coalesce, UIDRange **ret) {
         _cleanup_(uid_range_freep) UIDRange *range = NULL;
         _cleanup_fclose_ FILE *f = NULL;
         int r;
@@ -280,13 +281,14 @@ int uid_range_load_userns(const char *path, UIDRangeUsernsMode mode, UIDRange **
                         return r;
         }
 
-        uid_range_coalesce(range);
+        if (coalesce)
+                uid_range_coalesce(range);
 
         *ret = TAKE_PTR(range);
         return 0;
 }
 
-int uid_range_load_userns_by_fd(int userns_fd, UIDRangeUsernsMode mode, UIDRange **ret) {
+int uid_range_load_userns_by_fd_full(int userns_fd, UIDRangeUsernsMode mode, bool coalesce, UIDRange **ret) {
         _cleanup_(pidref_done_sigkill_wait) PidRef pidref = PIDREF_NULL;
         int r;
 
@@ -299,7 +301,7 @@ int uid_range_load_userns_by_fd(int userns_fd, UIDRangeUsernsMode mode, UIDRange
         if (r < 0)
                 return r;
         if (r > 0)
-                return uid_range_load_userns(/* path= */ NULL, mode, ret);
+                return uid_range_load_userns_full(/* path= */ NULL, mode, coalesce, ret);
 
         r = userns_enter_and_pin(userns_fd, &pidref);
         if (r < 0)
@@ -309,7 +311,7 @@ int uid_range_load_userns_by_fd(int userns_fd, UIDRangeUsernsMode mode, UIDRange
                         pidref.pid,
                         IN_SET(mode, UID_RANGE_USERNS_INSIDE, UID_RANGE_USERNS_OUTSIDE) ? "uid_map" : "gid_map");
 
-        return uid_range_load_userns(p, mode, ret);
+        return uid_range_load_userns_full(p, mode, coalesce, ret);
 }
 
 bool uid_range_overlaps(const UIDRange *range, uid_t start, uid_t nr) {
@@ -332,6 +334,204 @@ bool uid_range_overlaps(const UIDRange *range, uid_t start, uid_t nr) {
         return false;
 }
 
+int uid_range_clip(UIDRange *range, uid_t min, uid_t max) {
+        assert(range);
+
+        if (min > max)
+                return -EINVAL;
+
+        size_t t = 0;
+        FOREACH_ARRAY(e, range->entries, range->n_entries) {
+                uid_t entry_end = e->start + e->nr; /* one past the last UID in entry */
+
+                /* Skip entries completely outside [min, max] */
+                if (entry_end <= min || e->start > max)
+                        continue;
+
+                /* Trim the entry to fit within [min, max] */
+                uid_t new_start = MAX(e->start, min);
+                /* entry_end is exclusive, avoid overflow when max == UINT32_MAX */
+                uid_t new_end = entry_end <= max ? entry_end : max + 1;
+                assert(new_end > new_start);
+
+                range->entries[t++] = (UIDRangeEntry) {
+                        .start = new_start,
+                        .nr = new_end - new_start,
+                };
+        }
+
+        range->n_entries = t;
+
+        return 0;
+}
+
+int uid_range_partition(UIDRange *range, uid_t size) {
+        assert(range);
+        assert(size > 0);
+
+        /* Partitions the UID range entries into buckets of the given size. Any entry larger than the given
+         * size will be partitioned into multiple entries, each of the given size. Any leftover UIDs in the
+         * entry are dropped. Any entries smaller than the given size are also dropped. */
+
+        /* Count how many entries we'll need after partitioning */
+        size_t n_new_entries = 0;
+        FOREACH_ARRAY(e, range->entries, range->n_entries)
+                n_new_entries += e->nr / size;
+
+        if (n_new_entries == 0) {
+                range->n_entries = 0;
+                return 0;
+        }
+
+        if (n_new_entries > range->n_entries && !GREEDY_REALLOC(range->entries, n_new_entries))
+                return -ENOMEM;
+
+        /* Work backwards to avoid overwriting entries we still need to read */
+        size_t t = n_new_entries;
+        for (size_t i = range->n_entries; i > 0; i--) {
+                UIDRangeEntry *e = range->entries + i - 1;
+                unsigned n_parts = e->nr / size;
+
+                for (unsigned j = n_parts; j > 0; j--)
+                        range->entries[--t] = (UIDRangeEntry) {
+                                .start = e->start + (j - 1) * size,
+                                .nr = size,
+                        };
+        }
+
+        range->n_entries = n_new_entries;
+
+        return 0;
+}
+
+int uid_range_copy(const UIDRange *range, UIDRange **ret) {
+        assert(ret);
+
+        if (!range) {
+                *ret = NULL;
+                return 0;
+        }
+
+        _cleanup_(uid_range_freep) UIDRange *copy = new0(UIDRange, 1);
+        if (!copy)
+                return -ENOMEM;
+
+        if (range->n_entries > 0) {
+                copy->entries = newdup(UIDRangeEntry, range->entries, range->n_entries);
+                if (!copy->entries)
+                        return -ENOMEM;
+
+                copy->n_entries = range->n_entries;
+        }
+
+        *ret = TAKE_PTR(copy);
+        return 0;
+}
+
+int uid_range_remove(UIDRange *range, uid_t start, uid_t size) {
+        assert(range);
+
+        if (size == 0)
+                return 0;
+
+        uid_t end = start + size; /* one past the last UID to remove */
+
+        for (size_t i = 0; i < range->n_entries; i++) {
+                UIDRangeEntry *e = range->entries + i;
+                uid_t entry_end = e->start + e->nr;
+
+                /* No overlap */
+                if (entry_end <= start || e->start >= end)
+                        continue;
+
+                /* Check if this removal splits the entry into two parts */
+                if (e->start < start && entry_end > end) {
+                        /* Need to split: grow the array first */
+                        if (!GREEDY_REALLOC(range->entries, range->n_entries + 1))
+                                return -ENOMEM;
+
+                        /* Re-fetch pointer after potential realloc */
+                        e = range->entries + i;
+                        entry_end = e->start + e->nr;
+
+                        /* Shift everything after this entry to make room */
+                        memmove(range->entries + i + 2, range->entries + i + 1,
+                                (range->n_entries - i - 1) * sizeof(UIDRangeEntry));
+                        range->n_entries++;
+
+                        /* First part: before the removed range */
+                        range->entries[i] = (UIDRangeEntry) {
+                                .start = e->start,
+                                .nr = start - e->start,
+                        };
+
+                        /* Second part: after the removed range */
+                        range->entries[i + 1] = (UIDRangeEntry) {
+                                .start = end,
+                                .nr = entry_end - end,
+                        };
+
+                        /* Skip the newly inserted entry */
+                        i++;
+                        continue;
+                }
+
+                /* Removal covers the entire entry */
+                if (start <= e->start && end >= entry_end) {
+                        memmove(e, e + 1, (range->n_entries - i - 1) * sizeof(UIDRangeEntry));
+                        range->n_entries--;
+                        i--;
+                        continue;
+                }
+
+                /* Removal trims the start of the entry */
+                if (start <= e->start && end > e->start) {
+                        e->nr = entry_end - end;
+                        e->start = end;
+                        continue;
+                }
+
+                /* Removal trims the end of the entry */
+                if (start < entry_end && end >= entry_end) {
+                        e->nr = start - e->start;
+                        continue;
+                }
+        }
+
+        return 0;
+}
+
+int uid_range_translate(const UIDRange *outside, const UIDRange *inside, uid_t uid, uid_t *ret) {
+        assert(uid_range_entries(outside) == uid_range_entries(inside));
+        assert(ret);
+
+        /* Given two UID ranges that represent the outside UID range of a user namespace (the 2nd and 3rd
+         * columns in /proc/xxx/uid_map) and the inside UID range of a user namespace (the 1st and 3rd
+         * columns in /proc/xxx/uid_map), translates the given UID from the outside range to the inside
+         * range. For example, given the following UID range:
+         *
+         * 0 1000 1
+         *
+         * calling uid_range_translate(outside, inside, 1000) will return 0 as the output UID. Alternatively,
+         * calling uid_range_translate(inside, outside, 0) will return 1000 as the output UID.
+         */
+
+        for (size_t i = 0; i < uid_range_entries(outside); i++)
+                assert(outside->entries[i].nr == inside->entries[i].nr);
+
+        for (size_t i = 0; i < uid_range_entries(outside); i++) {
+                const UIDRangeEntry *e = outside->entries + i;
+
+                if (uid < e->start || uid >= e->start + e->nr)
+                        continue;
+
+                *ret = inside->entries[i].start + uid - e->start;
+                return 0;
+        }
+
+        return -ESRCH;
+}
+
 bool uid_range_equal(const UIDRange *a, const UIDRange *b) {
         if (a == b)
                 return true;
index c28b02fa7d16a7e33616473e7205429a2312a205..a15a2a8e4f969d49eecacd55255cd668af3c7c9d 100644 (file)
@@ -19,7 +19,10 @@ int uid_range_add_internal(UIDRange **range, uid_t start, uid_t nr, bool coalesc
 static inline int uid_range_add(UIDRange **range, uid_t start, uid_t nr) {
         return uid_range_add_internal(range, start, nr, true);
 }
-int uid_range_add_str(UIDRange **range, const char *s);
+int uid_range_add_str_full(UIDRange **range, const char *s, bool coalesce);
+static inline int uid_range_add_str(UIDRange **range, const char *s) {
+        return uid_range_add_str_full(range, s, true);
+}
 
 int uid_range_next_lower(const UIDRange *range, uid_t *uid);
 
@@ -48,11 +51,23 @@ typedef enum UIDRangeUsernsMode {
         _UID_RANGE_USERNS_MODE_INVALID = -EINVAL,
 } UIDRangeUsernsMode;
 
-int uid_range_load_userns(const char *path, UIDRangeUsernsMode mode, UIDRange **ret);
-int uid_range_load_userns_by_fd(int userns_fd, UIDRangeUsernsMode mode, UIDRange **ret);
+int uid_range_load_userns_full(const char *path, UIDRangeUsernsMode mode, bool coalesce, UIDRange **ret);
+static inline int uid_range_load_userns(const char *path, UIDRangeUsernsMode mode, UIDRange **ret) {
+        return uid_range_load_userns_full(path, mode, true, ret);
+}
+int uid_range_load_userns_by_fd_full(int userns_fd, UIDRangeUsernsMode mode, bool coalesce, UIDRange **ret);
+static inline int uid_range_load_userns_by_fd(int userns_fd, UIDRangeUsernsMode mode, UIDRange **ret) {
+        return uid_range_load_userns_by_fd_full(userns_fd, mode, true, ret);
+}
 
 bool uid_range_overlaps(const UIDRange *range, uid_t start, uid_t nr);
 
+int uid_range_clip(UIDRange *range, uid_t min, uid_t max);
+int uid_range_partition(UIDRange *range, uid_t size);
+int uid_range_copy(const UIDRange *range, UIDRange **ret);
+int uid_range_remove(UIDRange *range, uid_t start, uid_t size);
+int uid_range_translate(const UIDRange *outside, const UIDRange *inside, uid_t uid, uid_t *ret);
+
 int uid_map_search_root(pid_t pid, UIDRangeUsernsMode mode, uid_t *ret);
 
 uid_t uid_range_base(const UIDRange *range);
index 60d3a01ce06e4c44b2c85ad47c7948ef3557ebe3..abb50955081cf6ad3164d48afddb51671b5db6cb 100644 (file)
@@ -34,6 +34,7 @@
 #include "mountpoint-util.h"
 #include "namespace-util.h"
 #include "netlink-util.h"
+#include "nsresource.h"
 #include "pidref.h"
 #include "process-util.h"
 #include "random-util.h"
@@ -357,16 +358,19 @@ static int vl_method_get_memberships(sd_varlink *link, sd_json_variant *paramete
         return sd_varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
 }
 
-static int uid_is_available(
-                int registry_dir_fd,
-                uid_t candidate) {
-
+static int uid_is_available(int registry_dir_fd, uid_t candidate, int parent_userns_fd) {
         int r;
 
         assert(registry_dir_fd >= 0);
 
         log_debug("Checking if UID " UID_FMT " is available.", candidate);
 
+        uint64_t parent_userns_inode = 0;
+        struct stat parent_st;
+        if (fstat(parent_userns_fd, &parent_st) < 0)
+                return log_debug_errno(errno, "Failed to fstat parent user namespace: %m");
+        parent_userns_inode = parent_st.st_ino;
+
         r = userns_registry_uid_exists(registry_dir_fd, candidate);
         if (r < 0)
                 return r;
@@ -379,17 +383,65 @@ static int uid_is_available(
         if (r > 0)
                 return false;
 
-        r = userdb_by_uid(candidate, /* match= */ NULL, USERDB_AVOID_MULTIPLEXER, /* ret= */ NULL);
-        if (r >= 0)
-                return false;
-        if (r != -ESRCH)
+        /* Also check delegation files. If parent_userns_inode is set and matches the delegation's userns
+         * inode, the UID is available because the parent owns that delegation. */
+        r = userns_registry_delegation_uid_exists(registry_dir_fd, candidate);
+        if (r < 0)
                 return r;
+        if (r > 0) {
+                _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo delegation = DELEGATED_USER_NAMESPACE_INFO_NULL;
+                r = userns_registry_load_delegation_by_uid(registry_dir_fd, candidate, &delegation);
+                if (r < 0)
+                        return r;
 
-        r = groupdb_by_gid(candidate, /* match= */ NULL, USERDB_AVOID_MULTIPLEXER, /* ret= */ NULL);
-        if (r >= 0)
-                return false;
-        if (r != -ESRCH)
+                if (delegation.userns_inode != parent_userns_inode)
+                        return false;
+
+                /* The parent userns owns this delegation, so the UID is available for nested allocation */
+                log_debug("UID " UID_FMT " is delegated by parent userns inode %" PRIu64 ", available for nested allocation.",
+                          candidate, parent_userns_inode);
+        }
+
+        r = userns_registry_delegation_gid_exists(registry_dir_fd, (gid_t) candidate);
+        if (r < 0)
                 return r;
+        if (r > 0) {
+                _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo delegation = DELEGATED_USER_NAMESPACE_INFO_NULL;
+                r = userns_registry_load_delegation_by_gid(registry_dir_fd, candidate, &delegation);
+                if (r < 0)
+                        return r;
+
+                if (delegation.userns_inode != parent_userns_inode)
+                        return false;
+
+                /* The parent userns owns this delegation, so the UID is available for nested allocation */
+                log_debug("UID " UID_FMT " is delegated by parent userns inode %" PRIu64 ", available for nested allocation.",
+                          candidate, parent_userns_inode);
+        }
+
+        r = is_our_namespace(parent_userns_fd, NAMESPACE_USER);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to check if parent user namespace is our user namespace: %m");
+
+        if (r > 0) {
+                /* Only check userdb if we're allocating from our current user namespace. userdb won't be
+                 * able to tell us anything on whether UIDs/GIDs in another user namespace are in use or not. On
+                 * top of that, for nspawn containers registered with machined's userdb implementation, it
+                 * would tell us that any ranges delegated to the container are in use (which is true in the
+                 * nsresourced user namespace, but not in the nspawn user namespace). */
+
+                r = userdb_by_uid(candidate, /* match= */ NULL, USERDB_AVOID_MULTIPLEXER, /* ret= */ NULL);
+                if (r >= 0)
+                        return false;
+                if (r != -ESRCH)
+                        return r;
+
+                r = groupdb_by_gid(candidate, /* match= */ NULL, USERDB_AVOID_MULTIPLEXER, /* ret= */ NULL);
+                if (r >= 0)
+                        return false;
+                if (r != -ESRCH)
+                        return r;
+        }
 
         log_debug("UID " UID_FMT " is available.", candidate);
 
@@ -433,57 +485,131 @@ static int name_is_available(
         return true;
 }
 
-static int allocate_now(
+static int allocate_one(
                 int registry_dir_fd,
-                UserNamespaceInfo *info,
-                int *ret_lock_fd) {
+                const char *name,
+                uint32_t size,
+                int parent_userns_fd,
+                UIDRange *candidates,
+                uid_t *ret_candidate) {
 
         static const uint8_t hash_key[16] = {
                 0xd4, 0xd7, 0x33, 0xa7, 0x4d, 0xd3, 0x42, 0xcd,
                 0xaa, 0xe9, 0x45, 0xd0, 0xfb, 0xec, 0x79, 0xee,
         };
-
-        _cleanup_(uid_range_freep) UIDRange *valid_range = NULL;
-        uid_t candidate, uidmin, uidmax, uidmask;
+        _cleanup_(uid_range_freep) UIDRange *copy = NULL;
+        uid_t candidate, uidmin, uidmax;
         unsigned n_tries = 100;
+        size_t idx;
         int r;
 
-        /* Returns the following error codes:
-         *
-         * EBUSY   â†’ all UID candidates we checked are already taken
-         * EEXIST  â†’ the name for the userns already exists
-         * EDEADLK â†’ the userns is already registered in the registry
-         */
-
         assert(registry_dir_fd >= 0);
-        assert(info);
+        assert(candidates);
+        assert(ret_candidate);
 
-        switch (info->size) {
+        switch (size) {
 
-        case 0x10000U:
+        case NSRESOURCE_UIDS_64K:
                 uidmin = CONTAINER_UID_BASE_MIN;
                 uidmax = CONTAINER_UID_BASE_MAX;
-                uidmask = (uid_t) UINT32_C(0xFFFF0000);
                 break;
 
-        case 1U:
+        case NSRESOURCE_UIDS_1:
                 uidmin = DYNAMIC_UID_MIN;
                 uidmax = DYNAMIC_UID_MAX;
-                uidmask = (uid_t) UINT32_C(0xFFFFFFFF);
                 break;
 
         default:
                 assert_not_reached();
         }
 
-        r = uid_range_load_userns(/* path= */ NULL, UID_RANGE_USERNS_INSIDE, &valid_range);
+        /* Make a copy of candidates that we can modify for the selection algorithm */
+        r = uid_range_copy(candidates, &copy);
         if (r < 0)
-                return r;
+                return log_debug_errno(r, "Failed to copy UID range: %m");
 
-        /* Check early whether we have any chance at all given our own uid range */
-        if (!uid_range_overlaps(valid_range, uidmin, uidmax))
+        /* Clip the copy with the valid UID range for this allocation size */
+        r = uid_range_clip(copy, uidmin, uidmax);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to intersect UID range: %m");
+
+        /* Partition entries into entries of exactly the right size */
+        r = uid_range_partition(copy, size);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to partition UID ranges: %m");
+
+        if (uid_range_is_empty(copy))
                 return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN), "Relevant UID range not delegated, can't allocate.");
 
+        log_debug("Partitioned UID range into %zu entries of size %" PRIu32, copy->n_entries, size);
+
+        /* Start from a hash of the input name if we have one, use random values afterwards. */
+        idx = name ? siphash24_string(name, hash_key) : random_u32();
+        for (;; idx = random_u32()) {
+                if (uid_range_is_empty(copy))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "All candidate UIDs already taken.");
+
+                if (--n_tries <= 0)
+                        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "Try limit hit, no UIDs available.");
+
+                idx %= copy->n_entries;
+
+                candidate = copy->entries[idx].start;
+
+                /* We only check the base UID for each range. Pass the parent userns inode so that
+                 * allocating from a delegated range owned by the parent is allowed. */
+                r = uid_is_available(registry_dir_fd, candidate, parent_userns_fd);
+                if (r < 0)
+                        return log_debug_errno(r, "Can't determine if UID range " UID_FMT " is available: %m", candidate);
+                if (r > 0)
+                        break;
+
+                log_debug("UID range " UID_FMT " already taken.", candidate);
+
+                /* Remove this unavailable range from candidates so we don't try it again */
+                r = uid_range_remove(copy, candidate, size);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to remove unavailable range from candidates: %m");
+        }
+
+        /* Remove the allocated range from the original candidates */
+        r = uid_range_remove(candidates, candidate, size);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to remove allocated range from candidates: %m");
+
+        *ret_candidate = candidate;
+
+        log_debug("Allocating UID range " UID_FMT "…" UID_FMT, candidate, candidate + size - 1);
+
+        return 0;
+}
+
+static int allocate_now(
+                int registry_dir_fd,
+                int userns_fd,
+                int parent_userns_fd,
+                UserNamespaceInfo *info,
+                int *ret_lock_fd) {
+
+        _cleanup_(uid_range_freep) UIDRange *candidates = NULL;
+        uid_t candidate;
+        int r;
+
+        /* Returns the following error codes:
+         *
+         * EBUSY   â†’ all UID candidates we checked are already taken
+         * EEXIST  â†’ the name for the userns already exists
+         * EDEADLK â†’ the userns is already registered in the registry
+         */
+
+        assert(registry_dir_fd >= 0);
+        assert(userns_fd >= 0);
+        assert(info);
+
+        r = uid_range_load_userns_by_fd(parent_userns_fd, UID_RANGE_USERNS_INSIDE, &candidates);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read userns UID range: %m");
+
         _cleanup_close_ int lock_fd = -EBADF;
         lock_fd = userns_registry_lock(registry_dir_fd);
         if (lock_fd < 0)
@@ -508,45 +634,74 @@ static int allocate_now(
         if (r == 0)
                 return -EEXIST;
 
-        for (candidate = siphash24_string(info->name, hash_key) & UINT32_MAX;; /* Start from a hash of the input name */
-             candidate = random_u32()) {                                 /* Use random values afterwards */
+        r = allocate_one(
+                        registry_dir_fd,
+                        info->name, info->size,
+                        parent_userns_fd,
+                        candidates,
+                        &candidate);
+        if (r < 0)
+                return r;
 
-                if (--n_tries <= 0)
-                        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "Try limit hit, no UIDs available.");
+        info->start_uid = candidate;
+        info->start_gid = (gid_t) candidate;
 
-                candidate = (candidate % (uidmax - uidmin)) + uidmin;
-                candidate &= uidmask;
+        /* Now allocate delegated ranges if requested */
+        if (info->n_delegates > 0) {
+                assert(info->delegates);
 
-                if (!uid_range_covers(valid_range, candidate, info->size))
-                        continue;
+                FOREACH_ARRAY(delegate, info->delegates, info->n_delegates) {
+                        r = allocate_one(
+                                        registry_dir_fd,
+                                        /* name= */ NULL,
+                                        delegate->size,
+                                        parent_userns_fd,
+                                        candidates,
+                                        &candidate);
+                        if (r < 0)
+                                return r;
 
-                /* We only check the base UID for each range (!) */
-                r = uid_is_available(registry_dir_fd, candidate);
-                if (r < 0)
-                        return log_debug_errno(r, "Can't determine if UID range " UID_FMT " is available: %m", candidate);
-                if (r > 0) {
-                        info->start_uid = candidate;
-                        info->start_gid = (gid_t) candidate;
+                        delegate->userns_inode = info->userns_inode;
+                        delegate->start_uid = candidate;
+                        delegate->start_gid = (gid_t) candidate;
+                }
+        }
 
-                        log_debug("Allocating UID range " UID_FMT "…" UID_FMT, candidate, candidate + info->size - 1);
+        if (ret_lock_fd)
+                *ret_lock_fd = TAKE_FD(lock_fd);
 
-                        if (ret_lock_fd)
-                                *ret_lock_fd = TAKE_FD(lock_fd);
+        return 0;
+}
 
-                        return 0;
-                }
+static int write_userns_mappings(PidRef *pidref, const char *uidmap, const char *gidmap) {
+        const char *pmap;
+        int r;
 
-                log_debug("UID range " UID_FMT " already taken.", candidate);
-        }
+        assert(pidref);
+        assert(uidmap);
+        assert(gidmap);
+
+        pmap = procfs_file_alloca(pidref->pid, "uid_map");
+        r = write_string_file(pmap, uidmap, /* flags= */ 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write 'uid_map' file of user namespace: %m");
+
+        pmap = procfs_file_alloca(pidref->pid, "gid_map");
+        r = write_string_file(pmap, gidmap, /* flags= */ 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write 'gid_map' file of user namespace: %m");
+
+        return 0;
 }
 
-static int write_userns(int usernsfd, const UserNamespaceInfo *userns_info) {
+static int write_userns(int userns_fd, int parent_userns_fd, const UserNamespaceInfo *userns_info) {
         _cleanup_(pidref_done_sigkill_wait) PidRef pidref = PIDREF_NULL;
         _cleanup_close_ int efd = -EBADF;
         uint64_t u;
         int r;
 
-        assert(usernsfd >= 0);
+        assert(userns_fd >= 0);
+        assert(parent_userns_fd >= 0);
         assert(userns_info);
         assert(uid_is_valid(userns_info->target_uid));
         assert(uid_is_valid(userns_info->start_uid));
@@ -566,7 +721,7 @@ static int write_userns(int usernsfd, const UserNamespaceInfo *userns_info) {
         if (r == 0) {
                 /* child */
 
-                if (setns(usernsfd, CLONE_NEWUSER) < 0) {
+                if (setns(userns_fd, CLONE_NEWUSER) < 0) {
                         log_error_errno(errno, "Failed to join user namespace: %m");
                         goto child_fail;
                 }
@@ -588,22 +743,135 @@ static int write_userns(int usernsfd, const UserNamespaceInfo *userns_info) {
 
         /* Now write mapping */
 
-        _cleanup_free_ char *pmap = NULL;
+        _cleanup_(uid_range_freep) UIDRange *outside_range = NULL;
+        r = uid_range_load_userns_by_fd_full(parent_userns_fd, UID_RANGE_USERNS_OUTSIDE, /* coalesce= */ false, &outside_range);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read userns UID range: %m");
+
+        _cleanup_(uid_range_freep) UIDRange *inside_range = NULL;
+        r = uid_range_load_userns_by_fd_full(parent_userns_fd, UID_RANGE_USERNS_INSIDE, /* coalesce= */ false, &inside_range);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read userns UID range: %m");
+
+        uid_t start_uid;
+        r = uid_range_translate(outside_range, inside_range, userns_info->start_uid, &start_uid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to translate UID "UID_FMT" to parent userns: %m", userns_info->start_uid);
 
-        if (asprintf(&pmap, "/proc/" PID_FMT "/uid_map", pidref.pid) < 0)
+        /* Let's enforce that the transient UID/GID ranges are mapped 1:1 in the parent user namespace, to
+         * avoid any weird mapping shenanigans that might happen otherwise. */
+
+        if (start_uid != userns_info->start_uid)
+                return log_debug_errno(
+                        SYNTHETIC_ERRNO(ERANGE),
+                        "Transient UID range not mapped 1:1 in parent userns ("UID_FMT" -> "UID_FMT")",
+                        userns_info->start_uid, start_uid);
+
+        /* Build uid_map content: primary mapping + delegated mappings (1:1) */
+        _cleanup_free_ char *uidmap = NULL;
+        if (asprintf(&uidmap, UID_FMT " " UID_FMT " %" PRIu32 "\n",
+                     userns_info->target_uid, start_uid, userns_info->size) < 0)
                 return log_oom();
 
-        r = write_string_filef(pmap, 0, UID_FMT " " UID_FMT " %" PRIu32 "\n", userns_info->target_uid, userns_info->start_uid, userns_info->size);
+        log_debug("UID mapping: " UID_FMT " " UID_FMT " %" PRIu32,
+                  userns_info->target_uid, userns_info->start_uid, userns_info->size);
+
+        FOREACH_ARRAY(delegate, userns_info->delegates, userns_info->n_delegates) {
+                r = uid_range_translate(outside_range, inside_range, delegate->start_uid, &start_uid);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to translate UID "UID_FMT" to parent userns: %m", delegate->start_uid);
+
+                if (start_uid != delegate->start_uid)
+                        return log_debug_errno(
+                                SYNTHETIC_ERRNO(ERANGE),
+                                "Delegated transient UID range not mapped 1:1 in parent userns ("UID_FMT" -> "UID_FMT")",
+                                delegate->start_uid, start_uid);
+
+                if (strextendf(&uidmap,
+                               UID_FMT " " UID_FMT " %" PRIu32 "\n",
+                               delegate->start_uid,
+                               start_uid,
+                               delegate->size) < 0)
+                        return log_oom();
+
+                log_debug("UID mapping: " UID_FMT " " UID_FMT " %" PRIu32,
+                          delegate->start_uid, start_uid, delegate->size);
+        }
+
+        outside_range = uid_range_free(outside_range);
+        inside_range = uid_range_free(inside_range);
+
+        r = uid_range_load_userns_by_fd_full(parent_userns_fd, GID_RANGE_USERNS_OUTSIDE, /* coalesce= */ false, &outside_range);
         if (r < 0)
-                return log_error_errno(r, "Failed to write 'uid_map' file of user namespace: %m");
+                return log_debug_errno(r, "Failed to read userns GID range: %m");
 
-        pmap = mfree(pmap);
-        if (asprintf(&pmap, "/proc/" PID_FMT "/gid_map", pidref.pid) < 0)
+        r = uid_range_load_userns_by_fd_full(parent_userns_fd, GID_RANGE_USERNS_INSIDE, /* coalesce= */ false, &inside_range);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read userns GID range: %m");
+
+        gid_t start_gid;
+        r = uid_range_translate(outside_range, inside_range, userns_info->start_gid, &start_gid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to translate GID "GID_FMT" to parent userns: %m", userns_info->start_gid);
+
+        if (start_gid != userns_info->start_gid)
+                return log_debug_errno(
+                        SYNTHETIC_ERRNO(ERANGE),
+                        "Transient GID range not mapped 1:1 in parent userns ("GID_FMT" -> "GID_FMT")",
+                        userns_info->start_gid, start_gid);
+
+        _cleanup_free_ char *gidmap = NULL;
+        if (asprintf(&gidmap, GID_FMT " " GID_FMT " %" PRIu32 "\n",
+                     userns_info->target_gid, start_gid, userns_info->size) < 0)
                 return log_oom();
 
-        r = write_string_filef(pmap, 0, GID_FMT " " GID_FMT " %" PRIu32 "\n", userns_info->target_gid, userns_info->start_gid, userns_info->size);
+        log_debug("GID mapping: " GID_FMT " " GID_FMT " %" PRIu32,
+                  userns_info->target_gid, userns_info->start_gid, userns_info->size);
+
+        FOREACH_ARRAY(delegate, userns_info->delegates, userns_info->n_delegates) {
+                r = uid_range_translate(outside_range, inside_range, delegate->start_gid, &start_gid);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to translate GID "GID_FMT" to parent userns: %m", delegate->start_gid);
+
+                if (start_gid != delegate->start_gid)
+                        return log_debug_errno(
+                                SYNTHETIC_ERRNO(ERANGE),
+                                "Delegated transient GID range not mapped 1:1 in parent userns ("GID_FMT" -> "GID_FMT")",
+                                delegate->start_gid, start_gid);
+
+                /* Delegated ranges are mapped 1:1 (inside GID == outside GID) */
+                if (strextendf(&gidmap, GID_FMT " " GID_FMT " %" PRIu32 "\n",
+                               delegate->start_gid,
+                               start_gid,
+                               delegate->size) < 0)
+                        return log_oom();
+
+                log_debug("GID mapping: " GID_FMT " " GID_FMT " %" PRIu32,
+                          delegate->start_gid, start_gid, delegate->size);
+        }
+
+        r = is_our_namespace(parent_userns_fd, NAMESPACE_USER);
         if (r < 0)
-                return log_error_errno(r, "Failed to write 'gid_map' file of user namespace: %m");
+                return log_debug_errno(r, "Failed to check if parent user namespace refers to our own user namespace: %m");
+        if (r > 0)
+                return write_userns_mappings(&pidref, uidmap, gidmap);
+
+        /* The kernel is paranoid that the uid_map and gid_map files are written either from the user
+         * namespace itself or its parent user namespace, so we have to join the parent user namespace to
+         * write the files. */
+
+        r = pidref_safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_LOG|FORK_WAIT, /* ret= */ NULL);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                if (setns(parent_userns_fd, CLONE_NEWUSER) < 0) {
+                        log_error_errno(errno, "Failed to join parent user namespace: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                r = write_userns_mappings(&pidref, uidmap, gidmap);
+                _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
+        }
 
         /* We are done! */
 
@@ -840,16 +1108,18 @@ typedef struct AllocateParameters {
         uid_t target;
         unsigned userns_fd_idx;
         bool mangle_name;
+        uint32_t delegate_container_ranges;
 } AllocateParameters;
 
 static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) {
 
         static const sd_json_dispatch_field dispatch_table[] = {
-                { "name",                        SD_JSON_VARIANT_STRING,        sd_json_dispatch_const_string, offsetof(AllocateParameters, name),          SD_JSON_MANDATORY },
-                { "size",                        _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32,       offsetof(AllocateParameters, size),          SD_JSON_MANDATORY },
-                { "target",                      _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uid_gid,      offsetof(AllocateParameters, target),        0                 },
-                { "userNamespaceFileDescriptor", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint,         offsetof(AllocateParameters, userns_fd_idx), SD_JSON_MANDATORY },
-                { "mangleName",                  SD_JSON_VARIANT_BOOLEAN,       sd_json_dispatch_stdbool,      offsetof(AllocateParameters, mangle_name),   0                 },
+                { "name",                        SD_JSON_VARIANT_STRING,        sd_json_dispatch_const_string, offsetof(AllocateParameters, name),                      SD_JSON_MANDATORY },
+                { "size",                        _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32,       offsetof(AllocateParameters, size),                      SD_JSON_MANDATORY },
+                { "target",                      _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uid_gid,      offsetof(AllocateParameters, target),                    0                 },
+                { "userNamespaceFileDescriptor", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint,         offsetof(AllocateParameters, userns_fd_idx),             SD_JSON_MANDATORY },
+                { "mangleName",                  SD_JSON_VARIANT_BOOLEAN,       sd_json_dispatch_stdbool,      offsetof(AllocateParameters, mangle_name),               0                 },
+                { "delegateContainerRanges",     _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32,       offsetof(AllocateParameters, delegate_container_ranges), 0                 },
                 {}
         };
 
@@ -883,6 +1153,9 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para
         if (r != 0)
                 return r;
 
+        if (p.delegate_container_ranges > USER_NAMESPACE_DELEGATIONS_MAX)
+                return sd_varlink_error(link, "io.systemd.NamespaceResource.TooManyDelegations", NULL);
+
         userns_fd = sd_varlink_peek_dup_fd(link, p.userns_fd_idx);
         if (userns_fd < 0)
                 return log_debug_errno(userns_fd, "Failed to take user namespace fd from Varlink connection: %m");
@@ -898,6 +1171,10 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para
         if (fstat(userns_fd, &userns_st) < 0)
                 return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m");
 
+        _cleanup_close_ int parent_userns_fd = ioctl(userns_fd, NS_GET_PARENT);
+        if (parent_userns_fd < 0)
+                return log_debug_errno(errno, "Failed to get parent user namespace: %m");
+
         r = sd_varlink_get_peer_uid(link, &peer_uid);
         if (r < 0)
                 return r;
@@ -942,7 +1219,21 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para
         userns_info->target_uid = p.target;
         userns_info->target_gid = (gid_t) p.target;
 
-        r = allocate_now(registry_dir_fd, userns_info, &lock_fd);
+        /* Set up delegation arrays if requested */
+        if (p.delegate_container_ranges > 0) {
+                userns_info->delegates = new0(DelegatedUserNamespaceInfo, p.delegate_container_ranges);
+                if (!userns_info->delegates)
+                        return -ENOMEM;
+
+                FOREACH_ARRAY(delegate, userns_info->delegates, p.delegate_container_ranges) {
+                        *delegate = DELEGATED_USER_NAMESPACE_INFO_NULL;
+                        delegate->size = NSRESOURCE_UIDS_64K;
+                }
+
+                userns_info->n_delegates = p.delegate_container_ranges;
+        }
+
+        r = allocate_now(registry_dir_fd, userns_fd, parent_userns_fd, userns_info, &lock_fd);
         if (r == -EHOSTDOWN) /* The needed UID range is not delegated to us */
                 return sd_varlink_error(link, "io.systemd.NamespaceResource.DynamicRangeUnavailable", NULL);
         if (r == -EBUSY)     /* All used up */
@@ -968,7 +1259,7 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para
         if (r < 0)
                 goto fail;
 
-        r = write_userns(userns_fd, userns_info);
+        r = write_userns(userns_fd, parent_userns_fd, userns_info);
         if (r < 0)
                 goto fail;
 
index 97a222cfc40529a17e2690d0935d904eec071ea8..a728c7fde9f8030390921867cc1ea57a3e0270e0 100644 (file)
@@ -56,6 +56,23 @@ int userns_registry_lock(int dir_fd) {
         return TAKE_FD(lock_fd);
 }
 
+void delegated_userns_info_done(DelegatedUserNamespaceInfo *info) {
+        if (!info)
+                return;
+
+        info->ancestor_userns = mfree(info->ancestor_userns);
+        info->n_ancestor_userns = 0;
+}
+
+void delegated_userns_info_done_many(DelegatedUserNamespaceInfo infos[], size_t n) {
+        assert(infos || n == 0);
+
+        FOREACH_ARRAY(info, infos, n)
+                delegated_userns_info_done(info);
+
+        free(infos);
+}
+
 UserNamespaceInfo* userns_info_new(void) {
         UserNamespaceInfo *info = new(UserNamespaceInfo, 1);
         if (!info)
@@ -79,6 +96,8 @@ UserNamespaceInfo *userns_info_free(UserNamespaceInfo *userns) {
         free(userns->cgroups);
         free(userns->name);
 
+        delegated_userns_info_done_many(userns->delegates, userns->n_delegates);
+
         strv_free(userns->netifs);
 
         return mfree(userns);
@@ -128,6 +147,100 @@ static int dispatch_cgroups_array(const char *name, sd_json_variant *variant, sd
         return 0;
 }
 
+static int dispatch_delegates_array(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) {
+        UserNamespaceInfo *info = ASSERT_PTR(userdata);
+        DelegatedUserNamespaceInfo *delegates = NULL;
+        size_t n = 0;
+        int r;
+
+        CLEANUP_ARRAY(delegates, n, delegated_userns_info_done_many);
+
+        if (sd_json_variant_is_null(variant)) {
+                delegated_userns_info_done_many(info->delegates, info->n_delegates);
+                info->delegates = NULL;
+                info->n_delegates = 0;
+                return 0;
+        }
+
+        if (!sd_json_variant_is_array(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name));
+
+        size_t elements = sd_json_variant_elements(variant);
+        if (elements > USER_NAMESPACE_DELEGATIONS_MAX)
+                return json_log(variant, flags, SYNTHETIC_ERRNO(E2BIG), "Too many delegations.");
+
+        delegates = new(DelegatedUserNamespaceInfo, elements);
+        if (!delegates)
+                return json_log_oom(variant, flags);
+
+        sd_json_variant *e;
+        JSON_VARIANT_ARRAY_FOREACH(e, variant) {
+                static const sd_json_dispatch_field delegate_dispatch_table[] = {
+                        { "userns",   SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint64,  offsetof(DelegatedUserNamespaceInfo, userns_inode), 0                 },
+                        { "start",    SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(DelegatedUserNamespaceInfo, start_uid),    SD_JSON_MANDATORY },
+                        { "startGid", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(DelegatedUserNamespaceInfo, start_gid),    SD_JSON_MANDATORY },
+                        { "size",     SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint32,  offsetof(DelegatedUserNamespaceInfo, size),         SD_JSON_MANDATORY },
+                        {}
+                };
+
+                delegates[n] = DELEGATED_USER_NAMESPACE_INFO_NULL;
+
+                r = sd_json_dispatch(e, delegate_dispatch_table, flags, &delegates[n]);
+                if (r < 0)
+                        return r;
+
+                if (!uid_is_valid(delegates[n].start_uid) || !gid_is_valid(delegates[n].start_gid))
+                        return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "Invalid delegate UID/GID.");
+
+                if (delegates[n].size == 0)
+                        return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "Invalid delegate size.");
+
+                n++;
+        }
+
+        delegated_userns_info_done_many(info->delegates, info->n_delegates);
+        info->delegates = TAKE_PTR(delegates);
+        info->n_delegates = n;
+
+        return 0;
+}
+
+static int dispatch_ancestor_userns_array(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) {
+        DelegatedUserNamespaceInfo *info = ASSERT_PTR(userdata);
+        _cleanup_free_ uint64_t *ancestor_userns = NULL;
+        size_t n = 0;
+
+        if (sd_json_variant_is_null(variant)) {
+                info->ancestor_userns = mfree(info->ancestor_userns);
+                info->n_ancestor_userns = 0;
+                return 0;
+        }
+
+        if (!sd_json_variant_is_array(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name));
+
+        ancestor_userns = new(uint64_t, sd_json_variant_elements(variant));
+        if (!ancestor_userns)
+                return json_log_oom(variant, flags);
+
+        sd_json_variant *e;
+        JSON_VARIANT_ARRAY_FOREACH(e, variant) {
+                if (!sd_json_variant_is_unsigned(e))
+                        return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not an unsigned integer.");
+
+                uint64_t v = sd_json_variant_unsigned(e);
+                if (v == 0)
+                        return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "Invalid ancestor userns inode 0.");
+
+                ancestor_userns[n++] = v;
+        }
+
+        free_and_replace(info->ancestor_userns, ancestor_userns);
+        info->n_ancestor_userns = n;
+
+        return 0;
+}
+
 static int userns_registry_load(int dir_fd, const char *fn, UserNamespaceInfo **ret) {
 
         static const sd_json_dispatch_field dispatch_table[] = {
@@ -141,6 +254,7 @@ static int userns_registry_load(int dir_fd, const char *fn, UserNamespaceInfo **
                 { "targetGid", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid,  offsetof(UserNamespaceInfo, target_gid),   0                 },
                 { "cgroups",   SD_JSON_VARIANT_ARRAY,    dispatch_cgroups_array,    0,                                         0                 },
                 { "netifs",    SD_JSON_VARIANT_ARRAY,    sd_json_dispatch_strv,     offsetof(UserNamespaceInfo, netifs),       0                 },
+                { "delegates", SD_JSON_VARIANT_ARRAY,    dispatch_delegates_array,  0,                                         0                 },
                 {}
         };
 
@@ -443,6 +557,18 @@ int userns_registry_store(int dir_fd, UserNamespaceInfo *info) {
                         return r;
         }
 
+        _cleanup_(sd_json_variant_unrefp) sd_json_variant *delegates_array = NULL;
+        FOREACH_ARRAY(delegate, info->delegates, info->n_delegates) {
+                r = sd_json_variant_append_arraybo(
+                                &delegates_array,
+                                SD_JSON_BUILD_PAIR_UNSIGNED("userns", delegate->userns_inode),
+                                SD_JSON_BUILD_PAIR_UNSIGNED("start", delegate->start_uid),
+                                SD_JSON_BUILD_PAIR_UNSIGNED("startGid", delegate->start_gid),
+                                SD_JSON_BUILD_PAIR_UNSIGNED("size", delegate->size));
+                if (r < 0)
+                        return r;
+        }
+
         _cleanup_(sd_json_variant_unrefp) sd_json_variant *def = NULL;
         r = sd_json_buildo(
                         &def,
@@ -455,7 +581,8 @@ int userns_registry_store(int dir_fd, UserNamespaceInfo *info) {
                         SD_JSON_BUILD_PAIR_CONDITION(gid_is_valid(info->start_gid), "startGid", SD_JSON_BUILD_UNSIGNED(info->start_gid)),
                         SD_JSON_BUILD_PAIR_CONDITION(gid_is_valid(info->target_gid), "targetGid", SD_JSON_BUILD_UNSIGNED(info->target_gid)),
                         SD_JSON_BUILD_PAIR_CONDITION(!!cgroup_array, "cgroups", SD_JSON_BUILD_VARIANT(cgroup_array)),
-                        JSON_BUILD_PAIR_STRV_NON_EMPTY("netifs", info->netifs));
+                        JSON_BUILD_PAIR_STRV_NON_EMPTY("netifs", info->netifs),
+                        SD_JSON_BUILD_PAIR_CONDITION(!!delegates_array, "delegates", SD_JSON_BUILD_VARIANT(delegates_array)));
         if (r < 0)
                 return r;
 
@@ -531,6 +658,82 @@ int userns_registry_store(int dir_fd, UserNamespaceInfo *info) {
                 goto fail;
         }
 
+        /* Store delegation files */
+        FOREACH_ARRAY(delegate, info->delegates, info->n_delegates) {
+                _cleanup_(sd_json_variant_unrefp) sd_json_variant *delegate_def = NULL, *ancestor_array = NULL;
+                _cleanup_free_ char *delegate_buf = NULL, *delegate_uid_fn = NULL, *delegate_gid_fn = NULL;
+
+                if (asprintf(&delegate_uid_fn, "u" UID_FMT ".delegate", delegate->start_uid) < 0) {
+                        r = log_oom_debug();
+                        goto fail;
+                }
+
+                /* Check if this delegation already exists. If so, this is a recursive
+                 * subdelegation: we need to preserve the chain of previous owners so that
+                 * ownership can be restored when the current owner goes away. */
+                _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo existing = DELEGATED_USER_NAMESPACE_INFO_NULL;
+
+                r = userns_registry_load_delegation_by_uid(dir_fd, delegate->start_uid, &existing);
+                if (r >= 0) {
+                        /* Delegation file exists — append old owner to ancestor chain */
+                        FOREACH_ARRAY(ancestor_userns, existing.ancestor_userns, existing.n_ancestor_userns) {
+                                r = sd_json_variant_append_arrayb(
+                                                &ancestor_array,
+                                                SD_JSON_BUILD_UNSIGNED(*ancestor_userns));
+                                if (r < 0)
+                                        goto fail;
+                        }
+
+                        /* userns_registry_store() is also called to update existing entries in the registry
+                         * in which case we don't need to update the ownership of the delegated UID ranges. */
+                        if (delegate->userns_inode != existing.userns_inode) {
+                                r = sd_json_variant_append_arrayb(
+                                                &ancestor_array,
+                                                SD_JSON_BUILD_UNSIGNED(existing.userns_inode));
+                                if (r < 0)
+                                        goto fail;
+                        }
+
+                } else if (r != -ENOENT) {
+                        log_debug_errno(r, "Failed to load existing delegation for UID " UID_FMT ": %m", delegate->start_uid);
+                        goto fail;
+                }
+
+                r = sd_json_buildo(
+                                &delegate_def,
+                                SD_JSON_BUILD_PAIR_UNSIGNED("userns", delegate->userns_inode),
+                                SD_JSON_BUILD_PAIR_UNSIGNED("start", delegate->start_uid),
+                                SD_JSON_BUILD_PAIR_UNSIGNED("startGid", delegate->start_gid),
+                                SD_JSON_BUILD_PAIR_UNSIGNED("size", delegate->size),
+                                SD_JSON_BUILD_PAIR_CONDITION(!!ancestor_array, "ancestorUserns", SD_JSON_BUILD_VARIANT(ancestor_array)));
+                if (r < 0)
+                        goto fail;
+
+                r = sd_json_variant_format(delegate_def, /* flags= */ 0, &delegate_buf);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to format delegation JSON object: %m");
+                        goto fail;
+                }
+
+                r = write_string_file_at(dir_fd, delegate_uid_fn, delegate_buf, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to write delegation data to '%s' in registry: %m", delegate_uid_fn);
+                        goto fail;
+                }
+
+                /* Create GID symlink pointing to the UID file */
+                if (asprintf(&delegate_gid_fn, "g" GID_FMT ".delegate", delegate->start_gid) < 0) {
+                        r = log_oom_debug();
+                        goto fail;
+                }
+
+                r = linkat_replace(dir_fd, delegate_uid_fn, dir_fd, delegate_gid_fn);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to link delegation data to '%s' in registry: %m", delegate_gid_fn);
+                        goto fail;
+                }
+        }
+
         return 0;
 
 fail:
@@ -547,6 +750,17 @@ fail:
         if (uid_fn)
                 (void) unlinkat(dir_fd, uid_fn, AT_REMOVEDIR);
 
+        /* Clean up any delegation files we created */
+        FOREACH_ARRAY(delegate, info->delegates, info->n_delegates) {
+                _cleanup_free_ char *delegate_uid_fn = NULL, *delegate_gid_fn = NULL;
+
+                if (asprintf(&delegate_uid_fn, "u" UID_FMT ".delegate", delegate->start_uid) >= 0)
+                        (void) unlinkat(dir_fd, delegate_uid_fn, /* flags= */ 0);
+
+                if (asprintf(&delegate_gid_fn, "g" GID_FMT ".delegate", delegate->start_gid) >= 0)
+                        (void) unlinkat(dir_fd, delegate_gid_fn, /* flags= */ 0);
+        }
+
         return r;
 }
 
@@ -568,14 +782,18 @@ int userns_registry_remove(int dir_fd, UserNamespaceInfo *info) {
         if (asprintf(&reg_fn, "i%" PRIu64 ".userns", info->userns_inode) < 0)
                 return log_oom_debug();
 
-        ret = RET_NERRNO(unlinkat(dir_fd, reg_fn, 0));
+        r = RET_NERRNO(unlinkat(dir_fd, reg_fn, 0));
+        if (r < 0)
+                RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", reg_fn));
 
         _cleanup_free_ char *link1_fn = NULL;
         link1_fn = strjoin("n", info->name, ".userns");
         if (!link1_fn)
                 return log_oom_debug();
 
-        RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, link1_fn, 0)));
+        r = RET_NERRNO(unlinkat(dir_fd, link1_fn, 0));
+        if (r < 0)
+                RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", link1_fn));
 
         if (uid_is_valid(info->start_uid)) {
                 _cleanup_free_ char *link2_fn = NULL;
@@ -583,7 +801,9 @@ int userns_registry_remove(int dir_fd, UserNamespaceInfo *info) {
                 if (asprintf(&link2_fn, "u" UID_FMT ".userns", info->start_uid) < 0)
                         return log_oom_debug();
 
-                RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, link2_fn, 0)));
+                r = RET_NERRNO(unlinkat(dir_fd, link2_fn, 0));
+                if (r < 0)
+                        RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", link2_fn));
         }
 
         if (uid_is_valid(info->start_gid)) {
@@ -592,7 +812,9 @@ int userns_registry_remove(int dir_fd, UserNamespaceInfo *info) {
                 if (asprintf(&link3_fn, "g" GID_FMT ".userns", info->start_gid) < 0)
                         return log_oom_debug();
 
-                RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, link3_fn, 0)));
+                r = RET_NERRNO(unlinkat(dir_fd, link3_fn, 0));
+                if (r < 0)
+                        RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", link3_fn));
         }
 
         _cleanup_free_ char *uid_fn = NULL;
@@ -603,11 +825,90 @@ int userns_registry_remove(int dir_fd, UserNamespaceInfo *info) {
         if (asprintf(&owner_fn, "%s/i%" PRIu64 ".userns", uid_fn, info->userns_inode) < 0)
                 return log_oom_debug();
 
-        RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, owner_fn, 0)));
+        r = RET_NERRNO(unlinkat(dir_fd, owner_fn, 0));
+        if (r < 0)
+                RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", owner_fn));
 
         r = RET_NERRNO(unlinkat(dir_fd, uid_fn, AT_REMOVEDIR));
-        if (r != -ENOTEMPTY)
-                RET_GATHER(ret, r);
+        if (r < 0 && r != -ENOTEMPTY)
+                RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", uid_fn));
+
+        /* Remove or restore delegation files */
+        FOREACH_ARRAY(delegate, info->delegates, info->n_delegates) {
+                /* Check if this delegation has ancestor user namespaces. If so, restore ownership to
+                 * the last ancestor instead of removing the delegation file entirely. */
+                _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo existing = DELEGATED_USER_NAMESPACE_INFO_NULL;
+
+                r = userns_registry_load_delegation_by_uid(dir_fd, delegate->start_uid, &existing);
+                if (r < 0) {
+                        log_debug_errno(r,
+                                        "Failed to load delegated UID range starting at "UID_FMT":"GID_FMT" for userns %"PRIu64": %m",
+                                        delegate->start_uid, delegate->start_gid, delegate->userns_inode);
+                        RET_GATHER(ret, r);
+                        continue;
+                }
+
+                _cleanup_free_ char *delegate_uid_fn = NULL;
+                if (asprintf(&delegate_uid_fn, "u" UID_FMT ".delegate", delegate->start_uid) < 0)
+                        return log_oom_debug();
+
+                if (existing.n_ancestor_userns > 0) {
+                        _cleanup_(sd_json_variant_unrefp) sd_json_variant *delegate_def = NULL, *ancestor_array = NULL;
+                        _cleanup_free_ char *delegate_buf = NULL;
+
+                        /* Pop the last ancestor userns inode to become the new owner */
+                        uint64_t new_owner = existing.ancestor_userns[existing.n_ancestor_userns - 1];
+
+                        log_debug("Moving ownership of delegated UID range from %"PRIu64" to %"PRIu64".",
+                                  delegate->userns_inode, new_owner);
+
+                        /* Rebuild ancestor array without the last entry */
+                        for (size_t j = 0; j + 1 < existing.n_ancestor_userns; j++) {
+                                r = sd_json_variant_append_arrayb(
+                                                &ancestor_array,
+                                                SD_JSON_BUILD_UNSIGNED(existing.ancestor_userns[j]));
+                                if (r < 0)
+                                        return log_debug_errno(r, "Failed to append to JSON array: %m");
+                        }
+
+                        r = sd_json_buildo(
+                                        &delegate_def,
+                                        SD_JSON_BUILD_PAIR_UNSIGNED("userns", new_owner),
+                                        SD_JSON_BUILD_PAIR_UNSIGNED("start", existing.start_uid),
+                                        SD_JSON_BUILD_PAIR_UNSIGNED("startGid", existing.start_gid),
+                                        SD_JSON_BUILD_PAIR_UNSIGNED("size", existing.size),
+                                        SD_JSON_BUILD_PAIR_CONDITION(!!ancestor_array, "ancestorUserns", SD_JSON_BUILD_VARIANT(ancestor_array)));
+                        if (r < 0)
+                                return log_debug_errno(r, "Failed to build delegate JSON object: %m");
+
+                        r = sd_json_variant_format(delegate_def, /* flags= */ 0, &delegate_buf);
+                        if (r < 0)
+                                return log_debug_errno(r, "Failed to format delegation JSON object: %m");
+
+                        r = write_string_file_at(dir_fd, delegate_uid_fn, delegate_buf, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC);
+                        if (r < 0)
+                                RET_GATHER(ret, log_debug_errno(r, "Failed to write restored delegation data to '%s' in registry: %m", delegate_uid_fn));
+
+                        /* GID link already points to the UID file, no need to update it */
+                        continue;
+                }
+
+                log_debug("Removing delegated UID range starting at "UID_FMT":"GID_FMT" for userns %"PRIu64 ".",
+                          delegate->start_uid, delegate->start_gid, delegate->userns_inode);
+
+                /* No ancestor chain — just remove the delegation files */
+                r = RET_NERRNO(unlinkat(dir_fd, delegate_uid_fn, 0));
+                if (r < 0)
+                        RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", delegate_uid_fn));
+
+                _cleanup_free_ char *delegate_gid_fn = NULL;
+                if (asprintf(&delegate_gid_fn, "g" GID_FMT ".delegate", delegate->start_gid) < 0)
+                        return log_oom_debug();
+
+                r = RET_NERRNO(unlinkat(dir_fd, delegate_gid_fn, 0));
+                if (r < 0)
+                        RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", delegate_gid_fn));
+        }
 
         return ret;
 }
@@ -822,3 +1123,135 @@ int userns_registry_per_uid(int dir_fd, uid_t owner) {
 
         return n;
 }
+
+int userns_registry_delegation_uid_exists(int dir_fd, uid_t start) {
+        /* Checks whether a UID delegation record for a range starting at 'start' exists in the
+         * registry directory. Returns > 0 if it exists, 0 if it doesn't, negative errno on error. */
+
+        _cleanup_free_ char *path = NULL;
+
+        assert(dir_fd >= 0);
+
+        if (!uid_is_valid(start))
+                return -ENOENT;
+
+        /* A range starting at UID 0 is always reported as existing, hence no registry file to check. */
+        if (start == 0)
+                return true;
+
+        if (asprintf(&path, "u" UID_FMT ".delegate", start) < 0)
+                return -ENOMEM;
+
+        if (faccessat(dir_fd, path, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+                return true;
+
+        return errno == ENOENT ? false : -errno;
+}
+
+int userns_registry_delegation_gid_exists(int dir_fd, gid_t start) {
+        /* Checks whether a GID delegation record for a range starting at 'start' exists in the
+         * registry directory. Returns > 0 if it exists, 0 if it doesn't, negative errno on error. */
+
+        _cleanup_free_ char *path = NULL;
+
+        assert(dir_fd >= 0);
+
+        if (!gid_is_valid(start))
+                return -ENOENT;
+
+        /* A range starting at GID 0 is always reported as existing, hence no registry file to check. */
+        if (start == 0)
+                return true;
+
+        if (asprintf(&path, "g" GID_FMT ".delegate", start) < 0)
+                return -ENOMEM;
+
+        if (faccessat(dir_fd, path, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+                return true;
+
+        return errno == ENOENT ? false : -errno;
+}
+
+static int userns_registry_load_delegation(int dir_fd, const char *filename, DelegatedUserNamespaceInfo *ret) { /* Parses the delegation record 'filename' (relative to dir_fd, or the default registry if dir_fd < 0) into *ret. */
+
+        static const sd_json_dispatch_field dispatch_table[] = { /* Field map for the on-disk ".delegate" JSON records. */
+                { "userns",         SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint64,        offsetof(DelegatedUserNamespaceInfo, userns_inode), SD_JSON_MANDATORY },
+                { "start",          SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid,       offsetof(DelegatedUserNamespaceInfo, start_uid),    SD_JSON_MANDATORY },
+                { "startGid",       SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid,       offsetof(DelegatedUserNamespaceInfo, start_gid),    0                 },
+                { "size",           SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint32,        offsetof(DelegatedUserNamespaceInfo, size),         SD_JSON_MANDATORY },
+                { "ancestorUserns", SD_JSON_VARIANT_ARRAY,    dispatch_ancestor_userns_array, 0,                                                  0                 },
+                {}
+        };
+
+        _cleanup_(sd_json_variant_unrefp) sd_json_variant *v = NULL;
+        _cleanup_close_ int registry_fd = -EBADF;
+        int r;
+
+        if (dir_fd < 0) { /* No directory fd supplied → fall back to opening the default registry directory. */
+                registry_fd = userns_registry_open_fd();
+                if (registry_fd < 0)
+                        return registry_fd;
+
+                dir_fd = registry_fd;
+        }
+
+        r = sd_json_parse_file_at(/* f= */ NULL, dir_fd, filename, /* flags= */ 0, &v, /* reterr_line= */ NULL, /* reterr_column= */ NULL);
+        if (r < 0)
+                return r;
+
+        _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo data = DELEGATED_USER_NAMESPACE_INFO_NULL;
+
+        r = sd_json_dispatch(v, dispatch_table, /* flags= */ 0, &data);
+        if (r < 0)
+                return r;
+
+        if (data.userns_inode == 0) /* Reject obviously bogus records: inode 0 cannot refer to a real userns... */
+                return -EBADMSG;
+        if (data.size == 0) /* ...and an empty range makes no sense as a delegation. */
+                return -EBADMSG;
+
+        if (ret)
+                *ret = TAKE_GENERIC(data, DelegatedUserNamespaceInfo, DELEGATED_USER_NAMESPACE_INFO_NULL); /* Hands ownership of the ancestor array over to the caller. */
+
+        return 0;
+}
+
+int userns_registry_load_delegation_by_uid(int dir_fd, uid_t start, DelegatedUserNamespaceInfo *ret) {
+        /* Loads the delegation record whose UID range starts at 'start'. Returns -ENOENT if 'start'
+         * is invalid or no record exists, -EBADMSG if the record disagrees with its file name. */
+
+        _cleanup_free_ char *path = NULL;
+        int r;
+
+        if (!uid_is_valid(start))
+                return -ENOENT;
+
+        if (asprintf(&path, "u" UID_FMT ".delegate", start) < 0)
+                return -ENOMEM;
+
+        _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo info = DELEGATED_USER_NAMESPACE_INFO_NULL;
+        r = userns_registry_load_delegation(dir_fd, path, &info);
+        if (r < 0)
+                return r;
+
+        /* The record's starting UID must agree with the file name it was loaded from. */
+        if (info.start_uid != start)
+                return -EBADMSG;
+
+        if (ret)
+                *ret = TAKE_GENERIC(info, DelegatedUserNamespaceInfo, DELEGATED_USER_NAMESPACE_INFO_NULL);
+
+        return 0;
+}
+
+int userns_registry_load_delegation_by_gid(int dir_fd, gid_t start, DelegatedUserNamespaceInfo *ret) {
+        /* Loads the delegation record whose GID range starts at 'start'. Returns -ENOENT if 'start'
+         * is invalid or no record exists, -EBADMSG if the record disagrees with its file name. */
+
+        _cleanup_free_ char *fn = NULL;
+        int r;
+
+        if (!gid_is_valid(start))
+                return -ENOENT;
+
+        /* Use GID_FMT (not UID_FMT) here: the file name is of the form "g<gid>.delegate", matching
+         * how the GID delegation files are created and removed elsewhere in the registry. */
+        if (asprintf(&fn, "g" GID_FMT ".delegate", start) < 0)
+                return -ENOMEM;
+
+        _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo data = DELEGATED_USER_NAMESPACE_INFO_NULL;
+        r = userns_registry_load_delegation(dir_fd, fn, &data);
+        if (r < 0)
+                return r;
+
+        if (data.start_gid != start)
+                return -EBADMSG;
+
+        if (ret)
+                *ret = TAKE_GENERIC(data, DelegatedUserNamespaceInfo, DELEGATED_USER_NAMESPACE_INFO_NULL);
+
+        return 0;
+}
index fee2623a3b524f47c3b1542cb5eb29038692cc9c..f08b238861ae415ed4ea67d9172f0ee607a7c99d 100644 (file)
@@ -5,6 +5,26 @@
 
 #define USER_NAMESPACE_CGROUPS_DELEGATE_MAX 16U
 #define USER_NAMESPACE_NETIFS_DELEGATE_MAX 16U
+#define USER_NAMESPACE_DELEGATIONS_MAX 16U /* Upper limit on the number of ranges delegated per userns */
+
+typedef struct DelegatedUserNamespaceInfo {
+        uint64_t userns_inode;  /* inode of the userns that currently owns this delegated range */
+        uid_t start_uid;        /* first UID of the delegated range */
+        gid_t start_gid;        /* first GID of the delegated range */
+        uint32_t size;          /* number of UIDs/GIDs in the range */
+        /* We track all the previous owners of the delegation so we can restore the previous owner of each
+         * delegated range when a user namespace with delegated ranges is freed. */
+        uint64_t *ancestor_userns;
+        size_t n_ancestor_userns;
+} DelegatedUserNamespaceInfo;
+
+#define DELEGATED_USER_NAMESPACE_INFO_NULL (DelegatedUserNamespaceInfo) {       \
+        .start_uid = UID_INVALID,                                               \
+        .start_gid = GID_INVALID,                                               \
+} /* All other fields zero-initialized */
+
+void delegated_userns_info_done(DelegatedUserNamespaceInfo *info); /* Frees dynamically allocated fields (the ancestor array) */
+void delegated_userns_info_done_many(DelegatedUserNamespaceInfo infos[], size_t n); /* Like above, for an array of n entries */
 
 typedef struct UserNamespaceInfo {
         uid_t owner;
@@ -18,6 +38,8 @@ typedef struct UserNamespaceInfo {
         uint64_t *cgroups;
         size_t n_cgroups;
         char **netifs;
+        DelegatedUserNamespaceInfo *delegates;
+        size_t n_delegates;
 } UserNamespaceInfo;
 
 UserNamespaceInfo* userns_info_new(void);
@@ -51,3 +73,8 @@ int userns_registry_uid_exists(int dir_fd, uid_t start);
 int userns_registry_gid_exists(int dir_fd, gid_t start);
 
 int userns_registry_per_uid(int dir_fd, uid_t owner);
+
+int userns_registry_delegation_uid_exists(int dir_fd, uid_t start);
+int userns_registry_delegation_gid_exists(int dir_fd, gid_t start);
+int userns_registry_load_delegation_by_uid(int dir_fd, uid_t start, DelegatedUserNamespaceInfo *ret);
+int userns_registry_load_delegation_by_gid(int dir_fd, gid_t start, DelegatedUserNamespaceInfo *ret);
index 03bfc41134712af9e9f03f923cc60a6ca70b228d..7d5f5093224f480b107a318d544b86109c18ebf1 100644 (file)
@@ -14,6 +14,8 @@ static SD_VARLINK_DEFINE_METHOD(
                 SD_VARLINK_DEFINE_INPUT(target, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
                 SD_VARLINK_FIELD_COMMENT("A file descriptor to an allocated userns with no current UID range assignments"),
                 SD_VARLINK_DEFINE_INPUT(userNamespaceFileDescriptor, SD_VARLINK_INT, 0),
+                SD_VARLINK_FIELD_COMMENT("Number of transient 64K container UID/GID ranges to delegate. These are mapped 1:1 into the user namespace and can be used by nested user namespaces for container workloads. Must be between 0 and 16. Defaults to 0."),
+                SD_VARLINK_DEFINE_INPUT(delegateContainerRanges, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
                 SD_VARLINK_FIELD_COMMENT("The name assigned to the user namespace. (This is particularly interesting in case mangleName was enabled)."),
                 SD_VARLINK_DEFINE_OUTPUT(name, SD_VARLINK_STRING, SD_VARLINK_NULLABLE));
 
@@ -69,6 +71,7 @@ static SD_VARLINK_DEFINE_ERROR(UserNamespaceWithoutUserRange);
 static SD_VARLINK_DEFINE_ERROR(TooManyControlGroups);
 static SD_VARLINK_DEFINE_ERROR(ControlGroupAlreadyAdded);
 static SD_VARLINK_DEFINE_ERROR(TooManyNetworkInterfaces);
+static SD_VARLINK_DEFINE_ERROR(TooManyDelegations);
 
 SD_VARLINK_DEFINE_INTERFACE(
                 io_systemd_NamespaceResource,
@@ -103,4 +106,6 @@ SD_VARLINK_DEFINE_INTERFACE(
                 SD_VARLINK_SYMBOL_COMMENT("The specified cgroup has already been added to the user namespace."),
                 &vl_error_ControlGroupAlreadyAdded,
                 SD_VARLINK_SYMBOL_COMMENT("The per-user namespace limit of network interfaces has been reached."),
-                &vl_error_TooManyNetworkInterfaces);
+                &vl_error_TooManyNetworkInterfaces,
+                SD_VARLINK_SYMBOL_COMMENT("The specified number of delegations exceeds the maximum allowed."),
+                &vl_error_TooManyDelegations);
index 69c39b057505e795ca46e15c29ba8cbcc5463a15..6eef98153ef5579e27d7c3d2d105d7ef5d0a75fd 100644 (file)
@@ -195,4 +195,306 @@ TEST(uid_range_coalesce) {
         ASSERT_EQ(p->entries[0].nr, 115U);
 }
 
+TEST(uid_range_clip) { /* uid_range_clip() intersects a UIDRange with the inclusive boundary [min, max] */
+        _cleanup_(uid_range_freep) UIDRange *p = NULL;
+
+        /* Build a range: 100-199, 300-399, 500-599 */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_add_str(&p, "300-399"));
+        ASSERT_OK(uid_range_add_str(&p, "500-599"));
+        ASSERT_EQ(uid_range_entries(p), 3U);
+
+        /* Intersect with range that covers all entries */
+        ASSERT_OK(uid_range_clip(p, 0, 1000));
+        ASSERT_EQ(uid_range_entries(p), 3U);
+        ASSERT_EQ(p->entries[0].start, 100U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+        ASSERT_EQ(p->entries[1].start, 300U);
+        ASSERT_EQ(p->entries[1].nr, 100U);
+        ASSERT_EQ(p->entries[2].start, 500U);
+        ASSERT_EQ(p->entries[2].nr, 100U);
+
+        /* Intersect with range that excludes first and last entries */
+        ASSERT_OK(uid_range_clip(p, 200, 499));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 300U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+
+        p = uid_range_free(p);
+
+        /* Test partial overlap - trimming from both sides */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_clip(p, 150, 180));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 150U);
+        ASSERT_EQ(p->entries[0].nr, 31U); /* 150..180 inclusive is 31 UIDs */
+
+        p = uid_range_free(p);
+
+        /* Test intersection that removes all entries */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_clip(p, 500, 600));
+        ASSERT_TRUE(uid_range_is_empty(p));
+
+        p = uid_range_free(p);
+
+        /* Test invalid min > max */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_ERROR(uid_range_clip(p, 200, 100), EINVAL);
+
+        p = uid_range_free(p);
+
+        /* Test with max == UINT32_MAX (should not overflow) */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_clip(p, 0, UINT32_MAX));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 100U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+
+        p = uid_range_free(p);
+
+        /* Test with both min and max at extremes */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_add_str(&p, "500-599"));
+        ASSERT_OK(uid_range_clip(p, 150, UINT32_MAX));
+        ASSERT_EQ(uid_range_entries(p), 2U);
+        ASSERT_EQ(p->entries[0].start, 150U); /* first entry trimmed at the front */
+        ASSERT_EQ(p->entries[0].nr, 50U);
+        ASSERT_EQ(p->entries[1].start, 500U); /* second entry untouched */
+        ASSERT_EQ(p->entries[1].nr, 100U);
+}
+
+TEST(uid_range_partition) { /* uid_range_partition() splits each entry into fixed-size chunks, dropping partial remainders */
+        _cleanup_(uid_range_freep) UIDRange *p = NULL;
+
+        /* Single entry that divides evenly */
+        ASSERT_OK(uid_range_add_str(&p, "0-299"));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_OK(uid_range_partition(p, 100));
+        ASSERT_EQ(uid_range_entries(p), 3U);
+        ASSERT_EQ(p->entries[0].start, 0U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+        ASSERT_EQ(p->entries[1].start, 100U);
+        ASSERT_EQ(p->entries[1].nr, 100U);
+        ASSERT_EQ(p->entries[2].start, 200U);
+        ASSERT_EQ(p->entries[2].nr, 100U);
+
+        p = uid_range_free(p);
+
+        /* Entry with remainder (gets truncated) */
+        ASSERT_OK(uid_range_add_str(&p, "0-249"));
+        ASSERT_OK(uid_range_partition(p, 100));
+        ASSERT_EQ(uid_range_entries(p), 2U); /* trailing 200-249 is smaller than 100, dropped */
+        ASSERT_EQ(p->entries[0].start, 0U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+        ASSERT_EQ(p->entries[1].start, 100U);
+        ASSERT_EQ(p->entries[1].nr, 100U);
+
+        p = uid_range_free(p);
+
+        /* Entry smaller than partition size - gets dropped */
+        ASSERT_OK(uid_range_add_str(&p, "0-49"));
+        ASSERT_OK(uid_range_partition(p, 100));
+        ASSERT_TRUE(uid_range_is_empty(p));
+
+        p = uid_range_free(p);
+
+        /* Multiple entries */
+        ASSERT_OK(uid_range_add_str(&p, "0-199"));
+        ASSERT_OK(uid_range_add_str(&p, "1000-1299"));
+        ASSERT_EQ(uid_range_entries(p), 2U);
+        ASSERT_OK(uid_range_partition(p, 100));
+        ASSERT_EQ(uid_range_entries(p), 5U); /* 2 chunks from the first entry + 3 from the second */
+        ASSERT_EQ(p->entries[0].start, 0U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+        ASSERT_EQ(p->entries[1].start, 100U);
+        ASSERT_EQ(p->entries[1].nr, 100U);
+        ASSERT_EQ(p->entries[2].start, 1000U);
+        ASSERT_EQ(p->entries[2].nr, 100U);
+        ASSERT_EQ(p->entries[3].start, 1100U);
+        ASSERT_EQ(p->entries[3].nr, 100U);
+        ASSERT_EQ(p->entries[4].start, 1200U);
+        ASSERT_EQ(p->entries[4].nr, 100U);
+
+        p = uid_range_free(p);
+
+        /* Partition size of 1 */
+        ASSERT_OK(uid_range_add_str(&p, "100-102"));
+        ASSERT_OK(uid_range_partition(p, 1));
+        ASSERT_EQ(uid_range_entries(p), 3U);
+        ASSERT_EQ(p->entries[0].start, 100U);
+        ASSERT_EQ(p->entries[0].nr, 1U);
+        ASSERT_EQ(p->entries[1].start, 101U);
+        ASSERT_EQ(p->entries[1].nr, 1U);
+        ASSERT_EQ(p->entries[2].start, 102U);
+        ASSERT_EQ(p->entries[2].nr, 1U);
+}
+
+TEST(uid_range_copy) { /* uid_range_copy() performs a deep copy, independent of the original */
+        _cleanup_(uid_range_freep) UIDRange *p = NULL, *copy = NULL;
+
+        /* Copy NULL range */
+        ASSERT_OK(uid_range_copy(NULL, &copy));
+        ASSERT_TRUE(uid_range_is_empty(copy));
+
+        copy = uid_range_free(copy);
+
+        /* Copy empty range */
+        p = new0(UIDRange, 1);
+        ASSERT_NOT_NULL(p);
+        ASSERT_OK(uid_range_copy(p, &copy));
+        ASSERT_NOT_NULL(copy);
+        ASSERT_TRUE(uid_range_is_empty(copy));
+
+        p = uid_range_free(p);
+        copy = uid_range_free(copy);
+
+        /* Copy range with entries */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_add_str(&p, "300-399"));
+        ASSERT_OK(uid_range_copy(p, &copy));
+        ASSERT_TRUE(uid_range_equal(p, copy));
+
+        /* Verify it's a deep copy - modifying original doesn't affect copy */
+        ASSERT_OK(uid_range_add_str(&p, "500-599"));
+        ASSERT_FALSE(uid_range_equal(p, copy));
+        ASSERT_EQ(uid_range_entries(copy), 2U);
+}
+
+TEST(uid_range_remove) { /* uid_range_remove() subtracts the UIDs [start, start+size) from the range */
+        _cleanup_(uid_range_freep) UIDRange *p = NULL;
+
+        /* Build a range: 100-199 */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+
+        /* Remove with size 0 - no-op */
+        ASSERT_OK(uid_range_remove(p, 150, 0));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 100U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+
+        /* Remove range that doesn't overlap - no change */
+        ASSERT_OK(uid_range_remove(p, 0, 50));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 100U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+
+        ASSERT_OK(uid_range_remove(p, 300, 50));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 100U);
+        ASSERT_EQ(p->entries[0].nr, 100U);
+
+        /* Remove from the start of the entry */
+        ASSERT_OK(uid_range_remove(p, 100, 10));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 110U);
+        ASSERT_EQ(p->entries[0].nr, 90U);
+
+        /* Remove from the end of the entry */
+        ASSERT_OK(uid_range_remove(p, 190, 10));
+        ASSERT_EQ(uid_range_entries(p), 1U);
+        ASSERT_EQ(p->entries[0].start, 110U);
+        ASSERT_EQ(p->entries[0].nr, 80U);
+
+        /* Remove from the middle - splits the entry */
+        ASSERT_OK(uid_range_remove(p, 140, 20));
+        ASSERT_EQ(uid_range_entries(p), 2U); /* 110-189 minus 140-159 leaves 110-139 and 160-189 */
+        ASSERT_EQ(p->entries[0].start, 110U);
+        ASSERT_EQ(p->entries[0].nr, 30U);
+        ASSERT_EQ(p->entries[1].start, 160U);
+        ASSERT_EQ(p->entries[1].nr, 30U);
+
+        p = uid_range_free(p);
+
+        /* Remove entire entry */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_remove(p, 100, 100));
+        ASSERT_TRUE(uid_range_is_empty(p));
+
+        p = uid_range_free(p);
+
+        /* Remove range larger than entry */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_remove(p, 50, 200));
+        ASSERT_TRUE(uid_range_is_empty(p));
+
+        p = uid_range_free(p);
+
+        /* Remove affecting multiple entries */
+        ASSERT_OK(uid_range_add_str(&p, "100-199"));
+        ASSERT_OK(uid_range_add_str(&p, "300-399"));
+        ASSERT_OK(uid_range_add_str(&p, "500-599"));
+        ASSERT_EQ(uid_range_entries(p), 3U);
+
+        /* Remove range spanning the middle entry completely and trimming others */
+        ASSERT_OK(uid_range_remove(p, 150, 400));
+        ASSERT_EQ(uid_range_entries(p), 2U); /* 150-549 removed: 300-399 gone, the others trimmed */
+        ASSERT_EQ(p->entries[0].start, 100U);
+        ASSERT_EQ(p->entries[0].nr, 50U);
+        ASSERT_EQ(p->entries[1].start, 550U);
+        ASSERT_EQ(p->entries[1].nr, 50U);
+}
+
+TEST(uid_range_translate) { /* uid_range_translate() maps a UID from one range to the positionally corresponding UID in another */
+        _cleanup_(uid_range_freep) UIDRange *o = NULL, *i = NULL;
+        uid_t uid;
+
+        ASSERT_OK(uid_range_add_str_full(&o, "200-299", /* coalesce= */ false));
+        ASSERT_OK(uid_range_add_str_full(&i, "100-199", /* coalesce= */ false));
+        ASSERT_OK(uid_range_translate(o, i, 250, &uid)); /* offset 50 in o -> offset 50 in i */
+        ASSERT_EQ(uid, 150U);
+        ASSERT_OK(uid_range_translate(i, o, 150, &uid));
+        ASSERT_EQ(uid, 250U);
+
+        ASSERT_OK(uid_range_add_str_full(&o, "300-399", /* coalesce= */ false));
+        ASSERT_OK(uid_range_add_str_full(&i, "350-449", /* coalesce= */ false));
+        ASSERT_OK(uid_range_translate(o, i, 350, &uid)); /* second entry of o maps onto second entry of i */
+        ASSERT_EQ(uid, 400U);
+        ASSERT_OK(uid_range_translate(i, o, 400, &uid));
+        ASSERT_EQ(uid, 350U);
+
+        /* Test translating at range boundaries */
+        ASSERT_OK(uid_range_translate(o, i, 200, &uid));
+        ASSERT_EQ(uid, 100U);
+        ASSERT_OK(uid_range_translate(o, i, 299, &uid));
+        ASSERT_EQ(uid, 199U);
+        ASSERT_OK(uid_range_translate(o, i, 300, &uid));
+        ASSERT_EQ(uid, 350U);
+        ASSERT_OK(uid_range_translate(o, i, 399, &uid));
+        ASSERT_EQ(uid, 449U);
+
+        /* Test reverse translation at boundaries */
+        ASSERT_OK(uid_range_translate(i, o, 100, &uid));
+        ASSERT_EQ(uid, 200U);
+        ASSERT_OK(uid_range_translate(i, o, 199, &uid));
+        ASSERT_EQ(uid, 299U);
+        ASSERT_OK(uid_range_translate(i, o, 350, &uid));
+        ASSERT_EQ(uid, 300U);
+        ASSERT_OK(uid_range_translate(i, o, 449, &uid));
+        ASSERT_EQ(uid, 399U);
+
+        /* Test UID not in any range returns ESRCH */
+        ASSERT_ERROR(uid_range_translate(o, i, 0, &uid), ESRCH);
+        ASSERT_ERROR(uid_range_translate(o, i, 199, &uid), ESRCH);
+        ASSERT_ERROR(uid_range_translate(o, i, 400, &uid), ESRCH);
+        ASSERT_ERROR(uid_range_translate(i, o, 0, &uid), ESRCH);
+        ASSERT_ERROR(uid_range_translate(i, o, 99, &uid), ESRCH);
+        ASSERT_ERROR(uid_range_translate(i, o, 200, &uid), ESRCH);
+        ASSERT_ERROR(uid_range_translate(i, o, 349, &uid), ESRCH);
+        ASSERT_ERROR(uid_range_translate(i, o, 450, &uid), ESRCH);
+
+        o = uid_range_free(o);
+        i = uid_range_free(i);
+
+        /* Test with single-element ranges */
+        ASSERT_OK(uid_range_add_str_full(&o, "1000", /* coalesce= */ false));
+        ASSERT_OK(uid_range_add_str_full(&i, "5000", /* coalesce= */ false));
+        ASSERT_OK(uid_range_translate(o, i, 1000, &uid));
+        ASSERT_EQ(uid, 5000U);
+        ASSERT_OK(uid_range_translate(i, o, 5000, &uid));
+        ASSERT_EQ(uid, 1000U);
+        ASSERT_ERROR(uid_range_translate(o, i, 999, &uid), ESRCH);
+        ASSERT_ERROR(uid_range_translate(o, i, 1001, &uid), ESRCH);
+}
+
 DEFINE_TEST_MAIN(LOG_DEBUG);
index 94f802e780d6cb3154fd5e285ed4297f2ad58d53..e5092b56868f35b9c2fe15b6b268c151baa2f815 100755 (executable)
@@ -60,6 +60,23 @@ if (SYSTEMD_LOG_TARGET=console varlinkctl call \
     exit 0
 fi
 
+# Test delegated UID ranges
+# Verify that delegated ranges show up in uid_map (3 lines: 1 primary range + 2 delegated container ranges,
+# matching the -eq 3 check below)
+test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \
+        --push-fd=/proc/self/ns/user \
+        /run/systemd/userdb/io.systemd.NamespaceResource \
+        io.systemd.NamespaceResource.AllocateUserRange \
+        '{"name":"test-delegate","size":65536,"userNamespaceFileDescriptor":0,"delegateContainerRanges":2}' \
+        -- cat /proc/self/uid_map | wc -l)" -eq 3
+
+# Test that delegateContainerRanges > 16 fails with TooManyDelegations error
+(! run0 -u testuser --pipe unshare --user varlinkctl call \
+        --push-fd=/proc/self/ns/user \
+        /run/systemd/userdb/io.systemd.NamespaceResource \
+        io.systemd.NamespaceResource.AllocateUserRange \
+        '{"name":"test-fail","size":65536,"userNamespaceFileDescriptor":0,"delegateContainerRanges":17}') |&
+            grep "io.systemd.NamespaceResource.TooManyDelegations" >/dev/null
+
 # This should work without the key
 systemd-dissect --image-policy='root=verity:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null
 systemd-dissect --image-policy='root=verity+signed:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null