From: Daan De Meyer Date: Thu, 29 Jan 2026 15:45:59 +0000 (+0100) Subject: nsresourced: Optionally map foreign UID range X-Git-Tag: v260-rc1~9^2~4 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=838528104ba79fcf99aefa0a01613cbb1a7bc44f;p=thirdparty%2Fsystemd.git nsresourced: Optionally map foreign UID range Whenever delegating UID ranges to a user namespace, it can also be useful to map the foreign UID range, so that the container running in the user namespace with delegated UID ranges can download container images and unpack them to the foreign UID range. Let's add an option mapForeign to make this possible. Note that this option gives unprivileged users full access to any foreign UID range owned directory that they can access. Hence it is recommended (and already was recommended) to store foreign UID range owned directories in a 0700 directory owned by the owner of the tree to avoid access and modifications by other users. This is already the case for the main users of the foreign UID range, namely /var/lib/machines, /var/lib/portables and /home/ which all use 0700 as their mode. Users will also be able to create foreign UID range owned inodes in any directories their own user can write to (on most systems this means /tmp, /var/tmp and /home/). --- diff --git a/docs/UIDS-GIDS.md b/docs/UIDS-GIDS.md index e475e2fd68c..3bb1a984fb8 100644 --- a/docs/UIDS-GIDS.md +++ b/docs/UIDS-GIDS.md @@ -145,8 +145,13 @@ possible. available locally whose UID/GID ownerships do not make sense in the local context but only within the OS image itself. This 64K UID range can be used to have a clearly defined ownership even on the host, that can be mapped via - idmapped mount to a dynamic runtime UID range as needed. (These numbers in - hexadecimal are 0x7FFE0000…0x7FFEFFFF.) + idmapped mount to a dynamic runtime UID range as needed. These numbers in + hexadecimal are 0x7FFE0000…0x7FFEFFFF. 
Note that all users have full access + to the foreign UID range, hence it is recommended to never make foreign UID + range owned inodes accessible in directories accessible to other users. In + other words, always make sure each foreign UID range owned inode is inside + of a directory with mode `0700` (or stricter) owned by the only user that + should have access to the foreign UID range owned inode(s). Note for the `DynamicUser=` and the `systemd-nspawn` allocation ranges: when a UID allocation takes place NSS is checked for collisions first, and a different diff --git a/man/systemd-nsresourced.service.xml b/man/systemd-nsresourced.service.xml index e9b66127164..120027aab0e 100644 --- a/man/systemd-nsresourced.service.xml +++ b/man/systemd-nsresourced.service.xml @@ -69,6 +69,10 @@ one of the delegated ranges. Identity mapped users are not subject to BPF-LSM write restrictions unlike the transient ranges. + Additionally, the allocation API supports mapping the foreign UID range into + the user namespace. When this option is enabled, the foreign UID range is mapped 1:1 into the user + namespace, allowing processes inside to access and manipulate files owned by the foreign UID range. + The service provides API calls to allowlist mounts (referenced via their mount file descriptors as per Linux fsmount() API), to pass ownership of a cgroup subtree to the user namespace and to delegate a virtual Ethernet device pair to the user namespace. 
When used in combination diff --git a/src/basic/uid-classification.h b/src/basic/uid-classification.h index a4d8f916e9f..58692f1ed2c 100644 --- a/src/basic/uid-classification.h +++ b/src/basic/uid-classification.h @@ -12,7 +12,7 @@ assert_cc((CONTAINER_UID_BASE_MAX & 0xFFFFU) == 0); #define CONTAINER_UID_MAX ((uid_t) CONTAINER_UID_BASE_MAX + 0xFFFFU) assert_cc((FOREIGN_UID_BASE & 0xFFFFU) == 0); -#define FOREIGN_UID_MIN (FOREIGN_UID_BASE) +#define FOREIGN_UID_MIN (FOREIGN_UID_BASE + 0U) #define FOREIGN_UID_MAX (FOREIGN_UID_BASE + 0xFFFFU) bool uid_is_system(uid_t uid); diff --git a/src/nsresourced/nsresourcework.c b/src/nsresourced/nsresourcework.c index 82640fac4f7..3b2450529c3 100644 --- a/src/nsresourced/nsresourcework.c +++ b/src/nsresourced/nsresourcework.c @@ -714,7 +714,12 @@ static int write_userns_mappings(PidRef *pidref, const char *uidmap, const char return 0; } -static int write_userns(int userns_fd, int parent_userns_fd, const UserNamespaceInfo *userns_info) { +static int write_userns( + int userns_fd, + int parent_userns_fd, + const UserNamespaceInfo *userns_info, + bool map_foreign) { + _cleanup_(pidref_done_sigkill_wait) PidRef pidref = PIDREF_NULL; _cleanup_close_ int efd = -EBADF; uint64_t u; @@ -818,6 +823,22 @@ static int write_userns(int userns_fd, int parent_userns_fd, const UserNamespace delegate->start_uid, start_uid, delegate->size); } + if (map_foreign) { + r = uid_range_translate(outside_range, inside_range, FOREIGN_UID_MIN, &start_uid); + if (r < 0) + return log_debug_errno(r, "Failed to translate UID "UID_FMT" to parent userns: %m", FOREIGN_UID_MIN); + + if (start_uid != FOREIGN_UID_MIN) + return log_debug_errno( + SYNTHETIC_ERRNO(ERANGE), + "Foreign UID range not mapped 1:1 in parent userns ("UID_FMT" -> "UID_FMT")", + FOREIGN_UID_MIN, start_uid); + + if (strextendf(&uidmap, UID_FMT " " UID_FMT " %" PRIu32 "\n", + FOREIGN_UID_MIN, start_uid, NSRESOURCE_UIDS_64K) < 0) + return log_oom(); + } + outside_range = 
uid_range_free(outside_range); inside_range = uid_range_free(inside_range); @@ -870,6 +891,22 @@ static int write_userns(int userns_fd, int parent_userns_fd, const UserNamespace delegate->start_gid, start_gid, delegate->size); } + if (map_foreign) { + r = uid_range_translate(outside_range, inside_range, FOREIGN_UID_MIN, &start_gid); + if (r < 0) + return log_debug_errno(r, "Failed to translate GID "GID_FMT" to parent userns: %m", FOREIGN_UID_MIN); + + if (start_gid != FOREIGN_UID_MIN) + return log_debug_errno( + SYNTHETIC_ERRNO(ERANGE), + "Foreign GID range not mapped 1:1 in parent userns ("GID_FMT" -> "GID_FMT")", + FOREIGN_UID_MIN, start_gid); + + if (strextendf(&gidmap, GID_FMT " " GID_FMT " %" PRIu32 "\n", + FOREIGN_UID_MIN, start_gid, NSRESOURCE_UIDS_64K) < 0) + return log_oom(); + } + r = is_our_namespace(parent_userns_fd, NAMESPACE_USER); if (r < 0) return log_debug_errno(r, "Failed to check if parent user namespace refers to our own user namespace: %m"); @@ -1139,6 +1176,7 @@ typedef struct AllocateParameters { unsigned userns_fd_idx; bool mangle_name; uint32_t delegate_container_ranges; + bool map_foreign; } AllocateParameters; static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) { @@ -1151,6 +1189,7 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para { "mangleName", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_stdbool, offsetof(AllocateParameters, mangle_name), 0 }, { "type", SD_JSON_VARIANT_STRING, dispatch_allocate_user_range_type, offsetof(AllocateParameters, type), 0 }, { "delegateContainerRanges", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32, offsetof(AllocateParameters, delegate_container_ranges), 0 }, + { "mapForeign", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_stdbool, offsetof(AllocateParameters, map_foreign), 0 }, {} }; @@ -1336,7 +1375,7 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant 
*para goto fail; } - r = write_userns(userns_fd, parent_userns_fd, userns_info); + r = write_userns(userns_fd, parent_userns_fd, userns_info, p.map_foreign); if (r < 0) goto fail; diff --git a/src/shared/varlink-io.systemd.NamespaceResource.c b/src/shared/varlink-io.systemd.NamespaceResource.c index 79fff1b592b..4e592e496c9 100644 --- a/src/shared/varlink-io.systemd.NamespaceResource.c +++ b/src/shared/varlink-io.systemd.NamespaceResource.c @@ -25,6 +25,8 @@ static SD_VARLINK_DEFINE_METHOD( SD_VARLINK_DEFINE_INPUT_BY_TYPE(type, AllocateUserRangeType, SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("Number of transient 64K container UID/GID ranges to delegate. These are mapped 1:1 into the user namespace and can be used by nested user namespaces for container workloads. Must be between 0 and 16. Defaults to 0."), SD_VARLINK_DEFINE_INPUT(delegateContainerRanges, SD_VARLINK_INT, SD_VARLINK_NULLABLE), + SD_VARLINK_FIELD_COMMENT("If true, map the foreign UID range 1:1 into the user namespace."), + SD_VARLINK_DEFINE_INPUT(mapForeign, SD_VARLINK_BOOL, SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("The name assigned to the user namespace. 
(This is particularly interesting in case mangleName was enabled)."), SD_VARLINK_DEFINE_OUTPUT(name, SD_VARLINK_STRING, SD_VARLINK_NULLABLE)); diff --git a/test/units/TEST-50-DISSECT.mountfsd.sh b/test/units/TEST-50-DISSECT.mountfsd.sh index c468e3b8f89..12a72f8257d 100755 --- a/test/units/TEST-50-DISSECT.mountfsd.sh +++ b/test/units/TEST-50-DISSECT.mountfsd.sh @@ -99,6 +99,24 @@ test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \ '{"name":"test-delegate3","size":65536,"userNamespaceFileDescriptor":0,"delegateContainerRanges":2}' \ -- cat /proc/self/uid_map | wc -l)" -eq 3 +# Test mapForeign parameter +# Verify that the foreign UID range is mapped into the user namespace +# When mapForeign is true, uid_map should have 2 lines: primary range + foreign range +test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \ + --push-fd=/proc/self/ns/user \ + /run/systemd/userdb/io.systemd.NamespaceResource \ + io.systemd.NamespaceResource.AllocateUserRange \ + '{"name":"test-foreign","size":65536,"userNamespaceFileDescriptor":0,"mapForeign":true}' \ + -- cat /proc/self/uid_map | wc -l)" -eq 2 + +# Verify the foreign range is mapped 1:1. +test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \ + --push-fd=/proc/self/ns/user \ + /run/systemd/userdb/io.systemd.NamespaceResource \ + io.systemd.NamespaceResource.AllocateUserRange \ + '{"name":"test-foreign2","size":65536,"userNamespaceFileDescriptor":0,"mapForeign":true}' \ + -- cat /proc/self/uid_map | grep -c 2147352576)" -eq 1 + # This should work without the key systemd-dissect --image-policy='root=verity:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null systemd-dissect --image-policy='root=verity+signed:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null