available locally whose UID/GID ownerships do not make sense in the local
context but only within the OS image itself. This 64K UID range can be used
to have a clearly defined ownership even on the host, that can be mapped via
- idmapped mount to a dynamic runtime UID range as needed. (These numbers in
- hexadecimal are 0x7FFE0000…0x7FFEFFFF.)
+ idmapped mount to a dynamic runtime UID range as needed. These numbers in
+ hexadecimal are 0x7FFE0000…0x7FFEFFFF. Note that all users have full access
+ to the foreign UID range, hence it is recommended to never place inodes owned
+ by the foreign UID range in directories that other users can access. In
+ other words, always make sure each inode owned by the foreign UID range is
+ inside of a directory with mode `0700` (or stricter) owned by the only user
+ that should have access to the inode(s).
Note for the `DynamicUser=` and the `systemd-nspawn` allocation ranges: when a
UID allocation takes place NSS is checked for collisions first, and a different
one of the delegated ranges. Identity mapped users are not subject to BPF-LSM write restrictions unlike
the transient ranges.</para>
+ <para>Additionally, the allocation API supports mapping the <emphasis>foreign UID range</emphasis> into
+ the user namespace. When <literal>mapForeign</literal> is true, the foreign UID range is mapped 1:1 into the
+ user namespace, allowing processes inside to access and manipulate files owned by the foreign UID range.</para>
+
<para>The service provides API calls to allowlist mounts (referenced via their mount file descriptors as
per Linux <function>fsmount()</function> API), to pass ownership of a cgroup subtree to the user
namespace and to delegate a virtual Ethernet device pair to the user namespace. When used in combination
#define CONTAINER_UID_MAX ((uid_t) CONTAINER_UID_BASE_MAX + 0xFFFFU)
assert_cc((FOREIGN_UID_BASE & 0xFFFFU) == 0);
-#define FOREIGN_UID_MIN (FOREIGN_UID_BASE)
+#define FOREIGN_UID_MIN (FOREIGN_UID_BASE + 0U)
#define FOREIGN_UID_MAX (FOREIGN_UID_BASE + 0xFFFFU)
bool uid_is_system(uid_t uid);
return 0;
}
-static int write_userns(int userns_fd, int parent_userns_fd, const UserNamespaceInfo *userns_info) {
+static int write_userns(
+ int userns_fd,
+ int parent_userns_fd,
+ const UserNamespaceInfo *userns_info,
+ bool map_foreign) {
+
_cleanup_(pidref_done_sigkill_wait) PidRef pidref = PIDREF_NULL;
_cleanup_close_ int efd = -EBADF;
uint64_t u;
delegate->start_uid, start_uid, delegate->size);
}
+ if (map_foreign) {
+ r = uid_range_translate(outside_range, inside_range, FOREIGN_UID_MIN, &start_uid);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to translate UID "UID_FMT" to parent userns: %m", FOREIGN_UID_MIN);
+
+ if (start_uid != FOREIGN_UID_MIN)
+ return log_debug_errno(
+ SYNTHETIC_ERRNO(ERANGE),
+ "Foreign UID range not mapped 1:1 in parent userns ("UID_FMT" -> "UID_FMT")",
+ FOREIGN_UID_MIN, start_uid);
+
+ if (strextendf(&uidmap, UID_FMT " " UID_FMT " %" PRIu32 "\n",
+ FOREIGN_UID_MIN, start_uid, NSRESOURCE_UIDS_64K) < 0)
+ return log_oom();
+ }
+
outside_range = uid_range_free(outside_range);
inside_range = uid_range_free(inside_range);
delegate->start_gid, start_gid, delegate->size);
}
+ if (map_foreign) {
+ r = uid_range_translate(outside_range, inside_range, FOREIGN_UID_MIN, &start_gid);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to translate GID "GID_FMT" to parent userns: %m", FOREIGN_UID_MIN);
+
+ if (start_gid != FOREIGN_UID_MIN)
+ return log_debug_errno(
+ SYNTHETIC_ERRNO(ERANGE),
+ "Foreign GID range not mapped 1:1 in parent userns ("GID_FMT" -> "GID_FMT")",
+ FOREIGN_UID_MIN, start_gid);
+
+ if (strextendf(&gidmap, GID_FMT " " GID_FMT " %" PRIu32 "\n",
+ FOREIGN_UID_MIN, start_gid, NSRESOURCE_UIDS_64K) < 0)
+ return log_oom();
+ }
+
r = is_our_namespace(parent_userns_fd, NAMESPACE_USER);
if (r < 0)
return log_debug_errno(r, "Failed to check if parent user namespace refers to our own user namespace: %m");
unsigned userns_fd_idx;
bool mangle_name;
uint32_t delegate_container_ranges;
+ bool map_foreign;
} AllocateParameters;
static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) {
{ "mangleName", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_stdbool, offsetof(AllocateParameters, mangle_name), 0 },
{ "type", SD_JSON_VARIANT_STRING, dispatch_allocate_user_range_type, offsetof(AllocateParameters, type), 0 },
{ "delegateContainerRanges", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32, offsetof(AllocateParameters, delegate_container_ranges), 0 },
+ { "mapForeign", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_stdbool, offsetof(AllocateParameters, map_foreign), 0 },
{}
};
goto fail;
}
- r = write_userns(userns_fd, parent_userns_fd, userns_info);
+ r = write_userns(userns_fd, parent_userns_fd, userns_info, p.map_foreign);
if (r < 0)
goto fail;
SD_VARLINK_DEFINE_INPUT_BY_TYPE(type, AllocateUserRangeType, SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("Number of transient 64K container UID/GID ranges to delegate. These are mapped 1:1 into the user namespace and can be used by nested user namespaces for container workloads. Must be between 0 and 16. Defaults to 0."),
SD_VARLINK_DEFINE_INPUT(delegateContainerRanges, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
+ SD_VARLINK_FIELD_COMMENT("If true, map the foreign UID range 1:1 into the user namespace."),
+ SD_VARLINK_DEFINE_INPUT(mapForeign, SD_VARLINK_BOOL, SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("The name assigned to the user namespace. (This is particularly interesting in case mangleName was enabled)."),
SD_VARLINK_DEFINE_OUTPUT(name, SD_VARLINK_STRING, SD_VARLINK_NULLABLE));
'{"name":"test-delegate3","size":65536,"userNamespaceFileDescriptor":0,"delegateContainerRanges":2}' \
-- cat /proc/self/uid_map | wc -l)" -eq 3
+# Test mapForeign parameter
+# Verify that the foreign UID range is mapped into the user namespace
+# When mapForeign is true, uid_map should have 2 lines: primary range + foreign range
+test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \
+ --push-fd=/proc/self/ns/user \
+ /run/systemd/userdb/io.systemd.NamespaceResource \
+ io.systemd.NamespaceResource.AllocateUserRange \
+ '{"name":"test-foreign","size":65536,"userNamespaceFileDescriptor":0,"mapForeign":true}' \
+ -- cat /proc/self/uid_map | wc -l)" -eq 2
+
+# Verify the foreign range is mapped 1:1.
+test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \
+ --push-fd=/proc/self/ns/user \
+ /run/systemd/userdb/io.systemd.NamespaceResource \
+ io.systemd.NamespaceResource.AllocateUserRange \
+ '{"name":"test-foreign2","size":65536,"userNamespaceFileDescriptor":0,"mapForeign":true}' \
+ -- cat /proc/self/uid_map | grep -c 2147352576)" -eq 1
+
# This should work without the key
systemd-dissect --image-policy='root=verity:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null
systemd-dissect --image-policy='root=verity+signed:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null