From: Ryan Wilson Date: Fri, 15 Nov 2024 14:56:05 +0000 (-0800) Subject: core: Add PrivateUsers=full X-Git-Tag: v258-rc1~1874^2~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=705cc82938b67fa110f2f6f5d28bfb9ec2f339c0;p=thirdparty%2Fsystemd.git core: Add PrivateUsers=full Recently, PrivateUsers=identity was added to support mapping the first 65536 UIDs/GIDs from parent to the child namespace and mapping the other UID/GIDs to the nobody user. However, there are use cases where users have UIDs/GIDs > 65536 and need to do a similar identity mapping. Moreover, in some of those cases, users want a full identity mapping from 0 -> UID_MAX. Note to differentiate ourselves from the init user namespace, we need to set up the uid_map/gid_map like: ``` 0 0 1 1 1 UINT32_MAX - 1 ``` as the init user namedspace uses `0 0 UINT32_MAX` and some applications - like systemd itself - determine if its a non-init user namespace based on uid_map/gid_map files. Note systemd will remove this heuristic in running_in_userns() in version 258 and uses namespace inode. But some users may be running a container image with older systemd < 258 so we keep this hack until version 259. To support this, we add PrivateUsers=full that does identity mapping for all available UID/GIDs. Fixes: #35168 --- diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 607c88128de..482dbbda80a 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -2009,8 +2009,8 @@ BindReadOnlyPaths=/var/lib/systemd PrivateUsers= - Takes a boolean argument or one of self or - identity. Defaults to false. If enabled, sets up a new user namespace for the + Takes a boolean argument or one of self, identity, + or full. Defaults to false. If enabled, sets up a new user namespace for the executed processes and configures a user and group mapping. If set to a true value or self, a minimal user and group mapping is configured that maps the root user and group as well as the unit's own user and group to themselves and @@ -2026,6 +2026,10 @@ BindReadOnlyPaths=/var/lib/systemd since all UIDs/GIDs are chosen identically it does provide process capability isolation, and hence is often a good choice if proper user namespacing with distinct UID maps is not appropriate. + If the parameter is full, user namespacing is set up with an identity + mapping for all UIDs/GIDs. Similar to identity, this does not provide UID/GID + isolation, but it does provide process capability isolation. + If this mode is enabled, all unit processes are run without privileges in the host user namespace (regardless if the unit's own user/group is root or not). Specifically this means that the process will have zero process capabilities on the host's user namespace, but diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 9d636f55295..682d6449d76 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -2103,6 +2103,29 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi uid_map = strdup("0 0 65536\n"); if (!uid_map) return -ENOMEM; + } else if (private_users == PRIVATE_USERS_FULL) { + /* Map all UID/GID from original to new user namespace. We can't use `0 0 UINT32_MAX` because + * this is the same UID/GID map as the init user namespace and systemd's running_in_userns() + * checks whether its in a user namespace by comparing uid_map/gid_map to `0 0 UINT32_MAX`. + * Thus, we still map all UIDs/GIDs but do it using two extents to differentiate the new user + * namespace from the init namespace: + * 0 0 1 + * 1 1 UINT32_MAX - 1 + * + * systemd will remove the heuristic in running_in_userns() and use namespace inodes in version 258 + * (PR #35382). But some users may be running a container image with older systemd < 258 so we keep + * this uid_map/gid_map hack until version 259 for version N-1 compatibility. + * + * TODO: Switch to `0 0 UINT32_MAX` in systemd v259. + * + * Note the kernel defines the UID range between 0 and UINT32_MAX so we map all UIDs even though + * the UID range beyond INT32_MAX (e.g. i.e. the range above the signed 32-bit range) is + * icky. For example, setfsuid() returns the old UID as signed integer. But units can decide to + * use these UIDs/GIDs so we need to map them. */ + r = asprintf(&uid_map, "0 0 1\n" + "1 1 " UID_FMT "\n", (uid_t) (UINT32_MAX - 1)); + if (r < 0) + return -ENOMEM; /* Can only set up multiple mappings with CAP_SETUID. */ } else if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid)) { r = asprintf(&uid_map, @@ -2123,6 +2146,11 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi gid_map = strdup("0 0 65536\n"); if (!gid_map) return -ENOMEM; + } else if (private_users == PRIVATE_USERS_FULL) { + r = asprintf(&gid_map, "0 0 1\n" + "1 1 " GID_FMT "\n", (gid_t) (UINT32_MAX - 1)); + if (r < 0) + return -ENOMEM; /* Can only set up multiple mappings with CAP_SETGID. */ } else if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid)) { r = asprintf(&gid_map, diff --git a/src/core/namespace.c b/src/core/namespace.c index 57dbbc4fc7d..c584ea35724 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -3364,6 +3364,7 @@ static const char* const private_users_table[_PRIVATE_USERS_MAX] = { [PRIVATE_USERS_NO] = "no", [PRIVATE_USERS_SELF] = "self", [PRIVATE_USERS_IDENTITY] = "identity", + [PRIVATE_USERS_FULL] = "full", }; DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_users, PrivateUsers, PRIVATE_USERS_SELF); diff --git a/src/core/namespace.h b/src/core/namespace.h index bd48aa31da7..5d466a8c1c7 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -65,6 +65,7 @@ typedef enum PrivateUsers { PRIVATE_USERS_NO, PRIVATE_USERS_SELF, PRIVATE_USERS_IDENTITY, + PRIVATE_USERS_FULL, _PRIVATE_USERS_MAX, _PRIVATE_USERS_INVALID = -EINVAL, } PrivateUsers; diff --git a/test/units/TEST-07-PID1.private-users.sh b/test/units/TEST-07-PID1.private-users.sh index 2475b5d365d..ba85248f960 100755 --- a/test/units/TEST-07-PID1.private-users.sh +++ b/test/units/TEST-07-PID1.private-users.sh @@ -10,3 +10,5 @@ systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/uid_ma systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"' systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 65536"' systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 65536"' +systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/uid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"' +systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/gid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"'