From: Lennart Poettering Date: Thu, 27 Nov 2025 07:58:26 +0000 (+0100) Subject: core: add PrivateUsers=managed X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=6b22ac31afcfab53dc9b51d6b5f7862e52607923;p=thirdparty%2Fsystemd.git core: add PrivateUsers=managed --- diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 0ba3e011f21..87bd3fd92ae 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -2202,16 +2202,17 @@ BindReadOnlyPaths=/var/lib/systemd PrivateUsers= - Takes a boolean argument or one of self, identity, - or full. Defaults to false. If enabled, sets up a new user namespace for the - executed processes and configures a user and group mapping. If set to a true value or - self, a minimal user and group mapping is configured that maps the - root user and group as well as the unit's own user and group to themselves and - everything else to the nobody user and group. This is useful to securely detach - the user and group databases used by the unit from the rest of the system, and thus to create an - effective sandbox environment. All files, directories, processes, IPC objects and other resources - owned by users/groups not equaling root or the unit's own will stay visible from - within the unit but appear owned by the nobody user and group. + Takes a boolean argument or one of self, + identity, full or managed. Defaults to + false. If enabled, sets up a new user namespace for the executed processes and configures a user and + group mapping. If set to a true value or self, a minimal user and group mapping is + configured that maps the root user and group as well as the unit's own user and + group to themselves and everything else to the nobody user and group. This is + useful to securely detach the user and group databases used by the unit from the rest of the system, + and thus to create an effective sandbox environment. 
All files, directories, processes, IPC objects + and other resources owned by users/groups not equaling root or the unit's own will + stay visible from within the unit but appear owned by the nobody user and + group. If the parameter is identity, user namespacing is set up with an identity mapping for the first 65536 UIDs/GIDs. Any UIDs/GIDs above 65536 will be mapped to the @@ -2224,14 +2225,21 @@ BindReadOnlyPaths=/var/lib/systemd to call setgroups() system calls (by setting /proc/pid/setgroups to allow). Similar to identity, this does not provide UID/GID isolation, but it does provide - process capability isolation. - - If this mode is enabled, all unit processes are run without privileges in the host user - namespace (regardless of whether the unit's own user/group is root or not). Specifically - this means that the process will have zero process capabilities on the host's user namespace, but - full capabilities within the service's user namespace. Settings such as - CapabilityBoundingSet= will affect only the latter, and there's no way to acquire - additional capabilities in the host's user namespace. + process capability isolation. If this mode is enabled, all unit processes are run without privileges + in the host user namespace (regardless of whether the unit's own user/group is + root or not). Specifically this means that the process will have zero process + capabilities on the host's user namespace, but full capabilities within the service's user + namespace. Settings such as CapabilityBoundingSet= will affect only the latter, + and there's no way to acquire additional capabilities in the host's user namespace. + + If the parameter is managed, a transient, dynamically allocated range of + 65536 UIDs/GIDs is allocated for the unit, and a UID/GID mapping is assigned to the unit's process + so the UID/GID 0 from inside the unit maps to the first UID/GID of the allocated mapping. 
Note that + in this mode the UID/GID the service process will run as differs depending on whether it is viewed from the + host side (where it will be a high, dynamically assigned UID) or from inside the unit (where it will + be 0). Also note that this mode will enable file system UID mapping for the file systems this service + accesses, mapping the "foreign" UID range on disk to the selected dynamic UID range at + runtime. When this setting is set up by a per-user instance of the service manager, the mapping of the root user and group to itself is omitted (unless the user manager is root). diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 298fd0754e7..48e3902633c 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -57,6 +57,7 @@ #include "mountpoint-util.h" #include "namespace-util.h" #include "nsflags.h" +#include "nsresource.h" #include "open-file.h" #include "osc-context.h" #include "pam-util.h" @@ -2396,10 +2397,10 @@ static int setup_private_users_child(int unshare_ready_fd, const char *uid_map, static int setup_private_users( PrivateUsers private_users, - uid_t ouid, - gid_t ogid, - uid_t uid, - gid_t gid, + uid_t ouid, /* service manager uid */ + gid_t ogid, /* service manager gid */ + uid_t uid, /* unit uid */ + gid_t gid, /* unit gid */ bool allow_setgroups) { _cleanup_free_ char *uid_map = NULL, *gid_map = NULL; @@ -2425,6 +2426,25 @@ static int setup_private_users( case PRIVATE_USERS_NO: return 0; /* Early exit */ + case PRIVATE_USERS_MANAGED: { + if (uid != 0 || gid != 0) + return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "When allocating dynamic user namespace range, target UID/GID must be root, refusing."); + + _cleanup_close_ int userns_fd = nsresource_allocate_userns(/* name= */ NULL, NSRESOURCE_UIDS_64K); + if (userns_fd < 0) + return userns_fd; + + if (setns(userns_fd, CLONE_NEWUSER) < 0) + return log_debug_errno(errno, "Failed to join freshly allocated user namespace: %m"); + + /* In "managed" mode the originating UID is 
not mapped hence we need to explicitly become root in the new userns now. */ + r = reset_uid_gid(); + if (r < 0) + return log_debug_errno(r, "Failed to reset UID/GID to root: %m"); + + return 1; /* Early exit */ + } + case PRIVATE_USERS_IDENTITY: uid_map = strdup("0 0 65536\n"); if (!uid_map) diff --git a/src/core/namespace.c b/src/core/namespace.c index 6927845b9c3..56fc268fef2 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -4137,6 +4137,7 @@ static const char* const private_users_table[_PRIVATE_USERS_MAX] = { [PRIVATE_USERS_SELF] = "self", [PRIVATE_USERS_IDENTITY] = "identity", [PRIVATE_USERS_FULL] = "full", + [PRIVATE_USERS_MANAGED] = "managed", }; DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_users, PrivateUsers, PRIVATE_USERS_SELF); diff --git a/src/core/namespace.h b/src/core/namespace.h index 4f5e6546bd8..4b62debf2fc 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -70,6 +70,7 @@ typedef enum PrivateUsers { PRIVATE_USERS_SELF, PRIVATE_USERS_IDENTITY, PRIVATE_USERS_FULL, + PRIVATE_USERS_MANAGED, _PRIVATE_USERS_MAX, _PRIVATE_USERS_INVALID = -EINVAL, } PrivateUsers; diff --git a/src/core/unit.c b/src/core/unit.c index d468f1303d2..b636e097cd6 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -4328,6 +4328,9 @@ static int unit_verify_contexts(const Unit *u) { if (ec->pam_name && kc && !IN_SET(kc->kill_mode, KILL_CONTROL_GROUP, KILL_MIXED)) return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "Unit has PAM enabled. Kill mode must be set to 'control-group' or 'mixed'. Refusing."); + if ((ec->user || ec->dynamic_user) && ec->private_users == PRIVATE_USERS_MANAGED) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "PrivateUsers=managed may not be used in combination with User=/DynamicUser=, refusing."); + return 0; }