<para>Also note that some sandboxing functionality is generally not available in user services (i.e. services run
by the per-user service manager). Specifically, the various settings requiring file system namespacing support
(such as <varname>ProtectSystem=</varname>) are not available, as the underlying kernel functionality is only
- accessible to privileged processes.</para>
+ accessible to privileged processes. However, most namespacing settings, that will not work on their own in user
+ services, will work when used in conjunction with <varname>PrivateUsers=</varname><option>true</option>.</para>
<variablelist class='unit-directives'>
such as <varname>CapabilityBoundingSet=</varname> will affect only the latter, and there's no way to acquire
additional capabilities in the host's user namespace. Defaults to off.</para>
+ <para>When this setting is set up by a per-user instance of the service manager, the mapping of the
+ <literal>root</literal> user and group to itself is omitted (unless the user manager is root).
+ Additionally, in the per-user instance manager case, the
+ user namespace will be set up before most other namespaces. This means that combining
+ <varname>PrivateUsers=</varname><option>true</option> with other namespaces will enable use of features not
+ normally supported by the per-user instances of the service manager.</para>
+
<para>This setting is particularly useful in conjunction with
<varname>RootDirectory=</varname>/<varname>RootImage=</varname>, as the need to synchronize the user and group
databases in the root directory and on the host is reduced, as the only users and groups who need to be matched
<para>Note that the implementation of this setting might be impossible (for example if user namespaces are not
available), and the unit should be written in a way that does not solely rely on this setting for
- security.</para>
-
- <xi:include href="system-only.xml" xpointer="singular"/></listitem>
+ security.</para></listitem>
</varlistentry>
<varlistentry>
return false;
}
-static int setup_private_users(uid_t uid, gid_t gid) {
+static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
_cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
_cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
_cleanup_close_ int unshare_ready_fd = -1;
ssize_t n;
int r;
- /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
+ /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
+ * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
* nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
* we however lack after opening the user namespace. To work around this we fork() a temporary child process,
* which waits for the parent to create the new user namespace while staying in the original namespace. The
* child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
- * continues execution normally. */
+ * continues execution normally.
+ * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
+ * does not need CAP_SETUID to write the single line mapping to itself. */
- if (uid != 0 && uid_is_valid(uid)) {
+ /* Can only set up multiple mappings with CAP_SETUID. */
+ if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
r = asprintf(&uid_map,
- "0 0 1\n" /* Map root → root */
+ UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
- uid, uid);
- if (r < 0)
- return -ENOMEM;
- } else {
- uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
- if (!uid_map)
- return -ENOMEM;
- }
+ ouid, ouid, uid, uid);
+ else
+ r = asprintf(&uid_map,
+ UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
+ ouid, ouid);
- if (gid != 0 && gid_is_valid(gid)) {
+ if (r < 0)
+ return -ENOMEM;
+
+ /* Can only set up multiple mappings with CAP_SETGID. */
+ if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
r = asprintf(&gid_map,
- "0 0 1\n" /* Map root → root */
+ GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
- gid, gid);
- if (r < 0)
- return -ENOMEM;
- } else {
- gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
- if (!gid_map)
- return -ENOMEM;
- }
+ ogid, ogid, gid, gid);
+ else
+ r = asprintf(&gid_map,
+ GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
+ ogid, ogid);
+
+ if (r < 0)
+ return -ENOMEM;
/* Create a communication channel so that the parent can tell the child when it finished creating the user
* namespace. */
char **final_argv = NULL;
dev_t journal_stream_dev = 0;
ino_t journal_stream_ino = 0;
+ bool userns_set_up = false;
bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
#if HAVE_APPARMOR
bool use_apparmor = false;
#endif
+ uid_t saved_uid = getuid();
+ gid_t saved_gid = getgid();
uid_t uid = UID_INVALID;
gid_t gid = GID_INVALID;
size_t n_fds;
}
}
+ if (needs_sandboxing) {
+#if HAVE_SELINUX
+ if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
+ r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
+ if (r < 0) {
+ *exit_status = EXIT_SELINUX_CONTEXT;
+ return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
+ }
+ }
+#endif
+
+ /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
+ * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
+ * set up the all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
+ if (context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
+ userns_set_up = true;
+ r = setup_private_users(saved_uid, saved_gid, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
+ }
+ }
+ }
+
if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
if (ns_type_supported(NAMESPACE_NET)) {
#endif
}
- /* Drop groups as early as possbile */
+ /* Drop groups as early as possible.
+ * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
+ * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
if (needs_setuid) {
r = enforce_groups(gid, supplementary_gids, ngids);
if (r < 0) {
}
}
- if (needs_sandboxing) {
-#if HAVE_SELINUX
- if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
- r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
- if (r < 0) {
- *exit_status = EXIT_SELINUX_CONTEXT;
- return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
- }
- }
-#endif
+ /* If the user namespace was not set up above, try to do it now.
+ * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
+ * restricted by rules pertaining to combining user namspaces with other namespaces (e.g. in the
+ * case of mount namespaces being less privileged when the mount point list is copied from a
+ * different user namespace). */
- if (context->private_users) {
- r = setup_private_users(uid, gid);
- if (r < 0) {
- *exit_status = EXIT_USER;
- return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
- }
+ if (needs_sandboxing && context->private_users && !userns_set_up) {
+ r = setup_private_users(saved_uid, saved_gid, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
}
}