core: PrivateUsers=true for (unprivileged) user managers

author Anita Zhang <the.anitazha@gmail.com>

Wed, 23 Oct 2019 00:37:47 +0000 (17:37 -0700)

committer Anita Zhang <the.anitazha@gmail.com>

Wed, 18 Dec 2019 19:09:30 +0000 (11:09 -0800)
author Anita Zhang <the.anitazha@gmail.com>
Wed, 23 Oct 2019 00:37:47 +0000 (17:37 -0700)
committer Anita Zhang <the.anitazha@gmail.com>
Wed, 18 Dec 2019 19:09:30 +0000 (11:09 -0800)
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml

index ff332e9f04f11cb19df6b7dcf5035bc865e0fe0f..27b58432b9649021a61edc723312be59452b3c96 100644 (file)
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -830,7 +830,8 @@ CapabilityBoundingSet=~CAP_B CAP_C</programlisting>
      <para>Also note that some sandboxing functionality is generally not available in user services (i.e. services run
      by the per-user service manager). Specifically, the various settings requiring file system namespacing support
      (such as <varname>ProtectSystem=</varname>) are not available, as the underlying kernel functionality is only
-    accessible to privileged processes.</para>
+    accessible to privileged processes. However, most namespacing settings, that will not work on their own in user
+    services, will work when used in conjunction with <varname>PrivateUsers=</varname><option>true</option>.</para>
  
      <variablelist class='unit-directives'>
  
@@ -1251,6 +1252,13 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
          such as <varname>CapabilityBoundingSet=</varname> will affect only the latter, and there's no way to acquire
          additional capabilities in the host's user namespace. Defaults to off.</para>
  
+        <para>When this setting is set up by a per-user instance of the service manager, the mapping of the
+        <literal>root</literal> user and group to itself is omitted (unless the user manager is root).
+        Additionally, in the per-user instance manager case, the
+        user namespace will be set up before most other namespaces. This means that combining
+        <varname>PrivateUsers=</varname><option>true</option> with other namespaces will enable use of features not
+        normally supported by the per-user instances of the service manager.</para>
+
          <para>This setting is particularly useful in conjunction with
          <varname>RootDirectory=</varname>/<varname>RootImage=</varname>, as the need to synchronize the user and group
          databases in the root directory and on the host is reduced, as the only users and groups who need to be matched
@@ -1258,9 +1266,7 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
  
          <para>Note that the implementation of this setting might be impossible (for example if user namespaces are not
          available), and the unit should be written in a way that does not solely rely on this setting for
-        security.</para>
-
-        <xi:include href="system-only.xml" xpointer="singular"/></listitem>
+        security.</para></listitem>
        </varlistentry>
  
        <varlistentry>
diff --git a/src/core/execute.c b/src/core/execute.c

index 4f96d1f4102cb71bc119f85f28ebf16ab67db5f2..2cb2392d16bc1de107b7daca187f514bb31f2e20 100644 (file)
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1900,7 +1900,7 @@ static bool exec_needs_mount_namespace(
          return false;
  }
  
-static int setup_private_users(uid_t uid, gid_t gid) {
+static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
          _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
          _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
          _cleanup_close_ int unshare_ready_fd = -1;
@@ -1909,38 +1909,43 @@ static int setup_private_users(uid_t uid, gid_t gid) {
          ssize_t n;
          int r;
  
-        /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
+        /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
+         * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
           * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
           * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
           * which waits for the parent to create the new user namespace while staying in the original namespace. The
           * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
-         * continues execution normally. */
+         * continues execution normally.
+         * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
+         * does not need CAP_SETUID to write the single line mapping to itself. */
  
-        if (uid != 0 && uid_is_valid(uid)) {
+        /* Can only set up multiple mappings with CAP_SETUID. */
+        if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
                  r = asprintf(&uid_map,
-                             "0 0 1\n"                      /* Map root → root */
+                             UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
                               UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
-                             uid, uid);
-                if (r < 0)
-                        return -ENOMEM;
-        } else {
-                uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
-                if (!uid_map)
-                        return -ENOMEM;
-        }
+                             ouid, ouid, uid, uid);
+        else
+                r = asprintf(&uid_map,
+                             UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
+                             ouid, ouid);
  
-        if (gid != 0 && gid_is_valid(gid)) {
+        if (r < 0)
+                return -ENOMEM;
+
+        /* Can only set up multiple mappings with CAP_SETGID. */
+        if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
                  r = asprintf(&gid_map,
-                             "0 0 1\n"                      /* Map root → root */
+                             GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
                               GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
-                             gid, gid);
-                if (r < 0)
-                        return -ENOMEM;
-        } else {
-                gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
-                if (!gid_map)
-                        return -ENOMEM;
-        }
+                             ogid, ogid, gid, gid);
+        else
+                r = asprintf(&gid_map,
+                             GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
+                             ogid, ogid);
+
+        if (r < 0)
+                return -ENOMEM;
  
          /* Create a communication channel so that the parent can tell the child when it finished creating the user
           * namespace. */
@@ -2983,6 +2988,7 @@ static int exec_child(
          char **final_argv = NULL;
          dev_t journal_stream_dev = 0;
          ino_t journal_stream_ino = 0;
+        bool userns_set_up = false;
          bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
                  needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
                  needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
@@ -2997,6 +3003,8 @@ static int exec_child(
  #if HAVE_APPARMOR
          bool use_apparmor = false;
  #endif
+        uid_t saved_uid = getuid();
+        gid_t saved_gid = getgid();
          uid_t uid = UID_INVALID;
          gid_t gid = GID_INVALID;
          size_t n_fds;
@@ -3418,6 +3426,30 @@ static int exec_child(
                  }
          }
  
+        if (needs_sandboxing) {
+#if HAVE_SELINUX
+                if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
+                        r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
+                        if (r < 0) {
+                                *exit_status = EXIT_SELINUX_CONTEXT;
+                                return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
+                        }
+                }
+#endif
+
+                /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
+                 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
+                 * set up the all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
+                if (context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
+                        userns_set_up = true;
+                        r = setup_private_users(saved_uid, saved_gid, uid, gid);
+                        if (r < 0) {
+                                *exit_status = EXIT_USER;
+                                return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
+                        }
+                }
+        }
+
          if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
  
                  if (ns_type_supported(NAMESPACE_NET)) {
@@ -3466,7 +3498,9 @@ static int exec_child(
  #endif
          }
  
-        /* Drop groups as early as possbile */
+        /* Drop groups as early as possible.
+         * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
+         * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
          if (needs_setuid) {
                  r = enforce_groups(gid, supplementary_gids, ngids);
                  if (r < 0) {
@@ -3475,23 +3509,17 @@ static int exec_child(
                  }
          }
  
-        if (needs_sandboxing) {
-#if HAVE_SELINUX
-                if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
-                        r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
-                        if (r < 0) {
-                                *exit_status = EXIT_SELINUX_CONTEXT;
-                                return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
-                        }
-                }
-#endif
+        /* If the user namespace was not set up above, try to do it now.
+         * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
+         * restricted by rules pertaining to combining user namspaces with other namespaces (e.g. in the
+         * case of mount namespaces being less privileged when the mount point list is copied from a
+         * different user namespace). */
  
-                if (context->private_users) {
-                        r = setup_private_users(uid, gid);
-                        if (r < 0) {
-                                *exit_status = EXIT_USER;
-                                return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
-                        }
+        if (needs_sandboxing && context->private_users && !userns_set_up) {
+                r = setup_private_users(saved_uid, saved_gid, uid, gid);
+                if (r < 0) {
+                        *exit_status = EXIT_USER;
+                        return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
                  }
          }
author	Anita Zhang <the.anitazha@gmail.com>
	Wed, 23 Oct 2019 00:37:47 +0000 (17:37 -0700)
committer	Anita Zhang <the.anitazha@gmail.com>
	Wed, 18 Dec 2019 19:09:30 +0000 (11:09 -0800)
man/systemd.exec.xml		patch \| blob \| blame \| history
src/core/execute.c		patch \| blob \| blame \| history