git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: add new PrivateUsers= option to service execution
author Lennart Poettering <lennart@poettering.net>
Wed, 3 Aug 2016 16:44:51 +0000 (18:44 +0200)
committer Lennart Poettering <lennart@poettering.net>
Wed, 3 Aug 2016 18:42:04 +0000 (20:42 +0200)
This setting adds minimal user namespacing support to a service. When set, the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

man/systemd.exec.xml
src/core/dbus-execute.c
src/core/execute.c
src/core/execute.h
src/core/load-fragment-gperf.gperf.m4
src/shared/bus-unit-util.c

index 58ba5829112dfa5e00f2f0cbd633ef8aecdbc0b2..2190da55d4739e60bfc6d9407aa6849ca91a217a 100644 (file)
       <varlistentry>
         <term><varname>WorkingDirectory=</varname></term>
 
-        <listitem><para>Takes a directory path relative to the service's root
-        directory specified by <varname>RootDirectory=</varname>, or the
-        special value <literal>~</literal>. Sets the working directory
-        for executed processes. If set to <literal>~</literal>, the
-        home directory of the user specified in
-        <varname>User=</varname> is used. If not set, defaults to the
-        root directory when systemd is running as a system instance
-        and the respective user's home directory if run as user. If
-        the setting is prefixed with the <literal>-</literal>
-        character, a missing working directory is not considered
-        fatal. If <varname>RootDirectory=</varname> is not set, then
-        <varname>WorkingDirectory=</varname> is relative to the root of
-        the system running the service manager.
-        Note that setting this parameter might result in
-        additional dependencies to be added to the unit (see
-        above).</para></listitem>
+        <listitem><para>Takes a directory path relative to the service's root directory specified by
+        <varname>RootDirectory=</varname>, or the special value <literal>~</literal>. Sets the working directory for
+        executed processes. If set to <literal>~</literal>, the home directory of the user specified in
+        <varname>User=</varname> is used. If not set, defaults to the root directory when systemd is running as a
+        system instance and the respective user's home directory if run as user. If the setting is prefixed with the
+        <literal>-</literal> character, a missing working directory is not considered fatal. If
+        <varname>RootDirectory=</varname> is not set, then <varname>WorkingDirectory=</varname> is relative to the root
+        of the system running the service manager.  Note that setting this parameter might result in additional
+        dependencies to be added to the unit (see above).</para></listitem>
       </varlistentry>
 
       <varlistentry>
         <term><varname>RootDirectory=</varname></term>
 
-        <listitem><para>Takes a directory path relative to the host's root directory
-        (i.e. the root of the system running the service manager). Sets the
-        root directory for executed processes, with the <citerefentry
-        project='man-pages'><refentrytitle>chroot</refentrytitle><manvolnum>2</manvolnum></citerefentry>
-        system call. If this is used, it must be ensured that the
-        process binary and all its auxiliary files are available in
-        the <function>chroot()</function> jail. Note that setting this
-        parameter might result in additional dependencies to be added
-        to the unit (see above).</para></listitem>
+        <listitem><para>Takes a directory path relative to the host's root directory (i.e. the root of the system
+        running the service manager). Sets the root directory for executed processes, with the <citerefentry
+        project='man-pages'><refentrytitle>chroot</refentrytitle><manvolnum>2</manvolnum></citerefentry> system
+        call. If this is used, it must be ensured that the process binary and all its auxiliary files are available in
+        the <function>chroot()</function> jail. Note that setting this parameter might result in additional
+        dependencies to be added to the unit (see above).</para>
+
+        <para>The <varname>PrivateUsers=</varname> setting is particularly useful in conjunction with
+        <varname>RootDirectory=</varname>. For details, see below.</para></listitem>
       </varlistentry>
 
       <varlistentry>
         accessible).</para></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>PrivateUsers=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If true, sets up a new user namespace for the executed processes and
+        configures a minimal user and group mapping that maps the <literal>root</literal> user and group as well as
+        the unit's own user and group to themselves and everything else to the <literal>nobody</literal> user and
+        group. This is useful to securely detach the user and group databases used by the unit from the rest of the
+        system, and thus to create an effective sandbox environment. All files, directories, processes, IPC objects and
+        other resources owned by users/groups not equalling <literal>root</literal> or the unit's own will stay visible
+        from within the unit but appear owned by the <literal>nobody</literal> user and group. If this mode is enabled,
+        all unit processes are run without privileges in the host user namespace (regardless of whether the unit's own
+        user/group is <literal>root</literal> or not). Specifically this means that the process will have zero process
+        capabilities on the host's user namespace, but full capabilities within the service's user namespace. Settings
+        such as <varname>CapabilityBoundingSet=</varname> will affect only the latter, and there's no way to acquire
+        additional capabilities in the host's user namespace. Defaults to off.</para>
+
+        <para>This setting is particularly useful in conjunction with <varname>RootDirectory=</varname>, as the need to
+        synchronize the user and group databases in the root directory and on the host is reduced, as the only users
+        and groups who need to be matched are <literal>root</literal>, <literal>nobody</literal> and the unit's own
+        user and group.</para></listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><varname>ProtectSystem=</varname></term>
 
index 9c50cd93e530f56a6c5675d67971b73f8ab281c3..4b3bbfbc7d527f4d0af80ed6069b652667ec9b8f 100644 (file)
@@ -705,8 +705,9 @@ const sd_bus_vtable bus_exec_vtable[] = {
         SD_BUS_PROPERTY("InaccessiblePaths", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
-        SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectHome", "s", bus_property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectSystem", "s", bus_property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1068,7 +1069,7 @@ int bus_exec_context_set_transient_property(
 
         } else if (STR_IN_SET(name,
                               "IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
-                              "PrivateTmp", "PrivateDevices", "PrivateNetwork",
+                              "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
                               "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
                               "RestrictRealtime", "DynamicUser")) {
                 int b;
@@ -1090,6 +1091,8 @@ int bus_exec_context_set_transient_property(
                                 c->private_devices = b;
                         else if (streq(name, "PrivateNetwork"))
                                 c->private_network = b;
+                        else if (streq(name, "PrivateUsers"))
+                                c->private_users = b;
                         else if (streq(name, "NoNewPrivileges"))
                                 c->no_new_privileges = b;
                         else if (streq(name, "SyslogLevelPrefix"))
index 26e9cd5339ede7353870995045c93a7ce5ddad2d..cec3b3cf40bfd268074c68a97f3134bbbefec3db 100644 (file)
@@ -25,6 +25,7 @@
 #include <signal.h>
 #include <string.h>
 #include <sys/capability.h>
+#include <sys/eventfd.h>
 #include <sys/mman.h>
 #include <sys/personality.h>
 #include <sys/prctl.h>
@@ -1526,6 +1527,159 @@ static bool exec_needs_mount_namespace(
         return false;
 }
 
+/* Moves the calling process into a new user namespace with a minimal UID/GID mapping.
+ *
+ * uid, gid: the user/group that shall be mapped to itself in addition to root. If the value is 0 or not valid
+ *           only the root mapping is written.
+ *
+ * Returns 0 on success, or a negative errno-style error code on failure (including an error code forwarded from
+ * the helper child process, or -EIO if the child misbehaved). */
+static int setup_private_users(uid_t uid, gid_t gid) {
+        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
+        _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
+        _cleanup_close_ int unshare_ready_fd = -1;
+        _cleanup_(sigkill_waitp) pid_t pid = 0;
+        uint64_t c = 1;
+        siginfo_t si;
+        ssize_t n;
+        int r;
+
+        /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
+         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
+         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
+         * which waits for the parent to create the new user namespace while staying in the original namespace. The
+         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
+         * continues execution normally. */
+
+        if (uid != 0 && uid_is_valid(uid)) {
+                r = asprintf(&uid_map,
+                             "0 0 1\n"                      /* Map root → root */
+                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
+                             uid, uid);
+                if (r < 0) {
+                        /* After a failed asprintf() the pointer's contents are undefined, hence don't rely on
+                         * it being NULL: reset it explicitly so nothing acts on a garbage pointer at scope exit. */
+                        uid_map = NULL;
+                        return -ENOMEM;
+                }
+        } else {
+                uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
+                if (!uid_map)
+                        return -ENOMEM;
+        }
+
+        if (gid != 0 && gid_is_valid(gid)) {
+                r = asprintf(&gid_map,
+                             "0 0 1\n"                      /* Map root → root */
+                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
+                             gid, gid);
+                if (r < 0) {
+                        /* Same as above: never trust the pointer after asprintf() failed */
+                        gid_map = NULL;
+                        return -ENOMEM;
+                }
+        } else {
+                gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
+                if (!gid_map)
+                        return -ENOMEM;
+        }
+
+        /* Create a communication channel so that the parent can tell the child when it finished creating the user
+         * namespace. */
+        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
+        if (unshare_ready_fd < 0)
+                return -errno;
+
+        /* Create a communication channel so that the child can tell the parent a proper error code in case it
+         * failed. */
+        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
+                return -errno;
+
+        pid = fork();
+        if (pid < 0)
+                return -errno;
+
+        if (pid == 0) {
+                _cleanup_close_ int fd = -1;
+                const char *a;
+                pid_t ppid;
+
+                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
+                 * here, after the parent opened its own user namespace. */
+
+                ppid = getppid();
+                errno_pipe[0] = safe_close(errno_pipe[0]);
+
+                /* Wait until the parent unshared the user namespace */
+                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
+                        r = -errno;
+                        goto child_fail;
+                }
+
+                /* Disable the setgroups() system call in the child user namespace, for good. */
+                a = procfs_file_alloca(ppid, "setgroups");
+                fd = open(a, O_WRONLY|O_CLOEXEC);
+                if (fd < 0) {
+                        if (errno != ENOENT) {
+                                r = -errno;
+                                goto child_fail;
+                        }
+
+                        /* If the file is missing the kernel is too old, let's continue anyway. */
+                } else {
+                        if (write(fd, "deny\n", 5) < 0) {
+                                r = -errno;
+                                goto child_fail;
+                        }
+
+                        fd = safe_close(fd);
+                }
+
+                /* First write the GID map */
+                a = procfs_file_alloca(ppid, "gid_map");
+                fd = open(a, O_WRONLY|O_CLOEXEC);
+                if (fd < 0) {
+                        r = -errno;
+                        goto child_fail;
+                }
+                if (write(fd, gid_map, strlen(gid_map)) < 0) {
+                        r = -errno;
+                        goto child_fail;
+                }
+                fd = safe_close(fd);
+
+                /* Then write the UID map */
+                a = procfs_file_alloca(ppid, "uid_map");
+                fd = open(a, O_WRONLY|O_CLOEXEC);
+                if (fd < 0) {
+                        r = -errno;
+                        goto child_fail;
+                }
+                if (write(fd, uid_map, strlen(uid_map)) < 0) {
+                        r = -errno;
+                        goto child_fail;
+                }
+
+                _exit(EXIT_SUCCESS);
+
+        child_fail:
+                /* Best effort: pass the error code up to the parent, then exit non-zero either way */
+                (void) write(errno_pipe[1], &r, sizeof(r));
+                _exit(EXIT_FAILURE);
+        }
+
+        /* Parent: close our copy of the write end, so that the read() below sees EOF once the child exits
+         * without having sent an error code. */
+        errno_pipe[1] = safe_close(errno_pipe[1]);
+
+        if (unshare(CLONE_NEWUSER) < 0)
+                return -errno;
+
+        /* Let the child know that the namespace is ready now */
+        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
+                return -errno;
+
+        /* Try to read an error code from the child */
+        n = read(errno_pipe[0], &r, sizeof(r));
+        if (n < 0)
+                return -errno;
+        if (n == sizeof(r)) { /* an error code was sent to us */
+                if (r < 0)
+                        return r;
+                return -EIO;
+        }
+        if (n != 0) /* on success we should have read 0 bytes */
+                return -EIO;
+
+        r = wait_for_terminate(pid, &si);
+        if (r < 0)
+                return r;
+        pid = 0; /* the child has been waited for, nothing left for the cleanup handler to act on */
+
+        /* If something strange happened with the child, let's consider this fatal, too */
+        if (si.si_code != CLD_EXITED || si.si_status != 0)
+                return -EIO;
+
+        return 0;
+}
+
 static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
         assert(array);
         assert(n);
@@ -2037,6 +2191,14 @@ static int exec_child(
         }
 #endif
 
+        if (params->apply_permissions && context->private_users) {
+                r = setup_private_users(uid, gid);
+                if (r < 0) {
+                        *exit_status = EXIT_USER;
+                        return r;
+                }
+        }
+
         /* We repeat the fd closing here, to make sure that
          * nothing is leaked from the PAM modules. Note that
          * we are more aggressive this time since socket_fd
@@ -2598,8 +2760,9 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 "%sRootDirectory: %s\n"
                 "%sNonBlocking: %s\n"
                 "%sPrivateTmp: %s\n"
-                "%sPrivateNetwork: %s\n"
                 "%sPrivateDevices: %s\n"
+                "%sPrivateNetwork: %s\n"
+                "%sPrivateUsers: %s\n"
                 "%sProtectHome: %s\n"
                 "%sProtectSystem: %s\n"
                 "%sIgnoreSIGPIPE: %s\n"
@@ -2610,8 +2773,9 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 prefix, c->root_directory ? c->root_directory : "/",
                 prefix, yes_no(c->non_blocking),
                 prefix, yes_no(c->private_tmp),
-                prefix, yes_no(c->private_network),
                 prefix, yes_no(c->private_devices),
+                prefix, yes_no(c->private_network),
+                prefix, yes_no(c->private_users),
                 prefix, protect_home_to_string(c->protect_home),
                 prefix, protect_system_to_string(c->protect_system),
                 prefix, yes_no(c->ignore_sigpipe),
index 48cc18fbb37fd1fe814e2f582a895d1d14d00836..5fac3e85e86cfab1f4fda222839aaaec4d674084 100644 (file)
@@ -171,6 +171,7 @@ struct ExecContext {
         bool private_tmp;
         bool private_network;
         bool private_devices;
+        bool private_users;
         ProtectSystem protect_system;
         ProtectHome protect_home;
 
index 396f84721378beb4ae7621bffc6c5e9e03e62083..251155b428a1b44c84d2522638286c6ac819d433 100644 (file)
@@ -88,8 +88,9 @@ $1.ReadWritePaths,               config_parse_namespace_path_strv,   0,
 $1.ReadOnlyPaths,                config_parse_namespace_path_strv,   0,                             offsetof($1, exec_context.read_only_paths)
 $1.InaccessiblePaths,            config_parse_namespace_path_strv,   0,                             offsetof($1, exec_context.inaccessible_paths)
 $1.PrivateTmp,                   config_parse_bool,                  0,                             offsetof($1, exec_context.private_tmp)
-$1.PrivateNetwork,               config_parse_bool,                  0,                             offsetof($1, exec_context.private_network)
 $1.PrivateDevices,               config_parse_bool,                  0,                             offsetof($1, exec_context.private_devices)
+$1.PrivateNetwork,               config_parse_bool,                  0,                             offsetof($1, exec_context.private_network)
+$1.PrivateUsers,                 config_parse_bool,                  0,                             offsetof($1, exec_context.private_users)
 $1.ProtectSystem,                config_parse_protect_system,        0,                             offsetof($1, exec_context)
 $1.ProtectHome,                  config_parse_protect_home,          0,                             offsetof($1, exec_context)
 $1.MountFlags,                   config_parse_exec_mount_flags,      0,                             offsetof($1, exec_context)
index 14bf8ad6270fa4d4695196b2064612bd7f15fde6..9d8061b539bef6a8bb253857d4661b5398839509 100644 (file)
@@ -202,7 +202,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
                               "CPUAccounting", "MemoryAccounting", "IOAccounting", "BlockIOAccounting", "TasksAccounting",
                               "SendSIGHUP", "SendSIGKILL", "WakeSystem", "DefaultDependencies",
                               "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit",
-                              "PrivateTmp", "PrivateDevices", "PrivateNetwork", "NoNewPrivileges",
+                              "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
                               "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
                               "RestrictRealtime", "DynamicUser")) {