]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/core/execute.c
resolve: remove unsed counter (#4349)
[thirdparty/systemd.git] / src / core / execute.c
index 90281397235333e5ba03fb612711e2dadd5769e9..7079aeed6e95e2351f7867c0f0923c561bafbb0b 100644 (file)
@@ -25,6 +25,7 @@
 #include <signal.h>
 #include <string.h>
 #include <sys/capability.h>
+#include <sys/eventfd.h>
 #include <sys/mman.h>
 #include <sys/personality.h>
 #include <sys/prctl.h>
@@ -90,6 +91,7 @@
 #include "selinux-util.h"
 #include "signal-util.h"
 #include "smack-util.h"
+#include "special.h"
 #include "string-table.h"
 #include "string-util.h"
 #include "strv.h"
@@ -779,9 +781,10 @@ static int enforce_groups(const ExecContext *context, const char *username, gid_
                         k++;
                 }
 
-                if (setgroups(k, gids) < 0) {
+                r = maybe_setgroups(k, gids);
+                if (r < 0) {
                         free(gids);
-                        return -errno;
+                        return r;
                 }
 
                 free(gids);
@@ -835,14 +838,19 @@ static int null_conv(
         return PAM_CONV_ERR;
 }
 
+#endif
+
 static int setup_pam(
                 const char *name,
                 const char *user,
                 uid_t uid,
+                gid_t gid,
                 const char *tty,
                 char ***env,
                 int fds[], unsigned n_fds) {
 
+#ifdef HAVE_PAM
+
         static const struct pam_conv conv = {
                 .conv = null_conv,
                 .appdata_ptr = NULL
@@ -942,8 +950,14 @@ static int setup_pam(
                  * and this will make PR_SET_PDEATHSIG work in most cases.
                  * If this fails, ignore the error - but expect sd-pam threads
                  * to fail to exit normally */
+
+                r = maybe_setgroups(0, NULL);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
+                if (setresgid(gid, gid, gid) < 0)
+                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                 if (setresuid(uid, uid, uid) < 0)
-                        log_error_errno(r, "Error: Failed to setresuid() in sd-pam: %m");
+                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
 
                 (void) ignore_signals(SIGPIPE, -1);
 
@@ -1036,8 +1050,10 @@ fail:
         closelog();
 
         return r;
-}
+#else
+        return 0;
 #endif
+}
 
 static void rename_process_from_path(const char *path) {
         char process_name[11];
@@ -1072,7 +1088,17 @@ static void rename_process_from_path(const char *path) {
 
 #ifdef HAVE_SECCOMP
 
-static int apply_seccomp(const ExecContext *c) {
+static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
+        if (!is_seccomp_available()) {
+                log_open();
+                log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
+                log_close();
+                return true;
+        }
+        return false;
+}
+
+static int apply_seccomp(const Unit* u, const ExecContext *c) {
         uint32_t negative_action, action;
         scmp_filter_ctx *seccomp;
         Iterator i;
@@ -1081,6 +1107,9 @@ static int apply_seccomp(const ExecContext *c) {
 
         assert(c);
 
+        if (skip_seccomp_unavailable(u, "syscall filtering"))
+                return 0;
+
         negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
 
         seccomp = seccomp_init(c->syscall_whitelist ? negative_action : SCMP_ACT_ALLOW);
@@ -1121,13 +1150,16 @@ finish:
         return r;
 }
 
-static int apply_address_families(const ExecContext *c) {
+static int apply_address_families(const Unit* u, const ExecContext *c) {
         scmp_filter_ctx *seccomp;
         Iterator i;
         int r;
 
         assert(c);
 
+        if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
+                return 0;
+
         seccomp = seccomp_init(SCMP_ACT_ALLOW);
         if (!seccomp)
                 return -ENOMEM;
@@ -1242,16 +1274,23 @@ finish:
         return r;
 }
 
-static int apply_memory_deny_write_execute(const ExecContext *c) {
+static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
         scmp_filter_ctx *seccomp;
         int r;
 
         assert(c);
 
+        if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
+                return 0;
+
         seccomp = seccomp_init(SCMP_ACT_ALLOW);
         if (!seccomp)
                 return -ENOMEM;
 
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
         r = seccomp_rule_add(
                         seccomp,
                         SCMP_ACT_ERRNO(EPERM),
@@ -1281,7 +1320,7 @@ finish:
         return r;
 }
 
-static int apply_restrict_realtime(const ExecContext *c) {
+static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
         static const int permitted_policies[] = {
                 SCHED_OTHER,
                 SCHED_BATCH,
@@ -1294,10 +1333,17 @@ static int apply_restrict_realtime(const ExecContext *c) {
 
         assert(c);
 
+        if (skip_seccomp_unavailable(u, "RestrictRealtime="))
+                return 0;
+
         seccomp = seccomp_init(SCMP_ACT_ALLOW);
         if (!seccomp)
                 return -ENOMEM;
 
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
         /* Determine the highest policy constant we want to allow */
         for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
                 if (permitted_policies[i] > max_policy)
@@ -1351,12 +1397,121 @@ finish:
         return r;
 }
 
+static int apply_protect_sysctl(Unit *u, const ExecContext *c) {
+        scmp_filter_ctx *seccomp;
+        int r;
+
+        assert(c);
+
+        /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
+         * let's protect even those systems where this is left on in the kernel. */
+
+        if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
+                return 0;
+
+        seccomp = seccomp_init(SCMP_ACT_ALLOW);
+        if (!seccomp)
+                return -ENOMEM;
+
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_rule_add(
+                        seccomp,
+                        SCMP_ACT_ERRNO(EPERM),
+                        SCMP_SYS(_sysctl),
+                        0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_load(seccomp);
+
+finish:
+        seccomp_release(seccomp);
+        return r;
+}
+
+static int apply_private_devices(Unit *u, const ExecContext *c) {
+        const SystemCallFilterSet *set;
+        scmp_filter_ctx *seccomp;
+        const char *sys;
+        bool syscalls_found = false;
+        int r;
+
+        assert(c);
+
+        /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
+
+        if (skip_seccomp_unavailable(u, "PrivateDevices="))
+                return 0;
+
+        seccomp = seccomp_init(SCMP_ACT_ALLOW);
+        if (!seccomp)
+                return -ENOMEM;
+
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
+        for (set = syscall_filter_sets; set->set_name; set++)
+                if (streq(set->set_name, "@raw-io")) {
+                        syscalls_found = true;
+                        break;
+                }
+
+        /* We should never fail here */
+        if (!syscalls_found) {
+                r = -EOPNOTSUPP;
+                goto finish;
+        }
+
+        NULSTR_FOREACH(sys, set->value) {
+                int id;
+                bool add = true;
+
+#ifndef __NR_s390_pci_mmio_read
+                if (streq(sys, "s390_pci_mmio_read"))
+                        add = false;
+#endif
+#ifndef __NR_s390_pci_mmio_write
+                if (streq(sys, "s390_pci_mmio_write"))
+                        add = false;
+#endif
+
+                if (!add)
+                        continue;
+
+                id = seccomp_syscall_resolve_name(sys);
+
+                r = seccomp_rule_add(
+                                seccomp,
+                                SCMP_ACT_ERRNO(EPERM),
+                                id, 0);
+                if (r < 0)
+                        goto finish;
+        }
+
+        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_load(seccomp);
+
+finish:
+        seccomp_release(seccomp);
+        return r;
+}
+
 #endif
 
 static void do_idle_pipe_dance(int idle_pipe[4]) {
         assert(idle_pipe);
 
-
         idle_pipe[1] = safe_close(idle_pipe[1]);
         idle_pipe[2] = safe_close(idle_pipe[2]);
 
@@ -1383,6 +1538,7 @@ static void do_idle_pipe_dance(int idle_pipe[4]) {
 }
 
 static int build_environment(
+                Unit *u,
                 const ExecContext *c,
                 const ExecParameters *p,
                 unsigned n_fds,
@@ -1397,10 +1553,11 @@ static int build_environment(
         unsigned n_env = 0;
         char *x;
 
+        assert(u);
         assert(c);
         assert(ret);
 
-        our_env = new0(char*, 12);
+        our_env = new0(char*, 14);
         if (!our_env)
                 return -ENOMEM;
 
@@ -1425,7 +1582,7 @@ static int build_environment(
                 our_env[n_env++] = x;
         }
 
-        if (p->watchdog_usec > 0) {
+        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid()) < 0)
                         return -ENOMEM;
                 our_env[n_env++] = x;
@@ -1435,6 +1592,16 @@ static int build_environment(
                 our_env[n_env++] = x;
         }
 
+        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
+         * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
+         * check the database directly. */
+        if (unit_has_name(u, SPECIAL_DBUS_SERVICE)) {
+                x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
+                if (!x)
+                        return -ENOMEM;
+                our_env[n_env++] = x;
+        }
+
         if (home) {
                 x = strappend("HOME=", home);
                 if (!x)
@@ -1461,6 +1628,13 @@ static int build_environment(
                 our_env[n_env++] = x;
         }
 
+        if (!sd_id128_is_null(u->invocation_id)) {
+                if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
+                        return -ENOMEM;
+
+                our_env[n_env++] = x;
+        }
+
         if (exec_context_needs_term(c)) {
                 const char *tty_path, *term = NULL;
 
@@ -1546,12 +1720,272 @@ static bool exec_needs_mount_namespace(
 
         if (context->private_devices ||
             context->protect_system != PROTECT_SYSTEM_NO ||
-            context->protect_home != PROTECT_HOME_NO)
+            context->protect_home != PROTECT_HOME_NO ||
+            context->protect_kernel_tunables ||
+            context->protect_control_groups)
                 return true;
 
         return false;
 }
 
+static int setup_private_users(uid_t uid, gid_t gid) {
+        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
+        _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
+        _cleanup_close_ int unshare_ready_fd = -1;
+        _cleanup_(sigkill_waitp) pid_t pid = 0;
+        uint64_t c = 1;
+        siginfo_t si;
+        ssize_t n;
+        int r;
+
+        /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
+         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
+         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
+         * which waits for the parent to create the new user namespace while staying in the original namespace. The
+         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
+         * continues execution normally. */
+
+        if (uid != 0 && uid_is_valid(uid))
+                asprintf(&uid_map,
+                         "0 0 1\n"                      /* Map root → root */
+                         UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
+                         uid, uid);                     /* The case where the above is the same */
+        else
+                uid_map = strdup("0 0 1\n");
+        if (!uid_map)
+                return -ENOMEM;
+
+        if (gid != 0 && gid_is_valid(gid))
+                asprintf(&gid_map,
+                         "0 0 1\n"                      /* Map root → root */
+                         GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
+                         gid, gid);
+        else
+                gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
+        if (!gid_map)
+                return -ENOMEM;
+
+        /* Create a communication channel so that the parent can tell the child when it finished creating the user
+         * namespace. */
+        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
+        if (unshare_ready_fd < 0)
+                return -errno;
+
+        /* Create a communication channel so that the child can tell the parent a proper error code in case it
+         * failed. */
+        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
+                return -errno;
+
+        pid = fork();
+        if (pid < 0)
+                return -errno;
+
+        if (pid == 0) {
+                _cleanup_close_ int fd = -1;
+                const char *a;
+                pid_t ppid;
+
+                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
+                 * here, after the parent opened its own user namespace. */
+
+                ppid = getppid();
+                errno_pipe[0] = safe_close(errno_pipe[0]);
+
+                /* Wait until the parent unshared the user namespace */
+                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
+                        r = -errno;
+                        goto child_fail;
+                }
+
+                /* Disable the setgroups() system call in the child user namespace, for good. */
+                a = procfs_file_alloca(ppid, "setgroups");
+                fd = open(a, O_WRONLY|O_CLOEXEC);
+                if (fd < 0) {
+                        if (errno != ENOENT) {
+                                r = -errno;
+                                goto child_fail;
+                        }
+
+                        /* If the file is missing the kernel is too old, let's continue anyway. */
+                } else {
+                        if (write(fd, "deny\n", 5) < 0) {
+                                r = -errno;
+                                goto child_fail;
+                        }
+
+                        fd = safe_close(fd);
+                }
+
+                /* First write the GID map */
+                a = procfs_file_alloca(ppid, "gid_map");
+                fd = open(a, O_WRONLY|O_CLOEXEC);
+                if (fd < 0) {
+                        r = -errno;
+                        goto child_fail;
+                }
+                if (write(fd, gid_map, strlen(gid_map)) < 0) {
+                        r = -errno;
+                        goto child_fail;
+                }
+                fd = safe_close(fd);
+
+                /* The write the UID map */
+                a = procfs_file_alloca(ppid, "uid_map");
+                fd = open(a, O_WRONLY|O_CLOEXEC);
+                if (fd < 0) {
+                        r = -errno;
+                        goto child_fail;
+                }
+                if (write(fd, uid_map, strlen(uid_map)) < 0) {
+                        r = -errno;
+                        goto child_fail;
+                }
+
+                _exit(EXIT_SUCCESS);
+
+        child_fail:
+                (void) write(errno_pipe[1], &r, sizeof(r));
+                _exit(EXIT_FAILURE);
+        }
+
+        errno_pipe[1] = safe_close(errno_pipe[1]);
+
+        if (unshare(CLONE_NEWUSER) < 0)
+                return -errno;
+
+        /* Let the child know that the namespace is ready now */
+        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
+                return -errno;
+
+        /* Try to read an error code from the child */
+        n = read(errno_pipe[0], &r, sizeof(r));
+        if (n < 0)
+                return -errno;
+        if (n == sizeof(r)) { /* an error code was sent to us */
+                if (r < 0)
+                        return r;
+                return -EIO;
+        }
+        if (n != 0) /* on success we should have read 0 bytes */
+                return -EIO;
+
+        r = wait_for_terminate(pid, &si);
+        if (r < 0)
+                return r;
+        pid = 0;
+
+        /* If something strange happened with the child, let's consider this fatal, too */
+        if (si.si_code != CLD_EXITED || si.si_status != 0)
+                return -EIO;
+
+        return 0;
+}
+
+static int setup_runtime_directory(
+                const ExecContext *context,
+                const ExecParameters *params,
+                uid_t uid,
+                gid_t gid) {
+
+        char **rt;
+        int r;
+
+        assert(context);
+        assert(params);
+
+        STRV_FOREACH(rt, context->runtime_directory) {
+                _cleanup_free_ char *p;
+
+                p = strjoin(params->runtime_prefix, "/", *rt, NULL);
+                if (!p)
+                        return -ENOMEM;
+
+                r = mkdir_p_label(p, context->runtime_directory_mode);
+                if (r < 0)
+                        return r;
+
+                r = chmod_and_chown(p, context->runtime_directory_mode, uid, gid);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static int setup_smack(
+                const ExecContext *context,
+                const ExecCommand *command) {
+
+#ifdef HAVE_SMACK
+        int r;
+
+        assert(context);
+        assert(command);
+
+        if (!mac_smack_use())
+                return 0;
+
+        if (context->smack_process_label) {
+                r = mac_smack_apply_pid(0, context->smack_process_label);
+                if (r < 0)
+                        return r;
+        }
+#ifdef SMACK_DEFAULT_PROCESS_LABEL
+        else {
+                _cleanup_free_ char *exec_label = NULL;
+
+                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
+                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
+                        return r;
+
+                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
+                if (r < 0)
+                        return r;
+        }
+#endif
+#endif
+
+        return 0;
+}
+
+static int compile_read_write_paths(
+                const ExecContext *context,
+                const ExecParameters *params,
+                char ***ret) {
+
+        _cleanup_strv_free_ char **l = NULL;
+        char **rt;
+
+        /* Compile the list of writable paths. This is the combination of the explicitly configured paths, plus all
+         * runtime directories. */
+
+        if (strv_isempty(context->read_write_paths) &&
+            strv_isempty(context->runtime_directory)) {
+                *ret = NULL; /* NOP if neither is set */
+                return 0;
+        }
+
+        l = strv_copy(context->read_write_paths);
+        if (!l)
+                return -ENOMEM;
+
+        STRV_FOREACH(rt, context->runtime_directory) {
+                char *s;
+
+                s = strjoin(params->runtime_prefix, "/", *rt, NULL);
+                if (!s)
+                        return -ENOMEM;
+
+                if (strv_consume(&l, s) < 0)
+                        return -ENOMEM;
+        }
+
+        *ret = l;
+        l = NULL;
+
+        return 0;
+}
+
 static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
         assert(array);
         assert(n);
@@ -1569,11 +2003,12 @@ static int close_remaining_fds(
                 const ExecParameters *params,
                 ExecRuntime *runtime,
                 DynamicCreds *dcreds,
+                int user_lookup_fd,
                 int socket_fd,
                 int *fds, unsigned n_fds) {
 
         unsigned n_dont_close = 0;
-        int dont_close[n_fds + 11];
+        int dont_close[n_fds + 12];
 
         assert(params);
 
@@ -1601,9 +2036,71 @@ static int close_remaining_fds(
                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
         }
 
+        if (user_lookup_fd >= 0)
+                dont_close[n_dont_close++] = user_lookup_fd;
+
         return close_all_fds(dont_close, n_dont_close);
 }
 
+static bool context_has_address_families(const ExecContext *c) {
+        assert(c);
+
+        return c->address_families_whitelist ||
+                !set_isempty(c->address_families);
+}
+
+static bool context_has_syscall_filters(const ExecContext *c) {
+        assert(c);
+
+        return c->syscall_whitelist ||
+                !set_isempty(c->syscall_filter) ||
+                !set_isempty(c->syscall_archs);
+}
+
+static bool context_has_no_new_privileges(const ExecContext *c) {
+        assert(c);
+
+        if (c->no_new_privileges)
+                return true;
+
+        if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
+                return false;
+
+        return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */
+                c->memory_deny_write_execute ||
+                c->restrict_realtime ||
+                c->protect_kernel_tunables ||
+                context_has_syscall_filters(c);
+}
+
+static int send_user_lookup(
+                Unit *unit,
+                int user_lookup_fd,
+                uid_t uid,
+                gid_t gid) {
+
+        assert(unit);
+
+        /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
+         * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
+         * specified. */
+
+        if (user_lookup_fd < 0)
+                return 0;
+
+        if (!uid_is_valid(uid) && !gid_is_valid(gid))
+                return 0;
+
+        if (writev(user_lookup_fd,
+               (struct iovec[]) {
+                           { .iov_base = &uid, .iov_len = sizeof(uid) },
+                           { .iov_base = &gid, .iov_len = sizeof(gid) },
+                           { .iov_base = unit->id, .iov_len = strlen(unit->id) }}, 3) < 0)
+                return -errno;
+
+        return 0;
+}
+
 static int exec_child(
                 Unit *unit,
                 ExecCommand *command,
@@ -1615,6 +2112,7 @@ static int exec_child(
                 int socket_fd,
                 int *fds, unsigned n_fds,
                 char **files_env,
+                int user_lookup_fd,
                 int *exit_status) {
 
         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
@@ -1661,7 +2159,7 @@ static int exec_child(
 
         log_forget_fds();
 
-        r = close_remaining_fds(params, runtime, dcreds, socket_fd, fds, n_fds);
+        r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
         if (r < 0) {
                 *exit_status = EXIT_FDS;
                 return r;
@@ -1708,7 +2206,7 @@ static int exec_child(
                         return r;
                 }
 
-                if (uid == UID_INVALID || gid == GID_INVALID) {
+                if (!uid_is_valid(uid) || !gid_is_valid(gid)) {
                         *exit_status = EXIT_USER;
                         return -ESRCH;
                 }
@@ -1719,22 +2217,14 @@ static int exec_child(
         } else {
                 if (context->user) {
                         username = context->user;
-                        r = get_user_creds(&username, &uid, &gid, &home, &shell);
+                        r = get_user_creds_clean(&username, &uid, &gid, &home, &shell);
                         if (r < 0) {
                                 *exit_status = EXIT_USER;
                                 return r;
                         }
 
-                        /* Don't set $HOME or $SHELL if they are are not particularly enlightening anyway. */
-                        if (isempty(home) || path_equal(home, "/"))
-                                home = NULL;
-
-                        if (isempty(shell) || PATH_IN_SET(shell,
-                                                          "/bin/nologin",
-                                                          "/sbin/nologin",
-                                                          "/usr/bin/nologin",
-                                                          "/usr/sbin/nologin"))
-                                shell = NULL;
+                        /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
+                         * (i.e. are "/" or "/bin/nologin"). */
                 }
 
                 if (context->group) {
@@ -1748,6 +2238,14 @@ static int exec_child(
                 }
         }
 
+        r = send_user_lookup(unit, user_lookup_fd, uid, gid);
+        if (r < 0) {
+                *exit_status = EXIT_USER;
+                return r;
+        }
+
+        user_lookup_fd = safe_close(user_lookup_fd);
+
         /* If a socket is connected to STDIN/STDOUT/STDERR, we
          * must sure to drop O_NONBLOCK */
         if (socket_fd >= 0)
@@ -1879,32 +2377,15 @@ static int exec_child(
         }
 
         if (!strv_isempty(context->runtime_directory) && params->runtime_prefix) {
-                char **rt;
-
-                STRV_FOREACH(rt, context->runtime_directory) {
-                        _cleanup_free_ char *p;
-
-                        p = strjoin(params->runtime_prefix, "/", *rt, NULL);
-                        if (!p) {
-                                *exit_status = EXIT_RUNTIME_DIRECTORY;
-                                return -ENOMEM;
-                        }
-
-                        r = mkdir_p_label(p, context->runtime_directory_mode);
-                        if (r < 0) {
-                                *exit_status = EXIT_RUNTIME_DIRECTORY;
-                                return r;
-                        }
-
-                        r = chmod_and_chown(p, context->runtime_directory_mode, uid, gid);
-                        if (r < 0) {
-                                *exit_status = EXIT_RUNTIME_DIRECTORY;
-                                return r;
-                        }
+                r = setup_runtime_directory(context, params, uid, gid);
+                if (r < 0) {
+                        *exit_status = EXIT_RUNTIME_DIRECTORY;
+                        return r;
                 }
         }
 
         r = build_environment(
+                        unit,
                         context,
                         params,
                         n_fds,
@@ -1938,49 +2419,22 @@ static int exec_child(
         }
         accum_env = strv_env_clean(accum_env);
 
-        umask(context->umask);
+        (void) umask(context->umask);
 
         if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
-                r = enforce_groups(context, username, gid);
+                r = setup_smack(context, command);
                 if (r < 0) {
-                        *exit_status = EXIT_GROUP;
+                        *exit_status = EXIT_SMACK_PROCESS_LABEL;
                         return r;
                 }
-#ifdef HAVE_SMACK
-                if (context->smack_process_label) {
-                        r = mac_smack_apply_pid(0, context->smack_process_label);
-                        if (r < 0) {
-                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
-                                return r;
-                        }
-                }
-#ifdef SMACK_DEFAULT_PROCESS_LABEL
-                else {
-                        _cleanup_free_ char *exec_label = NULL;
-
-                        r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
-                        if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP) {
-                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
-                                return r;
-                        }
 
-                        r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
-                        if (r < 0) {
-                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
-                                return r;
-                        }
-                }
-#endif
-#endif
-#ifdef HAVE_PAM
                 if (context->pam_name && username) {
-                        r = setup_pam(context->pam_name, username, uid, context->tty_path, &accum_env, fds, n_fds);
+                        r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
                         if (r < 0) {
                                 *exit_status = EXIT_PAM;
                                 return r;
                         }
                 }
-#endif
         }
 
         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
@@ -1992,8 +2446,8 @@ static int exec_child(
         }
 
         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
-
         if (needs_mount_namespace) {
+                _cleanup_free_ char **rw = NULL;
                 char *tmp = NULL, *var = NULL;
 
                 /* The runtime struct only contains the parent
@@ -2009,14 +2463,22 @@ static int exec_child(
                                 var = strjoina(runtime->var_tmp_dir, "/tmp");
                 }
 
+                r = compile_read_write_paths(context, params, &rw);
+                if (r < 0) {
+                        *exit_status = EXIT_NAMESPACE;
+                        return r;
+                }
+
                 r = setup_namespace(
                                 (params->flags & EXEC_APPLY_CHROOT) ? context->root_directory : NULL,
-                                context->read_write_paths,
+                                rw,
                                 context->read_only_paths,
                                 context->inaccessible_paths,
                                 tmp,
                                 var,
                                 context->private_devices,
+                                context->protect_kernel_tunables,
+                                context->protect_control_groups,
                                 context->protect_home,
                                 context->protect_system,
                                 context->mount_flags);
@@ -2034,6 +2496,14 @@ static int exec_child(
                 }
         }
 
+        if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
+                r = enforce_groups(context, username, gid);
+                if (r < 0) {
+                        *exit_status = EXIT_GROUP;
+                        return r;
+                }
+        }
+
         if (context->working_directory_home)
                 wd = home;
         else if (context->working_directory)
@@ -2079,6 +2549,14 @@ static int exec_child(
         }
 #endif
 
+        if ((params->flags & EXEC_APPLY_PERMISSIONS) && context->private_users) {
+                r = setup_private_users(uid, gid);
+                if (r < 0) {
+                        *exit_status = EXIT_USER;
+                        return r;
+                }
+        }
+
         /* We repeat the fd closing here, to make sure that
          * nothing is leaked from the PAM modules. Note that
          * we are more aggressive this time since socket_fd
@@ -2097,11 +2575,6 @@ static int exec_child(
 
         if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
 
-                bool use_address_families = context->address_families_whitelist ||
-                        !set_isempty(context->address_families);
-                bool use_syscall_filter = context->syscall_whitelist ||
-                        !set_isempty(context->syscall_filter) ||
-                        !set_isempty(context->syscall_archs);
                 int secure_bits = context->secure_bits;
 
                 for (i = 0; i < _RLIMIT_MAX; i++) {
@@ -2178,16 +2651,15 @@ static int exec_child(
                                 return -errno;
                         }
 
-                if (context->no_new_privileges ||
-                    (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter)))
+                if (context_has_no_new_privileges(context))
                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
                                 return -errno;
                         }
 
 #ifdef HAVE_SECCOMP
-                if (use_address_families) {
-                        r = apply_address_families(context);
+                if (context_has_address_families(context)) {
+                        r = apply_address_families(unit, context);
                         if (r < 0) {
                                 *exit_status = EXIT_ADDRESS_FAMILIES;
                                 return r;
@@ -2195,7 +2667,7 @@ static int exec_child(
                 }
 
                 if (context->memory_deny_write_execute) {
-                        r = apply_memory_deny_write_execute(context);
+                        r = apply_memory_deny_write_execute(unit, context);
                         if (r < 0) {
                                 *exit_status = EXIT_SECCOMP;
                                 return r;
@@ -2203,15 +2675,31 @@ static int exec_child(
                 }
 
                 if (context->restrict_realtime) {
-                        r = apply_restrict_realtime(context);
+                        r = apply_restrict_realtime(unit, context);
                         if (r < 0) {
                                 *exit_status = EXIT_SECCOMP;
                                 return r;
                         }
                 }
 
-                if (use_syscall_filter) {
-                        r = apply_seccomp(context);
+                if (context->protect_kernel_tunables) {
+                        r = apply_protect_sysctl(unit, context);
+                        if (r < 0) {
+                                *exit_status = EXIT_SECCOMP;
+                                return r;
+                        }
+                }
+
+                if (context->private_devices) {
+                        r = apply_private_devices(unit, context);
+                        if (r < 0) {
+                                *exit_status = EXIT_SECCOMP;
+                                return r;
+                        }
+                }
+
+                if (context_has_syscall_filters(context)) {
+                        r = apply_seccomp(unit, context);
                         if (r < 0) {
                                 *exit_status = EXIT_SECCOMP;
                                 return r;
@@ -2339,6 +2827,7 @@ int exec_spawn(Unit *unit,
                                socket_fd,
                                fds, n_fds,
                                files_env,
+                               unit->manager->user_lookup_fds[1],
                                &exit_status);
                 if (r < 0) {
                         log_open();
@@ -2640,8 +3129,11 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 "%sRootDirectory: %s\n"
                 "%sNonBlocking: %s\n"
                 "%sPrivateTmp: %s\n"
-                "%sPrivateNetwork: %s\n"
                 "%sPrivateDevices: %s\n"
+                "%sProtectKernelTunables: %s\n"
+                "%sProtectControlGroups: %s\n"
+                "%sPrivateNetwork: %s\n"
+                "%sPrivateUsers: %s\n"
                 "%sProtectHome: %s\n"
                 "%sProtectSystem: %s\n"
                 "%sIgnoreSIGPIPE: %s\n"
@@ -2652,8 +3144,11 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 prefix, c->root_directory ? c->root_directory : "/",
                 prefix, yes_no(c->non_blocking),
                 prefix, yes_no(c->private_tmp),
-                prefix, yes_no(c->private_network),
                 prefix, yes_no(c->private_devices),
+                prefix, yes_no(c->protect_kernel_tunables),
+                prefix, yes_no(c->protect_control_groups),
+                prefix, yes_no(c->private_network),
+                prefix, yes_no(c->private_users),
                 prefix, protect_home_to_string(c->protect_home),
                 prefix, protect_system_to_string(c->protect_system),
                 prefix, yes_no(c->ignore_sigpipe),