]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/core/execute.c
tree-wide: use mfree more
[thirdparty/systemd.git] / src / core / execute.c
index cec3b3cf40bfd268074c68a97f3134bbbefec3db..59c2bd0dd2aeb00818d02aaa6cee21a7abc5a252 100644 (file)
@@ -91,6 +91,7 @@
 #include "selinux-util.h"
 #include "signal-util.h"
 #include "smack-util.h"
+#include "special.h"
 #include "string-table.h"
 #include "string-util.h"
 #include "strv.h"
@@ -220,12 +221,36 @@ static void exec_context_tty_reset(const ExecContext *context, const ExecParamet
                 (void) vt_disallocate(path);
 }
 
+static bool is_terminal_input(ExecInput i) {
+        return IN_SET(i,
+                      EXEC_INPUT_TTY,
+                      EXEC_INPUT_TTY_FORCE,
+                      EXEC_INPUT_TTY_FAIL);
+}
+
 static bool is_terminal_output(ExecOutput o) {
-        return
-                o == EXEC_OUTPUT_TTY ||
-                o == EXEC_OUTPUT_SYSLOG_AND_CONSOLE ||
-                o == EXEC_OUTPUT_KMSG_AND_CONSOLE ||
-                o == EXEC_OUTPUT_JOURNAL_AND_CONSOLE;
+        return IN_SET(o,
+                      EXEC_OUTPUT_TTY,
+                      EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
+                      EXEC_OUTPUT_KMSG_AND_CONSOLE,
+                      EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
+}
+
+static bool exec_context_needs_term(const ExecContext *c) {
+        assert(c);
+
+        /* Return true if the execution context suggests we should set $TERM to something useful. */
+
+        if (is_terminal_input(c->std_input))
+                return true;
+
+        if (is_terminal_output(c->std_output))
+                return true;
+
+        if (is_terminal_output(c->std_error))
+                return true;
+
+        return !!c->tty_path;
 }
 
 static int open_null_as(int flags, int nfd) {
@@ -364,13 +389,6 @@ static int open_terminal_as(const char *path, mode_t mode, int nfd) {
         return r;
 }
 
-static bool is_terminal_input(ExecInput i) {
-        return
-                i == EXEC_INPUT_TTY ||
-                i == EXEC_INPUT_TTY_FORCE ||
-                i == EXEC_INPUT_TTY_FAIL;
-}
-
 static int fixup_input(ExecInput std_input, int socket_fd, bool apply_tty_stdin) {
 
         if (is_terminal_input(std_input) && !apply_tty_stdin)
@@ -411,7 +429,7 @@ static int setup_input(
                 return STDIN_FILENO;
         }
 
-        i = fixup_input(context->std_input, socket_fd, params->apply_tty_stdin);
+        i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 
         switch (i) {
 
@@ -486,7 +504,7 @@ static int setup_output(
                 return STDERR_FILENO;
         }
 
-        i = fixup_input(context->std_input, socket_fd, params->apply_tty_stdin);
+        i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
         o = fixup_output(context->std_output, socket_fd);
 
         if (fileno == STDERR_FILENO) {
@@ -763,9 +781,10 @@ static int enforce_groups(const ExecContext *context, const char *username, gid_
                         k++;
                 }
 
-                if (setgroups(k, gids) < 0) {
+                r = maybe_setgroups(k, gids);
+                if (r < 0) {
                         free(gids);
-                        return -errno;
+                        return r;
                 }
 
                 free(gids);
@@ -819,14 +838,19 @@ static int null_conv(
         return PAM_CONV_ERR;
 }
 
+#endif
+
 static int setup_pam(
                 const char *name,
                 const char *user,
                 uid_t uid,
+                gid_t gid,
                 const char *tty,
                 char ***env,
                 int fds[], unsigned n_fds) {
 
+#ifdef HAVE_PAM
+
         static const struct pam_conv conv = {
                 .conv = null_conv,
                 .appdata_ptr = NULL
@@ -926,8 +950,14 @@ static int setup_pam(
                  * and this will make PR_SET_PDEATHSIG work in most cases.
                  * If this fails, ignore the error - but expect sd-pam threads
                  * to fail to exit normally */
+
+                r = maybe_setgroups(0, NULL);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
+                if (setresgid(gid, gid, gid) < 0)
+                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                 if (setresuid(uid, uid, uid) < 0)
-                        log_error_errno(r, "Error: Failed to setresuid() in sd-pam: %m");
+                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
 
                 (void) ignore_signals(SIGPIPE, -1);
 
@@ -1020,8 +1050,10 @@ fail:
         closelog();
 
         return r;
-}
+#else
+        return 0;
 #endif
+}
 
 static void rename_process_from_path(const char *path) {
         char process_name[11];
@@ -1056,7 +1088,17 @@ static void rename_process_from_path(const char *path) {
 
 #ifdef HAVE_SECCOMP
 
-static int apply_seccomp(const ExecContext *c) {
+static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
+        if (!is_seccomp_available()) {
+                log_open();
+                log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
+                log_close();
+                return true;
+        }
+        return false;
+}
+
+static int apply_seccomp(const Unit* u, const ExecContext *c) {
         uint32_t negative_action, action;
         scmp_filter_ctx *seccomp;
         Iterator i;
@@ -1065,6 +1107,9 @@ static int apply_seccomp(const ExecContext *c) {
 
         assert(c);
 
+        if (skip_seccomp_unavailable(u, "syscall filtering"))
+                return 0;
+
         negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
 
         seccomp = seccomp_init(c->syscall_whitelist ? negative_action : SCMP_ACT_ALLOW);
@@ -1105,13 +1150,16 @@ finish:
         return r;
 }
 
-static int apply_address_families(const ExecContext *c) {
+static int apply_address_families(const Unit* u, const ExecContext *c) {
         scmp_filter_ctx *seccomp;
         Iterator i;
         int r;
 
         assert(c);
 
+        if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
+                return 0;
+
         seccomp = seccomp_init(SCMP_ACT_ALLOW);
         if (!seccomp)
                 return -ENOMEM;
@@ -1226,16 +1274,23 @@ finish:
         return r;
 }
 
-static int apply_memory_deny_write_execute(const ExecContext *c) {
+static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
         scmp_filter_ctx *seccomp;
         int r;
 
         assert(c);
 
+        if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
+                return 0;
+
         seccomp = seccomp_init(SCMP_ACT_ALLOW);
         if (!seccomp)
                 return -ENOMEM;
 
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
         r = seccomp_rule_add(
                         seccomp,
                         SCMP_ACT_ERRNO(EPERM),
@@ -1265,7 +1320,7 @@ finish:
         return r;
 }
 
-static int apply_restrict_realtime(const ExecContext *c) {
+static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
         static const int permitted_policies[] = {
                 SCHED_OTHER,
                 SCHED_BATCH,
@@ -1278,10 +1333,17 @@ static int apply_restrict_realtime(const ExecContext *c) {
 
         assert(c);
 
+        if (skip_seccomp_unavailable(u, "RestrictRealtime="))
+                return 0;
+
         seccomp = seccomp_init(SCMP_ACT_ALLOW);
         if (!seccomp)
                 return -ENOMEM;
 
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
         /* Determine the highest policy constant we want to allow */
         for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
                 if (permitted_policies[i] > max_policy)
@@ -1335,12 +1397,165 @@ finish:
         return r;
 }
 
+static int apply_protect_sysctl(Unit *u, const ExecContext *c) {
+        scmp_filter_ctx *seccomp;
+        int r;
+
+        assert(c);
+
+        /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
+         * let's protect even those systems where this is left on in the kernel. */
+
+        if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
+                return 0;
+
+        seccomp = seccomp_init(SCMP_ACT_ALLOW);
+        if (!seccomp)
+                return -ENOMEM;
+
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_rule_add(
+                        seccomp,
+                        SCMP_ACT_ERRNO(EPERM),
+                        SCMP_SYS(_sysctl),
+                        0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_load(seccomp);
+
+finish:
+        seccomp_release(seccomp);
+        return r;
+}
+
+static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) {
+        static const int module_syscalls[] = {
+                SCMP_SYS(delete_module),
+                SCMP_SYS(finit_module),
+                SCMP_SYS(init_module),
+        };
+
+        scmp_filter_ctx *seccomp;
+        unsigned i;
+        int r;
+
+        assert(c);
+
+        /* Turn of module syscalls on ProtectKernelModules=yes */
+
+        if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
+                return 0;
+
+        seccomp = seccomp_init(SCMP_ACT_ALLOW);
+        if (!seccomp)
+                return -ENOMEM;
+
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
+        for (i = 0; i < ELEMENTSOF(module_syscalls); i++) {
+                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM),
+                                     module_syscalls[i], 0);
+                if (r < 0)
+                        goto finish;
+        }
+
+        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_load(seccomp);
+
+finish:
+        seccomp_release(seccomp);
+        return r;
+}
+
+static int apply_private_devices(Unit *u, const ExecContext *c) {
+        const SystemCallFilterSet *set;
+        scmp_filter_ctx *seccomp;
+        const char *sys;
+        bool syscalls_found = false;
+        int r;
+
+        assert(c);
+
+        /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
+
+        if (skip_seccomp_unavailable(u, "PrivateDevices="))
+                return 0;
+
+        seccomp = seccomp_init(SCMP_ACT_ALLOW);
+        if (!seccomp)
+                return -ENOMEM;
+
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
+        for (set = syscall_filter_sets; set->set_name; set++)
+                if (streq(set->set_name, "@raw-io")) {
+                        syscalls_found = true;
+                        break;
+                }
+
+        /* We should never fail here */
+        if (!syscalls_found) {
+                r = -EOPNOTSUPP;
+                goto finish;
+        }
+
+        NULSTR_FOREACH(sys, set->value) {
+                int id;
+                bool add = true;
+
+#ifndef __NR_s390_pci_mmio_read
+                if (streq(sys, "s390_pci_mmio_read"))
+                        add = false;
+#endif
+#ifndef __NR_s390_pci_mmio_write
+                if (streq(sys, "s390_pci_mmio_write"))
+                        add = false;
+#endif
+
+                if (!add)
+                        continue;
+
+                id = seccomp_syscall_resolve_name(sys);
+
+                r = seccomp_rule_add(
+                                seccomp,
+                                SCMP_ACT_ERRNO(EPERM),
+                                id, 0);
+                if (r < 0)
+                        goto finish;
+        }
+
+        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_load(seccomp);
+
+finish:
+        seccomp_release(seccomp);
+        return r;
+}
+
 #endif
 
 static void do_idle_pipe_dance(int idle_pipe[4]) {
         assert(idle_pipe);
 
-
         idle_pipe[1] = safe_close(idle_pipe[1]);
         idle_pipe[2] = safe_close(idle_pipe[2]);
 
@@ -1367,6 +1582,7 @@ static void do_idle_pipe_dance(int idle_pipe[4]) {
 }
 
 static int build_environment(
+                Unit *u,
                 const ExecContext *c,
                 const ExecParameters *p,
                 unsigned n_fds,
@@ -1381,10 +1597,11 @@ static int build_environment(
         unsigned n_env = 0;
         char *x;
 
+        assert(u);
         assert(c);
         assert(ret);
 
-        our_env = new0(char*, 12);
+        our_env = new0(char*, 14);
         if (!our_env)
                 return -ENOMEM;
 
@@ -1409,7 +1626,7 @@ static int build_environment(
                 our_env[n_env++] = x;
         }
 
-        if (p->watchdog_usec > 0) {
+        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid()) < 0)
                         return -ENOMEM;
                 our_env[n_env++] = x;
@@ -1419,6 +1636,16 @@ static int build_environment(
                 our_env[n_env++] = x;
         }
 
+        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
+         * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
+         * check the database directly. */
+        if (unit_has_name(u, SPECIAL_DBUS_SERVICE)) {
+                x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
+                if (!x)
+                        return -ENOMEM;
+                our_env[n_env++] = x;
+        }
+
         if (home) {
                 x = strappend("HOME=", home);
                 if (!x)
@@ -1445,12 +1672,28 @@ static int build_environment(
                 our_env[n_env++] = x;
         }
 
-        if (is_terminal_input(c->std_input) ||
-            c->std_output == EXEC_OUTPUT_TTY ||
-            c->std_error == EXEC_OUTPUT_TTY ||
-            c->tty_path) {
+        if (!sd_id128_is_null(u->invocation_id)) {
+                if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
+                        return -ENOMEM;
+
+                our_env[n_env++] = x;
+        }
+
+        if (exec_context_needs_term(c)) {
+                const char *tty_path, *term = NULL;
+
+                tty_path = exec_context_tty_path(c);
+
+                /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
+                 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
+                 * passes to PID 1 ends up all the way in the console login shown. */
+
+                if (path_equal(tty_path, "/dev/console") && getppid() == 1)
+                        term = getenv("TERM");
+                if (!term)
+                        term = default_term_for_tty(tty_path);
 
-                x = strdup(default_term_for_tty(exec_context_tty_path(c)));
+                x = strappend("TERM=", term);
                 if (!x)
                         return -ENOMEM;
                 our_env[n_env++] = x;
@@ -1521,7 +1764,10 @@ static bool exec_needs_mount_namespace(
 
         if (context->private_devices ||
             context->protect_system != PROTECT_SYSTEM_NO ||
-            context->protect_home != PROTECT_HOME_NO)
+            context->protect_home != PROTECT_HOME_NO ||
+            context->protect_kernel_tunables ||
+            context->protect_kernel_modules ||
+            context->protect_control_groups)
                 return true;
 
         return false;
@@ -1680,6 +1926,111 @@ static int setup_private_users(uid_t uid, gid_t gid) {
         return 0;
 }
 
+static int setup_runtime_directory(
+                const ExecContext *context,
+                const ExecParameters *params,
+                uid_t uid,
+                gid_t gid) {
+
+        char **rt;
+        int r;
+
+        assert(context);
+        assert(params);
+
+        STRV_FOREACH(rt, context->runtime_directory) {
+                _cleanup_free_ char *p;
+
+                p = strjoin(params->runtime_prefix, "/", *rt, NULL);
+                if (!p)
+                        return -ENOMEM;
+
+                r = mkdir_p_label(p, context->runtime_directory_mode);
+                if (r < 0)
+                        return r;
+
+                r = chmod_and_chown(p, context->runtime_directory_mode, uid, gid);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static int setup_smack(
+                const ExecContext *context,
+                const ExecCommand *command) {
+
+#ifdef HAVE_SMACK
+        int r;
+
+        assert(context);
+        assert(command);
+
+        if (!mac_smack_use())
+                return 0;
+
+        if (context->smack_process_label) {
+                r = mac_smack_apply_pid(0, context->smack_process_label);
+                if (r < 0)
+                        return r;
+        }
+#ifdef SMACK_DEFAULT_PROCESS_LABEL
+        else {
+                _cleanup_free_ char *exec_label = NULL;
+
+                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
+                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
+                        return r;
+
+                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
+                if (r < 0)
+                        return r;
+        }
+#endif
+#endif
+
+        return 0;
+}
+
+static int compile_read_write_paths(
+                const ExecContext *context,
+                const ExecParameters *params,
+                char ***ret) {
+
+        _cleanup_strv_free_ char **l = NULL;
+        char **rt;
+
+        /* Compile the list of writable paths. This is the combination of the explicitly configured paths, plus all
+         * runtime directories. */
+
+        if (strv_isempty(context->read_write_paths) &&
+            strv_isempty(context->runtime_directory)) {
+                *ret = NULL; /* NOP if neither is set */
+                return 0;
+        }
+
+        l = strv_copy(context->read_write_paths);
+        if (!l)
+                return -ENOMEM;
+
+        STRV_FOREACH(rt, context->runtime_directory) {
+                char *s;
+
+                s = strjoin(params->runtime_prefix, "/", *rt, NULL);
+                if (!s)
+                        return -ENOMEM;
+
+                if (strv_consume(&l, s) < 0)
+                        return -ENOMEM;
+        }
+
+        *ret = l;
+        l = NULL;
+
+        return 0;
+}
+
 static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
         assert(array);
         assert(n);
@@ -1697,11 +2048,12 @@ static int close_remaining_fds(
                 const ExecParameters *params,
                 ExecRuntime *runtime,
                 DynamicCreds *dcreds,
+                int user_lookup_fd,
                 int socket_fd,
                 int *fds, unsigned n_fds) {
 
         unsigned n_dont_close = 0;
-        int dont_close[n_fds + 11];
+        int dont_close[n_fds + 12];
 
         assert(params);
 
@@ -1729,9 +2081,73 @@ static int close_remaining_fds(
                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
         }
 
+        if (user_lookup_fd >= 0)
+                dont_close[n_dont_close++] = user_lookup_fd;
+
         return close_all_fds(dont_close, n_dont_close);
 }
 
+static bool context_has_address_families(const ExecContext *c) {
+        assert(c);
+
+        return c->address_families_whitelist ||
+                !set_isempty(c->address_families);
+}
+
+static bool context_has_syscall_filters(const ExecContext *c) {
+        assert(c);
+
+        return c->syscall_whitelist ||
+                !set_isempty(c->syscall_filter) ||
+                !set_isempty(c->syscall_archs);
+}
+
+static bool context_has_no_new_privileges(const ExecContext *c) {
+        assert(c);
+
+        if (c->no_new_privileges)
+                return true;
+
+        if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
+                return false;
+
+        return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */
+                c->memory_deny_write_execute ||
+                c->restrict_realtime ||
+                c->protect_kernel_tunables ||
+                c->protect_kernel_modules ||
+                c->private_devices ||
+                context_has_syscall_filters(c);
+}
+
+static int send_user_lookup(
+                Unit *unit,
+                int user_lookup_fd,
+                uid_t uid,
+                gid_t gid) {
+
+        assert(unit);
+
+        /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
+         * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
+         * specified. */
+
+        if (user_lookup_fd < 0)
+                return 0;
+
+        if (!uid_is_valid(uid) && !gid_is_valid(gid))
+                return 0;
+
+        if (writev(user_lookup_fd,
+               (struct iovec[]) {
+                           { .iov_base = &uid, .iov_len = sizeof(uid) },
+                           { .iov_base = &gid, .iov_len = sizeof(gid) },
+                           { .iov_base = unit->id, .iov_len = strlen(unit->id) }}, 3) < 0)
+                return -errno;
+
+        return 0;
+}
+
 static int exec_child(
                 Unit *unit,
                 ExecCommand *command,
@@ -1743,6 +2159,7 @@ static int exec_child(
                 int socket_fd,
                 int *fds, unsigned n_fds,
                 char **files_env,
+                int user_lookup_fd,
                 int *exit_status) {
 
         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
@@ -1789,7 +2206,7 @@ static int exec_child(
 
         log_forget_fds();
 
-        r = close_remaining_fds(params, runtime, dcreds, socket_fd, fds, n_fds);
+        r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
         if (r < 0) {
                 *exit_status = EXIT_FDS;
                 return r;
@@ -1803,7 +2220,7 @@ static int exec_child(
 
         exec_context_tty_reset(context, params);
 
-        if (params->confirm_spawn) {
+        if (params->flags & EXEC_CONFIRM_SPAWN) {
                 char response;
 
                 r = ask_for_confirmation(&response, argv);
@@ -1836,7 +2253,7 @@ static int exec_child(
                         return r;
                 }
 
-                if (uid == UID_INVALID || gid == GID_INVALID) {
+                if (!uid_is_valid(uid) || !gid_is_valid(gid)) {
                         *exit_status = EXIT_USER;
                         return -ESRCH;
                 }
@@ -1847,11 +2264,14 @@ static int exec_child(
         } else {
                 if (context->user) {
                         username = context->user;
-                        r = get_user_creds(&username, &uid, &gid, &home, &shell);
+                        r = get_user_creds_clean(&username, &uid, &gid, &home, &shell);
                         if (r < 0) {
                                 *exit_status = EXIT_USER;
                                 return r;
                         }
+
+                        /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
+                         * (i.e. are "/" or "/bin/nologin"). */
                 }
 
                 if (context->group) {
@@ -1865,6 +2285,14 @@ static int exec_child(
                 }
         }
 
+        r = send_user_lookup(unit, user_lookup_fd, uid, gid);
+        if (r < 0) {
+                *exit_status = EXIT_USER;
+                return r;
+        }
+
+        user_lookup_fd = safe_close(user_lookup_fd);
+
         /* If a socket is connected to STDIN/STDOUT/STDERR, we
          * must sure to drop O_NONBLOCK */
         if (socket_fd >= 0)
@@ -1969,7 +2397,7 @@ static int exec_child(
                                       USER_PROCESS,
                                       username ? "root" : context->user);
 
-        if (context->user && is_terminal_input(context->std_input)) {
+        if (context->user) {
                 r = chown_terminal(STDIN_FILENO, uid);
                 if (r < 0) {
                         *exit_status = EXIT_STDIN;
@@ -1996,32 +2424,15 @@ static int exec_child(
         }
 
         if (!strv_isempty(context->runtime_directory) && params->runtime_prefix) {
-                char **rt;
-
-                STRV_FOREACH(rt, context->runtime_directory) {
-                        _cleanup_free_ char *p;
-
-                        p = strjoin(params->runtime_prefix, "/", *rt, NULL);
-                        if (!p) {
-                                *exit_status = EXIT_RUNTIME_DIRECTORY;
-                                return -ENOMEM;
-                        }
-
-                        r = mkdir_p_label(p, context->runtime_directory_mode);
-                        if (r < 0) {
-                                *exit_status = EXIT_RUNTIME_DIRECTORY;
-                                return r;
-                        }
-
-                        r = chmod_and_chown(p, context->runtime_directory_mode, uid, gid);
-                        if (r < 0) {
-                                *exit_status = EXIT_RUNTIME_DIRECTORY;
-                                return r;
-                        }
+                r = setup_runtime_directory(context, params, uid, gid);
+                if (r < 0) {
+                        *exit_status = EXIT_RUNTIME_DIRECTORY;
+                        return r;
                 }
         }
 
         r = build_environment(
+                        unit,
                         context,
                         params,
                         n_fds,
@@ -2055,49 +2466,22 @@ static int exec_child(
         }
         accum_env = strv_env_clean(accum_env);
 
-        umask(context->umask);
+        (void) umask(context->umask);
 
-        if (params->apply_permissions && !command->privileged) {
-                r = enforce_groups(context, username, gid);
+        if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
+                r = setup_smack(context, command);
                 if (r < 0) {
-                        *exit_status = EXIT_GROUP;
+                        *exit_status = EXIT_SMACK_PROCESS_LABEL;
                         return r;
                 }
-#ifdef HAVE_SMACK
-                if (context->smack_process_label) {
-                        r = mac_smack_apply_pid(0, context->smack_process_label);
-                        if (r < 0) {
-                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
-                                return r;
-                        }
-                }
-#ifdef SMACK_DEFAULT_PROCESS_LABEL
-                else {
-                        _cleanup_free_ char *exec_label = NULL;
-
-                        r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
-                        if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP) {
-                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
-                                return r;
-                        }
 
-                        r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
-                        if (r < 0) {
-                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
-                                return r;
-                        }
-                }
-#endif
-#endif
-#ifdef HAVE_PAM
                 if (context->pam_name && username) {
-                        r = setup_pam(context->pam_name, username, uid, context->tty_path, &accum_env, fds, n_fds);
+                        r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
                         if (r < 0) {
                                 *exit_status = EXIT_PAM;
                                 return r;
                         }
                 }
-#endif
         }
 
         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
@@ -2109,9 +2493,15 @@ static int exec_child(
         }
 
         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
-
         if (needs_mount_namespace) {
+                _cleanup_free_ char **rw = NULL;
                 char *tmp = NULL, *var = NULL;
+                NameSpaceInfo ns_info = {
+                        .private_dev = context->private_devices,
+                        .protect_control_groups = context->protect_control_groups,
+                        .protect_kernel_tunables = context->protect_kernel_tunables,
+                        .protect_kernel_modules = context->protect_kernel_modules,
+                };
 
                 /* The runtime struct only contains the parent
                  * of the private /tmp, which is
@@ -2126,14 +2516,20 @@ static int exec_child(
                                 var = strjoina(runtime->var_tmp_dir, "/tmp");
                 }
 
+                r = compile_read_write_paths(context, params, &rw);
+                if (r < 0) {
+                        *exit_status = EXIT_NAMESPACE;
+                        return r;
+                }
+
                 r = setup_namespace(
-                                params->apply_chroot ? context->root_directory : NULL,
-                                context->read_write_paths,
+                                (params->flags & EXEC_APPLY_CHROOT) ? context->root_directory : NULL,
+                                &ns_info,
+                                rw,
                                 context->read_only_paths,
                                 context->inaccessible_paths,
                                 tmp,
                                 var,
-                                context->private_devices,
                                 context->protect_home,
                                 context->protect_system,
                                 context->mount_flags);
@@ -2151,6 +2547,14 @@ static int exec_child(
                 }
         }
 
+        if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
+                r = enforce_groups(context, username, gid);
+                if (r < 0) {
+                        *exit_status = EXIT_GROUP;
+                        return r;
+                }
+        }
+
         if (context->working_directory_home)
                 wd = home;
         else if (context->working_directory)
@@ -2158,7 +2562,7 @@ static int exec_child(
         else
                 wd = "/";
 
-        if (params->apply_chroot) {
+        if (params->flags & EXEC_APPLY_CHROOT) {
                 if (!needs_mount_namespace && context->root_directory)
                         if (chroot(context->root_directory) < 0) {
                                 *exit_status = EXIT_CHROOT;
@@ -2182,7 +2586,12 @@ static int exec_child(
         }
 
 #ifdef HAVE_SELINUX
-        if (params->apply_permissions && mac_selinux_use() && params->selinux_context_net && socket_fd >= 0 && !command->privileged) {
+        if ((params->flags & EXEC_APPLY_PERMISSIONS) &&
+            mac_selinux_use() &&
+            params->selinux_context_net &&
+            socket_fd >= 0 &&
+            !command->privileged) {
+
                 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
                 if (r < 0) {
                         *exit_status = EXIT_SELINUX_CONTEXT;
@@ -2191,7 +2600,7 @@ static int exec_child(
         }
 #endif
 
-        if (params->apply_permissions && context->private_users) {
+        if ((params->flags & EXEC_APPLY_PERMISSIONS) && context->private_users) {
                 r = setup_private_users(uid, gid);
                 if (r < 0) {
                         *exit_status = EXIT_USER;
@@ -2215,13 +2624,8 @@ static int exec_child(
                 return r;
         }
 
-        if (params->apply_permissions && !command->privileged) {
+        if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
 
-                bool use_address_families = context->address_families_whitelist ||
-                        !set_isempty(context->address_families);
-                bool use_syscall_filter = context->syscall_whitelist ||
-                        !set_isempty(context->syscall_filter) ||
-                        !set_isempty(context->syscall_archs);
                 int secure_bits = context->secure_bits;
 
                 for (i = 0; i < _RLIMIT_MAX; i++) {
@@ -2298,16 +2702,15 @@ static int exec_child(
                                 return -errno;
                         }
 
-                if (context->no_new_privileges ||
-                    (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter)))
+                if (context_has_no_new_privileges(context))
                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
                                 return -errno;
                         }
 
 #ifdef HAVE_SECCOMP
-                if (use_address_families) {
-                        r = apply_address_families(context);
+                if (context_has_address_families(context)) {
+                        r = apply_address_families(unit, context);
                         if (r < 0) {
                                 *exit_status = EXIT_ADDRESS_FAMILIES;
                                 return r;
@@ -2315,7 +2718,7 @@ static int exec_child(
                 }
 
                 if (context->memory_deny_write_execute) {
-                        r = apply_memory_deny_write_execute(context);
+                        r = apply_memory_deny_write_execute(unit, context);
                         if (r < 0) {
                                 *exit_status = EXIT_SECCOMP;
                                 return r;
@@ -2323,15 +2726,39 @@ static int exec_child(
                 }
 
                 if (context->restrict_realtime) {
-                        r = apply_restrict_realtime(context);
+                        r = apply_restrict_realtime(unit, context);
+                        if (r < 0) {
+                                *exit_status = EXIT_SECCOMP;
+                                return r;
+                        }
+                }
+
+                if (context->protect_kernel_tunables) {
+                        r = apply_protect_sysctl(unit, context);
+                        if (r < 0) {
+                                *exit_status = EXIT_SECCOMP;
+                                return r;
+                        }
+                }
+
+                if (context->protect_kernel_modules) {
+                        r = apply_protect_kernel_modules(unit, context);
                         if (r < 0) {
                                 *exit_status = EXIT_SECCOMP;
                                 return r;
                         }
                 }
 
-                if (use_syscall_filter) {
-                        r = apply_seccomp(context);
+                if (context->private_devices) {
+                        r = apply_private_devices(unit, context);
+                        if (r < 0) {
+                                *exit_status = EXIT_SECCOMP;
+                                return r;
+                        }
+                }
+
+                if (context_has_syscall_filters(context)) {
+                        r = apply_seccomp(unit, context);
                         if (r < 0) {
                                 *exit_status = EXIT_SECCOMP;
                                 return r;
@@ -2459,6 +2886,7 @@ int exec_spawn(Unit *unit,
                                socket_fd,
                                fds, n_fds,
                                files_env,
+                               unit->manager->user_lookup_fds[1],
                                &exit_status);
                 if (r < 0) {
                         log_open();
@@ -2761,6 +3189,9 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 "%sNonBlocking: %s\n"
                 "%sPrivateTmp: %s\n"
                 "%sPrivateDevices: %s\n"
+                "%sProtectKernelTunables: %s\n"
+                "%sProtectKernelModules: %s\n"
+                "%sProtectControlGroups: %s\n"
                 "%sPrivateNetwork: %s\n"
                 "%sPrivateUsers: %s\n"
                 "%sProtectHome: %s\n"
@@ -2774,6 +3205,9 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 prefix, yes_no(c->non_blocking),
                 prefix, yes_no(c->private_tmp),
                 prefix, yes_no(c->private_devices),
+                prefix, yes_no(c->protect_kernel_tunables),
+                prefix, yes_no(c->protect_kernel_modules),
+                prefix, yes_no(c->protect_control_groups),
                 prefix, yes_no(c->private_network),
                 prefix, yes_no(c->private_users),
                 prefix, protect_home_to_string(c->protect_home),
@@ -3091,12 +3525,12 @@ void exec_status_dump(ExecStatus *s, FILE *f, const char *prefix) {
                 "%sPID: "PID_FMT"\n",
                 prefix, s->pid);
 
-        if (s->start_timestamp.realtime > 0)
+        if (dual_timestamp_is_set(&s->start_timestamp))
                 fprintf(f,
                         "%sStart Timestamp: %s\n",
                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
 
-        if (s->exit_timestamp.realtime > 0)
+        if (dual_timestamp_is_set(&s->exit_timestamp))
                 fprintf(f,
                         "%sExit Timestamp: %s\n"
                         "%sExit Code: %s\n"
@@ -3306,9 +3740,7 @@ ExecRuntime *exec_runtime_unref(ExecRuntime *r) {
         free(r->tmp_dir);
         free(r->var_tmp_dir);
         safe_close_pair(r->netns_storage_socket);
-        free(r);
-
-        return NULL;
+        return mfree(r);
 }
 
 int exec_runtime_serialize(Unit *u, ExecRuntime *rt, FILE *f, FDSet *fds) {