core: add RootImage= setting for using a specific image file as root directory for...

[thirdparty/systemd.git] / src / core / execute.c
diff --git a/src/core/execute.c b/src/core/execute.c

index 1b7b4a928d8167577111257dfc4441f2c7e67460..f57eb26388718280aa87155b8aa0c3d34b933932 100644 (file)
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -29,8 +29,10 @@
  #include <sys/mman.h>
  #include <sys/personality.h>
  #include <sys/prctl.h>
+#include <sys/shm.h>
  #include <sys/socket.h>
  #include <sys/stat.h>
+#include <sys/types.h>
  #include <sys/un.h>
  #include <unistd.h>
  #include <utmpx.h>
@@ -69,7 +71,7 @@
  #include "exit-status.h"
  #include "fd-util.h"
  #include "fileio.h"
-#include "formats-util.h"
+#include "format-util.h"
  #include "fs-util.h"
  #include "glob-util.h"
  #include "io-util.h"
@@ -622,7 +624,7 @@ static int chown_terminal(int fd, uid_t uid) {
          return 0;
  }
  
-static int setup_confirm_stdio(int *_saved_stdin, int *_saved_stdout) {
+static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
          _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
          int r;
  
@@ -637,12 +639,7 @@ static int setup_confirm_stdio(int *_saved_stdin, int *_saved_stdout) {
          if (saved_stdout < 0)
                  return -errno;
  
-        fd = acquire_terminal(
-                        "/dev/console",
-                        false,
-                        false,
-                        false,
-                        DEFAULT_CONFIRM_USEC);
+        fd = acquire_terminal(vc, false, false, false, DEFAULT_CONFIRM_USEC);
          if (fd < 0)
                  return fd;
  
@@ -672,21 +669,27 @@ static int setup_confirm_stdio(int *_saved_stdin, int *_saved_stdout) {
          return 0;
  }
  
-_printf_(1, 2) static int write_confirm_message(const char *format, ...) {
+static void write_confirm_error_fd(int err, int fd, const Unit *u) { /* Writes to fd why the confirmation prompt for unit u failed; err is a negative errno-style code */
+        assert(err < 0);
+
+        if (err == -ETIMEDOUT) /* timeouts get their own wording */
+                dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
+        else {
+                errno = -err; /* %m in the format below expands from errno, so load it from err first */
+                dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
+        }
+}
+
+static void write_confirm_error(int err, const char *vc, const Unit *u) { /* Opens terminal vc write-only and reports confirmation error err for unit u on it */
          _cleanup_close_ int fd = -1;
-        va_list ap;
  
-        assert(format);
+        assert(vc);
  
-        fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
+        fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
          if (fd < 0)
-                return fd;
-
-        va_start(ap, format);
-        vdprintf(fd, format, ap);
-        va_end(ap);
+                return; /* terminal could not be opened: the message is dropped without further reporting */
  
-        return 0;
+        write_confirm_error_fd(err, fd, u); /* the actual formatting lives in the fd-based variant */
  }
  
  static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
@@ -711,93 +714,247 @@ static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
          return r;
  }
  
-static int ask_for_confirmation(char *response, char **argv) {
+enum { /* Result codes produced by ask_for_confirmation() */
+        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command and pretend it failed */
+        CONFIRM_PRETEND_SUCCESS =  0, /* 's': don't execute the command and pretend it succeeded */
+        CONFIRM_EXECUTE = 1, /* 'y': execute the command; also used when the prompt itself fails */
+};
+
+static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) { /* Asks on terminal vc whether to run cmdline for unit u; returns one of the CONFIRM_* values */
          int saved_stdout = -1, saved_stdin = -1, r;
-        _cleanup_free_ char *line = NULL;
+        _cleanup_free_ char *e = NULL;
+        char c;
  
-        r = setup_confirm_stdio(&saved_stdin, &saved_stdout);
-        if (r < 0)
-                return r;
+        /* For any internal errors, assume a positive response. */
+        r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
+        if (r < 0) {
+                write_confirm_error(r, vc, u);
+                return CONFIRM_EXECUTE;
+        }
  
-        line = exec_command_line(argv);
-        if (!line)
-                return -ENOMEM;
+        /* confirm_spawn might have been disabled while we were sleeping. */
+        if (manager_is_confirm_spawn_disabled(u->manager)) {
+                r = CONFIRM_EXECUTE;
+                goto restore_stdio;
+        }
  
-        r = ask_char(response, "yns", "Execute %s? [Yes, No, Skip] ", line);
+        e = ellipsize(cmdline, 60, 100); /* shortened copy of the command line used only in the prompt */
+        if (!e) {
+                log_oom();
+                r = CONFIRM_EXECUTE;
+                goto restore_stdio;
+        }
  
-        restore_confirm_stdio(&saved_stdin, &saved_stdout);
+        for (;;) { /* informational answers 'continue' to re-prompt; decisive answers 'break' out */
+                r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
+                if (r < 0) {
+                        write_confirm_error_fd(r, STDOUT_FILENO, u);
+                        r = CONFIRM_EXECUTE;
+                        goto restore_stdio;
+                }
  
+                switch (c) {
+                case 'c':
+                        printf("Resuming normal execution.\n");
+                        manager_disable_confirm_spawn();
+                        r = CONFIRM_EXECUTE;
+                        break;
+                case 'D':
+                        unit_dump(u, stdout, "  ");
+                        continue; /* ask again */
+                case 'f':
+                        printf("Failing execution.\n");
+                        r = CONFIRM_PRETEND_FAILURE;
+                        break;
+                case 'h':
+                        printf("  c - continue, proceed without asking anymore\n"
+                               "  D - dump, show the state of the unit\n"
+                               "  f - fail, don't execute the command and pretend it failed\n"
+                               "  h - help\n"
+                               "  i - info, show a short summary of the unit\n"
+                               "  j - jobs, show jobs that are in progress\n"
+                               "  s - skip, don't execute the command and pretend it succeeded\n"
+                               "  y - yes, execute the command\n");
+                        continue; /* ask again */
+                case 'i':
+                        printf("  Description: %s\n"
+                               "  Unit:        %s\n"
+                               "  Command:     %s\n",
+                               u->description, u->id, cmdline); /* argument order must follow the labels above */
+                        continue; /* ask again */
+                case 'j':
+                        manager_dump_jobs(u->manager, stdout, "  ");
+                        continue; /* ask again */
+                case 'n':
+                        /* 'n' was removed in favor of 'f'. */
+                        printf("Didn't understand 'n', did you mean 'f'?\n");
+                        continue; /* ask again */
+                case 's':
+                        printf("Skipping execution.\n");
+                        r = CONFIRM_PRETEND_SUCCESS;
+                        break;
+                case 'y':
+                        r = CONFIRM_EXECUTE;
+                        break;
+                default:
+                        assert_not_reached("Unhandled choice");
+                }
+                break;
+        }
+
+restore_stdio:
+        restore_confirm_stdio(&saved_stdin, &saved_stdout);
          return r;
  }
  
-static int enforce_groups(const ExecContext *context, const char *username, gid_t gid) {
-        bool keep_groups = false;
+static int get_fixed_user(const ExecContext *c, const char **user,
+                          uid_t *uid, gid_t *gid,
+                          const char **home, const char **shell) { /* Hands c->user (or "root") to get_user_creds_clean() to fill uid/gid/home/shell; stores the name in *user on success */
          int r;
+        const char *name;
  
-        assert(context);
+        assert(c);
+
+        /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
+         * (i.e. are "/" or "/bin/nologin"). */
+
+        name = c->user ?: "root"; /* default to "root" when c->user is unset */
+        r = get_user_creds_clean(&name, uid, gid, home, shell);
+        if (r < 0)
+                return r; /* *user is left untouched on failure */
+
+        *user = name; /* name was passed by address, so this is whatever the lookup left in it */
+        return 0;
+}
+
+static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) { /* Looks up c->group via get_group_creds(); a no-op returning 0 when no group is set */
+        int r;
+        const char *name;
  
-        /* Lookup and set GID and supplementary group list. Here too
-         * we avoid NSS lookups for gid=0. */
+        assert(c);
  
-        if (context->group || username) {
+        if (!c->group)
+                return 0; /* *group and *gid are left untouched in this case */
+
+        name = c->group;
+        r = get_group_creds(&name, gid);
+        if (r < 0)
+                return r;
+
+        *group = name; /* name was passed by address, so this is whatever the lookup left in it */
+        return 0;
+}
+
+static int get_supplementary_groups(const ExecContext *c, const char *user,
+                                    const char *group, gid_t gid,
+                                    gid_t **supplementary_gids, int *ngids) { /* Builds the supplementary GID list into *supplementary_gids / *ngids. NOTE(review): 'group' is not referenced in this body */
+        char **i;
+        int r, k = 0;
+        int ngroups_max;
+        bool keep_groups = false;
+        gid_t *groups = NULL;
+        _cleanup_free_ gid_t *l_gids = NULL; /* scratch array; only a trimmed copy is handed to the caller */
+
+        assert(c);
+
+        /*
+         * If user is given, then lookup GID and supplementary groups list.
+         * We avoid NSS lookups for gid=0. Also we have to initialize groups
+         * here and as early as possible so we keep the list of supplementary
+         * groups of the caller.
+         */
+        if (user && gid_is_valid(gid) && gid != 0) {
                  /* First step, initialize groups from /etc/groups */
-                if (username && gid != 0) {
-                        if (initgroups(username, gid) < 0)
-                                return -errno;
+                if (initgroups(user, gid) < 0)
+                        return -errno;
  
-                        keep_groups = true;
-                }
+                keep_groups = true; /* remember to seed the list below with the user's own groups */
+        }
  
-                /* Second step, set our gids */
-                if (setresgid(gid, gid, gid) < 0)
+        if (!c->supplementary_groups)
+                return 0; /* nothing configured: *supplementary_gids and *ngids are not written */
+
+        /*
+         * If SupplementaryGroups= was passed then NGROUPS_MAX has to
+         * be positive, otherwise fail.
+         */
+        errno = 0; /* cleared so the errno check after sysconf() below is meaningful */
+        ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
+        if (ngroups_max <= 0) {
+                if (errno > 0)
                          return -errno;
+                else
+                        return -EOPNOTSUPP; /* For all other values */
          }
  
-        if (context->supplementary_groups) {
-                int ngroups_max, k;
-                gid_t *gids;
-                char **i;
+        l_gids = new(gid_t, ngroups_max);
+        if (!l_gids)
+                return -ENOMEM;
  
-                /* Final step, initialize any manually set supplementary groups */
-                assert_se((ngroups_max = (int) sysconf(_SC_NGROUPS_MAX)) > 0);
+        if (keep_groups) {
+                /*
+                 * Lookup the list of groups that the user belongs to, we
+                 * avoid NSS lookups here too for gid=0.
+                 */
+                k = ngroups_max; /* passed by address to getgrouplist() as the capacity of l_gids */
+                if (getgrouplist(user, gid, l_gids, &k) < 0)
+                        return -EINVAL;
+        } else
+                k = 0;
  
-                if (!(gids = new(gid_t, ngroups_max)))
-                        return -ENOMEM;
+        STRV_FOREACH(i, c->supplementary_groups) { /* append each configured group after the first k entries */
+                const char *g;
  
-                if (keep_groups) {
-                        k = getgroups(ngroups_max, gids);
-                        if (k < 0) {
-                                free(gids);
-                                return -errno;
-                        }
-                } else
-                        k = 0;
+                if (k >= ngroups_max)
+                        return -E2BIG; /* l_gids is already full */
  
-                STRV_FOREACH(i, context->supplementary_groups) {
-                        const char *g;
+                g = *i;
+                r = get_group_creds(&g, l_gids+k);
+                if (r < 0)
+                        return r;
  
-                        if (k >= ngroups_max) {
-                                free(gids);
-                                return -E2BIG;
-                        }
+                k++;
+        }
  
-                        g = *i;
-                        r = get_group_creds(&g, gids+k);
-                        if (r < 0) {
-                                free(gids);
-                                return r;
-                        }
+        /*
+         * Sets ngids to zero to drop all supplementary groups, happens
+         * when we are under root and SupplementaryGroups= is empty.
+         */
+        if (k == 0) {
+                *ngids = 0;
+                return 0;
+        }
  
-                        k++;
-                }
+        /* Otherwise get the final list of supplementary groups */
+        groups = memdup(l_gids, sizeof(gid_t) * k); /* copy only the k entries actually filled */
+        if (!groups)
+                return -ENOMEM;
  
-                r = maybe_setgroups(k, gids);
-                if (r < 0) {
-                        free(gids);
+        *supplementary_gids = groups; /* the caller receives the heap copy made above */
+        *ngids = k;
+
+        groups = NULL;
+
+        return 0;
+}
+
+static int enforce_groups(const ExecContext *context, gid_t gid,
+                          gid_t *supplementary_gids, int ngids) {
+        int r;
+
+        assert(context);
+
+        /* Handle SupplementaryGroups= even if it is empty */
+        if (context->supplementary_groups) {
+                r = maybe_setgroups(ngids, supplementary_gids);
+                if (r < 0)
                          return r;
-                }
+        }
  
-                free(gids);
+        if (gid_is_valid(gid)) {
+                /* Then set our gids */
+                if (setresgid(gid, gid, gid) < 0)
+                        return -errno;
          }
  
          return 0;
@@ -806,6 +963,9 @@ static int enforce_groups(const ExecContext *context, const char *username, gid_
  static int enforce_user(const ExecContext *context, uid_t uid) {
          assert(context);
  
+        if (!uid_is_valid(uid))
+                return 0;
+
          /* Sets (but doesn't look up) the uid and make sure we keep the
           * capabilities while doing so. */
  
@@ -1096,469 +1256,188 @@ static void rename_process_from_path(const char *path) {
          rename_process(process_name);
  }
  
-#ifdef HAVE_SECCOMP
+static bool context_has_address_families(const ExecContext *c) { /* True if the address family whitelist flag is set or the address family set is non-empty */
+        assert(c);
  
-static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
-        if (!is_seccomp_available()) {
-                log_open();
-                log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
-                log_close();
-                return true;
-        }
-        return false;
+        return c->address_families_whitelist ||
+                !set_isempty(c->address_families); /* the whitelist flag alone counts, even with an empty set */
  }
  
-static int apply_seccomp(const Unit* u, const ExecContext *c) {
-        uint32_t negative_action, action;
-        scmp_filter_ctx *seccomp;
-        Iterator i;
-        void *id;
-        int r;
-
+static bool context_has_syscall_filters(const ExecContext *c) { /* True if the syscall whitelist flag is set or the syscall filter set is non-empty */
          assert(c);
  
-        if (skip_seccomp_unavailable(u, "syscall filtering"))
-                return 0;
-
-        negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
+        return c->syscall_whitelist ||
+                !set_isempty(c->syscall_filter); /* the whitelist flag alone counts, even with an empty set */
+}
  
-        seccomp = seccomp_init(c->syscall_whitelist ? negative_action : SCMP_ACT_ALLOW);
-        if (!seccomp)
-                return -ENOMEM;
+static bool context_has_no_new_privileges(const ExecContext *c) { /* Decides whether no-new-privileges (NNP) is needed for this context */
+        assert(c);
  
-        if (c->syscall_archs) {
+        if (c->no_new_privileges)
+                return true; /* explicitly set on the context */
  
-                SET_FOREACH(id, c->syscall_archs, i) {
-                        r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
-                        if (r == -EEXIST)
-                                continue;
-                        if (r < 0)
-                                goto finish;
-                }
+        if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
+                return false;
  
-        } else {
-                r = seccomp_add_secondary_archs(seccomp);
-                if (r < 0)
-                        goto finish;
-        }
+        /* We need NNP if we have any form of seccomp and are unprivileged */
+        return context_has_address_families(c) ||
+                c->memory_deny_write_execute ||
+                c->restrict_realtime ||
+                exec_context_restrict_namespaces_set(c) ||
+                c->protect_kernel_tunables ||
+                c->protect_kernel_modules ||
+                c->private_devices ||
+                context_has_syscall_filters(c) ||
+                !set_isempty(c->syscall_archs); /* each term matches a setting one of the apply_*() helpers in this file acts on */
+}
  
-        action = c->syscall_whitelist ? SCMP_ACT_ALLOW : negative_action;
-        SET_FOREACH(id, c->syscall_filter, i) {
-                r = seccomp_rule_add(seccomp, action, PTR_TO_INT(id) - 1, 0);
-                if (r < 0)
-                        goto finish;
-        }
+#ifdef HAVE_SECCOMP
  
-        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
-        if (r < 0)
-                goto finish;
+static bool skip_seccomp_unavailable(const Unit* u, const char* msg) { /* Returns true (after a debug log naming msg) when seccomp is unavailable, false otherwise */
  
-        r = seccomp_load(seccomp);
+        if (is_seccomp_available())
+                return false; /* available: the caller should go ahead */
  
-finish:
-        seccomp_release(seccomp);
-        return r;
+        log_open(); /* the debug message is bracketed by log_open()/log_close() */
+        log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
+        log_close();
+        return true;
  }
  
-static int apply_address_families(const Unit* u, const ExecContext *c) {
-        scmp_filter_ctx *seccomp;
-        Iterator i;
-        int r;
+static int apply_syscall_filter(const Unit* u, const ExecContext *c) { /* Loads the context's syscall filter set; returns 0 when there is nothing to do or seccomp is unavailable */
+        uint32_t negative_action, default_action, action;
  
+        assert(u);
          assert(c);
  
-        if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
+        if (!context_has_syscall_filters(c)) /* neither a whitelist flag nor any filter entries */
                  return 0;
  
-        seccomp = seccomp_init(SCMP_ACT_ALLOW);
-        if (!seccomp)
-                return -ENOMEM;
-
-        r = seccomp_add_secondary_archs(seccomp);
-        if (r < 0)
-                goto finish;
-
-        if (c->address_families_whitelist) {
-                int af, first = 0, last = 0;
-                void *afp;
-
-                /* If this is a whitelist, we first block the address
-                 * families that are out of range and then everything
-                 * that is not in the set. First, we find the lowest
-                 * and highest address family in the set. */
-
-                SET_FOREACH(afp, c->address_families, i) {
-                        af = PTR_TO_INT(afp);
-
-                        if (af <= 0 || af >= af_max())
-                                continue;
-
-                        if (first == 0 || af < first)
-                                first = af;
-
-                        if (last == 0 || af > last)
-                                last = af;
-                }
-
-                assert((first == 0) == (last == 0));
-
-                if (first == 0) {
+        if (skip_seccomp_unavailable(u, "SystemCallFilter="))
+                return 0;
  
-                        /* No entries in the valid range, block everything */
-                        r = seccomp_rule_add(
-                                        seccomp,
-                                        SCMP_ACT_ERRNO(EPROTONOSUPPORT),
-                                        SCMP_SYS(socket),
-                                        0);
-                        if (r < 0)
-                                goto finish;
+        negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno); /* kill unless an errno was configured */
  
-                } else {
+        if (c->syscall_whitelist) { /* whitelist: deny by default, allow the listed set */
+                default_action = negative_action;
+                action = SCMP_ACT_ALLOW;
+        } else { /* blacklist: allow by default, deny the listed set */
+                default_action = SCMP_ACT_ALLOW;
+                action = negative_action;
+        }
  
-                        /* Block everything below the first entry */
-                        r = seccomp_rule_add(
-                                        seccomp,
-                                        SCMP_ACT_ERRNO(EPROTONOSUPPORT),
-                                        SCMP_SYS(socket),
-                                        1,
-                                        SCMP_A0(SCMP_CMP_LT, first));
-                        if (r < 0)
-                                goto finish;
-
-                        /* Block everything above the last entry */
-                        r = seccomp_rule_add(
-                                        seccomp,
-                                        SCMP_ACT_ERRNO(EPROTONOSUPPORT),
-                                        SCMP_SYS(socket),
-                                        1,
-                                        SCMP_A0(SCMP_CMP_GT, last));
-                        if (r < 0)
-                                goto finish;
-
-                        /* Block everything between the first and last
-                         * entry */
-                        for (af = 1; af < af_max(); af++) {
-
-                                if (set_contains(c->address_families, INT_TO_PTR(af)))
-                                        continue;
+        return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action); /* the helper's result is returned unchanged */
+}
  
-                                r = seccomp_rule_add(
-                                                seccomp,
-                                                SCMP_ACT_ERRNO(EPROTONOSUPPORT),
-                                                SCMP_SYS(socket),
-                                                1,
-                                                SCMP_A0(SCMP_CMP_EQ, af));
-                                if (r < 0)
-                                        goto finish;
-                        }
-                }
+static int apply_syscall_archs(const Unit *u, const ExecContext *c) { /* Passes c->syscall_archs to seccomp_restrict_archs(); returns 0 when the set is empty or seccomp is unavailable */
+        assert(u);
+        assert(c);
  
-        } else {
-                void *af;
+        if (set_isempty(c->syscall_archs))
+                return 0; /* nothing configured */
  
-                /* If this is a blacklist, then generate one rule for
-                 * each address family that are then combined in OR
-                 * checks. */
+        if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
+                return 0; /* treated as success, not as an error */
  
-                SET_FOREACH(af, c->address_families, i) {
+        return seccomp_restrict_archs(c->syscall_archs);
+}
  
-                        r = seccomp_rule_add(
-                                        seccomp,
-                                        SCMP_ACT_ERRNO(EPROTONOSUPPORT),
-                                        SCMP_SYS(socket),
-                                        1,
-                                        SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
-                        if (r < 0)
-                                goto finish;
-                }
-        }
+static int apply_address_families(const Unit* u, const ExecContext *c) { /* Passes the address family set and whitelist flag to seccomp_restrict_address_families() */
+        assert(u);
+        assert(c);
  
-        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
-        if (r < 0)
-                goto finish;
+        if (!context_has_address_families(c))
+                return 0; /* nothing configured */
  
-        r = seccomp_load(seccomp);
+        if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
+                return 0; /* treated as success, not as an error */
  
-finish:
-        seccomp_release(seccomp);
-        return r;
+        return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
  }
  
  static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
-        scmp_filter_ctx *seccomp;
-        int r;
-
+        assert(u);
          assert(c);
  
-        if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
+        if (!c->memory_deny_write_execute) /* option not enabled: nothing to do */
                  return 0;
  
-        seccomp = seccomp_init(SCMP_ACT_ALLOW);
-        if (!seccomp)
-                return -ENOMEM;
-
-        r = seccomp_add_secondary_archs(seccomp);
-        if (r < 0)
-                goto finish;
-
-        r = seccomp_rule_add(
-                        seccomp,
-                        SCMP_ACT_ERRNO(EPERM),
-                        SCMP_SYS(mmap),
-                        1,
-                        SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
-        if (r < 0)
-                goto finish;
-
-        r = seccomp_rule_add(
-                        seccomp,
-                        SCMP_ACT_ERRNO(EPERM),
-                        SCMP_SYS(mprotect),
-                        1,
-                        SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
-        if (r < 0)
-                goto finish;
-
-        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
-        if (r < 0)
-                goto finish;
-
-        r = seccomp_load(seccomp);
+        if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
+                return 0; /* treated as success, not as an error */
  
-finish:
-        seccomp_release(seccomp);
-        return r;
+        return seccomp_memory_deny_write_execute(); /* the helper's result is returned unchanged */
  }
  
  static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
-        static const int permitted_policies[] = {
-                SCHED_OTHER,
-                SCHED_BATCH,
-                SCHED_IDLE,
-        };
-
-        scmp_filter_ctx *seccomp;
-        unsigned i;
-        int r, p, max_policy = 0;
-
+        assert(u);
          assert(c);
  
-        if (skip_seccomp_unavailable(u, "RestrictRealtime="))
+        if (!c->restrict_realtime) /* option not enabled: nothing to do */
                  return 0;
  
-        seccomp = seccomp_init(SCMP_ACT_ALLOW);
-        if (!seccomp)
-                return -ENOMEM;
-
-        r = seccomp_add_secondary_archs(seccomp);
-        if (r < 0)
-                goto finish;
-
-        /* Determine the highest policy constant we want to allow */
-        for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
-                if (permitted_policies[i] > max_policy)
-                        max_policy = permitted_policies[i];
-
-        /* Go through all policies with lower values than that, and block them -- unless they appear in the
-         * whitelist. */
-        for (p = 0; p < max_policy; p++) {
-                bool good = false;
-
-                /* Check if this is in the whitelist. */
-                for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
-                        if (permitted_policies[i] == p) {
-                                good = true;
-                                break;
-                        }
-
-                if (good)
-                        continue;
-
-                /* Deny this policy */
-                r = seccomp_rule_add(
-                                seccomp,
-                                SCMP_ACT_ERRNO(EPERM),
-                                SCMP_SYS(sched_setscheduler),
-                                1,
-                                SCMP_A1(SCMP_CMP_EQ, p));
-                if (r < 0)
-                        goto finish;
-        }
-
-        /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are unsigned here,
-         * hence no need no check for < 0 values. */
-        r = seccomp_rule_add(
-                        seccomp,
-                        SCMP_ACT_ERRNO(EPERM),
-                        SCMP_SYS(sched_setscheduler),
-                        1,
-                        SCMP_A1(SCMP_CMP_GT, max_policy));
-        if (r < 0)
-                goto finish;
-
-        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
-        if (r < 0)
-                goto finish;
-
-        r = seccomp_load(seccomp);
+        if (skip_seccomp_unavailable(u, "RestrictRealtime="))
+                return 0; /* treated as success, not as an error */
  
-finish:
-        seccomp_release(seccomp);
-        return r;
+        return seccomp_restrict_realtime(); /* the helper's result is returned unchanged */
  }
  
-static int apply_protect_sysctl(Unit *u, const ExecContext *c) {
-        scmp_filter_ctx *seccomp;
-        int r;
-
+static int apply_protect_sysctl(const Unit *u, const ExecContext *c) { /* Calls seccomp_protect_sysctl() when c->protect_kernel_tunables is set; otherwise returns 0 */
+        assert(u);
          assert(c);
  
          /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
           * let's protect even those systems where this is left on in the kernel. */
  
-        if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
+        if (!c->protect_kernel_tunables) /* option not enabled: nothing to do */
                  return 0;
  
-        seccomp = seccomp_init(SCMP_ACT_ALLOW);
-        if (!seccomp)
-                return -ENOMEM;
-
-        r = seccomp_add_secondary_archs(seccomp);
-        if (r < 0)
-                goto finish;
-
-        r = seccomp_rule_add(
-                        seccomp,
-                        SCMP_ACT_ERRNO(EPERM),
-                        SCMP_SYS(_sysctl),
-                        0);
-        if (r < 0)
-                goto finish;
-
-        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
-        if (r < 0)
-                goto finish;
-
-        r = seccomp_load(seccomp);
+        if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
+                return 0; /* treated as success, not as an error */
  
-finish:
-        seccomp_release(seccomp);
-        return r;
+        return seccomp_protect_sysctl(); /* the helper's result is returned unchanged */
  }
  
-static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) {
-        static const int module_syscalls[] = {
-                SCMP_SYS(delete_module),
-                SCMP_SYS(finit_module),
-                SCMP_SYS(init_module),
-        };
-
-        scmp_filter_ctx *seccomp;
-        unsigned i;
-        int r;
-
+static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) { /* Loads the SYSCALL_FILTER_SET_MODULE filter set when c->protect_kernel_modules is set; otherwise returns 0 */
+        assert(u);
          assert(c);
  
-        /* Turn of module syscalls on ProtectKernelModules=yes */
+        /* Turn off module syscalls on ProtectKernelModules=yes */
  
-        if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
+        if (!c->protect_kernel_modules) /* option not enabled: nothing to do */
                  return 0;
  
-        seccomp = seccomp_init(SCMP_ACT_ALLOW);
-        if (!seccomp)
-                return -ENOMEM;
-
-        r = seccomp_add_secondary_archs(seccomp);
-        if (r < 0)
-                goto finish;
-
-        for (i = 0; i < ELEMENTSOF(module_syscalls); i++) {
-                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM),
-                                     module_syscalls[i], 0);
-                if (r < 0)
-                        goto finish;
-        }
-
-        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
-        if (r < 0)
-                goto finish;
-
-        r = seccomp_load(seccomp);
+        if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
+                return 0; /* treated as success, not as an error */
  
-finish:
-        seccomp_release(seccomp);
-        return r;
+        return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM)); /* presumably: allow by default, EPERM for the set - confirm against the helper */
  }
  
-static int apply_private_devices(Unit *u, const ExecContext *c) {
-        const SystemCallFilterSet *set;
-        scmp_filter_ctx *seccomp;
-        const char *sys;
-        bool syscalls_found = false;
-        int r;
-
+static int apply_private_devices(const Unit *u, const ExecContext *c) { /* Loads the SYSCALL_FILTER_SET_RAW_IO filter set when c->private_devices is set; otherwise returns 0 */
+        assert(u);
          assert(c);
  
          /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
  
-        if (skip_seccomp_unavailable(u, "PrivateDevices="))
+        if (!c->private_devices) /* option not enabled: nothing to do */
                  return 0;
  
-        seccomp = seccomp_init(SCMP_ACT_ALLOW);
-        if (!seccomp)
-                return -ENOMEM;
-
-        r = seccomp_add_secondary_archs(seccomp);
-        if (r < 0)
-                goto finish;
-
-        for (set = syscall_filter_sets; set->set_name; set++)
-                if (streq(set->set_name, "@raw-io")) {
-                        syscalls_found = true;
-                        break;
-                }
-
-        /* We should never fail here */
-        if (!syscalls_found) {
-                r = -EOPNOTSUPP;
-                goto finish;
-        }
-
-        NULSTR_FOREACH(sys, set->value) {
-                int id;
-                bool add = true;
-
-#ifndef __NR_s390_pci_mmio_read
-                if (streq(sys, "s390_pci_mmio_read"))
-                        add = false;
-#endif
-#ifndef __NR_s390_pci_mmio_write
-                if (streq(sys, "s390_pci_mmio_write"))
-                        add = false;
-#endif
-
-                if (!add)
-                        continue;
+        if (skip_seccomp_unavailable(u, "PrivateDevices="))
+                return 0; /* treated as success, not as an error */
  
-                id = seccomp_syscall_resolve_name(sys);
+        return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM)); /* presumably: allow by default, EPERM for the set - confirm against the helper */
+}
  
-                r = seccomp_rule_add(
-                                seccomp,
-                                SCMP_ACT_ERRNO(EPERM),
-                                id, 0);
-                if (r < 0)
-                        goto finish;
-        }
+static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) { /* Passes c->restrict_namespaces to seccomp_restrict_namespaces(); u is only read, hence const like the sibling apply_*() helpers */
+        assert(u);
+        assert(c);
  
-        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
-        if (r < 0)
-                goto finish;
+        if (!exec_context_restrict_namespaces_set(c))
+                return 0; /* nothing configured */
  
-        r = seccomp_load(seccomp);
+        if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
+                return 0; /* treated as success, not as an error */
  
-finish:
-        seccomp_release(seccomp);
-        return r;
+        return seccomp_restrict_namespaces(c->restrict_namespaces);
  }
  
  #endif
@@ -1630,7 +1509,7 @@ static int build_environment(
                  if (!joined)
                          return -ENOMEM;
  
-                x = strjoin("LISTEN_FDNAMES=", joined, NULL);
+                x = strjoin("LISTEN_FDNAMES=", joined);
                  if (!x)
                          return -ENOMEM;
                  our_env[n_env++] = x;
@@ -1737,7 +1616,7 @@ static int build_pass_environment(const ExecContext *c, char ***ret) {
                  v = getenv(*i);
                  if (!v)
                          continue;
-                x = strjoin(*i, "=", v, NULL);
+                x = strjoin(*i, "=", v);
                  if (!x)
                          return -ENOMEM;
                  if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
@@ -1761,11 +1640,17 @@ static bool exec_needs_mount_namespace(
          assert(context);
          assert(params);
  
+        if (context->root_image)
+                return true;
+
          if (!strv_isempty(context->read_write_paths) ||
              !strv_isempty(context->read_only_paths) ||
              !strv_isempty(context->inaccessible_paths))
                  return true;
  
+        if (context->n_bind_mounts > 0)
+                return true;
+
          if (context->mount_flags != 0)
                  return true;
  
@@ -1780,6 +1665,9 @@ static bool exec_needs_mount_namespace(
              context->protect_control_groups)
                  return true;
  
+        if (context->mount_apivfs)
+                return true;
+
          return false;
  }
  
@@ -1800,25 +1688,31 @@ static int setup_private_users(uid_t uid, gid_t gid) {
           * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
           * continues execution normally. */
  
-        if (uid != 0 && uid_is_valid(uid))
-                asprintf(&uid_map,
-                         "0 0 1\n"                      /* Map root → root */
-                         UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
-                         uid, uid);                     /* The case where the above is the same */
-        else
-                uid_map = strdup("0 0 1\n");
-        if (!uid_map)
-                return -ENOMEM;
+        if (uid != 0 && uid_is_valid(uid)) {
+                r = asprintf(&uid_map,
+                             "0 0 1\n"                      /* Map root → root */
+                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
+                             uid, uid);
+                if (r < 0)
+                        return -ENOMEM;
+        } else {
+                uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
+                if (!uid_map)
+                        return -ENOMEM;
+        }
  
-        if (gid != 0 && gid_is_valid(gid))
-                asprintf(&gid_map,
-                         "0 0 1\n"                      /* Map root → root */
-                         GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
-                         gid, gid);
-        else
+        if (gid != 0 && gid_is_valid(gid)) {
+                r = asprintf(&gid_map,
+                             "0 0 1\n"                      /* Map root → root */
+                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
+                             gid, gid);
+                if (r < 0)
+                        return -ENOMEM;
+        } else {
                  gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
-        if (!gid_map)
-                return -ENOMEM;
+                if (!gid_map)
+                        return -ENOMEM;
+        }
  
          /* Create a communication channel so that the parent can tell the child when it finished creating the user
           * namespace. */
@@ -1951,7 +1845,7 @@ static int setup_runtime_directory(
          STRV_FOREACH(rt, context->runtime_directory) {
                  _cleanup_free_ char *p;
  
-                p = strjoin(params->runtime_prefix, "/", *rt, NULL);
+                p = strjoin(params->runtime_prefix, "/", *rt);
                  if (!p)
                          return -ENOMEM;
  
@@ -1989,54 +1883,213 @@ static int setup_smack(
          else {
                  _cleanup_free_ char *exec_label = NULL;
  
-                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
-                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
-                        return r;
+                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
+                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
+                        return r;
+
+                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
+                if (r < 0)
+                        return r;
+        }
+#endif
+#endif
+
+        return 0;
+}
+
+static int compile_read_write_paths(
+                const ExecContext *context,
+                const ExecParameters *params,
+                char ***ret) {
+
+        _cleanup_strv_free_ char **l = NULL;
+        char **rt;
+
+        /* Compile the list of writable paths: a copy of context->read_write_paths plus one
+         * "<runtime_prefix>/<name>" entry per context->runtime_directory item. Returns 0 or -ENOMEM. */
+
+        if (strv_isempty(context->read_write_paths) &&
+            strv_isempty(context->runtime_directory)) {
+                *ret = NULL; /* NOP if neither is set */
+                return 0;
+        }
+
+        l = strv_copy(context->read_write_paths);
+        if (!l)
+                return -ENOMEM;
+
+        STRV_FOREACH(rt, context->runtime_directory) {
+                char *s;
+
+                s = strjoin(params->runtime_prefix, "/", *rt);
+                if (!s)
+                        return -ENOMEM;
+
+                if (strv_consume(&l, s) < 0) /* presumably takes ownership of s even on failure, as s is not freed here; confirm against strv_consume() */
+                        return -ENOMEM;
+        }
+
+        *ret = l;
+        l = NULL; /* the list now belongs to the caller; clearing l keeps the _cleanup_strv_free_ handler from releasing it */
+
+        return 0;
+}
+
+static int apply_mount_namespace(Unit *u, const ExecContext *context,
+                                 const ExecParameters *params,
+                                 ExecRuntime *runtime) {
+        int r;
+        _cleanup_strv_free_ char **rw = NULL;
+        char *tmp = NULL, *var = NULL;
+        const char *root_dir = NULL, *root_image = NULL;
+        NameSpaceInfo ns_info = {
+                .ignore_protect_paths = false,
+                .private_dev = context->private_devices,
+                .protect_control_groups = context->protect_control_groups,
+                .protect_kernel_tunables = context->protect_kernel_tunables,
+                .protect_kernel_modules = context->protect_kernel_modules,
+                .mount_apivfs = context->mount_apivfs,
+        };
+        /* Gathers the settings from context/params/runtime and hands them to setup_namespace(); returns its result, except that -EPERM/-EACCES are logged and turned into 0. */
+        assert(context); /* NOTE(review): context is already dereferenced by the ns_info initializer above, so this assert comes too late to guard it */
+
+        /* The runtime struct only contains the parent of the private /tmp,
+         * which is non-accessible to world users. Inside of it there's a /tmp
+         * that is sticky, and that's the one we want to use here. */
+
+        if (context->private_tmp && runtime) {
+                if (runtime->tmp_dir)
+                        tmp = strjoina(runtime->tmp_dir, "/tmp");
+                if (runtime->var_tmp_dir)
+                        var = strjoina(runtime->var_tmp_dir, "/tmp");
+        }
+
+        r = compile_read_write_paths(context, params, &rw);
+        if (r < 0)
+                return r;
+
+        if (params->flags & EXEC_APPLY_CHROOT) { /* without this flag both root_dir and root_image stay NULL */
+                root_image = context->root_image;
+
+                if (!root_image) /* a root image takes precedence: the root directory is only used when no image is set */
+                        root_dir = context->root_directory;
+        }
+
+        /*
+         * If DynamicUser=no and RootDirectory= is set then let's pass a relaxed
+         * sandbox info, otherwise enforce it, don't ignore protected paths and
+         * fail if we are unable to apply the sandbox inside the mount namespace.
+         */
+        if (!context->dynamic_user && root_dir)
+                ns_info.ignore_protect_paths = true;
+
+        r = setup_namespace(root_dir, root_image,
+                            &ns_info, rw,
+                            context->read_only_paths,
+                            context->inaccessible_paths,
+                            context->bind_mounts,
+                            context->n_bind_mounts,
+                            tmp,
+                            var,
+                            context->protect_home,
+                            context->protect_system,
+                            context->mount_flags,
+                            DISSECT_IMAGE_DISCARD_ON_LOOP);
+
+        /* If we couldn't set up the namespace this is probably due to a
+         * missing capability. In this case, silently proceed. */
+        if (IN_SET(r, -EPERM, -EACCES)) {
+                log_open();
+                log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
+                log_close();
+                r = 0; /* permission errors are downgraded to success; every other error is returned as-is */
+        }
+
+        return r;
+}
+
+static int apply_working_directory(
+                const ExecContext *context,
+                const ExecParameters *params,
+                const char *home,
+                const bool needs_mount_ns) {
+        /* Picks the working directory (home, the configured one, or "/"), chroot()s if required, then chdir()s there. Returns 0 or -errno. */
+        const char *d;
+        const char *wd;
+
+        assert(context);
+
+        if (context->working_directory_home)
+                wd = home; /* NOTE(review): home is not checked for NULL; in the EXEC_APPLY_CHROOT branch below that NULL would reach chdir() unchanged */
+        else if (context->working_directory)
+                wd = context->working_directory;
+        else
+                wd = "/";
+
+        if (params->flags & EXEC_APPLY_CHROOT) {
+                if (!needs_mount_ns && context->root_directory) /* chroot() here only when no mount namespace is needed */
+                        if (chroot(context->root_directory) < 0)
+                                return -errno;
  
-                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
-                if (r < 0)
-                        return r;
-        }
-#endif
-#endif
+                d = wd;
+        } else
+                d = strjoina(strempty(context->root_directory), "/", strempty(wd)); /* no chroot applied: address the directory by prefixing it with the root directory */
+
+        if (chdir(d) < 0 && !context->working_directory_missing_ok) /* a failed chdir() is ignored when working_directory_missing_ok is set */
+                return -errno;
  
          return 0;
  }
  
-static int compile_read_write_paths(
-                const ExecContext *context,
-                const ExecParameters *params,
-                char ***ret) {
+static int setup_keyring(Unit *u, const ExecParameters *p, uid_t uid, gid_t gid) {
+        key_serial_t keyring;
  
-        _cleanup_strv_free_ char **l = NULL;
-        char **rt;
+        assert(u);
+        assert(p);
  
-        /* Compile the list of writable paths. This is the combination of the explicitly configured paths, plus all
-         * runtime directories. */
+        /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
+         * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
+         * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
+         * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
+         * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
+         * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
  
-        if (strv_isempty(context->read_write_paths) &&
-            strv_isempty(context->runtime_directory)) {
-                *ret = NULL; /* NOP if neither is set */
+        if (!(p->flags & EXEC_NEW_KEYRING)) /* a new keyring is only set up when the caller requested it via this flag */
                  return 0;
-        }
  
-        l = strv_copy(context->read_write_paths);
-        if (!l)
-                return -ENOMEM;
+        keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
+        if (keyring == -1) {
+                if (errno == ENOSYS)
+                        log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
+                else if (IN_SET(errno, EACCES, EPERM))
+                        log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
+                else if (errno == EDQUOT)
+                        log_debug_errno(errno, "Out of kernel keyrings to allocate, ignoring.");
+                else
+                        return log_error_errno(errno, "Setting up kernel keyring failed: %m");
  
-        STRV_FOREACH(rt, context->runtime_directory) {
-                char *s;
+                return 0; /* ENOSYS, EACCES, EPERM and EDQUOT are tolerated: continue without a new session keyring */
+        }
  
-                s = strjoin(params->runtime_prefix, "/", *rt, NULL);
-                if (!s)
-                        return -ENOMEM;
+        /* Populate the keyring with the invocation ID by default. */
+        if (!sd_id128_is_null(u->invocation_id)) {
+                key_serial_t key;
  
-                if (strv_consume(&l, s) < 0)
-                        return -ENOMEM;
+                key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
+                if (key == -1)
+                        log_debug_errno(errno, "Failed to add invocation ID to keyring, ignoring: %m");
+                else { /* the key was added: limit it to the view/read/search bits listed below; failing to do so is fatal */
+                        if (keyctl(KEYCTL_SETPERM, key,
+                                   KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
+                                   KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
+                                return log_error_errno(errno, "Failed to restrict invocation ID permission: %m");
+                }
          }
  
-        *ret = l;
-        l = NULL;
+        /* And now, make the keyring owned by the service's user */
+        if (uid_is_valid(uid) || gid_is_valid(gid)) /* skipped entirely when neither a valid UID nor a valid GID was passed in */
+                if (keyctl(KEYCTL_CHOWN, keyring, uid, gid, 0) < 0)
+                        return log_error_errno(errno, "Failed to change ownership of session keyring: %m");
  
          return 0;
  }
@@ -2097,39 +2150,6 @@ static int close_remaining_fds(
          return close_all_fds(dont_close, n_dont_close);
  }
  
-static bool context_has_address_families(const ExecContext *c) {
-        assert(c);
-
-        return c->address_families_whitelist ||
-                !set_isempty(c->address_families);
-}
-
-static bool context_has_syscall_filters(const ExecContext *c) {
-        assert(c);
-
-        return c->syscall_whitelist ||
-                !set_isempty(c->syscall_filter) ||
-                !set_isempty(c->syscall_archs);
-}
-
-static bool context_has_no_new_privileges(const ExecContext *c) {
-        assert(c);
-
-        if (c->no_new_privileges)
-                return true;
-
-        if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
-                return false;
-
-        return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */
-                c->memory_deny_write_execute ||
-                c->restrict_realtime ||
-                c->protect_kernel_tunables ||
-                c->protect_kernel_modules ||
-                c->private_devices ||
-                context_has_syscall_filters(c);
-}
-
  static int send_user_lookup(
                  Unit *unit,
                  int user_lookup_fd,
@@ -2171,23 +2191,29 @@ static int exec_child(
                  int *fds, unsigned n_fds,
                  char **files_env,
                  int user_lookup_fd,
-                int *exit_status) {
+                int *exit_status,
+                char **error_message) {
  
          _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
          _cleanup_free_ char *mac_selinux_context_net = NULL;
-        const char *username = NULL, *home = NULL, *shell = NULL, *wd;
+        _cleanup_free_ gid_t *supplementary_gids = NULL;
+        const char *username = NULL, *groupname = NULL;
+        const char *home = NULL, *shell = NULL;
          dev_t journal_stream_dev = 0;
          ino_t journal_stream_ino = 0;
          bool needs_mount_namespace;
          uid_t uid = UID_INVALID;
          gid_t gid = GID_INVALID;
-        int i, r;
+        int i, r, ngids = 0;
  
          assert(unit);
          assert(command);
          assert(context);
          assert(params);
          assert(exit_status);
+        assert(error_message);
+        /* We don't always set error_message, hence it must be initialized */
+        assert(*error_message == NULL);
  
          rename_process_from_path(command->path);
  
@@ -2205,6 +2231,8 @@ static int exec_child(
          r = reset_signal_mask();
          if (r < 0) {
                  *exit_status = EXIT_SIGNAL_MASK;
+                *error_message = strdup("Failed to reset signal mask");
+                /* If strdup fails, here and below, we will just print the generic error message. */
                  return r;
          }
  
@@ -2220,6 +2248,7 @@ static int exec_child(
          r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
          if (r < 0) {
                  *exit_status = EXIT_FDS;
+                *error_message = strdup("Failed to close remaining fds");
                  return r;
          }
  
@@ -2231,22 +2260,25 @@ static int exec_child(
  
          exec_context_tty_reset(context, params);
  
-        if (params->flags & EXEC_CONFIRM_SPAWN) {
-                char response;
+        if (unit_shall_confirm_spawn(unit)) {
+                const char *vc = params->confirm_spawn;
+                _cleanup_free_ char *cmdline = NULL;
+
+                cmdline = exec_command_line(argv);
+                if (!cmdline) {
+                        *exit_status = EXIT_CONFIRM;
+                        return -ENOMEM;
+                }
  
-                r = ask_for_confirmation(&response, argv);
-                if (r == -ETIMEDOUT)
-                        write_confirm_message("Confirmation question timed out, assuming positive response.\n");
-                else if (r < 0)
-                        write_confirm_message("Couldn't ask confirmation question, assuming positive response: %s\n", strerror(-r));
-                else if (response == 's') {
-                        write_confirm_message("Skipping execution.\n");
+                r = ask_for_confirmation(vc, unit, cmdline);
+                if (r != CONFIRM_EXECUTE) {
+                        if (r == CONFIRM_PRETEND_SUCCESS) {
+                                *exit_status = EXIT_SUCCESS;
+                                return 0;
+                        }
                          *exit_status = EXIT_CONFIRM;
+                        *error_message = strdup("Execution cancelled");
                          return -ECANCELED;
-                } else if (response == 'n') {
-                        write_confirm_message("Failing execution.\n");
-                        *exit_status = 0;
-                        return 0;
                  }
          }
  
@@ -2255,17 +2287,27 @@ static int exec_child(
                  /* Make sure we bypass our own NSS module for any NSS checks */
                  if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
                          *exit_status = EXIT_USER;
+                        *error_message = strdup("Failed to update environment");
                          return -errno;
                  }
  
                  r = dynamic_creds_realize(dcreds, &uid, &gid);
                  if (r < 0) {
                          *exit_status = EXIT_USER;
+                        *error_message = strdup("Failed to update dynamic user credentials");
                          return r;
                  }
  
-                if (!uid_is_valid(uid) || !gid_is_valid(gid)) {
+                if (!uid_is_valid(uid)) {
+                        *exit_status = EXIT_USER;
+                        (void) asprintf(error_message, "UID validation failed for \""UID_FMT"\"", uid);
+                        /* If asprintf fails, here and below, we will just print the generic error message. */
+                        return -ESRCH;
+                }
+
+                if (!gid_is_valid(gid)) {
                          *exit_status = EXIT_USER;
+                        (void) asprintf(error_message, "GID validation failed for \""GID_FMT"\"", gid);
                          return -ESRCH;
                  }
  
@@ -2273,32 +2315,34 @@ static int exec_child(
                          username = dcreds->user->name;
  
          } else {
-                if (context->user) {
-                        username = context->user;
-                        r = get_user_creds_clean(&username, &uid, &gid, &home, &shell);
-                        if (r < 0) {
-                                *exit_status = EXIT_USER;
-                                return r;
-                        }
-
-                        /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
-                         * (i.e. are "/" or "/bin/nologin"). */
+                r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
+                if (r < 0) {
+                        *exit_status = EXIT_USER;
+                        *error_message = strdup("Failed to determine user credentials");
+                        return r;
                  }
  
-                if (context->group) {
-                        const char *g = context->group;
-
-                        r = get_group_creds(&g, &gid);
-                        if (r < 0) {
-                                *exit_status = EXIT_GROUP;
-                                return r;
-                        }
+                r = get_fixed_group(context, &groupname, &gid);
+                if (r < 0) {
+                        *exit_status = EXIT_GROUP;
+                        *error_message = strdup("Failed to determine group credentials");
+                        return r;
                  }
          }
  
+        /* Initialize user supplementary groups and get SupplementaryGroups= ones */
+        r = get_supplementary_groups(context, username, groupname, gid,
+                                     &supplementary_gids, &ngids);
+        if (r < 0) {
+                *exit_status = EXIT_GROUP;
+                *error_message = strdup("Failed to determine supplementary groups");
+                return r;
+        }
+
          r = send_user_lookup(unit, user_lookup_fd, uid, gid);
          if (r < 0) {
                  *exit_status = EXIT_USER;
+                *error_message = strdup("Failed to send user credentials to PID1");
                  return r;
          }
  
@@ -2312,18 +2356,21 @@ static int exec_child(
          r = setup_input(context, params, socket_fd, named_iofds);
          if (r < 0) {
                  *exit_status = EXIT_STDIN;
+                *error_message = strdup("Failed to set up stdin");
                  return r;
          }
  
          r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
          if (r < 0) {
                  *exit_status = EXIT_STDOUT;
+                *error_message = strdup("Failed to set up stdout");
                  return r;
          }
  
          r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
          if (r < 0) {
                  *exit_status = EXIT_STDERR;
+                *error_message = strdup("Failed to set up stderr");
                  return r;
          }
  
@@ -2331,6 +2378,7 @@ static int exec_child(
                  r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
                  if (r < 0) {
                          *exit_status = EXIT_CGROUP;
+                        (void) asprintf(error_message, "Failed to attach to cgroup %s", params->cgroup_path);
                          return r;
                  }
          }
@@ -2351,6 +2399,7 @@ static int exec_child(
                          log_close();
                  } else if (r < 0) {
                          *exit_status = EXIT_OOM_ADJUST;
+                        *error_message = strdup("Failed to write /proc/self/oom_score_adj");
                          return -errno;
                  }
          }
@@ -2402,11 +2451,12 @@ static int exec_child(
                  }
  
          if (context->utmp_id)
-                utmp_put_init_process(context->utmp_id, getpid(), getsid(0), context->tty_path,
+                utmp_put_init_process(context->utmp_id, getpid(), getsid(0),
+                                      context->tty_path,
                                        context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
                                        context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
                                        USER_PROCESS,
-                                      username ? "root" : context->user);
+                                      username);
  
          if (context->user) {
                  r = chown_terminal(STDIN_FILENO, uid);
@@ -2479,13 +2529,13 @@ static int exec_child(
  
          (void) umask(context->umask);
  
-        if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
-                r = setup_smack(context, command);
-                if (r < 0) {
-                        *exit_status = EXIT_SMACK_PROCESS_LABEL;
-                        return r;
-                }
+        r = setup_keyring(unit, params, uid, gid);
+        if (r < 0) {
+                *exit_status = EXIT_KEYRING;
+                return r;
+        }
  
+        if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
                  if (context->pam_name && username) {
                          r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
                          if (r < 0) {
@@ -2505,97 +2555,29 @@ static int exec_child(
  
          needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
          if (needs_mount_namespace) {
-                _cleanup_free_ char **rw = NULL;
-                char *tmp = NULL, *var = NULL;
-                NameSpaceInfo ns_info = {
-                        .private_dev = context->private_devices,
-                        .protect_control_groups = context->protect_control_groups,
-                        .protect_kernel_tunables = context->protect_kernel_tunables,
-                        .protect_kernel_modules = context->protect_kernel_modules,
-                };
-
-                /* The runtime struct only contains the parent
-                 * of the private /tmp, which is
-                 * non-accessible to world users. Inside of it
-                 * there's a /tmp that is sticky, and that's
-                 * the one we want to use here. */
-
-                if (context->private_tmp && runtime) {
-                        if (runtime->tmp_dir)
-                                tmp = strjoina(runtime->tmp_dir, "/tmp");
-                        if (runtime->var_tmp_dir)
-                                var = strjoina(runtime->var_tmp_dir, "/tmp");
-                }
-
-                r = compile_read_write_paths(context, params, &rw);
+                r = apply_mount_namespace(unit, context, params, runtime);
                  if (r < 0) {
                          *exit_status = EXIT_NAMESPACE;
                          return r;
                  }
+        }
  
-                r = setup_namespace(
-                                (params->flags & EXEC_APPLY_CHROOT) ? context->root_directory : NULL,
-                                &ns_info,
-                                rw,
-                                context->read_only_paths,
-                                context->inaccessible_paths,
-                                tmp,
-                                var,
-                                context->protect_home,
-                                context->protect_system,
-                                context->mount_flags);
-
-                /* If we couldn't set up the namespace this is
-                 * probably due to a missing capability. In this case,
-                 * silently proceeed. */
-                if (r == -EPERM || r == -EACCES) {
-                        log_open();
-                        log_unit_debug_errno(unit, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
-                        log_close();
-                } else if (r < 0) {
-                        *exit_status = EXIT_NAMESPACE;
-                        return r;
-                }
+        /* Apply just after mount namespace setup */
+        r = apply_working_directory(context, params, home, needs_mount_namespace);
+        if (r < 0) {
+                *exit_status = EXIT_CHROOT;
+                return r;
          }
  
+        /* Drop groups as early as possible */
          if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
-                r = enforce_groups(context, username, gid);
+                r = enforce_groups(context, gid, supplementary_gids, ngids);
                  if (r < 0) {
                          *exit_status = EXIT_GROUP;
                          return r;
                  }
          }
  
-        if (context->working_directory_home)
-                wd = home;
-        else if (context->working_directory)
-                wd = context->working_directory;
-        else
-                wd = "/";
-
-        if (params->flags & EXEC_APPLY_CHROOT) {
-                if (!needs_mount_namespace && context->root_directory)
-                        if (chroot(context->root_directory) < 0) {
-                                *exit_status = EXIT_CHROOT;
-                                return -errno;
-                        }
-
-                if (chdir(wd) < 0 &&
-                    !context->working_directory_missing_ok) {
-                        *exit_status = EXIT_CHDIR;
-                        return -errno;
-                }
-        } else {
-                const char *d;
-
-                d = strjoina(strempty(context->root_directory), "/", strempty(wd));
-                if (chdir(d) < 0 &&
-                    !context->working_directory_missing_ok) {
-                        *exit_status = EXIT_CHDIR;
-                        return -errno;
-                }
-        }
-
  #ifdef HAVE_SELINUX
          if ((params->flags & EXEC_APPLY_PERMISSIONS) &&
              mac_selinux_use() &&
@@ -2663,6 +2645,7 @@ static int exec_child(
                          r = capability_bounding_set_drop(context->capability_bounding_set, false);
                          if (r < 0) {
                                  *exit_status = EXIT_CAPABILITIES;
+                                *error_message = strdup("Failed to drop capabilities");
                                  return r;
                          }
                  }
@@ -2673,6 +2656,7 @@ static int exec_child(
                          r = capability_ambient_set_apply(context->capability_ambient_set, true);
                          if (r < 0) {
                                  *exit_status = EXIT_CAPABILITIES;
+                                *error_message = strdup("Failed to apply ambient capabilities (before UID change)");
                                  return r;
                          }
                  }
@@ -2681,6 +2665,7 @@ static int exec_child(
                          r = enforce_user(context, uid);
                          if (r < 0) {
                                  *exit_status = EXIT_USER;
+                                (void) asprintf(error_message, "Failed to change UID to "UID_FMT, uid);
                                  return r;
                          }
                          if (context->capability_ambient_set != 0) {
@@ -2689,6 +2674,7 @@ static int exec_child(
                                  r = capability_ambient_set_apply(context->capability_ambient_set, false);
                                  if (r < 0) {
                                          *exit_status = EXIT_CAPABILITIES;
+                                        *error_message = strdup("Failed to apply ambient capabilities (after UID change)");
                                          return r;
                                  }
  
@@ -2703,6 +2689,46 @@ static int exec_child(
                          }
                  }
  
+                /* Apply the MAC contexts late, but before seccomp syscall filtering, as the filters should really be
+                 * applied last in order to influence our own codepaths as little as possible. Moreover, applying MAC
+                 * contexts usually requires syscalls that are subject to seccomp filtering, hence they should probably
+                 * be applied before the syscalls are restricted. */
+
+#ifdef HAVE_SELINUX
+                if (mac_selinux_use()) {
+                        char *exec_context = mac_selinux_context_net ?: context->selinux_context;
+
+                        if (exec_context) {
+                                r = setexeccon(exec_context);
+                                if (r < 0) {
+                                        *exit_status = EXIT_SELINUX_CONTEXT;
+                                        (void) asprintf(error_message, "Failed to set SELinux context to %s", exec_context);
+                                        return r;
+                                }
+                        }
+                }
+#endif
+
+                r = setup_smack(context, command);
+                if (r < 0) {
+                        *exit_status = EXIT_SMACK_PROCESS_LABEL;
+                        *error_message = strdup("Failed to set SMACK process label");
+                        return r;
+                }
+
+#ifdef HAVE_APPARMOR
+                if (context->apparmor_profile && mac_apparmor_use()) {
+                        r = aa_change_onexec(context->apparmor_profile);
+                        if (r < 0 && !context->apparmor_profile_ignore) {
+                                *exit_status = EXIT_APPARMOR_PROFILE;
+                                (void) asprintf(error_message,
+                                                "Failed to prepare AppArmor profile change to %s",
+                                                context->apparmor_profile);
+                                return -errno;
+                        }
+                }
+#endif
+
                  /* PR_GET_SECUREBITS is not privileged, while
                   * PR_SET_SECUREBITS is. So to suppress
                   * potential EPERMs we'll try not to call
@@ -2710,94 +2736,81 @@ static int exec_child(
                  if (prctl(PR_GET_SECUREBITS) != secure_bits)
                          if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
                                  *exit_status = EXIT_SECUREBITS;
+                                *error_message = strdup("Failed to set secure bits");
                                  return -errno;
                          }
  
                  if (context_has_no_new_privileges(context))
                          if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
                                  *exit_status = EXIT_NO_NEW_PRIVILEGES;
+                                *error_message = strdup("Failed to disable new privileges");
                                  return -errno;
                          }
  
  #ifdef HAVE_SECCOMP
-                if (context_has_address_families(context)) {
-                        r = apply_address_families(unit, context);
-                        if (r < 0) {
-                                *exit_status = EXIT_ADDRESS_FAMILIES;
-                                return r;
-                        }
+                r = apply_address_families(unit, context);
+                if (r < 0) {
+                        *exit_status = EXIT_ADDRESS_FAMILIES;
+                        *error_message = strdup("Failed to restrict address families");
+                        return r;
                  }
  
-                if (context->memory_deny_write_execute) {
-                        r = apply_memory_deny_write_execute(unit, context);
-                        if (r < 0) {
-                                *exit_status = EXIT_SECCOMP;
-                                return r;
-                        }
+                r = apply_memory_deny_write_execute(unit, context);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        *error_message = strdup("Failed to disable writing to executable memory");
+                        return r;
                  }
  
-                if (context->restrict_realtime) {
-                        r = apply_restrict_realtime(unit, context);
-                        if (r < 0) {
-                                *exit_status = EXIT_SECCOMP;
-                                return r;
-                        }
+                r = apply_restrict_realtime(unit, context);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        *error_message = strdup("Failed to apply realtime restrictions");
+                        return r;
                  }
  
-                if (context->protect_kernel_tunables) {
-                        r = apply_protect_sysctl(unit, context);
-                        if (r < 0) {
-                                *exit_status = EXIT_SECCOMP;
-                                return r;
-                        }
+                r = apply_restrict_namespaces(unit, context);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        *error_message = strdup("Failed to apply namespace restrictions");
+                        return r;
                  }
  
-                if (context->protect_kernel_modules) {
-                        r = apply_protect_kernel_modules(unit, context);
-                        if (r < 0) {
-                                *exit_status = EXIT_SECCOMP;
-                                return r;
-                        }
+                r = apply_protect_sysctl(unit, context);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        *error_message = strdup("Failed to apply sysctl restrictions");
+                        return r;
                  }
  
-                if (context->private_devices) {
-                        r = apply_private_devices(unit, context);
-                        if (r < 0) {
-                                *exit_status = EXIT_SECCOMP;
-                                return r;
-                        }
+                r = apply_protect_kernel_modules(unit, context);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        *error_message = strdup("Failed to apply module loading restrictions");
+                        return r;
                  }
  
-                if (context_has_syscall_filters(context)) {
-                        r = apply_seccomp(unit, context);
-                        if (r < 0) {
-                                *exit_status = EXIT_SECCOMP;
-                                return r;
-                        }
+                r = apply_private_devices(unit, context);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        *error_message = strdup("Failed to set up private devices");
+                        return r;
                  }
-#endif
-
-#ifdef HAVE_SELINUX
-                if (mac_selinux_use()) {
-                        char *exec_context = mac_selinux_context_net ?: context->selinux_context;
  
-                        if (exec_context) {
-                                r = setexeccon(exec_context);
-                                if (r < 0) {
-                                        *exit_status = EXIT_SELINUX_CONTEXT;
-                                        return r;
-                                }
-                        }
+                r = apply_syscall_archs(unit, context);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        *error_message = strdup("Failed to apply syscall architecture restrictions");
+                        return r;
                  }
-#endif
  
-#ifdef HAVE_APPARMOR
-                if (context->apparmor_profile && mac_apparmor_use()) {
-                        r = aa_change_onexec(context->apparmor_profile);
-                        if (r < 0 && !context->apparmor_profile_ignore) {
-                                *exit_status = EXIT_APPARMOR_PROFILE;
-                                return -errno;
-                        }
+                /* This really should remain the last step before the execve(), to make sure our own code is affected
+                 * by the filter as little as possible. */
+                r = apply_syscall_filter(unit, context);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        *error_message = strdup("Failed to apply syscall filters");
+                        return r;
                  }
  #endif
          }
@@ -2805,6 +2818,7 @@ static int exec_child(
          final_argv = replace_env_argv(argv, accum_env);
          if (!final_argv) {
                  *exit_status = EXIT_MEMORY;
+                *error_message = strdup("Failed to prepare process arguments");
                  return -ENOMEM;
          }
  
@@ -2891,6 +2905,7 @@ int exec_spawn(Unit *unit,
  
          if (pid == 0) {
                  int exit_status;
+                _cleanup_free_ char *error_message = NULL;
  
                  r = exec_child(unit,
                                 command,
@@ -2904,17 +2919,27 @@ int exec_spawn(Unit *unit,
                                 fds, n_fds,
                                 files_env,
                                 unit->manager->user_lookup_fds[1],
-                               &exit_status);
+                               &exit_status,
+                               &error_message);
                  if (r < 0) {
                          log_open();
-                        log_struct_errno(LOG_ERR, r,
-                                         LOG_MESSAGE_ID(SD_MESSAGE_SPAWN_FAILED),
-                                         LOG_UNIT_ID(unit),
-                                         LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
-                                                          exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
-                                                          command->path),
-                                         "EXECUTABLE=%s", command->path,
-                                         NULL);
+                        if (error_message)
+                                log_struct_errno(LOG_ERR, r,
+                                                 LOG_MESSAGE_ID(SD_MESSAGE_SPAWN_FAILED),
+                                                 LOG_UNIT_ID(unit),
+                                                 LOG_UNIT_MESSAGE(unit, "%s: %m",
+                                                                  error_message),
+                                                 "EXECUTABLE=%s", command->path,
+                                                 NULL);
+                        else
+                                log_struct_errno(LOG_ERR, r,
+                                                 LOG_MESSAGE_ID(SD_MESSAGE_SPAWN_FAILED),
+                                                 LOG_UNIT_ID(unit),
+                                                 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
+                                                                  exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
+                                                                  command->path),
+                                                 "EXECUTABLE=%s", command->path,
+                                                 NULL);
                  }
  
                  _exit(exit_status);
@@ -2949,6 +2974,7 @@ void exec_context_init(ExecContext *c) {
          c->personality = PERSONALITY_INVALID;
          c->runtime_directory_mode = 0755;
          c->capability_bounding_set = CAP_ALL;
+        c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
  }
  
  void exec_context_done(ExecContext *c) {
@@ -2968,6 +2994,7 @@ void exec_context_done(ExecContext *c) {
  
          c->working_directory = mfree(c->working_directory);
          c->root_directory = mfree(c->root_directory);
+        c->root_image = mfree(c->root_image);
          c->tty_path = mfree(c->tty_path);
          c->syslog_identifier = mfree(c->syslog_identifier);
          c->user = mfree(c->user);
@@ -2981,6 +3008,8 @@ void exec_context_done(ExecContext *c) {
          c->read_write_paths = strv_free(c->read_write_paths);
          c->inaccessible_paths = strv_free(c->inaccessible_paths);
  
+        bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
+
          if (c->cpuset)
                  CPU_FREE(c->cpuset);
  
@@ -3006,7 +3035,7 @@ int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_p
          STRV_FOREACH(i, c->runtime_directory) {
                  _cleanup_free_ char *p;
  
-                p = strjoin(runtime_prefix, "/", *i, NULL);
+                p = strjoin(runtime_prefix, "/", *i);
                  if (!p)
                          return -ENOMEM;
  
@@ -3087,7 +3116,7 @@ const char* exec_context_fdname(const ExecContext *c, int fd_index) {
  
  int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
          unsigned i, targets;
-        const char *stdio_fdname[3];
+        const char* stdio_fdname[3];
  
          assert(c);
          assert(p);
@@ -3100,18 +3129,32 @@ int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParamet
                  stdio_fdname[i] = exec_context_fdname(c, i);
  
          for (i = 0; i < p->n_fds && targets > 0; i++)
-                if (named_iofds[STDIN_FILENO] < 0 && c->std_input == EXEC_INPUT_NAMED_FD && stdio_fdname[STDIN_FILENO] && streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
+                if (named_iofds[STDIN_FILENO] < 0 &&
+                    c->std_input == EXEC_INPUT_NAMED_FD &&
+                    stdio_fdname[STDIN_FILENO] &&
+                    streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
+
                          named_iofds[STDIN_FILENO] = p->fds[i];
                          targets--;
-                } else if (named_iofds[STDOUT_FILENO] < 0 && c->std_output == EXEC_OUTPUT_NAMED_FD && stdio_fdname[STDOUT_FILENO] && streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
+
+                } else if (named_iofds[STDOUT_FILENO] < 0 &&
+                           c->std_output == EXEC_OUTPUT_NAMED_FD &&
+                           stdio_fdname[STDOUT_FILENO] &&
+                           streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
+
                          named_iofds[STDOUT_FILENO] = p->fds[i];
                          targets--;
-                } else if (named_iofds[STDERR_FILENO] < 0 && c->std_error == EXEC_OUTPUT_NAMED_FD && stdio_fdname[STDERR_FILENO] && streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
+
+                } else if (named_iofds[STDERR_FILENO] < 0 &&
+                           c->std_error == EXEC_OUTPUT_NAMED_FD &&
+                           stdio_fdname[STDERR_FILENO] &&
+                           streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
+
                          named_iofds[STDERR_FILENO] = p->fds[i];
                          targets--;
                  }
  
-        return (targets == 0 ? 0 : -ENOENT);
+        return targets == 0 ? 0 : -ENOENT;
  }
  
  int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l) {
@@ -3246,6 +3289,7 @@ static void strv_fprintf(FILE *f, char **l) {
  void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
          char **e, **d;
          unsigned i;
+        int r;
  
          assert(c);
          assert(f);
@@ -3266,6 +3310,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                  "%sPrivateUsers: %s\n"
                  "%sProtectHome: %s\n"
                  "%sProtectSystem: %s\n"
+                "%sMountAPIVFS: %s\n"
                  "%sIgnoreSIGPIPE: %s\n"
                  "%sMemoryDenyWriteExecute: %s\n"
                  "%sRestrictRealtime: %s\n",
@@ -3282,10 +3327,14 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                  prefix, yes_no(c->private_users),
                  prefix, protect_home_to_string(c->protect_home),
                  prefix, protect_system_to_string(c->protect_system),
+                prefix, yes_no(c->mount_apivfs),
                  prefix, yes_no(c->ignore_sigpipe),
                  prefix, yes_no(c->memory_deny_write_execute),
                  prefix, yes_no(c->restrict_realtime));
  
+        if (c->root_image)
+                fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
+
          STRV_FOREACH(e, c->environment)
                  fprintf(f, "%sEnvironment: %s\n", prefix, *e);
  
@@ -3463,6 +3512,15 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                  fputs("\n", f);
          }
  
+        if (c->n_bind_mounts > 0)
+                for (i = 0; i < c->n_bind_mounts; i++) {
+                        fprintf(f, "%s%s: %s:%s:%s\n", prefix,
+                                c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
+                                c->bind_mounts[i].source,
+                                c->bind_mounts[i].destination,
+                                c->bind_mounts[i].recursive ? "rbind" : "norbind");
+                }
+
          if (c->utmp_id)
                  fprintf(f,
                          "%sUtmpIdentifier: %s\n",
@@ -3526,6 +3584,15 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                  fputc('\n', f);
          }
  
+        if (exec_context_restrict_namespaces_set(c)) {
+                _cleanup_free_ char *s = NULL;
+
+                r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
+                if (r >= 0)
+                        fprintf(f, "%sRestrictNamespaces: %s\n",
+                                prefix, s);
+        }
+
          if (c->syscall_errno > 0)
                  fprintf(f,
                          "%sSystemCallErrorNumber: %s\n",
@@ -3621,7 +3688,8 @@ char *exec_command_line(char **argv) {
          STRV_FOREACH(a, argv)
                  k += strlen(*a)+3;
  
-        if (!(n = new(char, k)))
+        n = new(char, k);
+        if (!n)
                  return NULL;
  
          p = n;