]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/nspawn/nspawn.c
core: unified cgroup hierarchy support
[thirdparty/systemd.git] / src / nspawn / nspawn.c
index 90c4415e307f0d739724e712f2549cfd1f4fe6b9..a56960506cffb17dcaabcf05cda636f29f6d0552 100644 (file)
@@ -55,7 +55,8 @@
 #include "sd-daemon.h"
 #include "sd-bus.h"
 #include "sd-id128.h"
-#include "sd-rtnl.h"
+#include "sd-netlink.h"
+#include "random-util.h"
 #include "log.h"
 #include "util.h"
 #include "mkdir.h"
@@ -74,7 +75,7 @@
 #include "bus-error.h"
 #include "ptyfwd.h"
 #include "env-util.h"
-#include "rtnl-util.h"
+#include "netlink-util.h"
 #include "udev-util.h"
 #include "blkid-util.h"
 #include "gpt.h"
 #include "machine-image.h"
 #include "list.h"
 #include "in-addr-util.h"
-#include "fw-util.h"
+#include "firewall-util.h"
 #include "local-addresses.h"
+#include "formats-util.h"
+#include "process-util.h"
+#include "terminal-util.h"
+#include "hostname-util.h"
+#include "signal-util.h"
 
 #ifdef HAVE_SECCOMP
 #include "seccomp-util.h"
@@ -121,6 +127,22 @@ typedef enum Volatile {
         VOLATILE_STATE,
 } Volatile;
 
+/* Kind of user-requested mount collected from the command line. The declaration order matters:
+ * custom_mount_compare() uses it as tie-breaker for entries that share the same destination. */
+typedef enum CustomMountType {
+        /* added for --bind= and --bind-ro= */
+        CUSTOM_MOUNT_BIND,
+        /* added for --tmpfs= */
+        CUSTOM_MOUNT_TMPFS,
+        /* added for --overlay= and --overlay-ro= */
+        CUSTOM_MOUNT_OVERLAY,
+} CustomMountType;
+
+/* One entry of the arg_custom_mounts array. Which of the string fields are filled in depends on the
+ * type; all of them are owned by the entry and released by custom_mount_free_all(). */
+typedef struct CustomMount {
+        CustomMountType type;
+        /* set for --bind-ro= and --overlay-ro= */
+        bool read_only;
+        /* bind: the path to mount from; tmpfs: left unset */
+        char *source; /* for overlayfs this is the upper directory */
+        /* mount point; the command line parser only accepts absolute paths here */
+        char *destination;
+        /* bind: optional third field of the argument; tmpfs: mount options ("mode=0755" if none given) */
+        char *options;
+        /* overlayfs only: generated by custom_mounts_prepare() for writable overlays, and removed from
+         * disk again by custom_mount_free_all() */
+        char *work_dir;
+        /* overlayfs only: NULL-terminated list of the lower directories */
+        char **lower;
+} CustomMount;
+
 static char *arg_directory = NULL;
 static char *arg_template = NULL;
 static char *arg_user = NULL;
@@ -162,9 +184,8 @@ static uint64_t arg_retain =
         (1ULL << CAP_AUDIT_WRITE) |
         (1ULL << CAP_AUDIT_CONTROL) |
         (1ULL << CAP_MKNOD);
-static char **arg_bind = NULL;
-static char **arg_bind_ro = NULL;
-static char **arg_tmpfs = NULL;
+static CustomMount *arg_custom_mounts = NULL;
+static unsigned arg_n_custom_mounts = 0;
 static char **arg_setenv = NULL;
 static bool arg_quiet = false;
 static bool arg_share_system = false;
@@ -175,7 +196,7 @@ static char **arg_network_macvlan = NULL;
 static char **arg_network_ipvlan = NULL;
 static bool arg_network_veth = false;
 static const char *arg_network_bridge = NULL;
-static unsigned long arg_personality = 0xffffffffLU;
+static unsigned long arg_personality = PERSONALITY_INVALID;
 static char *arg_image = NULL;
 static Volatile arg_volatile = VOLATILE_NO;
 static ExposePort *arg_expose_ports = NULL;
@@ -183,6 +204,7 @@ static char **arg_property = NULL;
 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
 static bool arg_userns = false;
 static int arg_kill_signal = 0;
+static bool arg_unified_cgroup_hierarchy = false;
 
 static void help(void) {
         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -202,6 +224,8 @@ static void help(void) {
                "     --uuid=UUID            Set a specific machine UUID for the container\n"
                "  -S --slice=SLICE          Place the container in the specified slice\n"
                "     --property=NAME=VALUE  Set scope unit property\n"
+               "     --private-users[=UIDBASE[:NUIDS]]\n"
+               "                            Run within user namespace\n"
                "     --private-network      Disable network in container\n"
                "     --network-interface=INTERFACE\n"
                "                            Assign an existing network interface to the\n"
@@ -218,8 +242,6 @@ static void help(void) {
                "                            Add a virtual ethernet connection between host\n"
                "                            and container and add it to an existing bridge on\n"
                "                            the host\n"
-               "     --private-users[=UIDBASE[:NUIDS]]\n"
-               "                            Run within user namespace\n"
                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
                "                            Expose a container IP port on the host\n"
                "  -Z --selinux-context=SECLABEL\n"
@@ -236,10 +258,17 @@ static void help(void) {
                "                            try-guest, try-host\n"
                "  -j                        Equivalent to --link-journal=try-guest\n"
                "     --read-only            Mount the root directory read-only\n"
-               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
+               "     --bind=PATH[:PATH[:OPTIONS]]\n"
+               "                            Bind mount a file or directory from the host into\n"
                "                            the container\n"
-               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
+               "     --bind-ro=PATH[:PATH[:OPTIONS]]\n"
+               "                            Similar, but creates a read-only bind mount\n"
                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
+               "     --overlay=PATH[:PATH...]:PATH\n"
+               "                            Create an overlay mount from the host to \n"
+               "                            the container\n"
+               "     --overlay-ro=PATH[:PATH...]:PATH\n"
+               "                            Similar, but creates a read-only overlay mount\n"
                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
                "     --share-system         Share system namespaces with host\n"
                "     --register=BOOLEAN     Register container as machine\n"
@@ -249,6 +278,93 @@ static void help(void) {
                , program_invocation_short_name);
 }
 
+/* Appends one entry of type t to arg_custom_mounts and returns a pointer to it. Apart from the type
+ * all fields of the new entry are zero-initialized. Returns NULL if the array could not be enlarged,
+ * in which case the existing array and its count are left untouched.
+ *
+ * Since the array is grown with realloc() it may move: pointers returned by earlier calls must not
+ * be used after a later call. */
+static CustomMount* custom_mount_add(CustomMountType t) {
+        CustomMount *grown;
+
+        grown = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
+        if (!grown)
+                return NULL;
+
+        arg_custom_mounts = grown;
+
+        /* The new slot is the one right past the previously used ones */
+        grown[arg_n_custom_mounts] = (CustomMount) { .type = t };
+
+        return &grown[arg_n_custom_mounts++];
+}
+
+/* Releases every entry of arg_custom_mounts along with the array itself, and resets the entry count
+ * to zero. For entries that carry a work directory the directory is also removed from disk
+ * (best effort, the result of rm_rf() is ignored). */
+static void custom_mount_free_all(void) {
+        unsigned k = 0;
+
+        while (k < arg_n_custom_mounts) {
+                CustomMount *entry = arg_custom_mounts + k++;
+
+                strv_free(entry->lower);
+                free(entry->options);
+                free(entry->destination);
+                free(entry->source);
+
+                if (!entry->work_dir)
+                        continue;
+
+                (void) rm_rf(entry->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
+                free(entry->work_dir);
+        }
+
+        arg_n_custom_mounts = 0;
+        arg_custom_mounts = mfree(arg_custom_mounts);
+}
+
+/* qsort()-style comparator for CustomMount entries: orders primarily by destination path, as decided
+ * by path_compare(), and for equal destinations by mount type, in CustomMountType declaration
+ * order. Returns a negative value, zero or a positive value accordingly. */
+static int custom_mount_compare(const void *a, const void *b) {
+        const CustomMount *left = a, *right = b;
+        int by_path;
+
+        by_path = path_compare(left->destination, right->destination);
+        if (by_path != 0)
+                return by_path;
+
+        /* Same destination: yields -1, 0 or 1 depending on how the two types compare */
+        return (left->type > right->type) - (left->type < right->type);
+}
+
+/* Post-processes the custom mounts collected from the command line: sorts them, refuses mounts onto
+ * "/" when --private-users is used without an explicit UID shift, and picks a work directory name
+ * for every writable overlay mount that does not have one yet.
+ *
+ * Returns 0 on success, -EINVAL for the refused combination, or the negative error returned by
+ * tempfn_random(). Failures are logged here. */
+static int custom_mounts_prepare(void) {
+        unsigned k;
+
+        /* Sort by destination, so that mounts are applied prefix first. */
+        qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
+
+        for (k = 0; k < arg_n_custom_mounts; k++) {
+                CustomMount *entry = arg_custom_mounts + k;
+                bool needs_work_dir;
+                int r;
+
+                /* This check applies to every mount type, hence it comes before the overlay filter */
+                if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(entry->destination, "/")) {
+                        log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
+                        return -EINVAL;
+                }
+
+                /* Only writable overlay file systems that have no working directory yet need one */
+                needs_work_dir =
+                        entry->type == CUSTOM_MOUNT_OVERLAY &&
+                        !entry->work_dir &&
+                        !entry->read_only;
+                if (!needs_work_dir)
+                        continue;
+
+                r = tempfn_random(entry->source, NULL, &entry->work_dir);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to generate work directory from %s: %m", entry->source);
+        }
+
+        return 0;
+}
+
 static int set_sanitized_path(char **b, const char *path) {
         char *p;
 
@@ -270,6 +386,30 @@ static int set_sanitized_path(char **b, const char *path) {
         return 0;
 }
 
+/* Initializes arg_unified_cgroup_hierarchy. If $UNIFIED_CGROUP_HIERARCHY is set it is parsed as a
+ * boolean and decides the matter; otherwise the result of cg_unified() is used.
+ *
+ * Returns 0 on success, or the negative error from parse_boolean() or cg_unified(), which is logged
+ * here; arg_unified_cgroup_hierarchy is left unchanged in that case. */
+static int detect_unified_cgroup_hierarchy(void) {
+        const char *env;
+        int unified;
+
+        env = getenv("UNIFIED_CGROUP_HIERARCHY");
+        if (env) {
+                /* The user made an explicit choice, honour it */
+                unified = parse_boolean(env);
+                if (unified < 0)
+                        return log_error_errno(unified, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
+        } else {
+                /* No explicit choice, inherit the default from the host system */
+                unified = cg_unified();
+                if (unified < 0)
+                        return log_error_errno(unified, "Failed to determine whether the unified cgroups hierarchy is used: %m");
+        }
+
+        arg_unified_cgroup_hierarchy = unified;
+        return 0;
+}
+
 static int parse_argv(int argc, char *argv[]) {
 
         enum {
@@ -283,6 +423,8 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_BIND,
                 ARG_BIND_RO,
                 ARG_TMPFS,
+                ARG_OVERLAY,
+                ARG_OVERLAY_RO,
                 ARG_SETENV,
                 ARG_SHARE_SYSTEM,
                 ARG_REGISTER,
@@ -316,6 +458,8 @@ static int parse_argv(int argc, char *argv[]) {
                 { "bind",                  required_argument, NULL, ARG_BIND              },
                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
+                { "overlay",               required_argument, NULL, ARG_OVERLAY           },
+                { "overlay-ro",            required_argument, NULL, ARG_OVERLAY_RO        },
                 { "machine",               required_argument, NULL, 'M'                   },
                 { "slice",                 required_argument, NULL, 'S'                   },
                 { "setenv",                required_argument, NULL, ARG_SETENV            },
@@ -385,9 +529,8 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
 
                 case 'u':
-                        free(arg_user);
-                        arg_user = strdup(optarg);
-                        if (!arg_user)
+                        r = free_and_strdup(&arg_user, optarg);
+                        if (r < 0)
                                 return log_oom();
 
                         break;
@@ -443,10 +586,9 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
 
                 case 'M':
-                        if (isempty(optarg)) {
-                                free(arg_machine);
-                                arg_machine = NULL;
-                        } else {
+                        if (isempty(optarg))
+                                arg_machine = mfree(arg_machine);
+                        else {
                                 if (!machine_name_is_valid(optarg)) {
                                         log_error("Invalid machine name: %s", optarg);
                                         return -EINVAL;
@@ -540,72 +682,143 @@ static int parse_argv(int argc, char *argv[]) {
 
                 case ARG_BIND:
                 case ARG_BIND_RO: {
-                        _cleanup_free_ char *a = NULL, *b = NULL;
-                        char *e;
-                        char ***x;
-
-                        x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
-
-                        e = strchr(optarg, ':');
-                        if (e) {
-                                a = strndup(optarg, e - optarg);
-                                b = strdup(e + 1);
-                        } else {
-                                a = strdup(optarg);
-                                b = strdup(optarg);
+                        const char *current = optarg;
+                        _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
+                        CustomMount *m;
+
+                        r = extract_many_words(&current, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, &opts, NULL);
+                        switch (r) {
+                        case 1:
+                                destination = strdup(source);
+                        case 2:
+                        case 3:
+                                break;
+                        case -ENOMEM:
+                                return log_oom();
+                        default:
+                                log_error("Invalid bind mount specification: %s", optarg);
+                                return -EINVAL;
                         }
 
-                        if (!a || !b)
+                        if (!source || !destination)
                                 return log_oom();
 
-                        if (!path_is_absolute(a) || !path_is_absolute(b)) {
+                        if (!path_is_absolute(source) || !path_is_absolute(destination)) {
                                 log_error("Invalid bind mount specification: %s", optarg);
                                 return -EINVAL;
                         }
 
-                        r = strv_extend(x, a);
-                        if (r < 0)
+                        m = custom_mount_add(CUSTOM_MOUNT_BIND);
+                        if (!m)
                                 return log_oom();
 
-                        r = strv_extend(x, b);
-                        if (r < 0)
-                                return log_oom();
+                        m->source = source;
+                        m->destination = destination;
+                        m->read_only = c == ARG_BIND_RO;
+                        m->options = opts;
+
+                        source = destination = opts = NULL;
 
                         break;
                 }
 
                 case ARG_TMPFS: {
-                        _cleanup_free_ char *a = NULL, *b = NULL;
-                        char *e;
+                        const char *current = optarg;
+                        _cleanup_free_ char *path = NULL, *opts = NULL;
+                        CustomMount *m;
 
-                        e = strchr(optarg, ':');
-                        if (e) {
-                                a = strndup(optarg, e - optarg);
-                                b = strdup(e + 1);
-                        } else {
-                                a = strdup(optarg);
-                                b = strdup("mode=0755");
+                        r = extract_first_word(&current, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        else if (r < 0) {
+                                log_error("Invalid tmpfs specification: %s", optarg);
+                                return r;
                         }
+                        if (r)
+                                opts = strdup(current);
+                        else
+                                opts = strdup("mode=0755");
 
-                        if (!a || !b)
+                        if (!path || !opts)
                                 return log_oom();
 
-                        if (!path_is_absolute(a)) {
+                        if (!path_is_absolute(path)) {
                                 log_error("Invalid tmpfs specification: %s", optarg);
                                 return -EINVAL;
                         }
 
-                        r = strv_push(&arg_tmpfs, a);
-                        if (r < 0)
+                        m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
+                        if (!m)
                                 return log_oom();
 
-                        a = NULL;
+                        m->destination = path;
+                        m->options = opts;
 
-                        r = strv_push(&arg_tmpfs, b);
-                        if (r < 0)
+                        path = opts = NULL;
+
+                        break;
+                }
+
+                case ARG_OVERLAY:
+                case ARG_OVERLAY_RO: {
+                        _cleanup_free_ char *upper = NULL, *destination = NULL;
+                        _cleanup_strv_free_ char **lower = NULL;
+                        CustomMount *m;
+                        unsigned n = 0;
+                        char **i;
+
+                        r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        else if (r < 0) {
+                                log_error("Invalid overlay specification: %s", optarg);
+                                return r;
+                        }
+
+                        STRV_FOREACH(i, lower) {
+                                if (!path_is_absolute(*i)) {
+                                        log_error("Overlay path %s is not absolute.", *i);
+                                        return -EINVAL;
+                                }
+
+                                n++;
+                        }
+
+                        if (n < 2) {
+                                log_error("--overlay= needs at least two colon-separated directories specified.");
+                                return -EINVAL;
+                        }
+
+                        if (n == 2) {
+                                /* If two parameters are specified,
+                                 * the first one is the lower, the
+                                 * second one the upper directory. And
+                                 * we'll also define the destination
+                                 * mount point the same as the upper. */
+                                upper = lower[1];
+                                lower[1] = NULL;
+
+                                destination = strdup(upper);
+                                if (!destination)
+                                        return log_oom();
+
+                        } else {
+                                upper = lower[n - 2];
+                                destination = lower[n - 1];
+                                lower[n - 2] = NULL;
+                        }
+
+                        m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
+                        if (!m)
                                 return log_oom();
 
-                        b = NULL;
+                        m->destination = destination;
+                        m->source = upper;
+                        m->lower = lower;
+                        m->read_only = c == ARG_OVERLAY_RO;
+
+                        upper = destination = NULL;
+                        lower = NULL;
 
                         break;
                 }
@@ -652,7 +865,7 @@ static int parse_argv(int argc, char *argv[]) {
                 case ARG_PERSONALITY:
 
                         arg_personality = personality_from_string(optarg);
-                        if (arg_personality == 0xffffffffLU) {
+                        if (arg_personality == PERSONALITY_INVALID) {
                                 log_error("Unknown or unsupported personality '%s'.", optarg);
                                 return -EINVAL;
                         }
@@ -841,15 +1054,60 @@ static int parse_argv(int argc, char *argv[]) {
                 return -EINVAL;
         }
 
+        if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
+                return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
+
         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
 
         if (arg_boot && arg_kill_signal <= 0)
                 arg_kill_signal = SIGRTMIN+3;
 
+        r = detect_unified_cgroup_hierarchy();
+        if (r < 0)
+                return r;
+
         return 1;
 }
 
-static int mount_all(const char *dest) {
+/* Derives the mount option string to use for a tmpfs mount from the caller's option string.
+ *
+ * If user namespacing is enabled with a non-zero UID shift, "uid=" and "gid=" options carrying the
+ * shift are appended. When built with SELinux support and an API file system context is configured,
+ * a "context=" option is appended as well.
+ *
+ * options: the base option string, may be NULL.
+ * ret:     on success receives a newly allocated string the caller must free, or NULL if nothing
+ *          had to be added. Not written on failure.
+ *
+ * Returns 1 if a patched string was stored in *ret, 0 if the options can be used unmodified, and
+ * -ENOMEM if memory allocation failed. */
+static int tmpfs_patch_options(const char *options, char **ret) {
+        char *buf = NULL;
+
+        if (arg_userns && arg_uid_shift != 0) {
+                int k;
+
+                assert(arg_uid_shift != UID_INVALID);
+
+                /* Check the return value rather than the output pointer: asprintf() leaves the
+                 * pointer's contents undefined when it fails. */
+                if (options)
+                        k = asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
+                else
+                        k = asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
+                if (k < 0)
+                        return -ENOMEM;
+
+                /* Any further patching below builds on top of the string generated here */
+                options = buf;
+        }
+
+#ifdef HAVE_SELINUX
+        if (arg_selinux_apifs_context) {
+                char *t;
+
+                if (options)
+                        t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
+                else
+                        t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
+                if (!t) {
+                        free(buf);
+                        return -ENOMEM;
+                }
+
+                /* The joined string supersedes the intermediate one, if there was any */
+                free(buf);
+                buf = t;
+        }
+#endif
+
+        *ret = buf;
+        return !!buf;
+}
+
+static int mount_all(const char *dest, bool userns) {
 
         typedef struct MountPoint {
                 const char *what;
@@ -858,87 +1116,62 @@ static int mount_all(const char *dest) {
                 const char *options;
                 unsigned long flags;
                 bool fatal;
+                bool userns;
         } MountPoint;
 
         static const MountPoint mount_table[] = {
-                { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
-                { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
-                { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
-                { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
-                { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
-                { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
-                { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
-                { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
-                { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
+                { "proc",      "/proc",          "proc",   NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,                              true,  true  },
+                { "/proc/sys", "/proc/sys",      NULL,     NULL,        MS_BIND,                                                   true,  true  },   /* Bind mount first */
+                { NULL,        "/proc/sys",      NULL,     NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true,  true  },   /* Then, make it r/o */
+                { "sysfs",     "/sys",           "sysfs",  NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,                    true,  false },
+                { "tmpfs",     "/dev",           "tmpfs",  "mode=755",  MS_NOSUID|MS_STRICTATIME,                                  true,  false },
+                { "tmpfs",     "/dev/shm",       "tmpfs",  "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false },
+                { "tmpfs",     "/run",           "tmpfs",  "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false },
+                { "tmpfs",     "/tmp",           "tmpfs",  "mode=1777", MS_STRICTATIME,                                            true,  false },
 #ifdef HAVE_SELINUX
-                { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
-                { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
+                { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL,     MS_BIND,                                                   false, false },  /* Bind mount first */
+                { NULL,              "/sys/fs/selinux", NULL, NULL,     MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false },  /* Then, make it r/o */
 #endif
         };
 
         unsigned k;
-        int r = 0;
+        int r;
 
         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
                 _cleanup_free_ char *where = NULL, *options = NULL;
                 const char *o;
-                int t;
 
-                where = strjoin(dest, "/", mount_table[k].where, NULL);
+                if (userns != mount_table[k].userns)
+                        continue;
+
+                where = prefix_root(dest, mount_table[k].where);
                 if (!where)
                         return log_oom();
 
-                t = path_is_mount_point(where, true);
-                if (t < 0) {
-                        log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
-
-                        if (r == 0)
-                                r = t;
-
-                        continue;
-                }
+                r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
+                if (r < 0 && r != -ENOENT)
+                        return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
 
                 /* Skip this entry if it is not a remount. */
-                if (mount_table[k].what && t > 0)
+                if (mount_table[k].what && r > 0)
                         continue;
 
-                t = mkdir_p(where, 0755);
-                if (t < 0) {
-                        if (mount_table[k].fatal) {
-                               log_error_errno(t, "Failed to create directory %s: %m", where);
-
-                                if (r == 0)
-                                        r = t;
-                        } else
-                               log_warning_errno(t, "Failed to create directory %s: %m", where);
+                r = mkdir_p(where, 0755);
+                if (r < 0) {
+                        if (mount_table[k].fatal)
+                                return log_error_errno(r, "Failed to create directory %s: %m", where);
 
+                        log_warning_errno(r, "Failed to create directory %s: %m", where);
                         continue;
                 }
 
-#ifdef HAVE_SELINUX
-                if (arg_selinux_apifs_context &&
-                    (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
-                        options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
-                        if (!options)
-                                return log_oom();
-
-                        o = options;
-                } else
-#endif
-                        o = mount_table[k].options;
-
-                if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
-                        char *uid_options = NULL;
-
-                        if (o)
-                                asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
-                        else
-                                asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
-                        if (!uid_options)
+                o = mount_table[k].options;
+                if (streq_ptr(mount_table[k].type, "tmpfs")) {
+                        r = tmpfs_patch_options(o, &options);
+                        if (r < 0)
                                 return log_oom();
-
-                        free(options);
-                        o = options = uid_options;
+                        if (r > 0)
+                                o = options;
                 }
 
                 if (mount(mount_table[k].what,
@@ -947,155 +1180,321 @@ static int mount_all(const char *dest) {
                           mount_table[k].flags,
                           o) < 0) {
 
-                        if (mount_table[k].fatal) {
-                                log_error_errno(errno, "mount(%s) failed: %m", where);
+                        if (mount_table[k].fatal)
+                                return log_error_errno(errno, "mount(%s) failed: %m", where);
 
-                                if (r == 0)
-                                        r = -errno;
-                        } else
-                                log_warning_errno(errno, "mount(%s) failed: %m", where);
+                        log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
                 }
         }
 
-        return r;
+        return 0;
 }
 
-static int mount_binds(const char *dest, char **l, bool ro) {
-        char **x, **y;
+/* Parses the comma-separated bind mount option string 'options'.
+ *
+ * "rbind" sets MS_REC and "norbind" clears it in a local copy of *mount_flags;
+ * any other word is logged and rejected with -EINVAL. The outputs are written
+ * only on success: *mount_flags receives the updated flags and *mount_opts is
+ * set to NULL. Returns 0 on success, a negative errno-style value on failure. */
+static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
+        const char *p = options;
+        unsigned long flags = *mount_flags;
+        char *opts = NULL;
 
-        STRV_FOREACH_PAIR(x, y, l) {
-                _cleanup_free_ char *where = NULL;
-                struct stat source_st, dest_st;
-                int r;
+        assert(options);
 
-                if (stat(*x, &source_st) < 0)
-                        return log_error_errno(errno, "Failed to stat %s: %m", *x);
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+                int r = extract_first_word(&p, &word, ",", 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to extract mount option: %m");
+                /* r == 0 ends the loop (presumably: no further words in the string) */
+                if (r == 0)
+                        break;
 
-                where = strappend(dest, *y);
-                if (!where)
-                        return log_oom();
+                if (streq(word, "rbind"))
+                        flags |= MS_REC;
+                else if (streq(word, "norbind"))
+                        flags &= ~MS_REC;
+                else {
+                        log_error("Invalid bind mount option: %s", word);
+                        return -EINVAL;
+                }
+        }
 
-                r = stat(where, &dest_st);
-                if (r == 0) {
-                        if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
-                                log_error("Cannot bind mount directory %s on file %s.", *x, where);
-                                return -EINVAL;
-                        }
-                        if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
-                                log_error("Cannot bind mount file %s on directory %s.", *x, where);
-                                return -EINVAL;
-                        }
-                } else if (errno == ENOENT) {
-                        r = mkdir_parents_label(where, 0755);
-                        if (r < 0)
-                                return log_error_errno(r, "Failed to bind mount %s: %m", *x);
-                } else {
-                        log_error_errno(errno, "Failed to bind mount %s: %m", *x);
-                        return -errno;
+        /* Commit the results only now, so the caller's values stay untouched on failure */
+        *mount_flags = flags;
+        /* in the future mount_opts will hold string options for mount(2) */
+        *mount_opts = opts;
+
+        return 0;
+}
+
+/* Bind mounts m->source onto m->destination below dest.
+ *
+ * The mount defaults to MS_BIND|MS_REC; m->options, if set, may adjust the
+ * flags via parse_mount_bind_options(). A directory may only be mounted on a
+ * directory and a non-directory only on a non-directory. A missing mount point
+ * (and its parents) is created. If m->read_only is set the mount is afterwards
+ * remounted read-only with bind_remount_recursive().
+ *
+ * Returns 0 on success, a negative errno-style value on failure. */
+static int mount_bind(const char *dest, CustomMount *m) {
+        struct stat source_st, dest_st;
+        const char *where;
+        unsigned long mount_flags = MS_BIND | MS_REC;
+        _cleanup_free_ char *mount_opts = NULL;
+        int r;
+
+        assert(m);
+
+        if (m->options) {
+                r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
+                if (r < 0)
+                        return r;
+        }
+
+        if (stat(m->source, &source_st) < 0)
+                return log_error_errno(errno, "Failed to stat %s: %m", m->source);
+
+        where = prefix_roota(dest, m->destination);
+
+        if (stat(where, &dest_st) >= 0) {
+                /* The destination exists already: source and destination types must match */
+                if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
+                        log_error("Cannot bind mount directory %s on file %s.", m->source, where);
+                        return -EINVAL;
                 }
 
-                /* Create the mount point. Any non-directory file can be
-                 * mounted on any non-directory file (regular, fifo, socket,
-                 * char, block).
-                 */
-                if (S_ISDIR(source_st.st_mode)) {
-                        r = mkdir_label(where, 0755);
-                        if (r < 0 && errno != EEXIST)
-                                return log_error_errno(r, "Failed to create mount point %s: %m", where);
-                } else {
-                        r = touch(where);
-                        if (r < 0)
-                                return log_error_errno(r, "Failed to create mount point %s: %m", where);
+                if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
+                        log_error("Cannot bind mount file %s on directory %s.", m->source, where);
+                        return -EINVAL;
                 }
 
-                if (mount(*x, where, NULL, MS_BIND, NULL) < 0)
-                        return log_error_errno(errno, "mount(%s) failed: %m", where);
+        } else if (errno == ENOENT) {
+                r = mkdir_parents_label(where, 0755);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to make parents of %s: %m", where);
+        } else
+                /* Log and return in one step, like the other error paths of this file */
+                return log_error_errno(errno, "Failed to stat %s: %m", where);
 
-                if (ro) {
-                        r = bind_remount_recursive(where, true);
-                        if (r < 0)
-                                return log_error_errno(r, "Read-Only bind mount failed: %m");
-                }
+        /* Create the mount point. Any non-directory file can be
+         * mounted on any non-directory file (regular, fifo, socket,
+         * char, block).
+         */
+        if (S_ISDIR(source_st.st_mode))
+                r = mkdir_label(where, 0755);
+        else
+                r = touch(where);
+        if (r < 0 && r != -EEXIST)
+                return log_error_errno(r, "Failed to create mount point %s: %m", where);
+
+        if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
+                return log_error_errno(errno, "mount(%s) failed: %m", where);
+
+        if (m->read_only) {
+                r = bind_remount_recursive(where, true);
+                if (r < 0)
+                        return log_error_errno(r, "Read-only bind mount failed: %m");
         }
 
         return 0;
 }
 
-static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
-        char *to;
+/* Mounts a tmpfs at m->destination below dest.
+ *
+ * The mount point is created with mkdir_p_label() first. m->options is passed
+ * through tmpfs_patch_options(); the patched string is used when that call
+ * returns > 0, otherwise m->options is used unchanged.
+ *
+ * Returns 0 on success, a negative errno-style value on failure. */
+static int mount_tmpfs(const char *dest, CustomMount *m) {
+        const char *where, *options;
+        _cleanup_free_ char *buf = NULL;
         int r;
 
-        to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
+        assert(dest);
+        assert(m);
+
+        where = prefix_roota(dest, m->destination);
 
-        r = path_is_mount_point(to, false);
+        r = mkdir_p_label(where, 0755);
+        if (r < 0 && r != -EEXIST)
+                return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
+
+        /* Any failure of tmpfs_patch_options() is reported as out-of-memory below */
+        r = tmpfs_patch_options(m->options, &buf);
         if (r < 0)
-                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
-        if (r > 0)
-                return 0;
+                return log_oom();
+        options = r > 0 ? buf : m->options;
 
-        mkdir_p(to, 0755);
+        if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
+                return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
 
-        /* The superblock mount options of the mount point need to be
-         * identical to the hosts', and hence writable... */
-        if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
-                return log_error_errno(errno, "Failed to mount to %s: %m", to);
+        return 0;
+}
 
-        /* ... hence let's only make the bind mount read-only, not the
-         * superblock. */
-        if (read_only) {
-                if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
-                        return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
-        }
-        return 1;
+/* Builds the lower-directory list for an overlay mount option string: the
+ * entries of 'lower' are copied, put in reverse order, have ',' and ':'
+ * escaped, and are joined with ':'. Returns the joined string, or NULL if
+ * any step fails. */
+static char *joined_and_escaped_lower_dirs(char * const *lower) {
+        _cleanup_strv_free_ char **dirs = strv_copy(lower);
+
+        if (!dirs)
+                return NULL;
+
+        /* Work on the private copy only; the caller's list stays as it is */
+        strv_reverse(dirs);
+        if (!strv_shell_escape(dirs, ",:"))
+                return NULL;
+
+        return strv_join(dirs, ":");
 }
 
-static int mount_cgroup(const char *dest) {
-        _cleanup_set_free_free_ Set *controllers = NULL;
-        _cleanup_free_ char *own_cgroup_path = NULL;
-        const char *cgroup_root, *systemd_root, *systemd_own;
+/* Mounts an overlay file system at m->destination below dest.
+ *
+ * For a read-only mount m->source becomes the first lowerdir entry, followed
+ * by the joined m->lower list, and MS_RDONLY is set. Otherwise m->lower forms
+ * the lowerdir, m->source the upperdir and m->work_dir (which must be set)
+ * the workdir. Paths put into the option string have ',' and ':' escaped.
+ *
+ * Returns 0 on success, a negative errno-style value on failure. */
+static int mount_overlay(const char *dest, CustomMount *m) {
+        _cleanup_free_ char *lower = NULL;
+        const char *where, *options;
         int r;
 
-        controllers = set_new(&string_hash_ops);
-        if (!controllers)
-                return log_oom();
+        assert(dest);
+        assert(m);
 
-        r = cg_kernel_controllers(controllers);
-        if (r < 0)
-                return log_error_errno(r, "Failed to determine cgroup controllers: %m");
+        where = prefix_roota(dest, m->destination);
 
-        r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
-        if (r < 0)
-                return log_error_errno(r, "Failed to determine our own cgroup path: %m");
+        r = mkdir_label(where, 0755);
+        if (r < 0 && r != -EEXIST)
+                return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
 
-        cgroup_root = strjoina(dest, "/sys/fs/cgroup");
-        if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
-                return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
+        /* Best effort: the result of creating the source directory is deliberately ignored */
+        (void) mkdir_p_label(m->source, 0755);
 
-        for (;;) {
-                _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
+        lower = joined_and_escaped_lower_dirs(m->lower);
+        if (!lower)
+                return log_oom();
 
-                controller = set_steal_first(controllers);
-                if (!controller)
-                        break;
+        if (m->read_only) {
+                _cleanup_free_ char *escaped_source = NULL;
 
-                origin = strappend("/sys/fs/cgroup/", controller);
-                if (!origin)
+                escaped_source = shell_escape(m->source, ",:");
+                if (!escaped_source)
                         return log_oom();
 
-                r = readlink_malloc(origin, &combined);
-                if (r == -EINVAL) {
-                        /* Not a symbolic link, but directly a single cgroup hierarchy */
+                /* Read-only: no upperdir/workdir, the source is just one more lower layer */
+                options = strjoina("lowerdir=", escaped_source, ":", lower);
+        } else {
+                _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
 
-                        r = mount_cgroup_hierarchy(dest, controller, controller, true);
-                        if (r < 0)
-                                return r;
+                assert(m->work_dir);
+                /* Best effort as well: the result is deliberately ignored */
+                (void) mkdir_label(m->work_dir, 0700);
 
-                } else if (r < 0)
+                escaped_source = shell_escape(m->source, ",:");
+                if (!escaped_source)
+                        return log_oom();
+                escaped_work_dir = shell_escape(m->work_dir, ",:");
+                if (!escaped_work_dir)
+                        return log_oom();
+
+                options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
+        }
+
+        if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
+                return log_error_errno(errno, "overlay mount to %s failed: %m", where);
+
+        return 0;
+}
+
+/* Sets up every entry of arg_custom_mounts below dest, in array order, by
+ * dispatching on the entry's type. Stops at the first failure and returns
+ * its negative error value; returns 0 if all mounts succeeded. */
+static int mount_custom(const char *dest) {
+        unsigned n;
+
+        assert(dest);
+
+        for (n = 0; n < arg_n_custom_mounts; n++) {
+                CustomMount *cm = arg_custom_mounts + n;
+                int k;
+
+                switch (cm->type) {
+
+                case CUSTOM_MOUNT_BIND:
+                        k = mount_bind(dest, cm);
+                        break;
+
+                case CUSTOM_MOUNT_TMPFS:
+                        k = mount_tmpfs(dest, cm);
+                        break;
+
+                case CUSTOM_MOUNT_OVERLAY:
+                        k = mount_overlay(dest, cm);
+                        break;
+
+                default:
+                        assert_not_reached("Unknown custom mount type");
+                }
+
+                if (k < 0)
+                        return k;
+        }
+
+        return 0;
+}
+
+/* Mounts a cgroup file system with the mount option string 'controller' at
+ * <dest>/sys/fs/cgroup/<hierarchy>, unless that path is a mount point already.
+ * With read_only set, the new mount is additionally remounted read-only.
+ *
+ * Returns 1 if a mount was established, 0 if the path was mounted already,
+ * and a negative errno-style value on failure. */
+static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
+        char *target;
+        int k;
+
+        target = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
+
+        k = path_is_mount_point(target, 0);
+        if (k > 0)
+                return 0;
+        /* A missing path (-ENOENT) is not an error here, it is created right below */
+        if (k < 0 && k != -ENOENT)
+                return log_error_errno(k, "Failed to determine if %s is mounted already: %m", target);
+
+        (void) mkdir_p(target, 0755);
+
+        /* The superblock mount options of the mount point need to be
+         * identical to the hosts', and hence writable... */
+        if (mount("cgroup", target, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
+                return log_error_errno(errno, "Failed to mount to %s: %m", target);
+
+        /* ... hence let's only make the bind mount read-only, not the
+         * superblock. */
+        if (read_only &&
+            mount(NULL, target, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
+                return log_error_errno(errno, "Failed to remount %s read-only: %m", target);
+
+        return 1;
+}
+
+static int mount_legacy_cgroups(const char *dest) {
+        _cleanup_set_free_free_ Set *controllers = NULL;
+        const char *cgroup_root;
+        int r;
+
+        cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
+
+        /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
+        r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
+        if (r == 0) {
+                _cleanup_free_ char *options = NULL;
+
+                r = tmpfs_patch_options("mode=755", &options);
+                if (r < 0)
+                        return log_oom();
+
+                if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
+                        return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
+        }
+
+        if (cg_unified() > 0)
+                goto skip_controllers;
+
+        controllers = set_new(&string_hash_ops);
+        if (!controllers)
+                return log_oom();
+
+        r = cg_kernel_controllers(controllers);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine cgroup controllers: %m");
+
+        for (;;) {
+                _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
+
+                controller = set_steal_first(controllers);
+                if (!controller)
+                        break;
+
+                origin = prefix_root("/sys/fs/cgroup/", controller);
+                if (!origin)
+                        return log_oom();
+
+                r = readlink_malloc(origin, &combined);
+                if (r == -EINVAL) {
+                        /* Not a symbolic link, but directly a single cgroup hierarchy */
+
+                        r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
+                        if (r < 0)
+                                return r;
+
+                } else if (r < 0)
                         return log_error_errno(r, "Failed to read link %s: %m", origin);
                 else {
                         _cleanup_free_ char *target = NULL;
 
-                        target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
+                        target = prefix_root(dest, origin);
                         if (!target)
                                 return log_oom();
 
@@ -1106,59 +1505,145 @@ static int mount_cgroup(const char *dest) {
                                 continue;
                         }
 
-                        r = mount_cgroup_hierarchy(dest, combined, combined, true);
+                        r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
                         if (r < 0)
                                 return r;
 
-                        if (symlink(combined, target) < 0)
-                                return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
+                        r = symlink_idempotent(combined, target);
+                        if (r == -EINVAL) {
+                                log_error("Invalid existing symlink for combined hierarchy");
+                                return r;
+                        }
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
                 }
         }
 
-        r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
+skip_controllers:
+        r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
         if (r < 0)
                 return r;
 
+        if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
+                return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
+
+        return 0;
+}
+
+/* Mounts the unified cgroup hierarchy at <dest>/sys/fs/cgroup.
+ *
+ * If something is mounted there already, it is accepted when it contains a
+ * "cgroup.procs" file (return 0) and refused with -EINVAL otherwise.
+ *
+ * Returns 0 on success, a negative errno-style value on failure. */
+static int mount_unified_cgroups(const char *dest) {
+        const char *p;
+        int r;
+
+        assert(dest);
+
+        p = strjoina(dest, "/sys/fs/cgroup");
+
+        r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
+        if (r > 0) {
+                const char *procs;
+
+                /* Probe with a separate variable, so that the messages below
+                 * keep naming the mount point rather than the probed file */
+                procs = strjoina(dest, "/sys/fs/cgroup/cgroup.procs");
+                if (access(procs, F_OK) >= 0)
+                        return 0;
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
+
+                log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
+                return -EINVAL;
+        }
+
+        if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0)
+                return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
+
+        return 0;
+}
+
+/* Sets up /sys/fs/cgroup below dest: uses the unified layout when
+ * arg_unified_cgroup_hierarchy is set and the legacy layout otherwise,
+ * and passes the chosen helper's return value through. */
+static int mount_cgroups(const char *dest) {
+        return arg_unified_cgroup_hierarchy
+                ? mount_unified_cgroups(dest)
+                : mount_legacy_cgroups(dest);
+}
+
+/* Bind mounts our own cgroup directory (the path reported by
+ * cg_pid_get_path(NULL, 0, ...), taken below dest) onto itself and then
+ * remounts the enclosing cgroup root read-only. The root is
+ * <dest>/sys/fs/cgroup when arg_unified_cgroup_hierarchy is set and
+ * <dest>/sys/fs/cgroup/systemd otherwise.
+ *
+ * Returns 0 on success or when our cgroup path is "/", and a negative
+ * errno-style value on failure. */
+static int mount_systemd_cgroup_writable(const char *dest) {
+        _cleanup_free_ char *own_cgroup_path = NULL;
+        const char *systemd_root, *systemd_own;
+        int r;
+
+        assert(dest);
+
+        r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine our own cgroup path: %m");
+
+        /* If we are living in the top-level, then there's nothing to do... */
+        if (path_equal(own_cgroup_path, "/"))
+                return 0;
+
+        /* Pick the paths matching the hierarchy layout in use */
+        if (arg_unified_cgroup_hierarchy) {
+                systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
+                systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
+        } else {
+                systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
+                systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
+        }
+
         /* Make our own cgroup a (writable) bind mount */
-        systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
 
         /* And then remount the systemd cgroup root read-only */
-        systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
 
-        if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
-                return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
-
         return 0;
 }
 
-static int mount_tmpfs(const char *dest) {
-        char **i, **o;
+/* Changes the ownership of p with lchown(), after shifting uid and gid by
+ * arg_uid_shift.
+ *
+ * Does nothing and returns 0 when arg_userns is off, or when both uid and gid
+ * are invalid. An ID that is UID_INVALID/GID_INVALID is handed to lchown()
+ * unshifted. Returns -EOVERFLOW if a shifted ID falls outside of
+ * [arg_uid_shift, arg_uid_shift + arg_uid_range), and -errno if lchown()
+ * fails. */
+static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
+        assert(p);
 
-        STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
-                _cleanup_free_ char *where = NULL;
-                int r;
+        if (!arg_userns)
+                return 0;
 
-                where = strappend(dest, *i);
-                if (!where)
-                        return log_oom();
+        if (uid == UID_INVALID && gid == GID_INVALID)
+                return 0;
 
-                r = mkdir_label(where, 0755);
-                if (r < 0 && r != -EEXIST)
-                        return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
+        if (uid != UID_INVALID) {
+                uid += arg_uid_shift;
+
+                /* The first comparison catches a wrap-around of the addition above */
+                if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
+                        return -EOVERFLOW;
+        }
 
-                if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
-                        return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
+        if (gid != GID_INVALID) {
+                /* GIDs are shifted and bounded by the same values as UIDs */
+                gid += (gid_t) arg_uid_shift;
+
+                if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
+                        return -EOVERFLOW;
         }
 
+        if (lchown(p, uid, gid) < 0)
+                return -errno;
+
         return 0;
 }
 
+/* Creates the directory 'path' below 'root' with the given mode and, if it
+ * was newly created, hands it to userns_lchown() with uid/gid. An already
+ * existing entry counts as success and is left as it is (no chown).
+ * Returns 0 on success, a negative errno-style value on failure. */
+static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
+        const char *full;
+
+        full = prefix_roota(root, path);
+
+        if (mkdir(full, mode) >= 0)
+                return userns_lchown(full, uid, gid);
+
+        return errno == EEXIST ? 0 : -errno;
+}
+
 static int setup_timezone(const char *dest) {
-        _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
+        _cleanup_free_ char *p = NULL, *q = NULL;
+        const char *where, *check, *what;
         char *z, *y;
         int r;
 
@@ -1179,10 +1664,7 @@ static int setup_timezone(const char *dest) {
                 return 0;
         }
 
-        where = strappend(dest, "/etc/localtime");
-        if (!where)
-                return log_oom();
-
+        where = prefix_roota(dest, "/etc/localtime");
         r = readlink_malloc(where, &q);
         if (r >= 0) {
                 y = path_startswith(q, "../usr/share/zoneinfo/");
@@ -1194,43 +1676,34 @@ static int setup_timezone(const char *dest) {
                         return 0;
         }
 
-        check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
-        if (!check)
-                return log_oom();
-
-        if (access(check, F_OK) < 0) {
+        check = strjoina("/usr/share/zoneinfo/", z);
+        check = prefix_root(dest, check);
+        if (laccess(check, F_OK) < 0) {
                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
                 return 0;
         }
 
-        what = strappend("../usr/share/zoneinfo/", z);
-        if (!what)
-                return log_oom();
-
-        r = mkdir_parents(where, 0755);
-        if (r < 0) {
-                log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
-
-                return 0;
-        }
-
         r = unlink(where);
         if (r < 0 && errno != ENOENT) {
                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
-
                 return 0;
         }
 
+        what = strjoina("../usr/share/zoneinfo/", z);
         if (symlink(what, where) < 0) {
                 log_error_errno(errno, "Failed to correct timezone of container: %m");
                 return 0;
         }
 
+        r = userns_lchown(where, 0, 0);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
+
         return 0;
 }
 
 static int setup_resolv_conf(const char *dest) {
-        _cleanup_free_ char *where = NULL;
+        const char *where = NULL;
         int r;
 
         assert(dest);
@@ -1239,31 +1712,33 @@ static int setup_resolv_conf(const char *dest) {
                 return 0;
 
         /* Fix resolv.conf, if possible */
-        where = strappend(dest, "/etc/resolv.conf");
-        if (!where)
-                return log_oom();
-
-        /* We don't really care for the results of this really. If it
-         * fails, it fails, but meh... */
-        r = mkdir_parents(where, 0755);
-        if (r < 0) {
-                log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
-
-                return 0;
-        }
+        where = prefix_roota(dest, "/etc/resolv.conf");
 
         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
         if (r < 0) {
-                log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
-
+                /* If the file already exists as symlink, let's
+                 * suppress the warning, under the assumption that
+                 * resolved or something similar runs inside and the
+                 * symlink points there.
+                 *
+                 * If the disk image is read-only, there's also no
+                 * point in complaining.
+                 */
+                log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
+                               "Failed to copy /etc/resolv.conf to %s: %m", where);
                 return 0;
         }
 
+        r = userns_lchown(where, 0, 0);
+        if (r < 0)
+                log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
+
         return 0;
 }
 
 static int setup_volatile_state(const char *directory) {
-        const char *p;
+        _cleanup_free_ char *buf = NULL;
+        const char *p, *options;
         int r;
 
         assert(directory);
@@ -1278,12 +1753,19 @@ static int setup_volatile_state(const char *directory) {
         if (r < 0)
                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
 
-        p = strjoina(directory, "/var");
+        p = prefix_roota(directory, "/var");
         r = mkdir(p, 0755);
         if (r < 0 && errno != EEXIST)
                 return log_error_errno(errno, "Failed to create %s: %m", directory);
 
-        if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
+        options = "mode=755";
+        r = tmpfs_patch_options(options, &buf);
+        if (r < 0)
+                return log_oom();
+        if (r > 0)
+                options = buf;
+
+        if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
 
         return 0;
@@ -1292,7 +1774,8 @@ static int setup_volatile_state(const char *directory) {
 static int setup_volatile(const char *directory) {
         bool tmpfs_mounted = false, bind_mounted = false;
         char template[] = "/tmp/nspawn-volatile-XXXXXX";
-        const char *f, *t;
+        _cleanup_free_ char *buf = NULL;
+        const char *f, *t, *options;
         int r;
 
         assert(directory);
@@ -1306,27 +1789,31 @@ static int setup_volatile(const char *directory) {
         if (!mkdtemp(template))
                 return log_error_errno(errno, "Failed to create temporary directory: %m");
 
-        if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
-                log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
-                r = -errno;
+        options = "mode=755";
+        r = tmpfs_patch_options(options, &buf);
+        if (r < 0)
+                return log_oom();
+        if (r > 0)
+                options = buf;
+
+        if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
+                r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
                 goto fail;
         }
 
         tmpfs_mounted = true;
 
-        f = strjoina(directory, "/usr");
-        t = strjoina(template, "/usr");
+        f = prefix_roota(directory, "/usr");
+        t = prefix_roota(template, "/usr");
 
         r = mkdir(t, 0755);
         if (r < 0 && errno != EEXIST) {
-                log_error_errno(errno, "Failed to create %s: %m", t);
-                r = -errno;
+                r = log_error_errno(errno, "Failed to create %s: %m", t);
                 goto fail;
         }
 
         if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
-                log_error_errno(errno, "Failed to create /usr bind mount: %m");
-                r = -errno;
+                r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
                 goto fail;
         }
 
@@ -1339,25 +1826,26 @@ static int setup_volatile(const char *directory) {
         }
 
         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
-                log_error_errno(errno, "Failed to move root mount: %m");
-                r = -errno;
+                r = log_error_errno(errno, "Failed to move root mount: %m");
                 goto fail;
         }
 
-        rmdir(template);
+        (void) rmdir(template);
 
         return 0;
 
 fail:
         if (bind_mounted)
-                umount(t);
+                (void) umount(t);
+
         if (tmpfs_mounted)
-                umount(template);
-        rmdir(template);
+                (void) umount(template);
+        (void) rmdir(template);
         return r;
 }
 
 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
+        assert(s);
 
         snprintf(s, 37,
                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
@@ -1367,23 +1855,19 @@ static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
 }
 
 static int setup_boot_id(const char *dest) {
-        _cleanup_free_ char *from = NULL, *to = NULL;
+        const char *from, *to;
         sd_id128_t rnd = {};
         char as_uuid[37];
         int r;
 
-        assert(dest);
-
         if (arg_share_system)
                 return 0;
 
         /* Generate a new randomized boot ID, so that each boot-up of
          * the container gets a new one */
 
-        from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
-        to = strappend(dest, "/proc/sys/kernel/random/boot_id");
-        if (!from || !to)
-                return log_oom();
+        from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
+        to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
 
         r = sd_id128_randomize(&rnd);
         if (r < 0)
@@ -1391,14 +1875,13 @@ static int setup_boot_id(const char *dest) {
 
         id128_format_as_uuid(rnd, as_uuid);
 
-        r = write_string_file(from, as_uuid);
+        r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
         if (r < 0)
                 return log_error_errno(r, "Failed to write boot id: %m");
 
-        if (mount(from, to, NULL, MS_BIND, NULL) < 0) {
-                log_error_errno(errno, "Failed to bind mount boot id: %m");
-                r = -errno;
-        } else if (mount(from, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
+        if (mount(from, to, NULL, MS_BIND, NULL) < 0)
+                r = log_error_errno(errno, "Failed to bind mount boot id: %m");
+        else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
 
         unlink(from);
@@ -1424,14 +1907,24 @@ static int copy_devnodes(const char *dest) {
 
         u = umask(0000);
 
+        /* Create /dev/net, so that we can create /dev/net/tun in it. Keep the
+         * result in r so that the actual error is what gets logged and returned. */
+        r = userns_mkdir(dest, "/dev/net", 0755, 0, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create /dev/net directory: %m");
+
         NULSTR_FOREACH(d, devnodes) {
                 _cleanup_free_ char *from = NULL, *to = NULL;
                 struct stat st;
 
                 from = strappend("/dev/", d);
-                to = strjoin(dest, "/dev/", d, NULL);
-                if (!from || !to)
+                /* Both strings are allocated; bail out on OOM before using them */
+                if (!from)
+                        return log_oom();
+
+                to = prefix_root(dest, from);
+                if (!to)
                         return log_oom();
 
                 if (stat(from, &st) < 0) {
 
@@ -1440,16 +1925,10 @@ static int copy_devnodes(const char *dest) {
 
                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 
-                        log_error("%s is not a char or block device, cannot copy", from);
+                        log_error("%s is not a char or block device, cannot copy.", from);
                         return -EIO;
 
                 } else {
-                        r = mkdir_parents(to, 0775);
-                        if (r < 0) {
-                                log_error_errno(r, "Failed to create parent directory of %s: %m", to);
-                                return -r;
-                        }
-
                         if (mknod(to, st.st_mode, st.st_rdev) < 0) {
                                 if (errno != EPERM)
                                         return log_error_errno(errno, "mknod(%s) failed: %m", to);
@@ -1463,28 +1942,64 @@ static int copy_devnodes(const char *dest) {
                                         return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
                         }
 
-                        if (arg_userns && arg_uid_shift != UID_INVALID)
-                                if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
-                                        return log_error_errno(errno, "chown() of device node %s failed: %m", to);
+                        r = userns_lchown(to, 0, 0);
+                        if (r < 0)
+                                return log_error_errno(r, "chown() of device node %s failed: %m", to);
                 }
         }
 
         return r;
 }
 
-static int setup_ptmx(const char *dest) {
-        _cleanup_free_ char *p = NULL;
+/* Mounts a new devpts instance on <dest>/dev/pts, creates the <dest>/dev/ptmx
+ * symlink to pts/ptmx, and runs userns_lchown(..., 0, 0) on /dev/pts,
+ * /dev/ptmx and /dev/pts/ptmx. Returns 0 on success; on failure returns
+ * the value produced by log_oom()/log_error_errno(). */
+static int setup_pts(const char *dest) {
+        _cleanup_free_ char *options = NULL;
+        const char *p;
+        int r;
+
+#ifdef HAVE_SELINUX
+        if (arg_selinux_apifs_context)
+                (void) asprintf(&options,
+                                "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
+                                arg_uid_shift + TTY_GID,
+                                arg_selinux_apifs_context);
+        else
+#endif
+                (void) asprintf(&options,
+                                "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
+                                arg_uid_shift + TTY_GID);
 
-        p = strappend(dest, "/dev/ptmx");
-        if (!p)
+        if (!options)
                 return log_oom();
 
+        /* Mount /dev/pts itself */
+        p = prefix_roota(dest, "/dev/pts");
+        if (mkdir(p, 0755) < 0)
+                return log_error_errno(errno, "Failed to create /dev/pts: %m");
+        if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
+                return log_error_errno(errno, "Failed to mount /dev/pts: %m");
+        /* userns_lchown() is handled as returning a negative error code in
+         * copy_devnodes(), so log that value rather than a possibly stale errno */
+        r = userns_lchown(p, 0, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to chown /dev/pts: %m");
+
+        /* Create /dev/ptmx symlink */
+        p = prefix_roota(dest, "/dev/ptmx");
         if (symlink("pts/ptmx", p) < 0)
                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
+        r = userns_lchown(p, 0, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
 
-        if (arg_userns && arg_uid_shift != UID_INVALID)
-                if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
-                        return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
+        /* And fix /dev/pts/ptmx ownership */
+        p = prefix_roota(dest, "/dev/pts/ptmx");
+        r = userns_lchown(p, 0, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
 
         return 0;
 }
@@ -1499,7 +2004,7 @@ static int setup_dev_console(const char *dest, const char *console) {
 
         u = umask(0000);
 
-        r = chmod_and_chown(console, 0600, 0, 0);
+        r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
         if (r < 0)
                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
 
@@ -1507,7 +2012,7 @@ static int setup_dev_console(const char *dest, const char *console) {
          * ptys can only exist on pts file systems. To have something
          * to bind mount things on we create a empty regular file. */
 
-        to = strjoina(dest, "/dev/console");
+        to = prefix_roota(dest, "/dev/console");
         r = touch(to);
         if (r < 0)
                 return log_error_errno(r, "touch() for /dev/console failed: %m");
@@ -1519,9 +2024,9 @@ static int setup_dev_console(const char *dest, const char *console) {
 }
 
 static int setup_kmsg(const char *dest, int kmsg_socket) {
-        _cleanup_free_ char *from = NULL, *to = NULL;
+        const char *from, *to;
         _cleanup_umask_ mode_t u;
-        int r, fd, k;
+        int fd, k;
         union {
                 struct cmsghdr cmsghdr;
                 uint8_t buf[CMSG_SPACE(sizeof(int))];
@@ -1532,29 +2037,22 @@ static int setup_kmsg(const char *dest, int kmsg_socket) {
         };
         struct cmsghdr *cmsg;
 
-        assert(dest);
         assert(kmsg_socket >= 0);
 
         u = umask(0000);
 
-        /* We create the kmsg FIFO as /dev/kmsg, but immediately
+        /* We create the kmsg FIFO as /run/kmsg, but immediately
          * delete it after bind mounting it to /proc/kmsg. While FIFOs
          * on the reading side behave very similar to /proc/kmsg,
          * their writing side behaves differently from /dev/kmsg in
          * that writing blocks when nothing is reading. In order to
          * avoid any problems with containers deadlocking due to this
          * we simply make /dev/kmsg unavailable to the container. */
-        if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
-            asprintf(&to, "%s/proc/kmsg", dest) < 0)
-                return log_oom();
+        from = prefix_roota(dest, "/run/kmsg");
+        to = prefix_roota(dest, "/proc/kmsg");
 
         if (mkfifo(from, 0600) < 0)
-                return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
-
-        r = chmod_and_chown(from, 0600, 0, 0);
-        if (r < 0)
-                return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
-
+                return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
         if (mount(from, to, NULL, MS_BIND, NULL) < 0)
                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
 
@@ -1578,8 +2076,9 @@ static int setup_kmsg(const char *dest, int kmsg_socket) {
         if (k < 0)
                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
 
-        /* And now make the FIFO unavailable as /dev/kmsg... */
-        unlink(from);
+        /* And now make the FIFO unavailable as /run/kmsg... */
+        (void) unlink(from);
+
         return 0;
 }
 
@@ -1603,7 +2102,7 @@ static int send_rtnl(int send_fd) {
 
         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
         if (fd < 0)
-                return log_error_errno(errno, "failed to allocate container netlink: %m");
+                return log_error_errno(errno, "Failed to allocate container netlink: %m");
 
         cmsg = CMSG_FIRSTHDR(&mh);
         cmsg->cmsg_level = SOL_SOCKET;
@@ -1655,7 +2154,7 @@ static int flush_ports(union in_addr_union *exposed) {
         return 0;
 }
 
-static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
+static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
         _cleanup_free_ struct local_address *addresses = NULL;
         _cleanup_free_ char *pretty = NULL;
         union in_addr_union new_exposed;
@@ -1709,7 +2208,7 @@ static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
         return 0;
 }
 
-static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
+static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
         union in_addr_union *exposed = userdata;
 
         assert(rtnl);
@@ -1720,7 +2219,7 @@ static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata)
         return 0;
 }
 
-static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
+static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
         union {
                 struct cmsghdr cmsghdr;
                 uint8_t buf[CMSG_SPACE(sizeof(int))];
@@ -1730,7 +2229,7 @@ static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed
                 .msg_controllen = sizeof(control),
         };
         struct cmsghdr *cmsg;
-        _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
+        _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
         int fd, r;
         ssize_t k;
 
@@ -1751,21 +2250,21 @@ static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed
         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
 
-        r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
+        r = sd_netlink_open_fd(&rtnl, fd);
         if (r < 0) {
                 safe_close(fd);
                 return log_error_errno(r, "Failed to create rtnl object: %m");
         }
 
-        r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
+        r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
         if (r < 0)
                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
 
-        r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
+        r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
         if (r < 0)
                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
 
-        r = sd_rtnl_attach_event(rtnl, event, 0);
+        r = sd_netlink_attach_event(rtnl, event, 0);
         if (r < 0)
                 return log_error_errno(r, "Failed to add to even loop: %m");
 
@@ -1788,7 +2287,8 @@ static int setup_hostname(void) {
 
 static int setup_journal(const char *directory) {
         sd_id128_t machine_id, this_id;
-        _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
+        _cleanup_free_ char *b = NULL, *d = NULL;
+        const char *etc_machine_id, *p, *q;
         char *id;
         int r;
 
@@ -1796,15 +2296,13 @@ static int setup_journal(const char *directory) {
         if (arg_ephemeral)
                 return 0;
 
-        p = strappend(directory, "/etc/machine-id");
-        if (!p)
-                return log_oom();
+        etc_machine_id = prefix_roota(directory, "/etc/machine-id");
 
-        r = read_one_line_file(p, &b);
+        r = read_one_line_file(etc_machine_id, &b);
         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
                 return 0;
         else if (r < 0)
-                return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
+                return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
 
         id = strstrip(b);
         if (isempty(id) && arg_link_journal == LINK_AUTO)
@@ -1813,7 +2311,7 @@ static int setup_journal(const char *directory) {
         /* Verify validity */
         r = sd_id128_from_string(id, &machine_id);
         if (r < 0)
-                return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
+                return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
 
         r = sd_id128_get_machine(&this_id);
         if (r < 0)
@@ -1830,13 +2328,22 @@ static int setup_journal(const char *directory) {
         if (arg_link_journal == LINK_NO)
                 return 0;
 
-        free(p);
-        p = strappend("/var/log/journal/", id);
-        q = strjoin(directory, "/var/log/journal/", id, NULL);
-        if (!p || !q)
-                return log_oom();
+        r = userns_mkdir(directory, "/var", 0755, 0, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create /var: %m");
+
+        r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create /var/log: %m");
 
-        if (path_is_mount_point(p, false) > 0) {
+        r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create /var/log/journal: %m");
+
+        p = strjoina("/var/log/journal/", id);
+        q = prefix_roota(directory, p);
+
+        if (path_is_mount_point(p, 0) > 0) {
                 if (arg_link_journal != LINK_AUTO) {
                         log_error("%s: already a mount point, refusing to use for journal", p);
                         return -EEXIST;
@@ -1845,7 +2352,7 @@ static int setup_journal(const char *directory) {
                 return 0;
         }
 
-        if (path_is_mount_point(q, false) > 0) {
+        if (path_is_mount_point(q, 0) > 0) {
                 if (arg_link_journal != LINK_AUTO) {
                         log_error("%s: already a mount point, refusing to use for journal", q);
                         return -EEXIST;
@@ -1860,7 +2367,8 @@ static int setup_journal(const char *directory) {
                      arg_link_journal == LINK_AUTO) &&
                     path_equal(d, q)) {
 
-                        r = mkdir_p(q, 0755);
+                        /* Log the code returned in r rather than a possibly stale errno */
+                        r = userns_mkdir(directory, p, 0755, 0, 0);
                         if (r < 0)
-                                log_warning_errno(errno, "Failed to create directory %s: %m", q);
+                                log_warning_errno(r, "Failed to create directory %s: %m", q);
                         return 0;
@@ -1898,7 +2405,8 @@ static int setup_journal(const char *directory) {
                         }
                 }
 
-                r = mkdir_p(q, 0755);
+                /* Log the code returned in r rather than a possibly stale errno */
+                r = userns_mkdir(directory, p, 0755, 0, 0);
                 if (r < 0)
-                        log_warning_errno(errno, "Failed to create directory %s: %m", q);
+                        log_warning_errno(r, "Failed to create directory %s: %m", q);
                 return 0;
@@ -1924,7 +2431,8 @@ static int setup_journal(const char *directory) {
         if (dir_is_empty(q) == 0)
                 log_warning("%s is not empty, proceeding anyway.", q);
 
-        r = mkdir_p(q, 0755);
+        /* Log the same code that is returned in r, not a possibly stale errno */
+        r = userns_mkdir(directory, p, 0755, 0, 0);
         if (r < 0) {
-                log_error_errno(errno, "Failed to create %s: %m", q);
+                log_error_errno(r, "Failed to create %s: %m", q);
                 return r;
@@ -1942,7 +2449,7 @@ static int drop_capabilities(void) {
 
 static int register_machine(pid_t pid, int local_ifindex) {
         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
-        _cleanup_bus_close_unref_ sd_bus *bus = NULL;
+        _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
         int r;
 
         if (!arg_register)
@@ -1972,6 +2479,7 @@ static int register_machine(pid_t pid, int local_ifindex) {
         } else {
                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
                 char **i;
+                unsigned j;
 
                 r = sd_bus_message_new_method_call(
                                 bus,
@@ -2010,6 +2518,10 @@ static int register_machine(pid_t pid, int local_ifindex) {
                 if (r < 0)
                         return bus_log_create_error(r);
 
+                /* If you make changes here, also make sure to update
+                 * systemd-nspawn@.service, to keep the device
+                 * policies in sync regardless if we are run with or
+                 * without the --keep-unit switch. */
                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
                                           /* Allow the container to
                                            * access and create the API
@@ -2032,7 +2544,35 @@ static int register_machine(pid_t pid, int local_ifindex) {
                                           "/dev/pts/ptmx", "rw",
                                           "char-pts", "rw");
                 if (r < 0)
-                        return log_error_errno(r, "Failed to add device whitelist: %m");
+                        return bus_log_create_error(r);
+
+                for (j = 0; j < arg_n_custom_mounts; j++) {
+                        CustomMount *cm = &arg_custom_mounts[j];
+
+                        if (cm->type != CUSTOM_MOUNT_BIND)
+                                continue;
+
+                        r = is_device_node(cm->source);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to stat %s: %m", cm->source);
+
+                        if (r) {
+                                r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
+                                        cm->source, cm->read_only ? "r" : "rw");
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to append message arguments: %m");
+                        }
+                }
+
+                if (arg_kill_signal != 0) {
+                        r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                }
 
                 STRV_FOREACH(i, arg_property) {
                         r = sd_bus_message_open_container(m, 'r', "sv");
@@ -2066,13 +2606,18 @@ static int register_machine(pid_t pid, int local_ifindex) {
 static int terminate_machine(pid_t pid) {
         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
-        _cleanup_bus_close_unref_ sd_bus *bus = NULL;
+        _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
         const char *path;
         int r;
 
         if (!arg_register)
                 return 0;
 
+        /* If we are reusing the unit, then just exit, systemd will do
+         * the right thing when we exit. */
+        if (arg_keep_unit)
+                return 0;
+
         r = sd_bus_default_system(&bus);
         if (r < 0)
                 return log_error_errno(r, "Failed to open system bus: %m");
@@ -2133,13 +2678,14 @@ static int reset_audit_loginuid(void) {
         if (streq(p, "4294967295"))
                 return 0;
 
-        r = write_string_file("/proc/self/loginuid", "4294967295");
+        r = write_string_file("/proc/self/loginuid", "4294967295", 0);
         if (r < 0) {
-                log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
-                          "old and you have audit enabled. Note that the auditing subsystem is known to\n"
-                          "be incompatible with containers on old kernels. Please make sure to upgrade\n"
-                          "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
-                          "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
+                log_error_errno(r,
+                                "Failed to reset audit login UID. This probably means that your kernel is too\n"
+                                "old and you have audit enabled. Note that the auditing subsystem is known to\n"
+                                "be incompatible with containers on old kernels. Please make sure to upgrade\n"
+                                "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
+                                "using systemd-nspawn. Sleeping for 5s... (%m)");
 
                 sleep(5);
         }
@@ -2192,8 +2738,8 @@ static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t id
 }
 
 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
-        _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
-        _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
+        _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
+        _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
         struct ether_addr mac_host, mac_container;
         int r, i;
 
@@ -2216,7 +2762,7 @@ static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
         if (r < 0)
                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
 
-        r = sd_rtnl_open(&rtnl, 0);
+        r = sd_netlink_open(&rtnl);
         if (r < 0)
                 return log_error_errno(r, "Failed to connect to netlink: %m");
 
@@ -2224,53 +2770,53 @@ static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
         if (r < 0)
                 return log_error_errno(r, "Failed to allocate netlink message: %m");
 
-        r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
+        r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
         if (r < 0)
                 return log_error_errno(r, "Failed to add netlink interface name: %m");
 
-        r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
+        r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
         if (r < 0)
                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
 
-        r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
+        r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
         if (r < 0)
                 return log_error_errno(r, "Failed to open netlink container: %m");
 
-        r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
+        r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
         if (r < 0)
                 return log_error_errno(r, "Failed to open netlink container: %m");
 
-        r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
+        r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
         if (r < 0)
                 return log_error_errno(r, "Failed to open netlink container: %m");
 
-        r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
+        r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
         if (r < 0)
                 return log_error_errno(r, "Failed to add netlink interface name: %m");
 
-        r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
+        r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
         if (r < 0)
                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
 
-        r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
+        r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
         if (r < 0)
                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
 
-        r = sd_rtnl_message_close_container(m);
+        r = sd_netlink_message_close_container(m);
         if (r < 0)
                 return log_error_errno(r, "Failed to close netlink container: %m");
 
-        r = sd_rtnl_message_close_container(m);
+        r = sd_netlink_message_close_container(m);
         if (r < 0)
                 return log_error_errno(r, "Failed to close netlink container: %m");
 
-        r = sd_rtnl_message_close_container(m);
+        r = sd_netlink_message_close_container(m);
         if (r < 0)
                 return log_error_errno(r, "Failed to close netlink container: %m");
 
-        r = sd_rtnl_call(rtnl, m, 0, NULL);
+        r = sd_netlink_call(rtnl, m, 0, NULL);
         if (r < 0)
-                return log_error_errno(r, "Failed to add new veth interfaces: %m");
+                return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
 
         i = (int) if_nametoindex(iface_name);
         if (i <= 0)
@@ -2282,8 +2828,8 @@ static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
 }
 
 static int setup_bridge(const char veth_name[], int *ifi) {
-        _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
-        _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
+        _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
+        _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
         int r, bridge;
 
         if (!arg_private_network)
@@ -2301,7 +2847,7 @@ static int setup_bridge(const char veth_name[], int *ifi) {
 
         *ifi = bridge;
 
-        r = sd_rtnl_open(&rtnl, 0);
+        r = sd_netlink_open(&rtnl);
         if (r < 0)
                 return log_error_errno(r, "Failed to connect to netlink: %m");
 
@@ -2313,15 +2859,15 @@ static int setup_bridge(const char veth_name[], int *ifi) {
         if (r < 0)
                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
 
-        r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
+        r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
         if (r < 0)
                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
 
-        r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
+        r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
         if (r < 0)
                 return log_error_errno(r, "Failed to add netlink master field: %m");
 
-        r = sd_rtnl_call(rtnl, m, 0, NULL);
+        r = sd_netlink_call(rtnl, m, 0, NULL);
         if (r < 0)
                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
 
@@ -2352,7 +2898,7 @@ static int parse_interface(struct udev *udev, const char *name) {
 
 static int move_network_interfaces(pid_t pid) {
         _cleanup_udev_unref_ struct udev *udev = NULL;
-        _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
+        _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
         char **i;
         int r;
 
@@ -2362,7 +2908,7 @@ static int move_network_interfaces(pid_t pid) {
         if (strv_isempty(arg_network_interfaces))
                 return 0;
 
-        r = sd_rtnl_open(&rtnl, 0);
+        r = sd_netlink_open(&rtnl);
         if (r < 0)
                 return log_error_errno(r, "Failed to connect to netlink: %m");
 
@@ -2373,7 +2919,7 @@ static int move_network_interfaces(pid_t pid) {
         }
 
         STRV_FOREACH(i, arg_network_interfaces) {
-                _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
+                _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
                 int ifi;
 
                 ifi = parse_interface(udev, *i);
@@ -2384,11 +2930,11 @@ static int move_network_interfaces(pid_t pid) {
                 if (r < 0)
                         return log_error_errno(r, "Failed to allocate netlink message: %m");
 
-                r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
+                r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
                 if (r < 0)
                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
 
-                r = sd_rtnl_call(rtnl, m, 0, NULL);
+                r = sd_netlink_call(rtnl, m, 0, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
         }
@@ -2398,7 +2944,7 @@ static int move_network_interfaces(pid_t pid) {
 
 static int setup_macvlan(pid_t pid) {
         _cleanup_udev_unref_ struct udev *udev = NULL;
-        _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
+        _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
         unsigned idx = 0;
         char **i;
         int r;
@@ -2409,7 +2955,7 @@ static int setup_macvlan(pid_t pid) {
         if (strv_isempty(arg_network_macvlan))
                 return 0;
 
-        r = sd_rtnl_open(&rtnl, 0);
+        r = sd_netlink_open(&rtnl);
         if (r < 0)
                 return log_error_errno(r, "Failed to connect to netlink: %m");
 
@@ -2420,7 +2966,7 @@ static int setup_macvlan(pid_t pid) {
         }
 
         STRV_FOREACH(i, arg_network_macvlan) {
-                _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
+                _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
                 _cleanup_free_ char *n = NULL;
                 struct ether_addr mac;
                 int ifi;
@@ -2437,7 +2983,7 @@ static int setup_macvlan(pid_t pid) {
                 if (r < 0)
                         return log_error_errno(r, "Failed to allocate netlink message: %m");
 
-                r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
+                r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
                 if (r < 0)
                         return log_error_errno(r, "Failed to add netlink interface index: %m");
 
@@ -2447,39 +2993,39 @@ static int setup_macvlan(pid_t pid) {
 
                 strshorten(n, IFNAMSIZ-1);
 
-                r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
+                r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
                 if (r < 0)
                         return log_error_errno(r, "Failed to add netlink interface name: %m");
 
-                r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
+                r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
                 if (r < 0)
                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
 
-                r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
+                r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
                 if (r < 0)
                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
 
-                r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
+                r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
                 if (r < 0)
                         return log_error_errno(r, "Failed to open netlink container: %m");
 
-                r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
+                r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
                 if (r < 0)
                         return log_error_errno(r, "Failed to open netlink container: %m");
 
-                r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
+                r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
                 if (r < 0)
                         return log_error_errno(r, "Failed to append macvlan mode: %m");
 
-                r = sd_rtnl_message_close_container(m);
+                r = sd_netlink_message_close_container(m);
                 if (r < 0)
                         return log_error_errno(r, "Failed to close netlink container: %m");
 
-                r = sd_rtnl_message_close_container(m);
+                r = sd_netlink_message_close_container(m);
                 if (r < 0)
                         return log_error_errno(r, "Failed to close netlink container: %m");
 
-                r = sd_rtnl_call(rtnl, m, 0, NULL);
+                r = sd_netlink_call(rtnl, m, 0, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
         }
@@ -2489,7 +3035,7 @@ static int setup_macvlan(pid_t pid) {
 
 static int setup_ipvlan(pid_t pid) {
         _cleanup_udev_unref_ struct udev *udev = NULL;
-        _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
+        _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
         char **i;
         int r;
 
@@ -2499,7 +3045,7 @@ static int setup_ipvlan(pid_t pid) {
         if (strv_isempty(arg_network_ipvlan))
                 return 0;
 
-        r = sd_rtnl_open(&rtnl, 0);
+        r = sd_netlink_open(&rtnl);
         if (r < 0)
                 return log_error_errno(r, "Failed to connect to netlink: %m");
 
@@ -2510,7 +3056,7 @@ static int setup_ipvlan(pid_t pid) {
         }
 
         STRV_FOREACH(i, arg_network_ipvlan) {
-                _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
+                _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
                 _cleanup_free_ char *n = NULL;
                 int ifi;
 
@@ -2522,7 +3068,7 @@ static int setup_ipvlan(pid_t pid) {
                 if (r < 0)
                         return log_error_errno(r, "Failed to allocate netlink message: %m");
 
-                r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
+                r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
                 if (r < 0)
                         return log_error_errno(r, "Failed to add netlink interface index: %m");
 
@@ -2532,35 +3078,35 @@ static int setup_ipvlan(pid_t pid) {
 
                 strshorten(n, IFNAMSIZ-1);
 
-                r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
+                r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
                 if (r < 0)
                         return log_error_errno(r, "Failed to add netlink interface name: %m");
 
-                r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
+                r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
                 if (r < 0)
                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
 
-                r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
+                r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
                 if (r < 0)
                         return log_error_errno(r, "Failed to open netlink container: %m");
 
-                r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
+                r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
                 if (r < 0)
                         return log_error_errno(r, "Failed to open netlink container: %m");
 
-                r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
+                r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
                 if (r < 0)
                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
 
-                r = sd_rtnl_message_close_container(m);
+                r = sd_netlink_message_close_container(m);
                 if (r < 0)
                         return log_error_errno(r, "Failed to close netlink container: %m");
 
-                r = sd_rtnl_message_close_container(m);
+                r = sd_netlink_message_close_container(m);
                 if (r < 0)
                         return log_error_errno(r, "Failed to close netlink container: %m");
 
-                r = sd_rtnl_call(rtnl, m, 0, NULL);
+                r = sd_netlink_call(rtnl, m, 0, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
         }
@@ -2575,15 +3121,16 @@ static int setup_seccomp(void) {
                 uint64_t capability;
                 int syscall_num;
         } blacklist[] = {
-                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)},
-                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)},
-                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)},
-                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)},
-                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)},
-                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at)},
-                { CAP_SYS_MODULE, SCMP_SYS(init_module)},
-                { CAP_SYS_MODULE, SCMP_SYS(finit_module)},
-                { CAP_SYS_MODULE, SCMP_SYS(delete_module)},
+                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
+                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
+                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
+                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
+                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
+                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
+                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
+                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
+                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
+                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
         };
 
         scmp_filter_ctx seccomp;
@@ -2643,8 +3190,15 @@ static int setup_seccomp(void) {
         }
 
         r = seccomp_load(seccomp);
-        if (r < 0)
+        if (r == -EINVAL) {
+                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
+                r = 0;
+                goto finish;
+        }
+        if (r < 0) {
                 log_error_errno(r, "Failed to install seccomp audit filter: %m");
+                goto finish;
+        }
 
 finish:
         seccomp_release(seccomp);
@@ -2663,10 +3217,16 @@ static int setup_propagate(const char *root) {
         p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
         (void) mkdir_p(p, 0600);
 
-        q = strjoina(root, "/run/systemd/nspawn/incoming");
-        mkdir_parents(q, 0755);
-        mkdir_p(q, 0600);
+        if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
+                return log_error_errno(errno, "Failed to create /run/systemd: %m");
 
+        if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
+                return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
+
+        if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
+                return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
+
+        q = prefix_roota(root, "/run/systemd/nspawn/incoming");
         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
                 return log_error_errno(errno, "Failed to install propagation bind mount.");
 
@@ -3291,7 +3851,8 @@ static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
                 if (nullfd > 2)
                         safe_close(nullfd);
 
-                reset_all_signal_handlers();
+                (void) reset_all_signal_handlers();
+                (void) reset_signal_mask();
                 close_all_fds(NULL, 0);
 
                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
@@ -3325,14 +3886,9 @@ static int change_uid_gid(char **_home) {
         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
                 /* Reset everything fully to 0, just in case */
 
-                if (setgroups(0, NULL) < 0)
-                        return log_error_errno(errno, "setgroups() failed: %m");
-
-                if (setresgid(0, 0, 0) < 0)
-                        return log_error_errno(errno, "setregid() failed: %m");
-
-                if (setresuid(0, 0, 0) < 0)
-                        return log_error_errno(errno, "setreuid() failed: %m");
+                r = reset_uid_gid();
+                if (r < 0)
+                        return log_error_errno(r, "Failed to become root: %m");
 
                 *_home = NULL;
                 return 0;
@@ -3476,9 +4032,9 @@ static int change_uid_gid(char **_home) {
         if (r < 0 && r != -EEXIST)
                 return log_error_errno(r, "Failed to make home directory: %m");
 
-        fchown(STDIN_FILENO, uid, gid);
-        fchown(STDOUT_FILENO, uid, gid);
-        fchown(STDERR_FILENO, uid, gid);
+        (void) fchown(STDIN_FILENO, uid, gid);
+        (void) fchown(STDOUT_FILENO, uid, gid);
+        (void) fchown(STDERR_FILENO, uid, gid);
 
         if (setgroups(n_uids, uids) < 0)
                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
@@ -3583,9 +4139,20 @@ static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo
 static int determine_names(void) {
         int r;
 
-        if (!arg_image && !arg_directory) {
-                if (arg_machine) {
-                        _cleanup_(image_unrefp) Image *i = NULL;
+        if (arg_template && !arg_directory && arg_machine) {
+
+                /* If --template= was specified then we should not
+                 * search for a machine, but instead create a new one
+                 * in /var/lib/machines. */
+
+                arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
+                if (!arg_directory)
+                        return log_oom();
+        }
+
+        if (!arg_image && !arg_directory) {
+                if (arg_machine) {
+                        _cleanup_(image_unrefp) Image *i = NULL;
 
                         r = image_find(arg_machine, &i);
                         if (r < 0)
@@ -3602,7 +4169,8 @@ static int determine_names(void) {
                         if (r < 0)
                                 return log_error_errno(r, "Invalid image directory: %m");
 
-                        arg_read_only = arg_read_only || i->read_only;
+                        if (!arg_ephemeral)
+                                arg_read_only = arg_read_only || i->read_only;
                 } else
                         arg_directory = get_current_dir_name();
 
@@ -3621,7 +4189,7 @@ static int determine_names(void) {
                 if (!arg_machine)
                         return log_oom();
 
-                hostname_cleanup(arg_machine, false);
+                hostname_cleanup(arg_machine);
                 if (!machine_name_is_valid(arg_machine)) {
                         log_error("Failed to determine machine name automatically, please use -M.");
                         return -EINVAL;
@@ -3646,23 +4214,25 @@ static int determine_names(void) {
         return 0;
 }
 
-static int determine_uid_shift(void) {
+static int determine_uid_shift(const char *directory) {
         int r;
 
-        if (!arg_userns)
+        if (!arg_userns) {
+                arg_uid_shift = 0;
                 return 0;
+        }
 
         if (arg_uid_shift == UID_INVALID) {
                 struct stat st;
 
-                r = stat(arg_directory, &st);
+                r = stat(directory, &st);
                 if (r < 0)
-                        return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
+                        return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
 
                 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
 
                 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
-                        log_error("UID and GID base of %s don't match.", arg_directory);
+                        log_error("UID and GID base of %s don't match.", directory);
                         return -EINVAL;
                 }
 
@@ -3678,6 +4248,531 @@ static int determine_uid_shift(void) {
         return 0;
 }
 
+/* Body of the inner child process, i.e. the process outer_child()
+ * clones into the requested namespaces. It synchronizes with the
+ * parent through the barrier, completes the set-up that has to happen
+ * from inside the container and finally executes the payload: an init
+ * system if arg_boot is set, otherwise the command line passed in
+ * argv[optind..], otherwise a shell.
+ *
+ *   barrier      synchronization object shared with the parent
+ *   directory    container root; only checked by assert() here, the
+ *                helpers below are invoked with a NULL/empty prefix
+ *   secondary    if true and no explicit personality was configured,
+ *                PER_LINUX32 is selected
+ *   kmsg_socket  passed to setup_kmsg(), closed afterwards
+ *   rtnl_socket  passed to send_rtnl(), closed afterwards
+ *   fds          file descriptors to keep open for the payload
+ *   argc, argv   command line of nspawn itself
+ *
+ * Returns a negative errno-style error on failure. It does not return
+ * if one of the exec calls succeeds. */
+static int inner_child(
+                Barrier *barrier,
+                const char *directory,
+                bool secondary,
+                int kmsg_socket,
+                int rtnl_socket,
+                FDSet *fds,
+                int argc,
+                char *argv[]) {
+
+        _cleanup_free_ char *home = NULL;
+        /* The first two entries of envp[] are fixed, the remaining
+         * ones are filled in below as needed. */
+        unsigned n_env = 2;
+        const char *envp[] = {
+                "PATH=" DEFAULT_PATH_SPLIT_USR,
+                "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
+                NULL, /* TERM */
+                NULL, /* HOME */
+                NULL, /* USER */
+                NULL, /* LOGNAME */
+                NULL, /* container_uuid */
+                NULL, /* LISTEN_FDS */
+                NULL, /* LISTEN_PID */
+                NULL
+        };
+
+        _cleanup_strv_free_ char **env_use = NULL;
+        int r;
+
+        assert(barrier);
+        assert(directory);
+        assert(kmsg_socket >= 0);
+
+        cg_unified_flush();
+
+        if (arg_userns) {
+                /* Tell the parent, that it now can write the UID map. */
+                (void) barrier_place(barrier); /* #1 */
+
+                /* Wait until the parent wrote the UID map */
+                if (!barrier_place_and_sync(barrier)) { /* #2 */
+                        log_error("Parent died too early");
+                        return -ESRCH;
+                }
+        }
+
+        r = mount_all(NULL, true);
+        if (r < 0)
+                return r;
+
+        /* Wait until we are cgroup-ified, so that we
+         * can mount the right cgroup path writable */
+        if (!barrier_place_and_sync(barrier)) { /* #3 */
+                log_error("Parent died too early");
+                return -ESRCH;
+        }
+
+        r = mount_systemd_cgroup_writable("");
+        if (r < 0)
+                return r;
+
+        r = reset_uid_gid();
+        if (r < 0)
+                return log_error_errno(r, "Couldn't become new root: %m");
+
+        r = setup_boot_id(NULL);
+        if (r < 0)
+                return r;
+
+        r = setup_kmsg(NULL, kmsg_socket);
+        if (r < 0)
+                return r;
+        kmsg_socket = safe_close(kmsg_socket);
+
+        umask(0022);
+
+        if (setsid() < 0)
+                return log_error_errno(errno, "setsid() failed: %m");
+
+        if (arg_private_network)
+                loopback_setup();
+
+        r = send_rtnl(rtnl_socket);
+        if (r < 0)
+                return r;
+        rtnl_socket = safe_close(rtnl_socket);
+
+        if (drop_capabilities() < 0)
+                return log_error_errno(errno, "drop_capabilities() failed: %m");
+
+        setup_hostname();
+
+        /* An explicitly configured personality takes precedence over
+         * the one implied by "secondary". */
+        if (arg_personality != PERSONALITY_INVALID) {
+                if (personality(arg_personality) < 0)
+                        return log_error_errno(errno, "personality() failed: %m");
+        } else if (secondary) {
+                if (personality(PER_LINUX32) < 0)
+                        return log_error_errno(errno, "personality() failed: %m");
+        }
+
+#ifdef HAVE_SELINUX
+        if (arg_selinux_context)
+                if (setexeccon((security_context_t) arg_selinux_context) < 0)
+                        return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
+#endif
+
+        r = change_uid_gid(&home);
+        if (r < 0)
+                return r;
+
+        /* TERM is passed through from our own environment, if set */
+        envp[n_env] = strv_find_prefix(environ, "TERM=");
+        if (envp[n_env])
+                n_env ++;
+
+        if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
+            (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
+            (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
+                return log_oom();
+
+        if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
+                char as_uuid[37];
+
+                if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
+                        return log_oom();
+        }
+
+        if (fdset_size(fds) > 0) {
+                r = fdset_cloexec(fds, false);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors: %m");
+
+                if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
+                    (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
+                        return log_oom();
+        }
+
+        env_use = strv_env_merge(2, envp, arg_setenv);
+        if (!env_use)
+                return log_oom();
+
+        /* Let the parent know that we are ready and
+         * wait until the parent is ready with the
+         * setup, too... */
+        if (!barrier_place_and_sync(barrier)) { /* #4 */
+                log_error("Parent died too early");
+                return -ESRCH;
+        }
+
+        /* Now, explicitly close the log, so that we
+         * then can close all remaining fds. Closing
+         * the log explicitly first has the benefit
+         * that the logging subsystem knows about it,
+         * and is thus ready to be reopened should we
+         * need it again. Note that the other fds
+         * closed here are at least the locking and
+         * barrier fds. */
+        log_close();
+        (void) fdset_close_others(fds);
+
+        if (arg_boot) {
+                char **a;
+                size_t m;
+
+                /* Automatically search for the init system */
+
+                /* argv[optind..argc] including the trailing NULL is
+                 * copied behind a[0], which is then set to each
+                 * candidate init binary in turn. */
+                m = 1 + argc - optind;
+                a = newa(char*, m + 1);
+                memcpy(a + 1, argv + optind, m * sizeof(char*));
+
+                a[0] = (char*) "/usr/lib/systemd/systemd";
+                execve(a[0], a, env_use);
+
+                a[0] = (char*) "/lib/systemd/systemd";
+                execve(a[0], a, env_use);
+
+                a[0] = (char*) "/sbin/init";
+                execve(a[0], a, env_use);
+        } else if (argc > optind)
+                execvpe(argv[optind], argv + optind, env_use);
+        else {
+                /* A failure to change the directory is ignored, the
+                 * shell is started regardless. */
+                (void) chdir(home ? home : "/root");
+                execle("/bin/bash", "-bash", NULL, env_use);
+                execle("/bin/sh", "-sh", NULL, env_use);
+        }
+
+        /* Only reached if every exec attempt above failed */
+        (void) log_open();
+        return log_error_errno(errno, "execv() failed: %m");
+}
+
+/* Body of the outer child process. It prepares the container's file
+ * system tree below "directory" (device mounts, bind mounts, /dev,
+ * propagation directory, console, seccomp filter, timezone,
+ * resolv.conf, journal, custom mounts, cgroups), moves the root
+ * directory there and then clones the inner child into the requested
+ * namespaces. The PID of the inner child is sent over pid_socket and,
+ * if user namespacing is enabled, the UID shift is sent over
+ * uid_shift_socket.
+ *
+ *   barrier           handed on to inner_child()
+ *   directory         root directory of the container
+ *   console           path of the terminal to use as console
+ *   *_device, *_rw    passed on to mount_devices()
+ *   interactive       if true, stdin/stdout/stderr are replaced by
+ *                     the console
+ *   secondary         handed on to inner_child()
+ *   pid_socket        socket the inner child's PID is written to
+ *   kmsg_socket       handed on to inner_child()
+ *   rtnl_socket       handed on to inner_child()
+ *   uid_shift_socket  socket arg_uid_shift is written to
+ *   fds, argc, argv   handed on to inner_child()
+ *
+ * Returns 0 once the PID was sent successfully, or a negative
+ * errno-style error. In the cloned inner child this function never
+ * returns, the process calls _exit() instead. */
+static int outer_child(
+                Barrier *barrier,
+                const char *directory,
+                const char *console,
+                const char *root_device, bool root_device_rw,
+                const char *home_device, bool home_device_rw,
+                const char *srv_device, bool srv_device_rw,
+                bool interactive,
+                bool secondary,
+                int pid_socket,
+                int kmsg_socket,
+                int rtnl_socket,
+                int uid_shift_socket,
+                FDSet *fds,
+                int argc,
+                char *argv[]) {
+
+        pid_t pid;
+        ssize_t l;
+        int r;
+
+        assert(barrier);
+        assert(directory);
+        assert(console);
+        assert(pid_socket >= 0);
+        assert(kmsg_socket >= 0);
+
+        cg_unified_flush();
+
+        if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
+                return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
+
+        if (interactive) {
+                close_nointr(STDIN_FILENO);
+                close_nointr(STDOUT_FILENO);
+                close_nointr(STDERR_FILENO);
+
+                /* With fds 0-2 closed the terminal is expected to be
+                 * opened as fd 0; anything else is an error. */
+                r = open_terminal(console, O_RDWR);
+                if (r != STDIN_FILENO) {
+                        if (r >= 0) {
+                                safe_close(r);
+                                r = -EINVAL;
+                        }
+
+                        return log_error_errno(r, "Failed to open console: %m");
+                }
+
+                if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
+                    dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
+                        return log_error_errno(errno, "Failed to duplicate console: %m");
+        }
+
+        r = reset_audit_loginuid();
+        if (r < 0)
+                return r;
+
+        /* Mark everything as slave, so that we still
+         * receive mounts from the real root, but don't
+         * propagate mounts to the real root. */
+        if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
+                return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
+
+        r = mount_devices(directory,
+                          root_device, root_device_rw,
+                          home_device, home_device_rw,
+                          srv_device, srv_device_rw);
+        if (r < 0)
+                return r;
+
+        r = determine_uid_shift(directory);
+        if (r < 0)
+                return r;
+
+        if (arg_userns) {
+                l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
+                if (l < 0)
+                        return log_error_errno(errno, "Failed to send UID shift: %m");
+                if (l != sizeof(arg_uid_shift)) {
+                        log_error("Short write while sending UID shift.");
+                        return -EIO;
+                }
+        }
+
+        /* Turn directory into bind mount */
+        if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
+                return log_error_errno(errno, "Failed to make bind mount: %m");
+
+        r = setup_volatile(directory);
+        if (r < 0)
+                return r;
+
+        r = setup_volatile_state(directory);
+        if (r < 0)
+                return r;
+
+        r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
+        if (r < 0)
+                return r;
+
+        if (arg_read_only) {
+                r = bind_remount_recursive(directory, true);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to make tree read-only: %m");
+        }
+
+        r = mount_all(directory, false);
+        if (r < 0)
+                return r;
+
+        /* Store the result in r, so that a failure is actually
+         * propagated instead of the result of the previous call. */
+        r = copy_devnodes(directory);
+        if (r < 0)
+                return r;
+
+        dev_setup(directory, arg_uid_shift, arg_uid_shift);
+
+        r = setup_pts(directory);
+        if (r < 0)
+                return r;
+
+        r = setup_propagate(directory);
+        if (r < 0)
+                return r;
+
+        r = setup_dev_console(directory, console);
+        if (r < 0)
+                return r;
+
+        r = setup_seccomp();
+        if (r < 0)
+                return r;
+
+        r = setup_timezone(directory);
+        if (r < 0)
+                return r;
+
+        r = setup_resolv_conf(directory);
+        if (r < 0)
+                return r;
+
+        r = setup_journal(directory);
+        if (r < 0)
+                return r;
+
+        r = mount_custom(directory);
+        if (r < 0)
+                return r;
+
+        r = mount_cgroups(directory);
+        if (r < 0)
+                return r;
+
+        r = mount_move_root(directory);
+        if (r < 0)
+                return log_error_errno(r, "Failed to move root directory: %m");
+
+        pid = raw_clone(SIGCHLD|CLONE_NEWNS|
+                        (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
+                        (arg_private_network ? CLONE_NEWNET : 0) |
+                        (arg_userns ? CLONE_NEWUSER : 0),
+                        NULL);
+        if (pid < 0)
+                return log_error_errno(errno, "Failed to fork inner child: %m");
+        if (pid == 0) {
+                pid_socket = safe_close(pid_socket);
+                uid_shift_socket = safe_close(uid_shift_socket);
+
+                /* The inner child has all namespaces that are
+                 * requested, so that we all are owned by the user if
+                 * user namespaces are turned on. */
+
+                r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
+                if (r < 0)
+                        _exit(EXIT_FAILURE);
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        /* Report the PID of the inner child */
+        l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
+        if (l < 0)
+                return log_error_errno(errno, "Failed to send PID: %m");
+        if (l != sizeof(pid)) {
+                log_error("Short write while sending PID.");
+                return -EIO;
+        }
+
+        pid_socket = safe_close(pid_socket);
+
+        return 0;
+}
+
+/* Writes the UID and the GID map of the process "pid". Both files in
+ * /proc/<pid>/ receive the same single line, built from 0,
+ * arg_uid_shift and arg_uid_range. Returns 0 on success, or a negative
+ * errno-style error if one of the writes fails. */
+static int setup_uid_map(pid_t pid) {
+        char map_path[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
+        char mapping[DECIMAL_STR_MAX(uid_t)*3+3+1];
+        int r;
+
+        assert(pid > 1);
+
+        /* The mapping line is shared by both maps, format it once */
+        xsprintf(mapping, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
+
+        xsprintf(map_path, "/proc/" PID_FMT "/uid_map", pid);
+        r = write_string_file(map_path, mapping, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write UID map: %m");
+
+        /* The GID range is always identical to the UID range */
+        xsprintf(map_path, "/proc/" PID_FMT "/gid_map", pid);
+        r = write_string_file(map_path, mapping, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write GID map: %m");
+
+        return 0;
+}
+
+/* Changes the owner and group of the cgroup directory of the process
+ * "pid", and of a fixed list of attribute files in it, to
+ * arg_uid_shift. A failing chown of an individual entry is only
+ * logged (at debug level if the entry does not exist, as a warning
+ * otherwise). Returns 0 on success, or a negative errno-style error
+ * if the cgroup path cannot be determined or the directory cannot be
+ * opened. */
+static int chown_cgroup(pid_t pid) {
+        static const char *const entries[] = {
+                ".",
+                "tasks",
+                "notify_on_release",
+                "cgroup.procs",
+                "cgroup.clone_children",
+                "cgroup.controllers",
+                "cgroup.subtree_control",
+                "cgroup.populated",
+        };
+
+        _cleanup_free_ char *cgroup = NULL, *fs_path = NULL;
+        _cleanup_close_ int dir_fd = -1;
+        size_t k;
+        int r;
+
+        r = cg_pid_get_path(NULL, pid, &cgroup);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get container cgroup path: %m");
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &fs_path);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
+
+        dir_fd = open(fs_path, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
+        if (dir_fd < 0)
+                return log_error_errno(errno, "Failed to open %s: %m", fs_path);
+
+        /* All entries are resolved relative to the cgroup directory */
+        for (k = 0; k < sizeof(entries) / sizeof(entries[0]); k++) {
+                if (fchownat(dir_fd, entries[k], arg_uid_shift, arg_uid_shift, 0) >= 0)
+                        continue;
+
+                log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
+                               "Failed to chown() cgroup file %s, ignoring: %m", entries[k]);
+        }
+
+        return 0;
+}
+
+/* Makes sure the process "pid" is in the same cgroup path in the
+ * hierarchy type the container is going to use, in case that type
+ * differs from the one the host uses. Nothing is done if host and
+ * container agree. Otherwise the other hierarchy is temporarily
+ * mounted on a directory below /tmp, the cgroup is created there and
+ * the PID is written to its cgroup.procs file.
+ *
+ * Returns 0 on success (or if nothing had to be done), or a negative
+ * errno-style error. The temporary mount and directory are removed
+ * again in either case. */
+static int sync_cgroup(pid_t pid) {
+        _cleanup_free_ char *cgroup = NULL;
+        char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
+        bool undo_mount = false;
+        const char *fn;
+        int unified, r;
+
+        unified = cg_unified();
+        if (unified < 0)
+                return log_error_errno(unified, "Failed to determine whether the unified hierarchy is used: %m");
+
+        if ((unified > 0) == arg_unified_cgroup_hierarchy)
+                return 0;
+
+        /* When the host uses the legacy cgroup setup, but the
+         * container shall use the unified hierarchy, let's make sure
+         * we copy the path from the name=systemd hierarchy into the
+         * unified hierarchy. Similar for the reverse situation. */
+
+        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid);
+
+        /* In order to access the hierarchy the host does not use we
+         * need to mount it */
+        if (!mkdtemp(tree))
+                return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
+
+        /* Host is unified: mount the legacy name=systemd hierarchy.
+         * Host is legacy: mount the unified hierarchy. */
+        if (unified)
+                r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
+        else
+                r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior");
+        if (r < 0) {
+                r = log_error_errno(errno, "Failed to mount unified hierarchy: %m");
+                goto finish;
+        }
+
+        undo_mount = true;
+
+        /* Creating the parents of cgroup.procs creates the cgroup */
+        fn = strjoina(tree, cgroup, "/cgroup.procs");
+        (void) mkdir_parents(fn, 0755);
+
+        xsprintf(pid_string, PID_FMT, pid);
+        r = write_string_file(fn, pid_string, 0);
+        if (r < 0)
+                log_error_errno(r, "Failed to move process: %m");
+
+finish:
+        if (undo_mount)
+                (void) umount(tree);
+
+        (void) rmdir(tree);
+        return r;
+}
+
+/* Splits our own cgroup into a "payload" subgroup, which the process
+ * "pid" is moved to, and a "supervisor" subgroup, which we move
+ * ourselves to. This is only done if arg_keep_unit is set and both
+ * the container and the host use the unified hierarchy; in all other
+ * cases the function returns 0 without doing anything.
+ *
+ * Returns 0 on success, or a negative errno-style error. */
+static int create_subcgroup(pid_t pid) {
+        _cleanup_free_ char *cgroup = NULL;
+        const char *child;
+        int unified, r;
+
+        /* In the unified hierarchy inner nodes may only contain
+         * subgroups, but not processes. Hence, if we are running in
+         * the unified hierarchy and the container does the same, and
+         * we did not create a scope unit for the container, move us
+         * and the container into two separate subcgroups. */
+
+        if (!arg_keep_unit)
+                return 0;
+
+        if (!arg_unified_cgroup_hierarchy)
+                return 0;
+
+        unified = cg_unified();
+        if (unified < 0)
+                return log_error_errno(unified, "Failed to determine whether the unified hierarchy is used: %m");
+        if (unified == 0)
+                return 0;
+
+        /* PID 0 refers to ourselves here */
+        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get our control group: %m");
+
+        child = strjoina(cgroup, "/payload");
+        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
+
+        child = strjoina(cgroup, "/supervisor");
+        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
+
+        return 0;
+}
+
 int main(int argc, char *argv[]) {
 
         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
@@ -3687,7 +4782,7 @@ int main(int argc, char *argv[]) {
         int r, n_fd_passed, loop_nr = -1;
         char veth_name[IFNAMSIZ];
         bool secondary = false, remove_subvol = false;
-        sigset_t mask, mask_chld;
+        sigset_t mask_chld;
         pid_t pid = 0;
         int ret = EXIT_SUCCESS;
         union in_addr_union exposed = {};
@@ -3711,7 +4806,6 @@ int main(int argc, char *argv[]) {
                 goto finish;
         }
 
-        log_close();
         n_fd_passed = sd_listen_fds(false);
         if (n_fd_passed > 0) {
                 r = fdset_new_listen_fds(&fds, false);
@@ -3720,8 +4814,6 @@ int main(int argc, char *argv[]) {
                         goto finish;
                 }
         }
-        fdset_close_others(fds);
-        log_open();
 
         if (arg_directory) {
                 assert(!arg_image);
@@ -3741,15 +4833,15 @@ int main(int argc, char *argv[]) {
                          * the specified is not a mount point we
                          * create the new snapshot in the parent
                          * directory, just next to it. */
-                        r = path_is_mount_point(arg_directory, false);
+                        r = path_is_mount_point(arg_directory, 0);
                         if (r < 0) {
                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
                                 goto finish;
                         }
                         if (r > 0)
-                                r = tempfn_random_child(arg_directory, &np);
+                                r = tempfn_random_child(arg_directory, "machine.", &np);
                         else
-                                r = tempfn_random(arg_directory, &np);
+                                r = tempfn_random(arg_directory, "machine.", &np);
                         if (r < 0) {
                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
                                 goto finish;
@@ -3761,7 +4853,7 @@ int main(int argc, char *argv[]) {
                                 goto finish;
                         }
 
-                        r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
+                        r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
                         if (r < 0) {
                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
                                 goto finish;
@@ -3785,7 +4877,7 @@ int main(int argc, char *argv[]) {
                         }
 
                         if (arg_template) {
-                                r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
+                                r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
                                 if (r == -EEXIST) {
                                         if (!arg_quiet)
                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
@@ -3860,11 +4952,13 @@ int main(int argc, char *argv[]) {
                         goto finish;
         }
 
-        r = determine_uid_shift();
+        r = custom_mounts_prepare();
         if (r < 0)
                 goto finish;
 
-        interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
+        interactive =
+                isatty(STDIN_FILENO) > 0 &&
+                isatty(STDOUT_FILENO) > 0;
 
         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
         if (master < 0) {
@@ -3887,21 +4981,31 @@ int main(int argc, char *argv[]) {
                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
                          arg_machine, arg_image ?: arg_directory);
 
-        assert_se(sigemptyset(&mask) == 0);
-        sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
-        assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
+        assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
 
         assert_se(sigemptyset(&mask_chld) == 0);
         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
 
+        if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
+                r = log_error_errno(errno, "Failed to become subreaper: %m");
+                goto finish;
+        }
+
         for (;;) {
-                _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
+                _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
+                                         uid_shift_socket_pair[2] = { -1, -1 };
                 ContainerStatus container_status;
                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
-                struct sigaction sa = {
+                static const struct sigaction sa = {
                         .sa_handler = nop_handler,
                         .sa_flags = SA_NOCLDSTOP,
                 };
+                int ifi = 0;
+                ssize_t l;
+                _cleanup_event_unref_ sd_event *event = NULL;
+                _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
+                _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
+                char last_char = 0;
 
                 r = barrier_create(&barrier);
                 if (r < 0) {
@@ -3919,6 +5023,17 @@ int main(int argc, char *argv[]) {
                         goto finish;
                 }
 
+                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
+                        r = log_error_errno(errno, "Failed to create pid socket pair: %m");
+                        goto finish;
+                }
+
+                if (arg_userns)
+                        if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
+                                r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
+                                goto finish;
+                        }
+
                 /* Child can be killed before execv(), so handle SIGCHLD
                  * in order to interrupt parent's blocking calls and
                  * give it a chance to call wait() and terminate. */
@@ -3934,9 +5049,7 @@ int main(int argc, char *argv[]) {
                         goto finish;
                 }
 
-                pid = raw_clone(SIGCHLD|CLONE_NEWNS|
-                                (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
-                                (arg_private_network ? CLONE_NEWNET : 0), NULL);
+                pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
                 if (pid < 0) {
                         if (errno == EINVAL)
                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
@@ -3947,466 +5060,213 @@ int main(int argc, char *argv[]) {
                 }
 
                 if (pid == 0) {
-                        /* child */
-                        _cleanup_free_ char *home = NULL;
-                        unsigned n_env = 2;
-                        const char *envp[] = {
-                                "PATH=" DEFAULT_PATH_SPLIT_USR,
-                                "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
-                                NULL, /* TERM */
-                                NULL, /* HOME */
-                                NULL, /* USER */
-                                NULL, /* LOGNAME */
-                                NULL, /* container_uuid */
-                                NULL, /* LISTEN_FDS */
-                                NULL, /* LISTEN_PID */
-                                NULL
-                        };
-                        char **env_use;
-
+                        /* The outer child only has a file system namespace. */
                         barrier_set_role(&barrier, BARRIER_CHILD);
 
-                        envp[n_env] = strv_find_prefix(environ, "TERM=");
-                        if (envp[n_env])
-                                n_env ++;
-
                         master = safe_close(master);
 
                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
-
-                        reset_all_signal_handlers();
-                        reset_signal_mask();
-
-                        if (interactive) {
-                                close_nointr(STDIN_FILENO);
-                                close_nointr(STDOUT_FILENO);
-                                close_nointr(STDERR_FILENO);
-
-                                r = open_terminal(console, O_RDWR);
-                                if (r != STDIN_FILENO) {
-                                        if (r >= 0) {
-                                                safe_close(r);
-                                                r = -EINVAL;
-                                        }
-
-                                        log_error_errno(r, "Failed to open console: %m");
-                                        _exit(EXIT_FAILURE);
-                                }
-
-                                if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
-                                    dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
-                                        log_error_errno(errno, "Failed to duplicate console: %m");
-                                        _exit(EXIT_FAILURE);
-                                }
-                        }
-
-                        if (setsid() < 0) {
-                                log_error_errno(errno, "setsid() failed: %m");
-                                _exit(EXIT_FAILURE);
-                        }
-
-                        if (reset_audit_loginuid() < 0)
-                                _exit(EXIT_FAILURE);
-
-                        if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
-                                log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
-                                _exit(EXIT_FAILURE);
-                        }
-
-                        if (arg_private_network)
-                                loopback_setup();
-
-                        /* Mark everything as slave, so that we still
-                         * receive mounts from the real root, but don't
-                         * propagate mounts to the real root. */
-                        if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
-                                log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
-                                _exit(EXIT_FAILURE);
-                        }
-
-                        if (mount_devices(arg_directory,
-                                          root_device, root_device_rw,
-                                          home_device, home_device_rw,
-                                          srv_device, srv_device_rw) < 0)
-                                _exit(EXIT_FAILURE);
-
-                        /* Turn directory into bind mount */
-                        if (mount(arg_directory, arg_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
-                                log_error_errno(errno, "Failed to make bind mount: %m");
-                                _exit(EXIT_FAILURE);
-                        }
-
-                        r = setup_volatile(arg_directory);
-                        if (r < 0)
-                                _exit(EXIT_FAILURE);
-
-                        if (setup_volatile_state(arg_directory) < 0)
-                                _exit(EXIT_FAILURE);
-
-                        r = base_filesystem_create(arg_directory);
-                        if (r < 0)
-                                _exit(EXIT_FAILURE);
-
-                        if (arg_read_only) {
-                                r = bind_remount_recursive(arg_directory, true);
-                                if (r < 0) {
-                                        log_error_errno(r, "Failed to make tree read-only: %m");
-                                        _exit(EXIT_FAILURE);
-                                }
-                        }
-
-                        if (mount_all(arg_directory) < 0)
-                                _exit(EXIT_FAILURE);
-
-                        if (copy_devnodes(arg_directory) < 0)
-                                _exit(EXIT_FAILURE);
-
-                        if (setup_ptmx(arg_directory) < 0)
-                                _exit(EXIT_FAILURE);
-
-                        dev_setup(arg_directory);
-
-                        if (setup_propagate(arg_directory) < 0)
-                                _exit(EXIT_FAILURE);
-
-                        if (setup_seccomp() < 0)
-                                _exit(EXIT_FAILURE);
-
-                        if (setup_dev_console(arg_directory, console) < 0)
-                                _exit(EXIT_FAILURE);
-
-                        if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
-                                _exit(EXIT_FAILURE);
-                        kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
-
-                        if (send_rtnl(rtnl_socket_pair[1]) < 0)
-                                _exit(EXIT_FAILURE);
-                        rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
-
-                        /* Tell the parent that we are ready, and that
-                         * it can cgroupify us to that we lack access
-                         * to certain devices and resources. */
-                        (void) barrier_place(&barrier); /* #1 */
-
-                        if (setup_boot_id(arg_directory) < 0)
-                                _exit(EXIT_FAILURE);
-
-                        if (setup_timezone(arg_directory) < 0)
-                                _exit(EXIT_FAILURE);
-
-                        if (setup_resolv_conf(arg_directory) < 0)
-                                _exit(EXIT_FAILURE);
-
-                        if (setup_journal(arg_directory) < 0)
-                                _exit(EXIT_FAILURE);
-
-                        if (mount_binds(arg_directory, arg_bind, false) < 0)
-                                _exit(EXIT_FAILURE);
-
-                        if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
-                                _exit(EXIT_FAILURE);
-
-                        if (mount_tmpfs(arg_directory) < 0)
-                                _exit(EXIT_FAILURE);
-
-                        /* Wait until we are cgroup-ified, so that we
-                         * can mount the right cgroup path writable */
-                        (void) barrier_place_and_sync(&barrier); /* #2 */
-
-                        if (mount_cgroup(arg_directory) < 0)
-                                _exit(EXIT_FAILURE);
-
-                        if (chdir(arg_directory) < 0) {
-                                log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
-                                _exit(EXIT_FAILURE);
-                        }
-
-                        if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
-                                log_error_errno(errno, "mount(MS_MOVE) failed: %m");
-                                _exit(EXIT_FAILURE);
-                        }
-
-                        if (chroot(".") < 0) {
-                                log_error_errno(errno, "chroot() failed: %m");
-                                _exit(EXIT_FAILURE);
-                        }
-
-                        if (chdir("/") < 0) {
-                                log_error_errno(errno, "chdir() failed: %m");
-                                _exit(EXIT_FAILURE);
-                        }
-
-                        if (arg_userns) {
-                                if (unshare(CLONE_NEWUSER) < 0) {
-                                        log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
-                                        _exit(EXIT_FAILURE);
-                                }
-
-                                /* Tell the parent, that it now can
-                                 * write the UID map. */
-                                (void) barrier_place(&barrier); /* #3 */
-
-                                /* Wait until the parent wrote the UID
-                                 * map */
-                                (void) barrier_place_and_sync(&barrier); /* #4 */
-                        }
-
-                        umask(0022);
-
-                        if (drop_capabilities() < 0) {
-                                log_error_errno(errno, "drop_capabilities() failed: %m");
-                                _exit(EXIT_FAILURE);
-                        }
-
-                        setup_hostname();
-
-                        if (arg_personality != 0xffffffffLU) {
-                                if (personality(arg_personality) < 0) {
-                                        log_error_errno(errno, "personality() failed: %m");
-                                        _exit(EXIT_FAILURE);
-                                }
-                        } else if (secondary) {
-                                if (personality(PER_LINUX32) < 0) {
-                                        log_error_errno(errno, "personality() failed: %m");
-                                        _exit(EXIT_FAILURE);
-                                }
-                        }
-
-#ifdef HAVE_SELINUX
-                        if (arg_selinux_context)
-                                if (setexeccon((security_context_t) arg_selinux_context) < 0) {
-                                        log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
-                                        _exit(EXIT_FAILURE);
-                                }
-#endif
-
-                        r = change_uid_gid(&home);
+                        pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
+                        uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
+
+                        (void) reset_all_signal_handlers();
+                        (void) reset_signal_mask();
+
+                        r = outer_child(&barrier,
+                                        arg_directory,
+                                        console,
+                                        root_device, root_device_rw,
+                                        home_device, home_device_rw,
+                                        srv_device, srv_device_rw,
+                                        interactive,
+                                        secondary,
+                                        pid_socket_pair[1],
+                                        kmsg_socket_pair[1],
+                                        rtnl_socket_pair[1],
+                                        uid_shift_socket_pair[1],
+                                        fds,
+                                        argc, argv);
                         if (r < 0)
                                 _exit(EXIT_FAILURE);
 
-                        if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
-                            (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
-                            (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
-                                log_oom();
-                                _exit(EXIT_FAILURE);
-                        }
-
-                        if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
-                                char as_uuid[37];
-
-                                if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
-                                        log_oom();
-                                        _exit(EXIT_FAILURE);
-                                }
-                        }
-
-                        if (fdset_size(fds) > 0) {
-                                r = fdset_cloexec(fds, false);
-                                if (r < 0) {
-                                        log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
-                                        _exit(EXIT_FAILURE);
-                                }
-
-                                if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
-                                    (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
-                                        log_oom();
-                                        _exit(EXIT_FAILURE);
-                                }
-                        }
-
-                        if (!strv_isempty(arg_setenv)) {
-                                char **n;
-
-                                n = strv_env_merge(2, envp, arg_setenv);
-                                if (!n) {
-                                        log_oom();
-                                        _exit(EXIT_FAILURE);
-                                }
-
-                                env_use = n;
-                        } else
-                                env_use = (char**) envp;
-
-                        /* Let the parent know that we are ready and
-                         * wait until the parent is ready with the
-                         * setup, too... */
-                        (void) barrier_place_and_sync(&barrier); /* #5 */
-
-                        if (arg_boot) {
-                                char **a;
-                                size_t l;
-
-                                /* Automatically search for the init system */
-
-                                l = 1 + argc - optind;
-                                a = newa(char*, l + 1);
-                                memcpy(a + 1, argv + optind, l * sizeof(char*));
-
-                                a[0] = (char*) "/usr/lib/systemd/systemd";
-                                execve(a[0], a, env_use);
-
-                                a[0] = (char*) "/lib/systemd/systemd";
-                                execve(a[0], a, env_use);
-
-                                a[0] = (char*) "/sbin/init";
-                                execve(a[0], a, env_use);
-                        } else if (argc > optind)
-                                execvpe(argv[optind], argv + optind, env_use);
-                        else {
-                                chdir(home ? home : "/root");
-                                execle("/bin/bash", "-bash", NULL, env_use);
-                                execle("/bin/sh", "-sh", NULL, env_use);
-                        }
-
-                        log_error_errno(errno, "execv() failed: %m");
-                        _exit(EXIT_FAILURE);
+                        _exit(EXIT_SUCCESS);
                 }
 
                 barrier_set_role(&barrier, BARRIER_PARENT);
+
                 fdset_free(fds);
                 fds = NULL;
 
                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
+                pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
 
-                (void) barrier_place(&barrier); /* #1 */
-
-                /* Wait for the most basic Child-setup to be done,
-                 * before we add hardware to it, and place it in a
-                 * cgroup. */
-                if (barrier_sync(&barrier)) { /* #1 */
-                        int ifi = 0;
+                /* Wait for the outer child. */
+                r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
+                if (r < 0)
+                        goto finish;
+                if (r != 0) {
+                        r = -EIO;
+                        goto finish;
+                }
+                pid = 0;
 
-                        r = move_network_interfaces(pid);
-                        if (r < 0)
-                                goto finish;
+                /* And now retrieve the PID of the inner child. */
+                l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
+                if (l < 0) {
+                        r = log_error_errno(errno, "Failed to read inner child PID: %m");
+                        goto finish;
+                }
+                if (l != sizeof(pid)) {
+                        log_error("Short read while reading inner child PID: %m");
+                        r = EIO;
+                        goto finish;
+                }
 
-                        r = setup_veth(pid, veth_name, &ifi);
-                        if (r < 0)
-                                goto finish;
+                log_debug("Init process invoked as PID " PID_FMT, pid);
 
-                        r = setup_bridge(veth_name, &ifi);
-                        if (r < 0)
+                if (arg_userns) {
+                        if (!barrier_place_and_sync(&barrier)) { /* #1 */
+                                log_error("Child died too early.");
+                                r = -ESRCH;
                                 goto finish;
+                        }
 
-                        r = setup_macvlan(pid);
-                        if (r < 0)
+                        l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
+                        if (l < 0) {
+                                r = log_error_errno(errno, "Failed to read UID shift: %m");
                                 goto finish;
-
-                        r = setup_ipvlan(pid);
-                        if (r < 0)
+                        }
+                        if (l != sizeof(arg_uid_shift)) {
+                                log_error("Short read while reading UID shift: %m");
+                                r = EIO;
                                 goto finish;
+                        }
 
-                        r = register_machine(pid, ifi);
+                        r = setup_uid_map(pid);
                         if (r < 0)
                                 goto finish;
 
-                        /* Notify the child that the parent is ready with all
-                         * its setup, and that the child can now hand over
-                         * control to the code to run inside the container. */
                         (void) barrier_place(&barrier); /* #2 */
+                }
 
-                        if (arg_userns) {
-                                char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
+                r = move_network_interfaces(pid);
+                if (r < 0)
+                        goto finish;
 
-                                (void) barrier_place_and_sync(&barrier); /* #3 */
+                r = setup_veth(pid, veth_name, &ifi);
+                if (r < 0)
+                        goto finish;
 
-                                xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
-                                xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
-                                r = write_string_file(uid_map, line);
-                                if (r < 0) {
-                                        log_error_errno(r, "Failed to write UID map: %m");
-                                        goto finish;
-                                }
+                r = setup_bridge(veth_name, &ifi);
+                if (r < 0)
+                        goto finish;
 
-                                /* We always assign the same UID and GID ranges */
-                                xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
-                                r = write_string_file(uid_map, line);
-                                if (r < 0) {
-                                        log_error_errno(r, "Failed to write GID map: %m");
-                                        goto finish;
-                                }
+                r = setup_macvlan(pid);
+                if (r < 0)
+                        goto finish;
 
-                                (void) barrier_place(&barrier); /* #4 */
-                        }
+                r = setup_ipvlan(pid);
+                if (r < 0)
+                        goto finish;
 
-                        /* Block SIGCHLD here, before notifying child.
-                         * process_pty() will handle it with the other signals. */
-                        r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
-                        if (r < 0)
-                                goto finish;
+                r = register_machine(pid, ifi);
+                if (r < 0)
+                        goto finish;
 
-                        /* Reset signal to default */
-                        r = default_signals(SIGCHLD, -1);
-                        if (r < 0)
-                                goto finish;
+                r = sync_cgroup(pid);
+                if (r < 0)
+                        goto finish;
 
-                        /* Let the child know that we are ready and wait that the child is completely ready now. */
-                        if (barrier_place_and_sync(&barrier)) { /* #5 */
-                                _cleanup_event_unref_ sd_event *event = NULL;
-                                _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
-                                _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
-                                char last_char = 0;
+                r = create_subcgroup(pid);
+                if (r < 0)
+                        goto finish;
 
-                                sd_notifyf(false,
-                                           "READY=1\n"
-                                           "STATUS=Container running.\n"
-                                           "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
+                r = chown_cgroup(pid);
+                if (r < 0)
+                        goto finish;
 
-                                r = sd_event_new(&event);
-                                if (r < 0) {
-                                        log_error_errno(r, "Failed to get default event source: %m");
-                                        goto finish;
-                                }
+                /* Notify the child that the parent is ready with all
+                 * its setup (including cgroup-ification), and that
+                 * the child can now hand over control to the code to
+                 * run inside the container. */
+                (void) barrier_place(&barrier); /* #3 */
 
-                                if (arg_kill_signal > 0) {
-                                        /* Try to kill the init system on SIGINT or SIGTERM */
-                                        sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
-                                        sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
-                                } else {
-                                        /* Immediately exit */
-                                        sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
-                                        sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
-                                }
+                /* Block SIGCHLD here, before notifying child.
+                 * process_pty() will handle it with the other signals. */
+                assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
 
-                                /* simply exit on sigchld */
-                                sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
+                /* Reset signal to default */
+                r = default_signals(SIGCHLD, -1);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to reset SIGCHLD: %m");
+                        goto finish;
+                }
 
-                                if (arg_expose_ports) {
-                                        r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
-                                        if (r < 0)
-                                                goto finish;
+                /* Let the child know that we are ready and wait that the child is completely ready now. */
+                if (!barrier_place_and_sync(&barrier)) { /* #5 */
+                        log_error("Client died too early.");
+                        r = -ESRCH;
+                        goto finish;
+                }
 
-                                        (void) expose_ports(rtnl, &exposed);
-                                }
+                sd_notifyf(false,
+                           "READY=1\n"
+                           "STATUS=Container running.\n"
+                           "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
 
-                                rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
+                r = sd_event_new(&event);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to get default event source: %m");
+                        goto finish;
+                }
 
-                                r = pty_forward_new(event, master, true, !interactive, &forward);
-                                if (r < 0) {
-                                        log_error_errno(r, "Failed to create PTY forwarder: %m");
-                                        goto finish;
-                                }
+                if (arg_kill_signal > 0) {
+                        /* Try to kill the init system on SIGINT or SIGTERM */
+                        sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
+                        sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
+                } else {
+                        /* Immediately exit */
+                        sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
+                        sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
+                }
 
-                                r = sd_event_loop(event);
-                                if (r < 0) {
-                                        log_error_errno(r, "Failed to run event loop: %m");
-                                        goto finish;
-                                }
+                /* simply exit on sigchld */
+                sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
 
-                                pty_forward_get_last_char(forward, &last_char);
+                if (arg_expose_ports) {
+                        r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
+                        if (r < 0)
+                                goto finish;
 
-                                forward = pty_forward_free(forward);
+                        (void) expose_ports(rtnl, &exposed);
+                }
 
-                                if (!arg_quiet && last_char != '\n')
-                                        putc('\n', stdout);
+                rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
 
-                                /* Kill if it is not dead yet anyway */
-                                terminate_machine(pid);
-                        }
+                r = pty_forward_new(event, master, true, !interactive, &forward);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to create PTY forwarder: %m");
+                        goto finish;
+                }
+
+                r = sd_event_loop(event);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to run event loop: %m");
+                        goto finish;
                 }
 
+                pty_forward_get_last_char(forward, &last_char);
+
+                forward = pty_forward_free(forward);
+
+                if (!arg_quiet && last_char != '\n')
+                        putc('\n', stdout);
+
+                /* Kill if it is not dead yet anyway */
+                terminate_machine(pid);
+
                 /* Normally redundant, but better safe than sorry */
                 kill(pid, SIGKILL);
 
@@ -4451,11 +5311,15 @@ finish:
                   "STOPPING=1\n"
                   "STATUS=Terminating...");
 
-        loop_remove(loop_nr, &image_fd);
-
         if (pid > 0)
                 kill(pid, SIGKILL);
 
+        /* Try to flush whatever is still queued in the pty */
+        if (master >= 0)
+                (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
+
+        loop_remove(loop_nr, &image_fd);
+
         if (remove_subvol && arg_directory) {
                 int k;
 
@@ -4480,9 +5344,7 @@ finish:
         strv_free(arg_network_interfaces);
         strv_free(arg_network_macvlan);
         strv_free(arg_network_ipvlan);
-        strv_free(arg_bind);
-        strv_free(arg_bind_ro);
-        strv_free(arg_tmpfs);
+        custom_mount_free_all();
 
         flush_ports(&exposed);