git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/nspawn/nspawn.c
nspawn: always take exclusive locks of ephemeral OS tree copies
[thirdparty/systemd.git] / src / nspawn / nspawn.c
index 11620c2a669b7437af842f98de5ed54e062453a0..2289a0a97fb3e78dc28b21e99ff6a3ea85a4892d 100644 (file)
@@ -188,6 +188,7 @@ static char **arg_network_veth_extra = NULL;
 static char *arg_network_bridge = NULL;
 static char *arg_network_zone = NULL;
 static char *arg_network_namespace_path = NULL;
+static PagerFlags arg_pager_flags = 0;
 static unsigned long arg_personality = PERSONALITY_INVALID;
 static char *arg_image = NULL;
 static char *arg_oci_bundle = NULL;
@@ -219,8 +220,7 @@ static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
 static bool arg_no_new_privileges = false;
 static int arg_oom_score_adjust = 0;
 static bool arg_oom_score_adjust_set = false;
-static cpu_set_t *arg_cpuset = NULL;
-static unsigned arg_cpuset_ncpus = 0;
+static CPUSet arg_cpu_set = {};
 static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
 static TimezoneMode arg_timezone = TIMEZONE_AUTO;
 static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
@@ -258,14 +258,14 @@ STATIC_DESTRUCTOR_REGISTER(arg_syscall_blacklist, strv_freep);
 #if HAVE_SECCOMP
 STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
 #endif
-STATIC_DESTRUCTOR_REGISTER(arg_cpuset, CPU_FREEp);
+STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
 STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
 
 static int help(void) {
         _cleanup_free_ char *link = NULL;
         int r;
 
-        (void) pager_open(false);
+        (void) pager_open(arg_pager_flags);
 
         r = terminal_urlify_man("systemd-nspawn", "1", &link);
         if (r < 0)
@@ -276,6 +276,7 @@ static int help(void) {
                "  -h --help                 Show this help\n"
                "     --version              Print version string\n"
                "  -q --quiet                Do not show status information\n"
+               "     --no-pager             Do not pipe output into a pager\n"
                "     --settings=BOOLEAN     Load additional settings from .nspawn file\n\n"
                "%3$sImage:%4$s\n"
                "  -D --directory=PATH       Root directory for the container\n"
@@ -283,11 +284,12 @@ static int help(void) {
                "                            if missing\n"
                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
                "                            remove it after exit\n"
-               "  -i --image=PATH           File system device or disk image for the container\n"
+               "  -i --image=PATH           Root file system disk image (or device node) for\n"
+               "                            the container\n"
                "     --oci-bundle=PATH      OCI bundle directory\n"
                "     --read-only            Mount the root directory read-only\n"
                "     --volatile[=MODE]      Run the system in volatile mode\n"
-               "     --root-hash=HASH       Specify verity root hash\n"
+               "     --root-hash=HASH       Specify verity root hash for root disk image\n"
                "     --pivot-root=PATH[:PATH]\n"
                "                            Pivot root to given directory in the container\n\n"
                "%3$sExecution:%4$s\n"
@@ -590,6 +592,7 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_CONSOLE,
                 ARG_PIPE,
                 ARG_OCI_BUNDLE,
+                ARG_NO_PAGER,
         };
 
         static const struct option options[] = {
@@ -654,6 +657,7 @@ static int parse_argv(int argc, char *argv[]) {
                 { "console",                required_argument, NULL, ARG_CONSOLE                },
                 { "pipe",                   no_argument,       NULL, ARG_PIPE                   },
                 { "oci-bundle",             required_argument, NULL, ARG_OCI_BUNDLE             },
+                { "no-pager",               no_argument,       NULL, ARG_NO_PAGER               },
                 {}
         };
 
@@ -721,7 +725,7 @@ static int parse_argv(int argc, char *argv[]) {
                 case ARG_NETWORK_ZONE: {
                         char *j;
 
-                        j = strappend("vz-", optarg);
+                        j = strjoin("vz-", optarg);
                         if (!j)
                                 return log_oom();
 
@@ -944,10 +948,8 @@ static int parse_argv(int argc, char *argv[]) {
 
                 case ARG_LINK_JOURNAL:
                         r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
-                        if (r < 0) {
-                                log_error_errno(r, "Failed to parse link journal mode %s", optarg);
-                                return -EINVAL;
-                        }
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
 
                         arg_settings_mask |= SETTING_LINK_JOURNAL;
                         break;
@@ -1238,9 +1240,8 @@ static int parse_argv(int argc, char *argv[]) {
                         if (r < 0)
                                 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
                         if (l < sizeof(sd_id128_t)) {
-                                log_error("Root hash must be at least 128bit long: %s", optarg);
                                 free(k);
-                                return -EINVAL;
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
                         }
 
                         free(arg_root_hash);
@@ -1281,7 +1282,7 @@ static int parse_argv(int argc, char *argv[]) {
 
                 case ARG_RLIMIT: {
                         const char *eq;
-                        char *name;
+                        _cleanup_free_ char *name = NULL;
                         int rl;
 
                         if (streq(optarg, "help")) {
@@ -1327,17 +1328,14 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
 
                 case ARG_CPU_AFFINITY: {
-                        _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
+                        CPUSet cpuset;
 
                         r = parse_cpu_set(optarg, &cpuset);
                         if (r < 0)
-                                return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
-
-                        if (arg_cpuset)
-                                CPU_FREE(arg_cpuset);
+                                return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
 
-                        arg_cpuset = TAKE_PTR(cpuset);
-                        arg_cpuset_ncpus = r;
+                        cpu_set_reset(&arg_cpu_set);
+                        arg_cpu_set = cpuset;
                         arg_settings_mask |= SETTING_CPU_AFFINITY;
                         break;
                 }
@@ -1384,10 +1382,8 @@ static int parse_argv(int argc, char *argv[]) {
                                      "read-only\n"
                                      "passive\n"
                                      "pipe");
-                        else {
-                                log_error("Unknown console mode: %s", optarg);
-                                return -EINVAL;
-                        }
+                        else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
 
                         arg_settings_mask |= SETTING_CONSOLE_MODE;
                         break;
@@ -1398,6 +1394,10 @@ static int parse_argv(int argc, char *argv[]) {
                         arg_settings_mask |= SETTING_CONSOLE_MODE;
                         break;
 
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
                 case '?':
                         return -EINVAL;
 
@@ -1491,7 +1491,7 @@ static int verify_arguments(void) {
 
         /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
          * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
-         * copy-up (in case of overlay) making the entire excercise pointless. */
+         * copy-up (in case of overlay) making the entire exercise pointless. */
         if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
 
@@ -1692,7 +1692,7 @@ static int setup_timezone(const char *dest) {
 
         case TIMEZONE_COPY:
                 /* If mounting failed, try to copy */
-                r = copy_file_atomic("/etc/localtime", where, 0644, 0, COPY_REFLINK|COPY_REPLACE);
+                r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
                 if (r < 0) {
                         log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
                                        "Failed to copy /etc/localtime to %s, ignoring: %m", where);
@@ -1819,7 +1819,7 @@ static int setup_resolv_conf(const char *dest) {
         }
 
         /* If that didn't work, let's copy the file */
-        r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
+        r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
         if (r < 0) {
                 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
                  * resolved or something similar runs inside and the symlink points there.
@@ -1845,10 +1845,9 @@ static int setup_boot_id(void) {
         const char *to;
         int r;
 
-        /* Generate a new randomized boot ID, so that each boot-up of
-         * the container gets a new one */
+        /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
 
-        r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
+        r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
         if (r < 0)
                 return log_error_errno(r, "Failed to generate random boot ID path: %m");
 
@@ -1896,11 +1895,11 @@ static int copy_devnodes(const char *dest) {
                 _cleanup_free_ char *from = NULL, *to = NULL;
                 struct stat st;
 
-                from = strappend("/dev/", d);
+                from = path_join("/dev/", d);
                 if (!from)
                         return log_oom();
 
-                to = prefix_root(dest, from);
+                to = path_join(dest, from);
                 if (!to)
                         return log_oom();
 
@@ -1935,7 +1934,7 @@ static int copy_devnodes(const char *dest) {
                         if (r < 0)
                                 return log_error_errno(r, "chown() of device node %s failed: %m", to);
 
-                        dn = strjoin("/dev/", S_ISCHR(st.st_mode) ? "char" : "block");
+                        dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
                         if (!dn)
                                 return log_oom();
 
@@ -1946,11 +1945,11 @@ static int copy_devnodes(const char *dest) {
                         if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
                                 return log_oom();
 
-                        prefixed = prefix_root(dest, sl);
+                        prefixed = path_join(dest, sl);
                         if (!prefixed)
                                 return log_oom();
 
-                        t = strjoin("../", d);
+                        t = path_join("..", d);
                         if (!t)
                                 return log_oom();
 
@@ -1973,7 +1972,7 @@ static int make_extra_nodes(const char *dest) {
                 _cleanup_free_ char *path = NULL;
                 DeviceNode *n = arg_extra_nodes + i;
 
-                path = prefix_root(dest, n->path);
+                path = path_join(dest, n->path);
                 if (!path)
                         return log_oom();
 
@@ -2038,32 +2037,41 @@ static int setup_pts(const char *dest) {
         return 0;
 }
 
-static int setup_dev_console(const char *dest, const char *console) {
-        _cleanup_umask_ mode_t u;
-        const char *to;
+static int setup_stdio_as_dev_console(void) {
+        int terminal;
         int r;
 
-        assert(dest);
-
-        u = umask(0000);
+        terminal = open_terminal("/dev/console", O_RDWR);
+        if (terminal < 0)
+                return log_error_errno(terminal, "Failed to open console: %m");
 
-        if (!console)
-                return 0;
+        /* Make sure we can continue logging to the original stderr, even if
+         * stderr points elsewhere now */
+        r = log_dup_console();
+        if (r < 0)
+                return log_error_errno(r, "Failed to duplicate stderr: %m");
 
-        r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
+        /* invalidates 'terminal' on success and failure */
+        r = rearrange_stdio(terminal, terminal, terminal);
         if (r < 0)
-                return log_error_errno(r, "Failed to correct access mode for TTY: %m");
+                return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
 
-        /* We need to bind mount the right tty to /dev/console since
-         * ptys can only exist on pts file systems. To have something
-         * to bind mount things on we create a empty regular file. */
+        return 0;
+}
 
-        to = prefix_roota(dest, "/dev/console");
-        r = touch(to);
+static int setup_dev_console(const char *console) {
+        _cleanup_free_ char *p = NULL;
+        int r;
+
+        /* Create /dev/console symlink */
+        r = path_make_relative("/dev", console, &p);
         if (r < 0)
-                return log_error_errno(r, "touch() for /dev/console failed: %m");
+                return log_error_errno(r, "Failed to create relative path: %m");
+
+        if (symlink(p, "/dev/console") < 0)
+                return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
 
-        return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
+        return 0;
 }
 
 static int setup_keyring(void) {
@@ -2092,19 +2100,18 @@ static int setup_kmsg(int kmsg_socket) {
         _cleanup_free_ char *fifo = NULL;
         _cleanup_close_ int fd = -1;
         _cleanup_umask_ mode_t u;
-        const char *to;
         int r;
 
         assert(kmsg_socket >= 0);
 
         u = umask(0000);
 
-        /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
+        /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
          * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
          * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
          * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
 
-        r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
+        r = tempfn_random_child("/run", "proc-kmsg", &fifo);
         if (r < 0)
                 return log_error_errno(r, "Failed to generate kmsg path: %m");
 
@@ -2112,9 +2119,8 @@ static int setup_kmsg(int kmsg_socket) {
                 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
 
         from = TAKE_PTR(fifo);
-        to = "/proc/kmsg";
 
-        r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
+        r = mount_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
         if (r < 0)
                 return r;
 
@@ -2311,7 +2317,11 @@ static int drop_capabilities(uid_t uid) {
 
                 if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
                         q.ambient = 0;
-        } else
+
+                if (capability_quintet_mangle(&q))
+                        return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
+
+        } else {
                 q = (CapabilityQuintet) {
                         .bounding = arg_caps_retain,
                         .effective = uid == 0 ? arg_caps_retain : 0,
@@ -2320,6 +2330,13 @@ static int drop_capabilities(uid_t uid) {
                         .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
                 };
 
+                /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
+                 * in order to maintain the same behavior as systemd < 242. */
+                if (capability_quintet_mangle(&q))
+                        log_warning("Some capabilities will not be set because they are not in the current bounding set.");
+
+        }
+
         return capability_quintet_enforce(&q);
 }
 
@@ -2579,7 +2596,7 @@ static int determine_names(void) {
                  * search for a machine, but instead create a new one
                  * in /var/lib/machine. */
 
-                arg_directory = strjoin("/var/lib/machines/", arg_machine);
+                arg_directory = path_join("/var/lib/machines", arg_machine);
                 if (!arg_directory)
                         return log_oom();
         }
@@ -2609,10 +2626,8 @@ static int determine_names(void) {
                                 return log_error_errno(r, "Failed to determine current directory: %m");
                 }
 
-                if (!arg_directory && !arg_image) {
-                        log_error("Failed to determine path, please use -D or -i.");
-                        return -EINVAL;
-                }
+                if (!arg_directory && !arg_image)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
         }
 
         if (!arg_machine) {
@@ -2635,10 +2650,8 @@ static int determine_names(void) {
                         return log_oom();
 
                 hostname_cleanup(arg_machine);
-                if (!machine_name_is_valid(arg_machine)) {
-                        log_error("Failed to determine machine name automatically, please use -M.");
-                        return -EINVAL;
-                }
+                if (!machine_name_is_valid(arg_machine))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
 
                 if (arg_ephemeral) {
                         char *b;
@@ -2765,10 +2778,8 @@ static int patch_sysctl(void) {
                                 break;
                 }
 
-                if (!good) {
-                        log_error("Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
-                        return -EPERM;
-                }
+                if (!good)
+                        return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
 
                 r = sysctl_write(*k, *v);
                 if (r < 0)
@@ -2784,6 +2795,7 @@ static int inner_child(
                 bool secondary,
                 int kmsg_socket,
                 int rtnl_socket,
+                int master_pty_socket,
                 FDSet *fds) {
 
         _cleanup_free_ char *home = NULL;
@@ -2917,6 +2929,29 @@ static int inner_child(
                 rtnl_socket = safe_close(rtnl_socket);
         }
 
+        if (arg_console_mode != CONSOLE_PIPE) {
+                _cleanup_close_ int master = -1;
+                _cleanup_free_ char *console = NULL;
+
+                /* Allocate a pty and make it available as /dev/console. */
+                master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
+                if (master < 0)
+                        return log_error_errno(master, "Failed to allocate a pty: %m");
+
+                r = setup_dev_console(console);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to setup /dev/console: %m");
+
+                r = send_one_fd(master_pty_socket, master, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to send master fd: %m");
+                master_pty_socket = safe_close(master_pty_socket);
+
+                r = setup_stdio_as_dev_console();
+                if (r < 0)
+                        return r;
+        }
+
         r = patch_sysctl();
         if (r < 0)
                 return r;
@@ -2927,8 +2962,8 @@ static int inner_child(
                         return log_error_errno(r, "Failed to adjust OOM score: %m");
         }
 
-        if (arg_cpuset)
-                if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
+        if (arg_cpu_set.set)
+                if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
                         return log_error_errno(errno, "Failed to set CPU affinity: %m");
 
         (void) setup_hostname();
@@ -2953,7 +2988,7 @@ static int inner_child(
                 if (is_seccomp_available()) {
 
                         r = seccomp_load(arg_seccomp);
-                        if (IN_SET(r, -EPERM, -EACCES))
+                        if (ERRNO_IS_SECCOMP_FATAL(r))
                                 return log_error_errno(r, "Failed to install seccomp filter: %m");
                         if (r < 0)
                                 log_debug_errno(r, "Failed to install seccomp filter: %m");
@@ -3138,7 +3173,6 @@ static int setup_sd_notify_child(void) {
 static int outer_child(
                 Barrier *barrier,
                 const char *directory,
-                const char *console,
                 DissectedImage *dissected_image,
                 bool secondary,
                 int pid_socket,
@@ -3147,6 +3181,7 @@ static int outer_child(
                 int kmsg_socket,
                 int rtnl_socket,
                 int uid_shift_socket,
+                int master_pty_socket,
                 int unified_cgroup_hierarchy_socket,
                 FDSet *fds,
                 int netns_fd) {
@@ -3166,6 +3201,7 @@ static int outer_child(
         assert(pid_socket >= 0);
         assert(uuid_socket >= 0);
         assert(notify_socket >= 0);
+        assert(master_pty_socket >= 0);
         assert(kmsg_socket >= 0);
 
         log_debug("Outer child is initializing.");
@@ -3173,25 +3209,6 @@ static int outer_child(
         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
                 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
 
-        if (arg_console_mode != CONSOLE_PIPE) {
-                int terminal;
-
-                assert(console);
-
-                terminal = open_terminal(console, O_RDWR);
-                if (terminal < 0)
-                        return log_error_errno(terminal, "Failed to open console: %m");
-
-                /* Make sure we can continue logging to the original stderr, even if stderr points elsewhere now */
-                r = log_dup_console();
-                if (r < 0)
-                        return log_error_errno(r, "Failed to duplicate stderr: %m");
-
-                r = rearrange_stdio(terminal, terminal, terminal); /* invalidates 'terminal' on success and failure */
-                if (r < 0)
-                        return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
-        }
-
         r = reset_audit_loginuid();
         if (r < 0)
                 return r;
@@ -3267,7 +3284,7 @@ static int outer_child(
                         arg_userns_mode != USER_NAMESPACE_NO,
                         arg_uid_shift,
                         arg_uid_range,
-                        arg_selinux_context);
+                        arg_selinux_apifs_context);
         if (r < 0)
                 return r;
 
@@ -3298,7 +3315,7 @@ static int outer_child(
 
         /* Mark everything as shared so our mounts get propagated down. This is
          * required to make new bind mounts available in systemd services
-         * inside the containter that create a new mount namespace.
+         * inside the container that create a new mount namespace.
          * See https://github.com/systemd/systemd/issues/3860
          * Further submounts (such as /dev) done after this will inherit the
          * shared propagation mode. */
@@ -3315,7 +3332,7 @@ static int outer_child(
                 return r;
 
         if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
-                r = bind_remount_recursive(directory, true, NULL);
+                r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Failed to make tree read-only: %m");
         }
@@ -3346,10 +3363,6 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        r = setup_dev_console(directory, console);
-        if (r < 0)
-                return r;
-
         r = setup_keyring();
         if (r < 0)
                 return r;
@@ -3424,7 +3437,7 @@ static int outer_child(
                                 return log_error_errno(r, "Failed to join network namespace: %m");
                 }
 
-                r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
+                r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds);
                 if (r < 0)
                         _exit(EXIT_FAILURE);
 
@@ -3447,11 +3460,12 @@ static int outer_child(
 
         l = send_one_fd(notify_socket, fd, 0);
         if (l < 0)
-                return log_error_errno(errno, "Failed to send notify fd: %m");
+                return log_error_errno(l, "Failed to send notify fd: %m");
 
         pid_socket = safe_close(pid_socket);
         uuid_socket = safe_close(uuid_socket);
         notify_socket = safe_close(notify_socket);
+        master_pty_socket = safe_close(master_pty_socket);
         kmsg_socket = safe_close(kmsg_socket);
         rtnl_socket = safe_close(rtnl_socket);
         netns_fd = safe_close(netns_fd);
@@ -3720,7 +3734,7 @@ static int merge_settings(Settings *settings, const char *path) {
                 /* Copy the full capabilities over too */
                 if (capability_quintet_is_set(&settings->full_capabilities)) {
                         if (!arg_settings_trusted)
-                                log_warning("Ignoring capabilitiy settings, file %s is not trusted.", path);
+                                log_warning("Ignoring capability settings, file %s is not trusted.", path);
                         else
                                 arg_full_capabilities = settings->full_capabilities;
                 }
@@ -3874,15 +3888,14 @@ static int merge_settings(Settings *settings, const char *path) {
         }
 
         if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
-            settings->cpuset) {
+            settings->cpu_set.set) {
 
                 if (!arg_settings_trusted)
                         log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
                 else {
-                        if (arg_cpuset)
-                                CPU_FREE(arg_cpuset);
-                        arg_cpuset = TAKE_PTR(settings->cpuset);
-                        arg_cpuset_ncpus = settings->cpuset_ncpus;
+                        cpu_set_reset(&arg_cpu_set);
+                        arg_cpu_set = settings->cpu_set;
+                        settings->cpu_set = (CPUSet) {};
                 }
         }
 
@@ -3950,7 +3963,7 @@ static int merge_settings(Settings *settings, const char *path) {
         arg_console_width = settings->console_width;
         arg_console_height = settings->console_height;
 
-        device_node_free_many(arg_extra_nodes, arg_n_extra_nodes);
+        device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
         arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
         arg_n_extra_nodes = settings->n_extra_nodes;
 
@@ -3978,7 +3991,7 @@ static int load_settings(void) {
         FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
                 _cleanup_free_ char *j = NULL;
 
-                j = strjoin(i, "/", fn);
+                j = path_join(i, fn);
                 if (!j)
                         return log_oom();
 
@@ -4005,7 +4018,7 @@ static int load_settings(void) {
                         p = file_in_same_dir(arg_image, fn);
                         if (!p)
                                 return log_oom();
-                } else if (arg_directory) {
+                } else if (arg_directory && !path_equal(arg_directory, "/")) {
                         p = file_in_same_dir(arg_directory, fn);
                         if (!p)
                                 return log_oom();
@@ -4052,14 +4065,13 @@ static int load_oci_bundle(void) {
         return merge_settings(settings, arg_oci_bundle);
 }
 
-static int run_container(int master,
-               const char* console,
+static int run_container(
                DissectedImage *dissected_image,
                bool secondary,
                FDSet *fds,
                char veth_name[IFNAMSIZ], bool *veth_created,
                union in_addr_union *exposed,
-               pid_t *pid, int *ret) {
+               int *master, pid_t *pid, int *ret) {
 
         static const struct sigaction sa = {
                 .sa_handler = nop_signal_handler,
@@ -4075,9 +4087,10 @@ static int run_container(int master,
                 uuid_socket_pair[2] = { -1, -1 },
                 notify_socket_pair[2] = { -1, -1 },
                 uid_shift_socket_pair[2] = { -1, -1 },
+                master_pty_socket_pair[2] = { -1, -1 },
                 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
 
-        _cleanup_close_ int notify_socket= -1;
+        _cleanup_close_ int notify_socket = -1;
         _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
         _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
         _cleanup_(sd_event_unrefp) sd_event *event = NULL;
@@ -4125,6 +4138,9 @@ static int run_container(int master,
         if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
                 return log_error_errno(errno, "Failed to create notify socket pair: %m");
 
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
+                return log_error_errno(errno, "Failed to create console socket pair: %m");
+
         if (arg_userns_mode != USER_NAMESPACE_NO)
                 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
                         return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
@@ -4153,10 +4169,9 @@ static int run_container(int master,
                         log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
                 else if (r < 0)
                         return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
-                else if (r == 0) {
-                        log_error("Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
-                        return -EINVAL;
-                }
+                else if (r == 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
         }
 
         *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
@@ -4169,13 +4184,12 @@ static int run_container(int master,
                 /* The outer child only has a file system namespace. */
                 barrier_set_role(&barrier, BARRIER_CHILD);
 
-                master = safe_close(master);
-
                 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
                 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
                 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
                 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
+                master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
                 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
                 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
 
@@ -4184,7 +4198,6 @@ static int run_container(int master,
 
                 r = outer_child(&barrier,
                                 arg_directory,
-                                console,
                                 dissected_image,
                                 secondary,
                                 pid_socket_pair[1],
@@ -4193,6 +4206,7 @@ static int run_container(int master,
                                 kmsg_socket_pair[1],
                                 rtnl_socket_pair[1],
                                 uid_shift_socket_pair[1],
+                                master_pty_socket_pair[1],
                                 unified_cgroup_hierarchy_socket_pair[1],
                                 fds,
                                 netns_fd);
@@ -4204,13 +4218,14 @@ static int run_container(int master,
 
         barrier_set_role(&barrier, BARRIER_PARENT);
 
-        fds = fdset_free(fds);
+        fdset_close(fds);
 
         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
         pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
         uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
         notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
+        master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
         uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
         unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
 
@@ -4219,10 +4234,8 @@ static int run_container(int master,
                 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
                 if (l < 0)
                         return log_error_errno(errno, "Failed to read UID shift: %m");
-                if (l != sizeof arg_uid_shift) {
-                        log_error("Short read while reading UID shift.");
-                        return -EIO;
-                }
+                if (l != sizeof arg_uid_shift)
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
 
                 if (arg_userns_mode == USER_NAMESPACE_PICK) {
                         /* If we are supposed to pick the UID shift, let's try to use the shift read from the
@@ -4236,10 +4249,8 @@ static int run_container(int master,
                         l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
                         if (l < 0)
                                 return log_error_errno(errno, "Failed to send UID shift: %m");
-                        if (l != sizeof arg_uid_shift) {
-                                log_error("Short write while writing UID shift.");
-                                return -EIO;
-                        }
+                        if (l != sizeof arg_uid_shift)
+                                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
                 }
         }
 
@@ -4248,11 +4259,9 @@ static int run_container(int master,
                 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
                 if (l < 0)
                         return log_error_errno(errno, "Failed to read cgroup mode: %m");
-                if (l != sizeof(arg_unified_cgroup_hierarchy)) {
-                        log_error("Short read while reading cgroup mode (%zu bytes).%s",
-                                  l, l == 0 ? " The child is most likely dead." : "");
-                        return -EIO;
-                }
+                if (l != sizeof(arg_unified_cgroup_hierarchy))
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
+                                               l, l == 0 ? " The child is most likely dead." : "");
         }
 
         /* Wait for the outer child. */
@@ -4266,19 +4275,15 @@ static int run_container(int master,
         l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
         if (l < 0)
                 return log_error_errno(errno, "Failed to read inner child PID: %m");
-        if (l != sizeof *pid) {
-                log_error("Short read while reading inner child PID.");
-                return -EIO;
-        }
+        if (l != sizeof *pid)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
 
         /* We also retrieve container UUID in case it was generated by outer child */
         l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
         if (l < 0)
                 return log_error_errno(errno, "Failed to read container machine ID: %m");
-        if (l != sizeof(arg_uuid)) {
-                log_error("Short read while reading container machined ID.");
-                return -EIO;
-        }
+        if (l != sizeof(arg_uuid))
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
 
         /* We also retrieve the socket used for notifications generated by outer child */
         notify_socket = receive_one_fd(notify_socket_pair[0], 0);
@@ -4289,10 +4294,8 @@ static int run_container(int master,
         log_debug("Init process invoked as PID "PID_FMT, *pid);
 
         if (arg_userns_mode != USER_NAMESPACE_NO) {
-                if (!barrier_place_and_sync(&barrier)) { /* #1 */
-                        log_error("Child died too early.");
-                        return -ESRCH;
-                }
+                if (!barrier_place_and_sync(&barrier)) /* #1 */
+                        return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
 
                 r = setup_uid_map(*pid);
                 if (r < 0)
@@ -4304,10 +4307,8 @@ static int run_container(int master,
         if (arg_private_network) {
                 if (!arg_network_namespace_path) {
                         /* Wait until the child has unshared its network namespace. */
-                        if (!barrier_place_and_sync(&barrier)) { /* #3 */
-                                log_error("Child died too early");
-                                return -ESRCH;
-                        }
+                        if (!barrier_place_and_sync(&barrier)) /* #3 */
+                                return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
                 }
 
                 r = move_network_interfaces(*pid, arg_network_interfaces);
@@ -4463,10 +4464,8 @@ static int run_container(int master,
                 return r;
 
         /* Let the child know that we are ready and wait that the child is completely ready now. */
-        if (!barrier_place_and_sync(&barrier)) { /* #5 */
-                log_error("Child died too early.");
-                return -ESRCH;
-        }
+        if (!barrier_place_and_sync(&barrier)) /* #5 */
+                return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
 
         /* At this point we have made use of the UID we picked, and thus nss-mymachines
          * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
@@ -4501,17 +4500,40 @@ static int run_container(int master,
 
         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
 
-        if (IN_SET(arg_console_mode, CONSOLE_INTERACTIVE, CONSOLE_READ_ONLY)) {
-                assert(master >= 0);
+        if (arg_console_mode != CONSOLE_PIPE) {
+                _cleanup_close_ int fd = -1;
+                PTYForwardFlags flags = 0;
 
-                r = pty_forward_new(event, master,
-                                    PTY_FORWARD_IGNORE_VHANGUP | (arg_console_mode == CONSOLE_READ_ONLY ? PTY_FORWARD_READ_ONLY : 0),
-                                    &forward);
-                if (r < 0)
-                        return log_error_errno(r, "Failed to create PTY forwarder: %m");
+                /* Retrieve the master pty allocated by inner child */
+                fd = receive_one_fd(master_pty_socket_pair[0], 0);
+                if (fd < 0)
+                        return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
 
-                if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
-                        (void) pty_forward_set_width_height(forward, arg_console_width, arg_console_height);
+                switch (arg_console_mode) {
+
+                case CONSOLE_READ_ONLY:
+                        flags |= PTY_FORWARD_READ_ONLY;
+
+                        _fallthrough_;
+
+                case CONSOLE_INTERACTIVE:
+                        flags |= PTY_FORWARD_IGNORE_VHANGUP;
+
+                        r = pty_forward_new(event, fd, flags, &forward);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to create PTY forwarder: %m");
+
+                        if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
+                                (void) pty_forward_set_width_height(forward,
+                                                                    arg_console_width,
+                                                                    arg_console_height);
+                        break;
+
+                default:
+                        assert(arg_console_mode == CONSOLE_PASSIVE);
+                }
+
+                *master = TAKE_FD(fd);
         }
 
         r = sd_event_loop(event);
@@ -4641,28 +4663,23 @@ static int initialize_rlimits(void) {
 }
 
 static int run(int argc, char *argv[]) {
-        _cleanup_free_ char *console = NULL;
+        bool secondary = false, remove_directory = false, remove_image = false,
+                veth_created = false, remove_tmprootdir = false;
         _cleanup_close_ int master = -1;
         _cleanup_fdset_free_ FDSet *fds = NULL;
         int r, n_fd_passed, ret = EXIT_SUCCESS;
         char veth_name[IFNAMSIZ] = "";
-        bool secondary = false, remove_directory = false, remove_image = false;
-        pid_t pid = 0;
         union in_addr_union exposed = {};
         _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
-        bool veth_created = false, remove_tmprootdir = false;
         char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
         _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
         _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
         _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
+        pid_t pid = 0;
 
         log_parse_environment();
         log_open();
 
-        /* Make sure rename_process() in the stub init process can work */
-        saved_argv = argv;
-        saved_argc = argc;
-
         r = parse_argv(argc, argv);
         if (r <= 0)
                 goto finish;
@@ -4723,8 +4740,12 @@ static int run(int argc, char *argv[]) {
         if (arg_directory) {
                 assert(!arg_image);
 
-                if (path_equal(arg_directory, "/") && !arg_ephemeral) {
-                        log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
+                /* Safety precaution: let's not allow running images from the live host OS image, as long as
+                 * /var from the host will propagate into container dynamically (because bad things happen if
+                 * two systems write to the same /var). Let's allow it for the special cases where /var is
+                 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
+                if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
+                        log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
                         r = -EINVAL;
                         goto finish;
                 }
@@ -4736,12 +4757,9 @@ static int run(int argc, char *argv[]) {
                         if (r < 0)
                                 goto finish;
 
-                        /* If the specified path is a mount point we
-                         * generate the new snapshot immediately
-                         * inside it under a random name. However if
-                         * the specified is not a mount point we
-                         * create the new snapshot in the parent
-                         * directory, just next to it. */
+                        /* If the specified path is a mount point we generate the new snapshot immediately
+                         * inside it under a random name. However if the specified path is not a mount point we
+                         * create the new snapshot in the parent directory, just next to it. */
                         r = path_is_mount_point(arg_directory, NULL, 0);
                         if (r < 0) {
                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
@@ -4756,27 +4774,35 @@ static int run(int argc, char *argv[]) {
                                 goto finish;
                         }
 
-                        r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+                        /* We take an exclusive lock on this image, since it's our private, ephemeral copy
+                         * only owned by us and no one else. */
+                        r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
                         if (r < 0) {
                                 log_error_errno(r, "Failed to lock %s: %m", np);
                                 goto finish;
                         }
 
-                        r = btrfs_subvol_snapshot(arg_directory, np,
-                                                  (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
-                                                  BTRFS_SNAPSHOT_FALLBACK_COPY |
-                                                  BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
-                                                  BTRFS_SNAPSHOT_RECURSIVE |
-                                                  BTRFS_SNAPSHOT_QUOTA);
+                        {
+                                BLOCK_SIGNALS(SIGINT);
+                                r = btrfs_subvol_snapshot(arg_directory, np,
+                                                          (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
+                                                          BTRFS_SNAPSHOT_FALLBACK_COPY |
+                                                          BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
+                                                          BTRFS_SNAPSHOT_RECURSIVE |
+                                                          BTRFS_SNAPSHOT_QUOTA |
+                                                          BTRFS_SNAPSHOT_SIGINT);
+                        }
+                        if (r == -EINTR) {
+                                log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
+                                goto finish;
+                        }
                         if (r < 0) {
                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
                                 goto finish;
                         }
 
                         free_and_replace(arg_directory, np);
-
                         remove_directory = true;
-
                 } else {
                         r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
                         if (r < 0)
@@ -4797,17 +4823,24 @@ static int run(int argc, char *argv[]) {
                                 if (r < 0)
                                         goto finish;
 
-                                r = btrfs_subvol_snapshot(arg_template, arg_directory,
-                                                          (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
-                                                          BTRFS_SNAPSHOT_FALLBACK_COPY |
-                                                          BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
-                                                          BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
-                                                          BTRFS_SNAPSHOT_RECURSIVE |
-                                                          BTRFS_SNAPSHOT_QUOTA);
+                                {
+                                        BLOCK_SIGNALS(SIGINT);
+                                        r = btrfs_subvol_snapshot(arg_template, arg_directory,
+                                                                  (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
+                                                                  BTRFS_SNAPSHOT_FALLBACK_COPY |
+                                                                  BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
+                                                                  BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
+                                                                  BTRFS_SNAPSHOT_RECURSIVE |
+                                                                  BTRFS_SNAPSHOT_QUOTA |
+                                                                  BTRFS_SNAPSHOT_SIGINT);
+                                }
                                 if (r == -EEXIST)
                                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
                                                  "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
-                                else if (r < 0) {
+                                else if (r == -EINTR) {
+                                        log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
+                                        goto finish;
+                                } else if (r < 0) {
                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
                                         goto finish;
                                 } else
@@ -4863,20 +4896,27 @@ static int run(int argc, char *argv[]) {
                                 goto finish;
                         }
 
-                        r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+                        /* Always take an exclusive lock on our own ephemeral copy. */
+                        r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
                         if (r < 0) {
                                 r = log_error_errno(r, "Failed to create image lock: %m");
                                 goto finish;
                         }
 
-                        r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
+                        {
+                                BLOCK_SIGNALS(SIGINT);
+                                r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
+                        }
+                        if (r == -EINTR) {
+                                log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
+                                goto finish;
+                        }
                         if (r < 0) {
                                 r = log_error_errno(r, "Failed to copy image file: %m");
                                 goto finish;
                         }
 
                         free_and_replace(arg_image, np);
-
                         remove_image = true;
                 } else {
                         r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
@@ -4960,31 +5000,6 @@ static int run(int argc, char *argv[]) {
         if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
                 arg_quiet = true;
 
-        if (arg_console_mode != CONSOLE_PIPE) {
-                master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
-                if (master < 0) {
-                        r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
-                        goto finish;
-                }
-
-                r = ptsname_malloc(master, &console);
-                if (r < 0) {
-                        r = log_error_errno(r, "Failed to determine tty name: %m");
-                        goto finish;
-                }
-
-                if (arg_selinux_apifs_context) {
-                        r = mac_selinux_apply(console, arg_selinux_apifs_context);
-                        if (r < 0)
-                                goto finish;
-                }
-
-                if (unlockpt(master) < 0) {
-                        r = log_error_errno(errno, "Failed to unlock tty: %m");
-                        goto finish;
-                }
-        }
-
         if (!arg_quiet)
                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
                          arg_machine, arg_image ?: arg_directory);
@@ -4997,13 +5012,11 @@ static int run(int argc, char *argv[]) {
         }
 
         for (;;) {
-                r = run_container(master,
-                                  console,
-                                  dissected_image,
+                r = run_container(dissected_image,
                                   secondary,
                                   fds,
                                   veth_name, &veth_created,
-                                  &exposed,
+                                  &exposed, &master,
                                   &pid, &ret);
                 if (r <= 0)
                         break;
@@ -5062,7 +5075,7 @@ finish:
         custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
         expose_port_free_all(arg_expose_ports);
         rlimit_free_all(arg_rlimit);
-        device_node_free_many(arg_extra_nodes, arg_n_extra_nodes);
+        device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
 
         if (r < 0)
                 return r;