]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/nspawn/nspawn.c
Rework cpu affinity parsing
[thirdparty/systemd.git] / src / nspawn / nspawn.c
index 64a30a5b9a800bce23932b4488fcae1b949fc998..25184e11e9e30fc76d16075451475caaa5844265 100644 (file)
 #include "loopback-setup.h"
 #include "machine-image.h"
 #include "macro.h"
+#include "main-func.h"
 #include "missing.h"
 #include "mkdir.h"
 #include "mount-util.h"
 #include "mountpoint-util.h"
+#include "namespace-util.h"
 #include "netlink-util.h"
 #include "nspawn-cgroup.h"
 #include "nspawn-def.h"
 #include "nspawn-expose-ports.h"
 #include "nspawn-mount.h"
 #include "nspawn-network.h"
+#include "nspawn-oci.h"
 #include "nspawn-patch-uid.h"
 #include "nspawn-register.h"
 #include "nspawn-seccomp.h"
 #include "nspawn-settings.h"
 #include "nspawn-setuid.h"
 #include "nspawn-stub-pid1.h"
+#include "nulstr-util.h"
 #include "os-util.h"
 #include "pager.h"
 #include "parse-util.h"
@@ -84,6 +88,9 @@
 #include "raw-clone.h"
 #include "rlimit-util.h"
 #include "rm-rf.h"
+#if HAVE_SECCOMP
+#include "seccomp-util.h"
+#endif
 #include "selinux-util.h"
 #include "signal-util.h"
 #include "socket-util.h"
@@ -92,6 +99,7 @@
 #include "string-table.h"
 #include "string-util.h"
 #include "strv.h"
+#include "sysctl-util.h"
 #include "terminal-util.h"
 #include "tmpfile-util.h"
 #include "umask-util.h"
 
 typedef enum ContainerStatus {
         CONTAINER_TERMINATED,
-        CONTAINER_REBOOTED
+        CONTAINER_REBOOTED,
 } ContainerStatus;
 
 static char *arg_directory = NULL;
@@ -122,12 +130,16 @@ static char *arg_chdir = NULL;
 static char *arg_pivot_root_new = NULL;
 static char *arg_pivot_root_old = NULL;
 static char *arg_user = NULL;
+static uid_t arg_uid = UID_INVALID;
+static gid_t arg_gid = GID_INVALID;
+static gid_t* arg_supplementary_gids = NULL;
+static size_t arg_n_supplementary_gids = 0;
 static sd_id128_t arg_uuid = {};
 static char *arg_machine = NULL;     /* The name used by the host to refer to this */
 static char *arg_hostname = NULL;    /* The name the payload sees by default */
 static const char *arg_selinux_context = NULL;
 static const char *arg_selinux_apifs_context = NULL;
-static const char *arg_slice = NULL;
+static char *arg_slice = NULL;
 static bool arg_private_network = false;
 static bool arg_read_only = false;
 static StartMode arg_start_mode = START_PID1;
@@ -161,6 +173,7 @@ static uint64_t arg_caps_retain =
         (1ULL << CAP_SYS_PTRACE) |
         (1ULL << CAP_SYS_RESOURCE) |
         (1ULL << CAP_SYS_TTY_CONFIG);
+static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
 static CustomMount *arg_custom_mounts = NULL;
 static size_t arg_n_custom_mounts = 0;
 static char **arg_setenv = NULL;
@@ -175,11 +188,14 @@ static char **arg_network_veth_extra = NULL;
 static char *arg_network_bridge = NULL;
 static char *arg_network_zone = NULL;
 static char *arg_network_namespace_path = NULL;
+static PagerFlags arg_pager_flags = 0;
 static unsigned long arg_personality = PERSONALITY_INVALID;
 static char *arg_image = NULL;
+static char *arg_oci_bundle = NULL;
 static VolatileMode arg_volatile_mode = VOLATILE_NO;
 static ExposePort *arg_expose_ports = NULL;
 static char **arg_property = NULL;
+static sd_bus_message *arg_property_message = NULL;
 static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
 static bool arg_userns_chown = false;
@@ -197,52 +213,109 @@ static void *arg_root_hash = NULL;
 static size_t arg_root_hash_size = 0;
 static char **arg_syscall_whitelist = NULL;
 static char **arg_syscall_blacklist = NULL;
+#if HAVE_SECCOMP
+static scmp_filter_ctx arg_seccomp = NULL;
+#endif
 static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
 static bool arg_no_new_privileges = false;
 static int arg_oom_score_adjust = 0;
 static bool arg_oom_score_adjust_set = false;
-static cpu_set_t *arg_cpuset = NULL;
-static unsigned arg_cpuset_ncpus = 0;
+static CPUSet arg_cpu_set = {};
 static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
 static TimezoneMode arg_timezone = TIMEZONE_AUTO;
+static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
+static DeviceNode* arg_extra_nodes = NULL;
+static size_t arg_n_extra_nodes = 0;
+static char **arg_sysctl = NULL;
+static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
+
+STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
+STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_syscall_whitelist, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_syscall_blacklist, strv_freep);
+#if HAVE_SECCOMP
+STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
+#endif
+STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
+STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
 
 static int help(void) {
         _cleanup_free_ char *link = NULL;
         int r;
 
-        (void) pager_open(false);
+        (void) pager_open(arg_pager_flags);
 
         r = terminal_urlify_man("systemd-nspawn", "1", &link);
         if (r < 0)
                 return log_oom();
 
-        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
+        printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
                "Spawn a command or OS in a light-weight container.\n\n"
                "  -h --help                 Show this help\n"
                "     --version              Print version string\n"
                "  -q --quiet                Do not show status information\n"
+               "     --no-pager             Do not pipe output into a pager\n"
+               "     --settings=BOOLEAN     Load additional settings from .nspawn file\n\n"
+               "%3$sImage:%4$s\n"
                "  -D --directory=PATH       Root directory for the container\n"
                "     --template=PATH        Initialize root directory from template directory,\n"
                "                            if missing\n"
                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
                "                            remove it after exit\n"
-               "  -i --image=PATH           File system device or disk image for the container\n"
-               "     --root-hash=HASH       Specify verity root hash\n"
+               "  -i --image=PATH           Root file system disk image (or device node) for\n"
+               "                            the container\n"
+               "     --oci-bundle=PATH      OCI bundle directory\n"
+               "     --read-only            Mount the root directory read-only\n"
+               "     --volatile[=MODE]      Run the system in volatile mode\n"
+               "     --root-hash=HASH       Specify verity root hash for root disk image\n"
+               "     --pivot-root=PATH[:PATH]\n"
+               "                            Pivot root to given directory in the container\n\n"
+               "%3$sExecution:%4$s\n"
                "  -a --as-pid2              Maintain a stub init as PID1, invoke binary as PID2\n"
                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
                "     --chdir=PATH           Set working directory in the container\n"
-               "     --pivot-root=PATH[:PATH]\n"
-               "                            Pivot root to given directory in the container\n"
-               "  -u --user=USER            Run the command under specified user or uid\n"
+               "  -E --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
+               "  -u --user=USER            Run the command under specified user or UID\n"
+               "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
+               "     --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
+               "%3$sSystem Identity:%4$s\n"
                "  -M --machine=NAME         Set the machine name for the container\n"
                "     --hostname=NAME        Override the hostname for the container\n"
-               "     --uuid=UUID            Set a specific machine UUID for the container\n"
+               "     --uuid=UUID            Set a specific machine UUID for the container\n\n"
+               "%3$sProperties:%4$s\n"
                "  -S --slice=SLICE          Place the container in the specified slice\n"
                "     --property=NAME=VALUE  Set scope unit property\n"
+               "     --register=BOOLEAN     Register container as machine\n"
+               "     --keep-unit            Do not register a scope for the machine, reuse\n"
+               "                            the service unit nspawn is running in\n\n"
+               "%3$sUser Namespacing:%4$s\n"
                "  -U --private-users=pick   Run within user namespace, autoselect UID/GID range\n"
                "     --private-users[=UIDBASE[:NUIDS]]\n"
                "                            Similar, but with user configured UID/GID range\n"
-               "     --private-users-chown  Adjust OS tree ownership to private UID/GID range\n"
+               "     --private-users-chown  Adjust OS tree ownership to private UID/GID range\n\n"
+               "%3$sNetworking:%4$s\n"
                "     --private-network      Disable network in container\n"
                "     --network-interface=INTERFACE\n"
                "                            Assign an existing network interface to the\n"
@@ -267,51 +340,54 @@ static int help(void) {
                "                            Set network namespace to the one represented by\n"
                "                            the specified kernel namespace file node\n"
                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
-               "                            Expose a container IP port on the host\n"
-               "  -Z --selinux-context=SECLABEL\n"
-               "                            Set the SELinux security context to be used by\n"
-               "                            processes in the container\n"
-               "  -L --selinux-apifs-context=SECLABEL\n"
-               "                            Set the SELinux security context to be used by\n"
-               "                            API/tmpfs file systems in the container\n"
+               "                            Expose a container IP port on the host\n\n"
+               "%3$sSecurity:%4$s\n"
                "     --capability=CAP       In addition to the default, retain specified\n"
                "                            capability\n"
                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
+               "     --no-new-privileges    Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
                "     --system-call-filter=LIST|~LIST\n"
                "                            Permit/prohibit specific system calls\n"
+               "  -Z --selinux-context=SECLABEL\n"
+               "                            Set the SELinux security context to be used by\n"
+               "                            processes in the container\n"
+               "  -L --selinux-apifs-context=SECLABEL\n"
+               "                            Set the SELinux security context to be used by\n"
+               "                            API/tmpfs file systems in the container\n\n"
+               "%3$sResources:%4$s\n"
                "     --rlimit=NAME=LIMIT    Set a resource limit for the payload\n"
                "     --oom-score-adjust=VALUE\n"
                "                            Adjust the OOM score value for the payload\n"
                "     --cpu-affinity=CPUS    Adjust the CPU affinity of the container\n"
-               "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
-               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, \n"
-               "                            host, try-guest, try-host\n"
-               "  -j                        Equivalent to --link-journal=try-guest\n"
+               "     --personality=ARCH     Pick personality for this container\n\n"
+               "%3$sIntegration:%4$s\n"
                "     --resolv-conf=MODE     Select mode of /etc/resolv.conf initialization\n"
                "     --timezone=MODE        Select mode of /etc/localtime initialization\n"
-               "     --read-only            Mount the root directory read-only\n"
+               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, \n"
+               "                            host, try-guest, try-host\n"
+               "  -j                        Equivalent to --link-journal=try-guest\n\n"
+               "%3$sMounts:%4$s\n"
                "     --bind=PATH[:PATH[:OPTIONS]]\n"
                "                            Bind mount a file or directory from the host into\n"
                "                            the container\n"
                "     --bind-ro=PATH[:PATH[:OPTIONS]\n"
                "                            Similar, but creates a read-only bind mount\n"
+               "     --inaccessible=PATH    Over-mount file node with inaccessible node to mask\n"
+               "                            it\n"
                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
                "     --overlay=PATH[:PATH...]:PATH\n"
                "                            Create an overlay mount from the host to \n"
                "                            the container\n"
                "     --overlay-ro=PATH[:PATH...]:PATH\n"
-               "                            Similar, but creates a read-only overlay mount\n"
-               "  -E --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
-               "     --register=BOOLEAN     Register container as machine\n"
-               "     --keep-unit            Do not register a scope for the machine, reuse\n"
-               "                            the service unit nspawn is running in\n"
-               "     --volatile[=MODE]      Run the system in volatile mode\n"
-               "     --settings=BOOLEAN     Load additional settings from .nspawn file\n"
-               "     --notify-ready=BOOLEAN Receive notifications from the child init process\n"
-               "\nSee the %s for details.\n"
+               "                            Similar, but creates a read-only overlay mount\n\n"
+               "%3$sInput/Output:%4$s\n"
+               "     --console=MODE         Select how stdin/stdout/stderr and /dev/console are\n"
+               "                            set up for the container.\n"
+               "  -P --pipe                 Equivalent to --console=pipe\n"
+               "\nSee the %2$s for details.\n"
                , program_invocation_short_name
                , link
-        );
+               , ansi_underline(), ansi_normal());
 
         return 0;
 }
@@ -399,7 +475,9 @@ static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
                 return;
         if (r < 0)
                 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
+
         arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
+        arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
 }
 
 static void parse_mount_settings_env(void) {
@@ -444,9 +522,20 @@ static void parse_environment(void) {
 
         /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
          * even if it is supported. If not supported, it has no effect. */
-        r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
-        if (r == 0 || !cg_ns_supported())
+        if (!cg_ns_supported())
                 arg_use_cgns = false;
+        else {
+                r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
+                if (r < 0) {
+                        if (r != -ENXIO)
+                                log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS, ignoring: %m");
+
+                        arg_use_cgns = true;
+                } else {
+                        arg_use_cgns = r > 0;
+                        arg_settings_mask |= SETTING_USE_CGNS;
+                }
+        }
 
         e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
         if (e)
@@ -469,6 +558,7 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_TMPFS,
                 ARG_OVERLAY,
                 ARG_OVERLAY_RO,
+                ARG_INACCESSIBLE,
                 ARG_SHARE_SYSTEM,
                 ARG_REGISTER,
                 ARG_KEEP_UNIT,
@@ -499,6 +589,10 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_CPU_AFFINITY,
                 ARG_RESOLV_CONF,
                 ARG_TIMEZONE,
+                ARG_CONSOLE,
+                ARG_PIPE,
+                ARG_OCI_BUNDLE,
+                ARG_NO_PAGER,
         };
 
         static const struct option options[] = {
@@ -522,6 +616,7 @@ static int parse_argv(int argc, char *argv[]) {
                 { "tmpfs",                  required_argument, NULL, ARG_TMPFS                  },
                 { "overlay",                required_argument, NULL, ARG_OVERLAY                },
                 { "overlay-ro",             required_argument, NULL, ARG_OVERLAY_RO             },
+                { "inaccessible",           required_argument, NULL, ARG_INACCESSIBLE           },
                 { "machine",                required_argument, NULL, 'M'                        },
                 { "hostname",               required_argument, NULL, ARG_HOSTNAME               },
                 { "slice",                  required_argument, NULL, 'S'                        },
@@ -559,6 +654,10 @@ static int parse_argv(int argc, char *argv[]) {
                 { "cpu-affinity",           required_argument, NULL, ARG_CPU_AFFINITY           },
                 { "resolv-conf",            required_argument, NULL, ARG_RESOLV_CONF            },
                 { "timezone",               required_argument, NULL, ARG_TIMEZONE               },
+                { "console",                required_argument, NULL, ARG_CONSOLE                },
+                { "pipe",                   no_argument,       NULL, ARG_PIPE                   },
+                { "oci-bundle",             required_argument, NULL, ARG_OCI_BUNDLE             },
+                { "no-pager",               no_argument,       NULL, ARG_NO_PAGER               },
                 {}
         };
 
@@ -570,7 +669,7 @@ static int parse_argv(int argc, char *argv[]) {
         assert(argc >= 0);
         assert(argv);
 
-        while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
+        while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
                 switch (c) {
 
                 case 'h':
@@ -583,18 +682,31 @@ static int parse_argv(int argc, char *argv[]) {
                         r = parse_path_argument_and_warn(optarg, false, &arg_directory);
                         if (r < 0)
                                 return r;
+
+                        arg_settings_mask |= SETTING_DIRECTORY;
                         break;
 
                 case ARG_TEMPLATE:
                         r = parse_path_argument_and_warn(optarg, false, &arg_template);
                         if (r < 0)
                                 return r;
+
+                        arg_settings_mask |= SETTING_DIRECTORY;
                         break;
 
                 case 'i':
                         r = parse_path_argument_and_warn(optarg, false, &arg_image);
                         if (r < 0)
                                 return r;
+
+                        arg_settings_mask |= SETTING_DIRECTORY;
+                        break;
+
+                case ARG_OCI_BUNDLE:
+                        r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
+                        if (r < 0)
+                                return r;
+
                         break;
 
                 case 'x':
@@ -702,6 +814,7 @@ static int parse_argv(int argc, char *argv[]) {
                         if (r < 0)
                                 return r;
 
+                        arg_settings_mask |= SETTING_NETWORK;
                         break;
 
                 case 'b':
@@ -735,7 +848,11 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
 
                 case 'S':
-                        arg_slice = optarg;
+                        r = free_and_strdup(&arg_slice, optarg);
+                        if (r < 0)
+                                return log_oom();
+
+                        arg_settings_mask |= SETTING_SLICE;
                         break;
 
                 case 'M':
@@ -790,7 +907,6 @@ static int parse_argv(int argc, char *argv[]) {
                                 r = extract_first_word(&p, &t, ",", 0);
                                 if (r < 0)
                                         return log_error_errno(r, "Failed to parse capability %s.", t);
-
                                 if (r == 0)
                                         break;
 
@@ -832,10 +948,8 @@ static int parse_argv(int argc, char *argv[]) {
 
                 case ARG_LINK_JOURNAL:
                         r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
-                        if (r < 0) {
-                                log_error_errno(r, "Failed to parse link journal mode %s", optarg);
-                                return -EINVAL;
-                        }
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
 
                         arg_settings_mask |= SETTING_LINK_JOURNAL;
                         break;
@@ -868,6 +982,14 @@ static int parse_argv(int argc, char *argv[]) {
                         arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
                         break;
 
+                case ARG_INACCESSIBLE:
+                        r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
+
+                        arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
+                        break;
+
                 case 'E': {
                         char **n;
 
@@ -1118,9 +1240,8 @@ static int parse_argv(int argc, char *argv[]) {
                         if (r < 0)
                                 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
                         if (l < sizeof(sd_id128_t)) {
-                                log_error("Root hash must be at least 128bit long: %s", optarg);
                                 free(k);
-                                return -EINVAL;
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
                         }
 
                         free(arg_root_hash);
@@ -1207,17 +1328,14 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
 
                 case ARG_CPU_AFFINITY: {
-                        _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
+                        CPUSet cpuset;
 
                         r = parse_cpu_set(optarg, &cpuset);
                         if (r < 0)
-                                return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
+                                return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
 
-                        if (arg_cpuset)
-                                CPU_FREE(arg_cpuset);
-
-                        arg_cpuset = TAKE_PTR(cpuset);
-                        arg_cpuset_ncpus = r;
+                        cpu_set_reset(&arg_cpu_set);
+                        arg_cpu_set = cpuset;
                         arg_settings_mask |= SETTING_CPU_AFFINITY;
                         break;
                 }
@@ -1250,6 +1368,36 @@ static int parse_argv(int argc, char *argv[]) {
                         arg_settings_mask |= SETTING_TIMEZONE;
                         break;
 
+                case ARG_CONSOLE:
+                        if (streq(optarg, "interactive"))
+                                arg_console_mode = CONSOLE_INTERACTIVE;
+                        else if (streq(optarg, "read-only"))
+                                arg_console_mode = CONSOLE_READ_ONLY;
+                        else if (streq(optarg, "passive"))
+                                arg_console_mode = CONSOLE_PASSIVE;
+                        else if (streq(optarg, "pipe"))
+                                arg_console_mode = CONSOLE_PIPE;
+                        else if (streq(optarg, "help"))
+                                puts("interactive\n"
+                                     "read-only\n"
+                                     "passive\n"
+                                     "pipe");
+                        else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
+
+                        arg_settings_mask |= SETTING_CONSOLE_MODE;
+                        break;
+
+                case 'P':
+                case ARG_PIPE:
+                        arg_console_mode = CONSOLE_PIPE;
+                        arg_settings_mask |= SETTING_CONSOLE_MODE;
+                        break;
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
                 case '?':
                         return -EINVAL;
 
@@ -1273,7 +1421,10 @@ static int parse_argv(int argc, char *argv[]) {
                  * --directory=". */
                 arg_directory = TAKE_PTR(arg_template);
 
-        arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
+        arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
+
+        /* Make sure to parse environment before we reset the settings mask below */
+        parse_environment();
 
         /* Load all settings from .nspawn files */
         if (mask_no_settings)
@@ -1335,25 +1486,27 @@ static int verify_arguments(void) {
                 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
 
         if (arg_userns_chown && arg_read_only)
-                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--read-only and --private-users-chown may not be combined.");
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "--read-only and --private-users-chown may not be combined.");
 
         /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
          * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
-         * copy-up (in case of overlay) making the entire excercise pointless. */
+         * copy-up (in case of overlay) making the entire exercise pointless. */
         if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
 
-        /* If --network-namespace-path is given with any other network-related option,
-         * we need to error out, to avoid conflicts between different network options. */
+        /* If --network-namespace-path is given with any other network-related option, we need to error out,
+         * to avoid conflicts between different network options. */
         if (arg_network_namespace_path &&
                 (arg_network_interfaces || arg_network_macvlan ||
                  arg_network_ipvlan || arg_network_veth_extra ||
                  arg_network_bridge || arg_network_zone ||
                  arg_network_veth || arg_private_network))
-                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path cannot be combined with other network options.");
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
 
         if (arg_network_bridge && arg_network_zone)
-                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-bridge= and --network-zone= may not be combined.");
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "--network-bridge= and --network-zone= may not be combined.");
 
         if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
@@ -1539,7 +1692,7 @@ static int setup_timezone(const char *dest) {
 
         case TIMEZONE_COPY:
                 /* If mounting failed, try to copy */
-                r = copy_file_atomic("/etc/localtime", where, 0644, 0, COPY_REFLINK|COPY_REPLACE);
+                r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
                 if (r < 0) {
                         log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
                                        "Failed to copy /etc/localtime to %s, ignoring: %m", where);
@@ -1666,7 +1819,7 @@ static int setup_resolv_conf(const char *dest) {
         }
 
         /* If that didn't work, let's copy the file */
-        r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
+        r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
         if (r < 0) {
                 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
                  * resolved or something similar runs inside and the symlink points there.
@@ -1692,10 +1845,9 @@ static int setup_boot_id(void) {
         const char *to;
         int r;
 
-        /* Generate a new randomized boot ID, so that each boot-up of
-         * the container gets a new one */
+        /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
 
-        r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
+        r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
         if (r < 0)
                 return log_error_errno(r, "Failed to generate random boot ID path: %m");
 
@@ -1727,9 +1879,9 @@ static int copy_devnodes(const char *dest) {
                 "tty\0"
                 "net/tun\0";
 
+        _cleanup_umask_ mode_t u;
         const char *d;
         int r = 0;
-        _cleanup_umask_ mode_t u;
 
         assert(dest);
 
@@ -1809,6 +1961,32 @@ static int copy_devnodes(const char *dest) {
         return r;
 }
 
+static int make_extra_nodes(const char *dest) {
+        _cleanup_umask_ mode_t u;
+        size_t i;
+        int r;
+
+        u = umask(0000);
+
+        for (i = 0; i < arg_n_extra_nodes; i++) {
+                _cleanup_free_ char *path = NULL;
+                DeviceNode *n = arg_extra_nodes + i;
+
+                path = prefix_root(dest, n->path);
+                if (!path)
+                        return log_oom();
+
+                if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
+                        return log_error_errno(errno, "Failed to create device node '%s': %m", path);
+
+                r = chmod_and_chown(path, n->mode, n->uid, n->gid);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
+        }
+
+        return 0;
+}
+
 static int setup_pts(const char *dest) {
         _cleanup_free_ char *options = NULL;
         const char *p;
@@ -1865,10 +2043,12 @@ static int setup_dev_console(const char *dest, const char *console) {
         int r;
 
         assert(dest);
-        assert(console);
 
         u = umask(0000);
 
+        if (!console)
+                return 0;
+
         r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
         if (r < 0)
                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
@@ -1911,19 +2091,18 @@ static int setup_kmsg(int kmsg_socket) {
         _cleanup_free_ char *fifo = NULL;
         _cleanup_close_ int fd = -1;
         _cleanup_umask_ mode_t u;
-        const char *to;
         int r;
 
         assert(kmsg_socket >= 0);
 
         u = umask(0000);
 
-        /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
+        /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
          * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
          * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
          * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
 
-        r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
+        r = tempfn_random_child("/run", "proc-kmsg", &fifo);
         if (r < 0)
                 return log_error_errno(r, "Failed to generate kmsg path: %m");
 
@@ -1931,9 +2110,8 @@ static int setup_kmsg(int kmsg_socket) {
                 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
 
         from = TAKE_PTR(fifo);
-        to = "/proc/kmsg";
 
-        r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
+        r = mount_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
         if (r < 0)
                 return r;
 
@@ -2106,8 +2284,40 @@ static int setup_journal(const char *directory) {
         return 0;
 }
 
-static int drop_capabilities(void) {
-        return capability_bounding_set_drop(arg_caps_retain, false);
+static int drop_capabilities(uid_t uid) {
+        CapabilityQuintet q;
+
+        /* Let's initialize all five capability sets to something valid. If the quintet was configured via
+         * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
+         * arg_caps_retain. */
+
+        if (capability_quintet_is_set(&arg_full_capabilities)) {
+                q = arg_full_capabilities;
+
+                if (q.bounding == (uint64_t) -1)
+                        q.bounding = uid == 0 ? arg_caps_retain : 0;
+
+                if (q.effective == (uint64_t) -1)
+                        q.effective = uid == 0 ? q.bounding : 0;
+
+                if (q.inheritable == (uint64_t) -1)
+                        q.inheritable = uid == 0 ? q.bounding : 0;
+
+                if (q.permitted == (uint64_t) -1)
+                        q.permitted = uid == 0 ? q.bounding : 0;
+
+                if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
+                        q.ambient = 0;
+        } else
+                q = (CapabilityQuintet) {
+                        .bounding = arg_caps_retain,
+                        .effective = uid == 0 ? arg_caps_retain : 0,
+                        .inheritable = uid == 0 ? arg_caps_retain : 0,
+                        .permitted = uid == 0 ? arg_caps_retain : 0,
+                        .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
+                };
+
+        return capability_quintet_enforce(&q);
 }
 
 static int reset_audit_loginuid(void) {
@@ -2396,10 +2606,8 @@ static int determine_names(void) {
                                 return log_error_errno(r, "Failed to determine current directory: %m");
                 }
 
-                if (!arg_directory && !arg_image) {
-                        log_error("Failed to determine path, please use -D or -i.");
-                        return -EINVAL;
-                }
+                if (!arg_directory && !arg_image)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
         }
 
         if (!arg_machine) {
@@ -2422,10 +2630,8 @@ static int determine_names(void) {
                         return log_oom();
 
                 hostname_cleanup(arg_machine);
-                if (!machine_name_is_valid(arg_machine)) {
-                        log_error("Failed to determine machine name automatically, please use -M.");
-                        return -EINVAL;
-                }
+                if (!machine_name_is_valid(arg_machine))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
 
                 if (arg_ephemeral) {
                         char *b;
@@ -2494,6 +2700,75 @@ static int determine_uid_shift(const char *directory) {
         return 0;
 }
 
+static unsigned long effective_clone_ns_flags(void) {
+        unsigned long flags = arg_clone_ns_flags;
+
+        if (arg_private_network)
+                flags |= CLONE_NEWNET;
+        if (arg_use_cgns)
+                flags |= CLONE_NEWCGROUP;
+        if (arg_userns_mode != USER_NAMESPACE_NO)
+                flags |= CLONE_NEWUSER;
+
+        return flags;
+}
+
+static int patch_sysctl(void) {
+
+        /* This table is inspired by runc's sysctl() function */
+        static const struct {
+                const char *key;
+                bool prefix;
+                unsigned long clone_flags;
+        } safe_sysctl[] = {
+                { "kernel.hostname",   false, CLONE_NEWUTS },
+                { "kernel.domainname", false, CLONE_NEWUTS },
+                { "kernel.msgmax",     false, CLONE_NEWIPC },
+                { "kernel.msgmnb",     false, CLONE_NEWIPC },
+                { "kernel.msgmni",     false, CLONE_NEWIPC },
+                { "kernel.sem",        false, CLONE_NEWIPC },
+                { "kernel.shmall",     false, CLONE_NEWIPC },
+                { "kernel.shmmax",     false, CLONE_NEWIPC },
+                { "kernel.shmmni",     false, CLONE_NEWIPC },
+                { "fs.mqueue.",        true,  CLONE_NEWIPC },
+                { "net.",              true,  CLONE_NEWNET },
+        };
+
+        unsigned long flags;
+        char **k, **v;
+        int r;
+
+        flags = effective_clone_ns_flags();
+
+        STRV_FOREACH_PAIR(k, v, arg_sysctl) {
+                bool good = false;
+                size_t i;
+
+                for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
+
+                        if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
+                                continue;
+
+                        if (safe_sysctl[i].prefix)
+                                good = startswith(*k, safe_sysctl[i].key);
+                        else
+                                good = streq(*k, safe_sysctl[i].key);
+
+                        if (good)
+                                break;
+                }
+
+                if (!good)
+                        return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
+
+                r = sysctl_write(*k, *v);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
+        }
+
+        return 0;
+}
+
 static int inner_child(
                 Barrier *barrier,
                 const char *directory,
@@ -2520,7 +2795,7 @@ static int inner_child(
         };
         const char *exec_target;
         _cleanup_strv_free_ char **env_use = NULL;
-        int r;
+        int r, which_failed;
 
         /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
          * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
@@ -2537,6 +2812,8 @@ static int inner_child(
         assert(directory);
         assert(kmsg_socket >= 0);
 
+        log_debug("Inner child is initializing.");
+
         if (arg_userns_mode != USER_NAMESPACE_NO) {
                 /* Tell the parent, that it now can write the UID map. */
                 (void) barrier_place(barrier); /* #1 */
@@ -2606,6 +2883,18 @@ static int inner_child(
                 return r;
         kmsg_socket = safe_close(kmsg_socket);
 
+        r = mount_custom(
+                        "/",
+                        arg_custom_mounts,
+                        arg_n_custom_mounts,
+                        false,
+                        0,
+                        0,
+                        arg_selinux_apifs_context,
+                        true);
+        if (r < 0)
+                return r;
+
         if (setsid() < 0)
                 return log_error_errno(errno, "setsid() failed: %m");
 
@@ -2619,20 +2908,20 @@ static int inner_child(
                 rtnl_socket = safe_close(rtnl_socket);
         }
 
+        r = patch_sysctl();
+        if (r < 0)
+                return r;
+
         if (arg_oom_score_adjust_set) {
                 r = set_oom_score_adjust(arg_oom_score_adjust);
                 if (r < 0)
                         return log_error_errno(r, "Failed to adjust OOM score: %m");
         }
 
-        if (arg_cpuset)
-                if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
+        if (arg_cpu_set.set)
+                if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
                         return log_error_errno(errno, "Failed to set CPU affinity: %m");
 
-        r = drop_capabilities();
-        if (r < 0)
-                return log_error_errno(r, "drop_capabilities() failed: %m");
-
         (void) setup_hostname();
 
         if (arg_personality != PERSONALITY_INVALID) {
@@ -2645,16 +2934,51 @@ static int inner_child(
                         return log_error_errno(r, "personality() failed: %m");
         }
 
+        r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
+        if (r < 0)
+                return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
+
+#if HAVE_SECCOMP
+        if (arg_seccomp) {
+
+                if (is_seccomp_available()) {
+
+                        r = seccomp_load(arg_seccomp);
+                        if (ERRNO_IS_SECCOMP_FATAL(r))
+                                return log_error_errno(r, "Failed to install seccomp filter: %m");
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to install seccomp filter: %m");
+                }
+        } else
+#endif
+        {
+                r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
+                if (r < 0)
+                        return r;
+        }
+
 #if HAVE_SELINUX
         if (arg_selinux_context)
                 if (setexeccon(arg_selinux_context) < 0)
                         return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
 #endif
 
-        r = change_uid_gid(arg_user, &home);
+        /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
+         * if we need to later on. */
+        if (prctl(PR_SET_KEEPCAPS, 1) < 0)
+                return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
+
+        if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
+                r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
+        else
+                r = change_uid_gid(arg_user, &home);
         if (r < 0)
                 return r;
 
+        r = drop_capabilities(getuid());
+        if (r < 0)
+                return log_error_errno(r, "Dropping capabilities failed: %m");
+
         if (arg_no_new_privileges)
                 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
                         return log_error_errno(errno, "Failed to disable new privileges: %m");
@@ -2666,10 +2990,14 @@ static int inner_child(
         if (envp[n_env])
                 n_env++;
 
-        if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
-            (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
-            (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
-                return log_oom();
+        if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
+                if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
+                        return log_oom();
+
+        if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
+                if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
+                    asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
+                        return log_oom();
 
         assert(!sd_id128_is_null(arg_uuid));
 
@@ -2709,6 +3037,8 @@ static int inner_child(
                         return r;
         }
 
+        log_debug("Inner child completed, invoking payload.");
+
         /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
          * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
          * it again. Note that the other fds closed here are at least the locking and barrier fds. */
@@ -2801,7 +3131,6 @@ static int outer_child(
                 const char *directory,
                 const char *console,
                 DissectedImage *dissected_image,
-                bool interactive,
                 bool secondary,
                 int pid_socket,
                 int uuid_socket,
@@ -2814,9 +3143,9 @@ static int outer_child(
                 int netns_fd) {
 
         _cleanup_close_ int fd = -1;
-        int r, which_failed;
         pid_t pid;
         ssize_t l;
+        int r;
 
         /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
          * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
@@ -2825,18 +3154,21 @@ static int outer_child(
 
         assert(barrier);
         assert(directory);
-        assert(console);
         assert(pid_socket >= 0);
         assert(uuid_socket >= 0);
         assert(notify_socket >= 0);
         assert(kmsg_socket >= 0);
 
+        log_debug("Outer child is initializing.");
+
         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
                 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
 
-        if (interactive) {
+        if (arg_console_mode != CONSOLE_PIPE) {
                 int terminal;
 
+                assert(console);
+
                 terminal = open_terminal(console, O_RDWR);
                 if (terminal < 0)
                         return log_error_errno(terminal, "Failed to open console: %m");
@@ -2926,7 +3258,7 @@ static int outer_child(
                         arg_userns_mode != USER_NAMESPACE_NO,
                         arg_uid_shift,
                         arg_uid_range,
-                        arg_selinux_context);
+                        arg_selinux_apifs_context);
         if (r < 0)
                 return r;
 
@@ -2957,7 +3289,7 @@ static int outer_child(
 
         /* Mark everything as shared so our mounts get propagated down. This is
          * required to make new bind mounts available in systemd services
-         * inside the containter that create a new mount namespace.
+         * inside the container that create a new mount namespace.
          * See https://github.com/systemd/systemd/issues/3860
          * Further submounts (such as /dev) done after this will inherit the
          * shared propagation mode. */
@@ -2974,7 +3306,7 @@ static int outer_child(
                 return r;
 
         if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
-                r = bind_remount_recursive(directory, true, NULL);
+                r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Failed to make tree read-only: %m");
         }
@@ -2990,7 +3322,12 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        dev_setup(directory, arg_uid_shift, arg_uid_shift);
+        r = make_extra_nodes(directory);
+        if (r < 0)
+                return r;
+
+        (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
+        (void) make_inaccessible_nodes(directory, arg_uid_shift, arg_uid_shift);
 
         r = setup_pts(directory);
         if (r < 0)
@@ -3008,10 +3345,6 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
-        if (r < 0)
-                return r;
-
         r = setup_timezone(directory);
         if (r < 0)
                 return r;
@@ -3035,7 +3368,8 @@ static int outer_child(
                         arg_userns_mode != USER_NAMESPACE_NO,
                         arg_uid_shift,
                         arg_uid_range,
-                        arg_selinux_apifs_context);
+                        arg_selinux_apifs_context,
+                        false);
         if (r < 0)
                 return r;
 
@@ -3060,10 +3394,6 @@ static int outer_child(
         if (fd < 0)
                 return fd;
 
-        r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
-        if (r < 0)
-                return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
-
         pid = raw_clone(SIGCHLD|CLONE_NEWNS|
                         arg_clone_ns_flags |
                         (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
@@ -3277,11 +3607,11 @@ static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t r
                 return log_oom();
 
         if (strv_find(tags, "READY=1"))
-                sd_notifyf(false, "READY=1\n");
+                (void) sd_notifyf(false, "READY=1\n");
 
         p = strv_find_startswith(tags, "STATUS=");
         if (p)
-                sd_notifyf(false, "STATUS=Container running: %s", p);
+                (void) sd_notifyf(false, "STATUS=Container running: %s", p);
 
         return 0;
 }
@@ -3316,6 +3646,15 @@ static int merge_settings(Settings *settings, const char *path) {
         if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
                 arg_ephemeral = settings->ephemeral;
 
+        if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
+            settings->root) {
+
+                if (!arg_settings_trusted)
+                        log_warning("Ignoring root directory setting, file %s is not trusted.", path);
+                else
+                        free_and_replace(arg_directory, settings->root);
+        }
+
         if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
             settings->pivot_root_new) {
                 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
@@ -3330,16 +3669,36 @@ static int merge_settings(Settings *settings, const char *path) {
             settings->environment)
                 strv_free_and_replace(arg_setenv, settings->environment);
 
-        if ((arg_settings_mask & SETTING_USER) == 0 &&
-            settings->user)
-                free_and_replace(arg_user, settings->user);
+        if ((arg_settings_mask & SETTING_USER) == 0) {
+
+                if (settings->user)
+                        free_and_replace(arg_user, settings->user);
+
+                if (uid_is_valid(settings->uid))
+                        arg_uid = settings->uid;
+                if (gid_is_valid(settings->gid))
+                        arg_gid = settings->gid;
+                if (settings->n_supplementary_gids > 0) {
+                        free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
+                        arg_n_supplementary_gids = settings->n_supplementary_gids;
+                }
+        }
 
         if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
-                uint64_t plus;
+                uint64_t plus, minus;
+
+                /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
+                 * Settings structure */
 
                 plus = settings->capability;
-                if (settings_private_network(settings))
-                        plus |= (1ULL << CAP_NET_ADMIN);
+                minus = settings->drop_capability;
+
+                if ((arg_settings_mask & SETTING_NETWORK) == 0) {
+                        if (settings_private_network(settings))
+                                plus |= UINT64_C(1) << CAP_NET_ADMIN;
+                        else
+                                minus |= UINT64_C(1) << CAP_NET_ADMIN;
+                }
 
                 if (!arg_settings_trusted && plus != 0) {
                         if (settings->capability != 0)
@@ -3347,7 +3706,15 @@ static int merge_settings(Settings *settings, const char *path) {
                 } else
                         arg_caps_retain |= plus;
 
-                arg_caps_retain &= ~settings->drop_capability;
+                arg_caps_retain &= ~minus;
+
+                /* Copy the full capabilities over too */
+                if (capability_quintet_is_set(&settings->full_capabilities)) {
+                        if (!arg_settings_trusted)
+                                log_warning("Ignoring capability settings, file %s is not trusted.", path);
+                        else
+                                arg_full_capabilities = settings->full_capabilities;
+                }
         }
 
         if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
@@ -3396,7 +3763,8 @@ static int merge_settings(Settings *settings, const char *path) {
              settings->network_interfaces ||
              settings->network_macvlan ||
              settings->network_ipvlan ||
-             settings->network_veth_extra)) {
+             settings->network_veth_extra ||
+             settings->network_namespace_path)) {
 
                 if (!arg_settings_trusted)
                         log_warning("Ignoring network settings, file %s is not trusted.", path);
@@ -3411,6 +3779,8 @@ static int merge_settings(Settings *settings, const char *path) {
 
                         free_and_replace(arg_network_bridge, settings->network_bridge);
                         free_and_replace(arg_network_zone, settings->network_zone);
+
+                        free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
                 }
         }
 
@@ -3443,12 +3813,21 @@ static int merge_settings(Settings *settings, const char *path) {
 
         if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
 
-                if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
+                if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
                         log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
                 else {
                         strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
                         strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
                 }
+
+#if HAVE_SECCOMP
+                if (!arg_settings_trusted && settings->seccomp)
+                        log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
+                else {
+                        seccomp_release(arg_seccomp);
+                        arg_seccomp = TAKE_PTR(settings->seccomp);
+                }
+#endif
         }
 
         for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
@@ -3486,15 +3865,14 @@ static int merge_settings(Settings *settings, const char *path) {
         }
 
         if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
-            settings->cpuset) {
+            settings->cpu_set.set) {
 
                 if (!arg_settings_trusted)
                         log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
                 else {
-                        if (arg_cpuset)
-                                CPU_FREE(arg_cpuset);
-                        arg_cpuset = TAKE_PTR(settings->cpuset);
-                        arg_cpuset_ncpus = settings->cpuset_ncpus;
+                        cpu_set_reset(&arg_cpu_set);
+                        arg_cpu_set = settings->cpu_set;
+                        settings->cpu_set = (CPUSet) {};
                 }
         }
 
@@ -3517,6 +3895,55 @@ static int merge_settings(Settings *settings, const char *path) {
             settings->timezone != _TIMEZONE_MODE_INVALID)
                 arg_timezone = settings->timezone;
 
+        if ((arg_settings_mask & SETTING_SLICE) == 0 &&
+            settings->slice) {
+
+                if (!arg_settings_trusted)
+                        log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
+                else
+                        free_and_replace(arg_slice, settings->slice);
+        }
+
+        if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
+            settings->use_cgns >= 0) {
+
+                if (!arg_settings_trusted)
+                        log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
+                else
+                        arg_use_cgns = settings->use_cgns;
+        }
+
+        if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
+            settings->clone_ns_flags != (unsigned long) -1) {
+
+                if (!arg_settings_trusted)
+                        log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
+                else
+                        arg_clone_ns_flags = settings->clone_ns_flags;
+        }
+
+        if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
+            settings->console_mode >= 0) {
+
+                if (!arg_settings_trusted)
+                        log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
+                else
+                        arg_console_mode = settings->console_mode;
+        }
+
+        /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
+         * don't consult arg_settings_mask for them. */
+
+        sd_bus_message_unref(arg_property_message);
+        arg_property_message = TAKE_PTR(settings->properties);
+
+        arg_console_width = settings->console_width;
+        arg_console_height = settings->console_height;
+
+        device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
+        arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
+        arg_n_extra_nodes = settings->n_extra_nodes;
+
         return 0;
 }
 
@@ -3527,6 +3954,9 @@ static int load_settings(void) {
         const char *fn, *i;
         int r;
 
+        if (arg_oci_bundle)
+                return 0;
+
         /* If all settings are masked, there's no point in looking for
          * the settings file */
         if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
@@ -3594,10 +4024,27 @@ static int load_settings(void) {
         return merge_settings(settings, p);
 }
 
-static int run(int master,
+static int load_oci_bundle(void) {
+        _cleanup_(settings_freep) Settings *settings = NULL;
+        int r;
+
+        if (!arg_oci_bundle)
+                return 0;
+
+        /* By default let's trust OCI bundles */
+        if (arg_settings_trusted < 0)
+                arg_settings_trusted = true;
+
+        r = oci_load(NULL, arg_oci_bundle, &settings);
+        if (r < 0)
+                return r;
+
+        return merge_settings(settings, arg_oci_bundle);
+}
+
+static int run_container(int master,
                const char* console,
                DissectedImage *dissected_image,
-               bool interactive,
                bool secondary,
                FDSet *fds,
                char veth_name[IFNAMSIZ], bool *veth_created,
@@ -3628,7 +4075,6 @@ static int run(int master,
         _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
         _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
         ContainerStatus container_status = 0;
-        char last_char = 0;
         int ifi = 0, r;
         ssize_t l;
         sigset_t mask_chld;
@@ -3697,10 +4143,9 @@ static int run(int master,
                         log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
                 else if (r < 0)
                         return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
-                else if (r == 0) {
-                        log_error("Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
-                        return -EINVAL;
-                }
+                else if (r == 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
         }
 
         *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
@@ -3730,7 +4175,6 @@ static int run(int master,
                                 arg_directory,
                                 console,
                                 dissected_image,
-                                interactive,
                                 secondary,
                                 pid_socket_pair[1],
                                 uuid_socket_pair[1],
@@ -3749,7 +4193,7 @@ static int run(int master,
 
         barrier_set_role(&barrier, BARRIER_PARENT);
 
-        fds = fdset_free(fds);
+        fdset_close(fds);
 
         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
@@ -3764,10 +4208,8 @@ static int run(int master,
                 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
                 if (l < 0)
                         return log_error_errno(errno, "Failed to read UID shift: %m");
-                if (l != sizeof arg_uid_shift) {
-                        log_error("Short read while reading UID shift.");
-                        return -EIO;
-                }
+                if (l != sizeof arg_uid_shift)
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
 
                 if (arg_userns_mode == USER_NAMESPACE_PICK) {
                         /* If we are supposed to pick the UID shift, let's try to use the shift read from the
@@ -3781,10 +4223,8 @@ static int run(int master,
                         l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
                         if (l < 0)
                                 return log_error_errno(errno, "Failed to send UID shift: %m");
-                        if (l != sizeof arg_uid_shift) {
-                                log_error("Short write while writing UID shift.");
-                                return -EIO;
-                        }
+                        if (l != sizeof arg_uid_shift)
+                                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
                 }
         }
 
@@ -3793,11 +4233,9 @@ static int run(int master,
                 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
                 if (l < 0)
                         return log_error_errno(errno, "Failed to read cgroup mode: %m");
-                if (l != sizeof(arg_unified_cgroup_hierarchy)) {
-                        log_error("Short read while reading cgroup mode (%zu bytes).%s",
-                                  l, l == 0 ? " The child is most likely dead." : "");
-                        return -EIO;
-                }
+                if (l != sizeof(arg_unified_cgroup_hierarchy))
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
+                                               l, l == 0 ? " The child is most likely dead." : "");
         }
 
         /* Wait for the outer child. */
@@ -3811,19 +4249,15 @@ static int run(int master,
         l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
         if (l < 0)
                 return log_error_errno(errno, "Failed to read inner child PID: %m");
-        if (l != sizeof *pid) {
-                log_error("Short read while reading inner child PID.");
-                return -EIO;
-        }
+        if (l != sizeof *pid)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
 
         /* We also retrieve container UUID in case it was generated by outer child */
         l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
         if (l < 0)
                 return log_error_errno(errno, "Failed to read container machine ID: %m");
-        if (l != sizeof(arg_uuid)) {
-                log_error("Short read while reading container machined ID.");
-                return -EIO;
-        }
+        if (l != sizeof(arg_uuid))
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
 
         /* We also retrieve the socket used for notifications generated by outer child */
         notify_socket = receive_one_fd(notify_socket_pair[0], 0);
@@ -3834,10 +4268,8 @@ static int run(int master,
         log_debug("Init process invoked as PID "PID_FMT, *pid);
 
         if (arg_userns_mode != USER_NAMESPACE_NO) {
-                if (!barrier_place_and_sync(&barrier)) { /* #1 */
-                        log_error("Child died too early.");
-                        return -ESRCH;
-                }
+                if (!barrier_place_and_sync(&barrier)) /* #1 */
+                        return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
 
                 r = setup_uid_map(*pid);
                 if (r < 0)
@@ -3849,10 +4281,8 @@ static int run(int master,
         if (arg_private_network) {
                 if (!arg_network_namespace_path) {
                         /* Wait until the child has unshared its network namespace. */
-                        if (!barrier_place_and_sync(&barrier)) { /* #3 */
-                                log_error("Child died too early");
-                                return -ESRCH;
-                        }
+                        if (!barrier_place_and_sync(&barrier)) /* #3 */
+                                return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
                 }
 
                 r = move_network_interfaces(*pid, arg_network_interfaces);
@@ -3942,6 +4372,7 @@ static int run(int master,
                                 arg_custom_mounts, arg_n_custom_mounts,
                                 arg_kill_signal,
                                 arg_property,
+                                arg_property_message,
                                 arg_keep_unit,
                                 arg_container_service_name);
                 if (r < 0)
@@ -3955,7 +4386,8 @@ static int run(int master,
                                 arg_slice,
                                 arg_custom_mounts, arg_n_custom_mounts,
                                 arg_kill_signal,
-                                arg_property);
+                                arg_property,
+                                arg_property_message);
                 if (r < 0)
                         return r;
 
@@ -4006,18 +4438,16 @@ static int run(int master,
                 return r;
 
         /* Let the child know that we are ready and wait that the child is completely ready now. */
-        if (!barrier_place_and_sync(&barrier)) { /* #5 */
-                log_error("Child died too early.");
-                return -ESRCH;
-        }
+        if (!barrier_place_and_sync(&barrier)) /* #5 */
+                return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
 
         /* At this point we have made use of the UID we picked, and thus nss-mymachines
          * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
         etc_passwd_lock = safe_close(etc_passwd_lock);
 
-        sd_notifyf(false,
-                   "STATUS=Container running.\n"
-                   "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
+        (void) sd_notifyf(false,
+                          "STATUS=Container running.\n"
+                          "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
         if (!arg_notify_ready)
                 (void) sd_notify(false, "READY=1\n");
 
@@ -4044,22 +4474,32 @@ static int run(int master,
 
         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
 
-        r = pty_forward_new(event, master,
-                            PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
-                            &forward);
-        if (r < 0)
-                return log_error_errno(r, "Failed to create PTY forwarder: %m");
+        if (IN_SET(arg_console_mode, CONSOLE_INTERACTIVE, CONSOLE_READ_ONLY)) {
+                assert(master >= 0);
+
+                r = pty_forward_new(event, master,
+                                    PTY_FORWARD_IGNORE_VHANGUP | (arg_console_mode == CONSOLE_READ_ONLY ? PTY_FORWARD_READ_ONLY : 0),
+                                    &forward);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to create PTY forwarder: %m");
+
+                if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
+                        (void) pty_forward_set_width_height(forward, arg_console_width, arg_console_height);
+        }
 
         r = sd_event_loop(event);
         if (r < 0)
                 return log_error_errno(r, "Failed to run event loop: %m");
 
-        pty_forward_get_last_char(forward, &last_char);
+        if (forward) {
+                char last_char = 0;
 
-        forward = pty_forward_free(forward);
+                (void) pty_forward_get_last_char(forward, &last_char);
+                forward = pty_forward_free(forward);
 
-        if (!arg_quiet && last_char != '\n')
-                putc('\n', stdout);
+                if (!arg_quiet && last_char != '\n')
+                        putc('\n', stdout);
+        }
 
         /* Kill if it is not dead yet anyway */
         if (bus) {
@@ -4173,7 +4613,7 @@ static int initialize_rlimits(void) {
         return 0;
 }
 
-int main(int argc, char *argv[]) {
+static int run(int argc, char *argv[]) {
         _cleanup_free_ char *console = NULL;
         _cleanup_close_ int master = -1;
         _cleanup_fdset_free_ FDSet *fds = NULL;
@@ -4183,7 +4623,7 @@ int main(int argc, char *argv[]) {
         pid_t pid = 0;
         union in_addr_union exposed = {};
         _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
-        bool interactive, veth_created = false, remove_tmprootdir = false;
+        bool veth_created = false, remove_tmprootdir = false;
         char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
         _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
         _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
@@ -4192,10 +4632,6 @@ int main(int argc, char *argv[]) {
         log_parse_environment();
         log_open();
 
-        /* Make sure rename_process() in the stub init process can work */
-        saved_argv = argv;
-        saved_argc = argc;
-
         r = parse_argv(argc, argv);
         if (r <= 0)
                 goto finish;
@@ -4208,6 +4644,10 @@ int main(int argc, char *argv[]) {
         if (r < 0)
                 goto finish;
 
+        r = load_oci_bundle();
+        if (r < 0)
+                goto finish;
+
         r = determine_names();
         if (r < 0)
                 goto finish;
@@ -4216,8 +4656,6 @@ int main(int argc, char *argv[]) {
         if (r < 0)
                 goto finish;
 
-        parse_environment();
-
         r = cg_unified_flush();
         if (r < 0) {
                 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
@@ -4400,7 +4838,7 @@ int main(int argc, char *argv[]) {
                                 goto finish;
                         }
 
-                        r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
+                        r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
                         if (r < 0) {
                                 r = log_error_errno(r, "Failed to copy image file: %m");
                                 goto finish;
@@ -4483,31 +4921,37 @@ int main(int argc, char *argv[]) {
         if (r < 0)
                 goto finish;
 
-        interactive =
-                isatty(STDIN_FILENO) > 0 &&
-                isatty(STDOUT_FILENO) > 0;
+        if (arg_console_mode < 0)
+                arg_console_mode =
+                        isatty(STDIN_FILENO) > 0 &&
+                        isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
 
-        master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
-        if (master < 0) {
-                r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
-                goto finish;
-        }
+        if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
+                arg_quiet = true;
 
-        r = ptsname_malloc(master, &console);
-        if (r < 0) {
-                r = log_error_errno(r, "Failed to determine tty name: %m");
-                goto finish;
-        }
+        if (arg_console_mode != CONSOLE_PIPE) {
+                master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
+                if (master < 0) {
+                        r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
+                        goto finish;
+                }
 
-        if (arg_selinux_apifs_context) {
-                r = mac_selinux_apply(console, arg_selinux_apifs_context);
-                if (r < 0)
+                r = ptsname_malloc(master, &console);
+                if (r < 0) {
+                        r = log_error_errno(r, "Failed to determine tty name: %m");
                         goto finish;
-        }
+                }
 
-        if (unlockpt(master) < 0) {
-                r = log_error_errno(errno, "Failed to unlock tty: %m");
-                goto finish;
+                if (arg_selinux_apifs_context) {
+                        r = mac_selinux_apply(console, arg_selinux_apifs_context);
+                        if (r < 0)
+                                goto finish;
+                }
+
+                if (unlockpt(master) < 0) {
+                        r = log_error_errno(errno, "Failed to unlock tty: %m");
+                        goto finish;
+                }
         }
 
         if (!arg_quiet)
@@ -4522,22 +4966,22 @@ int main(int argc, char *argv[]) {
         }
 
         for (;;) {
-                r = run(master,
-                        console,
-                        dissected_image,
-                        interactive, secondary,
-                        fds,
-                        veth_name, &veth_created,
-                        &exposed,
-                        &pid, &ret);
+                r = run_container(master,
+                                  console,
+                                  dissected_image,
+                                  secondary,
+                                  fds,
+                                  veth_name, &veth_created,
+                                  &exposed,
+                                  &pid, &ret);
                 if (r <= 0)
                         break;
         }
 
 finish:
-        sd_notify(false,
-                  r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
-                                                        "STOPPING=1\nSTATUS=Terminating...");
+        (void) sd_notify(false,
+                         r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
+                                                               "STOPPING=1\nSTATUS=Terminating...");
 
         if (pid > 0)
                 (void) kill(pid, SIGKILL);
@@ -4584,32 +5028,15 @@ finish:
                 (void) remove_veth_links(veth_name, arg_network_veth_extra);
         (void) remove_bridge(arg_network_zone);
 
-        free(arg_directory);
-        free(arg_template);
-        free(arg_image);
-        free(arg_machine);
-        free(arg_hostname);
-        free(arg_user);
-        free(arg_pivot_root_new);
-        free(arg_pivot_root_old);
-        free(arg_chdir);
-        strv_free(arg_setenv);
-        free(arg_network_bridge);
-        strv_free(arg_network_interfaces);
-        strv_free(arg_network_macvlan);
-        strv_free(arg_network_ipvlan);
-        strv_free(arg_network_veth_extra);
-        strv_free(arg_parameters);
-        free(arg_network_zone);
-        free(arg_network_namespace_path);
-        strv_free(arg_property);
         custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
         expose_port_free_all(arg_expose_ports);
-        free(arg_root_hash);
         rlimit_free_all(arg_rlimit);
-        strv_free(arg_syscall_whitelist);
-        strv_free(arg_syscall_blacklist);
-        arg_cpuset = cpu_set_mfree(arg_cpuset);
+        device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
 
-        return r < 0 ? EXIT_FAILURE : ret;
+        if (r < 0)
+                return r;
+
+        return ret;
 }
+
+DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);