]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/nspawn/nspawn.c
nspawn: make use of log_set_open_when_needed() in nspawn too
[thirdparty/systemd.git] / src / nspawn / nspawn.c
index 7169c115765eb01be7e09abcafa18cde54332e97..b1add0f1590b567362bc550d9602e4ab52dd7881 100644 (file)
@@ -43,6 +43,7 @@
 #include "capability-util.h"
 #include "cgroup-util.h"
 #include "copy.h"
+#include "cpu-set-util.h"
 #include "dev-setup.h"
 #include "dissect-image.h"
 #include "env-util.h"
 #include "nspawn-settings.h"
 #include "nspawn-setuid.h"
 #include "nspawn-stub-pid1.h"
+#include "pager.h"
 #include "parse-util.h"
 #include "path-util.h"
 #include "process-util.h"
 #include "ptyfwd.h"
 #include "random-util.h"
 #include "raw-clone.h"
+#include "rlimit-util.h"
 #include "rm-rf.h"
 #include "selinux-util.h"
 #include "signal-util.h"
 #include "socket-util.h"
 #include "stat-util.h"
 #include "stdio-util.h"
+#include "string-table.h"
 #include "string-util.h"
 #include "strv.h"
 #include "terminal-util.h"
@@ -127,7 +131,8 @@ static char *arg_pivot_root_new = NULL;
 static char *arg_pivot_root_old = NULL;
 static char *arg_user = NULL;
 static sd_id128_t arg_uuid = {};
-static char *arg_machine = NULL;
+static char *arg_machine = NULL;     /* The name used by the host to refer to this */
+static char *arg_hostname = NULL;    /* The name the payload sees by default */
 static const char *arg_selinux_context = NULL;
 static const char *arg_selinux_apifs_context = NULL;
 static const char *arg_slice = NULL;
@@ -165,7 +170,7 @@ static uint64_t arg_caps_retain =
         (1ULL << CAP_SYS_RESOURCE) |
         (1ULL << CAP_SYS_TTY_CONFIG);
 static CustomMount *arg_custom_mounts = NULL;
-static unsigned arg_n_custom_mounts = 0;
+static size_t arg_n_custom_mounts = 0;
 static char **arg_setenv = NULL;
 static bool arg_quiet = false;
 static bool arg_register = true;
@@ -200,8 +205,17 @@ static void *arg_root_hash = NULL;
 static size_t arg_root_hash_size = 0;
 static char **arg_syscall_whitelist = NULL;
 static char **arg_syscall_blacklist = NULL;
+static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
+static bool arg_no_new_privileges = false;
+static int arg_oom_score_adjust = 0;
+static bool arg_oom_score_adjust_set = false;
+static cpu_set_t *arg_cpuset = NULL;
+static unsigned arg_cpuset_ncpus = 0;
 
 static void help(void) {
+
+        (void) pager_open(false, false);
+
         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
                "  -h --help                 Show this help\n"
@@ -221,6 +235,7 @@ static void help(void) {
                "                            Pivot root to given directory in the container\n"
                "  -u --user=USER            Run the command under specified user or uid\n"
                "  -M --machine=NAME         Set the machine name for the container\n"
+               "     --hostname=NAME        Override the hostname for the container\n"
                "     --uuid=UUID            Set a specific machine UUID for the container\n"
                "  -S --slice=SLICE          Place the container in the specified slice\n"
                "     --property=NAME=VALUE  Set scope unit property\n"
@@ -264,6 +279,10 @@ static void help(void) {
                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
                "     --system-call-filter=LIST|~LIST\n"
                "                            Permit/prohibit specific system calls\n"
+               "     --rlimit=NAME=LIMIT    Set a resource limit for the payload\n"
+               "     --oom-score-adjust=VALUE\n"
+               "                            Adjust the OOM score value for the payload\n"
+               "     --cpu-affinity=CPUS    Adjust the CPU affinity of the container\n"
                "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, \n"
                "                            host, try-guest, try-host\n"
@@ -291,7 +310,7 @@ static void help(void) {
 }
 
 static int custom_mount_check_all(void) {
-        unsigned i;
+        size_t i;
 
         for (i = 0; i < arg_n_custom_mounts; i++) {
                 CustomMount *m = &arg_custom_mounts[i];
@@ -439,6 +458,11 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_NOTIFY_READY,
                 ARG_ROOT_HASH,
                 ARG_SYSTEM_CALL_FILTER,
+                ARG_RLIMIT,
+                ARG_HOSTNAME,
+                ARG_NO_NEW_PRIVILEGES,
+                ARG_OOM_SCORE_ADJUST,
+                ARG_CPU_AFFINITY,
         };
 
         static const struct option options[] = {
@@ -455,6 +479,7 @@ static int parse_argv(int argc, char *argv[]) {
                 { "read-only",              no_argument,       NULL, ARG_READ_ONLY              },
                 { "capability",             required_argument, NULL, ARG_CAPABILITY             },
                 { "drop-capability",        required_argument, NULL, ARG_DROP_CAPABILITY        },
+                { "no-new-privileges",      required_argument, NULL, ARG_NO_NEW_PRIVILEGES      },
                 { "link-journal",           required_argument, NULL, ARG_LINK_JOURNAL           },
                 { "bind",                   required_argument, NULL, ARG_BIND                   },
                 { "bind-ro",                required_argument, NULL, ARG_BIND_RO                },
@@ -462,6 +487,7 @@ static int parse_argv(int argc, char *argv[]) {
                 { "overlay",                required_argument, NULL, ARG_OVERLAY                },
                 { "overlay-ro",             required_argument, NULL, ARG_OVERLAY_RO             },
                 { "machine",                required_argument, NULL, 'M'                        },
+                { "hostname",               required_argument, NULL, ARG_HOSTNAME               },
                 { "slice",                  required_argument, NULL, 'S'                        },
                 { "setenv",                 required_argument, NULL, 'E'                        },
                 { "selinux-context",        required_argument, NULL, 'Z'                        },
@@ -492,6 +518,9 @@ static int parse_argv(int argc, char *argv[]) {
                 { "notify-ready",           required_argument, NULL, ARG_NOTIFY_READY           },
                 { "root-hash",              required_argument, NULL, ARG_ROOT_HASH              },
                 { "system-call-filter",     required_argument, NULL, ARG_SYSTEM_CALL_FILTER     },
+                { "rlimit",                 required_argument, NULL, ARG_RLIMIT                 },
+                { "oom-score-adjust",       required_argument, NULL, ARG_OOM_SCORE_ADJUST       },
+                { "cpu-affinity",           required_argument, NULL, ARG_CPU_AFFINITY           },
                 {}
         };
 
@@ -696,6 +725,23 @@ static int parse_argv(int argc, char *argv[]) {
                         }
                         break;
 
+                case ARG_HOSTNAME:
+                        if (isempty(optarg))
+                                arg_hostname = mfree(arg_hostname);
+                        else {
+                                if (!hostname_is_valid(optarg, false)) {
+                                        log_error("Invalid hostname: %s", optarg);
+                                        return -EINVAL;
+                                }
+
+                                r = free_and_strdup(&arg_hostname, optarg);
+                                if (r < 0)
+                                        return log_oom();
+                        }
+
+                        arg_settings_mask |= SETTING_HOSTNAME;
+                        break;
+
                 case 'Z':
                         arg_selinux_context = optarg;
                         break;
@@ -747,6 +793,15 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
                 }
 
+                case ARG_NO_NEW_PRIVILEGES:
+                        r = parse_boolean(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
+
+                        arg_no_new_privileges = r;
+                        arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
+                        break;
+
                 case 'j':
                         arg_link_journal = LINK_GUEST;
                         arg_link_journal_try = true;
@@ -818,9 +873,7 @@ static int parse_argv(int argc, char *argv[]) {
                         if (!n)
                                 return log_oom();
 
-                        strv_free(arg_setenv);
-                        arg_setenv = n;
-
+                        strv_free_and_replace(arg_setenv, n);
                         arg_settings_mask |= SETTING_ENVIRONMENT;
                         break;
                 }
@@ -864,7 +917,10 @@ static int parse_argv(int argc, char *argv[]) {
 
                         if (!optarg)
                                 arg_volatile_mode = VOLATILE_YES;
-                        else {
+                        else if (streq(optarg, "help")) {
+                                DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
+                                return 0;
+                        } else {
                                 VolatileMode m;
 
                                 m = volatile_mode_from_string(optarg);
@@ -972,7 +1028,12 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
 
                 case ARG_KILL_SIGNAL:
-                        arg_kill_signal = signal_from_string_try_harder(optarg);
+                        if (streq(optarg, "help")) {
+                                DUMP_STRING_TABLE(signal, int, _NSIG);
+                                return 0;
+                        }
+
+                        arg_kill_signal = signal_from_string(optarg);
                         if (arg_kill_signal < 0) {
                                 log_error("Cannot parse signal: %s", optarg);
                                 return -EINVAL;
@@ -1096,6 +1157,71 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
                 }
 
+                case ARG_RLIMIT: {
+                        const char *eq;
+                        char *name;
+                        int rl;
+
+                        if (streq(optarg, "help")) {
+                                DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
+                                return 0;
+                        }
+
+                        eq = strchr(optarg, '=');
+                        if (!eq) {
+                                log_error("--rlimit= expects an '=' assignment.");
+                                return -EINVAL;
+                        }
+
+                        name = strndup(optarg, eq - optarg);
+                        if (!name)
+                                return log_oom();
+
+                        rl = rlimit_from_string_harder(name);
+                        if (rl < 0) {
+                                log_error("Unknown resource limit: %s", name);
+                                return -EINVAL;
+                        }
+
+                        if (!arg_rlimit[rl]) {
+                                arg_rlimit[rl] = new0(struct rlimit, 1);
+                                if (!arg_rlimit[rl])
+                                        return log_oom();
+                        }
+
+                        r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
+
+                        arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
+                        break;
+                }
+
+                case ARG_OOM_SCORE_ADJUST:
+                        r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
+
+                        arg_oom_score_adjust_set = true;
+                        arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
+                        break;
+
+                case ARG_CPU_AFFINITY: {
+                        _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
+
+                        r = parse_cpu_set(optarg, &cpuset);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
+
+                        if (arg_cpuset)
+                                CPU_FREE(arg_cpuset);
+
+                        arg_cpuset = TAKE_PTR(cpuset);
+                        arg_cpuset_ncpus = r;
+                        arg_settings_mask |= SETTING_CPU_AFFINITY;
+                        break;
+                }
+
                 case '?':
                         return -EINVAL;
 
@@ -1469,32 +1595,36 @@ static int setup_resolv_conf(const char *dest) {
         return 0;
 }
 
-static int setup_boot_id(const char *dest) {
+static int setup_boot_id(void) {
+        _cleanup_(unlink_and_freep) char *from = NULL;
+        _cleanup_free_ char *path = NULL;
         sd_id128_t rnd = SD_ID128_NULL;
-        const char *from, *to;
+        const char *to;
         int r;
 
         /* Generate a new randomized boot ID, so that each boot-up of
          * the container gets a new one */
 
-        from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
-        to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
+        r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate random boot ID path: %m");
 
         r = sd_id128_randomize(&rnd);
         if (r < 0)
                 return log_error_errno(r, "Failed to generate random boot id: %m");
 
-        r = id128_write(from, ID128_UUID, rnd, false);
+        r = id128_write(path, ID128_UUID, rnd, false);
         if (r < 0)
                 return log_error_errno(r, "Failed to write boot id: %m");
 
+        from = TAKE_PTR(path);
+        to = "/proc/sys/kernel/random/boot_id";
+
         r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
-        if (r >= 0)
-                r = mount_verbose(LOG_ERR, NULL, to, NULL,
-                                  MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
+        if (r < 0)
+                return r;
 
-        (void) unlink(from);
-        return r;
+        return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
 }
 
 static int copy_devnodes(const char *dest) {
@@ -1661,27 +1791,33 @@ static int setup_keyring(void) {
         return 0;
 }
 
-static int setup_kmsg(const char *dest, int kmsg_socket) {
-        const char *from, *to;
+static int setup_kmsg(int kmsg_socket) {
+        _cleanup_(unlink_and_freep) char *from = NULL;
+        _cleanup_free_ char *fifo = NULL;
+        _cleanup_close_ int fd = -1;
         _cleanup_umask_ mode_t u;
-        int fd, r;
+        const char *to;
+        int r;
 
         assert(kmsg_socket >= 0);
 
         u = umask(0000);
 
-        /* We create the kmsg FIFO as /run/kmsg, but immediately
-         * delete it after bind mounting it to /proc/kmsg. While FIFOs
-         * on the reading side behave very similar to /proc/kmsg,
-         * their writing side behaves differently from /dev/kmsg in
-         * that writing blocks when nothing is reading. In order to
-         * avoid any problems with containers deadlocking due to this
-         * we simply make /dev/kmsg unavailable to the container. */
-        from = prefix_roota(dest, "/run/kmsg");
-        to = prefix_roota(dest, "/proc/kmsg");
-
-        if (mkfifo(from, 0600) < 0)
+        /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
+         * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
+         * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
+         * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
+
+        r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate kmsg path: %m");
+
+        if (mkfifo(fifo, 0600) < 0)
                 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
+
+        from = TAKE_PTR(fifo);
+        to = "/proc/kmsg";
+
         r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
         if (r < 0)
                 return r;
@@ -1690,17 +1826,11 @@ static int setup_kmsg(const char *dest, int kmsg_socket) {
         if (fd < 0)
                 return log_error_errno(errno, "Failed to open fifo: %m");
 
-        /* Store away the fd in the socket, so that it stays open as
-         * long as we run the child */
+        /* Store away the fd in the socket, so that it stays open as long as we run the child */
         r = send_one_fd(kmsg_socket, fd, 0);
-        safe_close(fd);
-
         if (r < 0)
                 return log_error_errno(r, "Failed to send FIFO fd: %m");
 
-        /* And now make the FIFO unavailable as /run/kmsg... */
-        (void) unlink(from);
-
         return 0;
 }
 
@@ -1716,12 +1846,14 @@ static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *user
 }
 
 static int setup_hostname(void) {
+        int r;
 
         if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
                 return 0;
 
-        if (sethostname_idempotent(arg_machine) < 0)
-                return -errno;
+        r = sethostname_idempotent(arg_hostname ?: arg_machine);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set hostname: %m");
 
         return 0;
 }
@@ -1897,7 +2029,6 @@ static int reset_audit_loginuid(void) {
         return 0;
 }
 
-
 static int setup_propagate(const char *root) {
         const char *p, *q;
         int r;
@@ -2266,9 +2397,9 @@ static int inner_child(
 
         _cleanup_free_ char *home = NULL;
         char as_uuid[37];
-        unsigned n_env = 1;
+        size_t n_env = 1;
         const char *envp[] = {
-                "PATH=" DEFAULT_PATH_SPLIT_USR,
+                "PATH=" DEFAULT_PATH_COMPAT,
                 NULL, /* container */
                 NULL, /* TERM */
                 NULL, /* HOME */
@@ -2281,7 +2412,6 @@ static int inner_child(
                 NULL
         };
         const char *exec_target;
-
         _cleanup_strv_free_ char **env_use = NULL;
         int r;
 
@@ -2352,11 +2482,11 @@ static int inner_child(
                         return r;
         }
 
-        r = setup_boot_id(NULL);
+        r = setup_boot_id();
         if (r < 0)
                 return r;
 
-        r = setup_kmsg(NULL, kmsg_socket);
+        r = setup_kmsg(kmsg_socket);
         if (r < 0)
                 return r;
         kmsg_socket = safe_close(kmsg_socket);
@@ -2376,11 +2506,21 @@ static int inner_child(
                 rtnl_socket = safe_close(rtnl_socket);
         }
 
+        if (arg_oom_score_adjust_set) {
+                r = set_oom_score_adjust(arg_oom_score_adjust);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to adjust OOM score: %m");
+        }
+
+        if (arg_cpuset)
+                if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
+                        return log_error_errno(errno, "Failed to set CPU affinity: %m");
+
         r = drop_capabilities();
         if (r < 0)
                 return log_error_errno(r, "drop_capabilities() failed: %m");
 
-        setup_hostname();
+        (void) setup_hostname();
 
         if (arg_personality != PERSONALITY_INVALID) {
                 r = safe_personality(arg_personality);
@@ -2402,6 +2542,10 @@ static int inner_child(
         if (r < 0)
                 return r;
 
+        if (arg_no_new_privileges)
+                if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
+                        return log_error_errno(errno, "Failed to disable new privileges: %m");
+
         /* LXC sets container=lxc, so follow the scheme here */
         envp[n_env++] = strjoina("container=", arg_container_service_name);
 
@@ -2453,15 +2597,12 @@ static int inner_child(
                         return r;
         }
 
-        /* Now, explicitly close the log, so that we
-         * then can close all remaining fds. Closing
-         * the log explicitly first has the benefit
-         * that the logging subsystem knows about it,
-         * and is thus ready to be reopened should we
-         * need it again. Note that the other fds
-         * closed here are at least the locking and
-         * barrier fds. */
+        /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
+         * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
+         * it again. Note that the other fds closed here are at least the locking and barrier fds. */
         log_close();
+        log_set_open_when_needed(true);
+
         (void) fdset_close_others(fds);
 
         if (arg_start_mode == START_BOOT) {
@@ -2499,9 +2640,7 @@ static int inner_child(
                 exec_target = "/bin/bash, /bin/sh";
         }
 
-        r = -errno;
-        (void) log_open();
-        return log_error_errno(r, "execv(%s) failed: %m", exec_target);
+        return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
 }
 
 static int setup_sd_notify_child(void) {
@@ -2558,10 +2697,10 @@ static int outer_child(
                 FDSet *fds,
                 int netns_fd) {
 
+        _cleanup_close_ int fd = -1;
+        int r, which_failed;
         pid_t pid;
         ssize_t l;
-        int r;
-        _cleanup_close_ int fd = -1;
 
         assert(barrier);
         assert(directory);
@@ -2804,6 +2943,10 @@ static int outer_child(
         if (fd < 0)
                 return fd;
 
+        r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
+        if (r < 0)
+                return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
+
         pid = raw_clone(SIGCHLD|CLONE_NEWNS|
                         arg_clone_ns_flags |
                         (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
@@ -2879,7 +3022,7 @@ static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
 
         for (;;) {
                 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
-                _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
+                _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
 
                 if (--n_tries <= 0)
                         return -EBUSY;
@@ -3040,86 +3183,19 @@ static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pi
         return 0;
 }
 
-static int load_settings(void) {
-        _cleanup_(settings_freep) Settings *settings = NULL;
-        _cleanup_fclose_ FILE *f = NULL;
-        _cleanup_free_ char *p = NULL;
-        const char *fn, *i;
-        int r;
-
-        /* If all settings are masked, there's no point in looking for
-         * the settings file */
-        if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
-                return 0;
-
-        fn = strjoina(arg_machine, ".nspawn");
-
-        /* We first look in the admin's directories in /etc and /run */
-        FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
-                _cleanup_free_ char *j = NULL;
-
-                j = strjoin(i, "/", fn);
-                if (!j)
-                        return log_oom();
-
-                f = fopen(j, "re");
-                if (f) {
-                        p = TAKE_PTR(j);
-
-                        /* By default, we trust configuration from /etc and /run */
-                        if (arg_settings_trusted < 0)
-                                arg_settings_trusted = true;
-
-                        break;
-                }
-
-                if (errno != ENOENT)
-                        return log_error_errno(errno, "Failed to open %s: %m", j);
-        }
-
-        if (!f) {
-                /* After that, let's look for a file next to the
-                 * actual image we shall boot. */
+static int merge_settings(Settings *settings, const char *path) {
+        int rl;
 
-                if (arg_image) {
-                        p = file_in_same_dir(arg_image, fn);
-                        if (!p)
-                                return log_oom();
-                } else if (arg_directory) {
-                        p = file_in_same_dir(arg_directory, fn);
-                        if (!p)
-                                return log_oom();
-                }
+        assert(settings);
+        assert(path);
 
-                if (p) {
-                        f = fopen(p, "re");
-                        if (!f && errno != ENOENT)
-                                return log_error_errno(errno, "Failed to open %s: %m", p);
-
-                        /* By default, we do not trust configuration from /var/lib/machines */
-                        if (arg_settings_trusted < 0)
-                                arg_settings_trusted = false;
-                }
-        }
-
-        if (!f)
-                return 0;
-
-        log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
-
-        r = settings_load(f, p, &settings);
-        if (r < 0)
-                return r;
-
-        /* Copy over bits from the settings, unless they have been
-         * explicitly masked by command line switches. */
+        /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
+         * that this steals the fields of the Settings* structure, and hence modifies it. */
 
         if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
             settings->start_mode >= 0) {
                 arg_start_mode = settings->start_mode;
-
-                strv_free(arg_parameters);
-                arg_parameters = TAKE_PTR(settings->parameters);
+                strv_free_and_replace(arg_parameters, settings->parameters);
         }
 
         if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
@@ -3133,10 +3209,8 @@ static int load_settings(void) {
                 free_and_replace(arg_chdir, settings->working_directory);
 
         if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
-            settings->environment) {
-                strv_free(arg_setenv);
-                arg_setenv = TAKE_PTR(settings->environment);
-        }
+            settings->environment)
+                strv_free_and_replace(arg_setenv, settings->environment);
 
         if ((arg_settings_mask & SETTING_USER) == 0 &&
             settings->user)
@@ -3151,7 +3225,7 @@ static int load_settings(void) {
 
                 if (!arg_settings_trusted && plus != 0) {
                         if (settings->capability != 0)
-                                log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
+                                log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
                 } else
                         arg_caps_retain |= plus;
 
@@ -3170,7 +3244,7 @@ static int load_settings(void) {
             !sd_id128_is_null(settings->machine_id)) {
 
                 if (!arg_settings_trusted)
-                        log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
+                        log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
                 else
                         arg_uuid = settings->machine_id;
         }
@@ -3187,7 +3261,7 @@ static int load_settings(void) {
             settings->n_custom_mounts > 0) {
 
                 if (!arg_settings_trusted)
-                        log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
+                        log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
                 else {
                         custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
                         arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
@@ -3207,22 +3281,15 @@ static int load_settings(void) {
              settings->network_veth_extra)) {
 
                 if (!arg_settings_trusted)
-                        log_warning("Ignoring network settings, file %s is not trusted.", p);
+                        log_warning("Ignoring network settings, file %s is not trusted.", path);
                 else {
                         arg_network_veth = settings_network_veth(settings);
                         arg_private_network = settings_private_network(settings);
 
-                        strv_free(arg_network_interfaces);
-                        arg_network_interfaces = TAKE_PTR(settings->network_interfaces);
-
-                        strv_free(arg_network_macvlan);
-                        arg_network_macvlan = TAKE_PTR(settings->network_macvlan);
-
-                        strv_free(arg_network_ipvlan);
-                        arg_network_ipvlan = TAKE_PTR(settings->network_ipvlan);
-
-                        strv_free(arg_network_veth_extra);
-                        arg_network_veth_extra = TAKE_PTR(settings->network_veth_extra);
+                        strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
+                        strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
+                        strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
+                        strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
 
                         free_and_replace(arg_network_bridge, settings->network_bridge);
                         free_and_replace(arg_network_zone, settings->network_zone);
@@ -3233,7 +3300,7 @@ static int load_settings(void) {
             settings->expose_ports) {
 
                 if (!arg_settings_trusted)
-                        log_warning("Ignoring Port= setting, file %s is not trusted.", p);
+                        log_warning("Ignoring Port= setting, file %s is not trusted.", path);
                 else {
                         expose_port_free_all(arg_expose_ports);
                         arg_expose_ports = TAKE_PTR(settings->expose_ports);
@@ -3244,7 +3311,7 @@ static int load_settings(void) {
             settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
 
                 if (!arg_settings_trusted)
-                        log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
+                        log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
                 else {
                         arg_userns_mode = settings->userns_mode;
                         arg_uid_shift = settings->uid_shift;
@@ -3259,19 +3326,137 @@ static int load_settings(void) {
         if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
 
                 if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
-                        log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", p);
+                        log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
                 else {
-                        strv_free(arg_syscall_whitelist);
-                        strv_free(arg_syscall_blacklist);
+                        strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
+                        strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
+                }
+        }
+
+        for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
+                if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
+                        continue;
+
+                if (!settings->rlimit[rl])
+                        continue;
+
+                if (!arg_settings_trusted) {
+                        log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
+                        continue;
+                }
+
+                free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
+        }
+
+        if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
+            settings->hostname)
+                free_and_replace(arg_hostname, settings->hostname);
 
-                        arg_syscall_whitelist = TAKE_PTR(settings->syscall_whitelist);
-                        arg_syscall_blacklist = TAKE_PTR(settings->syscall_blacklist);
+        if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
+            settings->no_new_privileges >= 0)
+                arg_no_new_privileges = settings->no_new_privileges;
+
+        if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
+            settings->oom_score_adjust_set) {
+
+                if (!arg_settings_trusted)
+                        log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
+                else {
+                        arg_oom_score_adjust = settings->oom_score_adjust;
+                        arg_oom_score_adjust_set = true;
+                }
+        }
+
+        if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
+            settings->cpuset) {
+
+                if (!arg_settings_trusted)
+                        log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
+                else {
+                        if (arg_cpuset)
+                                CPU_FREE(arg_cpuset);
+                        arg_cpuset = TAKE_PTR(settings->cpuset);
+                        arg_cpuset_ncpus = settings->cpuset_ncpus;
                 }
         }
 
         return 0;
 }
 
+static int load_settings(void) {
+        _cleanup_(settings_freep) Settings *settings = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        _cleanup_free_ char *p = NULL;
+        const char *fn, *i;
+        int r;
+
+        /* If all settings are masked, there's no point in looking for
+         * the settings file */
+        if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
+                return 0;
+
+        fn = strjoina(arg_machine, ".nspawn");
+
+        /* We first look in the admin's directories in /etc and /run */
+        FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
+                _cleanup_free_ char *j = NULL;
+
+                j = strjoin(i, "/", fn);
+                if (!j)
+                        return log_oom();
+
+                f = fopen(j, "re");
+                if (f) {
+                        p = TAKE_PTR(j);
+
+                        /* By default, we trust configuration from /etc and /run */
+                        if (arg_settings_trusted < 0)
+                                arg_settings_trusted = true;
+
+                        break;
+                }
+
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Failed to open %s: %m", j);
+        }
+
+        if (!f) {
+                /* After that, let's look for a file next to the
+                 * actual image we shall boot. */
+
+                if (arg_image) {
+                        p = file_in_same_dir(arg_image, fn);
+                        if (!p)
+                                return log_oom();
+                } else if (arg_directory) {
+                        p = file_in_same_dir(arg_directory, fn);
+                        if (!p)
+                                return log_oom();
+                }
+
+                if (p) {
+                        f = fopen(p, "re");
+                        if (!f && errno != ENOENT)
+                                return log_error_errno(errno, "Failed to open %s: %m", p);
+
+                        /* By default, we do not trust configuration from /var/lib/machines */
+                        if (arg_settings_trusted < 0)
+                                arg_settings_trusted = false;
+                }
+        }
+
+        if (!f)
+                return 0;
+
+        log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
+
+        r = settings_load(f, p, &settings);
+        if (r < 0)
+                return r;
+
+        return merge_settings(settings, p);
+}
+
 static int run(int master,
                const char* console,
                DissectedImage *dissected_image,
@@ -3287,7 +3472,7 @@ static int run(int master,
                 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
         };
 
-        _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
+        _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
         _cleanup_close_ int etc_passwd_lock = -1;
         _cleanup_close_pair_ int
                 kmsg_socket_pair[2] = { -1, -1 },
@@ -3640,11 +3825,9 @@ static int run(int master,
         if (r < 0)
                 return r;
 
-        if (arg_keep_unit) {
-                r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
-                if (r < 0)
-                        return r;
-        }
+        r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
+        if (r < 0)
+                return r;
 
         r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
         if (r < 0)
@@ -3695,20 +3878,20 @@ static int run(int master,
                    "STATUS=Container running.\n"
                    "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
         if (!arg_notify_ready)
-                sd_notify(false, "READY=1\n");
+                (void) sd_notify(false, "READY=1\n");
 
         if (arg_kill_signal > 0) {
                 /* Try to kill the init system on SIGINT or SIGTERM */
-                sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
-                sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
+                (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
+                (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
         } else {
                 /* Immediately exit */
-                sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
-                sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
+                (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
+                (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
         }
 
         /* Exit when the child exits */
-        sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
+        (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
 
         if (arg_expose_ports) {
                 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
@@ -3782,6 +3965,71 @@ static int run(int master,
         return 1; /* loop again */
 }
 
+static int initialize_rlimits(void) {
+
+        /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
+         * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
+         * container execution environments. */
+
+        static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
+                [RLIMIT_AS]       = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_CORE]     = { 0,             RLIM_INFINITY },
+                [RLIMIT_CPU]      = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_DATA]     = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_FSIZE]    = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_LOCKS]    = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_MEMLOCK]  = { 65536,         65536         },
+                [RLIMIT_MSGQUEUE] = { 819200,        819200        },
+                [RLIMIT_NICE]     = { 0,             0             },
+                [RLIMIT_NOFILE]   = { 1024,          4096          },
+                [RLIMIT_RSS]      = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_RTPRIO]   = { 0,             0             },
+                [RLIMIT_RTTIME]   = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_STACK]    = { 8388608,       RLIM_INFINITY },
+
+                /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
+                 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
+                 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
+                 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
+                 * that PID 1 changes a number of other resource limits during early initialization which is why we
+                 * don't read the other limits from PID 1 but prefer the static table above. */
+        };
+
+        int rl;
+
+        for (rl = 0; rl < _RLIMIT_MAX; rl++) {
+
+                /* Let's only fill in what the user hasn't explicitly configured anyway */
+                if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
+                        const struct rlimit *v;
+                        struct rlimit buffer;
+
+                        if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
+                                /* For these two let's read the limits off PID 1. See above for an explanation. */
+
+                                if (prlimit(1, rl, NULL, &buffer) < 0)
+                                        return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
+
+                                v = &buffer;
+                        } else
+                                v = kernel_defaults + rl;
+
+                        arg_rlimit[rl] = newdup(struct rlimit, v, 1);
+                        if (!arg_rlimit[rl])
+                                return log_oom();
+                }
+
+                if (DEBUG_LOGGING) {
+                        _cleanup_free_ char *k = NULL;
+
+                        (void) rlimit_format(arg_rlimit[rl], &k);
+                        log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
+                }
+        }
+
+        return 0;
+}
+
 int main(int argc, char *argv[]) {
 
         _cleanup_free_ char *console = NULL;
@@ -3792,7 +4040,7 @@ int main(int argc, char *argv[]) {
         bool secondary = false, remove_directory = false, remove_image = false;
         pid_t pid = 0;
         union in_addr_union exposed = {};
-        _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
+        _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
         bool interactive, veth_created = false, remove_tmprootdir = false;
         char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
         _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
@@ -3814,6 +4062,10 @@ int main(int argc, char *argv[]) {
         if (r < 0)
                 goto finish;
 
+        r = initialize_rlimits();
+        if (r < 0)
+                goto finish;
+
         r = determine_names();
         if (r < 0)
                 goto finish;
@@ -3937,17 +4189,30 @@ int main(int argc, char *argv[]) {
                 }
 
                 if (arg_start_mode == START_BOOT) {
-                        if (path_is_os_tree(arg_directory) <= 0) {
-                                log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
+                       const char *p;
+
+                       if (arg_pivot_root_new)
+                               p = prefix_roota(arg_directory, arg_pivot_root_new);
+                       else
+                               p = arg_directory;
+
+                        if (path_is_os_tree(p) <= 0) {
+                                log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
                                 r = -EINVAL;
                                 goto finish;
                         }
                 } else {
-                        const char *p;
+                        const char *p, *q;
+
+                       if (arg_pivot_root_new)
+                               p = prefix_roota(arg_directory, arg_pivot_root_new);
+                       else
+                               p = arg_directory;
 
-                        p = strjoina(arg_directory, "/usr/");
-                        if (laccess(p, F_OK) < 0) {
-                                log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
+                        q = strjoina(p, "/usr/");
+
+                        if (laccess(q, F_OK) < 0) {
+                                log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
                                 r = -EINVAL;
                                 goto finish;
                         }
@@ -4092,7 +4357,7 @@ int main(int argc, char *argv[]) {
 
         assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
 
-        if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
+        if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
                 r = log_error_errno(errno, "Failed to become subreaper: %m");
                 goto finish;
         }
@@ -4127,6 +4392,8 @@ finish:
         if (pid > 0)
                 (void) wait_for_terminate(pid, NULL);
 
+        pager_close();
+
         if (remove_directory && arg_directory) {
                 int k;
 
@@ -4162,6 +4429,7 @@ finish:
         free(arg_template);
         free(arg_image);
         free(arg_machine);
+        free(arg_hostname);
         free(arg_user);
         free(arg_pivot_root_new);
         free(arg_pivot_root_old);
@@ -4176,6 +4444,8 @@ finish:
         custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
         expose_port_free_all(arg_expose_ports);
         free(arg_root_hash);
+        rlimit_free_all(arg_rlimit);
+        arg_cpuset = cpu_set_mfree(arg_cpuset);
 
         return r < 0 ? EXIT_FAILURE : ret;
 }