#include "capability-util.h"
#include "cgroup-util.h"
#include "copy.h"
+#include "cpu-set-util.h"
#include "dev-setup.h"
#include "dissect-image.h"
#include "env-util.h"
#include "nspawn-settings.h"
#include "nspawn-setuid.h"
#include "nspawn-stub-pid1.h"
+#include "pager.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "ptyfwd.h"
#include "random-util.h"
#include "raw-clone.h"
+#include "rlimit-util.h"
#include "rm-rf.h"
#include "selinux-util.h"
#include "signal-util.h"
#include "socket-util.h"
#include "stat-util.h"
#include "stdio-util.h"
+#include "string-table.h"
#include "string-util.h"
#include "strv.h"
#include "terminal-util.h"
static char *arg_pivot_root_old = NULL;
static char *arg_user = NULL;
static sd_id128_t arg_uuid = {};
-static char *arg_machine = NULL;
+static char *arg_machine = NULL; /* The name used by the host to refer to this */
+static char *arg_hostname = NULL; /* The name the payload sees by default */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static const char *arg_slice = NULL;
(1ULL << CAP_SYS_RESOURCE) |
(1ULL << CAP_SYS_TTY_CONFIG);
static CustomMount *arg_custom_mounts = NULL;
-static unsigned arg_n_custom_mounts = 0;
+static size_t arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_register = true;
static size_t arg_root_hash_size = 0;
static char **arg_syscall_whitelist = NULL;
static char **arg_syscall_blacklist = NULL;
+static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
+static bool arg_no_new_privileges = false;
+static int arg_oom_score_adjust = 0;
+static bool arg_oom_score_adjust_set = false;
+static cpu_set_t *arg_cpuset = NULL;
+static unsigned arg_cpuset_ncpus = 0;
static void help(void) {
+
+ (void) pager_open(false, false);
+
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
"Spawn a minimal namespace container for debugging, testing and building.\n\n"
" -h --help Show this help\n"
" Pivot root to given directory in the container\n"
" -u --user=USER Run the command under specified user or uid\n"
" -M --machine=NAME Set the machine name for the container\n"
+ " --hostname=NAME Override the hostname for the container\n"
" --uuid=UUID Set a specific machine UUID for the container\n"
" -S --slice=SLICE Place the container in the specified slice\n"
" --property=NAME=VALUE Set scope unit property\n"
" --drop-capability=CAP Drop the specified capability from the default set\n"
" --system-call-filter=LIST|~LIST\n"
" Permit/prohibit specific system calls\n"
+ " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
+ " --oom-score-adjust=VALUE\n"
+ " Adjust the OOM score value for the payload\n"
+ " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
" --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
" --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
" host, try-guest, try-host\n"
}
static int custom_mount_check_all(void) {
- unsigned i;
+ size_t i;
for (i = 0; i < arg_n_custom_mounts; i++) {
CustomMount *m = &arg_custom_mounts[i];
ARG_NOTIFY_READY,
ARG_ROOT_HASH,
ARG_SYSTEM_CALL_FILTER,
+ ARG_RLIMIT,
+ ARG_HOSTNAME,
+ ARG_NO_NEW_PRIVILEGES,
+ ARG_OOM_SCORE_ADJUST,
+ ARG_CPU_AFFINITY,
};
static const struct option options[] = {
{ "read-only", no_argument, NULL, ARG_READ_ONLY },
{ "capability", required_argument, NULL, ARG_CAPABILITY },
{ "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
+ { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
{ "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
{ "bind", required_argument, NULL, ARG_BIND },
{ "bind-ro", required_argument, NULL, ARG_BIND_RO },
{ "overlay", required_argument, NULL, ARG_OVERLAY },
{ "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
{ "machine", required_argument, NULL, 'M' },
+ { "hostname", required_argument, NULL, ARG_HOSTNAME },
{ "slice", required_argument, NULL, 'S' },
{ "setenv", required_argument, NULL, 'E' },
{ "selinux-context", required_argument, NULL, 'Z' },
{ "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
{ "root-hash", required_argument, NULL, ARG_ROOT_HASH },
{ "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
+ { "rlimit", required_argument, NULL, ARG_RLIMIT },
+ { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
+ { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
{}
};
}
break;
+ case ARG_HOSTNAME:
+ if (isempty(optarg))
+ arg_hostname = mfree(arg_hostname);
+ else {
+ if (!hostname_is_valid(optarg, false)) {
+ log_error("Invalid hostname: %s", optarg);
+ return -EINVAL;
+ }
+
+ r = free_and_strdup(&arg_hostname, optarg);
+ if (r < 0)
+ return log_oom();
+ }
+
+ arg_settings_mask |= SETTING_HOSTNAME;
+ break;
+
case 'Z':
arg_selinux_context = optarg;
break;
break;
}
+ case ARG_NO_NEW_PRIVILEGES:
+ r = parse_boolean(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
+
+ arg_no_new_privileges = r;
+ arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
+ break;
+
case 'j':
arg_link_journal = LINK_GUEST;
arg_link_journal_try = true;
if (!n)
return log_oom();
- strv_free(arg_setenv);
- arg_setenv = n;
-
+ strv_free_and_replace(arg_setenv, n);
arg_settings_mask |= SETTING_ENVIRONMENT;
break;
}
if (!optarg)
arg_volatile_mode = VOLATILE_YES;
- else {
+ else if (streq(optarg, "help")) {
+ DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
+ return 0;
+ } else {
VolatileMode m;
m = volatile_mode_from_string(optarg);
break;
case ARG_KILL_SIGNAL:
- arg_kill_signal = signal_from_string_try_harder(optarg);
+ if (streq(optarg, "help")) {
+ DUMP_STRING_TABLE(signal, int, _NSIG);
+ return 0;
+ }
+
+ arg_kill_signal = signal_from_string(optarg);
if (arg_kill_signal < 0) {
log_error("Cannot parse signal: %s", optarg);
return -EINVAL;
break;
}
+                case ARG_RLIMIT: {
+                        /* Handles --rlimit=NAME=LIMIT (or --rlimit=help, which dumps the rlimit string table and
+                         * stops). The parsed limit is stored in arg_rlimit[] and the matching per-limit bit is
+                         * set in arg_settings_mask. */
+                        _cleanup_free_ char *name = NULL; /* released on every way out of this case, so no leak */
+                        const char *eq;
+                        int rl;
+
+                        if (streq(optarg, "help")) {
+                                DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
+                                return 0;
+                        }
+
+                        eq = strchr(optarg, '=');
+                        if (!eq) {
+                                log_error("--rlimit= expects an '=' assignment.");
+                                return -EINVAL;
+                        }
+
+                        /* Everything in front of the first '=' is the name of the limit */
+                        name = strndup(optarg, eq - optarg);
+                        if (!name)
+                                return log_oom();
+
+                        rl = rlimit_from_string_harder(name);
+                        if (rl < 0) {
+                                log_error("Unknown resource limit: %s", name);
+                                return -EINVAL;
+                        }
+
+                        /* Allocate the slot lazily; a repeated --rlimit= for the same limit reuses it */
+                        if (!arg_rlimit[rl]) {
+                                arg_rlimit[rl] = new0(struct rlimit, 1);
+                                if (!arg_rlimit[rl])
+                                        return log_oom();
+                        }
+
+                        r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
+
+                        arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
+                        break;
+                }
+
+ case ARG_OOM_SCORE_ADJUST:
+ r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
+
+ arg_oom_score_adjust_set = true;
+ arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
+ break;
+
+ case ARG_CPU_AFFINITY: {
+ _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
+
+ r = parse_cpu_set(optarg, &cpuset);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
+
+ if (arg_cpuset)
+ CPU_FREE(arg_cpuset);
+
+ arg_cpuset = TAKE_PTR(cpuset);
+ arg_cpuset_ncpus = r;
+ arg_settings_mask |= SETTING_CPU_AFFINITY;
+ break;
+ }
+
case '?':
return -EINVAL;
return 0;
}
-static int setup_boot_id(const char *dest) {
+static int setup_boot_id(void) { /* Writes a fresh random boot ID to a temporary file and bind mounts it read-only over /proc/sys/kernel/random/boot_id; returns a negative value on failure */
+ _cleanup_(unlink_and_freep) char *from = NULL; /* NOTE(review): presumably unlinks the file and frees the path on scope exit, replacing the explicit unlink() - confirm */
+ _cleanup_free_ char *path = NULL;
sd_id128_t rnd = SD_ID128_NULL;
- const char *from, *to;
+ const char *to;
int r;
/* Generate a new randomized boot ID, so that each boot-up of
* the container gets a new one */
- from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
- to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
+ r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
+ if (r < 0)
+ return log_error_errno(r, "Failed to generate random boot ID path: %m");
r = sd_id128_randomize(&rnd);
if (r < 0)
return log_error_errno(r, "Failed to generate random boot id: %m");
- r = id128_write(from, ID128_UUID, rnd, false);
+ r = id128_write(path, ID128_UUID, rnd, false);
if (r < 0)
return log_error_errno(r, "Failed to write boot id: %m");
+ from = TAKE_PTR(path); /* the file has been written by now; from here on 'from' carries the path instead of 'path' */
+ to = "/proc/sys/kernel/random/boot_id";
+
r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
- if (r >= 0)
- r = mount_verbose(LOG_ERR, NULL, to, NULL,
- MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
+ if (r < 0)
+ return r;
- (void) unlink(from);
- return r;
+ return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); /* second pass: remount the bind mount read-only with nosuid, noexec and nodev */
}
static int copy_devnodes(const char *dest) {
return 0;
}
-static int setup_kmsg(const char *dest, int kmsg_socket) {
- const char *from, *to;
+static int setup_kmsg(int kmsg_socket) {
+ _cleanup_(unlink_and_freep) char *from = NULL;
+ _cleanup_free_ char *fifo = NULL;
+ _cleanup_close_ int fd = -1;
_cleanup_umask_ mode_t u;
- int fd, r;
+ const char *to;
+ int r;
assert(kmsg_socket >= 0);
u = umask(0000);
- /* We create the kmsg FIFO as /run/kmsg, but immediately
- * delete it after bind mounting it to /proc/kmsg. While FIFOs
- * on the reading side behave very similar to /proc/kmsg,
- * their writing side behaves differently from /dev/kmsg in
- * that writing blocks when nothing is reading. In order to
- * avoid any problems with containers deadlocking due to this
- * we simply make /dev/kmsg unavailable to the container. */
- from = prefix_roota(dest, "/run/kmsg");
- to = prefix_roota(dest, "/proc/kmsg");
-
- if (mkfifo(from, 0600) < 0)
+ /* We create the kmsg FIFO as a temporary file in /tmp, but immediately delete it after bind mounting it to
+ * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
+ * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
+ * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
+
+ r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
+ if (r < 0)
+ return log_error_errno(r, "Failed to generate kmsg path: %m");
+
+ if (mkfifo(fifo, 0600) < 0)
return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
+
+ from = TAKE_PTR(fifo);
+ to = "/proc/kmsg";
+
r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
if (r < 0)
return r;
if (fd < 0)
return log_error_errno(errno, "Failed to open fifo: %m");
- /* Store away the fd in the socket, so that it stays open as
- * long as we run the child */
+ /* Store away the fd in the socket, so that it stays open as long as we run the child */
r = send_one_fd(kmsg_socket, fd, 0);
- safe_close(fd);
-
if (r < 0)
return log_error_errno(r, "Failed to send FIFO fd: %m");
- /* And now make the FIFO unavailable as /run/kmsg... */
- (void) unlink(from);
-
return 0;
}
}
static int setup_hostname(void) {
+ int r; /* Sets the hostname for the payload; does nothing unless CLONE_NEWUTS is part of arg_clone_ns_flags */
if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
return 0;
- if (sethostname_idempotent(arg_machine) < 0)
- return -errno;
+ r = sethostname_idempotent(arg_hostname ?: arg_machine); /* an explicitly configured hostname wins, otherwise fall back to the machine name */
+ if (r < 0)
+ return log_error_errno(r, "Failed to set hostname: %m");
return 0;
}
return 0;
}
-
static int setup_propagate(const char *root) {
const char *p, *q;
int r;
_cleanup_free_ char *home = NULL;
char as_uuid[37];
- unsigned n_env = 1;
+ size_t n_env = 1;
const char *envp[] = {
- "PATH=" DEFAULT_PATH_SPLIT_USR,
+ "PATH=" DEFAULT_PATH_COMPAT,
NULL, /* container */
NULL, /* TERM */
NULL, /* HOME */
NULL
};
const char *exec_target;
-
_cleanup_strv_free_ char **env_use = NULL;
int r;
return r;
}
- r = setup_boot_id(NULL);
+ r = setup_boot_id();
if (r < 0)
return r;
- r = setup_kmsg(NULL, kmsg_socket);
+ r = setup_kmsg(kmsg_socket);
if (r < 0)
return r;
kmsg_socket = safe_close(kmsg_socket);
rtnl_socket = safe_close(rtnl_socket);
}
+ if (arg_oom_score_adjust_set) {
+ r = set_oom_score_adjust(arg_oom_score_adjust);
+ if (r < 0)
+ return log_error_errno(r, "Failed to adjust OOM score: %m");
+ }
+
+ if (arg_cpuset)
+ if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
+ return log_error_errno(errno, "Failed to set CPU affinity: %m");
+
r = drop_capabilities();
if (r < 0)
return log_error_errno(r, "drop_capabilities() failed: %m");
- setup_hostname();
+ (void) setup_hostname();
if (arg_personality != PERSONALITY_INVALID) {
r = safe_personality(arg_personality);
if (r < 0)
return r;
+ if (arg_no_new_privileges)
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
+ return log_error_errno(errno, "Failed to disable new privileges: %m");
+
/* LXC sets container=lxc, so follow the scheme here */
envp[n_env++] = strjoina("container=", arg_container_service_name);
return r;
}
- /* Now, explicitly close the log, so that we
- * then can close all remaining fds. Closing
- * the log explicitly first has the benefit
- * that the logging subsystem knows about it,
- * and is thus ready to be reopened should we
- * need it again. Note that the other fds
- * closed here are at least the locking and
- * barrier fds. */
+ /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
+ * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
+ * it again. Note that the other fds closed here are at least the locking and barrier fds. */
log_close();
+ log_set_open_when_needed(true);
+
(void) fdset_close_others(fds);
if (arg_start_mode == START_BOOT) {
exec_target = "/bin/bash, /bin/sh";
}
- r = -errno;
- (void) log_open();
- return log_error_errno(r, "execv(%s) failed: %m", exec_target);
+ return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
}
static int setup_sd_notify_child(void) {
FDSet *fds,
int netns_fd) {
+ _cleanup_close_ int fd = -1;
+ int r, which_failed;
pid_t pid;
ssize_t l;
- int r;
- _cleanup_close_ int fd = -1;
assert(barrier);
assert(directory);
if (fd < 0)
return fd;
+ r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
+ if (r < 0)
+ return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
+
pid = raw_clone(SIGCHLD|CLONE_NEWNS|
arg_clone_ns_flags |
(arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
for (;;) {
char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
- _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
+ _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
if (--n_tries <= 0)
return -EBUSY;
return 0;
}
-static int load_settings(void) {
- _cleanup_(settings_freep) Settings *settings = NULL;
- _cleanup_fclose_ FILE *f = NULL;
- _cleanup_free_ char *p = NULL;
- const char *fn, *i;
- int r;
-
- /* If all settings are masked, there's no point in looking for
- * the settings file */
- if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
- return 0;
-
- fn = strjoina(arg_machine, ".nspawn");
-
- /* We first look in the admin's directories in /etc and /run */
- FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
- _cleanup_free_ char *j = NULL;
-
- j = strjoin(i, "/", fn);
- if (!j)
- return log_oom();
-
- f = fopen(j, "re");
- if (f) {
- p = TAKE_PTR(j);
-
- /* By default, we trust configuration from /etc and /run */
- if (arg_settings_trusted < 0)
- arg_settings_trusted = true;
-
- break;
- }
-
- if (errno != ENOENT)
- return log_error_errno(errno, "Failed to open %s: %m", j);
- }
-
- if (!f) {
- /* After that, let's look for a file next to the
- * actual image we shall boot. */
+static int merge_settings(Settings *settings, const char *path) {
+ int rl;
- if (arg_image) {
- p = file_in_same_dir(arg_image, fn);
- if (!p)
- return log_oom();
- } else if (arg_directory) {
- p = file_in_same_dir(arg_directory, fn);
- if (!p)
- return log_oom();
- }
+ assert(settings);
+ assert(path);
- if (p) {
- f = fopen(p, "re");
- if (!f && errno != ENOENT)
- return log_error_errno(errno, "Failed to open %s: %m", p);
-
- /* By default, we do not trust configuration from /var/lib/machines */
- if (arg_settings_trusted < 0)
- arg_settings_trusted = false;
- }
- }
-
- if (!f)
- return 0;
-
- log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
-
- r = settings_load(f, p, &settings);
- if (r < 0)
- return r;
-
- /* Copy over bits from the settings, unless they have been
- * explicitly masked by command line switches. */
+ /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
+ * that this steals the fields of the Settings* structure, and hence modifies it. */
if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
settings->start_mode >= 0) {
arg_start_mode = settings->start_mode;
-
- strv_free(arg_parameters);
- arg_parameters = TAKE_PTR(settings->parameters);
+ strv_free_and_replace(arg_parameters, settings->parameters);
}
if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
free_and_replace(arg_chdir, settings->working_directory);
if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
- settings->environment) {
- strv_free(arg_setenv);
- arg_setenv = TAKE_PTR(settings->environment);
- }
+ settings->environment)
+ strv_free_and_replace(arg_setenv, settings->environment);
if ((arg_settings_mask & SETTING_USER) == 0 &&
settings->user)
if (!arg_settings_trusted && plus != 0) {
if (settings->capability != 0)
- log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
+ log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
} else
arg_caps_retain |= plus;
!sd_id128_is_null(settings->machine_id)) {
if (!arg_settings_trusted)
- log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
+ log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
else
arg_uuid = settings->machine_id;
}
settings->n_custom_mounts > 0) {
if (!arg_settings_trusted)
- log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
+ log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
else {
custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
settings->network_veth_extra)) {
if (!arg_settings_trusted)
- log_warning("Ignoring network settings, file %s is not trusted.", p);
+ log_warning("Ignoring network settings, file %s is not trusted.", path);
else {
arg_network_veth = settings_network_veth(settings);
arg_private_network = settings_private_network(settings);
- strv_free(arg_network_interfaces);
- arg_network_interfaces = TAKE_PTR(settings->network_interfaces);
-
- strv_free(arg_network_macvlan);
- arg_network_macvlan = TAKE_PTR(settings->network_macvlan);
-
- strv_free(arg_network_ipvlan);
- arg_network_ipvlan = TAKE_PTR(settings->network_ipvlan);
-
- strv_free(arg_network_veth_extra);
- arg_network_veth_extra = TAKE_PTR(settings->network_veth_extra);
+ strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
+ strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
+ strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
+ strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
free_and_replace(arg_network_bridge, settings->network_bridge);
free_and_replace(arg_network_zone, settings->network_zone);
settings->expose_ports) {
if (!arg_settings_trusted)
- log_warning("Ignoring Port= setting, file %s is not trusted.", p);
+ log_warning("Ignoring Port= setting, file %s is not trusted.", path);
else {
expose_port_free_all(arg_expose_ports);
arg_expose_ports = TAKE_PTR(settings->expose_ports);
settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
if (!arg_settings_trusted)
- log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
+ log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
else {
arg_userns_mode = settings->userns_mode;
arg_uid_shift = settings->uid_shift;
if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
- log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", p);
+ log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
else {
- strv_free(arg_syscall_whitelist);
- strv_free(arg_syscall_blacklist);
+ strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
+ strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
+ }
+ }
+
+ for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
+ if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
+ continue;
+
+ if (!settings->rlimit[rl])
+ continue;
+
+ if (!arg_settings_trusted) {
+ log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
+ continue;
+ }
+
+ free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
+ }
+
+ if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
+ settings->hostname)
+ free_and_replace(arg_hostname, settings->hostname);
- arg_syscall_whitelist = TAKE_PTR(settings->syscall_whitelist);
- arg_syscall_blacklist = TAKE_PTR(settings->syscall_blacklist);
+ if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
+ settings->no_new_privileges >= 0)
+ arg_no_new_privileges = settings->no_new_privileges;
+
+ if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
+ settings->oom_score_adjust_set) {
+
+ if (!arg_settings_trusted)
+ log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
+ else {
+ arg_oom_score_adjust = settings->oom_score_adjust;
+ arg_oom_score_adjust_set = true;
+ }
+ }
+
+ if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
+ settings->cpuset) {
+
+ if (!arg_settings_trusted)
+ log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
+ else {
+ if (arg_cpuset)
+ CPU_FREE(arg_cpuset);
+ arg_cpuset = TAKE_PTR(settings->cpuset);
+ arg_cpuset_ncpus = settings->cpuset_ncpus;
}
}
return 0;
}
+/* Looks for the "<machine>.nspawn" settings file, parses it and merges the result into the arg_* state.
+ *
+ * Search order: /etc/systemd/nspawn, then /run/systemd/nspawn, and finally the directory that holds the
+ * image (or, failing that, the container directory). A hit in the first two locations makes the settings
+ * trusted by default, a hit next to the image makes them untrusted by default; an explicit trust decision
+ * already stored in arg_settings_trusted is never overridden.
+ *
+ * Returns 0 when every setting is already masked or no file exists, a negative value when opening,
+ * allocating or parsing fails, and otherwise the result of merge_settings(). */
+static int load_settings(void) {
+        _cleanup_(settings_freep) Settings *settings = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        _cleanup_free_ char *found = NULL;
+        const char *filename, *dir;
+        int r;
+
+        /* Nothing left that a settings file could still influence? Then don't bother searching. */
+        if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
+                return 0;
+
+        filename = strjoina(arg_machine, ".nspawn");
+
+        /* Pass 1: the administrator's directories in /etc and /run */
+        FOREACH_STRING(dir, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
+                _cleanup_free_ char *candidate = NULL;
+
+                candidate = strjoin(dir, "/", filename);
+                if (!candidate)
+                        return log_oom();
+
+                f = fopen(candidate, "re");
+                if (!f) {
+                        /* A missing file just means "try the next directory"; anything else is fatal */
+                        if (errno != ENOENT)
+                                return log_error_errno(errno, "Failed to open %s: %m", candidate);
+
+                        continue;
+                }
+
+                found = TAKE_PTR(candidate);
+
+                /* Configuration from /etc and /run is trusted unless told otherwise */
+                if (arg_settings_trusted < 0)
+                        arg_settings_trusted = true;
+
+                break;
+        }
+
+        /* Pass 2: a file sitting next to the image or directory we are about to boot */
+        if (!f) {
+                const char *sibling_of = arg_image ?: arg_directory;
+
+                if (sibling_of) {
+                        found = file_in_same_dir(sibling_of, filename);
+                        if (!found)
+                                return log_oom();
+
+                        f = fopen(found, "re");
+                        if (!f && errno != ENOENT)
+                                return log_error_errno(errno, "Failed to open %s: %m", found);
+
+                        /* Configuration from /var/lib/machines is not trusted unless told otherwise */
+                        if (arg_settings_trusted < 0)
+                                arg_settings_trusted = false;
+                }
+        }
+
+        if (!f)
+                return 0;
+
+        log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
+
+        r = settings_load(f, found, &settings);
+        if (r < 0)
+                return r;
+
+        return merge_settings(settings, found);
+}
+
static int run(int master,
const char* console,
DissectedImage *dissected_image,
.sa_flags = SA_NOCLDSTOP|SA_RESTART,
};
- _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
+ _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
_cleanup_close_ int etc_passwd_lock = -1;
_cleanup_close_pair_ int
kmsg_socket_pair[2] = { -1, -1 },
if (r < 0)
return r;
- if (arg_keep_unit) {
- r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
- if (r < 0)
- return r;
- }
+ r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
+ if (r < 0)
+ return r;
r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
if (r < 0)
"STATUS=Container running.\n"
"X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
if (!arg_notify_ready)
- sd_notify(false, "READY=1\n");
+ (void) sd_notify(false, "READY=1\n");
if (arg_kill_signal > 0) {
/* Try to kill the init system on SIGINT or SIGTERM */
- sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
- sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
+ (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
+ (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
} else {
/* Immediately exit */
- sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
- sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
+ (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
+ (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
}
/* Exit when the child exits */
- sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
+ (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
if (arg_expose_ports) {
r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
return 1; /* loop again */
}
+/* Fills arg_rlimit[] with a default for every resource limit whose bit is not yet set in arg_settings_mask,
+ * i.e. every limit that was not explicitly configured. Most defaults come from the static table below;
+ * RLIMIT_NPROC and RLIMIT_SIGPENDING are read from PID 1 via prlimit().
+ *
+ * Returns 0 on success, or a negative value if prlimit() fails or memory runs out. */
+static int initialize_rlimits(void) {
+
+        /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
+         * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
+         * container execution environments. */
+
+        static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
+                [RLIMIT_AS]       = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_CORE]     = { 0,             RLIM_INFINITY },
+                [RLIMIT_CPU]      = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_DATA]     = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_FSIZE]    = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_LOCKS]    = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_MEMLOCK]  = { 65536,         65536         },
+                [RLIMIT_MSGQUEUE] = { 819200,        819200        },
+                [RLIMIT_NICE]     = { 0,             0             },
+                [RLIMIT_NOFILE]   = { 1024,          4096          },
+                [RLIMIT_RSS]      = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_RTPRIO]   = { 0,             0             },
+                [RLIMIT_RTTIME]   = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_STACK]    = { 8388608,       RLIM_INFINITY },
+
+                /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
+                 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
+                 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
+                 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
+                 * that PID 1 changes a number of other resource limits during early initialization which is why we
+                 * don't read the other limits from PID 1 but prefer the static table above. */
+        };
+
+        int rl;
+
+        for (rl = 0; rl < _RLIMIT_MAX; rl++) {
+
+                /* Let's only fill in what the user hasn't explicitly configured anyway */
+                if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
+                        const struct rlimit *v;
+                        struct rlimit buffer;
+
+                        if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
+                                /* For these two let's read the limits off PID 1. See above for an explanation. */
+
+                                if (prlimit(1, rl, NULL, &buffer) < 0)
+                                        return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
+
+                                v = &buffer;
+                        } else
+                                v = kernel_defaults + rl;
+
+                        arg_rlimit[rl] = newdup(struct rlimit, v, 1);
+                        if (!arg_rlimit[rl])
+                                return log_oom();
+                }
+
+                if (DEBUG_LOGGING) {
+                        _cleanup_free_ char *k = NULL;
+
+                        /* Formatting is best-effort and its result is ignored, so 'k' may still be NULL here.
+                         * Never hand a NULL pointer to %s; substitute a placeholder instead. */
+                        (void) rlimit_format(arg_rlimit[rl], &k);
+                        log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), strna(k));
+                }
+        }
+
+        return 0;
+}
+
int main(int argc, char *argv[]) {
_cleanup_free_ char *console = NULL;
bool secondary = false, remove_directory = false, remove_image = false;
pid_t pid = 0;
union in_addr_union exposed = {};
- _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
+ _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
bool interactive, veth_created = false, remove_tmprootdir = false;
char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
_cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
if (r < 0)
goto finish;
+ r = initialize_rlimits();
+ if (r < 0)
+ goto finish;
+
r = determine_names();
if (r < 0)
goto finish;
}
if (arg_start_mode == START_BOOT) {
- if (path_is_os_tree(arg_directory) <= 0) {
- log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
+ const char *p;
+
+ if (arg_pivot_root_new)
+ p = prefix_roota(arg_directory, arg_pivot_root_new);
+ else
+ p = arg_directory;
+
+ if (path_is_os_tree(p) <= 0) {
+ log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
r = -EINVAL;
goto finish;
}
} else {
- const char *p;
+ const char *p, *q;
+
+ if (arg_pivot_root_new)
+ p = prefix_roota(arg_directory, arg_pivot_root_new);
+ else
+ p = arg_directory;
- p = strjoina(arg_directory, "/usr/");
- if (laccess(p, F_OK) < 0) {
- log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
+ q = strjoina(p, "/usr/");
+
+ if (laccess(q, F_OK) < 0) {
+ log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
r = -EINVAL;
goto finish;
}
assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
- if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
+ if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
r = log_error_errno(errno, "Failed to become subreaper: %m");
goto finish;
}
if (pid > 0)
(void) wait_for_terminate(pid, NULL);
+ pager_close();
+
if (remove_directory && arg_directory) {
int k;
free(arg_template);
free(arg_image);
free(arg_machine);
+ free(arg_hostname);
free(arg_user);
free(arg_pivot_root_new);
free(arg_pivot_root_old);
custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
expose_port_free_all(arg_expose_ports);
free(arg_root_hash);
+ rlimit_free_all(arg_rlimit);
+ arg_cpuset = cpu_set_mfree(arg_cpuset);
return r < 0 ? EXIT_FAILURE : ret;
}