#include "seccomp-util.h"
#endif
-typedef struct ExposePort {
- int protocol;
- uint16_t host_port;
- uint16_t container_port;
- LIST_FIELDS(struct ExposePort, ports);
-} ExposePort;
+#include "nspawn.h"
+#include "nspawn-settings.h"
typedef enum ContainerStatus {
CONTAINER_TERMINATED,
LINK_GUEST
} LinkJournal;
-typedef enum Volatile {
- VOLATILE_NO,
- VOLATILE_YES,
- VOLATILE_STATE,
-} Volatile;
-
-typedef enum CustomMountType {
- CUSTOM_MOUNT_BIND,
- CUSTOM_MOUNT_TMPFS,
- CUSTOM_MOUNT_OVERLAY,
-} CustomMountType;
-
-typedef struct CustomMount {
- CustomMountType type;
- bool read_only;
- char *source; /* for overlayfs this is the upper directory */
- char *destination;
- char *options;
- char *work_dir;
- char **lower;
-} CustomMount;
-
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_user = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
-static const char *arg_network_bridge = NULL;
+static char *arg_network_bridge = NULL;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
-static Volatile arg_volatile = VOLATILE_NO;
+static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns = false;
static int arg_kill_signal = 0;
+static bool arg_unified_cgroup_hierarchy = false;
+static SettingsMask arg_settings_mask = 0;
+static int arg_settings_trusted = -1;
+static char **arg_parameters = NULL;
+/* Print the command line usage summary to stdout. */
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
" try-guest, try-host\n"
" -j Equivalent to --link-journal=try-guest\n"
" --read-only Mount the root directory read-only\n"
- " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
+ " --bind=PATH[:PATH[:OPTIONS]]\n"
+ " Bind mount a file or directory from the host into\n"
" the container\n"
- " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
+ " --bind-ro=PATH[:PATH[:OPTIONS]]\n"
+ " Similar, but creates a read-only bind mount\n"
" --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
" --overlay=PATH[:PATH...]:PATH\n"
" Create an overlay mount from the host to \n"
" --keep-unit Do not register a scope for the machine, reuse\n"
" the service unit nspawn is running in\n"
" --volatile[=MODE] Run the system in volatile mode\n"
+ " --settings=BOOLEAN Load additional settings from .nspawn file\n"
, program_invocation_short_name);
}
-static CustomMount* custom_mount_add(CustomMountType t) {
+static CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
CustomMount *c, *ret;
- c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
+ assert(l);
+ assert(n);
+ assert(t >= 0);
+ assert(t < _CUSTOM_MOUNT_TYPE_MAX);
+
+ c = realloc(*l, (*n + 1) * sizeof(CustomMount));
if (!c)
return NULL;
- arg_custom_mounts = c;
- ret = arg_custom_mounts + arg_n_custom_mounts;
- arg_n_custom_mounts++;
+ *l = c;
+ ret = *l + *n;
+ (*n)++;
*ret = (CustomMount) { .type = t };
return ret;
}
-static void custom_mount_free_all(void) {
+void custom_mount_free_all(CustomMount *l, unsigned n) {
unsigned i;
- for (i = 0; i < arg_n_custom_mounts; i++) {
- CustomMount *m = &arg_custom_mounts[i];
+ for (i = 0; i < n; i++) {
+ CustomMount *m = l + i;
free(m->source);
free(m->destination);
strv_free(m->lower);
}
- arg_custom_mounts = mfree(arg_custom_mounts);
- arg_n_custom_mounts = 0;
+ free(l);
}
static int custom_mount_compare(const void *a, const void *b) {
return 0;
}
+/* Decide whether the container shall use the unified cgroup hierarchy and
+ * store the result in arg_unified_cgroup_hierarchy.
+ *
+ * $UNIFIED_CGROUP_HIERARCHY, when set, is parsed as a boolean and wins.
+ * Otherwise the answer of cg_unified() is taken over.
+ *
+ * Returns 0 on success, or whatever log_error_errno() returns for the
+ * failing call (presumably the negative error code passed to it). */
+static int detect_unified_cgroup_hierarchy(void) {
+        const char *env_value = getenv("UNIFIED_CGROUP_HIERARCHY");
+        int unified;
+
+        if (!env_value) {
+                /* No explicit request: follow what cg_unified() reports. */
+                unified = cg_unified();
+                if (unified < 0)
+                        return log_error_errno(unified, "Failed to determine whether the unified cgroups hierarchy is used: %m");
+        } else {
+                /* The user asked for a specific mode via the environment. */
+                unified = parse_boolean(env_value);
+                if (unified < 0)
+                        return log_error_errno(unified, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
+        }
+
+        arg_unified_cgroup_hierarchy = unified;
+        return 0;
+}
+
+/* Translate a textual volatile mode into a VolatileMode value.
+ *
+ * Anything parse_boolean() accepts maps to VOLATILE_YES or VOLATILE_NO,
+ * the literal "state" maps to VOLATILE_STATE. An empty or NULL string
+ * (per isempty()) and every other input yield _VOLATILE_MODE_INVALID. */
+VolatileMode volatile_mode_from_string(const char *s) {
+        int parsed;
+
+        if (isempty(s))
+                return _VOLATILE_MODE_INVALID;
+
+        parsed = parse_boolean(s);
+        if (parsed == 0)
+                return VOLATILE_NO;
+        if (parsed > 0)
+                return VOLATILE_YES;
+
+        /* Not a boolean: the only other accepted spelling is "state". */
+        return streq(s, "state") ? VOLATILE_STATE : _VOLATILE_MODE_INVALID;
+}
+
+/* Parse a port exposure specification of the form "[tcp:|udp:]PORT" or
+ * "[tcp:|udp:]HOSTPORT:CONTAINERPORT" and prepend a newly allocated
+ * entry to the list *l. Without a protocol prefix TCP is assumed; a
+ * single port number is used for both the host and the container side.
+ *
+ * Returns 0 on success, -EINVAL if a port does not parse or is zero,
+ * -EEXIST if *l already holds an entry with the same protocol and host
+ * port, and -ENOMEM if the allocation fails. Nothing is logged here. */
+int expose_port_parse(ExposePort **l, const char *s) {
+
+        const char *split, *e;
+        uint16_t container_port, host_port;
+        int protocol;
+        ExposePort *p;
+        int r;
+
+        assert(l);
+        assert(s);
+
+        if ((e = startswith(s, "tcp:")))
+                protocol = IPPROTO_TCP;
+        else if ((e = startswith(s, "udp:")))
+                protocol = IPPROTO_UDP;
+        else {
+                e = s;
+                protocol = IPPROTO_TCP;
+        }
+
+        split = strchr(e, ':');
+        if (split) {
+                /* Copy the host port part so it can be parsed as a NUL-terminated string. */
+                char v[split - e + 1];
+
+                memcpy(v, e, split - e);
+                v[split - e] = 0;
+
+                r = safe_atou16(v, &host_port);
+                if (r < 0 || host_port <= 0)
+                        return -EINVAL;
+
+                r = safe_atou16(split + 1, &container_port);
+        } else {
+                r = safe_atou16(e, &container_port);
+                host_port = container_port;
+        }
+
+        if (r < 0 || container_port <= 0)
+                return -EINVAL;
+
+        /* Check for duplicates in the list we were asked to extend, not in
+         * the global command line list: callers may pass a different list. */
+        LIST_FOREACH(ports, p, *l)
+                if (p->protocol == protocol && p->host_port == host_port)
+                        return -EEXIST;
+
+        p = new(ExposePort, 1);
+        if (!p)
+                return -ENOMEM;
+
+        p->protocol = protocol;
+        p->host_port = host_port;
+        p->container_port = container_port;
+
+        LIST_PREPEND(ports, *l, p);
+
+        return 0;
+}
+
+/* Parse a bind mount specification "SOURCE[:DESTINATION[:OPTIONS]]" and
+ * append a CUSTOM_MOUNT_BIND entry to the array *l, whose element count is
+ * kept in *n. If only one path is given it is used as both source and
+ * destination. Whatever follows the second colon is stored verbatim as
+ * the option string. Both paths must be absolute.
+ *
+ * On success the parsed strings are owned by the new entry. Returns 0 on
+ * success, -EINVAL for a malformed specification, -ENOMEM on allocation
+ * failure, or the negative error of extract_many_words(). Nothing is
+ * logged here; that is left to the caller. */
+int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
+        _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
+        const char *p = s;
+        CustomMount *m;
+        int r;
+
+        assert(l);
+        assert(n);
+        assert(s);
+
+        r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -EINVAL;
+
+        if (r == 1) {
+                /* Only one path given: mount it to the same place inside. */
+                destination = strdup(source);
+                if (!destination)
+                        return -ENOMEM;
+        }
+
+        if (r == 2 && !isempty(p)) {
+                /* The remainder after the two paths is the option string. */
+                opts = strdup(p);
+                if (!opts)
+                        return -ENOMEM;
+        }
+
+        if (!path_is_absolute(source))
+                return -EINVAL;
+
+        if (!path_is_absolute(destination))
+                return -EINVAL;
+
+        m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
+        if (!m)
+                return -ENOMEM; /* same silent error style as tmpfs_mount_parse() */
+
+        m->source = source;
+        m->destination = destination;
+        m->read_only = read_only;
+        m->options = opts;
+
+        /* Ownership moved into the entry, disarm the cleanup handlers. */
+        source = destination = opts = NULL;
+        return 0;
+}
+
+/* Parse a tmpfs specification "PATH[:OPTIONS]" and append a
+ * CUSTOM_MOUNT_TMPFS entry to the array *l (element count in *n). When no
+ * options follow the path, "mode=0755" is used. The path must be
+ * absolute.
+ *
+ * Returns 0 on success, -EINVAL for a malformed specification, -ENOMEM on
+ * allocation failure, or the negative error of extract_first_word(). */
+int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
+        _cleanup_free_ char *where = NULL, *options = NULL;
+        const char *rest = s;
+        CustomMount *entry;
+        int r;
+
+        assert(l);
+        assert(n);
+        assert(s);
+
+        r = extract_first_word(&rest, &where, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -EINVAL;
+
+        /* Everything after the first colon is the option string. */
+        options = strdup(isempty(rest) ? "mode=0755" : rest);
+        if (!options)
+                return -ENOMEM;
+
+        if (!path_is_absolute(where))
+                return -EINVAL;
+
+        entry = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
+        if (!entry)
+                return -ENOMEM;
+
+        entry->destination = where;
+        entry->options = options;
+
+        /* The entry owns the strings now. */
+        where = NULL;
+        options = NULL;
+        return 0;
+}
+
static int parse_argv(int argc, char *argv[]) {
enum {
ARG_PROPERTY,
ARG_PRIVATE_USERS,
ARG_KILL_SIGNAL,
+ ARG_SETTINGS,
};
static const struct option options[] = {
{ "property", required_argument, NULL, ARG_PROPERTY },
{ "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
{ "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
+ { "settings", required_argument, NULL, ARG_SETTINGS },
{}
};
int c, r;
uint64_t plus = 0, minus = 0;
+ bool mask_all_settings = false, mask_no_settings = false;
assert(argc >= 0);
assert(argv);
if (r < 0)
return log_oom();
+ arg_settings_mask |= SETTING_USER;
break;
case ARG_NETWORK_BRIDGE:
- arg_network_bridge = optarg;
+ r = free_and_strdup(&arg_network_bridge, optarg);
+ if (r < 0)
+ return log_oom();
/* fall through */
case 'n':
arg_network_veth = true;
arg_private_network = true;
+ arg_settings_mask |= SETTING_NETWORK;
break;
case ARG_NETWORK_INTERFACE:
return log_oom();
arg_private_network = true;
+ arg_settings_mask |= SETTING_NETWORK;
break;
case ARG_NETWORK_MACVLAN:
return log_oom();
arg_private_network = true;
+ arg_settings_mask |= SETTING_NETWORK;
break;
case ARG_NETWORK_IPVLAN:
case ARG_PRIVATE_NETWORK:
arg_private_network = true;
+ arg_settings_mask |= SETTING_NETWORK;
break;
case 'b':
arg_boot = true;
+ arg_settings_mask |= SETTING_BOOT;
break;
case ARG_UUID:
log_error("Invalid UUID: %s", optarg);
return r;
}
+
+ arg_settings_mask |= SETTING_MACHINE_ID;
break;
case 'S':
case ARG_READ_ONLY:
arg_read_only = true;
+ arg_settings_mask |= SETTING_READ_ONLY;
break;
case ARG_CAPABILITY:
}
}
+ arg_settings_mask |= SETTING_CAPABILITY;
break;
}
break;
case ARG_BIND:
- case ARG_BIND_RO: {
- const char *current = optarg;
- _cleanup_free_ char *source = NULL, *destination = NULL;
- CustomMount *m;
-
- r = extract_many_words(¤t, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
- switch (r) {
- case 1:
- destination = strdup(source);
- case 2:
- break;
- case -ENOMEM:
- return log_oom();
- default:
- log_error("Invalid bind mount specification: %s", optarg);
- return -EINVAL;
- }
-
- if (!source || !destination)
- return log_oom();
-
- if (!path_is_absolute(source) || !path_is_absolute(destination)) {
- log_error("Invalid bind mount specification: %s", optarg);
- return -EINVAL;
- }
-
- m = custom_mount_add(CUSTOM_MOUNT_BIND);
- if (!m)
- return log_oom();
-
- m->source = source;
- m->destination = destination;
- m->read_only = c == ARG_BIND_RO;
-
- source = destination = NULL;
+ case ARG_BIND_RO:
+ r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
+ arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
break;
- }
-
- case ARG_TMPFS: {
- const char *current = optarg;
- _cleanup_free_ char *path = NULL, *opts = NULL;
- CustomMount *m;
- r = extract_first_word(¤t, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
- if (r == -ENOMEM)
- return log_oom();
- else if (r < 0) {
- log_error("Invalid tmpfs specification: %s", optarg);
- return r;
- }
- if (r)
- opts = strdup(current);
- else
- opts = strdup("mode=0755");
-
- if (!path || !opts)
- return log_oom();
-
- if (!path_is_absolute(path)) {
- log_error("Invalid tmpfs specification: %s", optarg);
- return -EINVAL;
- }
-
- m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
- if (!m)
- return log_oom();
-
- m->destination = path;
- m->options = opts;
-
- path = opts = NULL;
+ case ARG_TMPFS:
+ r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
+ arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
break;
- }
case ARG_OVERLAY:
case ARG_OVERLAY_RO: {
lower[n - 2] = NULL;
}
- m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
+ m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
if (!m)
return log_oom();
upper = destination = NULL;
lower = NULL;
+ arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
break;
}
strv_free(arg_setenv);
arg_setenv = n;
+
+ arg_settings_mask |= SETTING_ENVIRONMENT;
break;
}
return -EINVAL;
}
+ arg_settings_mask |= SETTING_PERSONALITY;
break;
case ARG_VOLATILE:
if (!optarg)
- arg_volatile = VOLATILE_YES;
+ arg_volatile_mode = VOLATILE_YES;
else {
- r = parse_boolean(optarg);
- if (r < 0) {
- if (streq(optarg, "state"))
- arg_volatile = VOLATILE_STATE;
- else {
- log_error("Failed to parse --volatile= argument: %s", optarg);
- return r;
- }
- } else
- arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
- }
-
- break;
-
- case 'p': {
- const char *split, *e;
- uint16_t container_port, host_port;
- int protocol;
- ExposePort *p;
-
- if ((e = startswith(optarg, "tcp:")))
- protocol = IPPROTO_TCP;
- else if ((e = startswith(optarg, "udp:")))
- protocol = IPPROTO_UDP;
- else {
- e = optarg;
- protocol = IPPROTO_TCP;
- }
-
- split = strchr(e, ':');
- if (split) {
- char v[split - e + 1];
-
- memcpy(v, e, split - e);
- v[split - e] = 0;
-
- r = safe_atou16(v, &host_port);
- if (r < 0 || host_port <= 0) {
- log_error("Failed to parse host port: %s", optarg);
- return -EINVAL;
- }
-
- r = safe_atou16(split + 1, &container_port);
- } else {
- r = safe_atou16(e, &container_port);
- host_port = container_port;
- }
-
- if (r < 0 || container_port <= 0) {
- log_error("Failed to parse host port: %s", optarg);
- return -EINVAL;
- }
+ VolatileMode m;
- LIST_FOREACH(ports, p, arg_expose_ports) {
- if (p->protocol == protocol && p->host_port == host_port) {
- log_error("Duplicate port specification: %s", optarg);
+ m = volatile_mode_from_string(optarg);
+ if (m < 0) {
+ log_error("Failed to parse --volatile= argument: %s", optarg);
return -EINVAL;
- }
+ } else
+ arg_volatile_mode = m;
}
- p = new(ExposePort, 1);
- if (!p)
- return log_oom();
-
- p->protocol = protocol;
- p->host_port = host_port;
- p->container_port = container_port;
+ arg_settings_mask |= SETTING_VOLATILE_MODE;
+ break;
- LIST_PREPEND(ports, arg_expose_ports, p);
+ case 'p':
+ r = expose_port_parse(&arg_expose_ports, optarg);
+ if (r == -EEXIST)
+ return log_error_errno(r, "Duplicate port specification: %s", optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
+ arg_settings_mask |= SETTING_EXPOSE_PORTS;
break;
- }
case ARG_PROPERTY:
if (strv_extend(&arg_property, optarg) < 0)
return -EINVAL;
}
+ arg_settings_mask |= SETTING_KILL_SIGNAL;
+ break;
+
+ case ARG_SETTINGS:
+
+ /* no → do not read files
+ * yes → read files, do not override cmdline, trust only subset
+ * override → read files, override cmdline, trust only subset
+ * trusted → read files, do not override cmdline, trust all
+ */
+
+ r = parse_boolean(optarg);
+ if (r < 0) {
+ if (streq(optarg, "trusted")) {
+ mask_all_settings = false;
+ mask_no_settings = false;
+ arg_settings_trusted = true;
+
+ } else if (streq(optarg, "override")) {
+ mask_all_settings = false;
+ mask_no_settings = true;
+ arg_settings_trusted = -1;
+ } else
+ return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
+ } else if (r > 0) {
+ /* yes */
+ mask_all_settings = false;
+ mask_no_settings = false;
+ arg_settings_trusted = -1;
+ } else {
+ /* no */
+ mask_all_settings = true;
+ mask_no_settings = false;
+ arg_settings_trusted = false;
+ }
+
break;
case '?':
return -EINVAL;
}
- if (arg_volatile != VOLATILE_NO && arg_read_only) {
+ if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
+ return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
+
+ if (argc > optind) {
+ arg_parameters = strv_copy(argv + optind);
+ if (!arg_parameters)
+ return log_oom();
+
+ arg_settings_mask |= SETTING_BOOT;
+ }
+
+ /* Load all settings from .nspawn files */
+ if (mask_no_settings)
+ arg_settings_mask = 0;
+
+ /* Don't load any settings from .nspawn files */
+ if (mask_all_settings)
+ arg_settings_mask = _SETTINGS_MASK_ALL;
+
+ arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
+
+ r = detect_unified_cgroup_hierarchy();
+ if (r < 0)
+ return r;
+
+ return 1;
+}
+
+static int verify_arguments(void) {
+
+ if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
return -EINVAL;
}
return -EINVAL;
}
- if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
- return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
-
- arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
-
if (arg_boot && arg_kill_signal <= 0)
arg_kill_signal = SIGRTMIN+3;
- return 1;
+ return 0;
}
static int tmpfs_patch_options(const char *options, char **ret) {
{ "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
{ NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
{ "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
- { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
{ "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
{ "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
return 0;
}
+/* Apply a comma-separated bind mount option string to *mount_flags.
+ *
+ * "rbind" sets MS_REC and "norbind" clears it; any other word is logged
+ * and rejected with -EINVAL. *mount_flags is only written when the whole
+ * string was accepted. *mount_opts is currently always set to NULL on
+ * success. Returns 0 on success. */
+static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
+        unsigned long flags = *mount_flags;
+        const char *cursor = options;
+        char *opts = NULL;
+        int r;
+
+        assert(options);
+
+        do {
+                _cleanup_free_ char *word = NULL;
+
+                r = extract_first_word(&cursor, &word, ",", 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to extract mount option: %m");
+
+                if (r > 0) {
+                        if (streq(word, "norbind"))
+                                flags &= ~MS_REC;
+                        else if (streq(word, "rbind"))
+                                flags |= MS_REC;
+                        else {
+                                log_error("Invalid bind mount option: %s", word);
+                                return -EINVAL;
+                        }
+                }
+        } while (r > 0);
+
+        /* in the future mount_opts will hold string options for mount(2) */
+        *mount_opts = opts;
+        *mount_flags = flags;
+
+        return 0;
+}
+
static int mount_bind(const char *dest, CustomMount *m) {
struct stat source_st, dest_st;
const char *where;
+ unsigned long mount_flags = MS_BIND | MS_REC;
+ _cleanup_free_ char *mount_opts = NULL;
int r;
assert(m);
+ if (m->options) {
+ r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
+ if (r < 0)
+ return r;
+ }
+
if (stat(m->source, &source_st) < 0)
return log_error_errno(errno, "Failed to stat %s: %m", m->source);
if (r < 0 && r != -EEXIST)
return log_error_errno(r, "Failed to create mount point %s: %m", where);
- if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
+ if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
return log_error_errno(errno, "mount(%s) failed: %m", where);
if (m->read_only) {
return 0;
}
-static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
+static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
char *to;
int r;
return 1;
}
-static int mount_cgroup(const char *dest) {
+static int mount_legacy_cgroups(const char *dest) {
_cleanup_set_free_free_ Set *controllers = NULL;
const char *cgroup_root;
int r;
+ cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
+
+ /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
+ r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
+ if (r == 0) {
+ _cleanup_free_ char *options = NULL;
+
+ r = tmpfs_patch_options("mode=755", &options);
+ if (r < 0)
+ return log_oom();
+
+ if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
+ return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
+ }
+
+ if (cg_unified() > 0)
+ goto skip_controllers;
+
controllers = set_new(&string_hash_ops);
if (!controllers)
return log_oom();
if (r == -EINVAL) {
/* Not a symbolic link, but directly a single cgroup hierarchy */
- r = mount_cgroup_hierarchy(dest, controller, controller, true);
+ r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
if (r < 0)
return r;
continue;
}
- r = mount_cgroup_hierarchy(dest, combined, combined, true);
+ r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
if (r < 0)
return r;
}
}
- r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
+skip_controllers:
+ r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
if (r < 0)
return r;
- cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
return 0;
}
+/* Make sure the unified cgroup hierarchy is mounted on
+ * <dest>/sys/fs/cgroup. If something is already mounted there it is
+ * accepted only when it contains a "cgroup.procs" file; otherwise the
+ * function refuses with -EINVAL.
+ *
+ * Returns 0 on success, -EINVAL on refusal, or whatever
+ * log_error_errno() returns for the failing call (presumably the
+ * negative error code passed to it). */
+static int mount_unified_cgroups(const char *dest) {
+        const char *p, *procs;
+        int r;
+
+        assert(dest);
+
+        p = strjoina(dest, "/sys/fs/cgroup");
+
+        r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
+        if (r > 0) {
+                /* Probe through a separate variable, so that the messages
+                 * below name the mount point rather than the probe file. */
+                procs = strjoina(dest, "/sys/fs/cgroup/cgroup.procs");
+                if (access(procs, F_OK) >= 0)
+                        return 0;
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
+
+                log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
+                return -EINVAL;
+        }
+
+        if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0)
+                return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
+
+        return 0;
+}
+
+/* Mount the cgroup file systems below dest, picking the unified or the
+ * legacy layout depending on arg_unified_cgroup_hierarchy. Returns the
+ * result of the chosen helper. */
+static int mount_cgroups(const char *dest) {
+        return arg_unified_cgroup_hierarchy
+                ? mount_unified_cgroups(dest)
+                : mount_legacy_cgroups(dest);
+}
+
static int mount_systemd_cgroup_writable(const char *dest) {
_cleanup_free_ char *own_cgroup_path = NULL;
const char *systemd_root, *systemd_own;
if (r < 0)
return log_error_errno(r, "Failed to determine our own cgroup path: %m");
+ /* If we are living in the top-level, then there's nothing to do... */
+ if (path_equal(own_cgroup_path, "/"))
+ return 0;
+
+ if (arg_unified_cgroup_hierarchy) {
+ systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
+ systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
+ } else {
+ systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
+ systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
+ }
+
/* Make our own cgroup a (writable) bind mount */
- systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
/* And then remount the systemd cgroup root read-only */
- systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
assert(directory);
- if (arg_volatile != VOLATILE_STATE)
+ if (arg_volatile_mode != VOLATILE_STATE)
return 0;
/* --volatile=state means we simply overmount /var
assert(directory);
- if (arg_volatile != VOLATILE_YES)
+ if (arg_volatile_mode != VOLATILE_YES)
return 0;
/* --volatile=yes means we mount a tmpfs to the root dir, and
return 0;
}
+/* Unlink and free every entry of the port list whose head is p. Passing
+ * NULL is a no-op. */
+void expose_port_free_all(ExposePort *p) {
+        ExposePort *head = p;
+
+        for (;;) {
+                ExposePort *entry = head;
+
+                if (!entry)
+                        break;
+
+                /* Always drop the current head until the list is empty. */
+                LIST_REMOVE(ports, head, entry);
+                free(entry);
+        }
+}
+
static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
union in_addr_union *exposed = userdata;
bool secondary,
int kmsg_socket,
int rtnl_socket,
- FDSet *fds,
- int argc,
- char *argv[]) {
+ FDSet *fds) {
_cleanup_free_ char *home = NULL;
unsigned n_env = 2;
assert(directory);
assert(kmsg_socket >= 0);
+ cg_unified_flush();
+
if (arg_userns) {
/* Tell the parent, that it now can write the UID map. */
(void) barrier_place(barrier); /* #1 */
/* Automatically search for the init system */
- m = 1 + argc - optind;
+ m = 1 + strv_length(arg_parameters);
a = newa(char*, m + 1);
- memcpy(a + 1, argv + optind, m * sizeof(char*));
+ if (strv_isempty(arg_parameters))
+ a[1] = NULL;
+ else
+ memcpy(a + 1, arg_parameters, m * sizeof(char*));
a[0] = (char*) "/usr/lib/systemd/systemd";
execve(a[0], a, env_use);
a[0] = (char*) "/sbin/init";
execve(a[0], a, env_use);
- } else if (argc > optind)
- execvpe(argv[optind], argv + optind, env_use);
+ } else if (!strv_isempty(arg_parameters))
+ execvpe(arg_parameters[0], arg_parameters, env_use);
else {
- chdir(home ? home : "/root");
+ chdir(home ?: "/root");
execle("/bin/bash", "-bash", NULL, env_use);
execle("/bin/sh", "-sh", NULL, env_use);
}
int kmsg_socket,
int rtnl_socket,
int uid_shift_socket,
- FDSet *fds,
- int argc,
- char *argv[]) {
+ FDSet *fds) {
pid_t pid;
ssize_t l;
assert(pid_socket >= 0);
assert(kmsg_socket >= 0);
+ cg_unified_flush();
+
if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
if (r < 0)
return r;
- r = mount_cgroup(directory);
+ r = mount_cgroups(directory);
if (r < 0)
return r;
NULL);
if (pid < 0)
return log_error_errno(errno, "Failed to fork inner child: %m");
-
if (pid == 0) {
pid_socket = safe_close(pid_socket);
uid_shift_socket = safe_close(uid_shift_socket);
* requested, so that we all are owned by the user if
* user namespaces are turned on. */
- r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
+ r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
if (r < 0)
_exit(EXIT_FAILURE);
if (fd < 0)
return log_error_errno(errno, "Failed to open %s: %m", fs);
- FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
+ FOREACH_STRING(fn,
+ ".",
+ "tasks",
+ "notify_on_release",
+ "cgroup.procs",
+ "cgroup.clone_children",
+ "cgroup.controllers",
+ "cgroup.subtree_control",
+ "cgroup.populated")
if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
- log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
+ log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
+ "Failed to chown() cgroup file %s, ignoring: %m", fn);
+
+ return 0;
+}
+
+/* If the host and the container disagree on whether the unified cgroup
+ * hierarchy is used, place the process pid into the same cgroup path in
+ * the other hierarchy too. To do so, the other hierarchy is mounted
+ * temporarily on a fresh directory below /tmp, pid is written to the
+ * matching cgroup.procs file, and the mount and directory are removed
+ * again.
+ *
+ * Returns 0 when nothing needs to be done, otherwise the result of the
+ * last failing step or of writing the cgroup.procs file. */
+static int sync_cgroup(pid_t pid) {
+        _cleanup_free_ char *cgroup = NULL;
+        char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
+        bool undo_mount = false;
+        const char *fn;
+        int unified, r;
+
+        unified = cg_unified();
+        if (unified < 0)
+                return log_error_errno(unified, "Failed to determine whether the unified hierarchy is used: %m");
+
+        /* Host and container agree, nothing to synchronize. */
+        if ((unified > 0) == arg_unified_cgroup_hierarchy)
+                return 0;
+
+        /* When the host uses the legacy cgroup setup, but the
+         * container shall use the unified hierarchy, let's make sure
+         * we copy the path from the name=systemd hierarchy into the
+         * unified hierarchy. Similar for the reverse situation. */
+
+        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid);
+
+        /* In order to access the unified hierarchy we need to mount it */
+        if (!mkdtemp(tree))
+                return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
+
+        /* Mount whichever hierarchy the host is NOT using. */
+        if (unified)
+                r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
+        else
+                r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior");
+        if (r < 0) {
+                r = log_error_errno(errno, "Failed to mount unified hierarchy: %m");
+                goto finish;
+        }
+
+        undo_mount = true;
+
+        fn = strjoina(tree, cgroup, "/cgroup.procs");
+        (void) mkdir_parents(fn, 0755);
+
+        sprintf(pid_string, PID_FMT, pid);
+        r = write_string_file(fn, pid_string, 0);
+        if (r < 0)
+                log_error_errno(r, "Failed to move process: %m");
+
+finish:
+        if (undo_mount)
+                (void) umount(tree);
+
+        (void) rmdir(tree);
+        return r;
+}
+
+/* Split our cgroup into a "payload" subgroup, to which pid is attached,
+ * and a "supervisor" subgroup, to which we attach ourselves, and then
+ * try to enable the supported controllers for the cgroup.
+ *
+ * This is only done when arg_keep_unit is set and both the container
+ * (arg_unified_cgroup_hierarchy) and the host (cg_unified()) use the
+ * unified hierarchy; in every other case the function returns 0 without
+ * doing anything. Returns 0 on success, or the value returned by
+ * log_error_errno() for the failing step. */
+static int create_subcgroup(pid_t pid) {
+        _cleanup_free_ char *cgroup = NULL;
+        const char *child;
+        int unified, r;
+        CGroupMask supported;
+
+        /* In the unified hierarchy inner nodes may only contain
+         * subgroups, but not processes. Hence, if we are running in the
+         * unified hierarchy and the container does the same, and we
+         * did not create a scope unit for the container, move us and
+         * the container into two separate subcgroups. */
+
+        if (!arg_keep_unit)
+                return 0;
+
+        if (!arg_unified_cgroup_hierarchy)
+                return 0;
+
+        unified = cg_unified();
+        if (unified < 0)
+                return log_error_errno(unified, "Failed to determine whether the unified hierarchy is used: %m");
+        if (unified == 0)
+                return 0;
+
+        r = cg_mask_supported(&supported);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine supported controllers: %m");
+
+        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get our control group: %m");
+
+        child = strjoina(cgroup, "/payload");
+        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
+
+        child = strjoina(cgroup, "/supervisor");
+        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
+
+        /* Try to enable as many controllers as possible for the new payload. */
+        (void) cg_enable_everywhere(supported, supported, cgroup);
+        return 0;
+}
+
+/* Locate and load the "<arg_machine>.nspawn" settings file and merge its
+ * contents into the arg_* globals.
+ *
+ * The file is searched for in /etc/systemd/nspawn and /run/systemd/nspawn
+ * first, then next to arg_image or arg_directory. Unless
+ * arg_settings_trusted was decided before (>= 0), files found in the
+ * first two directories are trusted and a file next to the image or
+ * directory is not. Each group of settings is only taken over when its
+ * bit is not set in arg_settings_mask; capability additions, machine ID,
+ * custom mounts, network and port settings are additionally ignored
+ * (with a warning) when the file is not trusted.
+ *
+ * Returns 0 on success, also when no file exists or when all settings
+ * are masked, and a negative value on failure. */
+static int load_settings(void) {
+        _cleanup_(settings_freep) Settings *settings = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        _cleanup_free_ char *p = NULL;
+        const char *fn, *i;
+        int r;
+
+        /* If all settings are masked, there's no point in looking for
+         * the settings file */
+        if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
+                return 0;
+
+        fn = strjoina(arg_machine, ".nspawn");
+
+        /* We first look in the admin's directories in /etc and /run */
+        FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
+                _cleanup_free_ char *j = NULL;
+
+                j = strjoin(i, "/", fn, NULL);
+                if (!j)
+                        return log_oom();
+
+                f = fopen(j, "re");
+                if (f) {
+                        p = j;
+                        j = NULL;
+
+                        /* By default we trust configuration from /etc and /run */
+                        if (arg_settings_trusted < 0)
+                                arg_settings_trusted = true;
+
+                        break;
+                }
+
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Failed to open %s: %m", j);
+        }
+
+        if (!f) {
+                /* After that, let's look for a file next to the
+                 * actual image we shall boot. */
+
+                if (arg_image) {
+                        p = file_in_same_dir(arg_image, fn);
+                        if (!p)
+                                return log_oom();
+                } else if (arg_directory) {
+                        p = file_in_same_dir(arg_directory, fn);
+                        if (!p)
+                                return log_oom();
+                }
+
+                if (p) {
+                        f = fopen(p, "re");
+                        if (!f && errno != ENOENT)
+                                return log_error_errno(errno, "Failed to open %s: %m", p);
+
+                        /* By default we do not trust configuration from /var/lib/machines */
+                        if (arg_settings_trusted < 0)
+                                arg_settings_trusted = false;
+                }
+        }
+
+        if (!f)
+                return 0;
+
+        log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
+
+        r = settings_load(f, p, &settings);
+        if (r < 0)
+                return r;
+
+        /* Copy over bits from the settings, unless they have been
+         * explicitly masked by command line switches. */
+
+        if ((arg_settings_mask & SETTING_BOOT) == 0 &&
+            settings->boot >= 0) {
+                arg_boot = settings->boot;
+
+                strv_free(arg_parameters);
+                arg_parameters = settings->parameters;
+                settings->parameters = NULL;
+        }
+
+        if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
+            settings->environment) {
+                strv_free(arg_setenv);
+                arg_setenv = settings->environment;
+                settings->environment = NULL;
+        }
+
+        if ((arg_settings_mask & SETTING_USER) == 0 &&
+            settings->user) {
+                free(arg_user);
+                arg_user = settings->user;
+                settings->user = NULL;
+        }
+
+        if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
+
+                if (!arg_settings_trusted && settings->capability != 0)
+                        log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
+                else
+                        arg_retain |= settings->capability;
+
+                /* Dropping capabilities is honoured regardless of trust. */
+                arg_retain &= ~settings->drop_capability;
+        }
+
+        if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
+            settings->kill_signal > 0)
+                arg_kill_signal = settings->kill_signal;
+
+        if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
+            settings->personality != PERSONALITY_INVALID)
+                arg_personality = settings->personality;
+
+        if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
+            !sd_id128_is_null(settings->machine_id)) {
+
+                if (!arg_settings_trusted)
+                        log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
+                else
+                        arg_uuid = settings->machine_id;
+        }
+
+        if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
+            settings->read_only >= 0)
+                arg_read_only = settings->read_only;
+
+        if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
+            settings->volatile_mode != _VOLATILE_MODE_INVALID)
+                arg_volatile_mode = settings->volatile_mode;
+
+        if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
+            settings->n_custom_mounts > 0) {
+
+                if (!arg_settings_trusted)
+                        log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
+                else {
+                        custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
+                        arg_custom_mounts = settings->custom_mounts;
+                        arg_n_custom_mounts = settings->n_custom_mounts;
+
+                        settings->custom_mounts = NULL;
+                        settings->n_custom_mounts = 0;
+                }
+        }
+
+        if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
+            (settings->private_network >= 0 ||
+             settings->network_veth >= 0 ||
+             settings->network_bridge ||
+             settings->network_interfaces ||
+             settings->network_macvlan ||
+             settings->network_ipvlan)) {
+
+                if (!arg_settings_trusted)
+                        log_warning("Ignoring network settings, file %s is not trusted.", p);
+                else {
+                        /* A configured bridge implies veth. Evaluate this
+                         * before settings->network_bridge is moved out
+                         * (and reset to NULL) below. */
+                        arg_network_veth = settings->network_veth > 0 || settings->network_bridge;
+
+                        strv_free(arg_network_interfaces);
+                        arg_network_interfaces = settings->network_interfaces;
+                        settings->network_interfaces = NULL;
+
+                        strv_free(arg_network_macvlan);
+                        arg_network_macvlan = settings->network_macvlan;
+                        settings->network_macvlan = NULL;
+
+                        strv_free(arg_network_ipvlan);
+                        arg_network_ipvlan = settings->network_ipvlan;
+                        settings->network_ipvlan = NULL;
+
+                        free(arg_network_bridge);
+                        arg_network_bridge = settings->network_bridge;
+                        settings->network_bridge = NULL;
+
+                        arg_private_network = true; /* all these settings imply private networking */
+                }
+        }
+
+        if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
+            settings->expose_ports) {
+
+                if (!arg_settings_trusted)
+                        log_warning("Ignoring Port= setting, file %s is not trusted.", p);
+                else {
+                        expose_port_free_all(arg_expose_ports);
+                        arg_expose_ports = settings->expose_ports;
+                        settings->expose_ports = NULL;
+                }
+        }
return 0;
}
if (r <= 0)
goto finish;
- r = determine_names();
- if (r < 0)
- goto finish;
-
if (geteuid() != 0) {
log_error("Need to be root.");
r = -EPERM;
goto finish;
}
+ r = determine_names();
+ if (r < 0)
+ goto finish;
+
+ r = load_settings();
+ if (r < 0)
+ goto finish;
+
+ r = verify_arguments();
+ if (r < 0)
+ goto finish;
n_fd_passed = sd_listen_fds(false);
if (n_fd_passed > 0) {
kmsg_socket_pair[1],
rtnl_socket_pair[1],
uid_shift_socket_pair[1],
- fds,
- argc, argv);
+ fds);
if (r < 0)
_exit(EXIT_FAILURE);
if (r < 0)
goto finish;
+ r = sync_cgroup(pid);
+ if (r < 0)
+ goto finish;
+
+ r = create_subcgroup(pid);
+ if (r < 0)
+ goto finish;
+
r = chown_cgroup(pid);
if (r < 0)
goto finish;
(void) rm_rf(p, REMOVE_ROOT);
}
+ flush_ports(&exposed);
+
free(arg_directory);
free(arg_template);
free(arg_image);
free(arg_machine);
free(arg_user);
strv_free(arg_setenv);
+ free(arg_network_bridge);
strv_free(arg_network_interfaces);
strv_free(arg_network_macvlan);
strv_free(arg_network_ipvlan);
- custom_mount_free_all();
-
- flush_ports(&exposed);
-
- while (arg_expose_ports) {
- ExposePort *p = arg_expose_ports;
- LIST_REMOVE(ports, arg_expose_ports, p);
- free(p);
- }
+ strv_free(arg_parameters);
+ custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
+ expose_port_free_all(arg_expose_ports);
return r < 0 ? EXIT_FAILURE : ret;
}