#include "sd-daemon.h"
#include "sd-bus.h"
#include "sd-id128.h"
-#include "sd-rtnl.h"
+#include "sd-netlink.h"
+#include "random-util.h"
#include "log.h"
#include "util.h"
#include "mkdir.h"
#include "bus-error.h"
#include "ptyfwd.h"
#include "env-util.h"
-#include "rtnl-util.h"
+#include "netlink-util.h"
#include "udev-util.h"
#include "blkid-util.h"
#include "gpt.h"
#include "machine-image.h"
#include "list.h"
#include "in-addr-util.h"
-#include "fw-util.h"
+#include "firewall-util.h"
#include "local-addresses.h"
+#include "formats-util.h"
+#include "process-util.h"
+#include "terminal-util.h"
+#include "hostname-util.h"
+#include "signal-util.h"
#ifdef HAVE_SECCOMP
#include "seccomp-util.h"
VOLATILE_STATE,
} Volatile;
+typedef enum CustomMountType { /* Kind of a custom mount; custom_mount_compare() also uses the numeric order as tie-breaker. */
+ CUSTOM_MOUNT_BIND, /* applied by mount_bind() */
+ CUSTOM_MOUNT_TMPFS, /* applied by mount_tmpfs() */
+ CUSTOM_MOUNT_OVERLAY, /* applied by mount_overlay() */
+} CustomMountType;
+
+typedef struct CustomMount { /* One element of arg_custom_mounts: a single bind, tmpfs or overlay mount to set up. */
+ CustomMountType type;
+ bool read_only; /* set for --bind-ro= and --overlay-ro= */
+ char *source; /* for bind mounts the path that is mounted; for overlayfs this is the upper directory */
+ char *destination; /* absolute path; prefixed with the container root directory before mounting */
+ char *options; /* bind: comma-separated list understood by parse_mount_bind_options(); tmpfs: mount option string */
+ char *work_dir; /* overlayfs work directory; removed again by custom_mount_free_all() */
+ char **lower; /* overlayfs lower directories, in the order given on the command line */
+} CustomMount;
+
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_user = NULL;
(1ULL << CAP_AUDIT_WRITE) |
(1ULL << CAP_AUDIT_CONTROL) |
(1ULL << CAP_MKNOD);
-static char **arg_bind = NULL;
-static char **arg_bind_ro = NULL;
-static char **arg_tmpfs = NULL;
+static CustomMount *arg_custom_mounts = NULL;
+static unsigned arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_share_system = false;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static const char *arg_network_bridge = NULL;
-static unsigned long arg_personality = 0xffffffffLU;
+static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
static Volatile arg_volatile = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns = false;
static int arg_kill_signal = 0;
+static bool arg_unified_cgroup_hierarchy = false;
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
" --uuid=UUID Set a specific machine UUID for the container\n"
" -S --slice=SLICE Place the container in the specified slice\n"
" --property=NAME=VALUE Set scope unit property\n"
+ " --private-users[=UIDBASE[:NUIDS]]\n"
+ " Run within user namespace\n"
" --private-network Disable network in container\n"
" --network-interface=INTERFACE\n"
" Assign an existing network interface to the\n"
" Add a virtual ethernet connection between host\n"
" and container and add it to an existing bridge on\n"
" the host\n"
- " --private-users[=UIDBASE[:NUIDS]]\n"
- " Run within user namespace\n"
" -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
" Expose a container IP port on the host\n"
" -Z --selinux-context=SECLABEL\n"
" try-guest, try-host\n"
" -j Equivalent to --link-journal=try-guest\n"
" --read-only Mount the root directory read-only\n"
- " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
+ " --bind=PATH[:PATH[:OPTIONS]]\n"
+ " Bind mount a file or directory from the host into\n"
" the container\n"
- " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
+ " --bind-ro=PATH[:PATH[:OPTIONS]\n"
+ " Similar, but creates a read-only bind mount\n"
" --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
+ " --overlay=PATH[:PATH...]:PATH\n"
+ " Create an overlay mount from the host to \n"
+ " the container\n"
+ " --overlay-ro=PATH[:PATH...]:PATH\n"
+ " Similar, but creates a read-only overlay mount\n"
" --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
" --share-system Share system namespaces with host\n"
" --register=BOOLEAN Register container as machine\n"
, program_invocation_short_name);
}
+/* Grows arg_custom_mounts by one element and returns a pointer to the new
+ * entry, which is zero-initialized except for its type t. Returns NULL if the
+ * array could not be enlarged; the existing array is left as it was then. */
+static CustomMount* custom_mount_add(CustomMountType t) {
+        CustomMount *grown, *slot;
+
+        grown = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
+        if (!grown)
+                return NULL;
+
+        /* realloc() may have moved the array, hence store the new base before indexing */
+        arg_custom_mounts = grown;
+
+        slot = &arg_custom_mounts[arg_n_custom_mounts++];
+        *slot = (CustomMount) { .type = t };
+
+        return slot;
+}
+
+/* Frees every entry of arg_custom_mounts, removes the work directories
+ * recorded in them, and resets the array to empty. */
+static void custom_mount_free_all(void) {
+        unsigned k = 0;
+
+        while (k < arg_n_custom_mounts) {
+                CustomMount *entry = &arg_custom_mounts[k++];
+
+                free(entry->source);
+                free(entry->destination);
+                free(entry->options);
+
+                /* Removal is best effort, its result is deliberately ignored */
+                if (entry->work_dir) {
+                        (void) rm_rf(entry->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
+                        free(entry->work_dir);
+                }
+
+                strv_free(entry->lower);
+        }
+
+        arg_n_custom_mounts = 0;
+        arg_custom_mounts = mfree(arg_custom_mounts);
+}
+
+/* qsort()-style comparator for CustomMount entries: orders by destination
+ * path first and, for equal destinations, by mount type. */
+static int custom_mount_compare(const void *a, const void *b) {
+        const CustomMount *left = a, *right = b;
+        int by_path;
+
+        by_path = path_compare(left->destination, right->destination);
+        if (by_path != 0)
+                return by_path;
+
+        /* Evaluates to -1, 0 or 1 */
+        return (left->type > right->type) - (left->type < right->type);
+}
+
+/* Sorts the custom mounts and picks a work directory path for each writable
+ * overlay mount that has none yet. Refuses custom mounts on "/" when
+ * --private-users is used without an explicit UID shift. Returns 0 on
+ * success, a negative errno-style value on failure. */
+static int custom_mounts_prepare(void) {
+        unsigned idx;
+
+        /* Ensure the mounts are applied prefix first. */
+        qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
+
+        for (idx = 0; idx < arg_n_custom_mounts; idx++) {
+                CustomMount *cm = &arg_custom_mounts[idx];
+                int q;
+
+                if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(cm->destination, "/")) {
+                        log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
+                        return -EINVAL;
+                }
+
+                /* Only writable overlay mounts that lack a work directory need one allocated */
+                if (cm->type != CUSTOM_MOUNT_OVERLAY || cm->work_dir || cm->read_only)
+                        continue;
+
+                q = tempfn_random(cm->source, NULL, &cm->work_dir);
+                if (q < 0)
+                        return log_error_errno(q, "Failed to generate work directory from %s: %m", cm->source);
+        }
+
+        return 0;
+}
+
static int set_sanitized_path(char **b, const char *path) {
char *p;
return 0;
}
+/* Decides whether the unified cgroup hierarchy shall be used and stores the
+ * answer in arg_unified_cgroup_hierarchy. $UNIFIED_CGROUP_HIERARCHY takes
+ * precedence if set; otherwise the result of cg_unified() is used. Returns 0
+ * on success, a negative errno-style value on failure. */
+static int detect_unified_cgroup_hierarchy(void) {
+        const char *env;
+        int v;
+
+        env = getenv("UNIFIED_CGROUP_HIERARCHY");
+        if (env) {
+                /* Allow the user to control whether the unified hierarchy is used */
+                v = parse_boolean(env);
+                if (v < 0)
+                        return log_error_errno(v, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
+        } else {
+                /* Otherwise inherit the default from the host system */
+                v = cg_unified();
+                if (v < 0)
+                        return log_error_errno(v, "Failed to determine whether the unified cgroups hierarchy is used: %m");
+        }
+
+        arg_unified_cgroup_hierarchy = v;
+        return 0;
+}
+
static int parse_argv(int argc, char *argv[]) {
enum {
ARG_BIND,
ARG_BIND_RO,
ARG_TMPFS,
+ ARG_OVERLAY,
+ ARG_OVERLAY_RO,
ARG_SETENV,
ARG_SHARE_SYSTEM,
ARG_REGISTER,
{ "bind", required_argument, NULL, ARG_BIND },
{ "bind-ro", required_argument, NULL, ARG_BIND_RO },
{ "tmpfs", required_argument, NULL, ARG_TMPFS },
+ { "overlay", required_argument, NULL, ARG_OVERLAY },
+ { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
{ "machine", required_argument, NULL, 'M' },
{ "slice", required_argument, NULL, 'S' },
{ "setenv", required_argument, NULL, ARG_SETENV },
break;
case 'u':
- free(arg_user);
- arg_user = strdup(optarg);
- if (!arg_user)
+ r = free_and_strdup(&arg_user, optarg);
+ if (r < 0)
return log_oom();
break;
break;
case 'M':
- if (isempty(optarg)) {
- free(arg_machine);
- arg_machine = NULL;
- } else {
+ if (isempty(optarg))
+ arg_machine = mfree(arg_machine);
+ else {
if (!machine_name_is_valid(optarg)) {
log_error("Invalid machine name: %s", optarg);
return -EINVAL;
case ARG_BIND:
case ARG_BIND_RO: {
- _cleanup_free_ char *a = NULL, *b = NULL;
- char *e;
- char ***x;
-
- x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
-
- e = strchr(optarg, ':');
- if (e) {
- a = strndup(optarg, e - optarg);
- b = strdup(e + 1);
- } else {
- a = strdup(optarg);
- b = strdup(optarg);
+ const char *current = optarg;
+ _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
+ CustomMount *m;
+
+ r = extract_many_words(&current, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, &opts, NULL);
+ switch (r) {
+ case 1:
+ destination = strdup(source);
+ case 2:
+ case 3:
+ break;
+ case -ENOMEM:
+ return log_oom();
+ default:
+ log_error("Invalid bind mount specification: %s", optarg);
+ return -EINVAL;
}
- if (!a || !b)
+ if (!source || !destination)
return log_oom();
- if (!path_is_absolute(a) || !path_is_absolute(b)) {
+ if (!path_is_absolute(source) || !path_is_absolute(destination)) {
log_error("Invalid bind mount specification: %s", optarg);
return -EINVAL;
}
- r = strv_extend(x, a);
- if (r < 0)
+ m = custom_mount_add(CUSTOM_MOUNT_BIND);
+ if (!m)
return log_oom();
- r = strv_extend(x, b);
- if (r < 0)
- return log_oom();
+ m->source = source;
+ m->destination = destination;
+ m->read_only = c == ARG_BIND_RO;
+ m->options = opts;
+
+ source = destination = opts = NULL;
break;
}
case ARG_TMPFS: {
- _cleanup_free_ char *a = NULL, *b = NULL;
- char *e;
+ const char *current = optarg;
+ _cleanup_free_ char *path = NULL, *opts = NULL;
+ CustomMount *m;
- e = strchr(optarg, ':');
- if (e) {
- a = strndup(optarg, e - optarg);
- b = strdup(e + 1);
- } else {
- a = strdup(optarg);
- b = strdup("mode=0755");
+ r = extract_first_word(&current, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (r == -ENOMEM)
+ return log_oom();
+ else if (r < 0) {
+ log_error("Invalid tmpfs specification: %s", optarg);
+ return r;
}
+ if (r)
+ opts = strdup(current);
+ else
+ opts = strdup("mode=0755");
- if (!a || !b)
+ if (!path || !opts)
return log_oom();
- if (!path_is_absolute(a)) {
+ if (!path_is_absolute(path)) {
log_error("Invalid tmpfs specification: %s", optarg);
return -EINVAL;
}
- r = strv_push(&arg_tmpfs, a);
- if (r < 0)
+ m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
+ if (!m)
return log_oom();
- a = NULL;
+ m->destination = path;
+ m->options = opts;
- r = strv_push(&arg_tmpfs, b);
- if (r < 0)
+ path = opts = NULL;
+
+ break;
+ }
+
+ case ARG_OVERLAY:
+ case ARG_OVERLAY_RO: {
+ _cleanup_free_ char *upper = NULL, *destination = NULL;
+ _cleanup_strv_free_ char **lower = NULL;
+ CustomMount *m;
+ unsigned n = 0;
+ char **i;
+
+ r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (r == -ENOMEM)
+ return log_oom();
+ else if (r < 0) {
+ log_error("Invalid overlay specification: %s", optarg);
+ return r;
+ }
+
+ STRV_FOREACH(i, lower) {
+ if (!path_is_absolute(*i)) {
+ log_error("Overlay path %s is not absolute.", *i);
+ return -EINVAL;
+ }
+
+ n++;
+ }
+
+ if (n < 2) {
+ log_error("--overlay= needs at least two colon-separated directories specified.");
+ return -EINVAL;
+ }
+
+ if (n == 2) {
+ /* If two parameters are specified,
+ * the first one is the lower, the
+ * second one the upper directory. And
+ * we'll also define the destination
+ * mount point the same as the upper. */
+ upper = lower[1];
+ lower[1] = NULL;
+
+ destination = strdup(upper);
+ if (!destination)
+ return log_oom();
+
+ } else {
+ upper = lower[n - 2];
+ destination = lower[n - 1];
+ lower[n - 2] = NULL;
+ }
+
+ m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
+ if (!m)
return log_oom();
- b = NULL;
+ m->destination = destination;
+ m->source = upper;
+ m->lower = lower;
+ m->read_only = c == ARG_OVERLAY_RO;
+
+ upper = destination = NULL;
+ lower = NULL;
break;
}
case ARG_PERSONALITY:
arg_personality = personality_from_string(optarg);
- if (arg_personality == 0xffffffffLU) {
+ if (arg_personality == PERSONALITY_INVALID) {
log_error("Unknown or unsupported personality '%s'.", optarg);
return -EINVAL;
}
return -EINVAL;
}
+ if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
+ return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
+
arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
if (arg_boot && arg_kill_signal <= 0)
arg_kill_signal = SIGRTMIN+3;
+ r = detect_unified_cgroup_hierarchy();
+ if (r < 0)
+ return r;
+
return 1;
}
-static int mount_all(const char *dest) {
+/* Extends a tmpfs mount option string with what the current configuration
+ * requires: uid=/gid= set to arg_uid_shift when user namespacing with a
+ * non-zero shift is enabled, and (when built with SELinux support) context=
+ * when arg_selinux_apifs_context is set.
+ *
+ * options may be NULL. Returns 1 and stores a newly allocated string in *ret
+ * if something was added. Returns 0 and sets *ret to NULL if nothing had to
+ * be changed; the caller must then keep using its own options string.
+ * Returns -ENOMEM on allocation failure, in which case *ret is not written. */
+static int tmpfs_patch_options(const char *options, char **ret) {
+        char *buf = NULL;
+
+        if (arg_userns && arg_uid_shift != 0) {
+                int k;
+
+                assert(arg_uid_shift != UID_INVALID);
+
+                if (options)
+                        k = asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
+                else
+                        k = asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
+
+                /* The buffer contents are undefined after a failed asprintf(), hence
+                 * check its return value rather than the pointer. */
+                if (k < 0)
+                        return -ENOMEM;
+
+                options = buf;
+        }
+
+#ifdef HAVE_SELINUX
+        if (arg_selinux_apifs_context) {
+                char *t;
+
+                if (options)
+                        t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
+                else
+                        t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
+
+                /* options may alias buf; it is not needed anymore once the join was attempted */
+                free(buf);
+                if (!t)
+                        return -ENOMEM;
+
+                buf = t;
+        }
+#endif
+
+        *ret = buf;
+        return !!buf;
+}
+
+static int mount_all(const char *dest, bool userns) {
typedef struct MountPoint {
const char *what;
const char *options;
unsigned long flags;
bool fatal;
+ bool userns;
} MountPoint;
static const MountPoint mount_table[] = {
- { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
- { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
- { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
- { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
- { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
- { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
- { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
- { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
- { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
+ { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
+ { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
+ { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
+ { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
+ { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
+ { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
+ { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
+ { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
#ifdef HAVE_SELINUX
- { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
- { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
+ { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
+ { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
#endif
};
unsigned k;
- int r = 0;
+ int r;
for (k = 0; k < ELEMENTSOF(mount_table); k++) {
_cleanup_free_ char *where = NULL, *options = NULL;
const char *o;
- int t;
- where = strjoin(dest, "/", mount_table[k].where, NULL);
+ if (userns != mount_table[k].userns)
+ continue;
+
+ where = prefix_root(dest, mount_table[k].where);
if (!where)
return log_oom();
- t = path_is_mount_point(where, true);
- if (t < 0) {
- log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
-
- if (r == 0)
- r = t;
-
- continue;
- }
+ r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
+ if (r < 0 && r != -ENOENT)
+ return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
/* Skip this entry if it is not a remount. */
- if (mount_table[k].what && t > 0)
+ if (mount_table[k].what && r > 0)
continue;
- t = mkdir_p(where, 0755);
- if (t < 0) {
- if (mount_table[k].fatal) {
- log_error_errno(t, "Failed to create directory %s: %m", where);
-
- if (r == 0)
- r = t;
- } else
- log_warning_errno(t, "Failed to create directory %s: %m", where);
+ r = mkdir_p(where, 0755);
+ if (r < 0) {
+ if (mount_table[k].fatal)
+ return log_error_errno(r, "Failed to create directory %s: %m", where);
+ log_warning_errno(r, "Failed to create directory %s: %m", where);
continue;
}
-#ifdef HAVE_SELINUX
- if (arg_selinux_apifs_context &&
- (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
- options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
- if (!options)
- return log_oom();
-
- o = options;
- } else
-#endif
- o = mount_table[k].options;
-
- if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
- char *uid_options = NULL;
-
- if (o)
- asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
- else
- asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
- if (!uid_options)
+ o = mount_table[k].options;
+ if (streq_ptr(mount_table[k].type, "tmpfs")) {
+ r = tmpfs_patch_options(o, &options);
+ if (r < 0)
return log_oom();
-
- free(options);
- o = options = uid_options;
+ if (r > 0)
+ o = options;
}
if (mount(mount_table[k].what,
mount_table[k].flags,
o) < 0) {
- if (mount_table[k].fatal) {
- log_error_errno(errno, "mount(%s) failed: %m", where);
+ if (mount_table[k].fatal)
+ return log_error_errno(errno, "mount(%s) failed: %m", where);
- if (r == 0)
- r = -errno;
- } else
- log_warning_errno(errno, "mount(%s) failed: %m", where);
+ log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
}
}
- return r;
+ return 0;
}
-static int mount_binds(const char *dest, char **l, bool ro) {
- char **x, **y;
+static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) { /*
+ * Parses the comma-separated option list of a bind mount. "rbind" sets and
+ * "norbind" clears MS_REC in the flags; any other word is rejected with
+ * -EINVAL. *mount_flags and *mount_opts are only written on success, and
+ * *mount_opts is currently always set to NULL. Returns 0 on success, a
+ * negative errno-style value on failure.
+ */
+ const char *p = options;
+ unsigned long flags = *mount_flags; /* work on a copy, so that the caller's flags stay untouched on failure */
+ char *opts = NULL;
- STRV_FOREACH_PAIR(x, y, l) {
- _cleanup_free_ char *where = NULL;
- struct stat source_st, dest_st;
- int r;
+ assert(options);
- if (stat(*x, &source_st) < 0)
- return log_error_errno(errno, "Failed to stat %s: %m", *x);
+ for (;;) {
+ _cleanup_free_ char *word = NULL;
+ int r = extract_first_word(&p, &word, ",", 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to extract mount option: %m");
+ if (r == 0) /* no further words */
+ break;
- where = strappend(dest, *y);
- if (!where)
- return log_oom();
+ if (streq(word, "rbind"))
+ flags |= MS_REC;
+ else if (streq(word, "norbind"))
+ flags &= ~MS_REC;
+ else {
+ log_error("Invalid bind mount option: %s", word);
+ return -EINVAL;
+ }
+ }
- r = stat(where, &dest_st);
- if (r == 0) {
- if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
- log_error("Cannot bind mount directory %s on file %s.", *x, where);
- return -EINVAL;
- }
- if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
- log_error("Cannot bind mount file %s on directory %s.", *x, where);
- return -EINVAL;
- }
- } else if (errno == ENOENT) {
- r = mkdir_parents_label(where, 0755);
- if (r < 0)
- return log_error_errno(r, "Failed to bind mount %s: %m", *x);
- } else {
- log_error_errno(errno, "Failed to bind mount %s: %m", *x);
- return -errno;
+ *mount_flags = flags;
+ /* in the future mount_opts will hold string options for mount(2) */
+ *mount_opts = opts;
+
+ return 0;
+}
+
+static int mount_bind(const char *dest, CustomMount *m) { /*
+ * Bind mounts m->source onto m->destination below the root directory dest.
+ * The mount point is created if it is missing, as a directory or as a file
+ * depending on what the source is. If m->read_only is set the mount is
+ * remounted read-only afterwards. Returns 0 on success, a negative
+ * errno-style value on failure.
+ */
+ struct stat source_st, dest_st;
+ const char *where;
+ unsigned long mount_flags = MS_BIND | MS_REC; /* recursive unless the options say "norbind" */
+ _cleanup_free_ char *mount_opts = NULL;
+ int r;
+
+ assert(m);
+
+ if (m->options) {
+ r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
+ if (r < 0)
+ return r;
+ }
+
+ if (stat(m->source, &source_st) < 0)
+ return log_error_errno(errno, "Failed to stat %s: %m", m->source);
+
+ where = prefix_roota(dest, m->destination);
+
+ if (stat(where, &dest_st) >= 0) { /* destination exists: source and destination must both be directories, or both be non-directories */
+ if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
+ log_error("Cannot bind mount directory %s on file %s.", m->source, where);
+ return -EINVAL;
}
- /* Create the mount point. Any non-directory file can be
- * mounted on any non-directory file (regular, fifo, socket,
- * char, block).
- */
- if (S_ISDIR(source_st.st_mode)) {
- r = mkdir_label(where, 0755);
- if (r < 0 && errno != EEXIST)
- return log_error_errno(r, "Failed to create mount point %s: %m", where);
- } else {
- r = touch(where);
- if (r < 0)
- return log_error_errno(r, "Failed to create mount point %s: %m", where);
+ if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
+ log_error("Cannot bind mount file %s on directory %s.", m->source, where);
+ return -EINVAL;
}
- if (mount(*x, where, NULL, MS_BIND, NULL) < 0)
- return log_error_errno(errno, "mount(%s) failed: %m", where);
+ } else if (errno == ENOENT) { /* destination is missing: create its parent directories first */
+ r = mkdir_parents_label(where, 0755);
+ if (r < 0)
+ return log_error_errno(r, "Failed to make parents of %s: %m", where);
+ } else {
+ log_error_errno(errno, "Failed to stat %s: %m", where);
+ return -errno;
+ }
- if (ro) {
- r = bind_remount_recursive(where, true);
- if (r < 0)
- return log_error_errno(r, "Read-Only bind mount failed: %m");
- }
+ /* Create the mount point. Any non-directory file can be
+ * mounted on any non-directory file (regular, fifo, socket,
+ * char, block).
+ */
+ if (S_ISDIR(source_st.st_mode))
+ r = mkdir_label(where, 0755);
+ else
+ r = touch(where);
+ if (r < 0 && r != -EEXIST) /* an already existing mount point is fine */
+ return log_error_errno(r, "Failed to create mount point %s: %m", where);
+
+ if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
+ return log_error_errno(errno, "mount(%s) failed: %m", where);
+
+ if (m->read_only) { /* read-only is applied in a separate step, after the bind mount is in place */
+ r = bind_remount_recursive(where, true);
+ if (r < 0)
+ return log_error_errno(r, "Read-only bind mount failed: %m");
}
return 0;
}
-static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
- char *to;
+static int mount_tmpfs(const char *dest, CustomMount *m) { /*
+ * Mounts a new tmpfs on m->destination below the root directory dest, after
+ * creating the mount point with mkdir_p_label(). The mount options are
+ * m->options, extended by tmpfs_patch_options() where that applies.
+ * Returns 0 on success, a negative errno-style value on failure.
+ */
+ const char *where, *options;
+ _cleanup_free_ char *buf = NULL;
int r;
- to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
+ assert(dest);
+ assert(m);
+
+ where = prefix_roota(dest, m->destination);
- r = path_is_mount_point(to, false);
+ r = mkdir_p_label(where, 0755);
+ if (r < 0 && r != -EEXIST)
+ return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
+
+ r = tmpfs_patch_options(m->options, &buf);
if (r < 0)
- return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
- if (r > 0)
- return 0;
+ return log_oom();
+ options = r > 0 ? buf : m->options; /* buf is only set if something was patched in; otherwise use the options as given */
- mkdir_p(to, 0755);
+ if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
+ return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
- /* The superblock mount options of the mount point need to be
- * identical to the hosts', and hence writable... */
- if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
- return log_error_errno(errno, "Failed to mount to %s: %m", to);
+ return 0;
+}
- /* ... hence let's only make the bind mount read-only, not the
- * superblock. */
- if (read_only) {
- if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
- return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
- }
- return 1;
+/* Builds the lower directory list for an overlay mount option string: the
+ * entries of lower in reverse order, each escaped for "," and ":", joined
+ * with ":". Returns a newly allocated string, or NULL on failure (which
+ * mount_overlay() treats as out-of-memory). */
+static char *joined_and_escaped_lower_dirs(char * const *lower) {
+        _cleanup_strv_free_ char **dirs = NULL;
+
+        dirs = strv_copy(lower);
+        if (!dirs)
+                return NULL;
+
+        /* Reverse and escape the private copy only, the list passed in is not modified */
+        strv_reverse(dirs);
+
+        return strv_shell_escape(dirs, ",:") ? strv_join(dirs, ":") : NULL;
}
-static int mount_cgroup(const char *dest) {
- _cleanup_set_free_free_ Set *controllers = NULL;
- _cleanup_free_ char *own_cgroup_path = NULL;
- const char *cgroup_root, *systemd_root, *systemd_own;
+static int mount_overlay(const char *dest, CustomMount *m) { /*
+ * Mounts an overlay file system on m->destination below the root directory
+ * dest. Writable mounts use m->source as upperdir= and m->work_dir as
+ * workdir=; read-only mounts have no upper directory and list m->source
+ * first in lowerdir= instead. Returns 0 on success, a negative errno-style
+ * value on failure.
+ */
+ _cleanup_free_ char *lower = NULL;
+ const char *where, *options;
int r;
- controllers = set_new(&string_hash_ops);
- if (!controllers)
- return log_oom();
+ assert(dest);
+ assert(m);
- r = cg_kernel_controllers(controllers);
- if (r < 0)
- return log_error_errno(r, "Failed to determine cgroup controllers: %m");
+ where = prefix_roota(dest, m->destination);
- r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
- if (r < 0)
- return log_error_errno(r, "Failed to determine our own cgroup path: %m");
+ r = mkdir_label(where, 0755);
+ if (r < 0 && r != -EEXIST)
+ return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
- cgroup_root = strjoina(dest, "/sys/fs/cgroup");
- if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
- return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
+ (void) mkdir_p_label(m->source, 0755); /* result deliberately ignored */
- for (;;) {
- _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
+ lower = joined_and_escaped_lower_dirs(m->lower);
+ if (!lower)
+ return log_oom();
- controller = set_steal_first(controllers);
- if (!controller)
- break;
+ if (m->read_only) {
+ _cleanup_free_ char *escaped_source = NULL;
- origin = strappend("/sys/fs/cgroup/", controller);
- if (!origin)
+ escaped_source = shell_escape(m->source, ",:");
+ if (!escaped_source)
return log_oom();
- r = readlink_malloc(origin, &combined);
- if (r == -EINVAL) {
- /* Not a symbolic link, but directly a single cgroup hierarchy */
+ options = strjoina("lowerdir=", escaped_source, ":", lower); /* no upperdir=/workdir= in the read-only case */
+ } else {
+ _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
- r = mount_cgroup_hierarchy(dest, controller, controller, true);
- if (r < 0)
- return r;
+ assert(m->work_dir); /* custom_mounts_prepare() assigns one to writable overlay mounts */
+ (void) mkdir_label(m->work_dir, 0700); /* result deliberately ignored */
- } else if (r < 0)
+ escaped_source = shell_escape(m->source, ",:");
+ if (!escaped_source)
+ return log_oom();
+ escaped_work_dir = shell_escape(m->work_dir, ",:");
+ if (!escaped_work_dir)
+ return log_oom();
+
+ options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
+ }
+
+ if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
+ return log_error_errno(errno, "overlay mount to %s failed: %m", where);
+
+ return 0;
+}
+
+/* Applies all entries of arg_custom_mounts, in array order, below the root
+ * directory dest. Stops at the first failing mount and returns its negative
+ * result; returns 0 once every mount succeeded. */
+static int mount_custom(const char *dest) {
+        unsigned idx = 0;
+
+        assert(dest);
+
+        while (idx < arg_n_custom_mounts) {
+                CustomMount *cm = &arg_custom_mounts[idx++];
+                int q;
+
+                /* Dispatch to the handler matching the mount type */
+                switch (cm->type) {
+
+                case CUSTOM_MOUNT_BIND:
+                        q = mount_bind(dest, cm);
+                        break;
+
+                case CUSTOM_MOUNT_TMPFS:
+                        q = mount_tmpfs(dest, cm);
+                        break;
+
+                case CUSTOM_MOUNT_OVERLAY:
+                        q = mount_overlay(dest, cm);
+                        break;
+
+                default:
+                        assert_not_reached("Unknown custom mount type");
+                }
+
+                if (q < 0)
+                        return q;
+        }
+
+        return 0;
+}
+
+/* Mounts a cgroup file system with the mount option string controller on
+ * <dest>/sys/fs/cgroup/<hierarchy>, unless that path is a mount point
+ * already. If read_only is set the new mount is remounted read-only.
+ * Returns 1 if a mount was established, 0 if the path was a mount point
+ * already, and a negative errno-style value on failure. */
+static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
+        char *target;
+        int mounted;
+
+        target = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
+
+        mounted = path_is_mount_point(target, 0);
+        if (mounted > 0)
+                return 0;
+        /* A missing directory is not an error here, it is created right below */
+        if (mounted < 0 && mounted != -ENOENT)
+                return log_error_errno(mounted, "Failed to determine if %s is mounted already: %m", target);
+
+        mkdir_p(target, 0755);
+
+        /* The superblock mount options of the mount point need to be
+         * identical to the hosts', and hence writable... */
+        if (mount("cgroup", target, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
+                return log_error_errno(errno, "Failed to mount to %s: %m", target);
+
+        /* ... hence let's only make the bind mount read-only, not the
+         * superblock. */
+        if (read_only &&
+            mount(NULL, target, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
+                return log_error_errno(errno, "Failed to remount %s read-only: %m", target);
+
+        return 1;
+}
+
+/* Sets up the legacy cgroup tree below <dest>/sys/fs/cgroup: mounts a tmpfs
+ * there if nothing is mounted yet, mounts the kernel's controller
+ * hierarchies read-only (skipped if cg_unified() reports the unified
+ * hierarchy), mounts the name=systemd hierarchy, and finally remounts the
+ * tmpfs read-only. Returns 0 on success, a negative errno-style value on
+ * failure. */
+static int mount_legacy_cgroups(const char *dest) {
+ _cleanup_set_free_free_ Set *controllers = NULL;
+ const char *cgroup_root;
+ int r;
+
+ cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
+
+ /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
+ r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
+ if (r == 0) {
+ _cleanup_free_ char *options = NULL;
+
+ r = tmpfs_patch_options("mode=755", &options);
+ if (r < 0)
+ return log_oom();
+
+ /* tmpfs_patch_options() leaves options NULL if nothing had to be patched
+ * in; fall back to the plain string then, so that mode=755 is not lost. */
+ if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, r > 0 ? options : "mode=755") < 0)
+ return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
+ }
+
+ if (cg_unified() > 0)
+ goto skip_controllers;
+
+ controllers = set_new(&string_hash_ops);
+ if (!controllers)
+ return log_oom();
+
+ r = cg_kernel_controllers(controllers);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine cgroup controllers: %m");
+
+ for (;;) {
+ _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
+
+ controller = set_steal_first(controllers);
+ if (!controller)
+ break;
+
+ origin = prefix_root("/sys/fs/cgroup/", controller);
+ if (!origin)
+ return log_oom();
+
+ r = readlink_malloc(origin, &combined);
+ if (r == -EINVAL) {
+ /* Not a symbolic link, but directly a single cgroup hierarchy */
+
+ r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
+ if (r < 0)
+ return r;
+
+ } else if (r < 0)
return log_error_errno(r, "Failed to read link %s: %m", origin);
else {
_cleanup_free_ char *target = NULL;
- target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
+ target = prefix_root(dest, origin);
if (!target)
return log_oom();
continue;
}
- r = mount_cgroup_hierarchy(dest, combined, combined, true);
+ r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
if (r < 0)
return r;
- if (symlink(combined, target) < 0)
- return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
+ r = symlink_idempotent(combined, target);
+ if (r == -EINVAL) {
+ log_error("Invalid existing symlink for combined hierarchy");
+ return r;
+ }
+ if (r < 0)
+ return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
}
}
- r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
+skip_controllers:
+ r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
if (r < 0)
return r;
+ if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
+ return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
+
+ return 0;
+}
+
+/* Makes sure the unified cgroup hierarchy is available at
+ * <dest>/sys/fs/cgroup. If that path is a mount point already, it is accepted
+ * only if it contains a cgroup.procs file; otherwise the hierarchy is
+ * mounted there. Returns 0 on success, a negative errno-style value on
+ * failure. */
+static int mount_unified_cgroups(const char *dest) {
+        const char *p, *procs;
+        int r;
+
+        assert(dest);
+
+        p = strjoina(dest, "/sys/fs/cgroup");
+
+        r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
+        if (r > 0) {
+                /* Probe via a separate variable, so that the messages below name
+                 * the mount point itself and not the cgroup.procs file in it. */
+                procs = strjoina(dest, "/sys/fs/cgroup/cgroup.procs");
+                if (access(procs, F_OK) >= 0)
+                        return 0;
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
+
+                log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
+                return -EINVAL;
+        }
+
+        if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0)
+                return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
+
+        return 0;
+}
+
+/* Mounts the cgroup tree below dest, picking the unified or the legacy
+ * layout according to arg_unified_cgroup_hierarchy. Returns the result of
+ * the respective helper. */
+static int mount_cgroups(const char *dest) {
+        return arg_unified_cgroup_hierarchy
+                ? mount_unified_cgroups(dest)
+                : mount_legacy_cgroups(dest);
+}
+
+static int mount_systemd_cgroup_writable(const char *dest) { /*
+ * Turns the directory of our own cgroup below dest into a bind mount of
+ * itself and then remounts the root of the hierarchy it lives in read-only.
+ * Which hierarchy that is depends on arg_unified_cgroup_hierarchy. Does
+ * nothing if our cgroup is the root cgroup. Returns 0 on success, a negative
+ * errno-style value on failure.
+ */
+ _cleanup_free_ char *own_cgroup_path = NULL;
+ const char *systemd_root, *systemd_own;
+ int r;
+
+ assert(dest);
+
+ r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine our own cgroup path: %m");
+
+ /* If we are living in the top-level, then there's nothing to do... */
+ if (path_equal(own_cgroup_path, "/"))
+ return 0;
+
+ if (arg_unified_cgroup_hierarchy) { /* unified: our cgroup lives directly below /sys/fs/cgroup */
+ systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
+ systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
+ } else { /* legacy: our cgroup lives in the "systemd" hierarchy */
+ systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
+ systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
+ }
+
/* Make our own cgroup a (writable) bind mount */
- systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
/* And then remount the systemd cgroup root read-only */
- systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
- if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
- return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
-
return 0;
}
-static int mount_tmpfs(const char *dest) {
- char **i, **o;
+static int userns_lchown(const char *p, uid_t uid, gid_t gid) { /*
+ * Changes the ownership of p via lchown() to uid and gid, each shifted by
+ * arg_uid_shift. Does nothing if user namespacing is off or if both IDs are
+ * invalid; an ID that is invalid is passed to lchown() unshifted. Returns 0
+ * on success or if there was nothing to do, -EOVERFLOW if a shifted ID
+ * wraps around or falls outside of the range of arg_uid_range IDs starting
+ * at arg_uid_shift, and -errno if lchown() fails.
+ */
+ assert(p);
- STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
- _cleanup_free_ char *where = NULL;
- int r;
+ if (!arg_userns)
+ return 0;
- where = strappend(dest, *i);
- if (!where)
- return log_oom();
+ if (uid == UID_INVALID && gid == GID_INVALID)
+ return 0;
- r = mkdir_label(where, 0755);
- if (r < 0 && r != -EEXIST)
- return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
+ if (uid != UID_INVALID) {
+ uid += arg_uid_shift;
+
+ if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range) /* wrapped around, or beyond the assigned range */
+ return -EOVERFLOW;
+ }
- if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
- return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
+ if (gid != GID_INVALID) {
+ gid += (gid_t) arg_uid_shift; /* the same shift and range apply to group IDs */
+
+ if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
+ return -EOVERFLOW;
}
+ if (lchown(p, uid, gid) < 0)
+ return -errno;
+
return 0;
}
+/* Creates the directory path below root with the given mode and, if it was
+ * newly created, hands it to userns_lchown() with uid and gid. An already
+ * existing directory is left alone and counts as success. Returns 0 on
+ * success, a negative errno-style value on failure. */
+static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
+        const char *full;
+
+        full = prefix_roota(root, path);
+
+        if (mkdir(full, mode) >= 0)
+                return userns_lchown(full, uid, gid);
+
+        /* Ownership of a directory that was there before is not touched */
+        return errno == EEXIST ? 0 : -errno;
+}
+
static int setup_timezone(const char *dest) {
- _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
+ _cleanup_free_ char *p = NULL, *q = NULL;
+ const char *where, *check, *what;
char *z, *y;
int r;
return 0;
}
- where = strappend(dest, "/etc/localtime");
- if (!where)
- return log_oom();
-
+ where = prefix_roota(dest, "/etc/localtime");
r = readlink_malloc(where, &q);
if (r >= 0) {
y = path_startswith(q, "../usr/share/zoneinfo/");
return 0;
}
- check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
- if (!check)
- return log_oom();
-
- if (access(check, F_OK) < 0) {
+ check = strjoina("/usr/share/zoneinfo/", z);
+ check = prefix_root(dest, check);
+ if (laccess(check, F_OK) < 0) {
log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
return 0;
}
- what = strappend("../usr/share/zoneinfo/", z);
- if (!what)
- return log_oom();
-
- r = mkdir_parents(where, 0755);
- if (r < 0) {
- log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
-
- return 0;
- }
-
r = unlink(where);
if (r < 0 && errno != ENOENT) {
log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
-
return 0;
}
+ what = strjoina("../usr/share/zoneinfo/", z);
if (symlink(what, where) < 0) {
log_error_errno(errno, "Failed to correct timezone of container: %m");
return 0;
}
+ r = userns_lchown(where, 0, 0);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
+
return 0;
}
static int setup_resolv_conf(const char *dest) {
- _cleanup_free_ char *where = NULL;
+ const char *where = NULL;
int r;
assert(dest);
return 0;
/* Fix resolv.conf, if possible */
- where = strappend(dest, "/etc/resolv.conf");
- if (!where)
- return log_oom();
-
- /* We don't really care for the results of this really. If it
- * fails, it fails, but meh... */
- r = mkdir_parents(where, 0755);
- if (r < 0) {
- log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
-
- return 0;
- }
+ where = prefix_roota(dest, "/etc/resolv.conf");
r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
if (r < 0) {
- log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
-
+ /* If the file already exists as symlink, let's
+ * suppress the warning, under the assumption that
+ * resolved or something similar runs inside and the
+ * symlink points there.
+ *
+ * If the disk image is read-only, there's also no
+ * point in complaining.
+ */
+ log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to copy /etc/resolv.conf to %s: %m", where);
return 0;
}
+ r = userns_lchown(where, 0, 0);
+ if (r < 0)
+ log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
+
return 0;
}
static int setup_volatile_state(const char *directory) {
- const char *p;
+ _cleanup_free_ char *buf = NULL;
+ const char *p, *options;
int r;
assert(directory);
if (r < 0)
return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
- p = strjoina(directory, "/var");
+ p = prefix_roota(directory, "/var");
r = mkdir(p, 0755);
if (r < 0 && errno != EEXIST)
return log_error_errno(errno, "Failed to create %s: %m", directory);
- if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
+ options = "mode=755";
+ r = tmpfs_patch_options(options, &buf);
+ if (r < 0)
+ return log_oom();
+ if (r > 0)
+ options = buf;
+
+ if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
return 0;
static int setup_volatile(const char *directory) {
bool tmpfs_mounted = false, bind_mounted = false;
char template[] = "/tmp/nspawn-volatile-XXXXXX";
- const char *f, *t;
+ _cleanup_free_ char *buf = NULL;
+ const char *f, *t, *options;
int r;
assert(directory);
if (!mkdtemp(template))
return log_error_errno(errno, "Failed to create temporary directory: %m");
- if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
- log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
- r = -errno;
+ options = "mode=755";
+ r = tmpfs_patch_options(options, &buf);
+ if (r < 0)
+ return log_oom();
+ if (r > 0)
+ options = buf;
+
+ if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
+ r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
goto fail;
}
tmpfs_mounted = true;
- f = strjoina(directory, "/usr");
- t = strjoina(template, "/usr");
+ f = prefix_roota(directory, "/usr");
+ t = prefix_roota(template, "/usr");
r = mkdir(t, 0755);
if (r < 0 && errno != EEXIST) {
- log_error_errno(errno, "Failed to create %s: %m", t);
- r = -errno;
+ r = log_error_errno(errno, "Failed to create %s: %m", t);
goto fail;
}
if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
- log_error_errno(errno, "Failed to create /usr bind mount: %m");
- r = -errno;
+ r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
goto fail;
}
}
if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
- log_error_errno(errno, "Failed to move root mount: %m");
- r = -errno;
+ r = log_error_errno(errno, "Failed to move root mount: %m");
goto fail;
}
- rmdir(template);
+ (void) rmdir(template);
return 0;
fail:
if (bind_mounted)
- umount(t);
+ (void) umount(t);
+
if (tmpfs_mounted)
- umount(template);
- rmdir(template);
+ (void) umount(template);
+ (void) rmdir(template);
return r;
}
static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
+ assert(s);
snprintf(s, 37,
"%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
}
static int setup_boot_id(const char *dest) {
- _cleanup_free_ char *from = NULL, *to = NULL;
+ const char *from, *to;
sd_id128_t rnd = {};
char as_uuid[37];
int r;
- assert(dest);
-
if (arg_share_system)
return 0;
/* Generate a new randomized boot ID, so that each boot-up of
* the container gets a new one */
- from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
- to = strappend(dest, "/proc/sys/kernel/random/boot_id");
- if (!from || !to)
- return log_oom();
+ from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
+ to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
r = sd_id128_randomize(&rnd);
if (r < 0)
id128_format_as_uuid(rnd, as_uuid);
- r = write_string_file(from, as_uuid);
+ r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
if (r < 0)
return log_error_errno(r, "Failed to write boot id: %m");
- if (mount(from, to, NULL, MS_BIND, NULL) < 0) {
- log_error_errno(errno, "Failed to bind mount boot id: %m");
- r = -errno;
- } else if (mount(from, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
+ if (mount(from, to, NULL, MS_BIND, NULL) < 0)
+ r = log_error_errno(errno, "Failed to bind mount boot id: %m");
+ else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
log_warning_errno(errno, "Failed to make boot id read-only: %m");
unlink(from);
u = umask(0000);
+ /* Create /dev/net, so that we can create /dev/net/tun in it */
+ if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
+ return log_error_errno(r, "Failed to create /dev/net directory: %m");
+
NULSTR_FOREACH(d, devnodes) {
_cleanup_free_ char *from = NULL, *to = NULL;
struct stat st;
from = strappend("/dev/", d);
- to = strjoin(dest, "/dev/", d, NULL);
- if (!from || !to)
- return log_oom();
+ to = prefix_root(dest, from);
if (stat(from, &st) < 0) {
} else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
- log_error("%s is not a char or block device, cannot copy", from);
+ log_error("%s is not a char or block device, cannot copy.", from);
return -EIO;
} else {
- r = mkdir_parents(to, 0775);
- if (r < 0) {
- log_error_errno(r, "Failed to create parent directory of %s: %m", to);
- return -r;
- }
-
if (mknod(to, st.st_mode, st.st_rdev) < 0) {
if (errno != EPERM)
return log_error_errno(errno, "mknod(%s) failed: %m", to);
return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
}
- if (arg_userns && arg_uid_shift != UID_INVALID)
- if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
- return log_error_errno(errno, "chown() of device node %s failed: %m", to);
+ r = userns_lchown(to, 0, 0);
+ if (r < 0)
+ return log_error_errno(r, "chown() of device node %s failed: %m", to);
}
}
return r;
}
-static int setup_ptmx(const char *dest) {
- _cleanup_free_ char *p = NULL;
+/* Set up the container's pseudo-terminal file system below dest:
+ *
+ * 1. build the devpts mount options (new instance, ptmxmode=0666, mode=620,
+ *    gid = arg_uid_shift + TTY_GID, plus the SELinux context if one is
+ *    configured and HAVE_SELINUX is defined),
+ * 2. create and mount <dest>/dev/pts,
+ * 3. create the <dest>/dev/ptmx -> pts/ptmx symlink,
+ * 4. pass /dev/pts, /dev/ptmx and /dev/pts/ptmx to userns_lchown().
+ *
+ * Returns 0 on success, otherwise the value produced by log_oom() or
+ * log_error_errno() for the step that failed. */
+static int setup_pts(const char *dest) {
+ _cleanup_free_ char *options = NULL;
+ const char *p;
+ int r;
+
+#ifdef HAVE_SELINUX
+ if (arg_selinux_apifs_context)
+ (void) asprintf(&options,
+ "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
+ arg_uid_shift + TTY_GID,
+ arg_selinux_apifs_context);
+ else
+#endif
+ (void) asprintf(&options,
+ "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
+ arg_uid_shift + TTY_GID);
- p = strappend(dest, "/dev/ptmx");
- if (!p)
+ if (!options)
return log_oom();
+ /* Mount /dev/pts itself */
+ p = prefix_roota(dest, "/dev/pts");
+ if (mkdir(p, 0755) < 0)
+ return log_error_errno(errno, "Failed to create /dev/pts: %m");
+ if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
+ return log_error_errno(errno, "Failed to mount /dev/pts: %m");
+ /* userns_lchown() reports failures through its return value and may
+ * fail with -EOVERFLOW without setting errno, hence log r, not errno. */
+ r = userns_lchown(p, 0, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to chown /dev/pts: %m");
+
+ /* Create /dev/ptmx symlink */
+ p = prefix_roota(dest, "/dev/ptmx");
if (symlink("pts/ptmx", p) < 0)
return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
+ r = userns_lchown(p, 0, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
- if (arg_userns && arg_uid_shift != UID_INVALID)
- if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
- return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
+ /* And fix /dev/pts/ptmx ownership */
+ p = prefix_roota(dest, "/dev/pts/ptmx");
+ r = userns_lchown(p, 0, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
return 0;
}
u = umask(0000);
- r = chmod_and_chown(console, 0600, 0, 0);
+ r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
if (r < 0)
return log_error_errno(r, "Failed to correct access mode for TTY: %m");
* ptys can only exist on pts file systems. To have something
* to bind mount things on we create a empty regular file. */
- to = strjoina(dest, "/dev/console");
+ to = prefix_roota(dest, "/dev/console");
r = touch(to);
if (r < 0)
return log_error_errno(r, "touch() for /dev/console failed: %m");
}
static int setup_kmsg(const char *dest, int kmsg_socket) {
- _cleanup_free_ char *from = NULL, *to = NULL;
+ const char *from, *to;
_cleanup_umask_ mode_t u;
- int r, fd, k;
+ int fd, k;
union {
struct cmsghdr cmsghdr;
uint8_t buf[CMSG_SPACE(sizeof(int))];
};
struct cmsghdr *cmsg;
- assert(dest);
assert(kmsg_socket >= 0);
u = umask(0000);
- /* We create the kmsg FIFO as /dev/kmsg, but immediately
+ /* We create the kmsg FIFO as /run/kmsg, but immediately
* delete it after bind mounting it to /proc/kmsg. While FIFOs
* on the reading side behave very similar to /proc/kmsg,
* their writing side behaves differently from /dev/kmsg in
* that writing blocks when nothing is reading. In order to
* avoid any problems with containers deadlocking due to this
* we simply make /dev/kmsg unavailable to the container. */
- if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
- asprintf(&to, "%s/proc/kmsg", dest) < 0)
- return log_oom();
+ from = prefix_roota(dest, "/run/kmsg");
+ to = prefix_roota(dest, "/proc/kmsg");
if (mkfifo(from, 0600) < 0)
- return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
-
- r = chmod_and_chown(from, 0600, 0, 0);
- if (r < 0)
- return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
-
+ return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
if (mount(from, to, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
if (k < 0)
return log_error_errno(errno, "Failed to send FIFO fd: %m");
- /* And now make the FIFO unavailable as /dev/kmsg... */
- unlink(from);
+ /* And now make the FIFO unavailable as /run/kmsg... */
+ (void) unlink(from);
+
return 0;
}
fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
if (fd < 0)
- return log_error_errno(errno, "failed to allocate container netlink: %m");
+ return log_error_errno(errno, "Failed to allocate container netlink: %m");
cmsg = CMSG_FIRSTHDR(&mh);
cmsg->cmsg_level = SOL_SOCKET;
return 0;
}
-static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
+static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
_cleanup_free_ struct local_address *addresses = NULL;
_cleanup_free_ char *pretty = NULL;
union in_addr_union new_exposed;
return 0;
}
-static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
+static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
union in_addr_union *exposed = userdata;
assert(rtnl);
return 0;
}
-static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
+static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
union {
struct cmsghdr cmsghdr;
uint8_t buf[CMSG_SPACE(sizeof(int))];
.msg_controllen = sizeof(control),
};
struct cmsghdr *cmsg;
- _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
+ _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
int fd, r;
ssize_t k;
assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
- r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
+ r = sd_netlink_open_fd(&rtnl, fd);
if (r < 0) {
safe_close(fd);
return log_error_errno(r, "Failed to create rtnl object: %m");
}
- r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
+ r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
if (r < 0)
return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
- r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
+ r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
if (r < 0)
return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
- r = sd_rtnl_attach_event(rtnl, event, 0);
+ r = sd_netlink_attach_event(rtnl, event, 0);
if (r < 0)
return log_error_errno(r, "Failed to add to even loop: %m");
static int setup_journal(const char *directory) {
sd_id128_t machine_id, this_id;
- _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
+ _cleanup_free_ char *b = NULL, *d = NULL;
+ const char *etc_machine_id, *p, *q;
char *id;
int r;
if (arg_ephemeral)
return 0;
- p = strappend(directory, "/etc/machine-id");
- if (!p)
- return log_oom();
+ etc_machine_id = prefix_roota(directory, "/etc/machine-id");
- r = read_one_line_file(p, &b);
+ r = read_one_line_file(etc_machine_id, &b);
if (r == -ENOENT && arg_link_journal == LINK_AUTO)
return 0;
else if (r < 0)
- return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
+ return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
id = strstrip(b);
if (isempty(id) && arg_link_journal == LINK_AUTO)
/* Verify validity */
r = sd_id128_from_string(id, &machine_id);
if (r < 0)
- return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
+ return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
r = sd_id128_get_machine(&this_id);
if (r < 0)
if (arg_link_journal == LINK_NO)
return 0;
- free(p);
- p = strappend("/var/log/journal/", id);
- q = strjoin(directory, "/var/log/journal/", id, NULL);
- if (!p || !q)
- return log_oom();
+ r = userns_mkdir(directory, "/var", 0755, 0, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create /var: %m");
+
+ r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create /var/log: %m");
- if (path_is_mount_point(p, false) > 0) {
+ r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create /var/log/journal: %m");
+
+ p = strjoina("/var/log/journal/", id);
+ q = prefix_roota(directory, p);
+
+ if (path_is_mount_point(p, 0) > 0) {
if (arg_link_journal != LINK_AUTO) {
log_error("%s: already a mount point, refusing to use for journal", p);
return -EEXIST;
return 0;
}
- if (path_is_mount_point(q, false) > 0) {
+ if (path_is_mount_point(q, 0) > 0) {
if (arg_link_journal != LINK_AUTO) {
log_error("%s: already a mount point, refusing to use for journal", q);
return -EEXIST;
arg_link_journal == LINK_AUTO) &&
path_equal(d, q)) {
- r = mkdir_p(q, 0755);
+ r = userns_mkdir(directory, p, 0755, 0, 0);
if (r < 0)
log_warning_errno(errno, "Failed to create directory %s: %m", q);
return 0;
}
}
- r = mkdir_p(q, 0755);
+ r = userns_mkdir(directory, p, 0755, 0, 0);
if (r < 0)
log_warning_errno(errno, "Failed to create directory %s: %m", q);
return 0;
if (dir_is_empty(q) == 0)
log_warning("%s is not empty, proceeding anyway.", q);
- r = mkdir_p(q, 0755);
+ r = userns_mkdir(directory, p, 0755, 0, 0);
if (r < 0) {
log_error_errno(errno, "Failed to create %s: %m", q);
return r;
static int register_machine(pid_t pid, int local_ifindex) {
_cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
- _cleanup_bus_close_unref_ sd_bus *bus = NULL;
+ _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
int r;
if (!arg_register)
} else {
_cleanup_bus_message_unref_ sd_bus_message *m = NULL;
char **i;
+ unsigned j;
r = sd_bus_message_new_method_call(
bus,
if (r < 0)
return bus_log_create_error(r);
+ /* If you make changes here, also make sure to update
+ * systemd-nspawn@.service, to keep the device
+ * policies in sync regardless if we are run with or
+ * without the --keep-unit switch. */
r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
/* Allow the container to
* access and create the API
"/dev/pts/ptmx", "rw",
"char-pts", "rw");
if (r < 0)
- return log_error_errno(r, "Failed to add device whitelist: %m");
+ return bus_log_create_error(r);
+
+ for (j = 0; j < arg_n_custom_mounts; j++) {
+ CustomMount *cm = &arg_custom_mounts[j];
+
+ if (cm->type != CUSTOM_MOUNT_BIND)
+ continue;
+
+ r = is_device_node(cm->source);
+ if (r < 0)
+ return log_error_errno(r, "Failed to stat %s: %m", cm->source);
+
+ if (r) {
+ r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
+ cm->source, cm->read_only ? "r" : "rw");
+ if (r < 0)
+ return log_error_errno(r, "Failed to append message arguments: %m");
+ }
+ }
+
+ if (arg_kill_signal != 0) {
+ r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
+ if (r < 0)
+ return bus_log_create_error(r);
+ }
STRV_FOREACH(i, arg_property) {
r = sd_bus_message_open_container(m, 'r', "sv");
static int terminate_machine(pid_t pid) {
_cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
_cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
- _cleanup_bus_close_unref_ sd_bus *bus = NULL;
+ _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
const char *path;
int r;
if (!arg_register)
return 0;
+ /* If we are reusing the unit, then just exit, systemd will do
+ * the right thing when we exit. */
+ if (arg_keep_unit)
+ return 0;
+
r = sd_bus_default_system(&bus);
if (r < 0)
return log_error_errno(r, "Failed to open system bus: %m");
if (streq(p, "4294967295"))
return 0;
- r = write_string_file("/proc/self/loginuid", "4294967295");
+ r = write_string_file("/proc/self/loginuid", "4294967295", 0);
if (r < 0) {
- log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
- "old and you have audit enabled. Note that the auditing subsystem is known to\n"
- "be incompatible with containers on old kernels. Please make sure to upgrade\n"
- "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
- "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
+ log_error_errno(r,
+ "Failed to reset audit login UID. This probably means that your kernel is too\n"
+ "old and you have audit enabled. Note that the auditing subsystem is known to\n"
+ "be incompatible with containers on old kernels. Please make sure to upgrade\n"
+ "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
+ "using systemd-nspawn. Sleeping for 5s... (%m)");
sleep(5);
}
}
static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
- _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
- _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
+ _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
+ _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
struct ether_addr mac_host, mac_container;
int r, i;
if (r < 0)
return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
- r = sd_rtnl_open(&rtnl, 0);
+ r = sd_netlink_open(&rtnl);
if (r < 0)
return log_error_errno(r, "Failed to connect to netlink: %m");
if (r < 0)
return log_error_errno(r, "Failed to allocate netlink message: %m");
- r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
+ r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
if (r < 0)
return log_error_errno(r, "Failed to add netlink interface name: %m");
- r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
+ r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
if (r < 0)
return log_error_errno(r, "Failed to add netlink MAC address: %m");
- r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
+ r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
if (r < 0)
return log_error_errno(r, "Failed to open netlink container: %m");
- r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
+ r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
if (r < 0)
return log_error_errno(r, "Failed to open netlink container: %m");
- r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
+ r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
if (r < 0)
return log_error_errno(r, "Failed to open netlink container: %m");
- r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
+ r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
if (r < 0)
return log_error_errno(r, "Failed to add netlink interface name: %m");
- r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
+ r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
if (r < 0)
return log_error_errno(r, "Failed to add netlink MAC address: %m");
- r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
+ r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
if (r < 0)
return log_error_errno(r, "Failed to add netlink namespace field: %m");
- r = sd_rtnl_message_close_container(m);
+ r = sd_netlink_message_close_container(m);
if (r < 0)
return log_error_errno(r, "Failed to close netlink container: %m");
- r = sd_rtnl_message_close_container(m);
+ r = sd_netlink_message_close_container(m);
if (r < 0)
return log_error_errno(r, "Failed to close netlink container: %m");
- r = sd_rtnl_message_close_container(m);
+ r = sd_netlink_message_close_container(m);
if (r < 0)
return log_error_errno(r, "Failed to close netlink container: %m");
- r = sd_rtnl_call(rtnl, m, 0, NULL);
+ r = sd_netlink_call(rtnl, m, 0, NULL);
if (r < 0)
- return log_error_errno(r, "Failed to add new veth interfaces: %m");
+ return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
i = (int) if_nametoindex(iface_name);
if (i <= 0)
}
static int setup_bridge(const char veth_name[], int *ifi) {
- _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
- _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
+ _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
+ _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
int r, bridge;
if (!arg_private_network)
*ifi = bridge;
- r = sd_rtnl_open(&rtnl, 0);
+ r = sd_netlink_open(&rtnl);
if (r < 0)
return log_error_errno(r, "Failed to connect to netlink: %m");
if (r < 0)
return log_error_errno(r, "Failed to set IFF_UP flag: %m");
- r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
+ r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
if (r < 0)
return log_error_errno(r, "Failed to add netlink interface name field: %m");
- r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
+ r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
if (r < 0)
return log_error_errno(r, "Failed to add netlink master field: %m");
- r = sd_rtnl_call(rtnl, m, 0, NULL);
+ r = sd_netlink_call(rtnl, m, 0, NULL);
if (r < 0)
return log_error_errno(r, "Failed to add veth interface to bridge: %m");
static int move_network_interfaces(pid_t pid) {
_cleanup_udev_unref_ struct udev *udev = NULL;
- _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
+ _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
char **i;
int r;
if (strv_isempty(arg_network_interfaces))
return 0;
- r = sd_rtnl_open(&rtnl, 0);
+ r = sd_netlink_open(&rtnl);
if (r < 0)
return log_error_errno(r, "Failed to connect to netlink: %m");
}
STRV_FOREACH(i, arg_network_interfaces) {
- _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
+ _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
int ifi;
ifi = parse_interface(udev, *i);
if (r < 0)
return log_error_errno(r, "Failed to allocate netlink message: %m");
- r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
+ r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
if (r < 0)
return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
- r = sd_rtnl_call(rtnl, m, 0, NULL);
+ r = sd_netlink_call(rtnl, m, 0, NULL);
if (r < 0)
return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
}
static int setup_macvlan(pid_t pid) {
_cleanup_udev_unref_ struct udev *udev = NULL;
- _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
+ _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
unsigned idx = 0;
char **i;
int r;
if (strv_isempty(arg_network_macvlan))
return 0;
- r = sd_rtnl_open(&rtnl, 0);
+ r = sd_netlink_open(&rtnl);
if (r < 0)
return log_error_errno(r, "Failed to connect to netlink: %m");
}
STRV_FOREACH(i, arg_network_macvlan) {
- _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
+ _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
_cleanup_free_ char *n = NULL;
struct ether_addr mac;
int ifi;
if (r < 0)
return log_error_errno(r, "Failed to allocate netlink message: %m");
- r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
+ r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
if (r < 0)
return log_error_errno(r, "Failed to add netlink interface index: %m");
strshorten(n, IFNAMSIZ-1);
- r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
+ r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
if (r < 0)
return log_error_errno(r, "Failed to add netlink interface name: %m");
- r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
+ r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
if (r < 0)
return log_error_errno(r, "Failed to add netlink MAC address: %m");
- r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
+ r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
if (r < 0)
return log_error_errno(r, "Failed to add netlink namespace field: %m");
- r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
+ r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
if (r < 0)
return log_error_errno(r, "Failed to open netlink container: %m");
- r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
+ r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
if (r < 0)
return log_error_errno(r, "Failed to open netlink container: %m");
- r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
+ r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
if (r < 0)
return log_error_errno(r, "Failed to append macvlan mode: %m");
- r = sd_rtnl_message_close_container(m);
+ r = sd_netlink_message_close_container(m);
if (r < 0)
return log_error_errno(r, "Failed to close netlink container: %m");
- r = sd_rtnl_message_close_container(m);
+ r = sd_netlink_message_close_container(m);
if (r < 0)
return log_error_errno(r, "Failed to close netlink container: %m");
- r = sd_rtnl_call(rtnl, m, 0, NULL);
+ r = sd_netlink_call(rtnl, m, 0, NULL);
if (r < 0)
return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
}
static int setup_ipvlan(pid_t pid) {
_cleanup_udev_unref_ struct udev *udev = NULL;
- _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
+ _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
char **i;
int r;
if (strv_isempty(arg_network_ipvlan))
return 0;
- r = sd_rtnl_open(&rtnl, 0);
+ r = sd_netlink_open(&rtnl);
if (r < 0)
return log_error_errno(r, "Failed to connect to netlink: %m");
}
STRV_FOREACH(i, arg_network_ipvlan) {
- _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
+ _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
_cleanup_free_ char *n = NULL;
int ifi;
if (r < 0)
return log_error_errno(r, "Failed to allocate netlink message: %m");
- r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
+ r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
if (r < 0)
return log_error_errno(r, "Failed to add netlink interface index: %m");
strshorten(n, IFNAMSIZ-1);
- r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
+ r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
if (r < 0)
return log_error_errno(r, "Failed to add netlink interface name: %m");
- r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
+ r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
if (r < 0)
return log_error_errno(r, "Failed to add netlink namespace field: %m");
- r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
+ r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
if (r < 0)
return log_error_errno(r, "Failed to open netlink container: %m");
- r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
+ r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
if (r < 0)
return log_error_errno(r, "Failed to open netlink container: %m");
- r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
+ r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
if (r < 0)
return log_error_errno(r, "Failed to add ipvlan mode: %m");
- r = sd_rtnl_message_close_container(m);
+ r = sd_netlink_message_close_container(m);
if (r < 0)
return log_error_errno(r, "Failed to close netlink container: %m");
- r = sd_rtnl_message_close_container(m);
+ r = sd_netlink_message_close_container(m);
if (r < 0)
return log_error_errno(r, "Failed to close netlink container: %m");
- r = sd_rtnl_call(rtnl, m, 0, NULL);
+ r = sd_netlink_call(rtnl, m, 0, NULL);
if (r < 0)
return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
}
uint64_t capability;
int syscall_num;
} blacklist[] = {
- { CAP_SYS_RAWIO, SCMP_SYS(iopl)},
- { CAP_SYS_RAWIO, SCMP_SYS(ioperm)},
- { CAP_SYS_BOOT, SCMP_SYS(kexec_load)},
- { CAP_SYS_ADMIN, SCMP_SYS(swapon)},
- { CAP_SYS_ADMIN, SCMP_SYS(swapoff)},
- { CAP_SYS_ADMIN, SCMP_SYS(open_by_handle_at)},
- { CAP_SYS_MODULE, SCMP_SYS(init_module)},
- { CAP_SYS_MODULE, SCMP_SYS(finit_module)},
- { CAP_SYS_MODULE, SCMP_SYS(delete_module)},
+ { CAP_SYS_RAWIO, SCMP_SYS(iopl) },
+ { CAP_SYS_RAWIO, SCMP_SYS(ioperm) },
+ { CAP_SYS_BOOT, SCMP_SYS(kexec_load) },
+ { CAP_SYS_ADMIN, SCMP_SYS(swapon) },
+ { CAP_SYS_ADMIN, SCMP_SYS(swapoff) },
+ { CAP_SYS_ADMIN, SCMP_SYS(open_by_handle_at) },
+ { CAP_SYS_MODULE, SCMP_SYS(init_module) },
+ { CAP_SYS_MODULE, SCMP_SYS(finit_module) },
+ { CAP_SYS_MODULE, SCMP_SYS(delete_module) },
+ { CAP_SYSLOG, SCMP_SYS(syslog) },
};
scmp_filter_ctx seccomp;
}
r = seccomp_load(seccomp);
- if (r < 0)
+ if (r == -EINVAL) {
+ log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
+ r = 0;
+ goto finish;
+ }
+ if (r < 0) {
log_error_errno(r, "Failed to install seccomp audit filter: %m");
+ goto finish;
+ }
finish:
seccomp_release(seccomp);
p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
(void) mkdir_p(p, 0600);
- q = strjoina(root, "/run/systemd/nspawn/incoming");
- mkdir_parents(q, 0755);
- mkdir_p(q, 0600);
+ if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
+ return log_error_errno(errno, "Failed to create /run/systemd: %m");
+ if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
+ return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
+
+ if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
+ return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
+
+ q = prefix_roota(root, "/run/systemd/nspawn/incoming");
if (mount(p, q, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "Failed to install propagation bind mount.");
if (nullfd > 2)
safe_close(nullfd);
- reset_all_signal_handlers();
+ (void) reset_all_signal_handlers();
+ (void) reset_signal_mask();
close_all_fds(NULL, 0);
execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
/* Reset everything fully to 0, just in case */
- if (setgroups(0, NULL) < 0)
- return log_error_errno(errno, "setgroups() failed: %m");
-
- if (setresgid(0, 0, 0) < 0)
- return log_error_errno(errno, "setregid() failed: %m");
-
- if (setresuid(0, 0, 0) < 0)
- return log_error_errno(errno, "setreuid() failed: %m");
+ r = reset_uid_gid();
+ if (r < 0)
+ return log_error_errno(r, "Failed to become root: %m");
*_home = NULL;
return 0;
if (r < 0 && r != -EEXIST)
return log_error_errno(r, "Failed to make home directory: %m");
- fchown(STDIN_FILENO, uid, gid);
- fchown(STDOUT_FILENO, uid, gid);
- fchown(STDERR_FILENO, uid, gid);
+ (void) fchown(STDIN_FILENO, uid, gid);
+ (void) fchown(STDOUT_FILENO, uid, gid);
+ (void) fchown(STDERR_FILENO, uid, gid);
if (setgroups(n_uids, uids) < 0)
return log_error_errno(errno, "Failed to set auxiliary groups: %m");
static int determine_names(void) {
int r;
- if (!arg_image && !arg_directory) {
- if (arg_machine) {
- _cleanup_(image_unrefp) Image *i = NULL;
+ if (arg_template && !arg_directory && arg_machine) {
+
+ /* If --template= was specified then we should not
+ * search for a machine, but instead create a new one
+ * in /var/lib/machine. */
+
+ arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
+ if (!arg_directory)
+ return log_oom();
+ }
+
+ if (!arg_image && !arg_directory) {
+ if (arg_machine) {
+ _cleanup_(image_unrefp) Image *i = NULL;
r = image_find(arg_machine, &i);
if (r < 0)
if (r < 0)
return log_error_errno(r, "Invalid image directory: %m");
- arg_read_only = arg_read_only || i->read_only;
+ if (!arg_ephemeral)
+ arg_read_only = arg_read_only || i->read_only;
} else
arg_directory = get_current_dir_name();
if (!arg_machine)
return log_oom();
- hostname_cleanup(arg_machine, false);
+ hostname_cleanup(arg_machine);
if (!machine_name_is_valid(arg_machine)) {
log_error("Failed to determine machine name automatically, please use -M.");
return -EINVAL;
return 0;
}
-static int determine_uid_shift(void) {
+static int determine_uid_shift(const char *directory) {
int r;
- if (!arg_userns)
+ if (!arg_userns) {
+ arg_uid_shift = 0;
return 0;
+ }
if (arg_uid_shift == UID_INVALID) {
struct stat st;
- r = stat(arg_directory, &st);
+ r = stat(directory, &st);
if (r < 0)
- return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
+ return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
- log_error("UID and GID base of %s don't match.", arg_directory);
+ log_error("UID and GID base of %s don't match.", directory);
return -EINVAL;
}
return 0;
}
+static int inner_child(
+ Barrier *barrier,
+ const char *directory,
+ bool secondary,
+ int kmsg_socket,
+ int rtnl_socket,
+ FDSet *fds,
+ int argc,
+ char *argv[]) {
+ /* Last stage of container startup. Runs in the process that outer_child() clones, finishes the
+  * setup that is done from inside that process, synchronizes with the parent through 'barrier',
+  * assembles the environment block and finally executes the payload: an init binary if arg_boot
+  * is set, otherwise the command line found at argv[optind], otherwise a login shell.
+  *
+  * barrier:     synchronization object shared with the parent (sync points #1..#4 below)
+  * directory:   only checked by assert() here; the helpers below receive NULL or "" instead
+  * secondary:   if set and no explicit personality was configured, PER_LINUX32 is selected
+  * kmsg_socket: handed to setup_kmsg(), then closed
+  * rtnl_socket: handed to send_rtnl(), then closed
+  * fds:         file descriptors that stay open (non-CLOEXEC) for the payload
+  * argc/argv:   this program's own arguments; everything from optind on goes to the payload
+  *
+  * Only returns if setup failed or none of the exec attempts succeeded; the value returned is a
+  * negative error in the paths visible here. */
+ _cleanup_free_ char *home = NULL;
+ unsigned n_env = 2; /* next free slot in envp[]; slots 0 and 1 are filled statically below */
+ const char *envp[] = {
+ "PATH=" DEFAULT_PATH_SPLIT_USR,
+ "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
+ NULL, /* TERM */
+ NULL, /* HOME */
+ NULL, /* USER */
+ NULL, /* LOGNAME */
+ NULL, /* container_uuid */
+ NULL, /* LISTEN_FDS */
+ NULL, /* LISTEN_PID */
+ NULL
+ };
+
+ _cleanup_strv_free_ char **env_use = NULL;
+ int r;
+
+ assert(barrier);
+ assert(directory);
+ assert(kmsg_socket >= 0);
+
+ cg_unified_flush();
+
+ if (arg_userns) {
+ /* Tell the parent, that it now can write the UID map. */
+ (void) barrier_place(barrier); /* #1 */
+
+ /* Wait until the parent wrote the UID map */
+ if (!barrier_place_and_sync(barrier)) { /* #2 */
+ log_error("Parent died too early");
+ return -ESRCH;
+ }
+ }
+ /* outer_child() already called mount_all(directory, false); this is the second call, with
+  * (NULL, true). NOTE(review): the flag presumably selects the mounts that have to be made
+  * from inside the new namespaces - confirm against mount_all(). */
+ r = mount_all(NULL, true);
+ if (r < 0)
+ return r;
+
+ /* Wait until we are cgroup-ified, so that we
+ * can mount the right cgroup path writable */
+ if (!barrier_place_and_sync(barrier)) { /* #3 */
+ log_error("Parent died too early");
+ return -ESRCH;
+ }
+
+ r = mount_systemd_cgroup_writable("");
+ if (r < 0)
+ return r;
+
+ r = reset_uid_gid();
+ if (r < 0)
+ return log_error_errno(r, "Couldn't become new root: %m");
+
+ r = setup_boot_id(NULL);
+ if (r < 0)
+ return r;
+
+ r = setup_kmsg(NULL, kmsg_socket);
+ if (r < 0)
+ return r;
+ kmsg_socket = safe_close(kmsg_socket);
+
+ umask(0022);
+
+ if (setsid() < 0)
+ return log_error_errno(errno, "setsid() failed: %m");
+
+ if (arg_private_network)
+ loopback_setup(); /* result is not checked */
+
+ r = send_rtnl(rtnl_socket);
+ if (r < 0)
+ return r;
+ rtnl_socket = safe_close(rtnl_socket);
+
+ if (drop_capabilities() < 0)
+ return log_error_errno(errno, "drop_capabilities() failed: %m");
+
+ setup_hostname();
+
+ if (arg_personality != PERSONALITY_INVALID) { /* an explicitly configured personality wins */
+ if (personality(arg_personality) < 0)
+ return log_error_errno(errno, "personality() failed: %m");
+ } else if (secondary) {
+ if (personality(PER_LINUX32) < 0)
+ return log_error_errno(errno, "personality() failed: %m");
+ }
+
+#ifdef HAVE_SELINUX
+ if (arg_selinux_context)
+ if (setexeccon((security_context_t) arg_selinux_context) < 0)
+ return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
+#endif
+
+ r = change_uid_gid(&home);
+ if (r < 0)
+ return r;
+ /* Fill the remaining envp[] slots in order. TERM is passed through only if it is set in our
+  * own environment; if it is not, n_env is not advanced and the slot is reused for HOME. */
+ envp[n_env] = strv_find_prefix(environ, "TERM=");
+ if (envp[n_env])
+ n_env ++;
+
+ /* The asprintf()ed strings are not freed in this function. NOTE(review): presumably fine since
+  * the process either execs or exits right after returning - confirm. */
+ if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
+ (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
+ (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
+ return log_oom();
+
+ if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
+ char as_uuid[37];
+
+ if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
+ return log_oom();
+ }
+
+ if (fdset_size(fds) > 0) {
+ r = fdset_cloexec(fds, false); /* the fds must survive the exec below */
+ if (r < 0)
+ return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
+
+ if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
+ (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
+ return log_oom();
+ }
+
+ env_use = strv_env_merge(2, envp, arg_setenv); /* combine the block built above with arg_setenv */
+ if (!env_use)
+ return log_oom();
+
+ /* Let the parent know that we are ready and
+ * wait until the parent is ready with the
+ * setup, too... */
+ if (!barrier_place_and_sync(barrier)) { /* #4 */
+ log_error("Parent died too early");
+ return -ESRCH;
+ }
+
+ /* Now, explicitly close the log, so that we
+ * then can close all remaining fds. Closing
+ * the log explicitly first has the benefit
+ * that the logging subsystem knows about it,
+ * and is thus ready to be reopened should we
+ * need it again. Note that the other fds
+ * closed here are at least the locking and
+ * barrier fds. */
+ log_close();
+ (void) fdset_close_others(fds);
+
+ if (arg_boot) {
+ char **a;
+ size_t m;
+
+ /* Automatically search for the init system */
+
+ m = 1 + argc - optind; /* remaining arguments plus one, so the memcpy() also copies the entry at argv[argc] */
+ a = newa(char*, m + 1);
+ memcpy(a + 1, argv + optind, m * sizeof(char*));
+
+ /* Each execve() only returns on failure, in which case the next candidate path is tried. */
+ a[0] = (char*) "/usr/lib/systemd/systemd";
+ execve(a[0], a, env_use);
+
+ a[0] = (char*) "/lib/systemd/systemd";
+ execve(a[0], a, env_use);
+
+ a[0] = (char*) "/sbin/init";
+ execve(a[0], a, env_use);
+ } else if (argc > optind)
+ execvpe(argv[optind], argv + optind, env_use);
+ else {
+ chdir(home ? home : "/root"); /* result is not checked */
+ execle("/bin/bash", "-bash", NULL, env_use);
+ execle("/bin/sh", "-sh", NULL, env_use);
+ }
+ /* Only reached if no exec succeeded. The log was closed above, so reopen it to report the
+  * failure. NOTE(review): errno is read after log_open(); confirm log_open() leaves it intact. */
+ (void) log_open();
+ return log_error_errno(errno, "execv() failed: %m");
+}
+
+/* First stage of container startup. Runs in the child that main() clones with only a new mount
+ * namespace (CLONE_NEWNS). It prepares the container tree below 'directory', moves the root there,
+ * clones the inner child with the remaining requested namespaces and reports the inner child's PID
+ * to the parent over pid_socket.
+ *
+ * barrier:           synchronization object, passed on to inner_child()
+ * directory:         root directory of the container tree
+ * console:           terminal path; opened as stdin/stdout/stderr when 'interactive' is set and
+ *                    handed to setup_dev_console()
+ * *_device, *_rw:    block devices and their read-write flags, handed to mount_devices()
+ * interactive:       whether stdin/stdout/stderr are replaced by 'console'
+ * secondary:         passed on to inner_child()
+ * pid_socket:        socket on which the inner child's PID is sent to the parent
+ * kmsg_socket,
+ * rtnl_socket:       passed on to inner_child()
+ * uid_shift_socket:  socket on which arg_uid_shift is sent to the parent (only if arg_userns)
+ * fds, argc, argv:   passed on to inner_child()
+ *
+ * Returns 0 once the PID has been sent, or a negative value on failure. */
+static int outer_child(
+ Barrier *barrier,
+ const char *directory,
+ const char *console,
+ const char *root_device, bool root_device_rw,
+ const char *home_device, bool home_device_rw,
+ const char *srv_device, bool srv_device_rw,
+ bool interactive,
+ bool secondary,
+ int pid_socket,
+ int kmsg_socket,
+ int rtnl_socket,
+ int uid_shift_socket,
+ FDSet *fds,
+ int argc,
+ char *argv[]) {
+
+ pid_t pid;
+ ssize_t l;
+ int r;
+
+ assert(barrier);
+ assert(directory);
+ assert(console);
+ assert(pid_socket >= 0);
+ assert(kmsg_socket >= 0);
+
+ cg_unified_flush();
+
+ if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
+ return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
+
+ if (interactive) {
+ /* Close fds 0-2 first so that the terminal opened next is expected to land on fd 0. */
+ close_nointr(STDIN_FILENO);
+ close_nointr(STDOUT_FILENO);
+ close_nointr(STDERR_FILENO);
+
+ r = open_terminal(console, O_RDWR);
+ if (r != STDIN_FILENO) {
+ if (r >= 0) {
+ /* Opened, but not on fd 0: treat as an error. */
+ safe_close(r);
+ r = -EINVAL;
+ }
+
+ return log_error_errno(r, "Failed to open console: %m");
+ }
+
+ if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
+ dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
+ return log_error_errno(errno, "Failed to duplicate console: %m");
+ }
+
+ r = reset_audit_loginuid();
+ if (r < 0)
+ return r;
+
+ /* Mark everything as slave, so that we still
+ * receive mounts from the real root, but don't
+ * propagate mounts to the real root. */
+ if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
+ return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
+
+ r = mount_devices(directory,
+ root_device, root_device_rw,
+ home_device, home_device_rw,
+ srv_device, srv_device_rw);
+ if (r < 0)
+ return r;
+
+ r = determine_uid_shift(directory);
+ if (r < 0)
+ return r;
+
+ if (arg_userns) {
+ /* Report the UID shift to the parent. */
+ l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to send UID shift: %m");
+ if (l != sizeof(arg_uid_shift)) {
+ log_error("Short write while sending UID shift.");
+ return -EIO;
+ }
+ }
+
+ /* Turn directory into bind mount */
+ if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
+ return log_error_errno(errno, "Failed to make bind mount: %m");
+
+ r = setup_volatile(directory);
+ if (r < 0)
+ return r;
+
+ r = setup_volatile_state(directory);
+ if (r < 0)
+ return r;
+
+ r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
+ if (r < 0)
+ return r;
+
+ if (arg_read_only) {
+ r = bind_remount_recursive(directory, true);
+ if (r < 0)
+ return log_error_errno(r, "Failed to make tree read-only: %m");
+ }
+
+ r = mount_all(directory, false);
+ if (r < 0)
+ return r;
+
+ /* Capture the result in r: returning the stale, non-negative r left over from mount_all()
+ * would make the caller treat this failure as success. */
+ r = copy_devnodes(directory);
+ if (r < 0)
+ return r;
+
+ dev_setup(directory, arg_uid_shift, arg_uid_shift);
+
+ /* Same as above: propagate the helper's own negative result. */
+ r = setup_pts(directory);
+ if (r < 0)
+ return r;
+
+ r = setup_propagate(directory);
+ if (r < 0)
+ return r;
+
+ r = setup_dev_console(directory, console);
+ if (r < 0)
+ return r;
+
+ r = setup_seccomp();
+ if (r < 0)
+ return r;
+
+ r = setup_timezone(directory);
+ if (r < 0)
+ return r;
+
+ r = setup_resolv_conf(directory);
+ if (r < 0)
+ return r;
+
+ r = setup_journal(directory);
+ if (r < 0)
+ return r;
+
+ r = mount_custom(directory);
+ if (r < 0)
+ return r;
+
+ r = mount_cgroups(directory);
+ if (r < 0)
+ return r;
+
+ r = mount_move_root(directory);
+ if (r < 0)
+ return log_error_errno(r, "Failed to move root directory: %m");
+
+ pid = raw_clone(SIGCHLD|CLONE_NEWNS|
+ (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
+ (arg_private_network ? CLONE_NEWNET : 0) |
+ (arg_userns ? CLONE_NEWUSER : 0),
+ NULL);
+ if (pid < 0)
+ return log_error_errno(errno, "Failed to fork inner child: %m");
+ if (pid == 0) {
+ /* These two sockets are only used by the outer child. */
+ pid_socket = safe_close(pid_socket);
+ uid_shift_socket = safe_close(uid_shift_socket);
+
+ /* The inner child has all namespaces that are
+ * requested, so that we all are owned by the user if
+ * user namespaces are turned on. */
+
+ r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
+ if (r < 0)
+ _exit(EXIT_FAILURE);
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ /* Report the inner child's PID to the parent. */
+ l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to send PID: %m");
+ if (l != sizeof(pid)) {
+ log_error("Short write while sending PID.");
+ return -EIO;
+ }
+
+ pid_socket = safe_close(pid_socket);
+
+ return 0;
+}
+
+/* Writes the UID and GID maps of process 'pid' through /proc/<pid>/uid_map and
+ * /proc/<pid>/gid_map. Both files receive the same single line: "0 <arg_uid_shift> <arg_uid_range>".
+ * Returns 0 on success, or the logged negative error of the failing write. */
+static int setup_uid_map(pid_t pid) {
+ char map_path[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
+ char mapping[DECIMAL_STR_MAX(uid_t)*3+3+1];
+ int r;
+
+ assert(pid > 1);
+
+ /* One mapping line, shared by both files: we always assign the same UID and GID ranges. */
+ xsprintf(mapping, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
+
+ xsprintf(map_path, "/proc/" PID_FMT "/uid_map", pid);
+ r = write_string_file(map_path, mapping, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to write UID map: %m");
+
+ /* "gid_map" is as long as "uid_map", so the path buffer can be reused as is. */
+ xsprintf(map_path, "/proc/" PID_FMT "/gid_map", pid);
+ r = write_string_file(map_path, mapping, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to write GID map: %m");
+
+ return 0;
+}
+
+/* Changes the owner and group of the cgroup directory of process 'pid', and of a fixed list of
+ * attribute files inside it, to arg_uid_shift. A failing chown of an individual entry is only
+ * logged (at debug level for ENOENT, as a warning otherwise) and does not fail the call.
+ * Returns 0, or a logged negative error if the cgroup directory cannot be resolved or opened. */
+static int chown_cgroup(pid_t pid) {
+ _cleanup_free_ char *cgroup_path = NULL, *fs_path = NULL;
+ _cleanup_close_ int dir_fd = -1;
+ const char *entry;
+ int r;
+
+ r = cg_pid_get_path(NULL, pid, &cgroup_path);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get container cgroup path: %m");
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, NULL, &fs_path);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
+
+ dir_fd = open(fs_path, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
+ if (dir_fd < 0)
+ return log_error_errno(errno, "Failed to open %s: %m", fs_path);
+
+ /* "." is the cgroup directory itself; the remaining names are resolved relative to it. */
+ FOREACH_STRING(entry,
+ ".",
+ "tasks",
+ "notify_on_release",
+ "cgroup.procs",
+ "cgroup.clone_children",
+ "cgroup.controllers",
+ "cgroup.subtree_control",
+ "cgroup.populated") {
+ if (fchownat(dir_fd, entry, arg_uid_shift, arg_uid_shift, 0) < 0)
+ log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
+ "Failed to chown() cgroup file %s, ignoring: %m", entry);
+ }
+
+ return 0;
+}
+
+/* Makes sure process 'pid' sits in the same cgroup path in the hierarchy the container is going
+ * to use as it does in the hierarchy the host uses. Nothing is done when the host's layout, as
+ * reported by cg_unified(), already matches arg_unified_cgroup_hierarchy. Otherwise the other
+ * hierarchy is mounted on a temporary directory, the PID is written to cgroup.procs below the
+ * same cgroup path there, and the temporary mount and directory are removed again.
+ * Returns 0 on success or when nothing had to be done, a negative error otherwise. */
+static int sync_cgroup(pid_t pid) {
+ _cleanup_free_ char *cgroup = NULL;
+ char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid_t) + 1];
+ bool undo_mount = false;
+ const char *fn;
+ int unified, r;
+
+ unified = cg_unified();
+ if (unified < 0)
+ return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
+
+ if ((unified > 0) == arg_unified_cgroup_hierarchy)
+ return 0;
+
+ /* When the host uses the legacy cgroup setup, but the
+ * container shall use the unified hierarchy, let's make sure
+ * we copy the path from the name=systemd hierarchy into the
+ * unified hierarchy. Similar for the reverse situation. */
+
+ r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid);
+
+ /* In order to access the unified hierarchy we need to mount it */
+ if (!mkdtemp(tree))
+ return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
+
+ /* Mount whichever hierarchy the host is not using: the legacy name=systemd one if the host
+ * is unified, otherwise the other one via the __DEVEL__sane_behavior option.
+ * NOTE(review): the error message below says "unified hierarchy" in both cases. */
+ if (unified)
+ r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
+ else
+ r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior");
+ if (r < 0) {
+ r = log_error_errno(errno, "Failed to mount unified hierarchy: %m");
+ goto finish;
+ }
+
+ undo_mount = true;
+
+ fn = strjoina(tree, cgroup, "/cgroup.procs");
+ (void) mkdir_parents(fn, 0755);
+
+ /* xsprintf(), as already used by setup_uid_map(), instead of an unchecked sprintf(). */
+ xsprintf(pid_string, PID_FMT, pid);
+ r = write_string_file(fn, pid_string, 0);
+ if (r < 0)
+ log_error_errno(r, "Failed to move process: %m");
+
+finish:
+ /* Always clean up; r carries the outcome of the mount or of the write. */
+ if (undo_mount)
+ (void) umount(tree);
+
+ (void) rmdir(tree);
+ return r;
+}
+
+/* In the unified hierarchy inner nodes may only contain subgroups, but not processes. Hence, if
+ * we are running in the unified hierarchy and the container does the same, and we did not create
+ * a scope unit for the container, move the container (pid) and ourselves into two separate
+ * subcgroups, "payload" and "supervisor", below our current cgroup.
+ * Returns 0 on success or when nothing had to be done, a logged negative error otherwise. */
+static int create_subcgroup(pid_t pid) {
+ _cleanup_free_ char *cgroup = NULL;
+ const char *payload, *supervisor;
+ int unified, r;
+
+ /* Nothing to do unless both options are set. */
+ if (!arg_keep_unit || !arg_unified_cgroup_hierarchy)
+ return 0;
+
+ unified = cg_unified();
+ if (unified < 0)
+ return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
+ if (unified == 0)
+ return 0;
+
+ r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get our control group: %m");
+
+ /* The container process goes into the "payload" subgroup ... */
+ payload = strjoina(cgroup, "/payload");
+ r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create %s subcgroup: %m", payload);
+
+ /* ... and the "supervisor" subgroup is attached with PID 0, as in the lookup above. */
+ supervisor = strjoina(cgroup, "/supervisor");
+ r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, supervisor, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create %s subcgroup: %m", supervisor);
+
+ return 0;
+}
+
int main(int argc, char *argv[]) {
_cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
int r, n_fd_passed, loop_nr = -1;
char veth_name[IFNAMSIZ];
bool secondary = false, remove_subvol = false;
- sigset_t mask, mask_chld;
+ sigset_t mask_chld;
pid_t pid = 0;
int ret = EXIT_SUCCESS;
union in_addr_union exposed = {};
goto finish;
}
- log_close();
n_fd_passed = sd_listen_fds(false);
if (n_fd_passed > 0) {
r = fdset_new_listen_fds(&fds, false);
goto finish;
}
}
- fdset_close_others(fds);
- log_open();
if (arg_directory) {
assert(!arg_image);
* the specified is not a mount point we
* create the new snapshot in the parent
* directory, just next to it. */
- r = path_is_mount_point(arg_directory, false);
+ r = path_is_mount_point(arg_directory, 0);
if (r < 0) {
log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
goto finish;
}
if (r > 0)
- r = tempfn_random_child(arg_directory, &np);
+ r = tempfn_random_child(arg_directory, "machine.", &np);
else
- r = tempfn_random(arg_directory, &np);
+ r = tempfn_random(arg_directory, "machine.", &np);
if (r < 0) {
log_error_errno(r, "Failed to generate name for snapshot: %m");
goto finish;
goto finish;
}
- r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
+ r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
if (r < 0) {
log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
goto finish;
}
if (arg_template) {
- r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
+ r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
if (r == -EEXIST) {
if (!arg_quiet)
log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
goto finish;
}
- r = determine_uid_shift();
+ r = custom_mounts_prepare();
if (r < 0)
goto finish;
- interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
+ interactive =
+ isatty(STDIN_FILENO) > 0 &&
+ isatty(STDOUT_FILENO) > 0;
master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
if (master < 0) {
log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
arg_machine, arg_image ?: arg_directory);
- assert_se(sigemptyset(&mask) == 0);
- sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
- assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
+ assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
assert_se(sigemptyset(&mask_chld) == 0);
assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
+ if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
+ r = log_error_errno(errno, "Failed to become subreaper: %m");
+ goto finish;
+ }
+
for (;;) {
- _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
+ _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
+ uid_shift_socket_pair[2] = { -1, -1 };
ContainerStatus container_status;
_cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
- struct sigaction sa = {
+ static const struct sigaction sa = {
.sa_handler = nop_handler,
.sa_flags = SA_NOCLDSTOP,
};
+ int ifi = 0;
+ ssize_t l;
+ _cleanup_event_unref_ sd_event *event = NULL;
+ _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
+ _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
+ char last_char = 0;
r = barrier_create(&barrier);
if (r < 0) {
goto finish;
}
+ if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
+ r = log_error_errno(errno, "Failed to create pid socket pair: %m");
+ goto finish;
+ }
+
+ if (arg_userns)
+ if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
+ r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
+ goto finish;
+ }
+
/* Child can be killed before execv(), so handle SIGCHLD
* in order to interrupt parent's blocking calls and
* give it a chance to call wait() and terminate. */
goto finish;
}
- pid = raw_clone(SIGCHLD|CLONE_NEWNS|
- (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
- (arg_private_network ? CLONE_NEWNET : 0), NULL);
+ pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
if (pid < 0) {
if (errno == EINVAL)
r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
}
if (pid == 0) {
- /* child */
- _cleanup_free_ char *home = NULL;
- unsigned n_env = 2;
- const char *envp[] = {
- "PATH=" DEFAULT_PATH_SPLIT_USR,
- "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
- NULL, /* TERM */
- NULL, /* HOME */
- NULL, /* USER */
- NULL, /* LOGNAME */
- NULL, /* container_uuid */
- NULL, /* LISTEN_FDS */
- NULL, /* LISTEN_PID */
- NULL
- };
- char **env_use;
-
+ /* The outer child only has a file system namespace. */
barrier_set_role(&barrier, BARRIER_CHILD);
- envp[n_env] = strv_find_prefix(environ, "TERM=");
- if (envp[n_env])
- n_env ++;
-
master = safe_close(master);
kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
-
- reset_all_signal_handlers();
- reset_signal_mask();
-
- if (interactive) {
- close_nointr(STDIN_FILENO);
- close_nointr(STDOUT_FILENO);
- close_nointr(STDERR_FILENO);
-
- r = open_terminal(console, O_RDWR);
- if (r != STDIN_FILENO) {
- if (r >= 0) {
- safe_close(r);
- r = -EINVAL;
- }
-
- log_error_errno(r, "Failed to open console: %m");
- _exit(EXIT_FAILURE);
- }
-
- if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
- dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
- log_error_errno(errno, "Failed to duplicate console: %m");
- _exit(EXIT_FAILURE);
- }
- }
-
- if (setsid() < 0) {
- log_error_errno(errno, "setsid() failed: %m");
- _exit(EXIT_FAILURE);
- }
-
- if (reset_audit_loginuid() < 0)
- _exit(EXIT_FAILURE);
-
- if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
- log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
- _exit(EXIT_FAILURE);
- }
-
- if (arg_private_network)
- loopback_setup();
-
- /* Mark everything as slave, so that we still
- * receive mounts from the real root, but don't
- * propagate mounts to the real root. */
- if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
- log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
- _exit(EXIT_FAILURE);
- }
-
- if (mount_devices(arg_directory,
- root_device, root_device_rw,
- home_device, home_device_rw,
- srv_device, srv_device_rw) < 0)
- _exit(EXIT_FAILURE);
-
- /* Turn directory into bind mount */
- if (mount(arg_directory, arg_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
- log_error_errno(errno, "Failed to make bind mount: %m");
- _exit(EXIT_FAILURE);
- }
-
- r = setup_volatile(arg_directory);
- if (r < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_volatile_state(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- r = base_filesystem_create(arg_directory);
- if (r < 0)
- _exit(EXIT_FAILURE);
-
- if (arg_read_only) {
- r = bind_remount_recursive(arg_directory, true);
- if (r < 0) {
- log_error_errno(r, "Failed to make tree read-only: %m");
- _exit(EXIT_FAILURE);
- }
- }
-
- if (mount_all(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- if (copy_devnodes(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_ptmx(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- dev_setup(arg_directory);
-
- if (setup_propagate(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_seccomp() < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_dev_console(arg_directory, console) < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
- _exit(EXIT_FAILURE);
- kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
-
- if (send_rtnl(rtnl_socket_pair[1]) < 0)
- _exit(EXIT_FAILURE);
- rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
-
- /* Tell the parent that we are ready, and that
- * it can cgroupify us to that we lack access
- * to certain devices and resources. */
- (void) barrier_place(&barrier); /* #1 */
-
- if (setup_boot_id(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_timezone(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_resolv_conf(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_journal(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- if (mount_binds(arg_directory, arg_bind, false) < 0)
- _exit(EXIT_FAILURE);
-
- if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
- _exit(EXIT_FAILURE);
-
- if (mount_tmpfs(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- /* Wait until we are cgroup-ified, so that we
- * can mount the right cgroup path writable */
- (void) barrier_place_and_sync(&barrier); /* #2 */
-
- if (mount_cgroup(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- if (chdir(arg_directory) < 0) {
- log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
- _exit(EXIT_FAILURE);
- }
-
- if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
- log_error_errno(errno, "mount(MS_MOVE) failed: %m");
- _exit(EXIT_FAILURE);
- }
-
- if (chroot(".") < 0) {
- log_error_errno(errno, "chroot() failed: %m");
- _exit(EXIT_FAILURE);
- }
-
- if (chdir("/") < 0) {
- log_error_errno(errno, "chdir() failed: %m");
- _exit(EXIT_FAILURE);
- }
-
- if (arg_userns) {
- if (unshare(CLONE_NEWUSER) < 0) {
- log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
- _exit(EXIT_FAILURE);
- }
-
- /* Tell the parent, that it now can
- * write the UID map. */
- (void) barrier_place(&barrier); /* #3 */
-
- /* Wait until the parent wrote the UID
- * map */
- (void) barrier_place_and_sync(&barrier); /* #4 */
- }
-
- umask(0022);
-
- if (drop_capabilities() < 0) {
- log_error_errno(errno, "drop_capabilities() failed: %m");
- _exit(EXIT_FAILURE);
- }
-
- setup_hostname();
-
- if (arg_personality != 0xffffffffLU) {
- if (personality(arg_personality) < 0) {
- log_error_errno(errno, "personality() failed: %m");
- _exit(EXIT_FAILURE);
- }
- } else if (secondary) {
- if (personality(PER_LINUX32) < 0) {
- log_error_errno(errno, "personality() failed: %m");
- _exit(EXIT_FAILURE);
- }
- }
-
-#ifdef HAVE_SELINUX
- if (arg_selinux_context)
- if (setexeccon((security_context_t) arg_selinux_context) < 0) {
- log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
- _exit(EXIT_FAILURE);
- }
-#endif
-
- r = change_uid_gid(&home);
+ pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
+ uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
+
+ (void) reset_all_signal_handlers();
+ (void) reset_signal_mask();
+
+ r = outer_child(&barrier,
+ arg_directory,
+ console,
+ root_device, root_device_rw,
+ home_device, home_device_rw,
+ srv_device, srv_device_rw,
+ interactive,
+ secondary,
+ pid_socket_pair[1],
+ kmsg_socket_pair[1],
+ rtnl_socket_pair[1],
+ uid_shift_socket_pair[1],
+ fds,
+ argc, argv);
if (r < 0)
_exit(EXIT_FAILURE);
- if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
- (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
- (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
- log_oom();
- _exit(EXIT_FAILURE);
- }
-
- if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
- char as_uuid[37];
-
- if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
- log_oom();
- _exit(EXIT_FAILURE);
- }
- }
-
- if (fdset_size(fds) > 0) {
- r = fdset_cloexec(fds, false);
- if (r < 0) {
- log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
- _exit(EXIT_FAILURE);
- }
-
- if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
- (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
- log_oom();
- _exit(EXIT_FAILURE);
- }
- }
-
- if (!strv_isempty(arg_setenv)) {
- char **n;
-
- n = strv_env_merge(2, envp, arg_setenv);
- if (!n) {
- log_oom();
- _exit(EXIT_FAILURE);
- }
-
- env_use = n;
- } else
- env_use = (char**) envp;
-
- /* Let the parent know that we are ready and
- * wait until the parent is ready with the
- * setup, too... */
- (void) barrier_place_and_sync(&barrier); /* #5 */
-
- if (arg_boot) {
- char **a;
- size_t l;
-
- /* Automatically search for the init system */
-
- l = 1 + argc - optind;
- a = newa(char*, l + 1);
- memcpy(a + 1, argv + optind, l * sizeof(char*));
-
- a[0] = (char*) "/usr/lib/systemd/systemd";
- execve(a[0], a, env_use);
-
- a[0] = (char*) "/lib/systemd/systemd";
- execve(a[0], a, env_use);
-
- a[0] = (char*) "/sbin/init";
- execve(a[0], a, env_use);
- } else if (argc > optind)
- execvpe(argv[optind], argv + optind, env_use);
- else {
- chdir(home ? home : "/root");
- execle("/bin/bash", "-bash", NULL, env_use);
- execle("/bin/sh", "-sh", NULL, env_use);
- }
-
- log_error_errno(errno, "execv() failed: %m");
- _exit(EXIT_FAILURE);
+ _exit(EXIT_SUCCESS);
}
barrier_set_role(&barrier, BARRIER_PARENT);
+
fdset_free(fds);
fds = NULL;
kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
+ pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
- (void) barrier_place(&barrier); /* #1 */
-
- /* Wait for the most basic Child-setup to be done,
- * before we add hardware to it, and place it in a
- * cgroup. */
- if (barrier_sync(&barrier)) { /* #1 */
- int ifi = 0;
+ /* Wait for the outer child. */
+ r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
+ if (r < 0)
+ goto finish;
+ if (r != 0) {
+ r = -EIO;
+ goto finish;
+ }
+ pid = 0;
- r = move_network_interfaces(pid);
- if (r < 0)
- goto finish;
+ /* And now retrieve the PID of the inner child. */
+ l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
+ if (l < 0) {
+ r = log_error_errno(errno, "Failed to read inner child PID: %m");
+ goto finish;
+ }
+ if (l != sizeof(pid)) {
+ log_error("Short read while reading inner child PID: %m");
+ r = EIO;
+ goto finish;
+ }
- r = setup_veth(pid, veth_name, &ifi);
- if (r < 0)
- goto finish;
+ log_debug("Init process invoked as PID " PID_FMT, pid);
- r = setup_bridge(veth_name, &ifi);
- if (r < 0)
+ if (arg_userns) {
+ if (!barrier_place_and_sync(&barrier)) { /* #1 */
+ log_error("Child died too early.");
+ r = -ESRCH;
goto finish;
+ }
- r = setup_macvlan(pid);
- if (r < 0)
+ l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
+ if (l < 0) {
+ r = log_error_errno(errno, "Failed to read UID shift: %m");
goto finish;
-
- r = setup_ipvlan(pid);
- if (r < 0)
+ }
+ if (l != sizeof(arg_uid_shift)) {
+ log_error("Short read while reading UID shift: %m");
+ r = EIO;
goto finish;
+ }
- r = register_machine(pid, ifi);
+ r = setup_uid_map(pid);
if (r < 0)
goto finish;
- /* Notify the child that the parent is ready with all
- * its setup, and that the child can now hand over
- * control to the code to run inside the container. */
(void) barrier_place(&barrier); /* #2 */
+ }
- if (arg_userns) {
- char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
+ r = move_network_interfaces(pid);
+ if (r < 0)
+ goto finish;
- (void) barrier_place_and_sync(&barrier); /* #3 */
+ r = setup_veth(pid, veth_name, &ifi);
+ if (r < 0)
+ goto finish;
- xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
- xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
- r = write_string_file(uid_map, line);
- if (r < 0) {
- log_error_errno(r, "Failed to write UID map: %m");
- goto finish;
- }
+ r = setup_bridge(veth_name, &ifi);
+ if (r < 0)
+ goto finish;
- /* We always assign the same UID and GID ranges */
- xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
- r = write_string_file(uid_map, line);
- if (r < 0) {
- log_error_errno(r, "Failed to write GID map: %m");
- goto finish;
- }
+ r = setup_macvlan(pid);
+ if (r < 0)
+ goto finish;
- (void) barrier_place(&barrier); /* #4 */
- }
+ r = setup_ipvlan(pid);
+ if (r < 0)
+ goto finish;
- /* Block SIGCHLD here, before notifying child.
- * process_pty() will handle it with the other signals. */
- r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
- if (r < 0)
- goto finish;
+ r = register_machine(pid, ifi);
+ if (r < 0)
+ goto finish;
- /* Reset signal to default */
- r = default_signals(SIGCHLD, -1);
- if (r < 0)
- goto finish;
+ r = sync_cgroup(pid);
+ if (r < 0)
+ goto finish;
- /* Let the child know that we are ready and wait that the child is completely ready now. */
- if (barrier_place_and_sync(&barrier)) { /* #5 */
- _cleanup_event_unref_ sd_event *event = NULL;
- _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
- _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
- char last_char = 0;
+ r = create_subcgroup(pid);
+ if (r < 0)
+ goto finish;
- sd_notifyf(false,
- "READY=1\n"
- "STATUS=Container running.\n"
- "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
+ r = chown_cgroup(pid);
+ if (r < 0)
+ goto finish;
- r = sd_event_new(&event);
- if (r < 0) {
- log_error_errno(r, "Failed to get default event source: %m");
- goto finish;
- }
+ /* Notify the child that the parent is ready with all
+ * its setup (including cgroup-ification), and that
+ * the child can now hand over control to the code to
+ * run inside the container. */
+ (void) barrier_place(&barrier); /* #3 */
- if (arg_kill_signal > 0) {
- /* Try to kill the init system on SIGINT or SIGTERM */
- sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
- sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
- } else {
- /* Immediately exit */
- sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
- sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
- }
+ /* Block SIGCHLD here, before notifying child.
+ * process_pty() will handle it with the other signals. */
+ assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
- /* simply exit on sigchld */
- sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
+ /* Reset signal to default */
+ r = default_signals(SIGCHLD, -1);
+ if (r < 0) {
+ log_error_errno(r, "Failed to reset SIGCHLD: %m");
+ goto finish;
+ }
- if (arg_expose_ports) {
- r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
- if (r < 0)
- goto finish;
+ /* Let the child know that we are ready and wait that the child is completely ready now. */
+ if (!barrier_place_and_sync(&barrier)) { /* #5 */
+ log_error("Client died too early.");
+ r = -ESRCH;
+ goto finish;
+ }
- (void) expose_ports(rtnl, &exposed);
- }
+ sd_notifyf(false,
+ "READY=1\n"
+ "STATUS=Container running.\n"
+ "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
- rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
+ r = sd_event_new(&event);
+ if (r < 0) {
+ log_error_errno(r, "Failed to get default event source: %m");
+ goto finish;
+ }
- r = pty_forward_new(event, master, true, !interactive, &forward);
- if (r < 0) {
- log_error_errno(r, "Failed to create PTY forwarder: %m");
- goto finish;
- }
+ if (arg_kill_signal > 0) {
+ /* Try to kill the init system on SIGINT or SIGTERM */
+ sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
+ sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
+ } else {
+ /* Immediately exit */
+ sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
+ sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
+ }
- r = sd_event_loop(event);
- if (r < 0) {
- log_error_errno(r, "Failed to run event loop: %m");
- goto finish;
- }
+ /* simply exit on sigchld */
+ sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
- pty_forward_get_last_char(forward, &last_char);
+ if (arg_expose_ports) {
+ r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
+ if (r < 0)
+ goto finish;
- forward = pty_forward_free(forward);
+ (void) expose_ports(rtnl, &exposed);
+ }
- if (!arg_quiet && last_char != '\n')
- putc('\n', stdout);
+ rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
- /* Kill if it is not dead yet anyway */
- terminate_machine(pid);
- }
+ r = pty_forward_new(event, master, true, !interactive, &forward);
+ if (r < 0) {
+ log_error_errno(r, "Failed to create PTY forwarder: %m");
+ goto finish;
+ }
+
+ r = sd_event_loop(event);
+ if (r < 0) {
+ log_error_errno(r, "Failed to run event loop: %m");
+ goto finish;
}
+ pty_forward_get_last_char(forward, &last_char);
+
+ forward = pty_forward_free(forward);
+
+ if (!arg_quiet && last_char != '\n')
+ putc('\n', stdout);
+
+ /* Kill if it is not dead yet anyway */
+ terminate_machine(pid);
+
/* Normally redundant, but better safe than sorry */
kill(pid, SIGKILL);
"STOPPING=1\n"
"STATUS=Terminating...");
- loop_remove(loop_nr, &image_fd);
-
if (pid > 0)
kill(pid, SIGKILL);
+ /* Try to flush whatever is still queued in the pty */
+ if (master >= 0)
+ (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
+
+ loop_remove(loop_nr, &image_fd);
+
if (remove_subvol && arg_directory) {
int k;
strv_free(arg_network_interfaces);
strv_free(arg_network_macvlan);
strv_free(arg_network_ipvlan);
- strv_free(arg_bind);
- strv_free(arg_bind_ro);
- strv_free(arg_tmpfs);
+ custom_mount_free_all();
flush_ports(&exposed);