STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
+/* Applies a console mode given by name.
+ *
+ * "help" prints the list of known mode names to stdout and returns 0 without touching any setting.
+ * A known mode name is stored in arg_console_mode, SETTING_CONSOLE_MODE is added to
+ * arg_settings_mask, and 1 is returned. For any other string the result of log_error_errno() for a
+ * synthetic EINVAL is returned. */
+static int handle_arg_console(const char *arg) {
+        if (streq(arg, "help")) {
+                puts("interactive\n"
+                     "read-only\n"
+                     "passive\n"
+                     "pipe");
+                return 0;
+        }
+
+        if (streq(arg, "interactive"))
+                arg_console_mode = CONSOLE_INTERACTIVE;
+        else if (streq(arg, "read-only"))
+                arg_console_mode = CONSOLE_READ_ONLY;
+        else if (streq(arg, "passive"))
+                arg_console_mode = CONSOLE_PASSIVE;
+        else if (streq(arg, "pipe"))
+                arg_console_mode = CONSOLE_PIPE;
+        else
+                /* Report the string we were actually handed: this helper is also invoked with
+                 * literals, so the getopt global optarg is not necessarily what was parsed. */
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", arg);
+
+        arg_settings_mask |= SETTING_CONSOLE_MODE;
+        return 1;
+}
+
static int help(void) {
_cleanup_free_ char *link = NULL;
int r;
}
static int detect_unified_cgroup_hierarchy_from_environment(void) {
- const char *e;
+ const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
int r;
/* Allow the user to control whether the unified hierarchy is used */
- e = getenv("UNIFIED_CGROUP_HIERARCHY");
- if (e) {
+
+ e = getenv(var);
+ if (!e) {
+ static bool warned = false;
+
+ var = "UNIFIED_CGROUP_HIERARCHY";
+ e = getenv(var);
+ if (e && !warned) {
+ log_info("$UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY.");
+ warned = true;
+ }
+ }
+
+ if (!isempty(e)) {
r = parse_boolean(e);
if (r < 0)
- return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
+ return log_error_errno(r, "Failed to parse $%s: %m", var);
if (r > 0)
arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
else
static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
int r;
- /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the
- * image actually supports. */
+ /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
+ * in the image actually supports. */
r = cg_all_unified();
if (r < 0)
return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
return 0;
}
-static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
+static int parse_share_ns_env(const char *name, unsigned long ns_flag) { /* Reads the boolean environment variable `name` via getenv_bool(): a true value clears ns_flag in arg_clone_ns_flags, a false value sets it. */
int r;
r = getenv_bool(name);
if (r == -ENXIO)
- return;
+ return 0; /* -ENXIO (presumably: variable not set) is not an error; nothing is changed */
if (r < 0)
- log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
+ return log_error_errno(r, "Failed to parse $%s: %m", name); /* any other failure is logged and handed back to the caller */
arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
+ return 0; /* value applied and SETTING_CLONE_NS_FLAGS recorded in the settings mask */
}
-static void parse_mount_settings_env(void) {
+static int parse_mount_settings_env(void) { /* Applies $SYSTEMD_NSPAWN_TMPFS_TMP and $SYSTEMD_NSPAWN_API_VFS_WRITABLE to arg_mount_settings; returns 0, or the log_error_errno() result when a value fails to parse. */
const char *e;
int r;
r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
+ if (r < 0 && r != -ENXIO) /* -ENXIO is tolerated here; every other failure is returned */
+ return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
if (r >= 0)
SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
- else if (r != -ENXIO)
- log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP, ignoring: %m");
e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
- if (!e)
- return;
-
- if (streq(e, "network")) {
+ if (streq_ptr(e, "network")) /* special non-boolean value: turns on both the APIVFS_RO and APIVFS_NETNS flags */
arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
- return;
- }
- r = parse_boolean(e);
- if (r < 0) {
- log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
- return;
+ else if (e) { /* any other value that is set is parsed as a boolean */
+ r = parse_boolean(e);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
+
+ SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0); /* the read-only flag follows "r == 0", i.e. a false value */
+ SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
}
- SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
- SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
+ return 0; /* reached when the variable is not set or was applied successfully */
}
-static void parse_environment(void) {
+static int parse_environment(void) {
const char *e;
int r;
- parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
- parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
- parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
- parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
+ r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
+ if (r < 0)
+ return r;
+ r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
+ if (r < 0)
+ return r;
+ r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
+ if (r < 0)
+ return r;
+ r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
+ if (r < 0)
+ return r;
- parse_mount_settings_env();
+ r = parse_mount_settings_env();
+ if (r < 0)
+ return r;
/* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
* even if it is supported. If not supported, it has no effect. */
r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
if (r < 0) {
if (r != -ENXIO)
- log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS, ignoring: %m");
+ return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
arg_use_cgns = true;
} else {
if (e)
arg_container_service_name = e;
- detect_unified_cgroup_hierarchy_from_environment();
+ return detect_unified_cgroup_hierarchy_from_environment();
}
static int parse_argv(int argc, char *argv[]) {
case ARG_NETWORK_ZONE: {
char *j;
- j = strappend("vz-", optarg);
+ j = strjoin("vz-", optarg);
if (!j)
return log_oom();
case ARG_RLIMIT: {
const char *eq;
- char *name;
+ _cleanup_free_ char *name = NULL;
int rl;
if (streq(optarg, "help")) {
break;
case ARG_CONSOLE:
- if (streq(optarg, "interactive"))
- arg_console_mode = CONSOLE_INTERACTIVE;
- else if (streq(optarg, "read-only"))
- arg_console_mode = CONSOLE_READ_ONLY;
- else if (streq(optarg, "passive"))
- arg_console_mode = CONSOLE_PASSIVE;
- else if (streq(optarg, "pipe"))
- arg_console_mode = CONSOLE_PIPE;
- else if (streq(optarg, "help"))
- puts("interactive\n"
- "read-only\n"
- "passive\n"
- "pipe");
- else
- return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
-
- arg_settings_mask |= SETTING_CONSOLE_MODE;
+ r = handle_arg_console(optarg);
+ if (r <= 0)
+ return r;
break;
case 'P':
case ARG_PIPE:
- arg_console_mode = CONSOLE_PIPE;
- arg_settings_mask |= SETTING_CONSOLE_MODE;
+ r = handle_arg_console("pipe");
+ if (r <= 0)
+ return r;
break;
case ARG_NO_PAGER:
arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
/* Make sure to parse environment before we reset the settings mask below */
- parse_environment();
+ r = parse_environment();
+ if (r < 0)
+ return r;
/* Load all settings from .nspawn files */
if (mask_no_settings)
static int verify_arguments(void) {
int r;
+ if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
+ /* If we are running the stub init in the container, we don't need to look at what the init
+ * in the container supports, because we are not using it. Let's immediately pick the right
+ * setting based on the host system configuration.
+ *
+ * We only do this, if the user didn't use an environment variable to override the detection.
+ */
+
+ r = cg_all_unified();
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
+ if (r > 0)
+ arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
+ else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
+ arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
+ else
+ arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
+ }
+
if (arg_userns_mode != USER_NAMESPACE_NO)
arg_mount_settings |= MOUNT_USE_USERNS;
if (m == TIMEZONE_OFF)
return 0;
- r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
+ r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
if (r < 0) {
log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
return 0;
return 0; /* Already pointing to the right place? Then do nothing .. */
check = strjoina(dest, "/usr/share/zoneinfo/", z);
- r = chase_symlinks(check, dest, 0, NULL);
+ r = chase_symlinks(check, dest, 0, NULL, NULL);
if (r < 0)
log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
else {
_cleanup_free_ char *resolved = NULL;
int found;
- found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
+ found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
if (found < 0) {
log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
return 0;
if (m == RESOLV_CONF_OFF)
return 0;
- r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
+ r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
if (r < 0) {
log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
return 0;
_cleanup_free_ char *resolved = NULL;
int found;
- found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
+ found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
if (found < 0) {
log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
return 0;
if (!*p)
return 0;
- r = chase_symlinks(*p, NULL, flags, &chased);
+ r = chase_symlinks(*p, NULL, flags, &chased, NULL);
if (r < 0)
return log_error_errno(r, "Failed to resolve path %s: %m", *p);
- free_and_replace(*p, chased);
- return r; /* r might be an fd here in case we ever use CHASE_OPEN in flags */
+ return free_and_replace(*p, chased);
}
static int determine_uid_shift(const char *directory) {
"Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
}
- if (!dissected_image) {
- /* Turn directory into bind mount */
+ if (path_equal(directory, "/")) {
+ /* If the directory we shall boot is the host, let's operate on a bind mount at a different
+ * place, so that we can make changes to its mount structure (for example, to implement
+ * --volatile=) without this interfering with our ability to access files such as
+ * /etc/localtime to copy into the container. Note that we use a fixed place for this
+ * (instead of a temporary directory), since we are living in our own mount namespace here
+ * already, and thus don't need to be afraid of colliding with anyone else's mounts. */
+ (void) mkdir_p("/run/systemd/nspawn-root", 0755);
+
+ r = mount_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return r;
+
+ directory = "/run/systemd/nspawn-root";
+
+ } else if (!dissected_image) {
+ /* Turn directory into bind mount (we need that so that we can move the bind mount to root
+ * later on). */
r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
if (r < 0)
return r;
p = file_in_same_dir(arg_image, fn);
if (!p)
return log_oom();
- } else if (arg_directory) {
+ } else if (arg_directory && !path_equal(arg_directory, "/")) {
p = file_in_same_dir(arg_directory, fn);
if (!p)
return log_oom();
}
/* Kill if it is not dead yet anyway */
- if (bus) {
- if (arg_register)
- terminate_machine(bus, arg_machine);
- else if (!arg_keep_unit)
- terminate_scope(bus, arg_machine);
- }
+ if (!arg_register && !arg_keep_unit && bus)
+ terminate_scope(bus, arg_machine);
/* Normally redundant, but better safe than sorry */
(void) kill(*pid, SIGKILL);
r = wait_for_container(*pid, &container_status);
*pid = 0;
+ /* Tell machined that we are gone. */
+ if (bus)
+ (void) unregister_machine(bus, arg_machine);
+
if (r < 0)
/* We failed to wait for the container, or the container exited abnormally. */
return r;
}
static int run(int argc, char *argv[]) {
+ bool secondary = false, remove_directory = false, remove_image = false,
+ veth_created = false, remove_tmprootdir = false;
_cleanup_close_ int master = -1;
_cleanup_fdset_free_ FDSet *fds = NULL;
int r, n_fd_passed, ret = EXIT_SUCCESS;
char veth_name[IFNAMSIZ] = "";
- bool secondary = false, remove_directory = false, remove_image = false;
- pid_t pid = 0;
union in_addr_union exposed = {};
_cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
- bool veth_created = false, remove_tmprootdir = false;
char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
_cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
_cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
_cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
+ pid_t pid = 0;
log_parse_environment();
log_open();
if (r < 0)
goto finish;
- r = cg_unified_flush();
+ r = cg_unified();
if (r < 0) {
log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
goto finish;
if (r < 0)
goto finish;
- r = detect_unified_cgroup_hierarchy_from_environment();
- if (r < 0)
- goto finish;
+ /* Reapply environment settings. */
+ (void) detect_unified_cgroup_hierarchy_from_environment();
/* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
* the result is closed. Note that the container payload child will reset signal mask+handler anyway,
if (arg_directory) {
assert(!arg_image);
- if (path_equal(arg_directory, "/") && !arg_ephemeral) {
- log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
+ /* Safety precaution: let's not allow running images from the live host OS image, as long as
+ * /var from the host will propagate into container dynamically (because bad things happen if
+ * two systems write to the same /var). Let's allow it for the special cases where /var is
+ * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
+ if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
+ log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
r = -EINVAL;
goto finish;
}
if (r < 0)
goto finish;
- /* If the specified path is a mount point we
- * generate the new snapshot immediately
- * inside it under a random name. However if
- * the specified is not a mount point we
- * create the new snapshot in the parent
- * directory, just next to it. */
+ /* If the specified path is a mount point we generate the new snapshot immediately
+ * inside it under a random name. However, if the specified path is not a mount point we
+ * create the new snapshot in the parent directory, just next to it. */
r = path_is_mount_point(arg_directory, NULL, 0);
if (r < 0) {
log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
goto finish;
}
- r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+ /* We take an exclusive lock on this image, since it's our private, ephemeral copy
+ * only owned by us and no one else. */
+ r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
if (r < 0) {
log_error_errno(r, "Failed to lock %s: %m", np);
goto finish;
}
- r = btrfs_subvol_snapshot(arg_directory, np,
- (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
- BTRFS_SNAPSHOT_FALLBACK_COPY |
- BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
- BTRFS_SNAPSHOT_RECURSIVE |
- BTRFS_SNAPSHOT_QUOTA);
+ {
+ BLOCK_SIGNALS(SIGINT);
+ r = btrfs_subvol_snapshot(arg_directory, np,
+ (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
+ BTRFS_SNAPSHOT_FALLBACK_COPY |
+ BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
+ BTRFS_SNAPSHOT_RECURSIVE |
+ BTRFS_SNAPSHOT_QUOTA |
+ BTRFS_SNAPSHOT_SIGINT);
+ }
+ if (r == -EINTR) {
+ log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
+ goto finish;
+ }
if (r < 0) {
log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
goto finish;
}
free_and_replace(arg_directory, np);
-
remove_directory = true;
-
} else {
r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
if (r < 0)
if (r < 0)
goto finish;
- r = btrfs_subvol_snapshot(arg_template, arg_directory,
- (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
- BTRFS_SNAPSHOT_FALLBACK_COPY |
- BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
- BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
- BTRFS_SNAPSHOT_RECURSIVE |
- BTRFS_SNAPSHOT_QUOTA);
+ {
+ BLOCK_SIGNALS(SIGINT);
+ r = btrfs_subvol_snapshot(arg_template, arg_directory,
+ (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
+ BTRFS_SNAPSHOT_FALLBACK_COPY |
+ BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
+ BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
+ BTRFS_SNAPSHOT_RECURSIVE |
+ BTRFS_SNAPSHOT_QUOTA |
+ BTRFS_SNAPSHOT_SIGINT);
+ }
if (r == -EEXIST)
log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
"Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
- else if (r < 0) {
+ else if (r == -EINTR) {
+ log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
+ goto finish;
+ } else if (r < 0) {
log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
goto finish;
} else
goto finish;
}
- r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+ /* Always take an exclusive lock on our own ephemeral copy. */
+ r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
if (r < 0) {
r = log_error_errno(r, "Failed to create image lock: %m");
goto finish;
}
- r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
+ {
+ BLOCK_SIGNALS(SIGINT);
+ r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
+ }
+ if (r == -EINTR) {
+ log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
+ goto finish;
+ }
if (r < 0) {
r = log_error_errno(r, "Failed to copy image file: %m");
goto finish;
}
free_and_replace(arg_image, np);
-
remove_image = true;
} else {
r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);