]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/nspawn/nspawn.c
tree-wide: drop stdio.h when stdio-util.h is included
[thirdparty/systemd.git] / src / nspawn / nspawn.c
index fed5c651cedfa6c56fd72b23f0143cf98a085898..c87e8591949facd285c9bb4a06124c2d2c0dcad6 100644 (file)
@@ -1,22 +1,15 @@
 /* SPDX-License-Identifier: LGPL-2.1+ */
 
 #if HAVE_BLKID
-#include <blkid.h>
 #endif
 #include <errno.h>
 #include <getopt.h>
-#include <grp.h>
 #include <linux/fs.h>
 #include <linux/loop.h>
-#include <pwd.h>
-#include <sched.h>
 #if HAVE_SELINUX
 #include <selinux/selinux.h>
 #endif
-#include <signal.h>
-#include <stdio.h>
 #include <stdlib.h>
-#include <string.h>
 #include <sys/file.h>
 #include <sys/personality.h>
 #include <sys/prctl.h>
@@ -58,7 +51,7 @@
 #include "machine-image.h"
 #include "macro.h"
 #include "main-func.h"
-#include "missing.h"
+#include "missing_sched.h"
 #include "mkdir.h"
 #include "mount-util.h"
 #include "mountpoint-util.h"
 #include "terminal-util.h"
 #include "tmpfile-util.h"
 #include "umask-util.h"
+#include "unit-name.h"
 #include "user-util.h"
 #include "util.h"
 
@@ -261,6 +255,30 @@ STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
 STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
 STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
 
+static int handle_arg_console(const char *arg) {
+        if (streq(arg, "help")) {
+                puts("interactive\n"
+                     "read-only\n"
+                     "passive\n"
+                     "pipe");
+                return 0;
+        }
+
+        if (streq(arg, "interactive"))
+                arg_console_mode = CONSOLE_INTERACTIVE;
+        else if (streq(arg, "read-only"))
+                arg_console_mode = CONSOLE_READ_ONLY;
+        else if (streq(arg, "passive"))
+                arg_console_mode = CONSOLE_PASSIVE;
+        else if (streq(arg, "pipe"))
+                arg_console_mode = CONSOLE_PIPE;
+        else
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
+
+        arg_settings_mask |= SETTING_CONSOLE_MODE;
+        return 1;
+}
+
 static int help(void) {
         _cleanup_free_ char *link = NULL;
         int r;
@@ -412,15 +430,27 @@ static int custom_mount_check_all(void) {
 }
 
 static int detect_unified_cgroup_hierarchy_from_environment(void) {
-        const char *e;
+        const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
         int r;
 
         /* Allow the user to control whether the unified hierarchy is used */
-        e = getenv("UNIFIED_CGROUP_HIERARCHY");
-        if (e) {
+
+        e = getenv(var);
+        if (!e) {
+                static bool warned = false;
+
+                var = "UNIFIED_CGROUP_HIERARCHY";
+                e = getenv(var);
+                if (e && !warned) {
+                        log_info("$UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY.");
+                        warned = true;
+                }
+        }
+
+        if (!isempty(e)) {
                 r = parse_boolean(e);
                 if (r < 0)
-                        return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
+                        return log_error_errno(r, "Failed to parse $%s: %m", var);
                 if (r > 0)
                         arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
                 else
@@ -433,8 +463,8 @@ static int detect_unified_cgroup_hierarchy_from_environment(void) {
 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
         int r;
 
-        /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the
-         * image actually supports. */
+        /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
+         * in the image actually supports. */
         r = cg_all_unified();
         if (r < 0)
                 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
@@ -467,58 +497,66 @@ static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
         return 0;
 }
 
-static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
+static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
         int r;
 
         r = getenv_bool(name);
         if (r == -ENXIO)
-                return;
+                return 0;
         if (r < 0)
-                log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
+                return log_error_errno(r, "Failed to parse $%s: %m", name);
 
         arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
         arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
+        return 0;
 }
 
-static void parse_mount_settings_env(void) {
+static int parse_mount_settings_env(void) {
         const char *e;
         int r;
 
         r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
+        if (r < 0 && r != -ENXIO)
+                return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
         if (r >= 0)
                 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
-        else if (r != -ENXIO)
-                log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP, ignoring: %m");
 
         e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
-        if (!e)
-                return;
-
-        if (streq(e, "network")) {
+        if (streq_ptr(e, "network"))
                 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
-                return;
-        }
 
-        r = parse_boolean(e);
-        if (r < 0) {
-                log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
-                return;
+        else if (e) {
+                r = parse_boolean(e);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
+
+                SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
+                SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
         }
 
-        SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
-        SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
+        return 0;
 }
 
-static void parse_environment(void) {
+static int parse_environment(void) {
         const char *e;
         int r;
 
-        parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
-        parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
-        parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
-        parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
+        r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
+        if (r < 0)
+                return r;
+        r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
+        if (r < 0)
+                return r;
+        r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
+        if (r < 0)
+                return r;
+        r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
+        if (r < 0)
+                return r;
 
-        parse_mount_settings_env();
+        r = parse_mount_settings_env();
+        if (r < 0)
+                return r;
 
         /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
          * even if it is supported. If not supported, it has no effect. */
@@ -528,7 +566,7 @@ static void parse_environment(void) {
                 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
                 if (r < 0) {
                         if (r != -ENXIO)
-                                log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS, ignoring: %m");
+                                return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
 
                         arg_use_cgns = true;
                 } else {
@@ -541,7 +579,7 @@ static void parse_environment(void) {
         if (e)
                 arg_container_service_name = e;
 
-        detect_unified_cgroup_hierarchy_from_environment();
+        return detect_unified_cgroup_hierarchy_from_environment();
 }
 
 static int parse_argv(int argc, char *argv[]) {
@@ -847,13 +885,17 @@ static int parse_argv(int argc, char *argv[]) {
                         arg_settings_mask |= SETTING_MACHINE_ID;
                         break;
 
-                case 'S':
-                        r = free_and_strdup(&arg_slice, optarg);
+                case 'S': {
+                        _cleanup_free_ char *mangled = NULL;
+
+                        r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
                         if (r < 0)
                                 return log_oom();
 
+                        free_and_replace(arg_slice, mangled);
                         arg_settings_mask |= SETTING_SLICE;
                         break;
+                }
 
                 case 'M':
                         if (isempty(optarg))
@@ -1369,29 +1411,16 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
 
                 case ARG_CONSOLE:
-                        if (streq(optarg, "interactive"))
-                                arg_console_mode = CONSOLE_INTERACTIVE;
-                        else if (streq(optarg, "read-only"))
-                                arg_console_mode = CONSOLE_READ_ONLY;
-                        else if (streq(optarg, "passive"))
-                                arg_console_mode = CONSOLE_PASSIVE;
-                        else if (streq(optarg, "pipe"))
-                                arg_console_mode = CONSOLE_PIPE;
-                        else if (streq(optarg, "help"))
-                                puts("interactive\n"
-                                     "read-only\n"
-                                     "passive\n"
-                                     "pipe");
-                        else
-                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
-
-                        arg_settings_mask |= SETTING_CONSOLE_MODE;
+                        r = handle_arg_console(optarg);
+                        if (r <= 0)
+                                return r;
                         break;
 
                 case 'P':
                 case ARG_PIPE:
-                        arg_console_mode = CONSOLE_PIPE;
-                        arg_settings_mask |= SETTING_CONSOLE_MODE;
+                        r = handle_arg_console("pipe");
+                        if (r <= 0)
+                                return r;
                         break;
 
                 case ARG_NO_PAGER:
@@ -1424,7 +1453,9 @@ static int parse_argv(int argc, char *argv[]) {
         arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
 
         /* Make sure to parse environment before we reset the settings mask below */
-        parse_environment();
+        r = parse_environment();
+        if (r < 0)
+                return r;
 
         /* Load all settings from .nspawn files */
         if (mask_no_settings)
@@ -1440,6 +1471,25 @@ static int parse_argv(int argc, char *argv[]) {
 static int verify_arguments(void) {
         int r;
 
+        if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
+                /* If we are running the stub init in the container, we don't need to look at what the init
+                 * in the container supports, because we are not using it. Let's immediately pick the right
+                 * setting based on the host system configuration.
+                 *
+                 * We only do this, if the user didn't use an environment variable to override the detection.
+                 */
+
+                r = cg_all_unified();
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
+                if (r > 0)
+                        arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
+                else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
+                        arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
+                else
+                        arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
+        }
+
         if (arg_userns_mode != USER_NAMESPACE_NO)
                 arg_mount_settings |= MOUNT_USE_USERNS;
 
@@ -1616,7 +1666,7 @@ static int setup_timezone(const char *dest) {
         if (m == TIMEZONE_OFF)
                 return 0;
 
-        r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
+        r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
         if (r < 0) {
                 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
                 return 0;
@@ -1647,7 +1697,7 @@ static int setup_timezone(const char *dest) {
                         return 0; /* Already pointing to the right place? Then do nothing .. */
 
                 check = strjoina(dest, "/usr/share/zoneinfo/", z);
-                r = chase_symlinks(check, dest, 0, NULL);
+                r = chase_symlinks(check, dest, 0, NULL, NULL);
                 if (r < 0)
                         log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
                 else {
@@ -1674,7 +1724,7 @@ static int setup_timezone(const char *dest) {
                 _cleanup_free_ char *resolved = NULL;
                 int found;
 
-                found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
+                found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
                 if (found < 0) {
                         log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
                         return 0;
@@ -1780,7 +1830,7 @@ static int setup_resolv_conf(const char *dest) {
         if (m == RESOLV_CONF_OFF)
                 return 0;
 
-        r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
+        r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
         if (r < 0) {
                 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
                 return 0;
@@ -1804,7 +1854,7 @@ static int setup_resolv_conf(const char *dest) {
                 _cleanup_free_ char *resolved = NULL;
                 int found;
 
-                found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
+                found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
                 if (found < 0) {
                         log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
                         return 0;
@@ -2333,7 +2383,8 @@ static int drop_capabilities(uid_t uid) {
                 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
                  * in order to maintain the same behavior as systemd < 242. */
                 if (capability_quintet_mangle(&q))
-                        log_warning("Some capabilities will not be set because they are not in the current bounding set.");
+                        log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
+                                 "Some capabilities will not be set because they are not in the current bounding set.");
 
         }
 
@@ -2681,12 +2732,11 @@ static int chase_symlinks_and_update(char **p, unsigned flags) {
         if (!*p)
                 return 0;
 
-        r = chase_symlinks(*p, NULL, flags, &chased);
+        r = chase_symlinks(*p, NULL, flags, &chased, NULL);
         if (r < 0)
                 return log_error_errno(r, "Failed to resolve path %s: %m", *p);
 
-        free_and_replace(*p, chased);
-        return r; /* r might be an fd here in case we ever use CHASE_OPEN in flags */
+        return free_and_replace(*p, chased);
 }
 
 static int determine_uid_shift(const char *directory) {
@@ -3264,8 +3314,24 @@ static int outer_child(
                          "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
         }
 
-        if (!dissected_image) {
-                /* Turn directory into bind mount */
+        if (path_equal(directory, "/")) {
+                /* If the directory we shall boot is the host, let's operate on a bind mount at a different
+                 * place, so that we can make changes to its mount structure (for example, to implement
+                 * --volatile=) without this interfering with our ability to access files such as
+                 * /etc/localtime to copy into the container. Note that we use a fixed place for this
+                 * (instead of a temporary directory, since we are living in our own mount namspace here
+                 * already, and thus don't need to be afraid of colliding with anyone else's mounts).*/
+                (void) mkdir_p("/run/systemd/nspawn-root", 0755);
+
+                r = mount_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
+                if (r < 0)
+                        return r;
+
+                directory = "/run/systemd/nspawn-root";
+
+        } else if (!dissected_image) {
+                /* Turn directory into bind mount (we need that so that we can move the bind mount to root
+                 * later on). */
                 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
                 if (r < 0)
                         return r;
@@ -4018,7 +4084,7 @@ static int load_settings(void) {
                         p = file_in_same_dir(arg_image, fn);
                         if (!p)
                                 return log_oom();
-                } else if (arg_directory) {
+                } else if (arg_directory && !path_equal(arg_directory, "/")) {
                         p = file_in_same_dir(arg_directory, fn);
                         if (!p)
                                 return log_oom();
@@ -4551,12 +4617,8 @@ static int run_container(
         }
 
         /* Kill if it is not dead yet anyway */
-        if (bus) {
-                if (arg_register)
-                        terminate_machine(bus, arg_machine);
-                else if (!arg_keep_unit)
-                        terminate_scope(bus, arg_machine);
-        }
+        if (!arg_register && !arg_keep_unit && bus)
+                terminate_scope(bus, arg_machine);
 
         /* Normally redundant, but better safe than sorry */
         (void) kill(*pid, SIGKILL);
@@ -4564,6 +4626,10 @@ static int run_container(
         r = wait_for_container(*pid, &container_status);
         *pid = 0;
 
+        /* Tell machined that we are gone. */
+        if (bus)
+                (void) unregister_machine(bus, arg_machine);
+
         if (r < 0)
                 /* We failed to wait for the container, or the container exited abnormally. */
                 return r;
@@ -4704,7 +4770,7 @@ static int run(int argc, char *argv[]) {
         if (r < 0)
                 goto finish;
 
-        r = cg_unified_flush();
+        r = cg_unified();
         if (r < 0) {
                 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
                 goto finish;
@@ -4714,9 +4780,8 @@ static int run(int argc, char *argv[]) {
         if (r < 0)
                 goto finish;
 
-        r = detect_unified_cgroup_hierarchy_from_environment();
-        if (r < 0)
-                goto finish;
+        /* Reapply environment settings. */
+        (void) detect_unified_cgroup_hierarchy_from_environment();
 
         /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
          * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
@@ -4740,8 +4805,12 @@ static int run(int argc, char *argv[]) {
         if (arg_directory) {
                 assert(!arg_image);
 
-                if (path_equal(arg_directory, "/") && !arg_ephemeral) {
-                        log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
+                /* Safety precaution: let's not allow running images from the live host OS image, as long as
+                 * /var from the host will propagate into container dynamically (because bad things happen if
+                 * two systems write to the same /var). Let's allow it for the special cases where /var is
+                 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
+                if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
+                        log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
                         r = -EINVAL;
                         goto finish;
                 }
@@ -4770,7 +4839,9 @@ static int run(int argc, char *argv[]) {
                                 goto finish;
                         }
 
-                        r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+                        /* We take an exclusive lock on this image, since it's our private, ephemeral copy
+                         * only owned by us and noone else. */
+                        r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
                         if (r < 0) {
                                 log_error_errno(r, "Failed to lock %s: %m", np);
                                 goto finish;
@@ -4890,7 +4961,8 @@ static int run(int argc, char *argv[]) {
                                 goto finish;
                         }
 
-                        r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+                        /* Always take an exclusive lock on our own ephemeral copy. */
+                        r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
                         if (r < 0) {
                                 r = log_error_errno(r, "Failed to create image lock: %m");
                                 goto finish;