git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/nspawn/nspawn.c
nspawn: turn on watchdog logic for nspawn too
[thirdparty/systemd.git] / src / nspawn / nspawn.c
index 82fd0bd5f4a5aec940af7cb199e13cef09f16a88..f217def92d53b7b03aabc929bd0a1c469b3890d4 100644 (file)
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
 /***
   This file is part of systemd.
 
@@ -17,7 +18,7 @@
   along with systemd; If not, see <http://www.gnu.org/licenses/>.
 ***/
 
-#ifdef HAVE_BLKID
+#if HAVE_BLKID
 #include <blkid.h>
 #endif
 #include <errno.h>
@@ -26,7 +27,7 @@
 #include <linux/loop.h>
 #include <pwd.h>
 #include <sched.h>
-#ifdef HAVE_SELINUX
+#if HAVE_SELINUX
 #include <selinux/selinux.h>
 #endif
 #include <signal.h>
@@ -77,6 +78,7 @@
 #include "mount-util.h"
 #include "netlink-util.h"
 #include "nspawn-cgroup.h"
+#include "nspawn-def.h"
 #include "nspawn-expose-ports.h"
 #include "nspawn-mount.h"
 #include "nspawn-network.h"
 #include "user-util.h"
 #include "util.h"
 
-/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
- * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
- * may have their own allocation ranges too. */
-#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
-#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
+#if HAVE_SPLIT_USR
+#define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
+#else
+#define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
+#endif
 
 /* nspawn is listening on the socket at the path in the constant nspawn_notify_socket_path
  * nspawn_notify_socket_path is relative to the container
@@ -318,7 +320,7 @@ static int custom_mount_check_all(void) {
         return 0;
 }
 
-static int detect_unified_cgroup_hierarchy(const char *directory) {
+static int detect_unified_cgroup_hierarchy_from_environment(void) {
         const char *e;
         int r;
 
@@ -332,11 +334,16 @@ static int detect_unified_cgroup_hierarchy(const char *directory) {
                         arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
                 else
                         arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
-
-                return 0;
         }
 
-        /* Otherwise inherit the default from the host system */
+        return 0;
+}
+
+static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
+        int r;
+
+        /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the
+         * image actually supports. */
         r = cg_all_unified();
         if (r < 0)
                 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
@@ -362,6 +369,10 @@ static int detect_unified_cgroup_hierarchy(const char *directory) {
         } else
                 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
 
+        log_debug("Using %s hierarchy for container.",
+                  arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
+                  arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
+
         return 0;
 }
 
@@ -573,8 +584,7 @@ static int parse_argv(int argc, char *argv[]) {
                         if (r < 0)
                                 return log_oom();
 
-                        /* fall through */
-
+                        _fallthrough_;
                 case 'n':
                         arg_network_veth = true;
                         arg_private_network = true;
@@ -628,8 +638,7 @@ static int parse_argv(int argc, char *argv[]) {
                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
                                 return log_oom();
 
-                        /* fall through */
-
+                        _fallthrough_;
                 case ARG_PRIVATE_NETWORK:
                         arg_private_network = true;
                         arg_settings_mask |= SETTING_NETWORK;
@@ -1120,6 +1129,8 @@ static int parse_argv(int argc, char *argv[]) {
                 arg_userns_chown = true;
 
         if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) {
+                /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
+                 * The latter is not technically a user session, but we don't need to labour the point. */
                 log_error("--keep-unit --register=yes may not be used when invoked from a user session.");
                 return -EINVAL;
         }
@@ -1234,7 +1245,7 @@ static int verify_arguments(void) {
                 return -EINVAL;
         }
 
-#ifndef HAVE_LIBIPTC
+#if ! HAVE_LIBIPTC
         if (arg_expose_ports) {
                 log_error("--port= is not supported, compiled without libiptc support.");
                 return -EOPNOTSUPP;
@@ -1410,7 +1421,7 @@ static int setup_resolv_conf(const char *dest) {
                 return 0;
         }
 
-        if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0 &&
+        if (access(STATIC_RESOLV_CONF, F_OK) >= 0 &&
             resolved_listening() > 0) {
 
                 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
@@ -1422,7 +1433,7 @@ static int setup_resolv_conf(const char *dest) {
                 if (found == 0) /* missing? */
                         (void) touch(resolved);
 
-                r = mount_verbose(LOG_DEBUG, "/usr/lib/systemd/resolv.conf", resolved, NULL, MS_BIND, NULL);
+                r = mount_verbose(LOG_DEBUG, STATIC_RESOLV_CONF, resolved, NULL, MS_BIND, NULL);
                 if (r >= 0)
                         return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
         }
@@ -1547,7 +1558,7 @@ static int setup_pts(const char *dest) {
         const char *p;
         int r;
 
-#ifdef HAVE_SELINUX
+#if HAVE_SELINUX
         if (arg_selinux_apifs_context)
                 (void) asprintf(&options,
                                 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
@@ -2015,8 +2026,7 @@ static int wait_for_container(pid_t pid, ContainerStatus *container) {
                         return 0;
                 }
 
-                /* fall through */
-
+                _fallthrough_;
         case CLD_DUMPED:
                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
                 return -EIO;
@@ -2044,18 +2054,27 @@ static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo
 }
 
 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
+        pid_t pid;
+
+        assert(s);
+        assert(ssi);
+
+        pid = PTR_TO_PID(userdata);
+
         for (;;) {
                 siginfo_t si = {};
+
                 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
                         return log_error_errno(errno, "Failed to waitid(): %m");
                 if (si.si_pid == 0) /* No pending children. */
                         break;
-                if (si.si_pid == PTR_TO_PID(userdata)) {
+                if (si.si_pid == pid) {
                         /* The main process we care for has exited. Return from
                          * signal handler but leave the zombie. */
                         sd_event_exit(sd_event_source_get_event(s), 0);
                         break;
                 }
+
                 /* Reap all other children. */
                 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
         }
@@ -2063,6 +2082,24 @@ static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, vo
         return 0;
 }
 
+static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        pid_t pid;
+
+        assert(m);
+
+        pid = PTR_TO_PID(userdata);
+
+        if (arg_kill_signal > 0) {
+                log_info("Container termination requested. Attempting to halt container.");
+                (void) kill(pid, arg_kill_signal);
+        } else {
+                log_info("Container termination requested. Exiting.");
+                sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
+        }
+
+        return 0;
+}
+
 static int determine_names(void) {
         int r;
 
@@ -2089,7 +2126,7 @@ static int determine_names(void) {
                                 return -ENOENT;
                         }
 
-                        if (i->type == IMAGE_RAW)
+                        if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
                                 r = free_and_strdup(&arg_image, i->path);
                         else
                                 r = free_and_strdup(&arg_directory, i->path);
@@ -2332,7 +2369,7 @@ static int inner_child(
                         return log_error_errno(r, "personality() failed: %m");
         }
 
-#ifdef HAVE_SELINUX
+#if HAVE_SELINUX
         if (arg_selinux_context)
                 if (setexeccon(arg_selinux_context) < 0)
                         return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
@@ -2494,6 +2531,7 @@ static int outer_child(
                 int kmsg_socket,
                 int rtnl_socket,
                 int uid_shift_socket,
+                int unified_cgroup_hierarchy_socket,
                 FDSet *fds) {
 
         pid_t pid;
@@ -2544,7 +2582,13 @@ static int outer_child(
                 return r;
 
         if (dissected_image) {
-                r = dissected_image_mount(dissected_image, directory, DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
+                /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
+                 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
+                 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
+                 * makes sure ESP partitions and userns are compatible. */
+
+                r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
+                                          DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
                 if (r < 0)
                         return r;
         }
@@ -2580,6 +2624,32 @@ static int outer_child(
                 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
         }
 
+        if (dissected_image) {
+                /* Now that we know the uid shift, let's mount everything else that might be in the image. */
+                r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
+                                          DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
+                if (r < 0)
+                        return r;
+        }
+
+        if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
+                /* OK, we don't know yet which cgroup mode to use. Let's figure it out, and tell the parent. */
+
+                r = detect_unified_cgroup_hierarchy_from_image(directory);
+                if (r < 0)
+                        return r;
+
+                l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
+                if (l < 0)
+                        return log_error_errno(errno, "Failed to send cgroup mode: %m");
+                if (l != sizeof(arg_unified_cgroup_hierarchy)) {
+                        log_error("Short write while sending cgroup mode: %m");
+                        return -EIO;
+                }
+
+                unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
+        }
+
         /* Turn directory into bind mount */
         r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
         if (r < 0)
@@ -2771,6 +2841,7 @@ static int outer_child(
 }
 
 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
+        bool tried_hashed = false;
         unsigned n_tries = 100;
         uid_t candidate;
         int r;
@@ -2791,7 +2862,7 @@ static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
                 if (--n_tries <= 0)
                         return -EBUSY;
 
-                if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
+                if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
                         goto next;
                 if ((candidate & UINT32_C(0xFFFF)) != 0)
                         goto next;
@@ -2819,8 +2890,21 @@ static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
                 return 0;
 
         next:
-                random_bytes(&candidate, sizeof(candidate));
-                candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
+                if (arg_machine && !tried_hashed) {
+                        /* Try to hash the base from the container name */
+
+                        static const uint8_t hash_key[] = {
+                                0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
+                                0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
+                        };
+
+                        candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
+
+                        tried_hashed = true;
+                } else
+                        random_bytes(&candidate, sizeof(candidate));
+
+                candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
                 candidate &= (uid_t) UINT32_C(0xFFFF0000);
         }
 }
@@ -3212,13 +3296,16 @@ static int run(int master,
                 pid_socket_pair[2] = { -1, -1 },
                 uuid_socket_pair[2] = { -1, -1 },
                 notify_socket_pair[2] = { -1, -1 },
-                uid_shift_socket_pair[2] = { -1, -1 };
+                uid_shift_socket_pair[2] = { -1, -1 },
+                unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
+
         _cleanup_close_ int notify_socket= -1;
         _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
         _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
         _cleanup_(sd_event_unrefp) sd_event *event = NULL;
         _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
         _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
         ContainerStatus container_status = 0;
         char last_char = 0;
         int ifi = 0, r;
@@ -3264,6 +3351,10 @@ static int run(int master,
                 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
                         return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
 
+        if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
+                if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
+                        return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
+
         /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
          * parent's blocking calls and give it a chance to call wait() and terminate. */
         r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
@@ -3292,6 +3383,7 @@ static int run(int master,
                 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
                 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
                 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
+                unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
 
                 (void) reset_all_signal_handlers();
                 (void) reset_signal_mask();
@@ -3308,6 +3400,7 @@ static int run(int master,
                                 kmsg_socket_pair[1],
                                 rtnl_socket_pair[1],
                                 uid_shift_socket_pair[1],
+                                unified_cgroup_hierarchy_socket_pair[1],
                                 fds);
                 if (r < 0)
                         _exit(EXIT_FAILURE);
@@ -3325,6 +3418,7 @@ static int run(int master,
         uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
         notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
         uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
+        unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
 
         if (arg_userns_mode != USER_NAMESPACE_NO) {
                 /* The child just let us know the UID shift it might have read from the image. */
@@ -3355,6 +3449,17 @@ static int run(int master,
                 }
         }
 
+        if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
+                /* The child let us know the supported cgroup mode it might have read from the image. */
+                l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
+                if (l < 0)
+                        return log_error_errno(errno, "Failed to read cgroup mode: %m");
+                if (l != sizeof(arg_unified_cgroup_hierarchy)) {
+                        log_error("Short read while reading cgroup mode.");
+                        return -EIO;
+                }
+        }
+
         /* Wait for the outer child. */
         r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
         if (r != 0)
@@ -3449,8 +3554,31 @@ static int run(int master,
                         return r;
         }
 
+        if (arg_register || !arg_keep_unit) {
+                r = sd_bus_default_system(&bus);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to open system bus: %m");
+        }
+
+        if (!arg_keep_unit) {
+                /* When a new scope is created for this container, then we'll be registered as its controller, in which
+                 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
+                 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
+
+                r = sd_bus_add_match(bus, NULL,
+                                     "type='signal',"
+                                     "sender='org.freedesktop.systemd1',"
+                                     "interface='org.freedesktop.systemd1.Scope',"
+                                     "member='RequestStop'",
+                                     on_request_stop, PID_TO_PTR(*pid));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to install request stop match: %m");
+        }
+
         if (arg_register) {
+
                 r = register_machine(
+                                bus,
                                 arg_machine,
                                 *pid,
                                 arg_directory,
@@ -3464,8 +3592,11 @@ static int run(int master,
                                 arg_container_service_name);
                 if (r < 0)
                         return r;
+
         } else if (!arg_keep_unit) {
+
                 r = allocate_scope(
+                                bus,
                                 arg_machine,
                                 *pid,
                                 arg_slice,
@@ -3488,7 +3619,7 @@ static int run(int master,
                         return r;
         }
 
-        r = chown_cgroup(*pid, arg_uid_shift);
+        r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
         if (r < 0)
                 return r;
 
@@ -3511,6 +3642,14 @@ static int run(int master,
         if (r < 0)
                 return log_error_errno(r, "Failed to get default event source: %m");
 
+        (void) sd_event_set_watchdog(event, true);
+
+        if (bus) {
+                r = sd_bus_attach_event(bus, event, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to attach bus to event loop: %m");
+        }
+
         r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
         if (r < 0)
                 return r;
@@ -3572,8 +3711,8 @@ static int run(int master,
                 putc('\n', stdout);
 
         /* Kill if it is not dead yet anyway */
-        if (arg_register && !arg_keep_unit)
-                terminate_machine(*pid);
+        if (arg_register && !arg_keep_unit && bus)
+                terminate_machine(bus, *pid);
 
         /* Normally redundant, but better safe than sorry */
         (void) kill(*pid, SIGKILL);
@@ -3661,6 +3800,10 @@ int main(int argc, char *argv[]) {
         if (r < 0)
                 goto finish;
 
+        r = detect_unified_cgroup_hierarchy_from_environment();
+        if (r < 0)
+                goto finish;
+
         n_fd_passed = sd_listen_fds(false);
         if (n_fd_passed > 0) {
                 r = fdset_new_listen_fds(&fds, false);
@@ -3883,6 +4026,10 @@ int main(int argc, char *argv[]) {
                         log_error_errno(r, "--image= is not supported, compiled without blkid support.");
                         goto finish;
                 }
+                if (r == -EPROTONOSUPPORT) {
+                        log_error_errno(r, "Device is loopback block device with partition scanning turned off, please turn it on.");
+                        goto finish;
+                }
                 if (r < 0) {
                         log_error_errno(r, "Failed to dissect image: %m");
                         goto finish;
@@ -3904,10 +4051,6 @@ int main(int argc, char *argv[]) {
         if (r < 0)
                 goto finish;
 
-        r = detect_unified_cgroup_hierarchy(arg_directory);
-        if (r < 0)
-                goto finish;
-
         interactive =
                 isatty(STDIN_FILENO) > 0 &&
                 isatty(STDOUT_FILENO) > 0;