Merge pull request #25513 from brauner/pivot_root.nspawn

author Luca Boccassi <bluca@debian.org>

Tue, 6 Dec 2022 00:51:51 +0000 (01:51 +0100)

committer GitHub <noreply@github.com>

Tue, 6 Dec 2022 00:51:51 +0000 (01:51 +0100)
author Luca Boccassi <bluca@debian.org>
Tue, 6 Dec 2022 00:51:51 +0000 (01:51 +0100)
committer GitHub <noreply@github.com>
Tue, 6 Dec 2022 00:51:51 +0000 (01:51 +0100)
diff --git a/src/core/namespace.c b/src/core/namespace.c

index 4e9292f2de4993b3124ad4b8df892160a2c53017..4920716f3487536271996a955e961f205f0abe49 100644 (file)
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -2488,7 +2488,7 @@ int setup_namespace(
                  goto finish;
  
          /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
-        r = mount_pivot_root(root);
+        r = mount_switch_root(root, MOUNT_ATTR_PROPAGATION_INHERIT);
          if (r == -EINVAL && root_directory) {
                  /* If we are using root_directory and we don't have privileges (ie: user manager in a user
                   * namespace) and the root_directory is already a mount point in the parent namespace,
@@ -2498,7 +2498,7 @@ int setup_namespace(
                  r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
                  if (r < 0)
                          goto finish;
-                r = mount_pivot_root(root);
+                r = mount_switch_root(root, MOUNT_ATTR_PROPAGATION_INHERIT);
          }
          if (r < 0) {
                  log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");
diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c

index a54f1464bac04bb48586c714d400480759ea0085..0e8aaa1e3c4c69b0742594d29db2e8d0c55e4105 100644 (file)
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -13,6 +13,7 @@
  #include "mkdir-label.h"
  #include "mount-util.h"
  #include "mountpoint-util.h"
+#include "namespace-util.h"
  #include "nspawn-mount.h"
  #include "parse-util.h"
  #include "path-util.h"
@@ -510,6 +511,9 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
                                        MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
  }
  
+#define PROC_DEFAULT_MOUNT_FLAGS (MS_NOSUID|MS_NOEXEC|MS_NODEV) /* flags for "proc" mounts: mount_all() table and pin_fully_visible_fs() */
+#define SYS_DEFAULT_MOUNT_FLAGS  (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV) /* flags for read-only "sysfs" mounts: mount_all() table and pin_fully_visible_fs() */
+
  int mount_all(const char *dest,
                MountSettingsMask mount_settings,
                uid_t uid_shift,
@@ -538,7 +542,7 @@ int mount_all(const char *dest,
  
          static const MountPoint mount_table[] = {
                  /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
-                { "proc",            "/proc",           "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,
+                { "proc",            "/proc",           "proc",  NULL,        PROC_DEFAULT_MOUNT_FLAGS,
                    MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_MKDIR|MOUNT_FOLLOW_SYMLINKS }, /* we follow symlinks here since not following them requires /proc/ already being mounted, which we don't have here. */
  
                  { "/proc/sys",       "/proc/sys",       NULL,    NULL,        MS_BIND,
@@ -576,7 +580,7 @@ int mount_all(const char *dest,
                    MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR },
                  { "tmpfs",                  "/sys",                         "tmpfs", "mode=555" TMPFS_LIMITS_SYS,      MS_NOSUID|MS_NOEXEC|MS_NODEV,
                    MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR },
-                { "sysfs",                  "/sys",                         "sysfs", NULL,                             MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
+                { "sysfs",                  "/sys",                         "sysfs", NULL,                             SYS_DEFAULT_MOUNT_FLAGS,
                    MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR },    /* skipped if above was mounted */
                  { "sysfs",                  "/sys",                         "sysfs", NULL,                             MS_NOSUID|MS_NOEXEC|MS_NODEV,
                    MOUNT_FATAL|MOUNT_MKDIR },                          /* skipped if above was mounted */
@@ -1336,3 +1340,60 @@ done:
  
          return r;
  }
+
+#define NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS "/run/host/proc" /* temporary procfs mountpoint: mounted by pin_fully_visible_fs(), removed by do_wipe_fully_visible_fs() */
+#define NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS "/run/host/sys" /* temporary sysfs mountpoint: mounted by pin_fully_visible_fs(), removed by do_wipe_fully_visible_fs() */
+
+/* Mounts a procfs instance on NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS and a sysfs instance on
+ * NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS. Both mountpoint directories are created first on a
+ * best-effort basis. Returns 0 when both mounts succeed, otherwise the negative value reported
+ * by the failing mount_follow_verbose() call (which is asked to log at LOG_ERR). */
+int pin_fully_visible_fs(void) {
+        int q;
+
+        /* Best-effort: the results are explicitly discarded. */
+        (void) mkdir_p(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, 0755);
+        (void) mkdir_p(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, 0755);
+
+        q = mount_follow_verbose(LOG_ERR, "proc", NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, "proc", PROC_DEFAULT_MOUNT_FLAGS, NULL);
+        if (q < 0)
+                return q;
+
+        q = mount_follow_verbose(LOG_ERR, "sysfs", NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, "sysfs", SYS_DEFAULT_MOUNT_FLAGS, NULL);
+
+        /* Normalize any non-negative result to 0. */
+        return q < 0 ? q : 0;
+}
+
+/* Lazily unmounts (MNT_DETACH) the temporary procfs and sysfs instances and removes their
+ * mountpoint directories, procfs first. Stops at the first failing step, logs it and returns the
+ * result of log_error_errno(); returns 0 when all four steps succeed. Operates on whatever mount
+ * namespace the caller is currently in. */
+static int do_wipe_fully_visible_fs(void) {
+        int k;
+
+        k = umount2(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, MNT_DETACH);
+        if (k < 0)
+                return log_error_errno(errno, "Failed to unmount temporary proc: %m");
+
+        k = rmdir(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS);
+        if (k < 0)
+                return log_error_errno(errno, "Failed to remove temporary proc mountpoint: %m");
+
+        k = umount2(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, MNT_DETACH);
+        if (k < 0)
+                return log_error_errno(errno, "Failed to unmount temporary sys: %m");
+
+        k = rmdir(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS);
+        if (k < 0)
+                return log_error_errno(errno, "Failed to remove temporary sys mountpoint: %m");
+
+        return 0;
+}
+
+/* Removes the temporary procfs/sysfs instances from the mount namespace referenced by mntns_fd.
+ *
+ * The current mount namespace is pinned first, then the target namespace is entered, the wipe is
+ * performed, and finally the original namespace is re-entered. The return to the original
+ * namespace is attempted regardless of whether the wipe succeeded; if that return fails, its
+ * error takes precedence over the wipe result. Otherwise the wipe result is returned. */
+int wipe_fully_visible_fs(int mntns_fd) {
+        _cleanup_close_ int saved_mntns_fd = -EBADF;
+        int r, wipe_result;
+
+        /* Remember where we came from so that we can switch back afterwards. */
+        r = namespace_open(0, NULL, &saved_mntns_fd, NULL, NULL, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to pin originating mount namespace: %m");
+
+        r = namespace_enter(-EBADF, mntns_fd, -EBADF, -EBADF, -EBADF);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enter mount namespace: %m");
+
+        /* Don't bail out on failure here: we still have to go back to the original namespace. */
+        wipe_result = do_wipe_fully_visible_fs();
+
+        r = namespace_enter(-EBADF, saved_mntns_fd, -EBADF, -EBADF, -EBADF);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enter original mount namespace: %m");
+
+        return wipe_result;
+}
diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h

index 6bedbf9b3faec6d2615cc635d681c31ef2f16a92..bf5e47dce405fbe4b7361338051b36256540f05c 100644 (file)
--- a/src/nspawn/nspawn-mount.h
+++ b/src/nspawn/nspawn-mount.h
@@ -67,3 +67,5 @@ int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s
  int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old);
  
  int tmpfs_patch_options(const char *options,uid_t uid_shift, const char *selinux_apifs_context, char **ret);
+int pin_fully_visible_fs(void);
+int wipe_fully_visible_fs(int mntns_fd);
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c

index 1e0c8a2448d520fc40c282ed7fdfeefd274282c7..1282c8b98b7afcd50d75e612cee74fd1e1e1e764 100644 (file)
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -114,6 +114,7 @@
  
  /* The notify socket inside the container it can use to talk to nspawn using the sd_notify(3) protocol */
  #define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
+#define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"
  
  #define EXIT_FORCE_RESTART 133
  
@@ -2776,7 +2777,7 @@ static int reset_audit_loginuid(void) {
          return 0;
  }
  
-static int setup_propagate(const char *root) {
+static int mount_tunnel_dig(const char *root) {
          const char *p, *q;
          int r;
  
@@ -2789,11 +2790,11 @@ static int setup_propagate(const char *root) {
          if (r < 0)
                  return log_error_errno(r, "Failed to create /run/host: %m");
  
-        r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0);
+        r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
          if (r < 0)
-                return log_error_errno(r, "Failed to create /run/host/incoming: %m");
+                return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m");
  
-        q = prefix_roota(root, "/run/host/incoming");
+        q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL);
          r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
          if (r < 0)
                  return r;
@@ -2802,8 +2803,17 @@ static int setup_propagate(const char *root) {
          if (r < 0)
                  return r;
  
-        /* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. */
-        return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
+        return 0;
+}
+
+/* Applies MS_SLAVE propagation to NSPAWN_MOUNT_TUNNEL, addressed relative to the current root.
+ * Returns 0 on success or the negative value from mount_follow_verbose(), which is asked to log
+ * at LOG_ERR. */
+static int mount_tunnel_open(void) {
+        int k;
+
+        k = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
+
+        /* Normalize any non-negative result to 0. */
+        return k < 0 ? k : 0;
 }
  
  static int setup_machine_id(const char *directory) {
@@ -3632,7 +3642,7 @@ static int outer_child(
  
          _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
          _cleanup_strv_free_ char **os_release_pairs = NULL;
-        _cleanup_close_ int fd = -1;
+        _cleanup_close_ int fd = -1, mntns_fd = -EBADF;
          bool idmap = false;
          const char *p;
          pid_t pid;
@@ -3697,6 +3707,15 @@ static int outer_child(
                  return r;
  
          if (arg_userns_mode != USER_NAMESPACE_NO) {
+                r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to pin outer mount namespace: %m");
+
+                l = send_one_fd(notify_socket, mntns_fd, 0);
+                if (l < 0)
+                        return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
+                mntns_fd = safe_close(mntns_fd);
+
                  /* Let the parent know which UID shift we read from the image */
                  l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
                  if (l < 0)
@@ -3858,19 +3877,6 @@ static int outer_child(
                  unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
          }
  
-        /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
-         * mounts available in systemd services inside the container that create a new mount namespace.  See
-         * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
-         * will inherit the shared propagation mode.
-         *
-         * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
-         * directory mount to root later on.
-         * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
-         */
-        r = mount_nofollow_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
-        if (r < 0)
-                return r;
-
          r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
          if (r < 0)
                  return r;
@@ -3910,7 +3916,7 @@ static int outer_child(
          if (r < 0)
                  return r;
  
-        r = setup_propagate(directory);
+        r = mount_tunnel_dig(directory);
          if (r < 0)
                  return r;
  
@@ -3974,10 +3980,40 @@ static int outer_child(
                          return r;
          }
  
-        r = mount_move_root(directory);
+        /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
+         * mounts available in systemd services inside the container that create a new mount namespace.  See
+         * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
+         * will inherit the shared propagation mode.
+         *
+         * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
+         * directory mount to root later on.
+         * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
+         */
+        r = mount_switch_root(directory, MOUNT_ATTR_PROPAGATION_SHARED);
          if (r < 0)
                  return log_error_errno(r, "Failed to move root directory: %m");
  
+        /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
+         * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
+         * the container. */
+        r = mount_tunnel_open();
+        if (r < 0)
+                return r;
+
+        if (arg_userns_mode != USER_NAMESPACE_NO) {
+                /* In order to mount procfs and sysfs in an unprivileged container the kernel
+                 * requires that a fully visible instance is already present in the target mount
+                 * namespace. Mount one here so the inner child can mount its own instances. Later
+                 * we umount the temporary instances created here before we actually exec the
+                 * payload. Since the rootfs is shared the umount will propagate into the container.
+                 * Note, the inner child wouldn't be able to unmount the instances on its own since
+                 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
+                 * this. */
+                r = pin_fully_visible_fs();
+                if (r < 0)
+                        return r;
+        }
+
          fd = setup_notify_child();
          if (fd < 0)
                  return fd;
@@ -4735,12 +4771,12 @@ static int run_container(
                  rtnl_socket_pair[2] = { -1, -1 },
                  pid_socket_pair[2] = { -1, -1 },
                  uuid_socket_pair[2] = { -1, -1 },
-                notify_socket_pair[2] = { -1, -1 },
+                fd_socket_pair[2] = { -EBADF, -EBADF },
                  uid_shift_socket_pair[2] = { -1, -1 },
                  master_pty_socket_pair[2] = { -1, -1 },
                  unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
  
-        _cleanup_close_ int notify_socket = -1;
+        _cleanup_close_ int notify_socket = -1, mntns_fd = -EBADF;
          _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
          _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
          _cleanup_(sd_event_unrefp) sd_event *event = NULL;
@@ -4787,7 +4823,7 @@ static int run_container(
          if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
                  return log_error_errno(errno, "Failed to create id socket pair: %m");
  
-        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_socket_pair) < 0)
                  return log_error_errno(errno, "Failed to create notify socket pair: %m");
  
          if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
@@ -4840,7 +4876,7 @@ static int run_container(
                  rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
                  pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
                  uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
-                notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
+                fd_socket_pair[0] = safe_close(fd_socket_pair[0]);
                  master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
                  uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
                  unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
@@ -4854,7 +4890,7 @@ static int run_container(
                                  secondary,
                                  pid_socket_pair[1],
                                  uuid_socket_pair[1],
-                                notify_socket_pair[1],
+                                fd_socket_pair[1],
                                  kmsg_socket_pair[1],
                                  rtnl_socket_pair[1],
                                  uid_shift_socket_pair[1],
@@ -4876,12 +4912,16 @@ static int run_container(
          rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
          pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
          uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
-        notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
+        fd_socket_pair[1] = safe_close(fd_socket_pair[1]);
          master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
          uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
          unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
  
          if (arg_userns_mode != USER_NAMESPACE_NO) {
+                mntns_fd = receive_one_fd(fd_socket_pair[0], 0);
+                if (mntns_fd < 0)
+                        return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
+
                  /* The child just let us know the UID shift it might have read from the image. */
                  l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
                  if (l < 0)
@@ -4958,7 +4998,7 @@ static int run_container(
                  return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
  
          /* We also retrieve the socket used for notifications generated by outer child */
-        notify_socket = receive_one_fd(notify_socket_pair[0], 0);
+        notify_socket = receive_one_fd(fd_socket_pair[0], 0);
          if (notify_socket < 0)
                  return log_error_errno(notify_socket,
                                         "Failed to receive notification socket from the outer child: %m");
@@ -5143,6 +5183,13 @@ static int run_container(
          if (r < 0)
                  return r;
  
+        if (arg_userns_mode != USER_NAMESPACE_NO) {
+                r = wipe_fully_visible_fs(mntns_fd);
+                if (r < 0)
+                        return r;
+                mntns_fd = safe_close(mntns_fd);
+        }
+
          /* Let the child know that we are ready and wait that the child is completely ready now. */
          if (!barrier_place_and_sync(&barrier)) /* #5 */
                  return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
diff --git a/src/shared/mount-util.c b/src/shared/mount-util.c

index 681d698800b1ab8477f9093b3c8a7eddd81c2787..adb6b6dd279aa15d3d13709e03d259399b16a8af 100644 (file)
--- a/src/shared/mount-util.c
+++ b/src/shared/mount-util.c
@@ -36,6 +36,7 @@
  #include "set.h"
  #include "stat-util.h"
  #include "stdio-util.h"
+#include "string-table.h"
  #include "string-util.h"
  #include "strv.h"
  #include "tmpfile-util.h"
@@ -475,47 +476,41 @@ int bind_remount_one_with_mountinfo(
          return 0;
  }
  
-int mount_move_root(const char *path) {
-        assert(path);
+static const char *const mount_attr_propagation_type_table[_MOUNT_ATTR_PROPAGATION_TYPE_MAX] = {
+        [MOUNT_ATTR_PROPAGATION_INHERIT]   = "inherited",
+        [MOUNT_ATTR_PROPAGATION_PRIVATE]   = "private",
+        [MOUNT_ATTR_PROPAGATION_DEPENDENT] = "dependent",
+        [MOUNT_ATTR_PROPAGATION_SHARED]    = "shared",
+}; /* names per MountAttrPropagationType; presumably consumed by DEFINE_STRING_TABLE_LOOKUP() - confirm */
  
-        if (chdir(path) < 0)
-                return -errno;
+DEFINE_STRING_TABLE_LOOKUP(mount_attr_propagation_type, MountAttrPropagationType);
  
-        if (mount(path, "/", NULL, MS_MOVE, NULL) < 0)
-                return -errno;
-
-        if (chroot(".") < 0)
-                return -errno;
-
-        return RET_NERRNO(chdir("/"));
+/* Maps a MountAttrPropagationType to the corresponding MS_* propagation flag for mount(2).
+ * MOUNT_ATTR_PROPAGATION_INHERIT maps to 0; any unlisted value hits assert_not_reached(). */
+unsigned int mount_attr_propagation_type_to_flag(MountAttrPropagationType t) {
+        if (t == MOUNT_ATTR_PROPAGATION_INHERIT)
+                return 0;
+        if (t == MOUNT_ATTR_PROPAGATION_PRIVATE)
+                return MS_PRIVATE;
+        if (t == MOUNT_ATTR_PROPAGATION_DEPENDENT)
+                return MS_SLAVE;
+        if (t == MOUNT_ATTR_PROPAGATION_SHARED)
+                return MS_SHARED;
+
+        assert_not_reached();
 }
  
-int mount_pivot_root(const char *path) {
-        _cleanup_close_ int fd_oldroot = -EBADF, fd_newroot = -EBADF;
-
-        assert(path);
-
-        /* pivot_root() isn't currently supported in the initramfs. */
-        if (in_initrd())
-                return mount_move_root(path);
+static inline int mount_switch_root_pivot(const char *path, int fd_newroot) {
+        _cleanup_close_ int fd_oldroot = -EBADF;
  
          fd_oldroot = open("/", O_PATH|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
          if (fd_oldroot < 0)
                  return log_debug_errno(errno, "Failed to open old rootfs");
  
-        fd_newroot = open(path, O_PATH|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
-        if (fd_newroot < 0)
-                return log_debug_errno(errno, "Failed to open new rootfs '%s': %m", path);
-
-        /* Change into the new rootfs. */
-        if (fchdir(fd_newroot) < 0)
-                return log_debug_errno(errno, "Failed to change into new rootfs '%s': %m", path);
-
          /* Let the kernel tuck the new root under the old one. */
          if (pivot_root(".", ".") < 0)
                  return log_debug_errno(errno, "Failed to pivot root to new rootfs '%s': %m", path);
  
-
          /* At this point the new root is tucked under the old root. If we want
           * to unmount it we cannot be fchdir()ed into it. So escape back to the
           * old root. */
@@ -535,6 +530,52 @@ int mount_pivot_root(const char *path) {
          return 0;
  }
  
+/* Fallback root switch that does not need pivot_root(): moves the mount at 'path' on top of "/"
+ * with MS_MOVE, then chroot()s into "." and chdir()s to "/".
+ *
+ * The chroot(".") relies on the current working directory; mount_switch_root() fchdir()s into
+ * the new rootfs before calling this. 'path' is otherwise only used for the move and in log
+ * messages. Returns 0 on success, or the result of log_debug_errno() for the failing step. */
+static inline int mount_switch_root_move(const char *path) {
+        if (mount(path, "/", NULL, MS_MOVE, NULL) < 0)
+                return log_debug_errno(errno, "Failed to move new rootfs '%s': %m", path);
+
+        if (chroot(".") < 0)
+                return log_debug_errno(errno, "Failed to chroot to new rootfs '%s': %m", path);
+
+        /* Same "< 0" failure check as the calls above. */
+        if (chdir("/") < 0)
+                return log_debug_errno(errno, "Failed to chdir to new rootfs '%s': %m", path);
+
+        return 0;
+}
+
+/* Switches the root file system to 'path' and then applies the requested propagation 'type'.
+ *
+ * The new rootfs is opened and made the working directory, then mount_switch_root_pivot() is
+ * tried; if it fails, mount_switch_root_move() is used as fallback. Unless 'type' maps to no flag
+ * (MOUNT_ATTR_PROPAGATION_INHERIT), the propagation flag is applied recursively (MS_REC) to ".".
+ *
+ * Returns 0 on success, or the result of log_debug_errno() for the failing step. */
+int mount_switch_root(const char *path, MountAttrPropagationType type) {
+        _cleanup_close_ int fd_newroot = -EBADF;
+        unsigned int flags;
+        int r;
+
+        assert(path);
+
+        fd_newroot = open(path, O_PATH|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
+        if (fd_newroot < 0)
+                return log_debug_errno(errno, "Failed to open new rootfs '%s': %m", path);
+
+        /* Change into the new rootfs. */
+        if (fchdir(fd_newroot) < 0)
+                return log_debug_errno(errno, "Failed to change into new rootfs '%s': %m", path);
+
+        r = mount_switch_root_pivot(path, fd_newroot);
+        if (r < 0) {
+                /* pivot_root() failed, fall back to MS_MOVE. For example, this may happen if the
+                 * rootfs is an initramfs in which case pivot_root() isn't supported. */
+                log_debug_errno(r, "Failed to pivot into new rootfs '%s': %m", path);
+                r = mount_switch_root_move(path);
+        }
+        if (r < 0)
+                return log_debug_errno(r, "Failed to switch to new rootfs '%s': %m", path);
+
+        /* Finally, let's establish the requested propagation type. A flag value of 0 means there is
+         * nothing to change. */
+        flags = mount_attr_propagation_type_to_flag(type);
+        if ((flags != 0) && mount(NULL, ".", NULL, flags|MS_REC, NULL) < 0)
+                /* Argument order must follow the format string: path first, then the type name. */
+                return log_debug_errno(errno, "Failed to turn new rootfs '%s' into %s mount: %m",
+                                       path, mount_attr_propagation_type_to_string(type));
+
+        return 0;
+}
  
  int repeat_unmount(const char *path, int flags) {
          bool done = false;
diff --git a/src/shared/mount-util.h b/src/shared/mount-util.h

index 29b9ed02f7ca5efda2db43b58caa09469805a581..56b1c3669c8964fdc36a2d10bf62610daf6aa3db 100644 (file)
--- a/src/shared/mount-util.h
+++ b/src/shared/mount-util.h
@@ -11,6 +11,20 @@
  #include "errno-util.h"
  #include "macro.h"
  
+typedef enum MountAttrPropagationType {
+        MOUNT_ATTR_PROPAGATION_INHERIT,   /* no special MS_* propagation flags */
+        MOUNT_ATTR_PROPAGATION_PRIVATE,   /* MS_PRIVATE */
+        MOUNT_ATTR_PROPAGATION_DEPENDENT, /* MS_SLAVE */
+        MOUNT_ATTR_PROPAGATION_SHARED,    /* MS_SHARED */
+
+        _MOUNT_ATTR_PROPAGATION_TYPE_MAX,
+        _MOUNT_ATTR_PROPAGATION_TYPE_INVALID = -EINVAL,
+} MountAttrPropagationType;
+
+const char* mount_attr_propagation_type_to_string(MountAttrPropagationType t) _const_;
+MountAttrPropagationType mount_attr_propagation_type_from_string(const char *s) _pure_;
+unsigned int mount_attr_propagation_type_to_flag(MountAttrPropagationType t);
+
  /* The limit used for /dev itself. 4MB should be enough since device nodes and symlinks don't
   * consume any space and udev isn't supposed to create regular file either. There's no limit on the
   * max number of inodes since such limit is hard to guess especially on large storage array
@@ -54,8 +68,7 @@ static inline int bind_remount_recursive(const char *prefix, unsigned long new_f
  
  int bind_remount_one_with_mountinfo(const char *path, unsigned long new_flags, unsigned long flags_mask, FILE *proc_self_mountinfo);
  
-int mount_move_root(const char *path);
-int mount_pivot_root(const char *path);
+int mount_switch_root(const char *path, MountAttrPropagationType type);
  
  DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(FILE*, endmntent, NULL);
  #define _cleanup_endmntent_ _cleanup_(endmntentp)
author	Luca Boccassi <bluca@debian.org>
	Tue, 6 Dec 2022 00:51:51 +0000 (01:51 +0100)
committer	GitHub <noreply@github.com>
	Tue, 6 Dec 2022 00:51:51 +0000 (01:51 +0100)
src/core/namespace.c		patch \| blob \| blame \| history
src/nspawn/nspawn-mount.c		patch \| blob \| blame \| history
src/nspawn/nspawn-mount.h		patch \| blob \| blame \| history
src/nspawn/nspawn.c		patch \| blob \| blame \| history
src/shared/mount-util.c		patch \| blob \| blame \| history
src/shared/mount-util.h		patch \| blob \| blame \| history