goto finish;
/* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
- r = mount_pivot_root(root);
+ r = mount_switch_root(root, MOUNT_ATTR_PROPAGATION_INHERIT);
if (r == -EINVAL && root_directory) {
/* If we are using root_directory and we don't have privileges (ie: user manager in a user
* namespace) and the root_directory is already a mount point in the parent namespace,
r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
if (r < 0)
goto finish;
- r = mount_pivot_root(root);
+ r = mount_switch_root(root, MOUNT_ATTR_PROPAGATION_INHERIT);
}
if (r < 0) {
log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");
#include "mkdir-label.h"
#include "mount-util.h"
#include "mountpoint-util.h"
+#include "namespace-util.h"
#include "nspawn-mount.h"
#include "parse-util.h"
#include "path-util.h"
MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
}
+#define PROC_DEFAULT_MOUNT_FLAGS (MS_NOSUID|MS_NOEXEC|MS_NODEV)
+#define SYS_DEFAULT_MOUNT_FLAGS (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV)
+
int mount_all(const char *dest,
MountSettingsMask mount_settings,
uid_t uid_shift,
static const MountPoint mount_table[] = {
/* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
- { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ { "proc", "/proc", "proc", NULL, PROC_DEFAULT_MOUNT_FLAGS,
MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_MKDIR|MOUNT_FOLLOW_SYMLINKS }, /* we follow symlinks here since not following them requires /proc/ already being mounted, which we don't have here. */
{ "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND,
MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR },
{ "tmpfs", "/sys", "tmpfs", "mode=555" TMPFS_LIMITS_SYS, MS_NOSUID|MS_NOEXEC|MS_NODEV,
MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR },
- { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ { "sysfs", "/sys", "sysfs", NULL, SYS_DEFAULT_MOUNT_FLAGS,
MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR }, /* skipped if above was mounted */
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
MOUNT_FATAL|MOUNT_MKDIR }, /* skipped if above was mounted */
return r;
}
+
+#define NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS "/run/host/proc"
+#define NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS "/run/host/sys"
+
+/* Mounts a fresh procfs on NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS and a fresh sysfs on
+ * NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, using the default mount flags for each. The mount helpers are
+ * invoked with LOG_ERR as their log level. Returns 0 if both mounts succeed, otherwise the negative
+ * value returned by the first mount that failed (the sysfs mount is not attempted if procfs failed). */
+int pin_fully_visible_fs(void) {
+        int r;
+
+        /* Creating the mount points is best-effort; the results are deliberately ignored. */
+        (void) mkdir_p(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, 0755);
+        (void) mkdir_p(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, 0755);
+
+        r = mount_follow_verbose(LOG_ERR, "proc", NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, "proc", PROC_DEFAULT_MOUNT_FLAGS, NULL);
+        if (r >= 0)
+                r = mount_follow_verbose(LOG_ERR, "sysfs", NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, "sysfs", SYS_DEFAULT_MOUNT_FLAGS, NULL);
+
+        return r < 0 ? r : 0;
+}
+
+/* Lazily detaches (MNT_DETACH) the temporary procfs and sysfs mounts and removes their mount point
+ * directories, procfs first. Processing stops at the first step that fails: the failure is logged
+ * with the errno of that call and the result of log_error_errno() is returned. Returns 0 when all
+ * four steps succeed. */
+static int do_wipe_fully_visible_fs(void) {
+        int k;
+
+        k = umount2(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, MNT_DETACH);
+        if (k < 0)
+                return log_error_errno(errno, "Failed to unmount temporary proc: %m");
+
+        k = rmdir(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS);
+        if (k < 0)
+                return log_error_errno(errno, "Failed to remove temporary proc mountpoint: %m");
+
+        k = umount2(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, MNT_DETACH);
+        if (k < 0)
+                return log_error_errno(errno, "Failed to unmount temporary sys: %m");
+
+        k = rmdir(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS);
+        if (k < 0)
+                return log_error_errno(errno, "Failed to remove temporary sys mountpoint: %m");
+
+        return 0;
+}
+
+/* Removes the temporary procfs/sysfs instances from the mount namespace referred to by mntns_fd.
+ *
+ * The calling process first pins its current mount namespace, then enters mntns_fd, runs
+ * do_wipe_fully_visible_fs() there, and finally switches back to the namespace it started in.
+ *
+ * Returns a negative value if pinning or either namespace switch fails; otherwise returns whatever
+ * do_wipe_fully_visible_fs() returned. mntns_fd is not closed here. */
+int wipe_fully_visible_fs(int mntns_fd) {
+        _cleanup_close_ int origin_mntns_fd = -EBADF;
+        int wipe_result, r;
+
+        /* Keep a handle on where we came from so that we can return afterwards. */
+        r = namespace_open(0, NULL, &origin_mntns_fd, NULL, NULL, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to pin originating mount namespace: %m");
+
+        r = namespace_enter(-EBADF, mntns_fd, -EBADF, -EBADF, -EBADF);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enter mount namespace: %m");
+
+        /* Remember the outcome, but always try to switch back before reporting it. */
+        wipe_result = do_wipe_fully_visible_fs();
+
+        r = namespace_enter(-EBADF, origin_mntns_fd, -EBADF, -EBADF, -EBADF);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enter original mount namespace: %m");
+
+        return wipe_result;
+}
int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old);
int tmpfs_patch_options(const char *options,uid_t uid_shift, const char *selinux_apifs_context, char **ret);
+int pin_fully_visible_fs(void);
+int wipe_fully_visible_fs(int mntns_fd);
/* The notify socket inside the container it can use to talk to nspawn using the sd_notify(3) protocol */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
+#define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"
#define EXIT_FORCE_RESTART 133
return 0;
}
-static int setup_propagate(const char *root) {
+static int mount_tunnel_dig(const char *root) {
const char *p, *q;
int r;
if (r < 0)
return log_error_errno(r, "Failed to create /run/host: %m");
- r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0);
+ r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
if (r < 0)
- return log_error_errno(r, "Failed to create /run/host/incoming: %m");
+ return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m");
- q = prefix_roota(root, "/run/host/incoming");
+ q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL);
r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
if (r < 0)
return r;
if (r < 0)
return r;
- /* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. */
- return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
+ return 0;
+}
+
+/* Changes the propagation type of the mount at NSPAWN_MOUNT_TUNNEL to MS_SLAVE. Returns 0 on
+ * success, or the negative value returned by mount_follow_verbose() (called with LOG_ERR as its log
+ * level) on failure. */
+static int mount_tunnel_open(void) {
+        int r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
+
+        return r < 0 ? r : 0;
}
static int setup_machine_id(const char *directory) {
_cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
_cleanup_strv_free_ char **os_release_pairs = NULL;
- _cleanup_close_ int fd = -1;
+ _cleanup_close_ int fd = -1, mntns_fd = -EBADF;
bool idmap = false;
const char *p;
pid_t pid;
return r;
if (arg_userns_mode != USER_NAMESPACE_NO) {
+ r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to pin outer mount namespace: %m");
+
+ l = send_one_fd(notify_socket, mntns_fd, 0);
+ if (l < 0)
+ return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
+ mntns_fd = safe_close(mntns_fd);
+
/* Let the parent know which UID shift we read from the image */
l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
if (l < 0)
unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
}
- /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
- * mounts available in systemd services inside the container that create a new mount namespace. See
- * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
- * will inherit the shared propagation mode.
- *
- * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
- * directory mount to root later on.
- * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
- */
- r = mount_nofollow_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
- if (r < 0)
- return r;
-
r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
if (r < 0)
return r;
if (r < 0)
return r;
- r = setup_propagate(directory);
+ r = mount_tunnel_dig(directory);
if (r < 0)
return r;
return r;
}
- r = mount_move_root(directory);
+ /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
+ * mounts available in systemd services inside the container that create a new mount namespace. See
+ * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
+ * will inherit the shared propagation mode.
+ *
+ * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
+ * directory mount to root later on.
+ * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
+ */
+ r = mount_switch_root(directory, MOUNT_ATTR_PROPAGATION_SHARED);
if (r < 0)
return log_error_errno(r, "Failed to move root directory: %m");
+ /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
+ * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
+ * the container. */
+ r = mount_tunnel_open();
+ if (r < 0)
+ return r;
+
+ if (arg_userns_mode != USER_NAMESPACE_NO) {
+ /* In order to mount procfs and sysfs in an unprivileged container the kernel
+ * requires that a fully visible instance is already present in the target mount
+ * namespace. Mount one here so the inner child can mount its own instances. Later
+ * we umount the temporary instances created here before we actually exec the
+ * payload. Since the rootfs is shared the umount will propagate into the container.
+ * Note, the inner child wouldn't be able to unmount the instances on its own since
+ * it doesn't own the originating mount namespace. IOW, the outer child needs to do
+ * this. */
+ r = pin_fully_visible_fs();
+ if (r < 0)
+ return r;
+ }
+
fd = setup_notify_child();
if (fd < 0)
return fd;
rtnl_socket_pair[2] = { -1, -1 },
pid_socket_pair[2] = { -1, -1 },
uuid_socket_pair[2] = { -1, -1 },
- notify_socket_pair[2] = { -1, -1 },
+ fd_socket_pair[2] = { -EBADF, -EBADF },
uid_shift_socket_pair[2] = { -1, -1 },
master_pty_socket_pair[2] = { -1, -1 },
unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
- _cleanup_close_ int notify_socket = -1;
+ _cleanup_close_ int notify_socket = -1, mntns_fd = -EBADF;
_cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
_cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
_cleanup_(sd_event_unrefp) sd_event *event = NULL;
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
return log_error_errno(errno, "Failed to create id socket pair: %m");
- if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_socket_pair) < 0)
return log_error_errno(errno, "Failed to create notify socket pair: %m");
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
- notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
+ fd_socket_pair[0] = safe_close(fd_socket_pair[0]);
master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
secondary,
pid_socket_pair[1],
uuid_socket_pair[1],
- notify_socket_pair[1],
+ fd_socket_pair[1],
kmsg_socket_pair[1],
rtnl_socket_pair[1],
uid_shift_socket_pair[1],
rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
- notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
+ fd_socket_pair[1] = safe_close(fd_socket_pair[1]);
master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
if (arg_userns_mode != USER_NAMESPACE_NO) {
+ mntns_fd = receive_one_fd(fd_socket_pair[0], 0);
+ if (mntns_fd < 0)
+ return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
+
/* The child just let us know the UID shift it might have read from the image. */
l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
if (l < 0)
return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
/* We also retrieve the socket used for notifications generated by outer child */
- notify_socket = receive_one_fd(notify_socket_pair[0], 0);
+ notify_socket = receive_one_fd(fd_socket_pair[0], 0);
if (notify_socket < 0)
return log_error_errno(notify_socket,
"Failed to receive notification socket from the outer child: %m");
if (r < 0)
return r;
+ if (arg_userns_mode != USER_NAMESPACE_NO) {
+ r = wipe_fully_visible_fs(mntns_fd);
+ if (r < 0)
+ return r;
+ mntns_fd = safe_close(mntns_fd);
+ }
+
/* Let the child know that we are ready and wait that the child is completely ready now. */
if (!barrier_place_and_sync(&barrier)) /* #5 */
return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
#include "set.h"
#include "stat-util.h"
#include "stdio-util.h"
+#include "string-table.h"
#include "string-util.h"
#include "strv.h"
#include "tmpfile-util.h"
return 0;
}
-int mount_move_root(const char *path) {
- assert(path);
+static const char *const mount_attr_propagation_type_table[_MOUNT_ATTR_PROPAGATION_TYPE_MAX] = { /* string name for each MountAttrPropagationType value */
+ [MOUNT_ATTR_PROPAGATION_INHERIT] = "inherited",
+ [MOUNT_ATTR_PROPAGATION_PRIVATE] = "private",
+ [MOUNT_ATTR_PROPAGATION_DEPENDENT] = "dependent",
+ [MOUNT_ATTR_PROPAGATION_SHARED] = "shared",
+};
- if (chdir(path) < 0)
- return -errno;
+DEFINE_STRING_TABLE_LOOKUP(mount_attr_propagation_type, MountAttrPropagationType);
- if (mount(path, "/", NULL, MS_MOVE, NULL) < 0)
- return -errno;
-
- if (chroot(".") < 0)
- return -errno;
-
- return RET_NERRNO(chdir("/"));
+/* Translates a MountAttrPropagationType into the matching MS_* propagation flag for mount(2).
+ * MOUNT_ATTR_PROPAGATION_INHERIT maps to 0, i.e. no propagation flag at all. Any value outside the
+ * four known types trips assert_not_reached(). */
+unsigned int mount_attr_propagation_type_to_flag(MountAttrPropagationType t) {
+        if (t == MOUNT_ATTR_PROPAGATION_INHERIT)
+                return 0;
+
+        if (t == MOUNT_ATTR_PROPAGATION_PRIVATE)
+                return MS_PRIVATE;
+
+        if (t == MOUNT_ATTR_PROPAGATION_DEPENDENT)
+                return MS_SLAVE;
+
+        if (t == MOUNT_ATTR_PROPAGATION_SHARED)
+                return MS_SHARED;
+
+        assert_not_reached();
}
-int mount_pivot_root(const char *path) {
- _cleanup_close_ int fd_oldroot = -EBADF, fd_newroot = -EBADF;
-
- assert(path);
-
- /* pivot_root() isn't currently supported in the initramfs. */
- if (in_initrd())
- return mount_move_root(path);
+static inline int mount_switch_root_pivot(const char *path, int fd_newroot) {
+ _cleanup_close_ int fd_oldroot = -EBADF;
fd_oldroot = open("/", O_PATH|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
if (fd_oldroot < 0)
return log_debug_errno(errno, "Failed to open old rootfs");
- fd_newroot = open(path, O_PATH|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
- if (fd_newroot < 0)
- return log_debug_errno(errno, "Failed to open new rootfs '%s': %m", path);
-
- /* Change into the new rootfs. */
- if (fchdir(fd_newroot) < 0)
- return log_debug_errno(errno, "Failed to change into new rootfs '%s': %m", path);
-
/* Let the kernel tuck the new root under the old one. */
if (pivot_root(".", ".") < 0)
return log_debug_errno(errno, "Failed to pivot root to new rootfs '%s': %m", path);
-
/* At this point the new root is tucked under the old root. If we want
* to unmount it we cannot be fchdir()ed into it. So escape back to the
* old root. */
return 0;
}
+/* MS_MOVE based way of switching the root: moves the mount at 'path' on top of "/", chroot()s into
+ * the current working directory and then changes directory to the new "/". chroot(".") acts on the
+ * current working directory; mount_switch_root() fchdir()s into the new rootfs before calling this.
+ * Each failure is logged at debug level with the errno of the failing call, and the result of
+ * log_debug_errno() is returned. Returns 0 on success. */
+static inline int mount_switch_root_move(const char *path) {
+        int k;
+
+        k = mount(path, "/", NULL, MS_MOVE, NULL);
+        if (k < 0)
+                return log_debug_errno(errno, "Failed to move new rootfs '%s': %m", path);
+
+        k = chroot(".");
+        if (k < 0)
+                return log_debug_errno(errno, "Failed to chroot to new rootfs '%s': %m", path);
+
+        k = chdir("/");
+        if (k < 0)
+                return log_debug_errno(errno, "Failed to chdir to new rootfs '%s': %m", path);
+
+        return 0;
+}
+
+/* Makes the directory at 'path' the new root of the calling process and optionally sets the
+ * propagation type of the resulting root mount.
+ *
+ * path: directory to switch into; must not be NULL. It is opened with O_NOFOLLOW|O_DIRECTORY.
+ * type: propagation type to apply recursively (MS_REC) to the new root afterwards. With
+ *       MOUNT_ATTR_PROPAGATION_INHERIT no propagation change is made.
+ *
+ * pivot_root() is tried first via mount_switch_root_pivot(); if that fails, the MS_MOVE based
+ * mount_switch_root_move() is used as fallback.
+ *
+ * Returns 0 on success and a negative value on failure. All failures are logged at debug level
+ * only. */
+int mount_switch_root(const char *path, MountAttrPropagationType type) {
+        _cleanup_close_ int fd_newroot = -EBADF;
+        unsigned int flags;
+        int r;
+
+        assert(path);
+
+        fd_newroot = open(path, O_PATH|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
+        if (fd_newroot < 0)
+                return log_debug_errno(errno, "Failed to open new rootfs '%s': %m", path);
+
+        /* Change into the new rootfs. Both switching strategies below operate relative to the
+         * current working directory. */
+        if (fchdir(fd_newroot) < 0)
+                return log_debug_errno(errno, "Failed to change into new rootfs '%s': %m", path);
+
+        r = mount_switch_root_pivot(path, fd_newroot);
+        if (r < 0) {
+                /* pivot_root() failed, fall back to MS_MOVE. For example, this may happen if the
+                 * rootfs is an initramfs, in which case pivot_root() isn't supported. */
+                log_debug_errno(r, "Failed to pivot into new rootfs '%s': %m", path);
+                r = mount_switch_root_move(path);
+        }
+        if (r < 0)
+                return log_debug_errno(r, "Failed to switch to new rootfs '%s': %m", path);
+
+        /* Finally, let's establish the requested propagation type. A flag value of 0 means
+         * "inherit", in which case the mount is left untouched. */
+        flags = mount_attr_propagation_type_to_flag(type);
+        if (flags != 0 && mount(NULL, ".", NULL, flags|MS_REC, NULL) < 0)
+                return log_debug_errno(errno, "Failed to turn new rootfs '%s' into %s mount: %m",
+                                       path, mount_attr_propagation_type_to_string(type));
+
+        return 0;
+}
int repeat_unmount(const char *path, int flags) {
bool done = false;
#include "errno-util.h"
#include "macro.h"
+typedef enum MountAttrPropagationType {
+ MOUNT_ATTR_PROPAGATION_INHERIT, /* no special MS_* propagation flags */
+ MOUNT_ATTR_PROPAGATION_PRIVATE, /* MS_PRIVATE */
+ MOUNT_ATTR_PROPAGATION_DEPENDENT, /* MS_SLAVE */
+ MOUNT_ATTR_PROPAGATION_SHARED, /* MS_SHARED */
+
+ _MOUNT_ATTR_PROPAGATION_TYPE_MAX, /* number of valid types above */
+ _MOUNT_ATTR_PROPAGATION_TYPE_INVALID = -EINVAL,
+} MountAttrPropagationType;
+
+const char* mount_attr_propagation_type_to_string(MountAttrPropagationType t) _const_;
+MountAttrPropagationType mount_attr_propagation_type_from_string(const char *s) _pure_;
+unsigned int mount_attr_propagation_type_to_flag(MountAttrPropagationType t);
+
/* The limit used for /dev itself. 4MB should be enough since device nodes and symlinks don't
* consume any space and udev isn't supposed to create regular file either. There's no limit on the
* max number of inodes since such limit is hard to guess especially on large storage array
int bind_remount_one_with_mountinfo(const char *path, unsigned long new_flags, unsigned long flags_mask, FILE *proc_self_mountinfo);
-int mount_move_root(const char *path);
-int mount_pivot_root(const char *path);
+int mount_switch_root(const char *path, MountAttrPropagationType type);
DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(FILE*, endmntent, NULL);
#define _cleanup_endmntent_ _cleanup_(endmntentp)