1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
11 #include "base-filesystem.h"
13 #include "creds-util.h"
15 #include "initrd-util.h"
17 #include "missing_syscall.h"
18 #include "mkdir-label.h"
19 #include "mount-util.h"
20 #include "mountpoint-util.h"
21 #include "path-util.h"
23 #include "stdio-util.h"
24 #include "string-util.h"
26 #include "switch-root.h"
27 #include "user-util.h"
29 int switch_root(const char *new_root
,
30 const char *old_root_after
, /* path below the new root, where to place the old root after the transition; may be NULL to unmount it */
31 SwitchRootFlags flags
) {
33 /* Stuff mounted below /run/ we don't save on soft reboot, as it might have lost its relevance, i.e.
34 * credentials, removable media and such, we rather want that the new boot mounts this fresh. But on
35 * the switch from initrd we do use MS_REC, as it is expected that mounts set up in /run/ are
39 unsigned long mount_flags
; /* Flags to apply if SWITCH_ROOT_RECURSIVE_RUN is unset */
40 unsigned long mount_flags_recursive_run
; /* Flags to apply if SWITCH_ROOT_RECURSIVE_RUN is set (0 if shall be skipped) */
41 } transfer_table
[] = {
42 { "/dev", MS_BIND
|MS_REC
, MS_BIND
|MS_REC
}, /* Recursive, because we want to save the original /dev/shm/ + /dev/pts/ and similar */
43 { "/sys", MS_BIND
|MS_REC
, MS_BIND
|MS_REC
}, /* Similar, we want to retain various API VFS, or the cgroupv1 /sys/fs/cgroup/ tree */
44 { "/proc", MS_BIND
|MS_REC
, MS_BIND
|MS_REC
}, /* Similar */
45 { "/run", MS_BIND
, MS_BIND
|MS_REC
}, /* Recursive except on soft reboot, see above */
46 { SYSTEM_CREDENTIALS_DIRECTORY
, MS_BIND
, 0 /* skip! */ }, /* Credentials passed into the system should survive */
47 { ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY
, MS_BIND
, 0 /* skip! */ }, /* Similar */
48 { "/run/host", MS_BIND
|MS_REC
, 0 /* skip! */ }, /* Host supplied hierarchy should also survive */
51 _cleanup_close_
int old_root_fd
= -EBADF
, new_root_fd
= -EBADF
;
52 _cleanup_free_
char *resolved_old_root_after
= NULL
;
57 /* Check if we shall remove the contents of the old root */
58 old_root_fd
= open("/", O_DIRECTORY
|O_CLOEXEC
);
60 return log_error_errno(errno
, "Failed to open root directory: %m");
62 new_root_fd
= open(new_root
, O_DIRECTORY
|O_CLOEXEC
);
64 return log_error_errno(errno
, "Failed to open target directory '%s': %m", new_root
);
66 r
= fds_are_same_mount(old_root_fd
, new_root_fd
);
68 return log_error_errno(r
, "Failed to check if old and new root directory/mount are the same: %m");
70 log_debug("Skipping switch root, as old and new root directories/mounts are the same.");
74 /* Make the new root directory a mount point if it isn't */
75 r
= fd_make_mount_point(new_root_fd
);
77 return log_error_errno(r
, "Failed to make new root directory a mount point: %m");
81 /* When the path was not a mount point, then we need to reopen the path, otherwise, it still
82 * points to the underlying directory. */
84 fd
= open(new_root
, O_DIRECTORY
|O_CLOEXEC
);
86 return log_error_errno(errno
, "Failed to reopen target directory '%s': %m", new_root
);
88 close_and_replace(new_root_fd
, fd
);
91 if (FLAGS_SET(flags
, SWITCH_ROOT_DESTROY_OLD_ROOT
)) {
92 istmp
= fd_is_temporary_fs(old_root_fd
);
94 return log_error_errno(istmp
, "Failed to stat root directory: %m");
96 log_debug("Root directory is on tmpfs, will do cleanup later.");
98 istmp
= -1; /* don't know */
100 if (old_root_after
) {
101 /* Determine where we shall place the old root after the transition */
102 r
= chase(old_root_after
, new_root
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &resolved_old_root_after
, NULL
);
104 return log_error_errno(r
, "Failed to resolve %s/%s: %m", new_root
, old_root_after
);
105 if (r
== 0) /* Doesn't exist yet. Let's create it */
106 (void) mkdir_p_label(resolved_old_root_after
, 0755);
109 /* We are about to unmount various file systems with MNT_DETACH (either explicitly via umount() or
110 * indirectly via pivot_root()), and thus do not synchronously wait for them to be fully sync'ed —
111 * all while making them invisible/inaccessible in the file system tree for later code. That makes
112 * sync'ing them then difficult. Let's hence issue a manual sync() here, so that we at least can
113 * guarantee all file systems are in a good state before entering this state. */
114 if (!FLAGS_SET(flags
, SWITCH_ROOT_DONT_SYNC
))
117 /* Work-around for kernel design: the kernel refuses MS_MOVE if any file systems are mounted
118 * MS_SHARED. Hence remount them MS_PRIVATE here as a work-around.
120 * https://bugzilla.redhat.com/show_bug.cgi?id=847418 */
121 if (mount(NULL
, "/", NULL
, MS_REC
|MS_PRIVATE
, NULL
) < 0)
122 return log_error_errno(errno
, "Failed to set \"/\" mount propagation to private: %m");
124 /* Do not fail if base_filesystem_create() fails. Not all switch roots look the way base_filesystem_create()
125 * expects them to. They might even boot if they are RO and don't have the FS layout. Just ignore the error
126 * and switch_root() nevertheless. */
127 (void) base_filesystem_create_fd(new_root_fd
, new_root
, UID_INVALID
, GID_INVALID
);
129 FOREACH_ARRAY(transfer
, transfer_table
, ELEMENTSOF(transfer_table
)) {
130 _cleanup_free_
char *chased
= NULL
;
131 unsigned long mount_flags
;
133 mount_flags
= FLAGS_SET(flags
, SWITCH_ROOT_RECURSIVE_RUN
) ? transfer
->mount_flags_recursive_run
: transfer
->mount_flags
;
134 if (mount_flags
== 0) /* skip if zero */
137 if (access(transfer
->path
, F_OK
) < 0) {
138 log_debug_errno(errno
, "Path '%s' to move to target root directory, not found, ignoring: %m", transfer
->path
);
142 r
= chase(transfer
->path
, new_root
, CHASE_PREFIX_ROOT
, &chased
, NULL
);
144 return log_error_errno(r
, "Failed to resolve %s/%s: %m", new_root
, transfer
->path
);
146 /* Let's see if it is a mount point already. */
147 r
= path_is_mount_point(chased
);
149 return log_error_errno(r
, "Failed to determine whether %s is a mount point: %m", chased
);
150 if (r
> 0) /* If it is already mounted, then do nothing */
153 r
= mount_nofollow_verbose(LOG_ERR
, transfer
->path
, chased
, NULL
, mount_flags
, NULL
);
158 if (fchdir(new_root_fd
) < 0)
159 return log_error_errno(errno
, "Failed to change directory to %s: %m", new_root
);
161 /* We first try a pivot_root() so that we can umount the old root dir. In many cases (i.e. where rootfs is /),
162 * that's not possible however, and hence we simply overmount root */
163 if (resolved_old_root_after
)
164 r
= RET_NERRNO(pivot_root(".", resolved_old_root_after
));
166 r
= RET_NERRNO(pivot_root(".", "."));
168 /* Now unmount the upper of the two stacked file systems */
169 if (umount2(".", MNT_DETACH
) < 0)
170 return log_error_errno(errno
, "Failed to unmount the old root: %m");
174 log_debug_errno(r
, "Pivoting root file system failed, moving mounts instead: %m");
176 if (resolved_old_root_after
) {
177 r
= mount_nofollow_verbose(LOG_ERR
, "/", resolved_old_root_after
, NULL
, MS_BIND
|MS_REC
, NULL
);
182 /* If we have to use MS_MOVE let's first try to get rid of *all* mounts we can, with the
183 * exception of the path we want to switch to, plus everything leading to it and within
184 * it. This is necessary because unlike pivot_root() just moving the mount to the root via
185 * MS_MOVE won't magically unmount anything below it. Once the chroot() succeeds the mounts
186 * below would still be around but invisible to us, because not accessible via
187 * /proc/self/mountinfo. Hence, let's clean everything up first, as long as we still can. */
188 (void) umount_recursive_full(NULL
, MNT_DETACH
, STRV_MAKE(new_root
));
190 if (mount(".", "/", NULL
, MS_MOVE
, NULL
) < 0)
191 return log_error_errno(errno
, "Failed to move %s to /: %m", new_root
);
194 return log_error_errno(errno
, "Failed to change root: %m");
197 return log_error_errno(errno
, "Failed to change directory: %m");
203 if (fstat(old_root_fd
, &rb
) < 0)
204 return log_error_errno(errno
, "Failed to stat old root directory: %m");
206 /* Note: the below won't operate on non-memory file systems (i.e. only on tmpfs, ramfs), and
207 * it will stop at mount boundaries */
208 (void) rm_rf_children(TAKE_FD(old_root_fd
), 0, &rb
); /* takes possession of the dir fd, even on failure */