1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
7 #include <sys/statvfs.h>
10 #include "alloc-util.h"
11 #include "extract-word.h"
16 #include "libmount-util.h"
17 #include "mount-util.h"
18 #include "mountpoint-util.h"
19 #include "parse-util.h"
20 #include "path-util.h"
22 #include "stat-util.h"
23 #include "stdio-util.h"
24 #include "string-util.h"
27 int mount_fd(const char *source
,
29 const char *filesystemtype
,
30 unsigned long mountflags
,
33 char path
[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
35 xsprintf(path
, "/proc/self/fd/%i", target_fd
);
36 if (mount(source
, path
, filesystemtype
, mountflags
, data
) < 0) {
40 /* ENOENT can mean two things: either that the source is missing, or that /proc/ isn't
41 * mounted. Check for the latter to generate better error messages. */
42 if (proc_mounted() == 0)
54 const char *filesystemtype
,
55 unsigned long mountflags
,
58 _cleanup_close_
int fd
= -1;
60 /* In almost all cases we want to manipulate the mount table without following symlinks, hence
61 * mount_nofollow() is usually the way to go. The only exceptions are environments where /proc/ is
62 * not available yet, since we need /proc/self/fd/ for this logic to work. i.e. during the early
63 * initialization of namespacing/container stuff where /proc is not yet mounted (and maybe even the
64 * fs to mount) we can only use traditional mount() directly.
66 * Note that this disables following only for the final component of the target, i.e. symlinks within
67 * the path of the target are honoured, as are symlinks in the source path everywhere. */
69 fd
= open(target
, O_PATH
|O_CLOEXEC
|O_NOFOLLOW
);
73 return mount_fd(source
, fd
, filesystemtype
, mountflags
, data
);
76 int umount_recursive(const char *prefix
, int flags
) {
80 /* Try to umount everything recursively below a
81 * directory. Also, take care of stacked mounts, and keep
82 * unmounting them until they are gone. */
85 _cleanup_(mnt_free_tablep
) struct libmnt_table
*table
= NULL
;
86 _cleanup_(mnt_free_iterp
) struct libmnt_iter
*iter
= NULL
;
90 r
= libmount_parse("/proc/self/mountinfo", NULL
, &table
, &iter
);
92 return log_debug_errno(r
, "Failed to parse /proc/self/mountinfo: %m");
98 r
= mnt_table_next_fs(table
, iter
, &fs
);
102 return log_debug_errno(r
, "Failed to get next entry from /proc/self/mountinfo: %m");
104 path
= mnt_fs_get_target(fs
);
108 if (!path_startswith(path
, prefix
))
111 if (umount2(path
, flags
| UMOUNT_NOFOLLOW
) < 0) {
112 log_debug_errno(errno
, "Failed to umount %s, ignoring: %m", path
);
116 log_debug("Successfully unmounted %s", path
);
128 static int get_mount_flags(
129 struct libmnt_table
*table
,
131 unsigned long *ret
) {
133 _cleanup_close_
int fd
= -1;
134 struct libmnt_fs
*fs
;
139 /* Get the mount flags for the mountpoint at "path" from "table". We have a fallback using statvfs()
140 * in place (which provides us with mostly the same info), but it's just a fallback, since using it
141 * means triggering autofs or NFS mounts, which we'd rather avoid needlessly.
143 * This generally doesn't follow symlinks. */
145 fs
= mnt_table_find_target(table
, path
, MNT_ITER_FORWARD
);
147 log_debug("Could not find '%s' in mount table, ignoring.", path
);
151 opts
= mnt_fs_get_vfs_options(fs
);
157 r
= mnt_optstr_get_flags(opts
, ret
, mnt_get_builtin_optmap(MNT_LINUX_MAP
));
159 log_debug_errno(r
, "Could not get flags for '%s', ignoring: %m", path
);
163 /* MS_RELATIME is default and trying to set it in an unprivileged container causes EPERM */
164 *ret
&= ~MS_RELATIME
;
168 fd
= open(path
, O_PATH
|O_CLOEXEC
|O_NOFOLLOW
);
172 if (fstatvfs(fd
, &buf
) < 0)
175 /* The statvfs() flags and the mount flags mostly have the same values, but for some cases do
176 * not. Hence map the flags manually. (Strictly speaking, ST_RELATIME/MS_RELATIME is the most
177 * prominent one that doesn't match, but that's the one we mask away anyway, see above.) */
180 FLAGS_SET(buf
.f_flag
, ST_RDONLY
) * MS_RDONLY
|
181 FLAGS_SET(buf
.f_flag
, ST_NODEV
) * MS_NODEV
|
182 FLAGS_SET(buf
.f_flag
, ST_NOEXEC
) * MS_NOEXEC
|
183 FLAGS_SET(buf
.f_flag
, ST_NOSUID
) * MS_NOSUID
|
184 FLAGS_SET(buf
.f_flag
, ST_NOATIME
) * MS_NOATIME
|
185 FLAGS_SET(buf
.f_flag
, ST_NODIRATIME
) * MS_NODIRATIME
;
190 /* Use this function only if you do not have direct access to /proc/self/mountinfo but the caller can open it
191 * for you. This is the case when /proc is masked or not mounted. Otherwise, use bind_remount_recursive. */
192 int bind_remount_recursive_with_mountinfo(
194 unsigned long new_flags
,
195 unsigned long flags_mask
,
197 FILE *proc_self_mountinfo
) {
199 _cleanup_set_free_free_ Set
*done
= NULL
;
200 _cleanup_free_
char *simplified
= NULL
;
204 assert(proc_self_mountinfo
);
206 /* Recursively remount a directory (and all its submounts) read-only or read-write. If the directory is already
207 * mounted, we reuse the mount and simply mark it MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write
208 * operation). If it isn't we first make it one. Afterwards we apply MS_BIND|MS_RDONLY (or remove MS_RDONLY) to
209 * all submounts we can access, too. When mounts are stacked on the same mount point we only care for each
210 * individual "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We
211 * do not have any effect on future submounts that might get propagated, they might be writable. This includes
212 * future submounts that have been triggered via autofs.
214 * If the "deny_list" parameter is specified it may contain a list of subtrees to exclude from the
215 * remount operation. Note that we'll ignore the deny list for the top-level path. */
217 simplified
= strdup(prefix
);
221 path_simplify(simplified
, false);
223 done
= set_new(&path_hash_ops
);
228 _cleanup_set_free_free_ Set
*todo
= NULL
;
229 _cleanup_(mnt_free_tablep
) struct libmnt_table
*table
= NULL
;
230 _cleanup_(mnt_free_iterp
) struct libmnt_iter
*iter
= NULL
;
231 bool top_autofs
= false;
233 unsigned long orig_flags
;
235 todo
= set_new(&path_hash_ops
);
239 rewind(proc_self_mountinfo
);
241 r
= libmount_parse("/proc/self/mountinfo", proc_self_mountinfo
, &table
, &iter
);
243 return log_debug_errno(r
, "Failed to parse /proc/self/mountinfo: %m");
246 struct libmnt_fs
*fs
;
247 const char *path
, *type
;
249 r
= mnt_table_next_fs(table
, iter
, &fs
);
253 return log_debug_errno(r
, "Failed to get next entry from /proc/self/mountinfo: %m");
255 path
= mnt_fs_get_target(fs
);
256 type
= mnt_fs_get_fstype(fs
);
260 if (!path_startswith(path
, simplified
))
263 /* Ignore this mount if it is deny-listed, but only if it isn't the top-level mount
264 * we shall operate on. */
265 if (!path_equal(path
, simplified
)) {
266 bool deny_listed
= false;
269 STRV_FOREACH(i
, deny_list
) {
270 if (path_equal(*i
, simplified
))
273 if (!path_startswith(*i
, simplified
))
276 if (path_startswith(path
, *i
)) {
278 log_debug("Not remounting %s deny-listed by %s, called for %s",
279 path
, *i
, simplified
);
287 /* Let's ignore autofs mounts. If they aren't
288 * triggered yet, we want to avoid triggering
289 * them, as we don't make any guarantees for
290 * future submounts anyway. If they are
291 * already triggered, then we will find
292 * another entry for this. */
293 if (streq(type
, "autofs")) {
294 top_autofs
= top_autofs
|| path_equal(path
, simplified
);
298 if (!set_contains(done
, path
)) {
299 r
= set_put_strdup(&todo
, path
);
305 /* If we have no submounts to process anymore and if
306 * the root is either already done, or an autofs, we
308 if (set_isempty(todo
) &&
309 (top_autofs
|| set_contains(done
, simplified
)))
312 if (!set_contains(done
, simplified
) &&
313 !set_contains(todo
, simplified
)) {
314 /* The prefix directory itself is not yet a mount, make it one. */
315 r
= mount_nofollow(simplified
, simplified
, NULL
, MS_BIND
|MS_REC
, NULL
);
320 (void) get_mount_flags(table
, simplified
, &orig_flags
);
322 r
= mount_nofollow(NULL
, simplified
, NULL
, (orig_flags
& ~flags_mask
)|MS_BIND
|MS_REMOUNT
|new_flags
, NULL
);
326 log_debug("Made top-level directory %s a mount point.", prefix
);
328 r
= set_put_strdup(&done
, simplified
);
333 while ((x
= set_steal_first(todo
))) {
335 r
= set_consume(done
, x
);
336 if (IN_SET(r
, 0, -EEXIST
))
341 /* Deal with mount points that are obstructed by a later mount */
342 r
= path_is_mount_point(x
, NULL
, 0);
343 if (IN_SET(r
, 0, -ENOENT
))
346 if (!ERRNO_IS_PRIVILEGE(r
))
349 /* Even if the root user invokes this, submounts under private FUSE or NFS mount points
350 * may not be accessible. E.g.,
352 * $ bindfs --no-allow-other ~/mnt/mnt ~/mnt/mnt
353 * $ bindfs --no-allow-other ~/mnt ~/mnt
355 * Then, the root user cannot access the mount point ~/mnt/mnt.
356 * In such cases, the submounts are ignored, as we have no way to manage them. */
357 log_debug_errno(r
, "Failed to determine '%s' is mount point or not, ignoring: %m", x
);
361 /* Try to reuse the original flag set */
363 (void) get_mount_flags(table
, x
, &orig_flags
);
365 r
= mount_nofollow(NULL
, x
, NULL
, (orig_flags
& ~flags_mask
)|MS_BIND
|MS_REMOUNT
|new_flags
, NULL
);
369 log_debug("Remounted %s read-only.", x
);
374 int bind_remount_recursive(
376 unsigned long new_flags
,
377 unsigned long flags_mask
,
380 _cleanup_fclose_
FILE *proc_self_mountinfo
= NULL
;
383 r
= fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo
);
387 return bind_remount_recursive_with_mountinfo(prefix
, new_flags
, flags_mask
, deny_list
, proc_self_mountinfo
);
390 int bind_remount_one_with_mountinfo(
392 unsigned long new_flags
,
393 unsigned long flags_mask
,
394 FILE *proc_self_mountinfo
) {
396 _cleanup_(mnt_free_tablep
) struct libmnt_table
*table
= NULL
;
397 unsigned long orig_flags
= 0;
401 assert(proc_self_mountinfo
);
403 rewind(proc_self_mountinfo
);
405 table
= mnt_new_table();
409 r
= mnt_table_parse_stream(table
, proc_self_mountinfo
, "/proc/self/mountinfo");
413 /* Try to reuse the original flag set */
414 (void) get_mount_flags(table
, path
, &orig_flags
);
416 r
= mount_nofollow(NULL
, path
, NULL
, (orig_flags
& ~flags_mask
)|MS_BIND
|MS_REMOUNT
|new_flags
, NULL
);
423 int mount_move_root(const char *path
) {
429 if (mount(path
, "/", NULL
, MS_MOVE
, NULL
) < 0)
441 int repeat_unmount(const char *path
, int flags
) {
446 /* If there are multiple mounts on a mount point, this
447 * removes them all */
450 if (umount2(path
, flags
) < 0) {
462 int mode_to_inaccessible_node(
463 const char *runtime_dir
,
467 /* This function maps a node type to a corresponding inaccessible file node. These nodes are created
468 * during early boot by PID 1. In some cases we lacked the privs to create the character and block
469 * devices (maybe because we run in a userns environment, or lack CAP_MKNOD, or run with a
470 * devices policy that excludes device nodes with major and minor of 0), but that's fine, in that
471 * case we use an AF_UNIX file node instead, which is not the same, but close enough for most
472 * uses. And most importantly, the kernel allows bind mounts from socket nodes to any non-directory
473 * file nodes, and that's the most important thing that matters.
475 * Note that the runtime directory argument shall be the top-level runtime directory, i.e. /run/ if
476 * we operate in system context and $XDG_RUNTIME_DIR if we operate in user context. */
478 _cleanup_free_
char *d
= NULL
;
479 const char *node
= NULL
;
484 runtime_dir
= "/run";
486 switch(mode
& S_IFMT
) {
488 node
= "/systemd/inaccessible/reg";
492 node
= "/systemd/inaccessible/dir";
496 node
= "/systemd/inaccessible/chr";
500 node
= "/systemd/inaccessible/blk";
504 node
= "/systemd/inaccessible/fifo";
508 node
= "/systemd/inaccessible/sock";
514 d
= path_join(runtime_dir
, node
);
518 /* On new kernels unprivileged users are permitted to create 0:0 char device nodes (because they also
519 * act as whiteout inode for overlayfs), but no other char or block device nodes. On old kernels no
520 * device node whatsoever may be created by unprivileged processes. Hence, if the caller asks for the
521 * inaccessible block device node let's see if the block device node actually exists, and if not,
522 * fall back to the character device node. From there fall back to the socket device node. This means
523 * in the best case we'll get the right device node type — but if not we'll hopefully at least get a
524 * device node at all. */
527 access(d
, F_OK
) < 0 && errno
== ENOENT
) {
529 d
= path_join(runtime_dir
, "/systemd/inaccessible/chr");
534 if (IN_SET(mode
& S_IFMT
, S_IFBLK
, S_IFCHR
) &&
535 access(d
, F_OK
) < 0 && errno
== ENOENT
) {
537 d
= path_join(runtime_dir
, "/systemd/inaccessible/sock");
546 #define FLAG(name) (flags & name ? STRINGIFY(name) "|" : "")
547 static char* mount_flags_to_string(long unsigned flags
) {
549 _cleanup_free_
char *y
= NULL
;
550 long unsigned overflow
;
552 overflow
= flags
& ~(MS_RDONLY
|
577 if (flags
== 0 || overflow
!= 0)
578 if (asprintf(&y
, "%lx", overflow
) < 0)
581 x
= strjoin(FLAG(MS_RDONLY
),
585 FLAG(MS_SYNCHRONOUS
),
603 FLAG(MS_STRICTATIME
),
609 x
[strlen(x
) - 1] = '\0'; /* truncate the last | */
613 int mount_verbose_full(
620 bool follow_symlink
) {
622 _cleanup_free_
char *fl
= NULL
, *o
= NULL
;
626 r
= mount_option_mangle(options
, flags
, &f
, &o
);
628 return log_full_errno(error_log_level
, r
,
629 "Failed to mangle mount options %s: %m",
632 fl
= mount_flags_to_string(f
);
634 if ((f
& MS_REMOUNT
) && !what
&& !type
)
635 log_debug("Remounting %s (%s \"%s\")...",
636 where
, strnull(fl
), strempty(o
));
637 else if (!what
&& !type
)
638 log_debug("Mounting %s (%s \"%s\")...",
639 where
, strnull(fl
), strempty(o
));
640 else if ((f
& MS_BIND
) && !type
)
641 log_debug("Bind-mounting %s on %s (%s \"%s\")...",
642 what
, where
, strnull(fl
), strempty(o
));
643 else if (f
& MS_MOVE
)
644 log_debug("Moving mount %s → %s (%s \"%s\")...",
645 what
, where
, strnull(fl
), strempty(o
));
647 log_debug("Mounting %s (%s) on %s (%s \"%s\")...",
648 strna(what
), strna(type
), where
, strnull(fl
), strempty(o
));
651 r
= mount(what
, where
, type
, f
, o
) < 0 ? -errno
: 0;
653 r
= mount_nofollow(what
, where
, type
, f
, o
);
655 return log_full_errno(error_log_level
, r
,
656 "Failed to mount %s (type %s) on %s (%s \"%s\"): %m",
657 strna(what
), strna(type
), where
, strnull(fl
), strempty(o
));
668 log_debug("Umounting %s...", what
);
670 if (umount2(what
, flags
) < 0)
671 return log_full_errno(error_log_level
, errno
,
672 "Failed to unmount %s: %m", what
);
677 int mount_option_mangle(
679 unsigned long mount_flags
,
680 unsigned long *ret_mount_flags
,
681 char **ret_remaining_options
) {
683 const struct libmnt_optmap
*map
;
684 _cleanup_free_
char *ret
= NULL
;
688 /* This extracts mount flags from the mount options, and stores
689 * non-mount-flag options in '*ret_remaining_options'.
691 * "rw,nosuid,nodev,relatime,size=1630748k,mode=700,uid=1000,gid=1000"
692 * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
693 * "size=1630748k,mode=700,uid=1000,gid=1000".
694 * See more examples in test-mount-utils.c.
696 * Note that if 'options' does not contain any non-mount-flag options,
697 * then '*ret_remaining_options' is set to NULL instead of empty string.
698 * Note that this does not check validity of options stored in
699 * '*ret_remaining_options'.
700 * Note that if 'options' is NULL, then this just copies 'mount_flags'
701 * to '*ret_mount_flags'. */
703 assert(ret_mount_flags
);
704 assert(ret_remaining_options
);
706 map
= mnt_get_builtin_optmap(MNT_LINUX_MAP
);
712 _cleanup_free_
char *word
= NULL
;
713 const struct libmnt_optmap
*ent
;
715 r
= extract_first_word(&p
, &word
, ",", EXTRACT_UNQUOTE
);
721 for (ent
= map
; ent
->name
; ent
++) {
722 /* None of the entries in MNT_LINUX_MAP take an argument.
723 * Thus, ent->name does not contain "=" or "[=]". */
724 if (!streq(word
, ent
->name
))
727 if (!(ent
->mask
& MNT_INVERT
))
728 mount_flags
|= ent
->id
;
729 else if (mount_flags
& ent
->id
)
730 mount_flags
^= ent
->id
;
735 /* If 'word' is not a mount flag, then store it in '*ret_remaining_options'. */
736 if (!ent
->name
&& !strextend_with_separator(&ret
, ",", word
, NULL
))
740 *ret_mount_flags
= mount_flags
;
741 *ret_remaining_options
= TAKE_PTR(ret
);