1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2010 Lennart Poettering
12 #include <sys/mount.h>
14 #include <sys/statvfs.h>
20 #include "alloc-util.h"
22 #include "extract-word.h"
27 #include "mount-util.h"
28 #include "parse-util.h"
29 #include "path-util.h"
31 #include "stdio-util.h"
32 #include "string-util.h"
35 /* This is the original MAX_HANDLE_SZ definition from the kernel, when the API was introduced. We use that in place of
36 * any more currently defined value to future-proof things: if the size is increased in the API headers, and our code
37 * is recompiled then it would cease working on old kernels, as those refuse any sizes larger than this value with
38 * EINVAL right-away. Hence, let's disconnect ourselves from any such API changes, and stick to the original definition
39 * from when it was introduced. We use it as a start value only anyway (see below), and hence should be able to deal
40 * with large file handles anyway. */
41 #define ORIGINAL_MAX_HANDLE_SZ 128
43 int name_to_handle_at_loop(
46 struct file_handle
**ret_handle
,
50 _cleanup_free_
struct file_handle
*h
= NULL
;
51 size_t n
= ORIGINAL_MAX_HANDLE_SZ
;
53 /* We need to invoke name_to_handle_at() in a loop, given that it might return EOVERFLOW when the specified
54 * buffer is too small. Note that in contrast to what the docs might suggest, MAX_HANDLE_SZ is only good as a
55 * start value, it is not an upper bound on the buffer size required.
57 * This improves on raw name_to_handle_at() also in one other regard: ret_handle and ret_mnt_id can be passed
58 * as NULL if there's no interest in either. */
63 h
= malloc0(offsetof(struct file_handle
, f_handle
) + n
);
69 if (name_to_handle_at(fd
, path
, h
, &mnt_id
, flags
) >= 0) {
72 *ret_handle
= TAKE_PTR(h
);
79 if (errno
!= EOVERFLOW
)
82 if (!ret_handle
&& ret_mnt_id
&& mnt_id
>= 0) {
84 /* As it appears, name_to_handle_at() fills in mnt_id even when it returns EOVERFLOW when the
85 * buffer is too small, but that's undocumented. Hence, let's make use of this if it appears to
86 * be filled in, and the caller was interested in only the mount ID and nothing else. */
92 /* If name_to_handle_at() didn't increase the byte size, then this EOVERFLOW is caused by something
93 * else (apparently EOVERFLOW is returned for untriggered nfs4 mounts sometimes), not by the too small
94 * buffer. In that case propagate EOVERFLOW */
95 if (h
->handle_bytes
<= n
)
98 /* The buffer was too small. Size the new buffer by what name_to_handle_at() returned. */
100 if (offsetof(struct file_handle
, f_handle
) + n
< n
) /* check for addition overflow */
107 static int fd_fdinfo_mnt_id(int fd
, const char *filename
, int flags
, int *mnt_id
) {
108 char path
[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
109 _cleanup_free_
char *fdinfo
= NULL
;
110 _cleanup_close_
int subfd
= -1;
114 if ((flags
& AT_EMPTY_PATH
) && isempty(filename
))
115 xsprintf(path
, "/proc/self/fdinfo/%i", fd
);
117 subfd
= openat(fd
, filename
, O_CLOEXEC
|O_PATH
);
121 xsprintf(path
, "/proc/self/fdinfo/%i", subfd
);
124 r
= read_full_file(path
, &fdinfo
, NULL
);
125 if (r
== -ENOENT
) /* The fdinfo directory is a relatively new addition */
130 p
= startswith(fdinfo
, "mnt_id:");
132 p
= strstr(fdinfo
, "\nmnt_id:");
133 if (!p
) /* The mnt_id field is a relatively new addition */
139 p
+= strspn(p
, WHITESPACE
);
140 p
[strcspn(p
, WHITESPACE
)] = 0;
142 return safe_atoi(p
, mnt_id
);
145 int fd_is_mount_point(int fd
, const char *filename
, int flags
) {
146 _cleanup_free_
struct file_handle
*h
= NULL
, *h_parent
= NULL
;
147 int mount_id
= -1, mount_id_parent
= -1;
148 bool nosupp
= false, check_st_dev
= true;
155 /* First we will try the name_to_handle_at() syscall, which
156 * tells us the mount id and an opaque file "handle". It is
157 * not supported everywhere though (kernel compile-time
158 * option, not all file systems are hooked up). If it works
159 * the mount id is usually good enough to tell us whether
160 * something is a mount point.
162 * If that didn't work we will try to read the mount id from
163 * /proc/self/fdinfo/<fd>. This is almost as good as
164 * name_to_handle_at(), however, does not return the
165 * opaque file handle. The opaque file handle is pretty useful
166 * to detect the root directory, which we should always
167 * consider a mount point. Hence we use this only as
168 * fallback. Exporting the mnt_id in fdinfo is a pretty recent
171 * As last fallback we do traditional fstat() based st_dev
172 * comparisons. This is how things were traditionally done,
173 * but unionfs breaks this since it exposes file
174 * systems with a variety of st_dev reported. Also, btrfs
175 * subvolumes have different st_dev, even though they aren't
176 * real mounts of their own. */
178 r
= name_to_handle_at_loop(fd
, filename
, &h
, &mount_id
, flags
);
179 if (IN_SET(r
, -ENOSYS
, -EACCES
, -EPERM
, -EOVERFLOW
, -EINVAL
))
180 /* This kernel does not support name_to_handle_at() at all (ENOSYS), or the syscall was blocked
181 * (EACCES/EPERM; maybe through seccomp, because we are running inside of a container?), or the mount
182 * point is not triggered yet (EOVERFLOW, think nfs4), or some general name_to_handle_at() flakiness
183 * (EINVAL): fall back to simpler logic. */
184 goto fallback_fdinfo
;
185 else if (r
== -EOPNOTSUPP
)
186 /* This kernel or file system does not support name_to_handle_at(), hence let's see if the upper fs
187 * supports it (in which case it is a mount point), otherwise fallback to the traditional stat()
193 r
= name_to_handle_at_loop(fd
, "", &h_parent
, &mount_id_parent
, AT_EMPTY_PATH
);
194 if (r
== -EOPNOTSUPP
) {
196 /* Neither parent nor child do name_to_handle_at()? We have no choice but to fall back. */
197 goto fallback_fdinfo
;
199 /* The parent can't do name_to_handle_at() but the directory we are interested in can? If so,
200 * it must be a mount point. */
205 /* The parent can do name_to_handle_at() but the
206 * directory we are interested in can't? If so, it
207 * must be a mount point. */
211 /* If the file handle for the directory we are
212 * interested in and its parent are identical, we
213 * assume this is the root directory, which is a mount
216 if (h
->handle_bytes
== h_parent
->handle_bytes
&&
217 h
->handle_type
== h_parent
->handle_type
&&
218 memcmp(h
->f_handle
, h_parent
->f_handle
, h
->handle_bytes
) == 0)
221 return mount_id
!= mount_id_parent
;
224 r
= fd_fdinfo_mnt_id(fd
, filename
, flags
, &mount_id
);
225 if (IN_SET(r
, -EOPNOTSUPP
, -EACCES
, -EPERM
))
230 r
= fd_fdinfo_mnt_id(fd
, "", AT_EMPTY_PATH
, &mount_id_parent
);
234 if (mount_id
!= mount_id_parent
)
237 /* Hmm, so, the mount ids are the same. This leaves one
238 * special case though for the root file system. For that,
239 * let's see if the parent directory has the same inode as we
240 * are interested in. Hence, let's also do fstat() checks now,
241 * too, but avoid the st_dev comparisons, since they aren't
242 * that useful on unionfs mounts. */
243 check_st_dev
= false;
246 /* yay for fstatat() taking a different set of flags than the other
248 if (flags
& AT_SYMLINK_FOLLOW
)
249 flags
&= ~AT_SYMLINK_FOLLOW
;
251 flags
|= AT_SYMLINK_NOFOLLOW
;
252 if (fstatat(fd
, filename
, &a
, flags
) < 0)
255 if (fstatat(fd
, "", &b
, AT_EMPTY_PATH
) < 0)
258 /* A directory with same device and inode as its parent? Must
259 * be the root directory */
260 if (a
.st_dev
== b
.st_dev
&&
261 a
.st_ino
== b
.st_ino
)
264 return check_st_dev
&& (a
.st_dev
!= b
.st_dev
);
267 /* flags can be AT_SYMLINK_FOLLOW or 0 */
268 int path_is_mount_point(const char *t
, const char *root
, int flags
) {
269 _cleanup_free_
char *canonical
= NULL
, *parent
= NULL
;
270 _cleanup_close_
int fd
= -1;
274 assert((flags
& ~AT_SYMLINK_FOLLOW
) == 0);
276 if (path_equal(t
, "/"))
279 /* we need to resolve symlinks manually, we can't just rely on
280 * fd_is_mount_point() to do that for us; if we have a structure like
281 * /bin -> /usr/bin/ and /usr is a mount point, then the parent that we
282 * look at needs to be /usr, not /. */
283 if (flags
& AT_SYMLINK_FOLLOW
) {
284 r
= chase_symlinks(t
, root
, CHASE_TRAIL_SLASH
, &canonical
);
291 parent
= dirname_malloc(t
);
295 fd
= openat(AT_FDCWD
, parent
, O_DIRECTORY
|O_CLOEXEC
|O_PATH
);
299 return fd_is_mount_point(fd
, last_path_component(t
), flags
);
302 int path_get_mnt_id(const char *path
, int *ret
) {
305 r
= name_to_handle_at_loop(AT_FDCWD
, path
, NULL
, ret
, 0);
306 if (IN_SET(r
, -EOPNOTSUPP
, -ENOSYS
, -EACCES
, -EPERM
, -EOVERFLOW
, -EINVAL
)) /* kernel/fs don't support this, or seccomp blocks access, or untriggered mount, or name_to_handle_at() is flaky */
307 return fd_fdinfo_mnt_id(AT_FDCWD
, path
, 0, ret
);
312 int umount_recursive(const char *prefix
, int flags
) {
316 /* Try to umount everything recursively below a
317 * directory. Also, take care of stacked mounts, and keep
318 * unmounting them until they are gone. */
321 _cleanup_fclose_
FILE *proc_self_mountinfo
= NULL
;
326 proc_self_mountinfo
= fopen("/proc/self/mountinfo", "re");
327 if (!proc_self_mountinfo
)
330 (void) __fsetlocking(proc_self_mountinfo
, FSETLOCKING_BYCALLER
);
333 _cleanup_free_
char *path
= NULL
, *p
= NULL
;
336 k
= fscanf(proc_self_mountinfo
,
337 "%*s " /* (1) mount id */
338 "%*s " /* (2) parent id */
339 "%*s " /* (3) major:minor */
340 "%*s " /* (4) root */
341 "%ms " /* (5) mount point */
342 "%*s" /* (6) mount options */
343 "%*[^-]" /* (7) optional fields */
344 "- " /* (8) separator */
345 "%*s " /* (9) file system type */
346 "%*s" /* (10) mount source */
347 "%*s" /* (11) mount options 2 */
348 "%*[^\n]", /* some rubbish at the end */
357 r
= cunescape(path
, UNESCAPE_RELAX
, &p
);
361 if (!path_startswith(p
, prefix
))
364 if (umount2(p
, flags
) < 0) {
365 r
= log_debug_errno(errno
, "Failed to umount %s: %m", p
);
369 log_debug("Successfully unmounted %s", p
);
382 static int get_mount_flags(const char *path
, unsigned long *flags
) {
385 if (statvfs(path
, &buf
) < 0)
391 /* Use this function only if you do not have direct access to /proc/self/mountinfo
392 * and need the caller to open it for you. This is the case when /proc is
393 * masked or not mounted. Otherwise, use bind_remount_recursive. */
394 int bind_remount_recursive_with_mountinfo(const char *prefix
, bool ro
, char **blacklist
, FILE *proc_self_mountinfo
) {
395 _cleanup_set_free_free_ Set
*done
= NULL
;
396 _cleanup_free_
char *cleaned
= NULL
;
399 assert(proc_self_mountinfo
);
401 /* Recursively remount a directory (and all its submounts) read-only or read-write. If the directory is already
402 * mounted, we reuse the mount and simply mark it MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write
403 * operation). If it isn't we first make it one. Afterwards we apply MS_BIND|MS_RDONLY (or remove MS_RDONLY) to
404 * all submounts we can access, too. When mounts are stacked on the same mount point we only care for each
405 * individual "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We
406 * do not have any effect on future submounts that might get propagated, they might be writable. This includes
407 * future submounts that have been triggered via autofs.
409 * If the "blacklist" parameter is specified it may contain a list of subtrees to exclude from the
410 * remount operation. Note that we'll ignore the blacklist for the top-level path. */
412 cleaned
= strdup(prefix
);
416 path_kill_slashes(cleaned
);
418 done
= set_new(&path_hash_ops
);
423 _cleanup_set_free_free_ Set
*todo
= NULL
;
424 bool top_autofs
= false;
426 unsigned long orig_flags
;
428 todo
= set_new(&path_hash_ops
);
432 rewind(proc_self_mountinfo
);
435 _cleanup_free_
char *path
= NULL
, *p
= NULL
, *type
= NULL
;
438 k
= fscanf(proc_self_mountinfo
,
439 "%*s " /* (1) mount id */
440 "%*s " /* (2) parent id */
441 "%*s " /* (3) major:minor */
442 "%*s " /* (4) root */
443 "%ms " /* (5) mount point */
444 "%*s" /* (6) mount options (superblock) */
445 "%*[^-]" /* (7) optional fields */
446 "- " /* (8) separator */
447 "%ms " /* (9) file system type */
448 "%*s" /* (10) mount source */
449 "%*s" /* (11) mount options (bind mount) */
450 "%*[^\n]", /* some rubbish at the end */
460 r
= cunescape(path
, UNESCAPE_RELAX
, &p
);
464 if (!path_startswith(p
, cleaned
))
467 /* Ignore this mount if it is blacklisted, but only if it isn't the top-level mount we shall
469 if (!path_equal(cleaned
, p
)) {
470 bool blacklisted
= false;
473 STRV_FOREACH(i
, blacklist
) {
475 if (path_equal(*i
, cleaned
))
478 if (!path_startswith(*i
, cleaned
))
481 if (path_startswith(p
, *i
)) {
483 log_debug("Not remounting %s, because blacklisted by %s, called for %s", p
, *i
, cleaned
);
491 /* Let's ignore autofs mounts. If they aren't
492 * triggered yet, we want to avoid triggering
493 * them, as we don't make any guarantees for
494 * future submounts anyway. If they are
495 * already triggered, then we will find
496 * another entry for this. */
497 if (streq(type
, "autofs")) {
498 top_autofs
= top_autofs
|| path_equal(cleaned
, p
);
502 if (!set_contains(done
, p
)) {
503 r
= set_consume(todo
, p
);
512 /* If we have no submounts to process anymore and if
513 * the root is either already done, or an autofs, we
515 if (set_isempty(todo
) &&
516 (top_autofs
|| set_contains(done
, cleaned
)))
519 if (!set_contains(done
, cleaned
) &&
520 !set_contains(todo
, cleaned
)) {
521 /* The prefix directory itself is not yet a mount, make it one. */
522 if (mount(cleaned
, cleaned
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
526 (void) get_mount_flags(cleaned
, &orig_flags
);
527 orig_flags
&= ~MS_RDONLY
;
529 if (mount(NULL
, prefix
, NULL
, orig_flags
|MS_BIND
|MS_REMOUNT
|(ro
? MS_RDONLY
: 0), NULL
) < 0)
532 log_debug("Made top-level directory %s a mount point.", prefix
);
538 r
= set_consume(done
, x
);
543 while ((x
= set_steal_first(todo
))) {
545 r
= set_consume(done
, x
);
546 if (IN_SET(r
, 0, -EEXIST
))
551 /* Deal with mount points that are obstructed by a later mount */
552 r
= path_is_mount_point(x
, NULL
, 0);
553 if (IN_SET(r
, 0, -ENOENT
))
558 /* Try to reuse the original flag set */
560 (void) get_mount_flags(x
, &orig_flags
);
561 orig_flags
&= ~MS_RDONLY
;
563 if (mount(NULL
, x
, NULL
, orig_flags
|MS_BIND
|MS_REMOUNT
|(ro
? MS_RDONLY
: 0), NULL
) < 0)
566 log_debug("Remounted %s read-only.", x
);
571 int bind_remount_recursive(const char *prefix
, bool ro
, char **blacklist
) {
572 _cleanup_fclose_
FILE *proc_self_mountinfo
= NULL
;
574 proc_self_mountinfo
= fopen("/proc/self/mountinfo", "re");
575 if (!proc_self_mountinfo
)
578 (void) __fsetlocking(proc_self_mountinfo
, FSETLOCKING_BYCALLER
);
580 return bind_remount_recursive_with_mountinfo(prefix
, ro
, blacklist
, proc_self_mountinfo
);
583 int mount_move_root(const char *path
) {
589 if (mount(path
, "/", NULL
, MS_MOVE
, NULL
) < 0)
601 bool fstype_is_network(const char *fstype
) {
604 x
= startswith(fstype
, "fuse.");
608 return STR_IN_SET(fstype
,
620 "pvfs2", /* OrangeFS */
625 bool fstype_is_api_vfs(const char *fstype
) {
626 return STR_IN_SET(fstype
,
649 bool fstype_is_ro(const char *fstype
) {
650 /* All Linux file systems that are necessarily read-only */
651 return STR_IN_SET(fstype
,
657 bool fstype_can_discard(const char *fstype
) {
658 return STR_IN_SET(fstype
,
665 bool fstype_can_uid_gid(const char *fstype
) {
667 /* All file systems that have a uid=/gid= mount option that fixates the owners of all files and directories,
668 * current and future. */
670 return STR_IN_SET(fstype
,
681 int repeat_unmount(const char *path
, int flags
) {
686 /* If there are multiple mounts on a mount point, this
687 * removes them all */
690 if (umount2(path
, flags
) < 0) {
702 const char* mode_to_inaccessible_node(mode_t mode
) {
703 /* This function maps a node type to a corresponding inaccessible file node. These nodes are created during
704 * early boot by PID 1. In some cases we lacked the privs to create the character and block devices (maybe
705 * because we run in a userns environment, or miss CAP_SYS_MKNOD, or run with a devices policy that excludes
706 * device nodes with major and minor of 0), but that's fine, in that case we use an AF_UNIX file node instead,
707 * which is not the same, but close enough for most uses. And most importantly, the kernel allows bind mounts
708 * from socket nodes to any non-directory file nodes, and that's the most important thing that matters. */
710 switch(mode
& S_IFMT
) {
712 return "/run/systemd/inaccessible/reg";
715 return "/run/systemd/inaccessible/dir";
718 if (access("/run/systemd/inaccessible/chr", F_OK
) == 0)
719 return "/run/systemd/inaccessible/chr";
720 return "/run/systemd/inaccessible/sock";
723 if (access("/run/systemd/inaccessible/blk", F_OK
) == 0)
724 return "/run/systemd/inaccessible/blk";
725 return "/run/systemd/inaccessible/sock";
728 return "/run/systemd/inaccessible/fifo";
731 return "/run/systemd/inaccessible/sock";
736 #define FLAG(name) (flags & name ? STRINGIFY(name) "|" : "")
737 static char* mount_flags_to_string(long unsigned flags
) {
739 _cleanup_free_
char *y
= NULL
;
740 long unsigned overflow
;
742 overflow
= flags
& ~(MS_RDONLY
|
767 if (flags
== 0 || overflow
!= 0)
768 if (asprintf(&y
, "%lx", overflow
) < 0)
771 x
= strjoin(FLAG(MS_RDONLY
),
775 FLAG(MS_SYNCHRONOUS
),
793 FLAG(MS_STRICTATIME
),
799 x
[strlen(x
) - 1] = '\0'; /* truncate the last | */
809 const char *options
) {
811 _cleanup_free_
char *fl
= NULL
, *o
= NULL
;
815 r
= mount_option_mangle(options
, flags
, &f
, &o
);
817 return log_full_errno(error_log_level
, r
,
818 "Failed to mangle mount options %s: %m",
821 fl
= mount_flags_to_string(f
);
823 if ((f
& MS_REMOUNT
) && !what
&& !type
)
824 log_debug("Remounting %s (%s \"%s\")...",
825 where
, strnull(fl
), strempty(o
));
826 else if (!what
&& !type
)
827 log_debug("Mounting %s (%s \"%s\")...",
828 where
, strnull(fl
), strempty(o
));
829 else if ((f
& MS_BIND
) && !type
)
830 log_debug("Bind-mounting %s on %s (%s \"%s\")...",
831 what
, where
, strnull(fl
), strempty(o
));
832 else if (f
& MS_MOVE
)
833 log_debug("Moving mount %s → %s (%s \"%s\")...",
834 what
, where
, strnull(fl
), strempty(o
));
836 log_debug("Mounting %s on %s (%s \"%s\")...",
837 strna(type
), where
, strnull(fl
), strempty(o
));
838 if (mount(what
, where
, type
, f
, o
) < 0)
839 return log_full_errno(error_log_level
, errno
,
840 "Failed to mount %s on %s (%s \"%s\"): %m",
841 strna(type
), where
, strnull(fl
), strempty(o
));
845 int umount_verbose(const char *what
) {
846 log_debug("Umounting %s...", what
);
847 if (umount(what
) < 0)
848 return log_error_errno(errno
, "Failed to unmount %s: %m", what
);
852 const char *mount_propagation_flags_to_string(unsigned long flags
) {
854 switch (flags
& (MS_SHARED
|MS_SLAVE
|MS_PRIVATE
)) {
868 int mount_propagation_flags_from_string(const char *name
, unsigned long *ret
) {
872 else if (streq(name
, "shared"))
874 else if (streq(name
, "slave"))
876 else if (streq(name
, "private"))
883 int mount_option_mangle(
885 unsigned long mount_flags
,
886 unsigned long *ret_mount_flags
,
887 char **ret_remaining_options
) {
889 const struct libmnt_optmap
*map
;
890 _cleanup_free_
char *ret
= NULL
;
894 /* This extracts mount flags from the mount options, and stores
895 * non-mount-flag options to '*ret_remaining_options'.
897 * "rw,nosuid,nodev,relatime,size=1630748k,mode=700,uid=1000,gid=1000"
898 * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
899 * "size=1630748k,mode=700,uid=1000,gid=1000".
900 * See more examples in test-mount-utils.c.
902 * Note that if 'options' does not contain any non-mount-flag options,
903 * then '*ret_remaining_options' is set to NULL instead of an empty string.
904 * Note that this does not check validity of options stored in
905 * '*ret_remaining_options'.
906 * Note that if 'options' is NULL, then this just copies 'mount_flags'
907 * to '*ret_mount_flags'. */
909 assert(ret_mount_flags
);
910 assert(ret_remaining_options
);
912 map
= mnt_get_builtin_optmap(MNT_LINUX_MAP
);
918 _cleanup_free_
char *word
= NULL
;
919 const struct libmnt_optmap
*ent
;
921 r
= extract_first_word(&p
, &word
, ",", EXTRACT_QUOTES
);
927 for (ent
= map
; ent
->name
; ent
++) {
928 /* All entries in MNT_LINUX_MAP do not take any argument.
929 * Thus, ent->name does not contain "=" or "[=]". */
930 if (!streq(word
, ent
->name
))
933 if (!(ent
->mask
& MNT_INVERT
))
934 mount_flags
|= ent
->id
;
935 else if (mount_flags
& ent
->id
)
936 mount_flags
^= ent
->id
;
941 /* If 'word' is not a mount flag, then store it in '*ret_remaining_options'. */
942 if (!ent
->name
&& !strextend_with_separator(&ret
, ",", word
, NULL
))
946 *ret_mount_flags
= mount_flags
;
947 *ret_remaining_options
= TAKE_PTR(ret
);