1 /* SPDX-License-Identifier: LGPL-2.1+ */
9 #include <sys/statvfs.h>
15 #include "alloc-util.h"
17 #include "extract-word.h"
22 #include "mount-util.h"
23 #include "parse-util.h"
24 #include "path-util.h"
26 #include "stdio-util.h"
27 #include "string-util.h"
30 /* This is the original MAX_HANDLE_SZ definition from the kernel, when the API was introduced. We use that in place of
31 * whatever value the current headers define, to future-proof things: if the size is increased in the API headers, and our code
32 * is recompiled then it would cease working on old kernels, as those refuse any sizes larger than this value with
33 * EINVAL right-away. Hence, let's disconnect ourselves from any such API changes, and stick to the original definition
34 * from when it was introduced. We use it as a start value only anyway (see below), and hence should be able to deal
35 * with large file handles anyway. */
36 #define ORIGINAL_MAX_HANDLE_SZ 128
38 int name_to_handle_at_loop(
41 struct file_handle
**ret_handle
,
45 _cleanup_free_
struct file_handle
*h
= NULL
;
46 size_t n
= ORIGINAL_MAX_HANDLE_SZ
;
48 /* We need to invoke name_to_handle_at() in a loop, given that it might return EOVERFLOW when the specified
49 * buffer is too small. Note that in contrast to what the docs might suggest, MAX_HANDLE_SZ is only good as a
50 * start value, it is not an upper bound on the buffer size required.
52 * This improves on raw name_to_handle_at() also in one other regard: ret_handle and ret_mnt_id can be passed
53 * as NULL if there's no interest in either. */
58 h
= malloc0(offsetof(struct file_handle
, f_handle
) + n
);
64 if (name_to_handle_at(fd
, path
, h
, &mnt_id
, flags
) >= 0) {
67 *ret_handle
= TAKE_PTR(h
);
74 if (errno
!= EOVERFLOW
)
77 if (!ret_handle
&& ret_mnt_id
&& mnt_id
>= 0) {
79 /* As it appears, name_to_handle_at() fills in mnt_id even when it returns EOVERFLOW when the
80 * buffer is too small, but that's undocumented. Hence, let's make use of this if it appears to
81 * be filled in, and the caller was interested in only the mount ID and nothing else. */
87 /* If name_to_handle_at() didn't increase the byte size, then this EOVERFLOW is caused by something
88 * else (apparently EOVERFLOW is returned for untriggered nfs4 mounts sometimes), not by the too small
89 * buffer. In that case propagate EOVERFLOW */
90 if (h
->handle_bytes
<= n
)
93 /* The buffer was too small. Size the new buffer by what name_to_handle_at() returned. */
95 if (offsetof(struct file_handle
, f_handle
) + n
< n
) /* check for addition overflow */
102 static int fd_fdinfo_mnt_id(int fd
, const char *filename
, int flags
, int *mnt_id
) {
103 char path
[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
104 _cleanup_free_
char *fdinfo
= NULL
;
105 _cleanup_close_
int subfd
= -1;
109 if ((flags
& AT_EMPTY_PATH
) && isempty(filename
))
110 xsprintf(path
, "/proc/self/fdinfo/%i", fd
);
112 subfd
= openat(fd
, filename
, O_CLOEXEC
|O_PATH
);
116 xsprintf(path
, "/proc/self/fdinfo/%i", subfd
);
119 r
= read_full_file(path
, &fdinfo
, NULL
);
120 if (r
== -ENOENT
) /* The fdinfo directory is a relatively new addition */
125 p
= startswith(fdinfo
, "mnt_id:");
127 p
= strstr(fdinfo
, "\nmnt_id:");
128 if (!p
) /* The mnt_id field is a relatively new addition */
134 p
+= strspn(p
, WHITESPACE
);
135 p
[strcspn(p
, WHITESPACE
)] = 0;
137 return safe_atoi(p
, mnt_id
);
140 int fd_is_mount_point(int fd
, const char *filename
, int flags
) {
141 _cleanup_free_
struct file_handle
*h
= NULL
, *h_parent
= NULL
;
142 int mount_id
= -1, mount_id_parent
= -1;
143 bool nosupp
= false, check_st_dev
= true;
150 /* First we will try the name_to_handle_at() syscall, which
151 * tells us the mount id and an opaque file "handle". It is
152 * not supported everywhere though (kernel compile-time
153 * option, not all file systems are hooked up). If it works
154 * the mount id is usually good enough to tell us whether
155 * something is a mount point.
157 * If that didn't work we will try to read the mount id from
158 * /proc/self/fdinfo/<fd>. This is almost as good as
159 * name_to_handle_at(); however, it does not return the
160 * opaque file handle. The opaque file handle is pretty useful
161 * to detect the root directory, which we should always
162 * consider a mount point. Hence we use this only as
163 * fallback. Exporting the mnt_id in fdinfo is a pretty recent
166 * As last fallback we do traditional fstat() based st_dev
167 * comparisons. This is how things were traditionally done,
168 * but unionfs breaks this since it exposes file
169 * systems with a variety of st_dev reported. Also, btrfs
170 * subvolumes have different st_dev, even though they aren't
171 * real mounts of their own. */
173 r
= name_to_handle_at_loop(fd
, filename
, &h
, &mount_id
, flags
);
174 if (IN_SET(r
, -ENOSYS
, -EACCES
, -EPERM
, -EOVERFLOW
, -EINVAL
))
175 /* This kernel does not support name_to_handle_at() at all (ENOSYS), or the syscall was blocked
176 * (EACCES/EPERM; maybe through seccomp, because we are running inside of a container?), or the mount
177 * point is not triggered yet (EOVERFLOW, think nfs4), or some general name_to_handle_at() flakiness
178 * (EINVAL): fall back to simpler logic. */
179 goto fallback_fdinfo
;
180 else if (r
== -EOPNOTSUPP
)
181 /* This kernel or file system does not support name_to_handle_at(), hence let's see if the upper fs
182 * supports it (in which case it is a mount point), otherwise fallback to the traditional stat()
188 r
= name_to_handle_at_loop(fd
, "", &h_parent
, &mount_id_parent
, AT_EMPTY_PATH
);
189 if (r
== -EOPNOTSUPP
) {
191 /* Neither parent nor child do name_to_handle_at()? We have no choice but to fall back. */
192 goto fallback_fdinfo
;
194 /* The parent can't do name_to_handle_at() but the directory we are interested in can? If so,
195 * it must be a mount point. */
200 /* The parent can do name_to_handle_at() but the
201 * directory we are interested in can't? If so, it
202 * must be a mount point. */
206 /* If the file handle for the directory we are
207 * interested in and its parent are identical, we
208 * assume this is the root directory, which is a mount
211 if (h
->handle_bytes
== h_parent
->handle_bytes
&&
212 h
->handle_type
== h_parent
->handle_type
&&
213 memcmp(h
->f_handle
, h_parent
->f_handle
, h
->handle_bytes
) == 0)
216 return mount_id
!= mount_id_parent
;
219 r
= fd_fdinfo_mnt_id(fd
, filename
, flags
, &mount_id
);
220 if (IN_SET(r
, -EOPNOTSUPP
, -EACCES
, -EPERM
))
225 r
= fd_fdinfo_mnt_id(fd
, "", AT_EMPTY_PATH
, &mount_id_parent
);
229 if (mount_id
!= mount_id_parent
)
232 /* Hmm, so, the mount ids are the same. This leaves one
233 * special case though for the root file system. For that,
234 * let's see if the parent directory has the same inode as we
235 * are interested in. Hence, let's also do fstat() checks now,
236 * too, but avoid the st_dev comparisons, since they aren't
237 * that useful on unionfs mounts. */
238 check_st_dev
= false;
241 /* yay for fstatat() taking a different set of flags than the other
243 if (flags
& AT_SYMLINK_FOLLOW
)
244 flags
&= ~AT_SYMLINK_FOLLOW
;
246 flags
|= AT_SYMLINK_NOFOLLOW
;
247 if (fstatat(fd
, filename
, &a
, flags
) < 0)
250 if (fstatat(fd
, "", &b
, AT_EMPTY_PATH
) < 0)
253 /* A directory with same device and inode as its parent? Must
254 * be the root directory */
255 if (a
.st_dev
== b
.st_dev
&&
256 a
.st_ino
== b
.st_ino
)
259 return check_st_dev
&& (a
.st_dev
!= b
.st_dev
);
262 /* flags can be AT_SYMLINK_FOLLOW or 0 */
263 int path_is_mount_point(const char *t
, const char *root
, int flags
) {
264 _cleanup_free_
char *canonical
= NULL
, *parent
= NULL
;
265 _cleanup_close_
int fd
= -1;
269 assert((flags
& ~AT_SYMLINK_FOLLOW
) == 0);
271 if (path_equal(t
, "/"))
274 /* we need to resolve symlinks manually, we can't just rely on
275 * fd_is_mount_point() to do that for us; if we have a structure like
276 * /bin -> /usr/bin/ and /usr is a mount point, then the parent that we
277 * look at needs to be /usr, not /. */
278 if (flags
& AT_SYMLINK_FOLLOW
) {
279 r
= chase_symlinks(t
, root
, CHASE_TRAIL_SLASH
, &canonical
);
286 parent
= dirname_malloc(t
);
290 fd
= openat(AT_FDCWD
, parent
, O_DIRECTORY
|O_CLOEXEC
|O_PATH
);
294 return fd_is_mount_point(fd
, last_path_component(t
), flags
);
297 int path_get_mnt_id(const char *path
, int *ret
) {
300 r
= name_to_handle_at_loop(AT_FDCWD
, path
, NULL
, ret
, 0);
301 if (IN_SET(r
, -EOPNOTSUPP
, -ENOSYS
, -EACCES
, -EPERM
, -EOVERFLOW
, -EINVAL
)) /* kernel/fs don't support this, or seccomp blocks access, or untriggered mount, or name_to_handle_at() is flaky */
302 return fd_fdinfo_mnt_id(AT_FDCWD
, path
, 0, ret
);
307 int umount_recursive(const char *prefix
, int flags
) {
311 /* Try to umount everything recursively below a
312 * directory. Also, take care of stacked mounts, and keep
313 * unmounting them until they are gone. */
316 _cleanup_fclose_
FILE *proc_self_mountinfo
= NULL
;
321 proc_self_mountinfo
= fopen("/proc/self/mountinfo", "re");
322 if (!proc_self_mountinfo
)
325 (void) __fsetlocking(proc_self_mountinfo
, FSETLOCKING_BYCALLER
);
328 _cleanup_free_
char *path
= NULL
, *p
= NULL
;
331 k
= fscanf(proc_self_mountinfo
,
332 "%*s " /* (1) mount id */
333 "%*s " /* (2) parent id */
334 "%*s " /* (3) major:minor */
335 "%*s " /* (4) root */
336 "%ms " /* (5) mount point */
337 "%*s" /* (6) mount options */
338 "%*[^-]" /* (7) optional fields */
339 "- " /* (8) separator */
340 "%*s " /* (9) file system type */
341 "%*s" /* (10) mount source */
342 "%*s" /* (11) mount options 2 */
343 "%*[^\n]", /* some rubbish at the end */
352 r
= cunescape(path
, UNESCAPE_RELAX
, &p
);
356 if (!path_startswith(p
, prefix
))
359 if (umount2(p
, flags
) < 0) {
360 r
= log_debug_errno(errno
, "Failed to umount %s: %m", p
);
364 log_debug("Successfully unmounted %s", p
);
377 static int get_mount_flags(const char *path
, unsigned long *flags
) {
380 if (statvfs(path
, &buf
) < 0)
386 /* Use this function only if you do not have direct access to /proc/self/mountinfo
387 * and need the caller to open it for you. This is the case when /proc is
388 * masked or not mounted. Otherwise, use bind_remount_recursive. */
389 int bind_remount_recursive_with_mountinfo(const char *prefix
, bool ro
, char **blacklist
, FILE *proc_self_mountinfo
) {
390 _cleanup_set_free_free_ Set
*done
= NULL
;
391 _cleanup_free_
char *cleaned
= NULL
;
394 assert(proc_self_mountinfo
);
396 /* Recursively remount a directory (and all its submounts) read-only or read-write. If the directory is already
397 * mounted, we reuse the mount and simply mark it MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write
398 * operation). If it isn't we first make it one. Afterwards we apply MS_BIND|MS_RDONLY (or remove MS_RDONLY) to
399 * all submounts we can access, too. When mounts are stacked on the same mount point we only care for each
400 * individual "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We
401 * do not have any effect on future submounts that might get propagated, they might be writable. This includes
402 * future submounts that have been triggered via autofs.
404 * If the "blacklist" parameter is specified it may contain a list of subtrees to exclude from the
405 * remount operation. Note that we'll ignore the blacklist for the top-level path. */
407 cleaned
= strdup(prefix
);
411 path_simplify(cleaned
, false);
413 done
= set_new(&path_hash_ops
);
418 _cleanup_set_free_free_ Set
*todo
= NULL
;
419 bool top_autofs
= false;
421 unsigned long orig_flags
;
423 todo
= set_new(&path_hash_ops
);
427 rewind(proc_self_mountinfo
);
430 _cleanup_free_
char *path
= NULL
, *p
= NULL
, *type
= NULL
;
433 k
= fscanf(proc_self_mountinfo
,
434 "%*s " /* (1) mount id */
435 "%*s " /* (2) parent id */
436 "%*s " /* (3) major:minor */
437 "%*s " /* (4) root */
438 "%ms " /* (5) mount point */
439 "%*s" /* (6) mount options (superblock) */
440 "%*[^-]" /* (7) optional fields */
441 "- " /* (8) separator */
442 "%ms " /* (9) file system type */
443 "%*s" /* (10) mount source */
444 "%*s" /* (11) mount options (bind mount) */
445 "%*[^\n]", /* some rubbish at the end */
455 r
= cunescape(path
, UNESCAPE_RELAX
, &p
);
459 if (!path_startswith(p
, cleaned
))
462 /* Ignore this mount if it is blacklisted, but only if it isn't the top-level mount we shall
464 if (!path_equal(cleaned
, p
)) {
465 bool blacklisted
= false;
468 STRV_FOREACH(i
, blacklist
) {
470 if (path_equal(*i
, cleaned
))
473 if (!path_startswith(*i
, cleaned
))
476 if (path_startswith(p
, *i
)) {
478 log_debug("Not remounting %s, because blacklisted by %s, called for %s", p
, *i
, cleaned
);
486 /* Let's ignore autofs mounts. If they aren't
487 * triggered yet, we want to avoid triggering
488 * them, as we don't make any guarantees for
489 * future submounts anyway. If they are
490 * already triggered, then we will find
491 * another entry for this. */
492 if (streq(type
, "autofs")) {
493 top_autofs
= top_autofs
|| path_equal(cleaned
, p
);
497 if (!set_contains(done
, p
)) {
498 r
= set_consume(todo
, p
);
507 /* If we have no submounts to process anymore and if
508 * the root is either already done, or an autofs, we
510 if (set_isempty(todo
) &&
511 (top_autofs
|| set_contains(done
, cleaned
)))
514 if (!set_contains(done
, cleaned
) &&
515 !set_contains(todo
, cleaned
)) {
516 /* The prefix directory itself is not yet a mount, make it one. */
517 if (mount(cleaned
, cleaned
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
521 (void) get_mount_flags(cleaned
, &orig_flags
);
522 orig_flags
&= ~MS_RDONLY
;
524 if (mount(NULL
, prefix
, NULL
, orig_flags
|MS_BIND
|MS_REMOUNT
|(ro
? MS_RDONLY
: 0), NULL
) < 0)
527 log_debug("Made top-level directory %s a mount point.", prefix
);
533 r
= set_consume(done
, x
);
538 while ((x
= set_steal_first(todo
))) {
540 r
= set_consume(done
, x
);
541 if (IN_SET(r
, 0, -EEXIST
))
546 /* Deal with mount points that are obstructed by a later mount */
547 r
= path_is_mount_point(x
, NULL
, 0);
548 if (IN_SET(r
, 0, -ENOENT
))
553 /* Try to reuse the original flag set */
555 (void) get_mount_flags(x
, &orig_flags
);
556 orig_flags
&= ~MS_RDONLY
;
558 if (mount(NULL
, x
, NULL
, orig_flags
|MS_BIND
|MS_REMOUNT
|(ro
? MS_RDONLY
: 0), NULL
) < 0)
561 log_debug("Remounted %s read-only.", x
);
566 int bind_remount_recursive(const char *prefix
, bool ro
, char **blacklist
) {
567 _cleanup_fclose_
FILE *proc_self_mountinfo
= NULL
;
569 proc_self_mountinfo
= fopen("/proc/self/mountinfo", "re");
570 if (!proc_self_mountinfo
)
573 (void) __fsetlocking(proc_self_mountinfo
, FSETLOCKING_BYCALLER
);
575 return bind_remount_recursive_with_mountinfo(prefix
, ro
, blacklist
, proc_self_mountinfo
);
578 int mount_move_root(const char *path
) {
584 if (mount(path
, "/", NULL
, MS_MOVE
, NULL
) < 0)
596 bool fstype_is_network(const char *fstype
) {
599 x
= startswith(fstype
, "fuse.");
603 return STR_IN_SET(fstype
,
615 "pvfs2", /* OrangeFS */
620 bool fstype_is_api_vfs(const char *fstype
) {
621 return STR_IN_SET(fstype
,
644 bool fstype_is_ro(const char *fstype
) {
645 /* All Linux file systems that are necessarily read-only */
646 return STR_IN_SET(fstype
,
652 bool fstype_can_discard(const char *fstype
) {
653 return STR_IN_SET(fstype
,
660 bool fstype_can_uid_gid(const char *fstype
) {
662 /* All file systems that have a uid=/gid= mount option that fixates the owners of all files and directories,
663 * current and future. */
665 return STR_IN_SET(fstype
,
676 int repeat_unmount(const char *path
, int flags
) {
681 /* If there are multiple mounts on a mount point, this
682 * removes them all */
685 if (umount2(path
, flags
) < 0) {
697 const char* mode_to_inaccessible_node(mode_t mode
) {
698 /* This function maps a node type to a corresponding inaccessible file node. These nodes are created during
699 * early boot by PID 1. In some cases we lacked the privs to create the character and block devices (maybe
700 * because we run in a userns environment, or lack CAP_SYS_MKNOD, or run with a devices policy that excludes
701 * device nodes with major and minor of 0), but that's fine, in that case we use an AF_UNIX file node instead,
702 * which is not the same, but close enough for most uses. And most importantly, the kernel allows bind mounts
703 * from socket nodes to any non-directory file nodes, and that's the most important thing that matters. */
705 switch(mode
& S_IFMT
) {
707 return "/run/systemd/inaccessible/reg";
710 return "/run/systemd/inaccessible/dir";
713 if (access("/run/systemd/inaccessible/chr", F_OK
) == 0)
714 return "/run/systemd/inaccessible/chr";
715 return "/run/systemd/inaccessible/sock";
718 if (access("/run/systemd/inaccessible/blk", F_OK
) == 0)
719 return "/run/systemd/inaccessible/blk";
720 return "/run/systemd/inaccessible/sock";
723 return "/run/systemd/inaccessible/fifo";
726 return "/run/systemd/inaccessible/sock";
731 #define FLAG(name) (flags & name ? STRINGIFY(name) "|" : "")
732 static char* mount_flags_to_string(long unsigned flags
) {
734 _cleanup_free_
char *y
= NULL
;
735 long unsigned overflow
;
737 overflow
= flags
& ~(MS_RDONLY
|
762 if (flags
== 0 || overflow
!= 0)
763 if (asprintf(&y
, "%lx", overflow
) < 0)
766 x
= strjoin(FLAG(MS_RDONLY
),
770 FLAG(MS_SYNCHRONOUS
),
788 FLAG(MS_STRICTATIME
),
794 x
[strlen(x
) - 1] = '\0'; /* truncate the last | */
804 const char *options
) {
806 _cleanup_free_
char *fl
= NULL
, *o
= NULL
;
810 r
= mount_option_mangle(options
, flags
, &f
, &o
);
812 return log_full_errno(error_log_level
, r
,
813 "Failed to mangle mount options %s: %m",
816 fl
= mount_flags_to_string(f
);
818 if ((f
& MS_REMOUNT
) && !what
&& !type
)
819 log_debug("Remounting %s (%s \"%s\")...",
820 where
, strnull(fl
), strempty(o
));
821 else if (!what
&& !type
)
822 log_debug("Mounting %s (%s \"%s\")...",
823 where
, strnull(fl
), strempty(o
));
824 else if ((f
& MS_BIND
) && !type
)
825 log_debug("Bind-mounting %s on %s (%s \"%s\")...",
826 what
, where
, strnull(fl
), strempty(o
));
827 else if (f
& MS_MOVE
)
828 log_debug("Moving mount %s → %s (%s \"%s\")...",
829 what
, where
, strnull(fl
), strempty(o
));
831 log_debug("Mounting %s on %s (%s \"%s\")...",
832 strna(type
), where
, strnull(fl
), strempty(o
));
833 if (mount(what
, where
, type
, f
, o
) < 0)
834 return log_full_errno(error_log_level
, errno
,
835 "Failed to mount %s on %s (%s \"%s\"): %m",
836 strna(type
), where
, strnull(fl
), strempty(o
));
840 int umount_verbose(const char *what
) {
841 log_debug("Umounting %s...", what
);
842 if (umount(what
) < 0)
843 return log_error_errno(errno
, "Failed to unmount %s: %m", what
);
847 const char *mount_propagation_flags_to_string(unsigned long flags
) {
849 switch (flags
& (MS_SHARED
|MS_SLAVE
|MS_PRIVATE
)) {
863 int mount_propagation_flags_from_string(const char *name
, unsigned long *ret
) {
867 else if (streq(name
, "shared"))
869 else if (streq(name
, "slave"))
871 else if (streq(name
, "private"))
878 int mount_option_mangle(
880 unsigned long mount_flags
,
881 unsigned long *ret_mount_flags
,
882 char **ret_remaining_options
) {
884 const struct libmnt_optmap
*map
;
885 _cleanup_free_
char *ret
= NULL
;
889 /* This extracts mount flags from the mount options, and stores
890 * non-mount-flag options to '*ret_remaining_options'.
892 * "rw,nosuid,nodev,relatime,size=1630748k,mode=700,uid=1000,gid=1000"
893 * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
894 * "size=1630748k,mode=700,uid=1000,gid=1000".
895 * See more examples in test-mount-utils.c.
897 * Note that if 'options' does not contain any non-mount-flag options,
898 * then '*ret_remaining_options' is set to NULL instead of an empty string.
899 * Note that this does not check validity of options stored in
900 * '*ret_remaining_options'.
901 * Note that if 'options' is NULL, then this just copies 'mount_flags'
902 * to '*ret_mount_flags'. */
904 assert(ret_mount_flags
);
905 assert(ret_remaining_options
);
907 map
= mnt_get_builtin_optmap(MNT_LINUX_MAP
);
913 _cleanup_free_
char *word
= NULL
;
914 const struct libmnt_optmap
*ent
;
916 r
= extract_first_word(&p
, &word
, ",", EXTRACT_QUOTES
);
922 for (ent
= map
; ent
->name
; ent
++) {
923 /* None of the entries in MNT_LINUX_MAP takes an argument.
924 * Thus, ent->name does not contain "=" or "[=]". */
925 if (!streq(word
, ent
->name
))
928 if (!(ent
->mask
& MNT_INVERT
))
929 mount_flags
|= ent
->id
;
930 else if (mount_flags
& ent
->id
)
931 mount_flags
^= ent
->id
;
936 /* If 'word' is not a mount flag, then store it in '*ret_remaining_options'. */
937 if (!ent
->name
&& !strextend_with_separator(&ret
, ",", word
, NULL
))
941 *ret_mount_flags
= mount_flags
;
942 *ret_remaining_options
= TAKE_PTR(ret
);