1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
7 #include <sys/statvfs.h>
9 #include <linux/loop.h>
12 #include "alloc-util.h"
13 #include "chase-symlinks.h"
14 #include "dissect-image.h"
15 #include "exec-util.h"
16 #include "extract-word.h"
22 #include "libmount-util.h"
23 #include "missing_mount.h"
24 #include "missing_syscall.h"
25 #include "mkdir-label.h"
26 #include "mount-util.h"
27 #include "mountpoint-util.h"
28 #include "namespace-util.h"
29 #include "parse-util.h"
30 #include "path-util.h"
31 #include "process-util.h"
33 #include "stat-util.h"
34 #include "stdio-util.h"
35 #include "string-util.h"
37 #include "tmpfile-util.h"
38 #include "user-util.h"
/* Mounts 'source' on the inode referenced by 'target_fd'. The target is handed to mount() as the
 * FORMAT_PROC_FD_PATH() path of the fd, so the call depends on /proc/ being mounted; on failure the
 * visible code consults proc_mounted() to tell a missing source apart from a missing /proc/.
 * NOTE(review): several lines of this function (remaining parameters, return statements) are not
 * present in this excerpt. */
40 int mount_fd(const char *source
,
42 const char *filesystemtype
,
43 unsigned long mountflags
,
46 if (mount(source
, FORMAT_PROC_FD_PATH(target_fd
), filesystemtype
, mountflags
, data
) < 0) {
50 /* ENOENT can mean two things: either that the source is missing, or that /proc/ isn't
51 * mounted. Check for the latter to generate better error messages. */
52 if (proc_mounted() == 0)
/* Opens 'target' with O_PATH|O_CLOEXEC|O_NOFOLLOW and passes the resulting fd to mount_fd(); the fd
 * is released automatically through _cleanup_close_.
 * NOTE(review): the function header is not visible in this excerpt; judging by the comment below
 * this is presumably mount_nofollow() — confirm. */
64 const char *filesystemtype
,
65 unsigned long mountflags
,
68 _cleanup_close_
int fd
= -1;
70 /* In almost all cases we want to manipulate the mount table without following symlinks, hence
71 * mount_nofollow() is usually the way to go. The only exceptions are environments where /proc/ is
72 * not available yet, since we need /proc/self/fd/ for this logic to work. i.e. during the early
73 * initialization of namespacing/container stuff where /proc is not yet mounted (and maybe even the
74 * fs to mount) we can only use traditional mount() directly.
76 * Note that this disables following only for the final component of the target, i.e. symlinks within
77 * the path of the target are honoured, as are symlinks in the source path everywhere. */
79 fd
= open(target
, O_PATH
|O_CLOEXEC
|O_NOFOLLOW
);
83 return mount_fd(source
, fd
, filesystemtype
, mountflags
, data
);
/* Parses /proc/self/mountinfo with libmount and calls umount2(path, flags | UMOUNT_NOFOLLOW) for
 * mount table entries whose target passes path_startswith(path, prefix). A failing umount2() is
 * logged at debug level and ignored; parse/iteration errors are returned via log_debug_errno().
 * NOTE(review): the loop structure and final return value are not visible in this excerpt. */
86 int umount_recursive(const char *prefix
, int flags
) {
90 /* Try to umount everything recursively below a directory. Also, take care of stacked mounts, and
91 * keep unmounting them until they are gone. */
94 _cleanup_(mnt_free_tablep
) struct libmnt_table
*table
= NULL
;
95 _cleanup_(mnt_free_iterp
) struct libmnt_iter
*iter
= NULL
;
99 r
= libmount_parse("/proc/self/mountinfo", NULL
, &table
, &iter
);
101 return log_debug_errno(r
, "Failed to parse /proc/self/mountinfo: %m");
104 struct libmnt_fs
*fs
;
107 r
= mnt_table_next_fs(table
, iter
, &fs
);
111 return log_debug_errno(r
, "Failed to get next entry from /proc/self/mountinfo: %m");
113 path
= mnt_fs_get_target(fs
);
117 if (!path_startswith(path
, prefix
))
120 if (umount2(path
, flags
| UMOUNT_NOFOLLOW
) < 0) {
121 log_debug_errno(errno
, "Failed to umount %s, ignoring: %m", path
);
125 log_debug("Successfully unmounted %s", path
);
/* The MS_* flags that ms_flags_to_mount_attr() can translate into MOUNT_ATTR_* bits. The remount
 * helpers only take the mount_setattr() shortcut when their flags_mask has no bits outside this set. */
137 #define MS_CONVERTIBLE_FLAGS (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_NOSYMFOLLOW)
139 static uint64_t ms_flags_to_mount_attr(unsigned long a
) {
142 if (FLAGS_SET(a
, MS_RDONLY
))
143 f
|= MOUNT_ATTR_RDONLY
;
145 if (FLAGS_SET(a
, MS_NOSUID
))
146 f
|= MOUNT_ATTR_NOSUID
;
148 if (FLAGS_SET(a
, MS_NODEV
))
149 f
|= MOUNT_ATTR_NODEV
;
151 if (FLAGS_SET(a
, MS_NOEXEC
))
152 f
|= MOUNT_ATTR_NOEXEC
;
154 if (FLAGS_SET(a
, MS_NOSYMFOLLOW
))
155 f
|= MOUNT_ATTR_NOSYMFOLLOW
;
/* Set to true once mount_setattr() failed with an errno matching ERRNO_IS_NOT_SUPPORTED(); from then
 * on the remount helpers in this file skip the mount_setattr() shortcut and use classic remounting. */
160 static bool skip_mount_set_attr
= false;
162 /* Use this function only if you do not have direct access to /proc/self/mountinfo but the caller can open it
163 * for you. This is the case when /proc is masked or not mounted. Otherwise, use bind_remount_recursive. */
/* Applies 'new_flags' (restricted to the bits in 'flags_mask') to the mount at 'prefix' and its
 * submounts. If every masked flag is convertible, no deny list is given and mount_setattr() has not
 * been found unsupported, a single recursive mount_setattr() call is tried first. Otherwise (or if
 * that call fails) the mount table is scanned repeatedly: matching entries are queued in 'todo',
 * remounted with MS_BIND|MS_REMOUNT, and remembered in 'done'. 'proc_self_mountinfo' may be NULL,
 * in which case /proc/self/mountinfo is opened here.
 * NOTE(review): a number of lines (some parameters, loop headers, returns) are not present in this
 * excerpt. */
164 int bind_remount_recursive_with_mountinfo(
166 unsigned long new_flags
,
167 unsigned long flags_mask
,
169 FILE *proc_self_mountinfo
) {
171 _cleanup_fclose_
FILE *proc_self_mountinfo_opened
= NULL
;
172 _cleanup_set_free_ Set
*done
= NULL
;
173 unsigned n_tries
= 0;
178 if ((flags_mask
& ~MS_CONVERTIBLE_FLAGS
) == 0 && strv_isempty(deny_list
) && !skip_mount_set_attr
) {
179 /* Let's take a shortcut for all the flags we know how to convert into mount_setattr() flags */
181 if (mount_setattr(AT_FDCWD
, prefix
, AT_SYMLINK_NOFOLLOW
|AT_RECURSIVE
,
182 &(struct mount_attr
) {
183 .attr_set
= ms_flags_to_mount_attr(new_flags
& flags_mask
),
184 .attr_clr
= ms_flags_to_mount_attr(~new_flags
& flags_mask
),
185 }, MOUNT_ATTR_SIZE_VER0
) < 0) {
187 log_debug_errno(errno
, "mount_setattr() failed, falling back to classic remounting: %m");
189 /* We fall through to classic behaviour if not supported (i.e. kernel < 5.12). We
190 * also do this for all other kinds of errors since they are so many different, and
191 * mount_setattr() has no graceful mode where it continues despite seeing errors on
192 * some mounts, but we want that. Moreover mount_setattr() only works on the mount
193 * point inode itself, not a non-mount point inode, and we want to support arbitrary
196 if (ERRNO_IS_NOT_SUPPORTED(errno
)) /* if not supported, then don't bother at all anymore */
197 skip_mount_set_attr
= true;
199 return 0; /* Nice, this worked! */
202 if (!proc_self_mountinfo
) {
203 r
= fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo_opened
);
207 proc_self_mountinfo
= proc_self_mountinfo_opened
;
210 /* Recursively remount a directory (and all its submounts) with desired flags (MS_RDONLY,
211 * MS_NOSUID, MS_NOEXEC). If the directory is already mounted, we reuse the mount and simply mark it
212 * MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write operation), ditto for other flags. If it
213 * isn't we first make it one. Afterwards we apply (or remove) the flags to all submounts we can
214 * access, too. When mounts are stacked on the same mount point we only care for each individual
215 * "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We do
216 * not have any effect on future submounts that might get propagated, they might be writable
217 * etc. This includes future submounts that have been triggered via autofs. Also note that we can't
218 * operate atomically here. Mounts established while we process the tree might or might not get
219 * noticed and thus might or might not be covered.
221 * If the "deny_list" parameter is specified it may contain a list of subtrees to exclude from the
222 * remount operation. Note that we'll ignore the deny list for the top-level path. */
225 _cleanup_(mnt_free_tablep
) struct libmnt_table
*table
= NULL
;
226 _cleanup_(mnt_free_iterp
) struct libmnt_iter
*iter
= NULL
;
227 _cleanup_hashmap_free_ Hashmap
*todo
= NULL
;
228 bool top_autofs
= false;
230 if (n_tries
++ >= 32) /* Let's not retry this loop forever */
233 rewind(proc_self_mountinfo
);
235 r
= libmount_parse("/proc/self/mountinfo", proc_self_mountinfo
, &table
, &iter
);
237 return log_debug_errno(r
, "Failed to parse /proc/self/mountinfo: %m");
240 _cleanup_free_
char *d
= NULL
;
241 const char *path
, *type
, *opts
;
242 unsigned long flags
= 0;
243 struct libmnt_fs
*fs
;
245 r
= mnt_table_next_fs(table
, iter
, &fs
);
246 if (r
== 1) /* EOF */
249 return log_debug_errno(r
, "Failed to get next entry from /proc/self/mountinfo: %m");
251 path
= mnt_fs_get_target(fs
);
255 if (!path_startswith(path
, prefix
))
258 type
= mnt_fs_get_fstype(fs
);
262 /* Let's ignore autofs mounts. If they aren't triggered yet, we want to avoid
263 * triggering them, as we don't make any guarantees for future submounts anyway. If
264 * they are already triggered, then we will find another entry for this. */
265 if (streq(type
, "autofs")) {
266 top_autofs
= top_autofs
|| path_equal(path
, prefix
);
270 if (set_contains(done
, path
))
273 /* Ignore this mount if it is deny-listed, but only if it isn't the top-level mount
274 * we shall operate on. */
275 if (!path_equal(path
, prefix
)) {
276 bool deny_listed
= false;
278 STRV_FOREACH(i
, deny_list
) {
279 if (path_equal(*i
, prefix
))
282 if (!path_startswith(*i
, prefix
))
285 if (path_startswith(path
, *i
)) {
287 log_debug("Not remounting %s deny-listed by %s, called for %s", path
, *i
, prefix
);
296 opts
= mnt_fs_get_vfs_options(fs
);
298 r
= mnt_optstr_get_flags(opts
, &flags
, mnt_get_builtin_optmap(MNT_LINUX_MAP
));
300 log_debug_errno(r
, "Could not get flags for '%s', ignoring: %m", path
);
307 r
= hashmap_ensure_put(&todo
, &path_hash_ops_free
, d
, ULONG_TO_PTR(flags
));
309 /* If the same path was recorded, but with different mount flags, update it:
310 * it means a mount point is overmounted, and libmount returns the "bottom" (or
311 * older one) first, but we want to reapply the flags from the "top" (or newer
312 * one). See: https://github.com/systemd/systemd/issues/20032
313 * Note that this shouldn't really fail, as we were just told that the key
314 * exists, and it's an update so we want 'd' to be freed immediately. */
315 r
= hashmap_update(todo
, d
, ULONG_TO_PTR(flags
));
322 /* Check if the top-level directory was among what we have seen so far. For that check both
323 * 'done' and 'todo'. Also check 'top_autofs' because if the top-level dir is an autofs we'll
324 * not include it in either set but will set this bool. */
325 if (!set_contains(done
, prefix
) &&
326 !(top_autofs
|| hashmap_contains(todo
, prefix
))) {
328 /* The prefix directory itself is not yet a mount, make it one. */
329 r
= mount_nofollow(prefix
, prefix
, NULL
, MS_BIND
|MS_REC
, NULL
);
333 /* Immediately rescan, so that we pick up the new mount's flags */
337 /* If we have no submounts to process anymore, we are done */
338 if (hashmap_isempty(todo
))
345 /* Take the first mount from our list of mounts to still process */
346 flags
= PTR_TO_ULONG(hashmap_steal_first_key_and_value(todo
, (void**) &x
));
350 r
= set_ensure_consume(&done
, &path_hash_ops_free
, x
);
351 if (IN_SET(r
, 0, -EEXIST
))
352 continue; /* Already done */
356 /* Now, remount this with the new flags set, but exclude MS_RELATIME from it. (It's
357 * the default anyway, thus redundant, and in userns we'll get an error if we try to
358 * explicitly enable it) */
359 r
= mount_nofollow(NULL
, x
, NULL
, ((flags
& ~flags_mask
)|MS_BIND
|MS_REMOUNT
|new_flags
) & ~MS_RELATIME
, NULL
);
363 /* OK, so the remount of this entry failed. We'll ultimately ignore this in
364 * almost all cases (there are simply so many reasons why this can fail,
365 * think autofs, NFS, FUSE, …), but let's generate useful debug messages at
368 q
= path_is_mount_point(x
, NULL
, 0);
369 if (IN_SET(q
, 0, -ENOENT
)) {
370 /* Hmm, whaaaa? The mount point is not actually a mount point? Then
371 * it is either obstructed by a later mount or somebody has been
372 * racing against us and removed it. Either way the mount point
373 * doesn't matter to us, let's ignore it hence. */
374 log_debug_errno(r
, "Mount point '%s' to remount is not a mount point anymore, ignoring remount failure: %m", x
);
377 if (q
< 0) /* Any other error on this? Just log and continue */
378 log_debug_errno(q
, "Failed to determine whether '%s' is a mount point or not, ignoring: %m", x
);
380 if (((flags
^ new_flags
) & flags_mask
& ~MS_RELATIME
) == 0) { /* ignore MS_RELATIME while comparing */
381 log_debug_errno(r
, "Couldn't remount '%s', but the flags already match what we want, hence ignoring: %m", x
);
385 /* Make this fatal if this is the top-level mount */
386 if (path_equal(x
, prefix
))
389 /* If this is not the top-level mount, then handle this gracefully: log but
390 * otherwise ignore. With NFS, FUSE, autofs there are just too many reasons
391 * this might fail without a chance for us to do anything about it, let's
392 * hence be strict on the top-level mount and lenient on the inner ones. */
393 log_debug_errno(r
, "Couldn't remount submount '%s' for unexpected reason, ignoring: %m", x
);
397 log_debug("Remounted %s.", x
);
/* Non-recursive variant: applies 'new_flags' (restricted to 'flags_mask') to the single mount at
 * 'path'. When every masked flag is convertible and mount_setattr() has not been found unsupported,
 * mount_setattr() is tried first (without AT_RECURSIVE). Otherwise 'path' is looked up in the
 * caller-supplied 'proc_self_mountinfo' stream (must not be NULL), its current flags are read, and
 * a MS_BIND|MS_REMOUNT is issued. A remount failure is only logged if the flags already match.
 * The visible code returns -EINVAL on the path where the target was not found in the mount table.
 * NOTE(review): some parameters, error checks and returns are not present in this excerpt. */
402 int bind_remount_one_with_mountinfo(
404 unsigned long new_flags
,
405 unsigned long flags_mask
,
406 FILE *proc_self_mountinfo
) {
408 _cleanup_(mnt_free_tablep
) struct libmnt_table
*table
= NULL
;
409 unsigned long flags
= 0;
410 struct libmnt_fs
*fs
;
415 assert(proc_self_mountinfo
);
417 if ((flags_mask
& ~MS_CONVERTIBLE_FLAGS
) == 0 && !skip_mount_set_attr
) {
418 /* Let's take a shortcut for all the flags we know how to convert into mount_setattr() flags */
420 if (mount_setattr(AT_FDCWD
, path
, AT_SYMLINK_NOFOLLOW
,
421 &(struct mount_attr
) {
422 .attr_set
= ms_flags_to_mount_attr(new_flags
& flags_mask
),
423 .attr_clr
= ms_flags_to_mount_attr(~new_flags
& flags_mask
),
424 }, MOUNT_ATTR_SIZE_VER0
) < 0) {
426 log_debug_errno(errno
, "mount_setattr() didn't work, falling back to classic remounting: %m");
428 if (ERRNO_IS_NOT_SUPPORTED(errno
)) /* if not supported, then don't bother at all anymore */
429 skip_mount_set_attr
= true;
431 return 0; /* Nice, this worked! */
434 rewind(proc_self_mountinfo
);
436 table
= mnt_new_table();
440 r
= mnt_table_parse_stream(table
, proc_self_mountinfo
, "/proc/self/mountinfo");
444 fs
= mnt_table_find_target(table
, path
, MNT_ITER_FORWARD
);
446 if (laccess(path
, F_OK
) < 0) /* Hmm, it's not in the mount table, but does it exist at all? */
449 return -EINVAL
; /* Not a mount point we recognize */
452 opts
= mnt_fs_get_vfs_options(fs
);
454 r
= mnt_optstr_get_flags(opts
, &flags
, mnt_get_builtin_optmap(MNT_LINUX_MAP
));
456 log_debug_errno(r
, "Could not get flags for '%s', ignoring: %m", path
);
459 r
= mount_nofollow(NULL
, path
, NULL
, ((flags
& ~flags_mask
)|MS_BIND
|MS_REMOUNT
|new_flags
) & ~MS_RELATIME
, NULL
);
461 if (((flags
^ new_flags
) & flags_mask
& ~MS_RELATIME
) != 0) /* Ignore MS_RELATIME again,
462 * since kernel adds it in
463 * everywhere, because it's the
467 /* Let's handle redundant remounts gracefully */
468 log_debug_errno(r
, "Failed to remount '%s' but flags already match what we want, ignoring: %m", path
);
/* Moves the mount at 'path' onto "/" with MS_MOVE and finally changes the working directory to "/";
 * the chdir() result is returned as 0 or a negative errno via RET_NERRNO().
 * NOTE(review): the steps between these calls, and the error handling of mount(), are not present
 * in this excerpt. */
474 int mount_move_root(const char *path
) {
480 if (mount(path
, "/", NULL
, MS_MOVE
, NULL
) < 0)
486 return RET_NERRNO(chdir("/"));
/* Calls umount2(path, flags); per the comment below the goal is to remove every mount stacked on
 * 'path'.
 * NOTE(review): the loop and its termination/error handling are not present in this excerpt. */
489 int repeat_unmount(const char *path
, int flags
) {
494 /* If there are multiple mounts on a mount point, this
495 * removes them all */
498 if (umount2(path
, flags
) < 0) {
/* Builds the path of the "inaccessible" node below <runtime_dir>/systemd/inaccessible/ that matches
 * the file type bits (S_IFMT) of 'mode': reg, dir, chr, blk, fifo or sock. 'runtime_dir' is replaced
 * by "/run" in one branch (presumably when the caller passed NULL — the condition is not visible).
 * If the chosen device node does not exist (access() fails with ENOENT) the code falls back to the
 * chr node and then to the sock node.
 * NOTE(review): remaining parameters, case labels and returns are not present in this excerpt. */
510 int mode_to_inaccessible_node(
511 const char *runtime_dir
,
515 /* This function maps a node type to a corresponding inaccessible file node. These nodes are created
516 * during early boot by PID 1. In some cases we lacked the privs to create the character and block
517 * devices (maybe because we run in a userns environment, or miss CAP_SYS_MKNOD, or run with a
518 * devices policy that excludes device nodes with major and minor of 0), but that's fine, in that
519 * case we use an AF_UNIX file node instead, which is not the same, but close enough for most
520 * uses. And most importantly, the kernel allows bind mounts from socket nodes to any non-directory
521 * file nodes, and that's the most important thing that matters.
523 * Note that the runtime directory argument shall be the top-level runtime directory, i.e. /run/ if
524 * we operate in system context and $XDG_RUNTIME_DIR if we operate in user context. */
526 _cleanup_free_
char *d
= NULL
;
527 const char *node
= NULL
;
532 runtime_dir
= "/run";
534 switch(mode
& S_IFMT
) {
536 node
= "/systemd/inaccessible/reg";
540 node
= "/systemd/inaccessible/dir";
544 node
= "/systemd/inaccessible/chr";
548 node
= "/systemd/inaccessible/blk";
552 node
= "/systemd/inaccessible/fifo";
556 node
= "/systemd/inaccessible/sock";
562 d
= path_join(runtime_dir
, node
);
566 /* On new kernels unprivileged users are permitted to create 0:0 char device nodes (because they also
567 * act as whiteout inode for overlayfs), but no other char or block device nodes. On old kernels no
568 * device node whatsoever may be created by unprivileged processes. Hence, if the caller asks for the
569 * inaccessible block device node let's see if the block device node actually exists, and if not,
570 * fall back to the character device node. From there fall back to the socket device node. This means
571 * in the best case we'll get the right device node type — but if not we'll hopefully at least get a
572 * device node at all. */
575 access(d
, F_OK
) < 0 && errno
== ENOENT
) {
577 d
= path_join(runtime_dir
, "/systemd/inaccessible/chr");
582 if (IN_SET(mode
& S_IFMT
, S_IFBLK
, S_IFCHR
) &&
583 access(d
, F_OK
) < 0 && errno
== ENOENT
) {
585 d
= path_join(runtime_dir
, "/systemd/inaccessible/sock");
/* Renders 'flags' as a "|"-separated list of MS_* names into a newly allocated string stored in
 * '*ret'. Each bit that matches a table entry is appended by name and cleared from 'flags'; if no
 * name was produced at all, or unknown bits remain, the leftover value is appended in hex ("%lx").
 * NOTE(review): the table's struct declaration and the return statements are not present in this
 * excerpt. */
594 int mount_flags_to_string(unsigned long flags
, char **ret
) {
595 static const struct {
599 { .flag
= MS_RDONLY
, .name
= "MS_RDONLY", },
600 { .flag
= MS_NOSUID
, .name
= "MS_NOSUID", },
601 { .flag
= MS_NODEV
, .name
= "MS_NODEV", },
602 { .flag
= MS_NOEXEC
, .name
= "MS_NOEXEC", },
603 { .flag
= MS_SYNCHRONOUS
, .name
= "MS_SYNCHRONOUS", },
604 { .flag
= MS_REMOUNT
, .name
= "MS_REMOUNT", },
605 { .flag
= MS_MANDLOCK
, .name
= "MS_MANDLOCK", },
606 { .flag
= MS_DIRSYNC
, .name
= "MS_DIRSYNC", },
607 { .flag
= MS_NOSYMFOLLOW
, .name
= "MS_NOSYMFOLLOW", },
608 { .flag
= MS_NOATIME
, .name
= "MS_NOATIME", },
609 { .flag
= MS_NODIRATIME
, .name
= "MS_NODIRATIME", },
610 { .flag
= MS_BIND
, .name
= "MS_BIND", },
611 { .flag
= MS_MOVE
, .name
= "MS_MOVE", },
612 { .flag
= MS_REC
, .name
= "MS_REC", },
613 { .flag
= MS_SILENT
, .name
= "MS_SILENT", },
614 { .flag
= MS_POSIXACL
, .name
= "MS_POSIXACL", },
615 { .flag
= MS_UNBINDABLE
, .name
= "MS_UNBINDABLE", },
616 { .flag
= MS_PRIVATE
, .name
= "MS_PRIVATE", },
617 { .flag
= MS_SLAVE
, .name
= "MS_SLAVE", },
618 { .flag
= MS_SHARED
, .name
= "MS_SHARED", },
619 { .flag
= MS_RELATIME
, .name
= "MS_RELATIME", },
620 { .flag
= MS_KERNMOUNT
, .name
= "MS_KERNMOUNT", },
621 { .flag
= MS_I_VERSION
, .name
= "MS_I_VERSION", },
622 { .flag
= MS_STRICTATIME
, .name
= "MS_STRICTATIME", },
623 { .flag
= MS_LAZYTIME
, .name
= "MS_LAZYTIME", },
625 _cleanup_free_
char *str
= NULL
;
629 for (size_t i
= 0; i
< ELEMENTSOF(map
); i
++)
630 if (flags
& map
[i
].flag
) {
631 if (!strextend_with_separator(&str
, "|", map
[i
].name
))
633 flags
&= ~map
[i
].flag
;
/* Nothing was named, or bits without a table entry are left over: append the remainder in hex */
636 if (!str
|| flags
!= 0)
637 if (strextendf_with_separator(&str
, "|", "%lx", flags
) < 0)
640 *ret
= TAKE_PTR(str
);
/* Logging wrapper around mount(): splits 'options' into mount flags and remaining options via
 * mount_option_mangle(), emits a debug message describing the operation (remount, bind mount, move,
 * plain mount), performs the mount either with mount() directly or with mount_nofollow()
 * (presumably selected by 'follow_symlink' — the condition is not visible here), and logs a failure
 * at 'error_log_level'.
 * NOTE(review): most parameters and several control-flow lines are not present in this excerpt. */
644 int mount_verbose_full(
651 bool follow_symlink
) {
653 _cleanup_free_
char *fl
= NULL
, *o
= NULL
;
657 r
= mount_option_mangle(options
, flags
, &f
, &o
);
659 return log_full_errno(error_log_level
, r
,
660 "Failed to mangle mount options %s: %m",
/* Best effort: 'fl' is only used in log messages below, where strnull() copes with NULL */
663 (void) mount_flags_to_string(f
, &fl
);
665 if ((f
& MS_REMOUNT
) && !what
&& !type
)
666 log_debug("Remounting %s (%s \"%s\")...",
667 where
, strnull(fl
), strempty(o
));
668 else if (!what
&& !type
)
669 log_debug("Mounting %s (%s \"%s\")...",
670 where
, strnull(fl
), strempty(o
));
671 else if ((f
& MS_BIND
) && !type
)
672 log_debug("Bind-mounting %s on %s (%s \"%s\")...",
673 what
, where
, strnull(fl
), strempty(o
));
674 else if (f
& MS_MOVE
)
675 log_debug("Moving mount %s → %s (%s \"%s\")...",
676 what
, where
, strnull(fl
), strempty(o
));
678 log_debug("Mounting %s (%s) on %s (%s \"%s\")...",
679 strna(what
), strna(type
), where
, strnull(fl
), strempty(o
));
682 r
= RET_NERRNO(mount(what
, where
, type
, f
, o
));
684 r
= mount_nofollow(what
, where
, type
, f
, o
);
686 return log_full_errno(error_log_level
, r
,
687 "Failed to mount %s (type %s) on %s (%s \"%s\"): %m",
688 strna(what
), strna(type
), where
, strnull(fl
), strempty(o
));
/* Logs "Umounting <what>..." at debug level, calls umount2(what, flags) and, on failure, logs and
 * returns the error at 'error_log_level'.
 * NOTE(review): the function header is not visible in this excerpt; presumably this is the body of
 * umount_verbose(), which other code in this file calls — confirm. */
699 log_debug("Umounting %s...", what
);
701 if (umount2(what
, flags
) < 0)
702 return log_full_errno(error_log_level
, errno
,
703 "Failed to unmount %s: %m", what
);
/* Splits a comma-separated mount option string into MS_* flags and the options that are not flags.
 * Words matching an entry of libmount's MNT_LINUX_MAP are folded into 'mount_flags' (set, or
 * cleared for MNT_INVERT entries); all other words are joined with "," and handed back through
 * '*ret_remaining_options'. The resulting flags are stored in '*ret_mount_flags'. Both output
 * pointers must be non-NULL.
 * NOTE(review): the 'options' parameter line and the return statements are not present in this
 * excerpt. */
708 int mount_option_mangle(
710 unsigned long mount_flags
,
711 unsigned long *ret_mount_flags
,
712 char **ret_remaining_options
) {
714 const struct libmnt_optmap
*map
;
715 _cleanup_free_
char *ret
= NULL
;
718 /* This extracts mount flags from the mount options, and stores
719 * non-mount-flag options to '*ret_remaining_options'.
721 * "rw,nosuid,nodev,relatime,size=1630748k,mode=700,uid=1000,gid=1000"
722 * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
723 * "size=1630748k,mode=700,uid=1000,gid=1000".
724 * See more examples in test-mount-utils.c.
726 * Note that if 'options' does not contain any non-mount-flag options,
727 * then '*ret_remaining_options' is set to NULL instead of empty string.
728 * Note that this does not check validity of options stored in
729 * '*ret_remaining_options'.
730 * Note that if 'options' is NULL, then this just copies 'mount_flags'
731 * to '*ret_mount_flags'. */
733 assert(ret_mount_flags
);
734 assert(ret_remaining_options
);
736 map
= mnt_get_builtin_optmap(MNT_LINUX_MAP
);
740 for (const char *p
= options
;;) {
741 _cleanup_free_
char *word
= NULL
;
742 const struct libmnt_optmap
*ent
;
744 r
= extract_first_word(&p
, &word
, ",", EXTRACT_KEEP_QUOTE
);
750 for (ent
= map
; ent
->name
; ent
++) {
751 /* None of the entries in MNT_LINUX_MAP take an argument.
752 * Thus, ent->name does not contain "=" or "[=]". */
753 if (!streq(word
, ent
->name
))
756 if (!(ent
->mask
& MNT_INVERT
))
757 mount_flags
|= ent
->id
;
758 else if (mount_flags
& ent
->id
)
759 mount_flags
^= ent
->id
;
764 /* If 'word' is not a mount flag, then store it in '*ret_remaining_options'. */
765 if (!ent
->name
&& !strextend_with_separator(&ret
, ",", word
))
769 *ret_mount_flags
= mount_flags
;
770 *ret_remaining_options
= TAKE_PTR(ret
);
/* Installs a new mount of 'src' at 'dest' inside the mount namespace of the process 'target'.
 * Outline of the visible steps: open the target's and our own namespaces and refuse if both share
 * the same mount namespace; resolve 'src' to an fd; create a private MS_SLAVE playground under
 * /tmp; mount the source there (verity_dissect_and_mount() or a bind mount); optionally remount it
 * read-only; MS_MOVE it into 'propagate_path'; then fork into the target's namespaces and MS_MOVE
 * it from 'incoming_path' to 'dest'. The child reports its error code back through a pipe. The
 * *_created/*_mounted booleans track what the cleanup code at the end has to undo.
 * NOTE(review): several parameters, error checks, branch conditions and labels are not present in
 * this excerpt. */
775 static int mount_in_namespace(
777 const char *propagate_path
,
778 const char *incoming_path
,
782 bool make_file_or_directory
,
783 const MountOptions
*options
,
786 _cleanup_close_pair_
int errno_pipe_fd
[2] = { -1, -1 };
787 _cleanup_close_
int self_mntns_fd
= -1, mntns_fd
= -1, root_fd
= -1, pidns_fd
= -1, chased_src_fd
= -1;
788 char mount_slave
[] = "/tmp/propagate.XXXXXX", *mount_tmp
, *mount_outside
, *p
;
789 bool mount_slave_created
= false, mount_slave_mounted
= false,
790 mount_tmp_created
= false, mount_tmp_mounted
= false,
791 mount_outside_created
= false, mount_outside_mounted
= false;
792 struct stat st
, self_mntns_st
;
797 assert(propagate_path
);
798 assert(incoming_path
);
801 assert(!options
|| is_image
);
803 r
= namespace_open(target
, &pidns_fd
, &mntns_fd
, NULL
, NULL
, &root_fd
);
805 return log_debug_errno(r
, "Failed to retrieve FDs of the target process' namespace: %m");
807 if (fstat(mntns_fd
, &st
) < 0)
808 return log_debug_errno(errno
, "Failed to fstat mount namespace FD of target process: %m");
810 r
= namespace_open(0, NULL
, &self_mntns_fd
, NULL
, NULL
, NULL
);
812 return log_debug_errno(r
, "Failed to retrieve FDs of systemd's namespace: %m");
814 if (fstat(self_mntns_fd
, &self_mntns_st
) < 0)
815 return log_debug_errno(errno
, "Failed to fstat mount namespace FD of systemd: %m");
817 /* We can't add new mounts at runtime if the process wasn't started in a namespace */
818 if (stat_inode_same(&st
, &self_mntns_st
))
819 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
), "Failed to activate bind mount in target, not running in a mount namespace");
821 /* One day, when bind mounting /proc/self/fd/n works across namespace boundaries we should rework
822 * this logic to make use of it... */
824 p
= strjoina(propagate_path
, "/");
825 r
= laccess(p
, F_OK
);
827 return log_debug_errno(r
== -ENOENT
? SYNTHETIC_ERRNO(EOPNOTSUPP
) : r
, "Target does not allow propagation of mount points");
829 r
= chase_symlinks(src
, NULL
, CHASE_TRAIL_SLASH
, NULL
, &chased_src_fd
);
831 return log_debug_errno(r
, "Failed to resolve source path of %s: %m", src
);
833 if (fstat(chased_src_fd
, &st
) < 0)
834 return log_debug_errno(errno
, "Failed to stat() resolved source path %s: %m", src
);
835 if (S_ISLNK(st
.st_mode
)) /* This shouldn't really happen, given that we just chased the symlinks above, but let's better be safe… */
836 return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
), "Source directory %s can't be a symbolic link", src
);
838 /* Our goal is to install a new bind mount into the container,
839 possibly read-only. This is irritatingly complex
840 unfortunately, currently.
842 First, we start by creating a private playground in /tmp,
843 that we can mount MS_SLAVE. (Which is necessary, since
844 MS_MOVE cannot be applied to mounts with MS_SHARED parent
847 if (!mkdtemp(mount_slave
))
848 return log_debug_errno(errno
, "Failed to create playground %s: %m", mount_slave
);
850 mount_slave_created
= true;
852 r
= mount_nofollow_verbose(LOG_DEBUG
, mount_slave
, mount_slave
, NULL
, MS_BIND
, NULL
);
856 mount_slave_mounted
= true;
858 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, mount_slave
, NULL
, MS_SLAVE
, NULL
);
862 /* Second, we mount the source file or directory to a directory inside of our MS_SLAVE playground. */
863 mount_tmp
= strjoina(mount_slave
, "/mount");
865 r
= mkdir_p(mount_tmp
, 0700);
867 r
= make_mount_point_inode_from_stat(&st
, mount_tmp
, 0700);
869 log_debug_errno(r
, "Failed to create temporary mount point %s: %m", mount_tmp
);
873 mount_tmp_created
= true;
876 r
= verity_dissect_and_mount(FORMAT_PROC_FD_PATH(chased_src_fd
), mount_tmp
, options
, NULL
, NULL
, NULL
, NULL
);
878 r
= mount_follow_verbose(LOG_DEBUG
, FORMAT_PROC_FD_PATH(chased_src_fd
), mount_tmp
, NULL
, MS_BIND
, NULL
);
882 mount_tmp_mounted
= true;
884 /* Third, we remount the new bind mount read-only if requested. */
886 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, mount_tmp
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
);
891 /* Fourth, we move the new bind mount into the propagation directory. This way it will appear there read-only
894 mount_outside
= strjoina(propagate_path
, "/XXXXXX");
895 if (is_image
|| S_ISDIR(st
.st_mode
))
896 r
= mkdtemp(mount_outside
) ? 0 : -errno
;
898 r
= mkostemp_safe(mount_outside
);
902 log_debug_errno(r
, "Cannot create propagation file or directory %s: %m", mount_outside
);
906 mount_outside_created
= true;
908 r
= mount_nofollow_verbose(LOG_DEBUG
, mount_tmp
, mount_outside
, NULL
, MS_MOVE
, NULL
);
912 mount_outside_mounted
= true;
913 mount_tmp_mounted
= false;
915 if (is_image
|| S_ISDIR(st
.st_mode
))
916 (void) rmdir(mount_tmp
);
918 (void) unlink(mount_tmp
);
919 mount_tmp_created
= false;
921 (void) umount_verbose(LOG_DEBUG
, mount_slave
, UMOUNT_NOFOLLOW
);
922 mount_slave_mounted
= false;
924 (void) rmdir(mount_slave
);
925 mount_slave_created
= false;
927 if (pipe2(errno_pipe_fd
, O_CLOEXEC
|O_NONBLOCK
) < 0) {
928 log_debug_errno(errno
, "Failed to create pipe: %m");
932 r
= namespace_fork("(sd-bindmnt)", "(sd-bindmnt-inner)", NULL
, 0, FORK_RESET_SIGNALS
|FORK_DEATHSIG
,
933 pidns_fd
, mntns_fd
, -1, -1, root_fd
, &child
);
937 const char *mount_inside
;
939 errno_pipe_fd
[0] = safe_close(errno_pipe_fd
[0]);
941 if (make_file_or_directory
) {
943 (void) mkdir_parents(dest
, 0755);
944 (void) make_mount_point_inode_from_stat(&st
, dest
, 0700);
946 (void) mkdir_p(dest
, 0755);
949 /* Fifth, move the mount to the right place inside */
950 mount_inside
= strjoina(incoming_path
, basename(mount_outside
));
951 r
= mount_nofollow_verbose(LOG_ERR
, mount_inside
, dest
, NULL
, MS_MOVE
, NULL
);
/* Report the error code to the parent through the pipe; the write() result is deliberately ignored */
958 (void) write(errno_pipe_fd
[1], &r
, sizeof(r
));
959 errno_pipe_fd
[1] = safe_close(errno_pipe_fd
[1]);
964 errno_pipe_fd
[1] = safe_close(errno_pipe_fd
[1]);
966 r
= wait_for_terminate_and_check("(sd-bindmnt)", child
, 0);
968 log_debug_errno(r
, "Failed to wait for child: %m");
971 if (r
!= EXIT_SUCCESS
) {
972 if (read(errno_pipe_fd
[0], &r
, sizeof(r
)) == sizeof(r
))
973 log_debug_errno(r
, "Failed to mount: %m");
975 log_debug("Child failed.");
/* Cleanup: undo whatever is still flagged as mounted/created, in reverse order of setup */
980 if (mount_outside_mounted
)
981 (void) umount_verbose(LOG_DEBUG
, mount_outside
, UMOUNT_NOFOLLOW
);
982 if (mount_outside_created
) {
983 if (is_image
|| S_ISDIR(st
.st_mode
))
984 (void) rmdir(mount_outside
);
986 (void) unlink(mount_outside
);
989 if (mount_tmp_mounted
)
990 (void) umount_verbose(LOG_DEBUG
, mount_tmp
, UMOUNT_NOFOLLOW
);
991 if (mount_tmp_created
) {
992 if (is_image
|| S_ISDIR(st
.st_mode
))
993 (void) rmdir(mount_tmp
);
995 (void) unlink(mount_tmp
);
998 if (mount_slave_mounted
)
999 (void) umount_verbose(LOG_DEBUG
, mount_slave
, UMOUNT_NOFOLLOW
);
1000 if (mount_slave_created
)
1001 (void) rmdir(mount_slave
);
/* Thin wrapper: forwards all arguments to mount_in_namespace(), passing NULL and false as the last
 * two arguments (presumably "no MountOptions" and "not an image" — confirm against the callee's
 * full parameter list, which is not visible in this excerpt). */
1006 int bind_mount_in_namespace(
1008 const char *propagate_path
,
1009 const char *incoming_path
,
1013 bool make_file_or_directory
) {
1015 return mount_in_namespace(target
, propagate_path
, incoming_path
, src
, dest
, read_only
, make_file_or_directory
, NULL
, false);
/* Thin wrapper: forwards all arguments, including 'options', to mount_in_namespace() and passes
 * true as the last argument (presumably the "is image" switch — confirm against the callee's full
 * parameter list, which is not visible in this excerpt). */
1018 int mount_image_in_namespace(
1020 const char *propagate_path
,
1021 const char *incoming_path
,
1025 bool make_file_or_directory
,
1026 const MountOptions
*options
) {
1028 return mount_in_namespace(target
, propagate_path
, incoming_path
, src
, dest
, read_only
, make_file_or_directory
, options
, true);
/* Ensures 'path' is a mount point: checks with path_is_mount_point() and, if needed, bind-mounts
 * 'path' onto itself with MS_BIND|MS_REC. Return values are described in the comment below.
 * NOTE(review): the checks between the visible statements are not present in this excerpt. */
1031 int make_mount_point(const char *path
) {
1036 /* If 'path' is already a mount point, does nothing and returns 0. If it is not it makes it one, and returns 1. */
1038 r
= path_is_mount_point(path
, NULL
, 0);
1040 return log_debug_errno(r
, "Failed to determine whether '%s' is a mount point: %m", path
);
1044 r
= mount_nofollow_verbose(LOG_DEBUG
, path
, path
, NULL
, MS_BIND
|MS_REC
, NULL
);
/* Builds a UID/GID map line "0 <uid_shift> <uid_range>" (plus, with REMOUNT_IDMAP_HOST_ROOT, a
 * second line "<UID_MAPPED_ROOT> 0 1"), acquires a user namespace using that text for both the UID
 * and the GID map via userns_acquire(), and returns the namespace fd to the caller (ownership is
 * transferred with TAKE_FD()). Allocation failures return via log_oom_debug(). */
1051 static int make_userns(uid_t uid_shift
, uid_t uid_range
, RemountIdmapFlags flags
) {
1052 _cleanup_close_
int userns_fd
= -1;
1053 _cleanup_free_
char *line
= NULL
;
1055 /* Allocates a userns file descriptor with the mapping we need. For this we'll fork off a child
1056 * process whose only purpose is to give us a new user namespace. It's killed when we got it. */
1058 if (asprintf(&line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, uid_shift
, uid_range
) < 0)
1059 return log_oom_debug();
1061 /* If requested we'll include an entry in the mapping so that the host root user can make changes to
1062 * the uidmapped mount like it normally would. Specifically, we'll map the user with UID_HOST_ROOT on
1063 * the backing fs to UID 0. This is useful, since nspawn code wants to create various missing inodes
1064 * in the OS tree before booting into it, and this becomes very easy and straightforward to do if it
1065 * can just do it under its own regular UID. Note that in that case the container's runtime uidmap
1066 * (i.e. the one the container payload processes run in) will leave this UID unmapped, i.e. if we
1067 * accidentally leave files owned by host root in the already uidmapped tree around they'll show up
1068 * as owned by 'nobody', which is safe. (Of course, we shouldn't leave such inodes around, but always
1069 * chown() them to the container's own UID range, but it's good to have a safety net, in case we
1071 if (flags
& REMOUNT_IDMAP_HOST_ROOT
)
1072 if (strextendf(&line
,
1073 UID_FMT
" " UID_FMT
" " UID_FMT
"\n",
1074 UID_MAPPED_ROOT
, 0, 1) < 0)
1075 return log_oom_debug();
1077 /* We always assign the same UID and GID ranges */
1078 userns_fd
= userns_acquire(line
, line
);
1080 return log_debug_errno(userns_fd
, "Failed to acquire new userns: %m");
1082 return TAKE_FD(userns_fd
);
/* Replaces the mount at 'p' with an ID-mapped clone: validates the shift/range, clones the mount
 * with open_tree(OPEN_TREE_CLONE), creates a user namespace via make_userns(), applies
 * MOUNT_ATTR_IDMAP recursively to the clone with mount_setattr(), unmounts the old mount and
 * attaches the clone at 'p' with move_mount().
 * NOTE(review): the function name and leading parameters are not visible in this excerpt
 * (presumably remount_idmap(p, uid_shift, uid_range, flags) — confirm). */
1089 RemountIdmapFlags flags
) {
1091 _cleanup_close_
int mount_fd
= -1, userns_fd
= -1;
1096 if (!userns_shift_range_valid(uid_shift
, uid_range
))
1099 /* Clone the mount point */
1100 mount_fd
= open_tree(-1, p
, OPEN_TREE_CLONE
| OPEN_TREE_CLOEXEC
);
1102 return log_debug_errno(errno
, "Failed to open tree of mounted filesystem '%s': %m", p
);
1104 /* Create a user namespace mapping */
1105 userns_fd
= make_userns(uid_shift
, uid_range
, flags
);
1109 /* Set the user namespace mapping attribute on the cloned mount point */
1110 if (mount_setattr(mount_fd
, "", AT_EMPTY_PATH
| AT_RECURSIVE
,
1111 &(struct mount_attr
) {
1112 .attr_set
= MOUNT_ATTR_IDMAP
,
1113 .userns_fd
= userns_fd
,
1114 }, sizeof(struct mount_attr
)) < 0)
1115 return log_debug_errno(errno
, "Failed to change bind mount attributes for '%s': %m", p
);
1117 /* Remove the old mount point */
1118 r
= umount_verbose(LOG_DEBUG
, p
, UMOUNT_NOFOLLOW
);
1122 /* And place the cloned version in its place */
1123 if (move_mount(mount_fd
, "", -1, p
, MOVE_MOUNT_F_EMPTY_PATH
) < 0)
1124 return log_debug_errno(errno
, "Failed to attach UID mapped mount to '%s': %m", p
);
1129 int make_mount_point_inode_from_stat(const struct stat
*st
, const char *dest
, mode_t mode
) {
1133 if (S_ISDIR(st
->st_mode
))
1134 return mkdir_label(dest
, mode
);
1136 return mknod(dest
, S_IFREG
|(mode
& ~0111), 0);
/* stat()s 'source' (following symlinks) and creates a matching mount point inode at 'dest' by
 * passing the result to make_mount_point_inode_from_stat().
 * NOTE(review): the local declaration of 'st' and the stat() error return are not present in this
 * excerpt. */
1139 int make_mount_point_inode_from_path(const char *source
, const char *dest
, mode_t mode
) {
1145 if (stat(source
, &st
) < 0)
1148 return make_mount_point_inode_from_stat(&st
, dest
, mode
);