]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/mount-util.c
mount-util: use UMOUNT_NOFOLLOW in recursive umounter
[thirdparty/systemd.git] / src / shared / mount-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <stdlib.h>
5 #include <sys/mount.h>
6 #include <sys/stat.h>
7 #include <sys/statvfs.h>
8 #include <unistd.h>
9
10 #include "alloc-util.h"
11 #include "extract-word.h"
12 #include "fd-util.h"
13 #include "fileio.h"
14 #include "fs-util.h"
15 #include "hashmap.h"
16 #include "libmount-util.h"
17 #include "mount-util.h"
18 #include "mountpoint-util.h"
19 #include "parse-util.h"
20 #include "path-util.h"
21 #include "set.h"
22 #include "stdio-util.h"
23 #include "string-util.h"
24 #include "strv.h"
25
26 int umount_recursive(const char *prefix, int flags) {
27 int n = 0, r;
28 bool again;
29
30 /* Try to umount everything recursively below a
31 * directory. Also, take care of stacked mounts, and keep
32 * unmounting them until they are gone. */
33
34 do {
35 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
36 _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
37
38 again = false;
39
40 r = libmount_parse("/proc/self/mountinfo", NULL, &table, &iter);
41 if (r < 0)
42 return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
43
44 for (;;) {
45 struct libmnt_fs *fs;
46 const char *path;
47
48 r = mnt_table_next_fs(table, iter, &fs);
49 if (r == 1)
50 break;
51 if (r < 0)
52 return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
53
54 path = mnt_fs_get_target(fs);
55 if (!path)
56 continue;
57
58 if (!path_startswith(path, prefix))
59 continue;
60
61 if (umount2(path, flags | UMOUNT_NOFOLLOW) < 0) {
62 log_debug_errno(errno, "Failed to umount %s, ignoring: %m", path);
63 continue;
64 }
65
66 log_debug("Successfully unmounted %s", path);
67
68 again = true;
69 n++;
70
71 break;
72 }
73 } while (again);
74
75 return n;
76 }
77
78 static int get_mount_flags(
79 struct libmnt_table *table,
80 const char *path,
81 unsigned long *ret) {
82 struct libmnt_fs *fs;
83 struct statvfs buf;
84 const char *opts;
85 int r = 0;
86
87 /* Get the mount flags for the mountpoint at "path" from "table". We have a fallback using statvfs()
88 * in place (which provides us with mostly the same info), but it's just a fallback, since using it
89 * means triggering autofs or NFS mounts, which we'd rather avoid needlessly. */
90
91 fs = mnt_table_find_target(table, path, MNT_ITER_FORWARD);
92 if (!fs) {
93 log_debug("Could not find '%s' in mount table, ignoring.", path);
94 goto fallback;
95 }
96
97 opts = mnt_fs_get_vfs_options(fs);
98 if (!opts) {
99 *ret = 0;
100 return 0;
101 }
102
103 r = mnt_optstr_get_flags(opts, ret, mnt_get_builtin_optmap(MNT_LINUX_MAP));
104 if (r != 0) {
105 log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
106 goto fallback;
107 }
108
109 /* MS_RELATIME is default and trying to set it in an unprivileged container causes EPERM */
110 *ret &= ~MS_RELATIME;
111 return 0;
112
113 fallback:
114 if (statvfs(path, &buf) < 0)
115 return -errno;
116
117 /* The statvfs() flags and the mount flags mostly have the same values, but for some cases do
118 * not. Hence map the flags manually. (Strictly speaking, ST_RELATIME/MS_RELATIME is the most
119 * prominent one that doesn't match, but that's the one we mask away anyway, see above.) */
120
121 *ret =
122 FLAGS_SET(buf.f_flag, ST_RDONLY) * MS_RDONLY |
123 FLAGS_SET(buf.f_flag, ST_NODEV) * MS_NODEV |
124 FLAGS_SET(buf.f_flag, ST_NOEXEC) * MS_NOEXEC |
125 FLAGS_SET(buf.f_flag, ST_NOSUID) * MS_NOSUID |
126 FLAGS_SET(buf.f_flag, ST_NOATIME) * MS_NOATIME |
127 FLAGS_SET(buf.f_flag, ST_NODIRATIME) * MS_NODIRATIME;
128
129 return 0;
130 }
131
132 /* Use this function only if you do not have direct access to /proc/self/mountinfo but the caller can open it
133 * for you. This is the case when /proc is masked or not mounted. Otherwise, use bind_remount_recursive. */
134 int bind_remount_recursive_with_mountinfo(
135 const char *prefix,
136 unsigned long new_flags,
137 unsigned long flags_mask,
138 char **deny_list,
139 FILE *proc_self_mountinfo) {
140
141 _cleanup_set_free_free_ Set *done = NULL;
142 _cleanup_free_ char *simplified = NULL;
143 int r;
144
145 assert(prefix);
146 assert(proc_self_mountinfo);
147
148 /* Recursively remount a directory (and all its submounts) read-only or read-write. If the directory is already
149 * mounted, we reuse the mount and simply mark it MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write
150 * operation). If it isn't we first make it one. Afterwards we apply MS_BIND|MS_RDONLY (or remove MS_RDONLY) to
151 * all submounts we can access, too. When mounts are stacked on the same mount point we only care for each
152 * individual "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We
153 * do not have any effect on future submounts that might get propagated, they might be writable. This includes
154 * future submounts that have been triggered via autofs.
155 *
156 * If the "deny_list" parameter is specified it may contain a list of subtrees to exclude from the
157 * remount operation. Note that we'll ignore the deny list for the top-level path. */
158
159 simplified = strdup(prefix);
160 if (!simplified)
161 return -ENOMEM;
162
163 path_simplify(simplified, false);
164
165 done = set_new(&path_hash_ops);
166 if (!done)
167 return -ENOMEM;
168
169 for (;;) {
170 _cleanup_set_free_free_ Set *todo = NULL;
171 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
172 _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
173 bool top_autofs = false;
174 char *x;
175 unsigned long orig_flags;
176
177 todo = set_new(&path_hash_ops);
178 if (!todo)
179 return -ENOMEM;
180
181 rewind(proc_self_mountinfo);
182
183 r = libmount_parse("/proc/self/mountinfo", proc_self_mountinfo, &table, &iter);
184 if (r < 0)
185 return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
186
187 for (;;) {
188 struct libmnt_fs *fs;
189 const char *path, *type;
190
191 r = mnt_table_next_fs(table, iter, &fs);
192 if (r == 1)
193 break;
194 if (r < 0)
195 return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
196
197 path = mnt_fs_get_target(fs);
198 type = mnt_fs_get_fstype(fs);
199 if (!path || !type)
200 continue;
201
202 if (!path_startswith(path, simplified))
203 continue;
204
205 /* Ignore this mount if it is deny-listed, but only if it isn't the top-level mount
206 * we shall operate on. */
207 if (!path_equal(path, simplified)) {
208 bool deny_listed = false;
209 char **i;
210
211 STRV_FOREACH(i, deny_list) {
212 if (path_equal(*i, simplified))
213 continue;
214
215 if (!path_startswith(*i, simplified))
216 continue;
217
218 if (path_startswith(path, *i)) {
219 deny_listed = true;
220 log_debug("Not remounting %s deny-listed by %s, called for %s",
221 path, *i, simplified);
222 break;
223 }
224 }
225 if (deny_listed)
226 continue;
227 }
228
229 /* Let's ignore autofs mounts. If they aren't
230 * triggered yet, we want to avoid triggering
231 * them, as we don't make any guarantees for
232 * future submounts anyway. If they are
233 * already triggered, then we will find
234 * another entry for this. */
235 if (streq(type, "autofs")) {
236 top_autofs = top_autofs || path_equal(path, simplified);
237 continue;
238 }
239
240 if (!set_contains(done, path)) {
241 r = set_put_strdup(&todo, path);
242 if (r < 0)
243 return r;
244 }
245 }
246
247 /* If we have no submounts to process anymore and if
248 * the root is either already done, or an autofs, we
249 * are done */
250 if (set_isempty(todo) &&
251 (top_autofs || set_contains(done, simplified)))
252 return 0;
253
254 if (!set_contains(done, simplified) &&
255 !set_contains(todo, simplified)) {
256 /* The prefix directory itself is not yet a mount, make it one. */
257 if (mount(simplified, simplified, NULL, MS_BIND|MS_REC, NULL) < 0)
258 return -errno;
259
260 orig_flags = 0;
261 (void) get_mount_flags(table, simplified, &orig_flags);
262
263 if (mount(NULL, simplified, NULL, (orig_flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags, NULL) < 0)
264 return -errno;
265
266 log_debug("Made top-level directory %s a mount point.", prefix);
267
268 r = set_put_strdup(&done, simplified);
269 if (r < 0)
270 return r;
271 }
272
273 while ((x = set_steal_first(todo))) {
274
275 r = set_consume(done, x);
276 if (IN_SET(r, 0, -EEXIST))
277 continue;
278 if (r < 0)
279 return r;
280
281 /* Deal with mount points that are obstructed by a later mount */
282 r = path_is_mount_point(x, NULL, 0);
283 if (IN_SET(r, 0, -ENOENT))
284 continue;
285 if (IN_SET(r, -EACCES, -EPERM)) {
286 /* Even if root user invoke this, submounts under private FUSE or NFS mount points
287 * may not be acceessed. E.g.,
288 *
289 * $ bindfs --no-allow-other ~/mnt/mnt ~/mnt/mnt
290 * $ bindfs --no-allow-other ~/mnt ~/mnt
291 *
292 * Then, root user cannot access the mount point ~/mnt/mnt.
293 * In such cases, the submounts are ignored, as we have no way to manage them. */
294 log_debug_errno(r, "Failed to determine '%s' is mount point or not, ignoring: %m", x);
295 continue;
296 }
297 if (r < 0)
298 return r;
299
300 /* Try to reuse the original flag set */
301 orig_flags = 0;
302 (void) get_mount_flags(table, x, &orig_flags);
303
304 if (mount(NULL, x, NULL, (orig_flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags, NULL) < 0)
305 return -errno;
306
307 log_debug("Remounted %s read-only.", x);
308 }
309 }
310 }
311
312 int bind_remount_recursive(
313 const char *prefix,
314 unsigned long new_flags,
315 unsigned long flags_mask,
316 char **deny_list) {
317
318 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
319 int r;
320
321 r = fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo);
322 if (r < 0)
323 return r;
324
325 return bind_remount_recursive_with_mountinfo(prefix, new_flags, flags_mask, deny_list, proc_self_mountinfo);
326 }
327
328 int bind_remount_one_with_mountinfo(
329 const char *path,
330 unsigned long new_flags,
331 unsigned long flags_mask,
332 FILE *proc_self_mountinfo) {
333
334 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
335 unsigned long orig_flags = 0;
336 int r;
337
338 assert(path);
339 assert(proc_self_mountinfo);
340
341 rewind(proc_self_mountinfo);
342
343 table = mnt_new_table();
344 if (!table)
345 return -ENOMEM;
346
347 r = mnt_table_parse_stream(table, proc_self_mountinfo, "/proc/self/mountinfo");
348 if (r < 0)
349 return r;
350
351 /* Try to reuse the original flag set */
352 (void) get_mount_flags(table, path, &orig_flags);
353
354 if (mount(NULL, path, NULL, (orig_flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags, NULL) < 0)
355 return -errno;
356
357 return 0;
358 }
359
/* Makes the mount at 'path' the new root: change into 'path', move that mount onto "/", chroot() into
 * the current directory and finally change the working directory to "/".
 *
 * Returns 0 on success, or -errno of the first step that fails; later steps are not attempted. */
int mount_move_root(const char *path) {
        assert(path);

        /* Short-circuit evaluation stops at the first failing call, so errno still belongs to it. */
        if (chdir(path) < 0 ||
            mount(path, "/", NULL, MS_MOVE, NULL) < 0 ||
            chroot(".") < 0 ||
            chdir("/") < 0)
                return -errno;

        return 0;
}
377
/* Calls umount2(path, flags) over and over, so that if there are multiple mounts stacked on the mount
 * point all of them are removed.
 *
 * The loop ends on the first failing umount2(). If that failure is EINVAL the result is 1 when at
 * least one unmount succeeded before and 0 when none did; any other failure is returned as -errno. */
int repeat_unmount(const char *path, int flags) {
        bool unmounted_any = false;

        assert(path);

        while (umount2(path, flags) >= 0)
                unmounted_any = true;

        if (errno != EINVAL)
                return -errno;

        return unmounted_any;
}
398
399 int mode_to_inaccessible_node(
400 const char *runtime_dir,
401 mode_t mode,
402 char **ret) {
403
404 /* This function maps a node type to a corresponding inaccessible file node. These nodes are created
405 * during early boot by PID 1. In some cases we lacked the privs to create the character and block
406 * devices (maybe because we run in an userns environment, or miss CAP_SYS_MKNOD, or run with a
407 * devices policy that excludes device nodes with major and minor of 0), but that's fine, in that
408 * case we use an AF_UNIX file node instead, which is not the same, but close enough for most
409 * uses. And most importantly, the kernel allows bind mounts from socket nodes to any non-directory
410 * file nodes, and that's the most important thing that matters.
411 *
412 * Note that the runtime directory argument shall be the top-level runtime directory, i.e. /run/ if
413 * we operate in system context and $XDG_RUNTIME_DIR if we operate in user context. */
414
415 _cleanup_free_ char *d = NULL;
416 const char *node = NULL;
417 bool fallback = false;
418
419 assert(ret);
420
421 if (!runtime_dir)
422 runtime_dir = "/run";
423
424 switch(mode & S_IFMT) {
425 case S_IFREG:
426 node = "/systemd/inaccessible/reg";
427 break;
428
429 case S_IFDIR:
430 node = "/systemd/inaccessible/dir";
431 break;
432
433 case S_IFCHR:
434 node = "/systemd/inaccessible/chr";
435 fallback = true;
436 break;
437
438 case S_IFBLK:
439 node = "/systemd/inaccessible/blk";
440 fallback = true;
441 break;
442
443 case S_IFIFO:
444 node = "/systemd/inaccessible/fifo";
445 break;
446
447 case S_IFSOCK:
448 node = "/systemd/inaccessible/sock";
449 break;
450 }
451 if (!node)
452 return -EINVAL;
453
454 d = path_join(runtime_dir, node);
455 if (!d)
456 return -ENOMEM;
457
458 if (fallback && access(d, F_OK) < 0) {
459 free(d);
460 d = path_join(runtime_dir, "/systemd/inaccessible/sock");
461 if (!d)
462 return -ENOMEM;
463 }
464
465 *ret = TAKE_PTR(d);
466 return 0;
467 }
468
469 #define FLAG(name) (flags & name ? STRINGIFY(name) "|" : "")
470 static char* mount_flags_to_string(long unsigned flags) {
471 char *x;
472 _cleanup_free_ char *y = NULL;
473 long unsigned overflow;
474
475 overflow = flags & ~(MS_RDONLY |
476 MS_NOSUID |
477 MS_NODEV |
478 MS_NOEXEC |
479 MS_SYNCHRONOUS |
480 MS_REMOUNT |
481 MS_MANDLOCK |
482 MS_DIRSYNC |
483 MS_NOATIME |
484 MS_NODIRATIME |
485 MS_BIND |
486 MS_MOVE |
487 MS_REC |
488 MS_SILENT |
489 MS_POSIXACL |
490 MS_UNBINDABLE |
491 MS_PRIVATE |
492 MS_SLAVE |
493 MS_SHARED |
494 MS_RELATIME |
495 MS_KERNMOUNT |
496 MS_I_VERSION |
497 MS_STRICTATIME |
498 MS_LAZYTIME);
499
500 if (flags == 0 || overflow != 0)
501 if (asprintf(&y, "%lx", overflow) < 0)
502 return NULL;
503
504 x = strjoin(FLAG(MS_RDONLY),
505 FLAG(MS_NOSUID),
506 FLAG(MS_NODEV),
507 FLAG(MS_NOEXEC),
508 FLAG(MS_SYNCHRONOUS),
509 FLAG(MS_REMOUNT),
510 FLAG(MS_MANDLOCK),
511 FLAG(MS_DIRSYNC),
512 FLAG(MS_NOATIME),
513 FLAG(MS_NODIRATIME),
514 FLAG(MS_BIND),
515 FLAG(MS_MOVE),
516 FLAG(MS_REC),
517 FLAG(MS_SILENT),
518 FLAG(MS_POSIXACL),
519 FLAG(MS_UNBINDABLE),
520 FLAG(MS_PRIVATE),
521 FLAG(MS_SLAVE),
522 FLAG(MS_SHARED),
523 FLAG(MS_RELATIME),
524 FLAG(MS_KERNMOUNT),
525 FLAG(MS_I_VERSION),
526 FLAG(MS_STRICTATIME),
527 FLAG(MS_LAZYTIME),
528 y);
529 if (!x)
530 return NULL;
531 if (!y)
532 x[strlen(x) - 1] = '\0'; /* truncate the last | */
533 return x;
534 }
535
536 int mount_verbose(
537 int error_log_level,
538 const char *what,
539 const char *where,
540 const char *type,
541 unsigned long flags,
542 const char *options) {
543
544 _cleanup_free_ char *fl = NULL, *o = NULL;
545 unsigned long f;
546 int r;
547
548 r = mount_option_mangle(options, flags, &f, &o);
549 if (r < 0)
550 return log_full_errno(error_log_level, r,
551 "Failed to mangle mount options %s: %m",
552 strempty(options));
553
554 fl = mount_flags_to_string(f);
555
556 if ((f & MS_REMOUNT) && !what && !type)
557 log_debug("Remounting %s (%s \"%s\")...",
558 where, strnull(fl), strempty(o));
559 else if (!what && !type)
560 log_debug("Mounting %s (%s \"%s\")...",
561 where, strnull(fl), strempty(o));
562 else if ((f & MS_BIND) && !type)
563 log_debug("Bind-mounting %s on %s (%s \"%s\")...",
564 what, where, strnull(fl), strempty(o));
565 else if (f & MS_MOVE)
566 log_debug("Moving mount %s → %s (%s \"%s\")...",
567 what, where, strnull(fl), strempty(o));
568 else
569 log_debug("Mounting %s on %s (%s \"%s\")...",
570 strna(type), where, strnull(fl), strempty(o));
571 if (mount(what, where, type, f, o) < 0)
572 return log_full_errno(error_log_level, errno,
573 "Failed to mount %s (type %s) on %s (%s \"%s\"): %m",
574 strna(what), strna(type), where, strnull(fl), strempty(o));
575 return 0;
576 }
577
/* umount() wrapper: announces the unmount of 'what' at debug level, and logs at error level if it
 * fails. Returns 0 on success, otherwise the value of log_error_errno() for the umount() error. */
int umount_verbose(const char *what) {
        log_debug("Umounting %s...", what);

        if (umount(what) >= 0)
                return 0;

        return log_error_errno(errno, "Failed to unmount %s: %m", what);
}
584
585 int mount_option_mangle(
586 const char *options,
587 unsigned long mount_flags,
588 unsigned long *ret_mount_flags,
589 char **ret_remaining_options) {
590
591 const struct libmnt_optmap *map;
592 _cleanup_free_ char *ret = NULL;
593 const char *p;
594 int r;
595
596 /* This extracts mount flags from the mount options, and store
597 * non-mount-flag options to '*ret_remaining_options'.
598 * E.g.,
599 * "rw,nosuid,nodev,relatime,size=1630748k,mode=700,uid=1000,gid=1000"
600 * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
601 * "size=1630748k,mode=700,uid=1000,gid=1000".
602 * See more examples in test-mount-utils.c.
603 *
604 * Note that if 'options' does not contain any non-mount-flag options,
605 * then '*ret_remaining_options' is set to NULL instead of empty string.
606 * Note that this does not check validity of options stored in
607 * '*ret_remaining_options'.
608 * Note that if 'options' is NULL, then this just copies 'mount_flags'
609 * to '*ret_mount_flags'. */
610
611 assert(ret_mount_flags);
612 assert(ret_remaining_options);
613
614 map = mnt_get_builtin_optmap(MNT_LINUX_MAP);
615 if (!map)
616 return -EINVAL;
617
618 p = options;
619 for (;;) {
620 _cleanup_free_ char *word = NULL;
621 const struct libmnt_optmap *ent;
622
623 r = extract_first_word(&p, &word, ",", EXTRACT_UNQUOTE);
624 if (r < 0)
625 return r;
626 if (r == 0)
627 break;
628
629 for (ent = map; ent->name; ent++) {
630 /* All entries in MNT_LINUX_MAP do not take any argument.
631 * Thus, ent->name does not contain "=" or "[=]". */
632 if (!streq(word, ent->name))
633 continue;
634
635 if (!(ent->mask & MNT_INVERT))
636 mount_flags |= ent->id;
637 else if (mount_flags & ent->id)
638 mount_flags ^= ent->id;
639
640 break;
641 }
642
643 /* If 'word' is not a mount flag, then store it in '*ret_remaining_options'. */
644 if (!ent->name && !strextend_with_separator(&ret, ",", word, NULL))
645 return -ENOMEM;
646 }
647
648 *ret_mount_flags = mount_flags;
649 *ret_remaining_options = TAKE_PTR(ret);
650
651 return 0;
652 }