]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/mount-util.c
mount-util: extend comment a bit, mention that we aren't atomic in behaviour
[thirdparty/systemd.git] / src / shared / mount-util.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
4349cd7c 2
11c3a366 3#include <errno.h>
70599967 4#include <linux/loop.h>
11c3a366 5#include <stdlib.h>
4349cd7c 6#include <sys/mount.h>
11c3a366 7#include <sys/stat.h>
4349cd7c 8#include <sys/statvfs.h>
11c3a366 9#include <unistd.h>
4349cd7c 10
b5efdb8a 11#include "alloc-util.h"
70599967 12#include "dissect-image.h"
9e7f941a 13#include "extract-word.h"
4349cd7c
LP
14#include "fd-util.h"
15#include "fileio.h"
e1873695 16#include "fs-util.h"
93cc7779 17#include "hashmap.h"
13dcfe46 18#include "libmount-util.h"
6af52c3a 19#include "mkdir.h"
4349cd7c 20#include "mount-util.h"
049af8ad 21#include "mountpoint-util.h"
2338a175 22#include "namespace-util.h"
4349cd7c
LP
23#include "parse-util.h"
24#include "path-util.h"
6af52c3a 25#include "process-util.h"
4349cd7c 26#include "set.h"
28126409 27#include "stat-util.h"
15a5e950 28#include "stdio-util.h"
4349cd7c 29#include "string-util.h"
6b7c9f8b 30#include "strv.h"
6af52c3a 31#include "tmpfile-util.h"
70599967 32#include "user-util.h"
4349cd7c 33
28126409
LP
34int mount_fd(const char *source,
35 int target_fd,
36 const char *filesystemtype,
37 unsigned long mountflags,
38 const void *data) {
39
40 char path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
41
42 xsprintf(path, "/proc/self/fd/%i", target_fd);
43 if (mount(source, path, filesystemtype, mountflags, data) < 0) {
44 if (errno != ENOENT)
45 return -errno;
46
47 /* ENOENT can mean two things: either that the source is missing, or that /proc/ isn't
48 * mounted. Check for the latter to generate better error messages. */
49 if (proc_mounted() == 0)
50 return -ENOSYS;
51
52 return -ENOENT;
53 }
54
55 return 0;
56}
57
58int mount_nofollow(
59 const char *source,
60 const char *target,
61 const char *filesystemtype,
62 unsigned long mountflags,
63 const void *data) {
64
65 _cleanup_close_ int fd = -1;
66
67 /* In almost all cases we want to manipulate the mount table without following symlinks, hence
68 * mount_nofollow() is usually the way to go. The only exceptions are environments where /proc/ is
69 * not available yet, since we need /proc/self/fd/ for this logic to work. i.e. during the early
70 * initialization of namespacing/container stuff where /proc is not yet mounted (and maybe even the
71 * fs to mount) we can only use traditional mount() directly.
72 *
73 * Note that this disables following only for the final component of the target, i.e symlinks within
74 * the path of the target are honoured, as are symlinks in the source path everywhere. */
75
76 fd = open(target, O_PATH|O_CLOEXEC|O_NOFOLLOW);
77 if (fd < 0)
78 return -errno;
79
80 return mount_fd(source, fd, filesystemtype, mountflags, data);
81}
82
4349cd7c 83int umount_recursive(const char *prefix, int flags) {
4349cd7c 84 int n = 0, r;
f8b1904f 85 bool again;
4349cd7c 86
9d0619de
LP
87 /* Try to umount everything recursively below a directory. Also, take care of stacked mounts, and
88 * keep unmounting them until they are gone. */
4349cd7c
LP
89
90 do {
13dcfe46
ZJS
91 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
92 _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
4349cd7c
LP
93
94 again = false;
4349cd7c 95
2f2d81d9 96 r = libmount_parse("/proc/self/mountinfo", NULL, &table, &iter);
fdeea3f4 97 if (r < 0)
13dcfe46 98 return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
35bbbf85 99
4349cd7c 100 for (;;) {
13dcfe46
ZJS
101 struct libmnt_fs *fs;
102 const char *path;
4349cd7c 103
13dcfe46
ZJS
104 r = mnt_table_next_fs(table, iter, &fs);
105 if (r == 1)
106 break;
107 if (r < 0)
108 return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
4349cd7c 109
13dcfe46
ZJS
110 path = mnt_fs_get_target(fs);
111 if (!path)
112 continue;
4349cd7c 113
13dcfe46 114 if (!path_startswith(path, prefix))
4349cd7c
LP
115 continue;
116
827ea521
LP
117 if (umount2(path, flags | UMOUNT_NOFOLLOW) < 0) {
118 log_debug_errno(errno, "Failed to umount %s, ignoring: %m", path);
4349cd7c
LP
119 continue;
120 }
121
13dcfe46 122 log_debug("Successfully unmounted %s", path);
6b7c9f8b 123
4349cd7c
LP
124 again = true;
125 n++;
126
127 break;
128 }
4349cd7c
LP
129 } while (again);
130
13dcfe46 131 return n;
4349cd7c
LP
132}
133
08b1f5c7
LP
134static int get_mount_flags(
135 struct libmnt_table *table,
136 const char *path,
137 unsigned long *ret) {
5012d567
LP
138
139 _cleanup_close_ int fd = -1;
08b1f5c7
LP
140 struct libmnt_fs *fs;
141 struct statvfs buf;
142 const char *opts;
5012d567 143 int r;
d34a4008 144
08b1f5c7
LP
145 /* Get the mount flags for the mountpoint at "path" from "table". We have a fallback using statvfs()
146 * in place (which provides us with mostly the same info), but it's just a fallback, since using it
5012d567
LP
147 * means triggering autofs or NFS mounts, which we'd rather avoid needlessly.
148 *
149 * This generally doesn't follow symlinks. */
08b1f5c7 150
d34a4008 151 fs = mnt_table_find_target(table, path, MNT_ITER_FORWARD);
38288f0b 152 if (!fs) {
08b1f5c7 153 log_debug("Could not find '%s' in mount table, ignoring.", path);
d34a4008
JU
154 goto fallback;
155 }
156
157 opts = mnt_fs_get_vfs_options(fs);
08b1f5c7
LP
158 if (!opts) {
159 *ret = 0;
160 return 0;
161 }
162
163 r = mnt_optstr_get_flags(opts, ret, mnt_get_builtin_optmap(MNT_LINUX_MAP));
d34a4008 164 if (r != 0) {
08b1f5c7 165 log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
d34a4008
JU
166 goto fallback;
167 }
4349cd7c 168
08b1f5c7
LP
169 /* MS_RELATIME is default and trying to set it in an unprivileged container causes EPERM */
170 *ret &= ~MS_RELATIME;
d34a4008
JU
171 return 0;
172
173fallback:
5012d567
LP
174 fd = open(path, O_PATH|O_CLOEXEC|O_NOFOLLOW);
175 if (fd < 0)
176 return -errno;
177
178 if (fstatvfs(fd, &buf) < 0)
4349cd7c 179 return -errno;
d34a4008 180
08b1f5c7
LP
181 /* The statvfs() flags and the mount flags mostly have the same values, but for some cases do
182 * not. Hence map the flags manually. (Strictly speaking, ST_RELATIME/MS_RELATIME is the most
183 * prominent one that doesn't match, but that's the one we mask away anyway, see above.) */
184
185 *ret =
186 FLAGS_SET(buf.f_flag, ST_RDONLY) * MS_RDONLY |
187 FLAGS_SET(buf.f_flag, ST_NODEV) * MS_NODEV |
188 FLAGS_SET(buf.f_flag, ST_NOEXEC) * MS_NOEXEC |
189 FLAGS_SET(buf.f_flag, ST_NOSUID) * MS_NOSUID |
190 FLAGS_SET(buf.f_flag, ST_NOATIME) * MS_NOATIME |
191 FLAGS_SET(buf.f_flag, ST_NODIRATIME) * MS_NODIRATIME;
192
4349cd7c
LP
193 return 0;
194}
195
be3f3752 196/* Use this function only if you do not have direct access to /proc/self/mountinfo but the caller can open it
64e82c19
LP
197 * for you. This is the case when /proc is masked or not mounted. Otherwise, use bind_remount_recursive. */
198int bind_remount_recursive_with_mountinfo(
199 const char *prefix,
200 unsigned long new_flags,
201 unsigned long flags_mask,
6b000af4 202 char **deny_list,
64e82c19
LP
203 FILE *proc_self_mountinfo) {
204
4349cd7c 205 _cleanup_set_free_free_ Set *done = NULL;
670e8efd 206 unsigned n_tries = 0;
4349cd7c
LP
207 int r;
208
8403219f 209 assert(prefix);
ac9de0b3
TR
210 assert(proc_self_mountinfo);
211
ddc155b2
TM
212 /* Recursively remount a directory (and all its submounts) with desired flags (MS_READONLY,
213 * MS_NOSUID, MS_NOEXEC). If the directory is already mounted, we reuse the mount and simply mark it
214 * MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write operation), ditto for other flags. If it
215 * isn't we first make it one. Afterwards we apply (or remove) the flags to all submounts we can
216 * access, too. When mounts are stacked on the same mount point we only care for each individual
217 * "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We do
218 * not have any effect on future submounts that might get propagated, they might be writable
4b6ef527
LP
219 * etc. This includes future submounts that have been triggered via autofs. Also note that we can't
220 * operate atomically here. Mounts established while we process the tree might or might not get
221 * noticed and thus might or might not be covered.
6b7c9f8b 222 *
6b000af4
LP
223 * If the "deny_list" parameter is specified it may contain a list of subtrees to exclude from the
224 * remount operation. Note that we'll ignore the deny list for the top-level path. */
4349cd7c 225
548f6937 226 done = set_new(&path_hash_ops);
4349cd7c
LP
227 if (!done)
228 return -ENOMEM;
229
230 for (;;) {
4349cd7c 231 _cleanup_set_free_free_ Set *todo = NULL;
13dcfe46
ZJS
232 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
233 _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
4349cd7c
LP
234 bool top_autofs = false;
235 char *x;
236 unsigned long orig_flags;
237
670e8efd
LP
238 if (n_tries++ >= 32) /* Let's not retry this loop forever */
239 return -EBUSY;
240
548f6937 241 todo = set_new(&path_hash_ops);
4349cd7c
LP
242 if (!todo)
243 return -ENOMEM;
244
ac9de0b3 245 rewind(proc_self_mountinfo);
4349cd7c 246
e2857b3d 247 r = libmount_parse("/proc/self/mountinfo", proc_self_mountinfo, &table, &iter);
13dcfe46
ZJS
248 if (r < 0)
249 return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
4349cd7c 250
13dcfe46
ZJS
251 for (;;) {
252 struct libmnt_fs *fs;
253 const char *path, *type;
4349cd7c 254
13dcfe46 255 r = mnt_table_next_fs(table, iter, &fs);
d6bfab11 256 if (r == 1) /* EOF */
13dcfe46 257 break;
4349cd7c 258 if (r < 0)
13dcfe46 259 return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
4349cd7c 260
13dcfe46 261 path = mnt_fs_get_target(fs);
d6bfab11 262 if (!path)
6b7c9f8b
LP
263 continue;
264
c6111b85 265 if (!path_startswith(path, prefix))
13dcfe46
ZJS
266 continue;
267
d6bfab11
LP
268 type = mnt_fs_get_fstype(fs);
269 if (!type)
270 continue;
271
272 /* Let's ignore autofs mounts. If they aren't triggered yet, we want to avoid
273 * triggering them, as we don't make any guarantees for future submounts anyway. If
274 * they are already triggered, then we will find another entry for this. */
275 if (streq(type, "autofs")) {
276 top_autofs = top_autofs || path_equal(path, prefix);
277 continue;
278 }
279
280 if (set_contains(done, path))
281 continue;
282
6b000af4 283 /* Ignore this mount if it is deny-listed, but only if it isn't the top-level mount
13dcfe46 284 * we shall operate on. */
c6111b85 285 if (!path_equal(path, prefix)) {
6b000af4 286 bool deny_listed = false;
6b7c9f8b
LP
287 char **i;
288
6b000af4 289 STRV_FOREACH(i, deny_list) {
c6111b85 290 if (path_equal(*i, prefix))
6b7c9f8b
LP
291 continue;
292
c6111b85 293 if (!path_startswith(*i, prefix))
6b7c9f8b
LP
294 continue;
295
13dcfe46 296 if (path_startswith(path, *i)) {
6b000af4 297 deny_listed = true;
d6bfab11 298 log_debug("Not remounting %s deny-listed by %s, called for %s", path, *i, prefix);
6b7c9f8b
LP
299 break;
300 }
301 }
d6bfab11 302
6b000af4 303 if (deny_listed)
6b7c9f8b
LP
304 continue;
305 }
306
d6bfab11
LP
307 r = set_put_strdup(&todo, path);
308 if (r < 0)
309 return r;
4349cd7c
LP
310 }
311
5c5753b9
LP
312 /* Check if the top-level directory was among what we have seen so far. For that check both
313 * 'done' and 'todo'. Also check 'top_autofs' because if the top-level dir is an autofs we'll
314 * not include it in either set but will set this bool. */
c6111b85 315 if (!set_contains(done, prefix) &&
5c5753b9
LP
316 !(top_autofs || set_contains(todo, prefix))) {
317
6b7c9f8b 318 /* The prefix directory itself is not yet a mount, make it one. */
c6111b85 319 r = mount_nofollow(prefix, prefix, NULL, MS_BIND|MS_REC, NULL);
511a8cfe
LP
320 if (r < 0)
321 return r;
4349cd7c 322
5c5753b9
LP
323 /* Immediately rescan, so that we pick up the new mount's flags */
324 continue;
4349cd7c
LP
325 }
326
5c5753b9
LP
327 /* If we have no submounts to process anymore, we are done */
328 if (set_isempty(todo))
329 return 0;
330
4349cd7c
LP
331 while ((x = set_steal_first(todo))) {
332
333 r = set_consume(done, x);
4c701096 334 if (IN_SET(r, 0, -EEXIST))
4349cd7c
LP
335 continue;
336 if (r < 0)
337 return r;
338
6b7c9f8b 339 /* Deal with mount points that are obstructed by a later mount */
e1873695 340 r = path_is_mount_point(x, NULL, 0);
4c701096 341 if (IN_SET(r, 0, -ENOENT))
98df8089 342 continue;
065b4774
LP
343 if (r < 0) {
344 if (!ERRNO_IS_PRIVILEGE(r))
345 return r;
346
53c442ef
YW
347 /* Even if root user invoke this, submounts under private FUSE or NFS mount points
348 * may not be acceessed. E.g.,
349 *
350 * $ bindfs --no-allow-other ~/mnt/mnt ~/mnt/mnt
351 * $ bindfs --no-allow-other ~/mnt ~/mnt
352 *
353 * Then, root user cannot access the mount point ~/mnt/mnt.
354 * In such cases, the submounts are ignored, as we have no way to manage them. */
ef454fd1
YW
355 log_debug_errno(r, "Failed to determine '%s' is mount point or not, ignoring: %m", x);
356 continue;
357 }
98df8089
AC
358
359 /* Try to reuse the original flag set */
4349cd7c 360 orig_flags = 0;
08b1f5c7 361 (void) get_mount_flags(table, x, &orig_flags);
4349cd7c 362
511a8cfe
LP
363 r = mount_nofollow(NULL, x, NULL, (orig_flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags, NULL);
364 if (r < 0)
365 return r;
4349cd7c 366
6b7c9f8b 367 log_debug("Remounted %s read-only.", x);
4349cd7c
LP
368 }
369 }
370}
371
8403219f
LP
372int bind_remount_recursive(
373 const char *prefix,
374 unsigned long new_flags,
375 unsigned long flags_mask,
6b000af4 376 char **deny_list) {
8403219f 377
ac9de0b3 378 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
fdeea3f4 379 int r;
ac9de0b3 380
fdeea3f4
ZJS
381 r = fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo);
382 if (r < 0)
383 return r;
35bbbf85 384
6b000af4 385 return bind_remount_recursive_with_mountinfo(prefix, new_flags, flags_mask, deny_list, proc_self_mountinfo);
ac9de0b3
TR
386}
387
7cce68e1
LP
388int bind_remount_one_with_mountinfo(
389 const char *path,
390 unsigned long new_flags,
391 unsigned long flags_mask,
392 FILE *proc_self_mountinfo) {
393
394 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
395 unsigned long orig_flags = 0;
396 int r;
397
398 assert(path);
399 assert(proc_self_mountinfo);
400
401 rewind(proc_self_mountinfo);
402
403 table = mnt_new_table();
404 if (!table)
405 return -ENOMEM;
406
407 r = mnt_table_parse_stream(table, proc_self_mountinfo, "/proc/self/mountinfo");
408 if (r < 0)
409 return r;
410
411 /* Try to reuse the original flag set */
412 (void) get_mount_flags(table, path, &orig_flags);
413
511a8cfe
LP
414 r = mount_nofollow(NULL, path, NULL, (orig_flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags, NULL);
415 if (r < 0)
416 return r;
7cce68e1
LP
417
418 return 0;
419}
420
4349cd7c
LP
/* Makes the mount at 'path' the new root: changes into it, moves the mount onto "/", chroot()s into the
 * current directory and finally changes to "/". Returns 0 on success, or -errno of the first step that
 * failed (later steps are not attempted then). */
int mount_move_root(const char *path) {
        assert(path);

        /* The || chain stops at the first failing call, so errno still belongs to that call. */
        if (chdir(path) < 0 ||
            mount(path, "/", NULL, MS_MOVE, NULL) < 0 ||
            chroot(".") < 0 ||
            chdir("/") < 0)
                return -errno;

        return 0;
}
4e036b7a 438
3f2c0bec
LP
/* Unmounts 'path' over and over, so that all mounts stacked on that mount point are removed.
 *
 * Returns 1 if at least one unmount succeeded before umount2() reported EINVAL, 0 if the very first
 * call already failed with EINVAL, and -errno for any other failure. */
int repeat_unmount(const char *path, int flags) {
        bool unmounted_any = false;

        assert(path);

        while (umount2(path, flags) >= 0)
                unmounted_any = true;

        /* EINVAL is the regular way for this loop to end; everything else is a real error. */
        if (errno == EINVAL)
                return unmounted_any;

        return -errno;
}
c4b41707 459
48b747fa
LP
460int mode_to_inaccessible_node(
461 const char *runtime_dir,
462 mode_t mode,
463 char **ret) {
464
465 /* This function maps a node type to a corresponding inaccessible file node. These nodes are created
466 * during early boot by PID 1. In some cases we lacked the privs to create the character and block
467 * devices (maybe because we run in an userns environment, or miss CAP_SYS_MKNOD, or run with a
468 * devices policy that excludes device nodes with major and minor of 0), but that's fine, in that
469 * case we use an AF_UNIX file node instead, which is not the same, but close enough for most
470 * uses. And most importantly, the kernel allows bind mounts from socket nodes to any non-directory
471 * file nodes, and that's the most important thing that matters.
472 *
473 * Note that the runtime directory argument shall be the top-level runtime directory, i.e. /run/ if
474 * we operate in system context and $XDG_RUNTIME_DIR if we operate in user context. */
475
e5f10caf
AZ
476 _cleanup_free_ char *d = NULL;
477 const char *node = NULL;
e5f10caf 478
48b747fa
LP
479 assert(ret);
480
481 if (!runtime_dir)
482 runtime_dir = "/run";
fe80fcc7 483
c4b41707
AP
484 switch(mode & S_IFMT) {
485 case S_IFREG:
48b747fa 486 node = "/systemd/inaccessible/reg";
e5f10caf 487 break;
fe80fcc7 488
c4b41707 489 case S_IFDIR:
48b747fa 490 node = "/systemd/inaccessible/dir";
e5f10caf 491 break;
fe80fcc7 492
c4b41707 493 case S_IFCHR:
48b747fa 494 node = "/systemd/inaccessible/chr";
e5f10caf 495 break;
fe80fcc7 496
c4b41707 497 case S_IFBLK:
48b747fa 498 node = "/systemd/inaccessible/blk";
e5f10caf 499 break;
fe80fcc7 500
c4b41707 501 case S_IFIFO:
48b747fa 502 node = "/systemd/inaccessible/fifo";
e5f10caf 503 break;
fe80fcc7 504
c4b41707 505 case S_IFSOCK:
48b747fa 506 node = "/systemd/inaccessible/sock";
e5f10caf 507 break;
c4b41707 508 }
e5f10caf
AZ
509 if (!node)
510 return -EINVAL;
511
48b747fa
LP
512 d = path_join(runtime_dir, node);
513 if (!d)
514 return -ENOMEM;
515
cbed1dc8
LP
516 /* On new kernels unprivileged users are permitted to create 0:0 char device nodes (because they also
517 * act as whiteout inode for overlayfs), but no other char or block device nodes. On old kernels no
518 * device node whatsoever may be created by unprivileged processes. Hence, if the caller asks for the
519 * inaccessible block device node let's see if the block device node actually exists, and if not,
520 * fall back to the character device node. From there fall back to the socket device node. This means
521 * in the best case we'll get the right device node type — but if not we'll hopefully at least get a
522 * device node at all. */
523
524 if (S_ISBLK(mode) &&
525 access(d, F_OK) < 0 && errno == ENOENT) {
526 free(d);
527 d = path_join(runtime_dir, "/systemd/inaccessible/chr");
528 if (!d)
529 return -ENOMEM;
530 }
531
532 if (IN_SET(mode & S_IFMT, S_IFBLK, S_IFCHR) &&
533 access(d, F_OK) < 0 && errno == ENOENT) {
48b747fa
LP
534 free(d);
535 d = path_join(runtime_dir, "/systemd/inaccessible/sock");
536 if (!d)
537 return -ENOMEM;
538 }
e5f10caf 539
48b747fa 540 *ret = TAKE_PTR(d);
e5f10caf 541 return 0;
c4b41707 542}
60e76d48
ZJS
543
544#define FLAG(name) (flags & name ? STRINGIFY(name) "|" : "")
545static char* mount_flags_to_string(long unsigned flags) {
546 char *x;
547 _cleanup_free_ char *y = NULL;
548 long unsigned overflow;
549
550 overflow = flags & ~(MS_RDONLY |
551 MS_NOSUID |
552 MS_NODEV |
553 MS_NOEXEC |
554 MS_SYNCHRONOUS |
555 MS_REMOUNT |
556 MS_MANDLOCK |
557 MS_DIRSYNC |
558 MS_NOATIME |
559 MS_NODIRATIME |
560 MS_BIND |
561 MS_MOVE |
562 MS_REC |
563 MS_SILENT |
564 MS_POSIXACL |
565 MS_UNBINDABLE |
566 MS_PRIVATE |
567 MS_SLAVE |
568 MS_SHARED |
569 MS_RELATIME |
570 MS_KERNMOUNT |
571 MS_I_VERSION |
572 MS_STRICTATIME |
573 MS_LAZYTIME);
574
575 if (flags == 0 || overflow != 0)
576 if (asprintf(&y, "%lx", overflow) < 0)
577 return NULL;
578
579 x = strjoin(FLAG(MS_RDONLY),
580 FLAG(MS_NOSUID),
581 FLAG(MS_NODEV),
582 FLAG(MS_NOEXEC),
583 FLAG(MS_SYNCHRONOUS),
584 FLAG(MS_REMOUNT),
585 FLAG(MS_MANDLOCK),
586 FLAG(MS_DIRSYNC),
587 FLAG(MS_NOATIME),
588 FLAG(MS_NODIRATIME),
589 FLAG(MS_BIND),
590 FLAG(MS_MOVE),
591 FLAG(MS_REC),
592 FLAG(MS_SILENT),
593 FLAG(MS_POSIXACL),
594 FLAG(MS_UNBINDABLE),
595 FLAG(MS_PRIVATE),
596 FLAG(MS_SLAVE),
597 FLAG(MS_SHARED),
598 FLAG(MS_RELATIME),
599 FLAG(MS_KERNMOUNT),
600 FLAG(MS_I_VERSION),
601 FLAG(MS_STRICTATIME),
602 FLAG(MS_LAZYTIME),
605405c6 603 y);
60e76d48
ZJS
604 if (!x)
605 return NULL;
606 if (!y)
607 x[strlen(x) - 1] = '\0'; /* truncate the last | */
608 return x;
609}
610
511a8cfe 611int mount_verbose_full(
60e76d48
ZJS
612 int error_log_level,
613 const char *what,
614 const char *where,
615 const char *type,
616 unsigned long flags,
511a8cfe
LP
617 const char *options,
618 bool follow_symlink) {
60e76d48 619
6ef8df2b
YW
620 _cleanup_free_ char *fl = NULL, *o = NULL;
621 unsigned long f;
622 int r;
623
624 r = mount_option_mangle(options, flags, &f, &o);
625 if (r < 0)
626 return log_full_errno(error_log_level, r,
627 "Failed to mangle mount options %s: %m",
628 strempty(options));
60e76d48 629
6ef8df2b 630 fl = mount_flags_to_string(f);
60e76d48 631
6ef8df2b 632 if ((f & MS_REMOUNT) && !what && !type)
60e76d48 633 log_debug("Remounting %s (%s \"%s\")...",
6ef8df2b 634 where, strnull(fl), strempty(o));
60e76d48
ZJS
635 else if (!what && !type)
636 log_debug("Mounting %s (%s \"%s\")...",
6ef8df2b
YW
637 where, strnull(fl), strempty(o));
638 else if ((f & MS_BIND) && !type)
60e76d48 639 log_debug("Bind-mounting %s on %s (%s \"%s\")...",
6ef8df2b
YW
640 what, where, strnull(fl), strempty(o));
641 else if (f & MS_MOVE)
afe682bc 642 log_debug("Moving mount %s → %s (%s \"%s\")...",
6ef8df2b 643 what, where, strnull(fl), strempty(o));
60e76d48 644 else
3b493d94
LP
645 log_debug("Mounting %s (%s) on %s (%s \"%s\")...",
646 strna(what), strna(type), where, strnull(fl), strempty(o));
511a8cfe
LP
647
648 if (follow_symlink)
649 r = mount(what, where, type, f, o) < 0 ? -errno : 0;
650 else
651 r = mount_nofollow(what, where, type, f, o);
652 if (r < 0)
653 return log_full_errno(error_log_level, r,
3ccf6126
LP
654 "Failed to mount %s (type %s) on %s (%s \"%s\"): %m",
655 strna(what), strna(type), where, strnull(fl), strempty(o));
60e76d48
ZJS
656 return 0;
657}
658
30f5d104
LP
/* Wrapper around umount2() that announces the operation at debug level and logs a failure at
 * 'error_log_level'. Returns 0 on success, otherwise the result of log_full_errno() for the errno set by
 * umount2(). */
int umount_verbose(
                int error_log_level,
                const char *what,
                int flags) {

        assert(what);

        log_debug("Umounting %s...", what);

        if (umount2(what, flags) >= 0)
                return 0;

        return log_full_errno(error_log_level, errno,
                              "Failed to unmount %s: %m", what);
}
83555251 674
9e7f941a
YW
675int mount_option_mangle(
676 const char *options,
677 unsigned long mount_flags,
678 unsigned long *ret_mount_flags,
679 char **ret_remaining_options) {
680
681 const struct libmnt_optmap *map;
682 _cleanup_free_ char *ret = NULL;
683 const char *p;
684 int r;
685
686 /* This extracts mount flags from the mount options, and store
687 * non-mount-flag options to '*ret_remaining_options'.
688 * E.g.,
689 * "rw,nosuid,nodev,relatime,size=1630748k,mode=700,uid=1000,gid=1000"
690 * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
691 * "size=1630748k,mode=700,uid=1000,gid=1000".
692 * See more examples in test-mount-utils.c.
693 *
694 * Note that if 'options' does not contain any non-mount-flag options,
5238e957 695 * then '*ret_remaining_options' is set to NULL instead of empty string.
9e7f941a
YW
696 * Note that this does not check validity of options stored in
697 * '*ret_remaining_options'.
698 * Note that if 'options' is NULL, then this just copies 'mount_flags'
699 * to '*ret_mount_flags'. */
700
701 assert(ret_mount_flags);
702 assert(ret_remaining_options);
703
704 map = mnt_get_builtin_optmap(MNT_LINUX_MAP);
705 if (!map)
706 return -EINVAL;
707
708 p = options;
709 for (;;) {
710 _cleanup_free_ char *word = NULL;
711 const struct libmnt_optmap *ent;
712
4ec85141 713 r = extract_first_word(&p, &word, ",", EXTRACT_UNQUOTE);
9e7f941a
YW
714 if (r < 0)
715 return r;
716 if (r == 0)
717 break;
718
719 for (ent = map; ent->name; ent++) {
720 /* All entries in MNT_LINUX_MAP do not take any argument.
721 * Thus, ent->name does not contain "=" or "[=]". */
722 if (!streq(word, ent->name))
723 continue;
724
725 if (!(ent->mask & MNT_INVERT))
726 mount_flags |= ent->id;
727 else if (mount_flags & ent->id)
728 mount_flags ^= ent->id;
729
730 break;
731 }
732
733 /* If 'word' is not a mount flag, then store it in '*ret_remaining_options'. */
c2bc710b 734 if (!ent->name && !strextend_with_separator(&ret, ",", word))
9e7f941a
YW
735 return -ENOMEM;
736 }
737
738 *ret_mount_flags = mount_flags;
ae2a15bc 739 *ret_remaining_options = TAKE_PTR(ret);
9e7f941a
YW
740
741 return 0;
742}
6af52c3a 743
70599967 744static int mount_in_namespace(
6af52c3a
LB
745 pid_t target,
746 const char *propagate_path,
747 const char *incoming_path,
748 const char *src,
749 const char *dest,
750 bool read_only,
70599967
LB
751 bool make_file_or_directory,
752 const MountOptions *options,
753 bool is_image) {
6af52c3a
LB
754
755 _cleanup_close_pair_ int errno_pipe_fd[2] = { -1, -1 };
f7c18d3d
LB
756 _cleanup_close_ int self_mntns_fd = -1, mntns_fd = -1, root_fd = -1, pidns_fd = -1, chased_src_fd = -1;
757 char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p,
758 chased_src[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
6af52c3a
LB
759 bool mount_slave_created = false, mount_slave_mounted = false,
760 mount_tmp_created = false, mount_tmp_mounted = false,
761 mount_outside_created = false, mount_outside_mounted = false;
2338a175 762 struct stat st, self_mntns_st;
6af52c3a
LB
763 pid_t child;
764 int r;
765
766 assert(target > 0);
767 assert(propagate_path);
768 assert(incoming_path);
769 assert(src);
770 assert(dest);
70599967 771 assert(!options || is_image);
6af52c3a 772
98f654fd 773 r = namespace_open(target, &pidns_fd, &mntns_fd, NULL, NULL, &root_fd);
2338a175
LB
774 if (r < 0)
775 return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m");
776
777 if (fstat(mntns_fd, &st) < 0)
778 return log_debug_errno(errno, "Failed to fstat mount namespace FD of target process: %m");
779
780 r = namespace_open(0, NULL, &self_mntns_fd, NULL, NULL, NULL);
781 if (r < 0)
782 return log_debug_errno(r, "Failed to retrieve FDs of systemd's namespace: %m");
783
784 if (fstat(self_mntns_fd, &self_mntns_st) < 0)
785 return log_debug_errno(errno, "Failed to fstat mount namespace FD of systemd: %m");
786
787 /* We can't add new mounts at runtime if the process wasn't started in a namespace */
788 if (st.st_ino == self_mntns_st.st_ino && st.st_dev == self_mntns_st.st_dev)
789 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to activate bind mount in target, not running in a mount namespace");
790
6af52c3a
LB
791 /* One day, when bind mounting /proc/self/fd/n works across
792 * namespace boundaries we should rework this logic to make
793 * use of it... */
794
795 p = strjoina(propagate_path, "/");
796 r = laccess(p, F_OK);
797 if (r < 0)
798 return log_debug_errno(r == -ENOENT ? SYNTHETIC_ERRNO(EOPNOTSUPP) : r, "Target does not allow propagation of mount points");
799
f7c18d3d 800 r = chase_symlinks(src, NULL, CHASE_TRAIL_SLASH, NULL, &chased_src_fd);
6af52c3a
LB
801 if (r < 0)
802 return log_debug_errno(r, "Failed to resolve source path of %s: %m", src);
f7c18d3d 803 xsprintf(chased_src, "/proc/self/fd/%i", chased_src_fd);
6af52c3a 804
f7c18d3d
LB
805 if (fstat(chased_src_fd, &st) < 0)
806 return log_debug_errno(errno, "Failed to stat() resolved source path %s: %m", src);
6af52c3a 807 if (S_ISLNK(st.st_mode)) /* This shouldn't really happen, given that we just chased the symlinks above, but let's better be safe… */
f7c18d3d 808 return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Source directory %s can't be a symbolic link", src);
6af52c3a
LB
809
810 /* Our goal is to install a new bind mount into the container,
811 possibly read-only. This is irritatingly complex
812 unfortunately, currently.
813
814 First, we start by creating a private playground in /tmp,
815 that we can mount MS_SLAVE. (Which is necessary, since
816 MS_MOVE cannot be applied to mounts with MS_SHARED parent
817 mounts.) */
818
819 if (!mkdtemp(mount_slave))
820 return log_debug_errno(errno, "Failed to create playground %s: %m", mount_slave);
821
822 mount_slave_created = true;
823
824 r = mount_nofollow_verbose(LOG_DEBUG, mount_slave, mount_slave, NULL, MS_BIND, NULL);
825 if (r < 0)
826 goto finish;
827
828 mount_slave_mounted = true;
829
830 r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_slave, NULL, MS_SLAVE, NULL);
831 if (r < 0)
832 goto finish;
833
834 /* Second, we mount the source file or directory to a directory inside of our MS_SLAVE playground. */
835 mount_tmp = strjoina(mount_slave, "/mount");
70599967
LB
836 if (is_image)
837 r = mkdir_p(mount_tmp, 0700);
838 else
839 r = make_mount_point_inode_from_stat(&st, mount_tmp, 0700);
6af52c3a
LB
840 if (r < 0) {
841 log_debug_errno(r, "Failed to create temporary mount point %s: %m", mount_tmp);
842 goto finish;
843 }
844
845 mount_tmp_created = true;
846
70599967 847 if (is_image)
93f59701 848 r = verity_dissect_and_mount(chased_src, mount_tmp, options, NULL, NULL, NULL);
70599967
LB
849 else
850 r = mount_follow_verbose(LOG_DEBUG, chased_src, mount_tmp, NULL, MS_BIND, NULL);
6af52c3a
LB
851 if (r < 0)
852 goto finish;
853
854 mount_tmp_mounted = true;
855
856 /* Third, we remount the new bind mount read-only if requested. */
857 if (read_only) {
858 r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_tmp, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
859 if (r < 0)
860 goto finish;
861 }
862
863 /* Fourth, we move the new bind mount into the propagation directory. This way it will appear there read-only
864 * right-away. */
865
866 mount_outside = strjoina(propagate_path, "/XXXXXX");
70599967 867 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
868 r = mkdtemp(mount_outside) ? 0 : -errno;
869 else {
870 r = mkostemp_safe(mount_outside);
871 safe_close(r);
872 }
873 if (r < 0) {
874 log_debug_errno(r, "Cannot create propagation file or directory %s: %m", mount_outside);
875 goto finish;
876 }
877
878 mount_outside_created = true;
879
880 r = mount_nofollow_verbose(LOG_DEBUG, mount_tmp, mount_outside, NULL, MS_MOVE, NULL);
881 if (r < 0)
882 goto finish;
883
884 mount_outside_mounted = true;
885 mount_tmp_mounted = false;
886
70599967 887 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
888 (void) rmdir(mount_tmp);
889 else
890 (void) unlink(mount_tmp);
891 mount_tmp_created = false;
892
893 (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
894 mount_slave_mounted = false;
895
896 (void) rmdir(mount_slave);
897 mount_slave_created = false;
898
899 if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) {
900 log_debug_errno(errno, "Failed to create pipe: %m");
901 goto finish;
902 }
903
2338a175 904 r = namespace_fork("(sd-bindmnt)", "(sd-bindmnt-inner)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG,
98f654fd 905 pidns_fd, mntns_fd, -1, -1, root_fd, &child);
6af52c3a
LB
906 if (r < 0)
907 goto finish;
908 if (r == 0) {
2338a175 909 const char *mount_inside;
6af52c3a
LB
910
911 errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]);
912
6af52c3a 913 if (make_file_or_directory) {
70599967
LB
914 if (!is_image) {
915 (void) mkdir_parents(dest, 0755);
916 (void) make_mount_point_inode_from_stat(&st, dest, 0700);
917 } else
918 (void) mkdir_p(dest, 0755);
6af52c3a
LB
919 }
920
921 /* Fifth, move the mount to the right place inside */
922 mount_inside = strjoina(incoming_path, basename(mount_outside));
923 r = mount_nofollow_verbose(LOG_ERR, mount_inside, dest, NULL, MS_MOVE, NULL);
924 if (r < 0)
925 goto child_fail;
926
927 _exit(EXIT_SUCCESS);
928
929 child_fail:
930 (void) write(errno_pipe_fd[1], &r, sizeof(r));
931 errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
932
933 _exit(EXIT_FAILURE);
934 }
935
936 errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
937
938 r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0);
939 if (r < 0) {
940 log_debug_errno(r, "Failed to wait for child: %m");
941 goto finish;
942 }
943 if (r != EXIT_SUCCESS) {
944 if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r))
945 log_debug_errno(r, "Failed to mount: %m");
946 else
947 log_debug("Child failed.");
948 goto finish;
949 }
950
951finish:
952 if (mount_outside_mounted)
953 (void) umount_verbose(LOG_DEBUG, mount_outside, UMOUNT_NOFOLLOW);
954 if (mount_outside_created) {
70599967 955 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
956 (void) rmdir(mount_outside);
957 else
958 (void) unlink(mount_outside);
959 }
960
961 if (mount_tmp_mounted)
962 (void) umount_verbose(LOG_DEBUG, mount_tmp, UMOUNT_NOFOLLOW);
963 if (mount_tmp_created) {
70599967 964 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
965 (void) rmdir(mount_tmp);
966 else
967 (void) unlink(mount_tmp);
968 }
969
970 if (mount_slave_mounted)
971 (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
972 if (mount_slave_created)
973 (void) rmdir(mount_slave);
974
975 return r;
976}
70599967
LB
977
978int bind_mount_in_namespace(
979 pid_t target,
980 const char *propagate_path,
981 const char *incoming_path,
982 const char *src,
983 const char *dest,
984 bool read_only,
985 bool make_file_or_directory) {
986
987 return mount_in_namespace(target, propagate_path, incoming_path, src, dest, read_only, make_file_or_directory, NULL, false);
988}
989
990int mount_image_in_namespace(
991 pid_t target,
992 const char *propagate_path,
993 const char *incoming_path,
994 const char *src,
995 const char *dest,
996 bool read_only,
997 bool make_file_or_directory,
998 const MountOptions *options) {
999
1000 return mount_in_namespace(target, propagate_path, incoming_path, src, dest, read_only, make_file_or_directory, options, true);
1001}