]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/mount-util.c
mountpoint-util: rebreak some comments
[thirdparty/systemd.git] / src / shared / mount-util.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
4349cd7c 2
11c3a366 3#include <errno.h>
70599967 4#include <linux/loop.h>
11c3a366 5#include <stdlib.h>
4349cd7c 6#include <sys/mount.h>
11c3a366 7#include <sys/stat.h>
4349cd7c 8#include <sys/statvfs.h>
11c3a366 9#include <unistd.h>
4349cd7c 10
b5efdb8a 11#include "alloc-util.h"
70599967 12#include "dissect-image.h"
9e7f941a 13#include "extract-word.h"
4349cd7c
LP
14#include "fd-util.h"
15#include "fileio.h"
e1873695 16#include "fs-util.h"
93cc7779 17#include "hashmap.h"
13dcfe46 18#include "libmount-util.h"
6af52c3a 19#include "mkdir.h"
4349cd7c 20#include "mount-util.h"
049af8ad 21#include "mountpoint-util.h"
2338a175 22#include "namespace-util.h"
4349cd7c
LP
23#include "parse-util.h"
24#include "path-util.h"
6af52c3a 25#include "process-util.h"
4349cd7c 26#include "set.h"
28126409 27#include "stat-util.h"
15a5e950 28#include "stdio-util.h"
4349cd7c 29#include "string-util.h"
6b7c9f8b 30#include "strv.h"
6af52c3a 31#include "tmpfile-util.h"
70599967 32#include "user-util.h"
4349cd7c 33
28126409
LP
34int mount_fd(const char *source,
35 int target_fd,
36 const char *filesystemtype,
37 unsigned long mountflags,
38 const void *data) {
39
40 char path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
41
42 xsprintf(path, "/proc/self/fd/%i", target_fd);
43 if (mount(source, path, filesystemtype, mountflags, data) < 0) {
44 if (errno != ENOENT)
45 return -errno;
46
47 /* ENOENT can mean two things: either that the source is missing, or that /proc/ isn't
48 * mounted. Check for the latter to generate better error messages. */
49 if (proc_mounted() == 0)
50 return -ENOSYS;
51
52 return -ENOENT;
53 }
54
55 return 0;
56}
57
58int mount_nofollow(
59 const char *source,
60 const char *target,
61 const char *filesystemtype,
62 unsigned long mountflags,
63 const void *data) {
64
65 _cleanup_close_ int fd = -1;
66
67 /* In almost all cases we want to manipulate the mount table without following symlinks, hence
68 * mount_nofollow() is usually the way to go. The only exceptions are environments where /proc/ is
69 * not available yet, since we need /proc/self/fd/ for this logic to work. i.e. during the early
70 * initialization of namespacing/container stuff where /proc is not yet mounted (and maybe even the
71 * fs to mount) we can only use traditional mount() directly.
72 *
73 * Note that this disables following only for the final component of the target, i.e symlinks within
74 * the path of the target are honoured, as are symlinks in the source path everywhere. */
75
76 fd = open(target, O_PATH|O_CLOEXEC|O_NOFOLLOW);
77 if (fd < 0)
78 return -errno;
79
80 return mount_fd(source, fd, filesystemtype, mountflags, data);
81}
82
4349cd7c 83int umount_recursive(const char *prefix, int flags) {
4349cd7c 84 int n = 0, r;
f8b1904f 85 bool again;
4349cd7c 86
9d0619de
LP
87 /* Try to umount everything recursively below a directory. Also, take care of stacked mounts, and
88 * keep unmounting them until they are gone. */
4349cd7c
LP
89
90 do {
13dcfe46
ZJS
91 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
92 _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
4349cd7c
LP
93
94 again = false;
4349cd7c 95
2f2d81d9 96 r = libmount_parse("/proc/self/mountinfo", NULL, &table, &iter);
fdeea3f4 97 if (r < 0)
13dcfe46 98 return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
35bbbf85 99
4349cd7c 100 for (;;) {
13dcfe46
ZJS
101 struct libmnt_fs *fs;
102 const char *path;
4349cd7c 103
13dcfe46
ZJS
104 r = mnt_table_next_fs(table, iter, &fs);
105 if (r == 1)
106 break;
107 if (r < 0)
108 return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
4349cd7c 109
13dcfe46
ZJS
110 path = mnt_fs_get_target(fs);
111 if (!path)
112 continue;
4349cd7c 113
13dcfe46 114 if (!path_startswith(path, prefix))
4349cd7c
LP
115 continue;
116
827ea521
LP
117 if (umount2(path, flags | UMOUNT_NOFOLLOW) < 0) {
118 log_debug_errno(errno, "Failed to umount %s, ignoring: %m", path);
4349cd7c
LP
119 continue;
120 }
121
13dcfe46 122 log_debug("Successfully unmounted %s", path);
6b7c9f8b 123
4349cd7c
LP
124 again = true;
125 n++;
126
127 break;
128 }
4349cd7c
LP
129 } while (again);
130
13dcfe46 131 return n;
4349cd7c
LP
132}
133
08b1f5c7
LP
134static int get_mount_flags(
135 struct libmnt_table *table,
136 const char *path,
137 unsigned long *ret) {
5012d567
LP
138
139 _cleanup_close_ int fd = -1;
08b1f5c7
LP
140 struct libmnt_fs *fs;
141 struct statvfs buf;
142 const char *opts;
5012d567 143 int r;
d34a4008 144
08b1f5c7
LP
145 /* Get the mount flags for the mountpoint at "path" from "table". We have a fallback using statvfs()
146 * in place (which provides us with mostly the same info), but it's just a fallback, since using it
5012d567
LP
147 * means triggering autofs or NFS mounts, which we'd rather avoid needlessly.
148 *
149 * This generally doesn't follow symlinks. */
08b1f5c7 150
d34a4008 151 fs = mnt_table_find_target(table, path, MNT_ITER_FORWARD);
38288f0b 152 if (!fs) {
08b1f5c7 153 log_debug("Could not find '%s' in mount table, ignoring.", path);
d34a4008
JU
154 goto fallback;
155 }
156
157 opts = mnt_fs_get_vfs_options(fs);
08b1f5c7
LP
158 if (!opts) {
159 *ret = 0;
160 return 0;
161 }
162
163 r = mnt_optstr_get_flags(opts, ret, mnt_get_builtin_optmap(MNT_LINUX_MAP));
d34a4008 164 if (r != 0) {
08b1f5c7 165 log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
d34a4008
JU
166 goto fallback;
167 }
4349cd7c 168
08b1f5c7
LP
169 /* MS_RELATIME is default and trying to set it in an unprivileged container causes EPERM */
170 *ret &= ~MS_RELATIME;
d34a4008
JU
171 return 0;
172
173fallback:
5012d567
LP
174 fd = open(path, O_PATH|O_CLOEXEC|O_NOFOLLOW);
175 if (fd < 0)
176 return -errno;
177
178 if (fstatvfs(fd, &buf) < 0)
4349cd7c 179 return -errno;
d34a4008 180
08b1f5c7
LP
181 /* The statvfs() flags and the mount flags mostly have the same values, but for some cases do
182 * not. Hence map the flags manually. (Strictly speaking, ST_RELATIME/MS_RELATIME is the most
183 * prominent one that doesn't match, but that's the one we mask away anyway, see above.) */
184
185 *ret =
186 FLAGS_SET(buf.f_flag, ST_RDONLY) * MS_RDONLY |
187 FLAGS_SET(buf.f_flag, ST_NODEV) * MS_NODEV |
188 FLAGS_SET(buf.f_flag, ST_NOEXEC) * MS_NOEXEC |
189 FLAGS_SET(buf.f_flag, ST_NOSUID) * MS_NOSUID |
190 FLAGS_SET(buf.f_flag, ST_NOATIME) * MS_NOATIME |
191 FLAGS_SET(buf.f_flag, ST_NODIRATIME) * MS_NODIRATIME;
192
4349cd7c
LP
193 return 0;
194}
195
be3f3752 196/* Use this function only if you do not have direct access to /proc/self/mountinfo but the caller can open it
64e82c19
LP
197 * for you. This is the case when /proc is masked or not mounted. Otherwise, use bind_remount_recursive. */
198int bind_remount_recursive_with_mountinfo(
199 const char *prefix,
200 unsigned long new_flags,
201 unsigned long flags_mask,
6b000af4 202 char **deny_list,
64e82c19
LP
203 FILE *proc_self_mountinfo) {
204
4349cd7c 205 _cleanup_set_free_free_ Set *done = NULL;
f3dab34d 206 _cleanup_free_ char *simplified = NULL;
4349cd7c
LP
207 int r;
208
8403219f 209 assert(prefix);
ac9de0b3
TR
210 assert(proc_self_mountinfo);
211
ddc155b2
TM
212 /* Recursively remount a directory (and all its submounts) with desired flags (MS_READONLY,
213 * MS_NOSUID, MS_NOEXEC). If the directory is already mounted, we reuse the mount and simply mark it
214 * MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write operation), ditto for other flags. If it
215 * isn't we first make it one. Afterwards we apply (or remove) the flags to all submounts we can
216 * access, too. When mounts are stacked on the same mount point we only care for each individual
217 * "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We do
218 * not have any effect on future submounts that might get propagated, they might be writable
219 * etc. This includes future submounts that have been triggered via autofs.
6b7c9f8b 220 *
6b000af4
LP
221 * If the "deny_list" parameter is specified it may contain a list of subtrees to exclude from the
222 * remount operation. Note that we'll ignore the deny list for the top-level path. */
4349cd7c 223
f3dab34d
LP
224 simplified = strdup(prefix);
225 if (!simplified)
4349cd7c
LP
226 return -ENOMEM;
227
f3dab34d 228 path_simplify(simplified, false);
4349cd7c 229
548f6937 230 done = set_new(&path_hash_ops);
4349cd7c
LP
231 if (!done)
232 return -ENOMEM;
233
234 for (;;) {
4349cd7c 235 _cleanup_set_free_free_ Set *todo = NULL;
13dcfe46
ZJS
236 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
237 _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
4349cd7c
LP
238 bool top_autofs = false;
239 char *x;
240 unsigned long orig_flags;
241
548f6937 242 todo = set_new(&path_hash_ops);
4349cd7c
LP
243 if (!todo)
244 return -ENOMEM;
245
ac9de0b3 246 rewind(proc_self_mountinfo);
4349cd7c 247
e2857b3d 248 r = libmount_parse("/proc/self/mountinfo", proc_self_mountinfo, &table, &iter);
13dcfe46
ZJS
249 if (r < 0)
250 return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
4349cd7c 251
13dcfe46
ZJS
252 for (;;) {
253 struct libmnt_fs *fs;
254 const char *path, *type;
4349cd7c 255
13dcfe46
ZJS
256 r = mnt_table_next_fs(table, iter, &fs);
257 if (r == 1)
258 break;
4349cd7c 259 if (r < 0)
13dcfe46 260 return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
4349cd7c 261
13dcfe46
ZJS
262 path = mnt_fs_get_target(fs);
263 type = mnt_fs_get_fstype(fs);
264 if (!path || !type)
6b7c9f8b
LP
265 continue;
266
f3dab34d 267 if (!path_startswith(path, simplified))
13dcfe46
ZJS
268 continue;
269
6b000af4 270 /* Ignore this mount if it is deny-listed, but only if it isn't the top-level mount
13dcfe46 271 * we shall operate on. */
f3dab34d 272 if (!path_equal(path, simplified)) {
6b000af4 273 bool deny_listed = false;
6b7c9f8b
LP
274 char **i;
275
6b000af4 276 STRV_FOREACH(i, deny_list) {
f3dab34d 277 if (path_equal(*i, simplified))
6b7c9f8b
LP
278 continue;
279
f3dab34d 280 if (!path_startswith(*i, simplified))
6b7c9f8b
LP
281 continue;
282
13dcfe46 283 if (path_startswith(path, *i)) {
6b000af4
LP
284 deny_listed = true;
285 log_debug("Not remounting %s deny-listed by %s, called for %s",
f3dab34d 286 path, *i, simplified);
6b7c9f8b
LP
287 break;
288 }
289 }
6b000af4 290 if (deny_listed)
6b7c9f8b
LP
291 continue;
292 }
293
4349cd7c
LP
294 /* Let's ignore autofs mounts. If they aren't
295 * triggered yet, we want to avoid triggering
296 * them, as we don't make any guarantees for
297 * future submounts anyway. If they are
298 * already triggered, then we will find
299 * another entry for this. */
300 if (streq(type, "autofs")) {
f3dab34d 301 top_autofs = top_autofs || path_equal(path, simplified);
4349cd7c
LP
302 continue;
303 }
304
13dcfe46 305 if (!set_contains(done, path)) {
be327321 306 r = set_put_strdup(&todo, path);
4349cd7c
LP
307 if (r < 0)
308 return r;
309 }
310 }
311
312 /* If we have no submounts to process anymore and if
313 * the root is either already done, or an autofs, we
314 * are done */
315 if (set_isempty(todo) &&
f3dab34d 316 (top_autofs || set_contains(done, simplified)))
4349cd7c
LP
317 return 0;
318
f3dab34d
LP
319 if (!set_contains(done, simplified) &&
320 !set_contains(todo, simplified)) {
6b7c9f8b 321 /* The prefix directory itself is not yet a mount, make it one. */
511a8cfe
LP
322 r = mount_nofollow(simplified, simplified, NULL, MS_BIND|MS_REC, NULL);
323 if (r < 0)
324 return r;
4349cd7c
LP
325
326 orig_flags = 0;
08b1f5c7 327 (void) get_mount_flags(table, simplified, &orig_flags);
4349cd7c 328
511a8cfe
LP
329 r = mount_nofollow(NULL, simplified, NULL, (orig_flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags, NULL);
330 if (r < 0)
331 return r;
4349cd7c 332
6b7c9f8b
LP
333 log_debug("Made top-level directory %s a mount point.", prefix);
334
be327321 335 r = set_put_strdup(&done, simplified);
4349cd7c
LP
336 if (r < 0)
337 return r;
338 }
339
340 while ((x = set_steal_first(todo))) {
341
342 r = set_consume(done, x);
4c701096 343 if (IN_SET(r, 0, -EEXIST))
4349cd7c
LP
344 continue;
345 if (r < 0)
346 return r;
347
6b7c9f8b 348 /* Deal with mount points that are obstructed by a later mount */
e1873695 349 r = path_is_mount_point(x, NULL, 0);
4c701096 350 if (IN_SET(r, 0, -ENOENT))
98df8089 351 continue;
065b4774
LP
352 if (r < 0) {
353 if (!ERRNO_IS_PRIVILEGE(r))
354 return r;
355
53c442ef
YW
356 /* Even if root user invoke this, submounts under private FUSE or NFS mount points
357 * may not be acceessed. E.g.,
358 *
359 * $ bindfs --no-allow-other ~/mnt/mnt ~/mnt/mnt
360 * $ bindfs --no-allow-other ~/mnt ~/mnt
361 *
362 * Then, root user cannot access the mount point ~/mnt/mnt.
363 * In such cases, the submounts are ignored, as we have no way to manage them. */
ef454fd1
YW
364 log_debug_errno(r, "Failed to determine '%s' is mount point or not, ignoring: %m", x);
365 continue;
366 }
98df8089
AC
367
368 /* Try to reuse the original flag set */
4349cd7c 369 orig_flags = 0;
08b1f5c7 370 (void) get_mount_flags(table, x, &orig_flags);
4349cd7c 371
511a8cfe
LP
372 r = mount_nofollow(NULL, x, NULL, (orig_flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags, NULL);
373 if (r < 0)
374 return r;
4349cd7c 375
6b7c9f8b 376 log_debug("Remounted %s read-only.", x);
4349cd7c
LP
377 }
378 }
379}
380
8403219f
LP
381int bind_remount_recursive(
382 const char *prefix,
383 unsigned long new_flags,
384 unsigned long flags_mask,
6b000af4 385 char **deny_list) {
8403219f 386
ac9de0b3 387 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
fdeea3f4 388 int r;
ac9de0b3 389
fdeea3f4
ZJS
390 r = fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo);
391 if (r < 0)
392 return r;
35bbbf85 393
6b000af4 394 return bind_remount_recursive_with_mountinfo(prefix, new_flags, flags_mask, deny_list, proc_self_mountinfo);
ac9de0b3
TR
395}
396
7cce68e1
LP
397int bind_remount_one_with_mountinfo(
398 const char *path,
399 unsigned long new_flags,
400 unsigned long flags_mask,
401 FILE *proc_self_mountinfo) {
402
403 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
404 unsigned long orig_flags = 0;
405 int r;
406
407 assert(path);
408 assert(proc_self_mountinfo);
409
410 rewind(proc_self_mountinfo);
411
412 table = mnt_new_table();
413 if (!table)
414 return -ENOMEM;
415
416 r = mnt_table_parse_stream(table, proc_self_mountinfo, "/proc/self/mountinfo");
417 if (r < 0)
418 return r;
419
420 /* Try to reuse the original flag set */
421 (void) get_mount_flags(table, path, &orig_flags);
422
511a8cfe
LP
423 r = mount_nofollow(NULL, path, NULL, (orig_flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags, NULL);
424 if (r < 0)
425 return r;
7cce68e1
LP
426
427 return 0;
428}
429
4349cd7c
LP
/* Makes 'path' the new root directory: changes into it, moves the mount at 'path' onto "/", chroots
 * into the current directory and finally changes to "/". Returns 0 on success, or the negative errno
 * of the first step that failed. */
int mount_move_root(const char *path) {
        assert(path);

        /* The steps are strictly ordered; evaluation stops at the first failing call, so errno still
         * belongs to that call when it is returned. */
        if (chdir(path) < 0 ||
            mount(path, "/", NULL, MS_MOVE, NULL) < 0 ||
            chroot(".") < 0 ||
            chdir("/") < 0)
                return -errno;

        return 0;
}
4e036b7a 447
3f2c0bec
LP
/* Unmounts 'path' over and over, so that all mounts stacked on that mount point are removed.
 * Returns 1 if at least one unmount succeeded, 0 if none did, and a negative errno-style value if
 * umount2() failed with anything other than EINVAL. */
int repeat_unmount(const char *path, int flags) {
        bool unmounted_any = false;

        assert(path);

        while (umount2(path, flags) >= 0)
                unmounted_any = true;

        /* EINVAL is treated as the regular end of the loop rather than as an error. */
        if (errno == EINVAL)
                return unmounted_any;

        return -errno;
}
c4b41707 468
48b747fa
LP
469int mode_to_inaccessible_node(
470 const char *runtime_dir,
471 mode_t mode,
472 char **ret) {
473
474 /* This function maps a node type to a corresponding inaccessible file node. These nodes are created
475 * during early boot by PID 1. In some cases we lacked the privs to create the character and block
476 * devices (maybe because we run in an userns environment, or miss CAP_SYS_MKNOD, or run with a
477 * devices policy that excludes device nodes with major and minor of 0), but that's fine, in that
478 * case we use an AF_UNIX file node instead, which is not the same, but close enough for most
479 * uses. And most importantly, the kernel allows bind mounts from socket nodes to any non-directory
480 * file nodes, and that's the most important thing that matters.
481 *
482 * Note that the runtime directory argument shall be the top-level runtime directory, i.e. /run/ if
483 * we operate in system context and $XDG_RUNTIME_DIR if we operate in user context. */
484
e5f10caf
AZ
485 _cleanup_free_ char *d = NULL;
486 const char *node = NULL;
e5f10caf 487
48b747fa
LP
488 assert(ret);
489
490 if (!runtime_dir)
491 runtime_dir = "/run";
fe80fcc7 492
c4b41707
AP
493 switch(mode & S_IFMT) {
494 case S_IFREG:
48b747fa 495 node = "/systemd/inaccessible/reg";
e5f10caf 496 break;
fe80fcc7 497
c4b41707 498 case S_IFDIR:
48b747fa 499 node = "/systemd/inaccessible/dir";
e5f10caf 500 break;
fe80fcc7 501
c4b41707 502 case S_IFCHR:
48b747fa 503 node = "/systemd/inaccessible/chr";
e5f10caf 504 break;
fe80fcc7 505
c4b41707 506 case S_IFBLK:
48b747fa 507 node = "/systemd/inaccessible/blk";
e5f10caf 508 break;
fe80fcc7 509
c4b41707 510 case S_IFIFO:
48b747fa 511 node = "/systemd/inaccessible/fifo";
e5f10caf 512 break;
fe80fcc7 513
c4b41707 514 case S_IFSOCK:
48b747fa 515 node = "/systemd/inaccessible/sock";
e5f10caf 516 break;
c4b41707 517 }
e5f10caf
AZ
518 if (!node)
519 return -EINVAL;
520
48b747fa
LP
521 d = path_join(runtime_dir, node);
522 if (!d)
523 return -ENOMEM;
524
cbed1dc8
LP
525 /* On new kernels unprivileged users are permitted to create 0:0 char device nodes (because they also
526 * act as whiteout inode for overlayfs), but no other char or block device nodes. On old kernels no
527 * device node whatsoever may be created by unprivileged processes. Hence, if the caller asks for the
528 * inaccessible block device node let's see if the block device node actually exists, and if not,
529 * fall back to the character device node. From there fall back to the socket device node. This means
530 * in the best case we'll get the right device node type — but if not we'll hopefully at least get a
531 * device node at all. */
532
533 if (S_ISBLK(mode) &&
534 access(d, F_OK) < 0 && errno == ENOENT) {
535 free(d);
536 d = path_join(runtime_dir, "/systemd/inaccessible/chr");
537 if (!d)
538 return -ENOMEM;
539 }
540
541 if (IN_SET(mode & S_IFMT, S_IFBLK, S_IFCHR) &&
542 access(d, F_OK) < 0 && errno == ENOENT) {
48b747fa
LP
543 free(d);
544 d = path_join(runtime_dir, "/systemd/inaccessible/sock");
545 if (!d)
546 return -ENOMEM;
547 }
e5f10caf 548
48b747fa 549 *ret = TAKE_PTR(d);
e5f10caf 550 return 0;
c4b41707 551}
60e76d48
ZJS
552
553#define FLAG(name) (flags & name ? STRINGIFY(name) "|" : "")
554static char* mount_flags_to_string(long unsigned flags) {
555 char *x;
556 _cleanup_free_ char *y = NULL;
557 long unsigned overflow;
558
559 overflow = flags & ~(MS_RDONLY |
560 MS_NOSUID |
561 MS_NODEV |
562 MS_NOEXEC |
563 MS_SYNCHRONOUS |
564 MS_REMOUNT |
565 MS_MANDLOCK |
566 MS_DIRSYNC |
567 MS_NOATIME |
568 MS_NODIRATIME |
569 MS_BIND |
570 MS_MOVE |
571 MS_REC |
572 MS_SILENT |
573 MS_POSIXACL |
574 MS_UNBINDABLE |
575 MS_PRIVATE |
576 MS_SLAVE |
577 MS_SHARED |
578 MS_RELATIME |
579 MS_KERNMOUNT |
580 MS_I_VERSION |
581 MS_STRICTATIME |
582 MS_LAZYTIME);
583
584 if (flags == 0 || overflow != 0)
585 if (asprintf(&y, "%lx", overflow) < 0)
586 return NULL;
587
588 x = strjoin(FLAG(MS_RDONLY),
589 FLAG(MS_NOSUID),
590 FLAG(MS_NODEV),
591 FLAG(MS_NOEXEC),
592 FLAG(MS_SYNCHRONOUS),
593 FLAG(MS_REMOUNT),
594 FLAG(MS_MANDLOCK),
595 FLAG(MS_DIRSYNC),
596 FLAG(MS_NOATIME),
597 FLAG(MS_NODIRATIME),
598 FLAG(MS_BIND),
599 FLAG(MS_MOVE),
600 FLAG(MS_REC),
601 FLAG(MS_SILENT),
602 FLAG(MS_POSIXACL),
603 FLAG(MS_UNBINDABLE),
604 FLAG(MS_PRIVATE),
605 FLAG(MS_SLAVE),
606 FLAG(MS_SHARED),
607 FLAG(MS_RELATIME),
608 FLAG(MS_KERNMOUNT),
609 FLAG(MS_I_VERSION),
610 FLAG(MS_STRICTATIME),
611 FLAG(MS_LAZYTIME),
605405c6 612 y);
60e76d48
ZJS
613 if (!x)
614 return NULL;
615 if (!y)
616 x[strlen(x) - 1] = '\0'; /* truncate the last | */
617 return x;
618}
619
511a8cfe 620int mount_verbose_full(
60e76d48
ZJS
621 int error_log_level,
622 const char *what,
623 const char *where,
624 const char *type,
625 unsigned long flags,
511a8cfe
LP
626 const char *options,
627 bool follow_symlink) {
60e76d48 628
6ef8df2b
YW
629 _cleanup_free_ char *fl = NULL, *o = NULL;
630 unsigned long f;
631 int r;
632
633 r = mount_option_mangle(options, flags, &f, &o);
634 if (r < 0)
635 return log_full_errno(error_log_level, r,
636 "Failed to mangle mount options %s: %m",
637 strempty(options));
60e76d48 638
6ef8df2b 639 fl = mount_flags_to_string(f);
60e76d48 640
6ef8df2b 641 if ((f & MS_REMOUNT) && !what && !type)
60e76d48 642 log_debug("Remounting %s (%s \"%s\")...",
6ef8df2b 643 where, strnull(fl), strempty(o));
60e76d48
ZJS
644 else if (!what && !type)
645 log_debug("Mounting %s (%s \"%s\")...",
6ef8df2b
YW
646 where, strnull(fl), strempty(o));
647 else if ((f & MS_BIND) && !type)
60e76d48 648 log_debug("Bind-mounting %s on %s (%s \"%s\")...",
6ef8df2b
YW
649 what, where, strnull(fl), strempty(o));
650 else if (f & MS_MOVE)
afe682bc 651 log_debug("Moving mount %s → %s (%s \"%s\")...",
6ef8df2b 652 what, where, strnull(fl), strempty(o));
60e76d48 653 else
3b493d94
LP
654 log_debug("Mounting %s (%s) on %s (%s \"%s\")...",
655 strna(what), strna(type), where, strnull(fl), strempty(o));
511a8cfe
LP
656
657 if (follow_symlink)
658 r = mount(what, where, type, f, o) < 0 ? -errno : 0;
659 else
660 r = mount_nofollow(what, where, type, f, o);
661 if (r < 0)
662 return log_full_errno(error_log_level, r,
3ccf6126
LP
663 "Failed to mount %s (type %s) on %s (%s \"%s\"): %m",
664 strna(what), strna(type), where, strnull(fl), strempty(o));
60e76d48
ZJS
665 return 0;
666}
667
30f5d104
LP
/* Unmounts 'what' via umount2() with the given 'flags', announcing the operation at debug level and
 * logging a failure at 'error_log_level'. Returns 0 on success, otherwise the value returned by the
 * error logging call. */
int umount_verbose(
                int error_log_level,
                const char *what,
                int flags) {

        assert(what);

        log_debug("Umounting %s...", what);

        if (umount2(what, flags) >= 0)
                return 0;

        return log_full_errno(error_log_level, errno,
                              "Failed to unmount %s: %m", what);
}
83555251 683
9e7f941a
YW
684int mount_option_mangle(
685 const char *options,
686 unsigned long mount_flags,
687 unsigned long *ret_mount_flags,
688 char **ret_remaining_options) {
689
690 const struct libmnt_optmap *map;
691 _cleanup_free_ char *ret = NULL;
692 const char *p;
693 int r;
694
695 /* This extracts mount flags from the mount options, and store
696 * non-mount-flag options to '*ret_remaining_options'.
697 * E.g.,
698 * "rw,nosuid,nodev,relatime,size=1630748k,mode=700,uid=1000,gid=1000"
699 * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
700 * "size=1630748k,mode=700,uid=1000,gid=1000".
701 * See more examples in test-mount-utils.c.
702 *
703 * Note that if 'options' does not contain any non-mount-flag options,
5238e957 704 * then '*ret_remaining_options' is set to NULL instead of empty string.
9e7f941a
YW
705 * Note that this does not check validity of options stored in
706 * '*ret_remaining_options'.
707 * Note that if 'options' is NULL, then this just copies 'mount_flags'
708 * to '*ret_mount_flags'. */
709
710 assert(ret_mount_flags);
711 assert(ret_remaining_options);
712
713 map = mnt_get_builtin_optmap(MNT_LINUX_MAP);
714 if (!map)
715 return -EINVAL;
716
717 p = options;
718 for (;;) {
719 _cleanup_free_ char *word = NULL;
720 const struct libmnt_optmap *ent;
721
4ec85141 722 r = extract_first_word(&p, &word, ",", EXTRACT_UNQUOTE);
9e7f941a
YW
723 if (r < 0)
724 return r;
725 if (r == 0)
726 break;
727
728 for (ent = map; ent->name; ent++) {
729 /* All entries in MNT_LINUX_MAP do not take any argument.
730 * Thus, ent->name does not contain "=" or "[=]". */
731 if (!streq(word, ent->name))
732 continue;
733
734 if (!(ent->mask & MNT_INVERT))
735 mount_flags |= ent->id;
736 else if (mount_flags & ent->id)
737 mount_flags ^= ent->id;
738
739 break;
740 }
741
742 /* If 'word' is not a mount flag, then store it in '*ret_remaining_options'. */
c2bc710b 743 if (!ent->name && !strextend_with_separator(&ret, ",", word))
9e7f941a
YW
744 return -ENOMEM;
745 }
746
747 *ret_mount_flags = mount_flags;
ae2a15bc 748 *ret_remaining_options = TAKE_PTR(ret);
9e7f941a
YW
749
750 return 0;
751}
6af52c3a 752
70599967 753static int mount_in_namespace(
6af52c3a
LB
754 pid_t target,
755 const char *propagate_path,
756 const char *incoming_path,
757 const char *src,
758 const char *dest,
759 bool read_only,
70599967
LB
760 bool make_file_or_directory,
761 const MountOptions *options,
762 bool is_image) {
6af52c3a
LB
763
764 _cleanup_close_pair_ int errno_pipe_fd[2] = { -1, -1 };
f7c18d3d
LB
765 _cleanup_close_ int self_mntns_fd = -1, mntns_fd = -1, root_fd = -1, pidns_fd = -1, chased_src_fd = -1;
766 char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p,
767 chased_src[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
6af52c3a
LB
768 bool mount_slave_created = false, mount_slave_mounted = false,
769 mount_tmp_created = false, mount_tmp_mounted = false,
770 mount_outside_created = false, mount_outside_mounted = false;
2338a175 771 struct stat st, self_mntns_st;
6af52c3a
LB
772 pid_t child;
773 int r;
774
775 assert(target > 0);
776 assert(propagate_path);
777 assert(incoming_path);
778 assert(src);
779 assert(dest);
70599967 780 assert(!options || is_image);
6af52c3a 781
98f654fd 782 r = namespace_open(target, &pidns_fd, &mntns_fd, NULL, NULL, &root_fd);
2338a175
LB
783 if (r < 0)
784 return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m");
785
786 if (fstat(mntns_fd, &st) < 0)
787 return log_debug_errno(errno, "Failed to fstat mount namespace FD of target process: %m");
788
789 r = namespace_open(0, NULL, &self_mntns_fd, NULL, NULL, NULL);
790 if (r < 0)
791 return log_debug_errno(r, "Failed to retrieve FDs of systemd's namespace: %m");
792
793 if (fstat(self_mntns_fd, &self_mntns_st) < 0)
794 return log_debug_errno(errno, "Failed to fstat mount namespace FD of systemd: %m");
795
796 /* We can't add new mounts at runtime if the process wasn't started in a namespace */
797 if (st.st_ino == self_mntns_st.st_ino && st.st_dev == self_mntns_st.st_dev)
798 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to activate bind mount in target, not running in a mount namespace");
799
6af52c3a
LB
800 /* One day, when bind mounting /proc/self/fd/n works across
801 * namespace boundaries we should rework this logic to make
802 * use of it... */
803
804 p = strjoina(propagate_path, "/");
805 r = laccess(p, F_OK);
806 if (r < 0)
807 return log_debug_errno(r == -ENOENT ? SYNTHETIC_ERRNO(EOPNOTSUPP) : r, "Target does not allow propagation of mount points");
808
f7c18d3d 809 r = chase_symlinks(src, NULL, CHASE_TRAIL_SLASH, NULL, &chased_src_fd);
6af52c3a
LB
810 if (r < 0)
811 return log_debug_errno(r, "Failed to resolve source path of %s: %m", src);
f7c18d3d 812 xsprintf(chased_src, "/proc/self/fd/%i", chased_src_fd);
6af52c3a 813
f7c18d3d
LB
814 if (fstat(chased_src_fd, &st) < 0)
815 return log_debug_errno(errno, "Failed to stat() resolved source path %s: %m", src);
6af52c3a 816 if (S_ISLNK(st.st_mode)) /* This shouldn't really happen, given that we just chased the symlinks above, but let's better be safe… */
f7c18d3d 817 return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Source directory %s can't be a symbolic link", src);
6af52c3a
LB
818
819 /* Our goal is to install a new bind mount into the container,
820 possibly read-only. This is irritatingly complex
821 unfortunately, currently.
822
823 First, we start by creating a private playground in /tmp,
824 that we can mount MS_SLAVE. (Which is necessary, since
825 MS_MOVE cannot be applied to mounts with MS_SHARED parent
826 mounts.) */
827
828 if (!mkdtemp(mount_slave))
829 return log_debug_errno(errno, "Failed to create playground %s: %m", mount_slave);
830
831 mount_slave_created = true;
832
833 r = mount_nofollow_verbose(LOG_DEBUG, mount_slave, mount_slave, NULL, MS_BIND, NULL);
834 if (r < 0)
835 goto finish;
836
837 mount_slave_mounted = true;
838
839 r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_slave, NULL, MS_SLAVE, NULL);
840 if (r < 0)
841 goto finish;
842
843 /* Second, we mount the source file or directory to a directory inside of our MS_SLAVE playground. */
844 mount_tmp = strjoina(mount_slave, "/mount");
70599967
LB
845 if (is_image)
846 r = mkdir_p(mount_tmp, 0700);
847 else
848 r = make_mount_point_inode_from_stat(&st, mount_tmp, 0700);
6af52c3a
LB
849 if (r < 0) {
850 log_debug_errno(r, "Failed to create temporary mount point %s: %m", mount_tmp);
851 goto finish;
852 }
853
854 mount_tmp_created = true;
855
70599967 856 if (is_image)
93f59701 857 r = verity_dissect_and_mount(chased_src, mount_tmp, options, NULL, NULL, NULL);
70599967
LB
858 else
859 r = mount_follow_verbose(LOG_DEBUG, chased_src, mount_tmp, NULL, MS_BIND, NULL);
6af52c3a
LB
860 if (r < 0)
861 goto finish;
862
863 mount_tmp_mounted = true;
864
865 /* Third, we remount the new bind mount read-only if requested. */
866 if (read_only) {
867 r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_tmp, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
868 if (r < 0)
869 goto finish;
870 }
871
872 /* Fourth, we move the new bind mount into the propagation directory. This way it will appear there read-only
873 * right-away. */
874
875 mount_outside = strjoina(propagate_path, "/XXXXXX");
70599967 876 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
877 r = mkdtemp(mount_outside) ? 0 : -errno;
878 else {
879 r = mkostemp_safe(mount_outside);
880 safe_close(r);
881 }
882 if (r < 0) {
883 log_debug_errno(r, "Cannot create propagation file or directory %s: %m", mount_outside);
884 goto finish;
885 }
886
887 mount_outside_created = true;
888
889 r = mount_nofollow_verbose(LOG_DEBUG, mount_tmp, mount_outside, NULL, MS_MOVE, NULL);
890 if (r < 0)
891 goto finish;
892
893 mount_outside_mounted = true;
894 mount_tmp_mounted = false;
895
70599967 896 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
897 (void) rmdir(mount_tmp);
898 else
899 (void) unlink(mount_tmp);
900 mount_tmp_created = false;
901
902 (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
903 mount_slave_mounted = false;
904
905 (void) rmdir(mount_slave);
906 mount_slave_created = false;
907
908 if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) {
909 log_debug_errno(errno, "Failed to create pipe: %m");
910 goto finish;
911 }
912
2338a175 913 r = namespace_fork("(sd-bindmnt)", "(sd-bindmnt-inner)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG,
98f654fd 914 pidns_fd, mntns_fd, -1, -1, root_fd, &child);
6af52c3a
LB
915 if (r < 0)
916 goto finish;
917 if (r == 0) {
2338a175 918 const char *mount_inside;
6af52c3a
LB
919
920 errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]);
921
6af52c3a 922 if (make_file_or_directory) {
70599967
LB
923 if (!is_image) {
924 (void) mkdir_parents(dest, 0755);
925 (void) make_mount_point_inode_from_stat(&st, dest, 0700);
926 } else
927 (void) mkdir_p(dest, 0755);
6af52c3a
LB
928 }
929
930 /* Fifth, move the mount to the right place inside */
931 mount_inside = strjoina(incoming_path, basename(mount_outside));
932 r = mount_nofollow_verbose(LOG_ERR, mount_inside, dest, NULL, MS_MOVE, NULL);
933 if (r < 0)
934 goto child_fail;
935
936 _exit(EXIT_SUCCESS);
937
938 child_fail:
939 (void) write(errno_pipe_fd[1], &r, sizeof(r));
940 errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
941
942 _exit(EXIT_FAILURE);
943 }
944
945 errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
946
947 r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0);
948 if (r < 0) {
949 log_debug_errno(r, "Failed to wait for child: %m");
950 goto finish;
951 }
952 if (r != EXIT_SUCCESS) {
953 if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r))
954 log_debug_errno(r, "Failed to mount: %m");
955 else
956 log_debug("Child failed.");
957 goto finish;
958 }
959
960finish:
961 if (mount_outside_mounted)
962 (void) umount_verbose(LOG_DEBUG, mount_outside, UMOUNT_NOFOLLOW);
963 if (mount_outside_created) {
70599967 964 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
965 (void) rmdir(mount_outside);
966 else
967 (void) unlink(mount_outside);
968 }
969
970 if (mount_tmp_mounted)
971 (void) umount_verbose(LOG_DEBUG, mount_tmp, UMOUNT_NOFOLLOW);
972 if (mount_tmp_created) {
70599967 973 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
974 (void) rmdir(mount_tmp);
975 else
976 (void) unlink(mount_tmp);
977 }
978
979 if (mount_slave_mounted)
980 (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
981 if (mount_slave_created)
982 (void) rmdir(mount_slave);
983
984 return r;
985}
70599967
LB
986
987int bind_mount_in_namespace(
988 pid_t target,
989 const char *propagate_path,
990 const char *incoming_path,
991 const char *src,
992 const char *dest,
993 bool read_only,
994 bool make_file_or_directory) {
995
996 return mount_in_namespace(target, propagate_path, incoming_path, src, dest, read_only, make_file_or_directory, NULL, false);
997}
998
999int mount_image_in_namespace(
1000 pid_t target,
1001 const char *propagate_path,
1002 const char *incoming_path,
1003 const char *src,
1004 const char *dest,
1005 bool read_only,
1006 bool make_file_or_directory,
1007 const MountOptions *options) {
1008
1009 return mount_in_namespace(target, propagate_path, incoming_path, src, dest, read_only, make_file_or_directory, options, true);
1010}