]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/mount-util.c
nspawn: replace boolean --private-user-chown by enum
[thirdparty/systemd.git] / src / shared / mount-util.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
4349cd7c 2
11c3a366 3#include <errno.h>
70599967 4#include <linux/loop.h>
11c3a366 5#include <stdlib.h>
4349cd7c 6#include <sys/mount.h>
11c3a366 7#include <sys/stat.h>
4349cd7c 8#include <sys/statvfs.h>
11c3a366 9#include <unistd.h>
4349cd7c 10
b5efdb8a 11#include "alloc-util.h"
70599967 12#include "dissect-image.h"
9e7f941a 13#include "extract-word.h"
4349cd7c
LP
14#include "fd-util.h"
15#include "fileio.h"
e1873695 16#include "fs-util.h"
93cc7779 17#include "hashmap.h"
13dcfe46 18#include "libmount-util.h"
6af52c3a 19#include "mkdir.h"
4349cd7c 20#include "mount-util.h"
049af8ad 21#include "mountpoint-util.h"
2338a175 22#include "namespace-util.h"
4349cd7c
LP
23#include "parse-util.h"
24#include "path-util.h"
6af52c3a 25#include "process-util.h"
4349cd7c 26#include "set.h"
28126409 27#include "stat-util.h"
15a5e950 28#include "stdio-util.h"
4349cd7c 29#include "string-util.h"
6b7c9f8b 30#include "strv.h"
6af52c3a 31#include "tmpfile-util.h"
70599967 32#include "user-util.h"
4349cd7c 33
28126409
LP
34int mount_fd(const char *source,
35 int target_fd,
36 const char *filesystemtype,
37 unsigned long mountflags,
38 const void *data) {
39
40 char path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
41
42 xsprintf(path, "/proc/self/fd/%i", target_fd);
43 if (mount(source, path, filesystemtype, mountflags, data) < 0) {
44 if (errno != ENOENT)
45 return -errno;
46
47 /* ENOENT can mean two things: either that the source is missing, or that /proc/ isn't
48 * mounted. Check for the latter to generate better error messages. */
49 if (proc_mounted() == 0)
50 return -ENOSYS;
51
52 return -ENOENT;
53 }
54
55 return 0;
56}
57
58int mount_nofollow(
59 const char *source,
60 const char *target,
61 const char *filesystemtype,
62 unsigned long mountflags,
63 const void *data) {
64
65 _cleanup_close_ int fd = -1;
66
67 /* In almost all cases we want to manipulate the mount table without following symlinks, hence
68 * mount_nofollow() is usually the way to go. The only exceptions are environments where /proc/ is
69 * not available yet, since we need /proc/self/fd/ for this logic to work. i.e. during the early
70 * initialization of namespacing/container stuff where /proc is not yet mounted (and maybe even the
71 * fs to mount) we can only use traditional mount() directly.
72 *
73 * Note that this disables following only for the final component of the target, i.e symlinks within
74 * the path of the target are honoured, as are symlinks in the source path everywhere. */
75
76 fd = open(target, O_PATH|O_CLOEXEC|O_NOFOLLOW);
77 if (fd < 0)
78 return -errno;
79
80 return mount_fd(source, fd, filesystemtype, mountflags, data);
81}
82
4349cd7c 83int umount_recursive(const char *prefix, int flags) {
4349cd7c 84 int n = 0, r;
f8b1904f 85 bool again;
4349cd7c 86
9d0619de
LP
87 /* Try to umount everything recursively below a directory. Also, take care of stacked mounts, and
88 * keep unmounting them until they are gone. */
4349cd7c
LP
89
90 do {
13dcfe46
ZJS
91 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
92 _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
4349cd7c
LP
93
94 again = false;
4349cd7c 95
2f2d81d9 96 r = libmount_parse("/proc/self/mountinfo", NULL, &table, &iter);
fdeea3f4 97 if (r < 0)
13dcfe46 98 return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
35bbbf85 99
4349cd7c 100 for (;;) {
13dcfe46
ZJS
101 struct libmnt_fs *fs;
102 const char *path;
4349cd7c 103
13dcfe46
ZJS
104 r = mnt_table_next_fs(table, iter, &fs);
105 if (r == 1)
106 break;
107 if (r < 0)
108 return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
4349cd7c 109
13dcfe46
ZJS
110 path = mnt_fs_get_target(fs);
111 if (!path)
112 continue;
4349cd7c 113
13dcfe46 114 if (!path_startswith(path, prefix))
4349cd7c
LP
115 continue;
116
827ea521
LP
117 if (umount2(path, flags | UMOUNT_NOFOLLOW) < 0) {
118 log_debug_errno(errno, "Failed to umount %s, ignoring: %m", path);
4349cd7c
LP
119 continue;
120 }
121
13dcfe46 122 log_debug("Successfully unmounted %s", path);
6b7c9f8b 123
4349cd7c
LP
124 again = true;
125 n++;
126
127 break;
128 }
4349cd7c
LP
129 } while (again);
130
13dcfe46 131 return n;
4349cd7c
LP
132}
133
be3f3752 134/* Use this function only if you do not have direct access to /proc/self/mountinfo but the caller can open it
64e82c19
LP
135 * for you. This is the case when /proc is masked or not mounted. Otherwise, use bind_remount_recursive. */
136int bind_remount_recursive_with_mountinfo(
137 const char *prefix,
138 unsigned long new_flags,
139 unsigned long flags_mask,
6b000af4 140 char **deny_list,
64e82c19
LP
141 FILE *proc_self_mountinfo) {
142
ba8dced2 143 _cleanup_set_free_ Set *done = NULL;
670e8efd 144 unsigned n_tries = 0;
4349cd7c
LP
145 int r;
146
8403219f 147 assert(prefix);
ac9de0b3
TR
148 assert(proc_self_mountinfo);
149
ddc155b2
TM
150 /* Recursively remount a directory (and all its submounts) with desired flags (MS_READONLY,
151 * MS_NOSUID, MS_NOEXEC). If the directory is already mounted, we reuse the mount and simply mark it
152 * MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write operation), ditto for other flags. If it
153 * isn't we first make it one. Afterwards we apply (or remove) the flags to all submounts we can
154 * access, too. When mounts are stacked on the same mount point we only care for each individual
155 * "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We do
156 * not have any effect on future submounts that might get propagated, they might be writable
4b6ef527
LP
157 * etc. This includes future submounts that have been triggered via autofs. Also note that we can't
158 * operate atomically here. Mounts established while we process the tree might or might not get
159 * noticed and thus might or might not be covered.
6b7c9f8b 160 *
6b000af4
LP
161 * If the "deny_list" parameter is specified it may contain a list of subtrees to exclude from the
162 * remount operation. Note that we'll ignore the deny list for the top-level path. */
4349cd7c 163
4349cd7c 164 for (;;) {
13dcfe46
ZJS
165 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
166 _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
ba8dced2 167 _cleanup_hashmap_free_ Hashmap *todo = NULL;
4349cd7c 168 bool top_autofs = false;
4349cd7c 169
670e8efd
LP
170 if (n_tries++ >= 32) /* Let's not retry this loop forever */
171 return -EBUSY;
172
ac9de0b3 173 rewind(proc_self_mountinfo);
4349cd7c 174
e2857b3d 175 r = libmount_parse("/proc/self/mountinfo", proc_self_mountinfo, &table, &iter);
13dcfe46
ZJS
176 if (r < 0)
177 return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
4349cd7c 178
13dcfe46 179 for (;;) {
ba8dced2
LP
180 _cleanup_free_ char *d = NULL;
181 const char *path, *type, *opts;
182 unsigned long flags = 0;
13dcfe46 183 struct libmnt_fs *fs;
4349cd7c 184
13dcfe46 185 r = mnt_table_next_fs(table, iter, &fs);
d6bfab11 186 if (r == 1) /* EOF */
13dcfe46 187 break;
4349cd7c 188 if (r < 0)
13dcfe46 189 return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
4349cd7c 190
13dcfe46 191 path = mnt_fs_get_target(fs);
d6bfab11 192 if (!path)
6b7c9f8b
LP
193 continue;
194
c6111b85 195 if (!path_startswith(path, prefix))
13dcfe46
ZJS
196 continue;
197
d6bfab11
LP
198 type = mnt_fs_get_fstype(fs);
199 if (!type)
200 continue;
201
202 /* Let's ignore autofs mounts. If they aren't triggered yet, we want to avoid
203 * triggering them, as we don't make any guarantees for future submounts anyway. If
204 * they are already triggered, then we will find another entry for this. */
205 if (streq(type, "autofs")) {
206 top_autofs = top_autofs || path_equal(path, prefix);
207 continue;
208 }
209
210 if (set_contains(done, path))
211 continue;
212
6b000af4 213 /* Ignore this mount if it is deny-listed, but only if it isn't the top-level mount
13dcfe46 214 * we shall operate on. */
c6111b85 215 if (!path_equal(path, prefix)) {
6b000af4 216 bool deny_listed = false;
6b7c9f8b
LP
217 char **i;
218
6b000af4 219 STRV_FOREACH(i, deny_list) {
c6111b85 220 if (path_equal(*i, prefix))
6b7c9f8b
LP
221 continue;
222
c6111b85 223 if (!path_startswith(*i, prefix))
6b7c9f8b
LP
224 continue;
225
13dcfe46 226 if (path_startswith(path, *i)) {
6b000af4 227 deny_listed = true;
d6bfab11 228 log_debug("Not remounting %s deny-listed by %s, called for %s", path, *i, prefix);
6b7c9f8b
LP
229 break;
230 }
231 }
d6bfab11 232
6b000af4 233 if (deny_listed)
6b7c9f8b
LP
234 continue;
235 }
236
ba8dced2
LP
237 opts = mnt_fs_get_vfs_options(fs);
238 if (opts) {
239 r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP));
240 if (r < 0)
241 log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
242 }
243
244 d = strdup(path);
245 if (!d)
246 return -ENOMEM;
247
248 r = hashmap_ensure_put(&todo, &path_hash_ops_free, d, ULONG_TO_PTR(flags));
249 if (r == -EEXIST)
250 continue;
d6bfab11
LP
251 if (r < 0)
252 return r;
ba8dced2
LP
253 if (r > 0)
254 TAKE_PTR(d);
4349cd7c
LP
255 }
256
5c5753b9
LP
257 /* Check if the top-level directory was among what we have seen so far. For that check both
258 * 'done' and 'todo'. Also check 'top_autofs' because if the top-level dir is an autofs we'll
259 * not include it in either set but will set this bool. */
c6111b85 260 if (!set_contains(done, prefix) &&
ba8dced2 261 !(top_autofs || hashmap_contains(todo, prefix))) {
5c5753b9 262
6b7c9f8b 263 /* The prefix directory itself is not yet a mount, make it one. */
c6111b85 264 r = mount_nofollow(prefix, prefix, NULL, MS_BIND|MS_REC, NULL);
511a8cfe
LP
265 if (r < 0)
266 return r;
4349cd7c 267
5c5753b9
LP
268 /* Immediately rescan, so that we pick up the new mount's flags */
269 continue;
4349cd7c
LP
270 }
271
5c5753b9 272 /* If we have no submounts to process anymore, we are done */
ba8dced2 273 if (hashmap_isempty(todo))
5c5753b9
LP
274 return 0;
275
ba8dced2
LP
276 for (;;) {
277 unsigned long flags;
278 char *x = NULL;
279
280 /* Take the first mount from our list of mounts to still process */
281 flags = PTR_TO_ULONG(hashmap_steal_first_key_and_value(todo, (void**) &x));
282 if (!x)
283 break;
4349cd7c 284
ba8dced2 285 r = set_ensure_consume(&done, &path_hash_ops_free, x);
4c701096 286 if (IN_SET(r, 0, -EEXIST))
ba8dced2 287 continue; /* Already done */
4349cd7c
LP
288 if (r < 0)
289 return r;
290
ba8dced2
LP
291 /* Now, remount this with the new flags set, but exclude MS_RELATIME from it. (It's
292 * the default anyway, thus redundant, and in userns we'll get an error if we try to
293 * explicitly enable it) */
294 r = mount_nofollow(NULL, x, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL);
065b4774 295 if (r < 0) {
ba8dced2
LP
296 int q;
297
298 /* OK, so the remount of this entry failed. We'll ultimately ignore this in
299 * almost all cases (there are simply so many reasons why this can fail,
300 * think autofs, NFS, FUSE, …), but let's generate useful debug messages at
301 * the very least. */
302
303 q = path_is_mount_point(x, NULL, 0);
304 if (IN_SET(q, 0, -ENOENT)) {
305 /* Hmm, whaaaa? The mount point is not actually a mount point? Then
306 * it is either obstructed by a later mount or somebody has been
307 * racing against us and removed it. Either way the mount point
308 * doesn't matter to us, let's ignore it hence. */
309 log_debug_errno(r, "Mount point '%s' to remount is not a mount point anymore, ignoring remount failure: %m", x);
310 continue;
311 }
312 if (q < 0) /* Any other error on this? Just log and continue */
313 log_debug_errno(q, "Failed to determine whether '%s' is a mount point or not, ignoring: %m", x);
314
315 if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) == 0) { /* ignore MS_RELATIME while comparing */
316 log_debug_errno(r, "Couldn't remount '%s', but the flags already match what we want, hence ignoring: %m", x);
317 continue;
318 }
319
320 /* Make this fatal if this is the top-level mount */
321 if (path_equal(x, prefix))
065b4774
LP
322 return r;
323
ba8dced2
LP
324 /* If this is not the top-level mount, then handle this gracefully: log but
325 * otherwise ignore. With NFS, FUSE, autofs there are just too many reasons
326 * this might fail without a chance for us to do anything about it, let's
327 * hence be strict on the top-level mount and lenient on the inner ones. */
328 log_debug_errno(r, "Couldn't remount submount '%s' for unexpected reason, ignoring: %m", x);
ef454fd1
YW
329 continue;
330 }
98df8089 331
ba8dced2 332 log_debug("Remounted %s.", x);
4349cd7c
LP
333 }
334 }
335}
336
8403219f
LP
337int bind_remount_recursive(
338 const char *prefix,
339 unsigned long new_flags,
340 unsigned long flags_mask,
6b000af4 341 char **deny_list) {
8403219f 342
ac9de0b3 343 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
fdeea3f4 344 int r;
ac9de0b3 345
fdeea3f4
ZJS
346 r = fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo);
347 if (r < 0)
348 return r;
35bbbf85 349
6b000af4 350 return bind_remount_recursive_with_mountinfo(prefix, new_flags, flags_mask, deny_list, proc_self_mountinfo);
ac9de0b3
TR
351}
352
7cce68e1
LP
353int bind_remount_one_with_mountinfo(
354 const char *path,
355 unsigned long new_flags,
356 unsigned long flags_mask,
357 FILE *proc_self_mountinfo) {
358
359 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
2c5ff8ea
LP
360 unsigned long flags = 0;
361 struct libmnt_fs *fs;
362 const char *opts;
7cce68e1
LP
363 int r;
364
365 assert(path);
366 assert(proc_self_mountinfo);
367
368 rewind(proc_self_mountinfo);
369
370 table = mnt_new_table();
371 if (!table)
372 return -ENOMEM;
373
374 r = mnt_table_parse_stream(table, proc_self_mountinfo, "/proc/self/mountinfo");
375 if (r < 0)
376 return r;
377
2c5ff8ea 378 fs = mnt_table_find_target(table, path, MNT_ITER_FORWARD);
0338df47
LP
379 if (!fs) {
380 if (laccess(path, F_OK) < 0) /* Hmm, it's not in the mount table, but does it exist at all? */
381 return -errno;
382
2c5ff8ea 383 return -EINVAL; /* Not a mount point we recognize */
0338df47 384 }
2c5ff8ea
LP
385
386 opts = mnt_fs_get_vfs_options(fs);
387 if (opts) {
388 r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP));
389 if (r < 0)
390 log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
391 }
7cce68e1 392
2c5ff8ea 393 r = mount_nofollow(NULL, path, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL);
b23c6a64
LP
394 if (r < 0) {
395 if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) != 0) /* Ignore MS_RELATIME again,
396 * since kernel adds it in
397 * everywhere, because it's the
398 * default. */
399 return r;
400
401 /* Let's handle redundant remounts gracefully */
402 log_debug_errno(r, "Failed to remount '%s' but flags already match what we want, ignoring: %m", path);
403 }
7cce68e1
LP
404
405 return 0;
406}
407
4349cd7c
LP
/* Makes 'path' the new root: changes into it, moves the mount onto "/", chroots into the current
 * directory and finally changes to "/". Returns 0 on success, or the negative errno of the first step
 * that failed. */
int mount_move_root(const char *path) {
        assert(path);

        /* The steps are strictly ordered; short-circuit evaluation stops at the first failing call, so
         * errno still belongs to that call when we return it. */
        if (chdir(path) < 0 ||
            mount(path, "/", NULL, MS_MOVE, NULL) < 0 ||
            chroot(".") < 0 ||
            chdir("/") < 0)
                return -errno;

        return 0;
}
4e036b7a 425
3f2c0bec
LP
/* Unmounts 'path' over and over, so that all mounts stacked on that mount point are removed. Returns
 * 1 if at least one unmount succeeded before umount2() reported EINVAL, 0 if none did, and a negative
 * errno-style value if umount2() failed with any other error. */
int repeat_unmount(const char *path, int flags) {
        bool unmounted_any = false;

        assert(path);

        while (umount2(path, flags) >= 0)
                unmounted_any = true;

        /* EINVAL is what ends the loop normally, everything else is a real failure. */
        if (errno == EINVAL)
                return unmounted_any;

        return -errno;
}
c4b41707 446
48b747fa
LP
447int mode_to_inaccessible_node(
448 const char *runtime_dir,
449 mode_t mode,
450 char **ret) {
451
452 /* This function maps a node type to a corresponding inaccessible file node. These nodes are created
453 * during early boot by PID 1. In some cases we lacked the privs to create the character and block
454 * devices (maybe because we run in an userns environment, or miss CAP_SYS_MKNOD, or run with a
455 * devices policy that excludes device nodes with major and minor of 0), but that's fine, in that
456 * case we use an AF_UNIX file node instead, which is not the same, but close enough for most
457 * uses. And most importantly, the kernel allows bind mounts from socket nodes to any non-directory
458 * file nodes, and that's the most important thing that matters.
459 *
460 * Note that the runtime directory argument shall be the top-level runtime directory, i.e. /run/ if
461 * we operate in system context and $XDG_RUNTIME_DIR if we operate in user context. */
462
e5f10caf
AZ
463 _cleanup_free_ char *d = NULL;
464 const char *node = NULL;
e5f10caf 465
48b747fa
LP
466 assert(ret);
467
468 if (!runtime_dir)
469 runtime_dir = "/run";
fe80fcc7 470
c4b41707
AP
471 switch(mode & S_IFMT) {
472 case S_IFREG:
48b747fa 473 node = "/systemd/inaccessible/reg";
e5f10caf 474 break;
fe80fcc7 475
c4b41707 476 case S_IFDIR:
48b747fa 477 node = "/systemd/inaccessible/dir";
e5f10caf 478 break;
fe80fcc7 479
c4b41707 480 case S_IFCHR:
48b747fa 481 node = "/systemd/inaccessible/chr";
e5f10caf 482 break;
fe80fcc7 483
c4b41707 484 case S_IFBLK:
48b747fa 485 node = "/systemd/inaccessible/blk";
e5f10caf 486 break;
fe80fcc7 487
c4b41707 488 case S_IFIFO:
48b747fa 489 node = "/systemd/inaccessible/fifo";
e5f10caf 490 break;
fe80fcc7 491
c4b41707 492 case S_IFSOCK:
48b747fa 493 node = "/systemd/inaccessible/sock";
e5f10caf 494 break;
c4b41707 495 }
e5f10caf
AZ
496 if (!node)
497 return -EINVAL;
498
48b747fa
LP
499 d = path_join(runtime_dir, node);
500 if (!d)
501 return -ENOMEM;
502
cbed1dc8
LP
503 /* On new kernels unprivileged users are permitted to create 0:0 char device nodes (because they also
504 * act as whiteout inode for overlayfs), but no other char or block device nodes. On old kernels no
505 * device node whatsoever may be created by unprivileged processes. Hence, if the caller asks for the
506 * inaccessible block device node let's see if the block device node actually exists, and if not,
507 * fall back to the character device node. From there fall back to the socket device node. This means
508 * in the best case we'll get the right device node type — but if not we'll hopefully at least get a
509 * device node at all. */
510
511 if (S_ISBLK(mode) &&
512 access(d, F_OK) < 0 && errno == ENOENT) {
513 free(d);
514 d = path_join(runtime_dir, "/systemd/inaccessible/chr");
515 if (!d)
516 return -ENOMEM;
517 }
518
519 if (IN_SET(mode & S_IFMT, S_IFBLK, S_IFCHR) &&
520 access(d, F_OK) < 0 && errno == ENOENT) {
48b747fa
LP
521 free(d);
522 d = path_join(runtime_dir, "/systemd/inaccessible/sock");
523 if (!d)
524 return -ENOMEM;
525 }
e5f10caf 526
48b747fa 527 *ret = TAKE_PTR(d);
e5f10caf 528 return 0;
c4b41707 529}
60e76d48
ZJS
530
531#define FLAG(name) (flags & name ? STRINGIFY(name) "|" : "")
532static char* mount_flags_to_string(long unsigned flags) {
533 char *x;
534 _cleanup_free_ char *y = NULL;
535 long unsigned overflow;
536
537 overflow = flags & ~(MS_RDONLY |
538 MS_NOSUID |
539 MS_NODEV |
540 MS_NOEXEC |
541 MS_SYNCHRONOUS |
542 MS_REMOUNT |
543 MS_MANDLOCK |
544 MS_DIRSYNC |
545 MS_NOATIME |
546 MS_NODIRATIME |
547 MS_BIND |
548 MS_MOVE |
549 MS_REC |
550 MS_SILENT |
551 MS_POSIXACL |
552 MS_UNBINDABLE |
553 MS_PRIVATE |
554 MS_SLAVE |
555 MS_SHARED |
556 MS_RELATIME |
557 MS_KERNMOUNT |
558 MS_I_VERSION |
559 MS_STRICTATIME |
560 MS_LAZYTIME);
561
562 if (flags == 0 || overflow != 0)
563 if (asprintf(&y, "%lx", overflow) < 0)
564 return NULL;
565
566 x = strjoin(FLAG(MS_RDONLY),
567 FLAG(MS_NOSUID),
568 FLAG(MS_NODEV),
569 FLAG(MS_NOEXEC),
570 FLAG(MS_SYNCHRONOUS),
571 FLAG(MS_REMOUNT),
572 FLAG(MS_MANDLOCK),
573 FLAG(MS_DIRSYNC),
574 FLAG(MS_NOATIME),
575 FLAG(MS_NODIRATIME),
576 FLAG(MS_BIND),
577 FLAG(MS_MOVE),
578 FLAG(MS_REC),
579 FLAG(MS_SILENT),
580 FLAG(MS_POSIXACL),
581 FLAG(MS_UNBINDABLE),
582 FLAG(MS_PRIVATE),
583 FLAG(MS_SLAVE),
584 FLAG(MS_SHARED),
585 FLAG(MS_RELATIME),
586 FLAG(MS_KERNMOUNT),
587 FLAG(MS_I_VERSION),
588 FLAG(MS_STRICTATIME),
589 FLAG(MS_LAZYTIME),
605405c6 590 y);
60e76d48
ZJS
591 if (!x)
592 return NULL;
593 if (!y)
594 x[strlen(x) - 1] = '\0'; /* truncate the last | */
595 return x;
596}
597
511a8cfe 598int mount_verbose_full(
60e76d48
ZJS
599 int error_log_level,
600 const char *what,
601 const char *where,
602 const char *type,
603 unsigned long flags,
511a8cfe
LP
604 const char *options,
605 bool follow_symlink) {
60e76d48 606
6ef8df2b
YW
607 _cleanup_free_ char *fl = NULL, *o = NULL;
608 unsigned long f;
609 int r;
610
611 r = mount_option_mangle(options, flags, &f, &o);
612 if (r < 0)
613 return log_full_errno(error_log_level, r,
614 "Failed to mangle mount options %s: %m",
615 strempty(options));
60e76d48 616
6ef8df2b 617 fl = mount_flags_to_string(f);
60e76d48 618
6ef8df2b 619 if ((f & MS_REMOUNT) && !what && !type)
60e76d48 620 log_debug("Remounting %s (%s \"%s\")...",
6ef8df2b 621 where, strnull(fl), strempty(o));
60e76d48
ZJS
622 else if (!what && !type)
623 log_debug("Mounting %s (%s \"%s\")...",
6ef8df2b
YW
624 where, strnull(fl), strempty(o));
625 else if ((f & MS_BIND) && !type)
60e76d48 626 log_debug("Bind-mounting %s on %s (%s \"%s\")...",
6ef8df2b
YW
627 what, where, strnull(fl), strempty(o));
628 else if (f & MS_MOVE)
afe682bc 629 log_debug("Moving mount %s → %s (%s \"%s\")...",
6ef8df2b 630 what, where, strnull(fl), strempty(o));
60e76d48 631 else
3b493d94
LP
632 log_debug("Mounting %s (%s) on %s (%s \"%s\")...",
633 strna(what), strna(type), where, strnull(fl), strempty(o));
511a8cfe
LP
634
635 if (follow_symlink)
636 r = mount(what, where, type, f, o) < 0 ? -errno : 0;
637 else
638 r = mount_nofollow(what, where, type, f, o);
639 if (r < 0)
640 return log_full_errno(error_log_level, r,
3ccf6126
LP
641 "Failed to mount %s (type %s) on %s (%s \"%s\"): %m",
642 strna(what), strna(type), where, strnull(fl), strempty(o));
60e76d48
ZJS
643 return 0;
644}
645
30f5d104
LP
/* Wrapper around umount2() that announces the operation at debug level and logs a failure at
 * 'error_log_level'. Returns 0 on success, otherwise the value returned by log_full_errno(). */
int umount_verbose(
                int error_log_level,
                const char *what,
                int flags) {

        assert(what);

        log_debug("Umounting %s...", what);

        if (umount2(what, flags) >= 0)
                return 0;

        return log_full_errno(error_log_level, errno,
                              "Failed to unmount %s: %m", what);
}
83555251 661
9e7f941a
YW
662int mount_option_mangle(
663 const char *options,
664 unsigned long mount_flags,
665 unsigned long *ret_mount_flags,
666 char **ret_remaining_options) {
667
668 const struct libmnt_optmap *map;
669 _cleanup_free_ char *ret = NULL;
670 const char *p;
671 int r;
672
673 /* This extracts mount flags from the mount options, and store
674 * non-mount-flag options to '*ret_remaining_options'.
675 * E.g.,
676 * "rw,nosuid,nodev,relatime,size=1630748k,mode=700,uid=1000,gid=1000"
677 * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
678 * "size=1630748k,mode=700,uid=1000,gid=1000".
679 * See more examples in test-mount-utils.c.
680 *
681 * Note that if 'options' does not contain any non-mount-flag options,
5238e957 682 * then '*ret_remaining_options' is set to NULL instead of empty string.
9e7f941a
YW
683 * Note that this does not check validity of options stored in
684 * '*ret_remaining_options'.
685 * Note that if 'options' is NULL, then this just copies 'mount_flags'
686 * to '*ret_mount_flags'. */
687
688 assert(ret_mount_flags);
689 assert(ret_remaining_options);
690
691 map = mnt_get_builtin_optmap(MNT_LINUX_MAP);
692 if (!map)
693 return -EINVAL;
694
695 p = options;
696 for (;;) {
697 _cleanup_free_ char *word = NULL;
698 const struct libmnt_optmap *ent;
699
4ec85141 700 r = extract_first_word(&p, &word, ",", EXTRACT_UNQUOTE);
9e7f941a
YW
701 if (r < 0)
702 return r;
703 if (r == 0)
704 break;
705
706 for (ent = map; ent->name; ent++) {
707 /* All entries in MNT_LINUX_MAP do not take any argument.
708 * Thus, ent->name does not contain "=" or "[=]". */
709 if (!streq(word, ent->name))
710 continue;
711
712 if (!(ent->mask & MNT_INVERT))
713 mount_flags |= ent->id;
714 else if (mount_flags & ent->id)
715 mount_flags ^= ent->id;
716
717 break;
718 }
719
720 /* If 'word' is not a mount flag, then store it in '*ret_remaining_options'. */
c2bc710b 721 if (!ent->name && !strextend_with_separator(&ret, ",", word))
9e7f941a
YW
722 return -ENOMEM;
723 }
724
725 *ret_mount_flags = mount_flags;
ae2a15bc 726 *ret_remaining_options = TAKE_PTR(ret);
9e7f941a
YW
727
728 return 0;
729}
6af52c3a 730
70599967 731static int mount_in_namespace(
6af52c3a
LB
732 pid_t target,
733 const char *propagate_path,
734 const char *incoming_path,
735 const char *src,
736 const char *dest,
737 bool read_only,
70599967
LB
738 bool make_file_or_directory,
739 const MountOptions *options,
740 bool is_image) {
6af52c3a
LB
741
742 _cleanup_close_pair_ int errno_pipe_fd[2] = { -1, -1 };
f7c18d3d
LB
743 _cleanup_close_ int self_mntns_fd = -1, mntns_fd = -1, root_fd = -1, pidns_fd = -1, chased_src_fd = -1;
744 char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p,
745 chased_src[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
6af52c3a
LB
746 bool mount_slave_created = false, mount_slave_mounted = false,
747 mount_tmp_created = false, mount_tmp_mounted = false,
748 mount_outside_created = false, mount_outside_mounted = false;
2338a175 749 struct stat st, self_mntns_st;
6af52c3a
LB
750 pid_t child;
751 int r;
752
753 assert(target > 0);
754 assert(propagate_path);
755 assert(incoming_path);
756 assert(src);
757 assert(dest);
70599967 758 assert(!options || is_image);
6af52c3a 759
98f654fd 760 r = namespace_open(target, &pidns_fd, &mntns_fd, NULL, NULL, &root_fd);
2338a175
LB
761 if (r < 0)
762 return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m");
763
764 if (fstat(mntns_fd, &st) < 0)
765 return log_debug_errno(errno, "Failed to fstat mount namespace FD of target process: %m");
766
767 r = namespace_open(0, NULL, &self_mntns_fd, NULL, NULL, NULL);
768 if (r < 0)
769 return log_debug_errno(r, "Failed to retrieve FDs of systemd's namespace: %m");
770
771 if (fstat(self_mntns_fd, &self_mntns_st) < 0)
772 return log_debug_errno(errno, "Failed to fstat mount namespace FD of systemd: %m");
773
774 /* We can't add new mounts at runtime if the process wasn't started in a namespace */
775 if (st.st_ino == self_mntns_st.st_ino && st.st_dev == self_mntns_st.st_dev)
776 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to activate bind mount in target, not running in a mount namespace");
777
6af52c3a
LB
778 /* One day, when bind mounting /proc/self/fd/n works across
779 * namespace boundaries we should rework this logic to make
780 * use of it... */
781
782 p = strjoina(propagate_path, "/");
783 r = laccess(p, F_OK);
784 if (r < 0)
785 return log_debug_errno(r == -ENOENT ? SYNTHETIC_ERRNO(EOPNOTSUPP) : r, "Target does not allow propagation of mount points");
786
f7c18d3d 787 r = chase_symlinks(src, NULL, CHASE_TRAIL_SLASH, NULL, &chased_src_fd);
6af52c3a
LB
788 if (r < 0)
789 return log_debug_errno(r, "Failed to resolve source path of %s: %m", src);
f7c18d3d 790 xsprintf(chased_src, "/proc/self/fd/%i", chased_src_fd);
6af52c3a 791
f7c18d3d
LB
792 if (fstat(chased_src_fd, &st) < 0)
793 return log_debug_errno(errno, "Failed to stat() resolved source path %s: %m", src);
6af52c3a 794 if (S_ISLNK(st.st_mode)) /* This shouldn't really happen, given that we just chased the symlinks above, but let's better be safe… */
f7c18d3d 795 return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Source directory %s can't be a symbolic link", src);
6af52c3a
LB
796
797 /* Our goal is to install a new bind mount into the container,
798 possibly read-only. This is irritatingly complex
799 unfortunately, currently.
800
801 First, we start by creating a private playground in /tmp,
802 that we can mount MS_SLAVE. (Which is necessary, since
803 MS_MOVE cannot be applied to mounts with MS_SHARED parent
804 mounts.) */
805
806 if (!mkdtemp(mount_slave))
807 return log_debug_errno(errno, "Failed to create playground %s: %m", mount_slave);
808
809 mount_slave_created = true;
810
811 r = mount_nofollow_verbose(LOG_DEBUG, mount_slave, mount_slave, NULL, MS_BIND, NULL);
812 if (r < 0)
813 goto finish;
814
815 mount_slave_mounted = true;
816
817 r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_slave, NULL, MS_SLAVE, NULL);
818 if (r < 0)
819 goto finish;
820
821 /* Second, we mount the source file or directory to a directory inside of our MS_SLAVE playground. */
822 mount_tmp = strjoina(mount_slave, "/mount");
70599967
LB
823 if (is_image)
824 r = mkdir_p(mount_tmp, 0700);
825 else
826 r = make_mount_point_inode_from_stat(&st, mount_tmp, 0700);
6af52c3a
LB
827 if (r < 0) {
828 log_debug_errno(r, "Failed to create temporary mount point %s: %m", mount_tmp);
829 goto finish;
830 }
831
832 mount_tmp_created = true;
833
70599967 834 if (is_image)
93f59701 835 r = verity_dissect_and_mount(chased_src, mount_tmp, options, NULL, NULL, NULL);
70599967
LB
836 else
837 r = mount_follow_verbose(LOG_DEBUG, chased_src, mount_tmp, NULL, MS_BIND, NULL);
6af52c3a
LB
838 if (r < 0)
839 goto finish;
840
841 mount_tmp_mounted = true;
842
843 /* Third, we remount the new bind mount read-only if requested. */
844 if (read_only) {
845 r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_tmp, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
846 if (r < 0)
847 goto finish;
848 }
849
850 /* Fourth, we move the new bind mount into the propagation directory. This way it will appear there read-only
851 * right-away. */
852
853 mount_outside = strjoina(propagate_path, "/XXXXXX");
70599967 854 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
855 r = mkdtemp(mount_outside) ? 0 : -errno;
856 else {
857 r = mkostemp_safe(mount_outside);
858 safe_close(r);
859 }
860 if (r < 0) {
861 log_debug_errno(r, "Cannot create propagation file or directory %s: %m", mount_outside);
862 goto finish;
863 }
864
865 mount_outside_created = true;
866
867 r = mount_nofollow_verbose(LOG_DEBUG, mount_tmp, mount_outside, NULL, MS_MOVE, NULL);
868 if (r < 0)
869 goto finish;
870
871 mount_outside_mounted = true;
872 mount_tmp_mounted = false;
873
70599967 874 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
875 (void) rmdir(mount_tmp);
876 else
877 (void) unlink(mount_tmp);
878 mount_tmp_created = false;
879
880 (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
881 mount_slave_mounted = false;
882
883 (void) rmdir(mount_slave);
884 mount_slave_created = false;
885
886 if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) {
887 log_debug_errno(errno, "Failed to create pipe: %m");
888 goto finish;
889 }
890
2338a175 891 r = namespace_fork("(sd-bindmnt)", "(sd-bindmnt-inner)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG,
98f654fd 892 pidns_fd, mntns_fd, -1, -1, root_fd, &child);
6af52c3a
LB
893 if (r < 0)
894 goto finish;
895 if (r == 0) {
2338a175 896 const char *mount_inside;
6af52c3a
LB
897
898 errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]);
899
6af52c3a 900 if (make_file_or_directory) {
70599967
LB
901 if (!is_image) {
902 (void) mkdir_parents(dest, 0755);
903 (void) make_mount_point_inode_from_stat(&st, dest, 0700);
904 } else
905 (void) mkdir_p(dest, 0755);
6af52c3a
LB
906 }
907
908 /* Fifth, move the mount to the right place inside */
909 mount_inside = strjoina(incoming_path, basename(mount_outside));
910 r = mount_nofollow_verbose(LOG_ERR, mount_inside, dest, NULL, MS_MOVE, NULL);
911 if (r < 0)
912 goto child_fail;
913
914 _exit(EXIT_SUCCESS);
915
916 child_fail:
917 (void) write(errno_pipe_fd[1], &r, sizeof(r));
918 errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
919
920 _exit(EXIT_FAILURE);
921 }
922
923 errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
924
925 r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0);
926 if (r < 0) {
927 log_debug_errno(r, "Failed to wait for child: %m");
928 goto finish;
929 }
930 if (r != EXIT_SUCCESS) {
931 if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r))
932 log_debug_errno(r, "Failed to mount: %m");
933 else
934 log_debug("Child failed.");
935 goto finish;
936 }
937
938finish:
939 if (mount_outside_mounted)
940 (void) umount_verbose(LOG_DEBUG, mount_outside, UMOUNT_NOFOLLOW);
941 if (mount_outside_created) {
70599967 942 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
943 (void) rmdir(mount_outside);
944 else
945 (void) unlink(mount_outside);
946 }
947
948 if (mount_tmp_mounted)
949 (void) umount_verbose(LOG_DEBUG, mount_tmp, UMOUNT_NOFOLLOW);
950 if (mount_tmp_created) {
70599967 951 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
952 (void) rmdir(mount_tmp);
953 else
954 (void) unlink(mount_tmp);
955 }
956
957 if (mount_slave_mounted)
958 (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
959 if (mount_slave_created)
960 (void) rmdir(mount_slave);
961
962 return r;
963}
70599967
LB
964
/* Bind mounts 'src' on 'dest' inside the mount namespace of process 'target'. Thin wrapper around
 * mount_in_namespace() that passes no mount options and does not treat 'src' as an image. */
int bind_mount_in_namespace(
                pid_t target,
                const char *propagate_path,
                const char *incoming_path,
                const char *src,
                const char *dest,
                bool read_only,
                bool make_file_or_directory) {

        return mount_in_namespace(
                        target,
                        propagate_path,
                        incoming_path,
                        src,
                        dest,
                        read_only,
                        make_file_or_directory,
                        /* options= */ NULL,
                        /* is_image= */ false);
}
976
977int mount_image_in_namespace(
978 pid_t target,
979 const char *propagate_path,
980 const char *incoming_path,
981 const char *src,
982 const char *dest,
983 bool read_only,
984 bool make_file_or_directory,
985 const MountOptions *options) {
986
987 return mount_in_namespace(target, propagate_path, incoming_path, src, dest, read_only, make_file_or_directory, options, true);
988}