]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/mount-util.c
mkosi: Use kernel-kvmsmall package on opensuse
[thirdparty/systemd.git] / src / shared / mount-util.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
4349cd7c 2
11c3a366
TA
3#include <errno.h>
4#include <stdlib.h>
4349cd7c 5#include <sys/mount.h>
11c3a366 6#include <sys/stat.h>
4349cd7c 7#include <sys/statvfs.h>
11c3a366 8#include <unistd.h>
35fd3558 9#include <linux/loop.h>
3657d3a0 10#if WANT_LINUX_FS_H
35fd3558 11#include <linux/fs.h>
3657d3a0 12#endif
4349cd7c 13
b5efdb8a 14#include "alloc-util.h"
f461a28d 15#include "chase.h"
70599967 16#include "dissect-image.h"
45a68ed3 17#include "exec-util.h"
9e7f941a 18#include "extract-word.h"
4349cd7c
LP
19#include "fd-util.h"
20#include "fileio.h"
e1873695 21#include "fs-util.h"
e2341b6b 22#include "glyph-util.h"
93cc7779 23#include "hashmap.h"
2e776ed6 24#include "initrd-util.h"
9c653536 25#include "label.h"
13dcfe46 26#include "libmount-util.h"
1c092b62 27#include "missing_mount.h"
35fd3558 28#include "missing_syscall.h"
35cd0ba5 29#include "mkdir-label.h"
4349cd7c 30#include "mount-util.h"
049af8ad 31#include "mountpoint-util.h"
2338a175 32#include "namespace-util.h"
4349cd7c
LP
33#include "parse-util.h"
34#include "path-util.h"
6af52c3a 35#include "process-util.h"
4349cd7c 36#include "set.h"
f63a2c48 37#include "sort-util.h"
28126409 38#include "stat-util.h"
15a5e950 39#include "stdio-util.h"
57c10a56 40#include "string-table.h"
4349cd7c 41#include "string-util.h"
6b7c9f8b 42#include "strv.h"
6af52c3a 43#include "tmpfile-util.h"
70599967 44#include "user-util.h"
4349cd7c 45
4349cd7c 46int umount_recursive(const char *prefix, int flags) {
4349cd7c 47 int n = 0, r;
f8b1904f 48 bool again;
4349cd7c 49
9d0619de
LP
50 /* Try to umount everything recursively below a directory. Also, take care of stacked mounts, and
51 * keep unmounting them until they are gone. */
4349cd7c
LP
52
53 do {
13dcfe46
ZJS
54 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
55 _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
4349cd7c
LP
56
57 again = false;
4349cd7c 58
2f2d81d9 59 r = libmount_parse("/proc/self/mountinfo", NULL, &table, &iter);
fdeea3f4 60 if (r < 0)
13dcfe46 61 return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
35bbbf85 62
4349cd7c 63 for (;;) {
13dcfe46
ZJS
64 struct libmnt_fs *fs;
65 const char *path;
4349cd7c 66
13dcfe46
ZJS
67 r = mnt_table_next_fs(table, iter, &fs);
68 if (r == 1)
69 break;
70 if (r < 0)
71 return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
4349cd7c 72
13dcfe46
ZJS
73 path = mnt_fs_get_target(fs);
74 if (!path)
75 continue;
4349cd7c 76
13dcfe46 77 if (!path_startswith(path, prefix))
4349cd7c
LP
78 continue;
79
827ea521
LP
80 if (umount2(path, flags | UMOUNT_NOFOLLOW) < 0) {
81 log_debug_errno(errno, "Failed to umount %s, ignoring: %m", path);
4349cd7c
LP
82 continue;
83 }
84
13dcfe46 85 log_debug("Successfully unmounted %s", path);
6b7c9f8b 86
4349cd7c
LP
87 again = true;
88 n++;
89
90 break;
91 }
4349cd7c
LP
92 } while (again);
93
13dcfe46 94 return n;
4349cd7c
LP
95}
96
4f5644db
LP
97#define MS_CONVERTIBLE_FLAGS (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_NOSYMFOLLOW)
98
99static uint64_t ms_flags_to_mount_attr(unsigned long a) {
100 uint64_t f = 0;
101
102 if (FLAGS_SET(a, MS_RDONLY))
103 f |= MOUNT_ATTR_RDONLY;
104
105 if (FLAGS_SET(a, MS_NOSUID))
106 f |= MOUNT_ATTR_NOSUID;
107
108 if (FLAGS_SET(a, MS_NODEV))
109 f |= MOUNT_ATTR_NODEV;
110
111 if (FLAGS_SET(a, MS_NOEXEC))
112 f |= MOUNT_ATTR_NOEXEC;
113
114 if (FLAGS_SET(a, MS_NOSYMFOLLOW))
115 f |= MOUNT_ATTR_NOSYMFOLLOW;
116
117 return f;
118}
119
120static bool skip_mount_set_attr = false;
121
/* Use this function only if you do not have direct access to /proc/self/mountinfo but the caller can open it
 * for you. This is the case when /proc is masked or not mounted. Otherwise, use bind_remount_recursive.
 *
 * Parameters:
 *   prefix              – top of the tree to remount; must not be NULL.
 *   new_flags           – MS_* flags to apply to every mount at or below 'prefix'.
 *   flags_mask          – the MS_* flags this call is in charge of; in the classic remount path the
 *                         existing flags of a mount outside of this mask are carried over unchanged.
 *   deny_list           – optional list of subtrees to leave alone (ignored for 'prefix' itself).
 *   proc_self_mountinfo – mount table stream to read (it is rewound before every pass), or NULL, in
 *                         which case /proc/self/mountinfo is opened here and closed on return.
 *
 * Returns 0 on success, -EBUSY if 32 passes over the mount table did not complete the job, -ENOMEM on
 * allocation failure, or another negative error propagated from the helpers called below. */
int bind_remount_recursive_with_mountinfo(
                const char *prefix,
                unsigned long new_flags,
                unsigned long flags_mask,
                char **deny_list,
                FILE *proc_self_mountinfo) {

        _cleanup_fclose_ FILE *proc_self_mountinfo_opened = NULL;
        _cleanup_set_free_ Set *done = NULL;
        unsigned n_tries = 0;
        int r;

        assert(prefix);

        if ((flags_mask & ~MS_CONVERTIBLE_FLAGS) == 0 && strv_isempty(deny_list) && !skip_mount_set_attr) {
                /* Let's take a shortcut for all the flags we know how to convert into mount_setattr() flags */

                if (mount_setattr(AT_FDCWD, prefix, AT_SYMLINK_NOFOLLOW|AT_RECURSIVE,
                                  &(struct mount_attr) {
                                          .attr_set = ms_flags_to_mount_attr(new_flags & flags_mask),
                                          .attr_clr = ms_flags_to_mount_attr(~new_flags & flags_mask),
                                  }, MOUNT_ATTR_SIZE_VER0) < 0) {

                        log_debug_errno(errno, "mount_setattr() failed, falling back to classic remounting: %m");

                        /* We fall through to classic behaviour if not supported (i.e. kernel < 5.12). We
                         * also do this for all other kinds of errors since they are so many different, and
                         * mount_setattr() has no graceful mode where it continues despite seeing errors on
                         * some mounts, but we want that. Moreover mount_setattr() only works on the mount
                         * point inode itself, not a non-mount point inode, and we want to support arbitrary
                         * prefixes here. */

                        if (ERRNO_IS_NOT_SUPPORTED(errno)) /* if not supported, then don't bother at all anymore */
                                skip_mount_set_attr = true;
                } else
                        return 0; /* Nice, this worked! */
        }

        /* The caller didn't hand us a mount table stream, hence open our own. It is tracked in the
         * _cleanup_fclose_ variable so that it is closed on every return path. */
        if (!proc_self_mountinfo) {
                r = fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo_opened);
                if (r < 0)
                        return r;

                proc_self_mountinfo = proc_self_mountinfo_opened;
        }

        /* Recursively remount a directory (and all its submounts) with desired flags (MS_RDONLY,
         * MS_NOSUID, MS_NOEXEC). If the directory is already mounted, we reuse the mount and simply mark it
         * MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write operation), ditto for other flags. If it
         * isn't we first make it one. Afterwards we apply (or remove) the flags to all submounts we can
         * access, too. When mounts are stacked on the same mount point we only care for each individual
         * "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We do
         * not have any effect on future submounts that might get propagated, they might be writable
         * etc. This includes future submounts that have been triggered via autofs. Also note that we can't
         * operate atomically here. Mounts established while we process the tree might or might not get
         * noticed and thus might or might not be covered.
         *
         * If the "deny_list" parameter is specified it may contain a list of subtrees to exclude from the
         * remount operation. Note that we'll ignore the deny list for the top-level path. */

        /* Each iteration of this loop is one full pass: re-read the mount table, collect the mounts not
         * handled yet in 'todo', then remount them one by one and record them in 'done'. The loop ends
         * when a pass finds nothing new to do. */
        for (;;) {
                _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
                _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
                _cleanup_hashmap_free_ Hashmap *todo = NULL;
                bool top_autofs = false;

                if (n_tries++ >= 32) /* Let's not retry this loop forever */
                        return -EBUSY;

                rewind(proc_self_mountinfo);

                r = libmount_parse("/proc/self/mountinfo", proc_self_mountinfo, &table, &iter);
                if (r < 0)
                        return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");

                /* First phase: iterate through the mount table and pick the entries to remount. */
                for (;;) {
                        _cleanup_free_ char *d = NULL;
                        const char *path, *type, *opts;
                        unsigned long flags = 0;
                        struct libmnt_fs *fs;

                        r = mnt_table_next_fs(table, iter, &fs);
                        if (r == 1) /* EOF */
                                break;
                        if (r < 0)
                                return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");

                        path = mnt_fs_get_target(fs);
                        if (!path)
                                continue;

                        if (!path_startswith(path, prefix))
                                continue;

                        type = mnt_fs_get_fstype(fs);
                        if (!type)
                                continue;

                        /* Let's ignore autofs mounts. If they aren't triggered yet, we want to avoid
                         * triggering them, as we don't make any guarantees for future submounts anyway. If
                         * they are already triggered, then we will find another entry for this. */
                        if (streq(type, "autofs")) {
                                top_autofs = top_autofs || path_equal(path, prefix);
                                continue;
                        }

                        if (set_contains(done, path))
                                continue;

                        /* Ignore this mount if it is deny-listed, but only if it isn't the top-level mount
                         * we shall operate on. */
                        if (!path_equal(path, prefix)) {
                                bool deny_listed = false;

                                STRV_FOREACH(i, deny_list) {
                                        /* Deny list entries that are the prefix itself or that lie
                                         * outside of the prefix are not considered at all. */
                                        if (path_equal(*i, prefix))
                                                continue;

                                        if (!path_startswith(*i, prefix))
                                                continue;

                                        if (path_startswith(path, *i)) {
                                                deny_listed = true;
                                                log_debug("Not remounting %s deny-listed by %s, called for %s", path, *i, prefix);
                                                break;
                                        }
                                }

                                if (deny_listed)
                                        continue;
                        }

                        /* Determine the flags the mount currently has. If that fails we go on with
                         * whatever ended up in 'flags' (initialized to 0 above). */
                        opts = mnt_fs_get_vfs_options(fs);
                        if (opts) {
                                r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP));
                                if (r < 0)
                                        log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
                        }

                        d = strdup(path);
                        if (!d)
                                return -ENOMEM;

                        r = hashmap_ensure_put(&todo, &path_hash_ops_free, d, ULONG_TO_PTR(flags));
                        if (r == -EEXIST)
                                /* If the same path was recorded, but with different mount flags, update it:
                                 * it means a mount point is overmounted, and libmount returns the "bottom" (or
                                 * older one) first, but we want to reapply the flags from the "top" (or newer
                                 * one). See: https://github.com/systemd/systemd/issues/20032
                                 * Note that this shouldn't really fail, as we were just told that the key
                                 * exists, and it's an update so we want 'd' to be freed immediately. */
                                r = hashmap_update(todo, d, ULONG_TO_PTR(flags));
                        if (r < 0)
                                return r;
                        if (r > 0) /* presumably: key was newly stored, the hashmap owns 'd' now */
                                TAKE_PTR(d);
                }

                /* Check if the top-level directory was among what we have seen so far. For that check both
                 * 'done' and 'todo'. Also check 'top_autofs' because if the top-level dir is an autofs we'll
                 * not include it in either set but will set this bool. */
                if (!set_contains(done, prefix) &&
                    !(top_autofs || hashmap_contains(todo, prefix))) {

                        /* The prefix directory itself is not yet a mount, make it one. */
                        r = mount_nofollow(prefix, prefix, NULL, MS_BIND|MS_REC, NULL);
                        if (r < 0)
                                return r;

                        /* Immediately rescan, so that we pick up the new mount's flags */
                        continue;
                }

                /* If we have no submounts to process anymore, we are done */
                if (hashmap_isempty(todo))
                        return 0;

                /* Second phase: remount everything collected above. */
                for (;;) {
                        unsigned long flags;
                        char *x = NULL;

                        /* Take the first mount from our list of mounts to still process */
                        flags = PTR_TO_ULONG(hashmap_steal_first_key_and_value(todo, (void**) &x));
                        if (!x)
                                break;

                        /* From here on 'x' is no longer ours to free: it has been handed over to
                         * set_ensure_consume(). It is only used further if that call returned > 0. */
                        r = set_ensure_consume(&done, &path_hash_ops_free, x);
                        if (IN_SET(r, 0, -EEXIST))
                                continue; /* Already done */
                        if (r < 0)
                                return r;

                        /* Now, remount this with the new flags set, but exclude MS_RELATIME from it. (It's
                         * the default anyway, thus redundant, and in userns we'll get an error if we try to
                         * explicitly enable it) */
                        r = mount_nofollow(NULL, x, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL);
                        if (r < 0) {
                                int q;

                                /* OK, so the remount of this entry failed. We'll ultimately ignore this in
                                 * almost all cases (there are simply so many reasons why this can fail,
                                 * think autofs, NFS, FUSE, …), but let's generate useful debug messages at
                                 * the very least. */

                                q = path_is_mount_point(x, NULL, 0);
                                if (IN_SET(q, 0, -ENOENT)) {
                                        /* Hmm, whaaaa? The mount point is not actually a mount point? Then
                                         * it is either obstructed by a later mount or somebody has been
                                         * racing against us and removed it. Either way the mount point
                                         * doesn't matter to us, let's ignore it hence. */
                                        log_debug_errno(r, "Mount point '%s' to remount is not a mount point anymore, ignoring remount failure: %m", x);
                                        continue;
                                }
                                if (q < 0) /* Any other error on this? Just log and continue */
                                        log_debug_errno(q, "Failed to determine whether '%s' is a mount point or not, ignoring: %m", x);

                                if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) == 0) { /* ignore MS_RELATIME while comparing */
                                        log_debug_errno(r, "Couldn't remount '%s', but the flags already match what we want, hence ignoring: %m", x);
                                        continue;
                                }

                                /* Make this fatal if this is the top-level mount */
                                if (path_equal(x, prefix))
                                        return r;

                                /* If this is not the top-level mount, then handle this gracefully: log but
                                 * otherwise ignore. With NFS, FUSE, autofs there are just too many reasons
                                 * this might fail without a chance for us to do anything about it, let's
                                 * hence be strict on the top-level mount and lenient on the inner ones. */
                                log_debug_errno(r, "Couldn't remount submount '%s' for unexpected reason, ignoring: %m", x);
                                continue;
                        }

                        log_debug("Remounted %s.", x);
                }
        }
}
361
7cce68e1
LP
362int bind_remount_one_with_mountinfo(
363 const char *path,
364 unsigned long new_flags,
365 unsigned long flags_mask,
366 FILE *proc_self_mountinfo) {
367
368 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
2c5ff8ea
LP
369 unsigned long flags = 0;
370 struct libmnt_fs *fs;
371 const char *opts;
7cce68e1
LP
372 int r;
373
374 assert(path);
375 assert(proc_self_mountinfo);
376
4f5644db
LP
377 if ((flags_mask & ~MS_CONVERTIBLE_FLAGS) == 0 && !skip_mount_set_attr) {
378 /* Let's take a shortcut for all the flags we know how to convert into mount_setattr() flags */
379
380 if (mount_setattr(AT_FDCWD, path, AT_SYMLINK_NOFOLLOW,
381 &(struct mount_attr) {
382 .attr_set = ms_flags_to_mount_attr(new_flags & flags_mask),
383 .attr_clr = ms_flags_to_mount_attr(~new_flags & flags_mask),
384 }, MOUNT_ATTR_SIZE_VER0) < 0) {
385
386 log_debug_errno(errno, "mount_setattr() didn't work, falling back to classic remounting: %m");
387
388 if (ERRNO_IS_NOT_SUPPORTED(errno)) /* if not supported, then don't bother at all anymore */
389 skip_mount_set_attr = true;
390 } else
391 return 0; /* Nice, this worked! */
392 }
393
7cce68e1
LP
394 rewind(proc_self_mountinfo);
395
396 table = mnt_new_table();
397 if (!table)
398 return -ENOMEM;
399
400 r = mnt_table_parse_stream(table, proc_self_mountinfo, "/proc/self/mountinfo");
401 if (r < 0)
402 return r;
403
2c5ff8ea 404 fs = mnt_table_find_target(table, path, MNT_ITER_FORWARD);
0338df47
LP
405 if (!fs) {
406 if (laccess(path, F_OK) < 0) /* Hmm, it's not in the mount table, but does it exist at all? */
407 return -errno;
408
2c5ff8ea 409 return -EINVAL; /* Not a mount point we recognize */
0338df47 410 }
2c5ff8ea
LP
411
412 opts = mnt_fs_get_vfs_options(fs);
413 if (opts) {
414 r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP));
415 if (r < 0)
416 log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
417 }
7cce68e1 418
2c5ff8ea 419 r = mount_nofollow(NULL, path, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL);
b23c6a64
LP
420 if (r < 0) {
421 if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) != 0) /* Ignore MS_RELATIME again,
422 * since kernel adds it in
423 * everywhere, because it's the
424 * default. */
425 return r;
426
427 /* Let's handle redundant remounts gracefully */
428 log_debug_errno(r, "Failed to remount '%s' but flags already match what we want, ignoring: %m", path);
429 }
7cce68e1
LP
430
431 return 0;
432}
433
b8b4f80a 434static int mount_switch_root_pivot(const char *path, int fd_newroot) {
57c10a56 435 _cleanup_close_ int fd_oldroot = -EBADF;
2e776ed6
CB
436
437 fd_oldroot = open("/", O_PATH|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
438 if (fd_oldroot < 0)
439 return log_debug_errno(errno, "Failed to open old rootfs");
440
2e776ed6
CB
441 /* Let the kernel tuck the new root under the old one. */
442 if (pivot_root(".", ".") < 0)
443 return log_debug_errno(errno, "Failed to pivot root to new rootfs '%s': %m", path);
444
2e776ed6
CB
445 /* At this point the new root is tucked under the old root. If we want
446 * to unmount it we cannot be fchdir()ed into it. So escape back to the
447 * old root. */
448 if (fchdir(fd_oldroot) < 0)
449 return log_debug_errno(errno, "Failed to change back to old rootfs: %m");
450
451 /* Note, usually we should set mount propagation up here but we'll
452 * assume that the caller has already done that. */
453
454 /* Get rid of the old root and reveal our brand new root. */
455 if (umount2(".", MNT_DETACH) < 0)
456 return log_debug_errno(errno, "Failed to unmount old rootfs: %m");
457
458 if (fchdir(fd_newroot) < 0)
459 return log_debug_errno(errno, "Failed to switch to new rootfs '%s': %m", path);
460
461 return 0;
462}
463
/* Alternative to the pivot_root() based root switch: moves the mount at 'path' on top of "/" with
 * MS_MOVE, then chroot()s into the current working directory and chdir()s to the new "/".
 *
 * Returns 0 on success, or a negative errno-style value (as returned by log_debug_errno()) for the
 * first step that fails. */
static int mount_switch_root_move(const char *path) {

        /* Step 1: stack the new root over the old one. */
        if (mount(path, "/", NULL, MS_MOVE, NULL) < 0)
                return log_debug_errno(errno, "Failed to move new rootfs '%s': %m", path);

        /* Step 2: make the directory we are currently in the root of this process. */
        if (chroot(".") < 0)
                return log_debug_errno(errno, "Failed to chroot to new rootfs '%s': %m", path);

        /* Step 3: re-anchor the working directory inside the new root. */
        if (chdir("/") != 0)
                return log_debug_errno(errno, "Failed to chdir to new rootfs '%s': %m", path);

        return 0;
}
476
9d50f850 477int mount_switch_root(const char *path, unsigned long mount_propagation_flag) {
57c10a56 478 _cleanup_close_ int fd_newroot = -EBADF;
6c6eb219 479 int r;
57c10a56
CB
480
481 assert(path);
9d50f850 482 assert(mount_propagation_flag_is_valid(mount_propagation_flag));
57c10a56
CB
483
484 fd_newroot = open(path, O_PATH|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
485 if (fd_newroot < 0)
486 return log_debug_errno(errno, "Failed to open new rootfs '%s': %m", path);
487
488 /* Change into the new rootfs. */
489 if (fchdir(fd_newroot) < 0)
490 return log_debug_errno(errno, "Failed to change into new rootfs '%s': %m", path);
491
492 r = mount_switch_root_pivot(path, fd_newroot);
493 if (r < 0) {
494 /* Failed to pivot_root() fallback to MS_MOVE. For example, this may happen if the
495 * rootfs is an initramfs in which case pivot_root() isn't supported. */
496 log_debug_errno(r, "Failed to pivot into new rootfs '%s': %m", path);
497 r = mount_switch_root_move(path);
498 }
499 if (r < 0)
500 return log_debug_errno(r, "Failed to switch to new rootfs '%s': %m", path);
501
9d50f850
YW
502 /* Finally, let's establish the requested propagation flags. */
503 if (mount_propagation_flag == 0)
504 return 0;
505
506 if (mount(NULL, ".", NULL, mount_propagation_flag | MS_REC, 0) < 0)
57c10a56 507 return log_debug_errno(errno, "Failed to turn new rootfs '%s' into %s mount: %m",
9d50f850 508 mount_propagation_flag_to_string(mount_propagation_flag), path);
57c10a56
CB
509
510 return 0;
511}
2e776ed6 512
3f2c0bec
LP
/* Calls umount2() on 'path' over and over until it fails, so that all mounts stacked on that mount
 * point are removed.
 *
 * If the terminating failure is EINVAL, returns 1 if at least one unmount succeeded before and 0 if
 * none did. Any other failure is returned as negative errno. */
int repeat_unmount(const char *path, int flags) {
        int removed_any = 0;

        assert(path);

        while (umount2(path, flags) >= 0)
                removed_any = 1;

        if (errno == EINVAL)
                return removed_any;

        return -errno;
}
c4b41707 533
48b747fa
LP
534int mode_to_inaccessible_node(
535 const char *runtime_dir,
536 mode_t mode,
537 char **ret) {
538
539 /* This function maps a node type to a corresponding inaccessible file node. These nodes are created
540 * during early boot by PID 1. In some cases we lacked the privs to create the character and block
541 * devices (maybe because we run in an userns environment, or miss CAP_SYS_MKNOD, or run with a
542 * devices policy that excludes device nodes with major and minor of 0), but that's fine, in that
543 * case we use an AF_UNIX file node instead, which is not the same, but close enough for most
544 * uses. And most importantly, the kernel allows bind mounts from socket nodes to any non-directory
545 * file nodes, and that's the most important thing that matters.
546 *
547 * Note that the runtime directory argument shall be the top-level runtime directory, i.e. /run/ if
548 * we operate in system context and $XDG_RUNTIME_DIR if we operate in user context. */
549
e5f10caf
AZ
550 _cleanup_free_ char *d = NULL;
551 const char *node = NULL;
e5f10caf 552
48b747fa
LP
553 assert(ret);
554
555 if (!runtime_dir)
556 runtime_dir = "/run";
fe80fcc7 557
79893116 558 switch (mode & S_IFMT) {
c4b41707 559 case S_IFREG:
48b747fa 560 node = "/systemd/inaccessible/reg";
e5f10caf 561 break;
fe80fcc7 562
c4b41707 563 case S_IFDIR:
48b747fa 564 node = "/systemd/inaccessible/dir";
e5f10caf 565 break;
fe80fcc7 566
c4b41707 567 case S_IFCHR:
48b747fa 568 node = "/systemd/inaccessible/chr";
e5f10caf 569 break;
fe80fcc7 570
c4b41707 571 case S_IFBLK:
48b747fa 572 node = "/systemd/inaccessible/blk";
e5f10caf 573 break;
fe80fcc7 574
c4b41707 575 case S_IFIFO:
48b747fa 576 node = "/systemd/inaccessible/fifo";
e5f10caf 577 break;
fe80fcc7 578
c4b41707 579 case S_IFSOCK:
48b747fa 580 node = "/systemd/inaccessible/sock";
e5f10caf 581 break;
c4b41707 582 }
e5f10caf
AZ
583 if (!node)
584 return -EINVAL;
585
48b747fa
LP
586 d = path_join(runtime_dir, node);
587 if (!d)
588 return -ENOMEM;
589
cbed1dc8
LP
590 /* On new kernels unprivileged users are permitted to create 0:0 char device nodes (because they also
591 * act as whiteout inode for overlayfs), but no other char or block device nodes. On old kernels no
592 * device node whatsoever may be created by unprivileged processes. Hence, if the caller asks for the
593 * inaccessible block device node let's see if the block device node actually exists, and if not,
594 * fall back to the character device node. From there fall back to the socket device node. This means
595 * in the best case we'll get the right device node type — but if not we'll hopefully at least get a
596 * device node at all. */
597
598 if (S_ISBLK(mode) &&
599 access(d, F_OK) < 0 && errno == ENOENT) {
600 free(d);
601 d = path_join(runtime_dir, "/systemd/inaccessible/chr");
602 if (!d)
603 return -ENOMEM;
604 }
605
606 if (IN_SET(mode & S_IFMT, S_IFBLK, S_IFCHR) &&
607 access(d, F_OK) < 0 && errno == ENOENT) {
48b747fa
LP
608 free(d);
609 d = path_join(runtime_dir, "/systemd/inaccessible/sock");
610 if (!d)
611 return -ENOMEM;
612 }
e5f10caf 613
48b747fa 614 *ret = TAKE_PTR(d);
e5f10caf 615 return 0;
c4b41707 616}
60e76d48 617
da185cd0 618int mount_flags_to_string(unsigned long flags, char **ret) {
1c092b62 619 static const struct {
da185cd0 620 unsigned long flag;
1c092b62
YW
621 const char *name;
622 } map[] = {
623 { .flag = MS_RDONLY, .name = "MS_RDONLY", },
624 { .flag = MS_NOSUID, .name = "MS_NOSUID", },
625 { .flag = MS_NODEV, .name = "MS_NODEV", },
626 { .flag = MS_NOEXEC, .name = "MS_NOEXEC", },
627 { .flag = MS_SYNCHRONOUS, .name = "MS_SYNCHRONOUS", },
628 { .flag = MS_REMOUNT, .name = "MS_REMOUNT", },
629 { .flag = MS_MANDLOCK, .name = "MS_MANDLOCK", },
630 { .flag = MS_DIRSYNC, .name = "MS_DIRSYNC", },
631 { .flag = MS_NOSYMFOLLOW, .name = "MS_NOSYMFOLLOW", },
632 { .flag = MS_NOATIME, .name = "MS_NOATIME", },
633 { .flag = MS_NODIRATIME, .name = "MS_NODIRATIME", },
634 { .flag = MS_BIND, .name = "MS_BIND", },
635 { .flag = MS_MOVE, .name = "MS_MOVE", },
636 { .flag = MS_REC, .name = "MS_REC", },
637 { .flag = MS_SILENT, .name = "MS_SILENT", },
638 { .flag = MS_POSIXACL, .name = "MS_POSIXACL", },
639 { .flag = MS_UNBINDABLE, .name = "MS_UNBINDABLE", },
640 { .flag = MS_PRIVATE, .name = "MS_PRIVATE", },
641 { .flag = MS_SLAVE, .name = "MS_SLAVE", },
642 { .flag = MS_SHARED, .name = "MS_SHARED", },
643 { .flag = MS_RELATIME, .name = "MS_RELATIME", },
644 { .flag = MS_KERNMOUNT, .name = "MS_KERNMOUNT", },
645 { .flag = MS_I_VERSION, .name = "MS_I_VERSION", },
646 { .flag = MS_STRICTATIME, .name = "MS_STRICTATIME", },
647 { .flag = MS_LAZYTIME, .name = "MS_LAZYTIME", },
648 };
649 _cleanup_free_ char *str = NULL;
650
4bee2333
YW
651 assert(ret);
652
1c092b62
YW
653 for (size_t i = 0; i < ELEMENTSOF(map); i++)
654 if (flags & map[i].flag) {
655 if (!strextend_with_separator(&str, "|", map[i].name))
656 return -ENOMEM;
657 flags &= ~map[i].flag;
658 }
659
660 if (!str || flags != 0)
661 if (strextendf_with_separator(&str, "|", "%lx", flags) < 0)
662 return -ENOMEM;
663
664 *ret = TAKE_PTR(str);
665 return 0;
60e76d48
ZJS
666}
667
511a8cfe 668int mount_verbose_full(
60e76d48
ZJS
669 int error_log_level,
670 const char *what,
671 const char *where,
672 const char *type,
673 unsigned long flags,
511a8cfe
LP
674 const char *options,
675 bool follow_symlink) {
60e76d48 676
6ef8df2b
YW
677 _cleanup_free_ char *fl = NULL, *o = NULL;
678 unsigned long f;
679 int r;
680
681 r = mount_option_mangle(options, flags, &f, &o);
682 if (r < 0)
683 return log_full_errno(error_log_level, r,
684 "Failed to mangle mount options %s: %m",
685 strempty(options));
60e76d48 686
1c092b62 687 (void) mount_flags_to_string(f, &fl);
60e76d48 688
6ef8df2b 689 if ((f & MS_REMOUNT) && !what && !type)
60e76d48 690 log_debug("Remounting %s (%s \"%s\")...",
6ef8df2b 691 where, strnull(fl), strempty(o));
60e76d48
ZJS
692 else if (!what && !type)
693 log_debug("Mounting %s (%s \"%s\")...",
6ef8df2b
YW
694 where, strnull(fl), strempty(o));
695 else if ((f & MS_BIND) && !type)
60e76d48 696 log_debug("Bind-mounting %s on %s (%s \"%s\")...",
6ef8df2b
YW
697 what, where, strnull(fl), strempty(o));
698 else if (f & MS_MOVE)
e2341b6b
DT
699 log_debug("Moving mount %s %s %s (%s \"%s\")...",
700 what, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), where, strnull(fl), strempty(o));
60e76d48 701 else
3b493d94
LP
702 log_debug("Mounting %s (%s) on %s (%s \"%s\")...",
703 strna(what), strna(type), where, strnull(fl), strempty(o));
511a8cfe
LP
704
705 if (follow_symlink)
7c248223 706 r = RET_NERRNO(mount(what, where, type, f, o));
511a8cfe
LP
707 else
708 r = mount_nofollow(what, where, type, f, o);
709 if (r < 0)
710 return log_full_errno(error_log_level, r,
3ccf6126
LP
711 "Failed to mount %s (type %s) on %s (%s \"%s\"): %m",
712 strna(what), strna(type), where, strnull(fl), strempty(o));
60e76d48
ZJS
713 return 0;
714}
715
30f5d104
LP
/* Wrapper around umount2() that announces the operation at debug level and logs a failure at
 * 'error_log_level'. 'what' must not be NULL.
 *
 * Returns 0 on success, or the negative errno-style value produced by log_full_errno() on failure. */
int umount_verbose(
                int error_log_level,
                const char *what,
                int flags) {

        assert(what);

        log_debug("Umounting %s...", what);

        if (umount2(what, flags) >= 0)
                return 0;

        return log_full_errno(error_log_level, errno,
                              "Failed to unmount %s: %m", what);
}
83555251 731
9e7f941a
YW
732int mount_option_mangle(
733 const char *options,
734 unsigned long mount_flags,
735 unsigned long *ret_mount_flags,
736 char **ret_remaining_options) {
737
738 const struct libmnt_optmap *map;
739 _cleanup_free_ char *ret = NULL;
9e7f941a
YW
740 int r;
741
3ca4ec20 742 /* This extracts mount flags from the mount options, and stores
9e7f941a
YW
743 * non-mount-flag options to '*ret_remaining_options'.
744 * E.g.,
9f563f27 745 * "rw,nosuid,nodev,relatime,size=1630748k,mode=0700,uid=1000,gid=1000"
9e7f941a 746 * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
9f563f27 747 * "size=1630748k,mode=0700,uid=1000,gid=1000".
3ca4ec20 748 * See more examples in test-mount-util.c.
9e7f941a 749 *
3ca4ec20 750 * If 'options' does not contain any non-mount-flag options,
5238e957 751 * then '*ret_remaining_options' is set to NULL instead of empty string.
3ca4ec20
ZJS
752 * The validity of options stored in '*ret_remaining_options' is not checked.
753 * If 'options' is NULL, this just copies 'mount_flags' to *ret_mount_flags. */
9e7f941a
YW
754
755 assert(ret_mount_flags);
756 assert(ret_remaining_options);
757
758 map = mnt_get_builtin_optmap(MNT_LINUX_MAP);
759 if (!map)
760 return -EINVAL;
761
25086b4c 762 for (const char *p = options;;) {
9e7f941a
YW
763 _cleanup_free_ char *word = NULL;
764 const struct libmnt_optmap *ent;
765
9b23679e 766 r = extract_first_word(&p, &word, ",", EXTRACT_KEEP_QUOTE);
9e7f941a
YW
767 if (r < 0)
768 return r;
769 if (r == 0)
770 break;
771
772 for (ent = map; ent->name; ent++) {
773 /* All entries in MNT_LINUX_MAP do not take any argument.
774 * Thus, ent->name does not contain "=" or "[=]". */
775 if (!streq(word, ent->name))
776 continue;
777
778 if (!(ent->mask & MNT_INVERT))
779 mount_flags |= ent->id;
780 else if (mount_flags & ent->id)
781 mount_flags ^= ent->id;
782
783 break;
784 }
785
786 /* If 'word' is not a mount flag, then store it in '*ret_remaining_options'. */
ac6086fd
LB
787 if (!ent->name &&
788 !startswith_no_case(word, "x-") &&
789 !strextend_with_separator(&ret, ",", word))
9e7f941a
YW
790 return -ENOMEM;
791 }
792
793 *ret_mount_flags = mount_flags;
ae2a15bc 794 *ret_remaining_options = TAKE_PTR(ret);
9e7f941a
YW
795
796 return 0;
797}
6af52c3a 798
70599967 799static int mount_in_namespace(
6af52c3a
LB
800 pid_t target,
801 const char *propagate_path,
802 const char *incoming_path,
803 const char *src,
804 const char *dest,
805 bool read_only,
70599967
LB
806 bool make_file_or_directory,
807 const MountOptions *options,
84be0c71 808 const ImagePolicy *image_policy,
70599967 809 bool is_image) {
6af52c3a 810
19ee48a6 811 _cleanup_close_pair_ int errno_pipe_fd[2] = PIPE_EBADF;
254d1313 812 _cleanup_close_ int mntns_fd = -EBADF, root_fd = -EBADF, pidns_fd = -EBADF, chased_src_fd = -EBADF;
ddb6eeaf 813 char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p;
6af52c3a
LB
814 bool mount_slave_created = false, mount_slave_mounted = false,
815 mount_tmp_created = false, mount_tmp_mounted = false,
816 mount_outside_created = false, mount_outside_mounted = false;
cedf5b1a 817 _cleanup_free_ char *chased_src_path = NULL;
4b00e738 818 struct stat st;
6af52c3a
LB
819 pid_t child;
820 int r;
821
822 assert(target > 0);
823 assert(propagate_path);
824 assert(incoming_path);
825 assert(src);
826 assert(dest);
70599967 827 assert(!options || is_image);
6af52c3a 828
98f654fd 829 r = namespace_open(target, &pidns_fd, &mntns_fd, NULL, NULL, &root_fd);
2338a175
LB
830 if (r < 0)
831 return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m");
832
4b00e738 833 r = in_same_namespace(target, 0, NAMESPACE_MOUNT);
2338a175 834 if (r < 0)
4b00e738 835 return log_debug_errno(r, "Failed to determine if mount namespaces are equal: %m");
2338a175 836 /* We can't add new mounts at runtime if the process wasn't started in a namespace */
4b00e738 837 if (r > 0)
2338a175
LB
838 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to activate bind mount in target, not running in a mount namespace");
839
ddb6eeaf
LP
840 /* One day, when bind mounting /proc/self/fd/n works across namespace boundaries we should rework
841 * this logic to make use of it... */
6af52c3a
LB
842
843 p = strjoina(propagate_path, "/");
844 r = laccess(p, F_OK);
845 if (r < 0)
846 return log_debug_errno(r == -ENOENT ? SYNTHETIC_ERRNO(EOPNOTSUPP) : r, "Target does not allow propagation of mount points");
847
f461a28d 848 r = chase(src, NULL, 0, &chased_src_path, &chased_src_fd);
6af52c3a
LB
849 if (r < 0)
850 return log_debug_errno(r, "Failed to resolve source path of %s: %m", src);
cedf5b1a 851 log_debug("Chased source path of %s to %s", src, chased_src_path);
6af52c3a 852
f7c18d3d
LB
853 if (fstat(chased_src_fd, &st) < 0)
854 return log_debug_errno(errno, "Failed to stat() resolved source path %s: %m", src);
6af52c3a 855 if (S_ISLNK(st.st_mode)) /* This shouldn't really happen, given that we just chased the symlinks above, but let's better be safe… */
f7c18d3d 856 return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Source directory %s can't be a symbolic link", src);
6af52c3a
LB
857
858 /* Our goal is to install a new bind mount into the container,
859 possibly read-only. This is irritatingly complex
860 unfortunately, currently.
861
862 First, we start by creating a private playground in /tmp,
863 that we can mount MS_SLAVE. (Which is necessary, since
864 MS_MOVE cannot be applied to mounts with MS_SHARED parent
865 mounts.) */
866
867 if (!mkdtemp(mount_slave))
868 return log_debug_errno(errno, "Failed to create playground %s: %m", mount_slave);
869
870 mount_slave_created = true;
871
872 r = mount_nofollow_verbose(LOG_DEBUG, mount_slave, mount_slave, NULL, MS_BIND, NULL);
873 if (r < 0)
874 goto finish;
875
876 mount_slave_mounted = true;
877
878 r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_slave, NULL, MS_SLAVE, NULL);
879 if (r < 0)
880 goto finish;
881
882 /* Second, we mount the source file or directory to a directory inside of our MS_SLAVE playground. */
883 mount_tmp = strjoina(mount_slave, "/mount");
70599967
LB
884 if (is_image)
885 r = mkdir_p(mount_tmp, 0700);
886 else
887 r = make_mount_point_inode_from_stat(&st, mount_tmp, 0700);
6af52c3a
LB
888 if (r < 0) {
889 log_debug_errno(r, "Failed to create temporary mount point %s: %m", mount_tmp);
890 goto finish;
891 }
892
893 mount_tmp_created = true;
894
70599967 895 if (is_image)
84be0c71 896 r = verity_dissect_and_mount(chased_src_fd, chased_src_path, mount_tmp, options, image_policy, NULL, NULL, NULL, NULL);
70599967 897 else
ddb6eeaf 898 r = mount_follow_verbose(LOG_DEBUG, FORMAT_PROC_FD_PATH(chased_src_fd), mount_tmp, NULL, MS_BIND, NULL);
6af52c3a
LB
899 if (r < 0)
900 goto finish;
901
902 mount_tmp_mounted = true;
903
904 /* Third, we remount the new bind mount read-only if requested. */
905 if (read_only) {
906 r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_tmp, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
907 if (r < 0)
908 goto finish;
909 }
910
911 /* Fourth, we move the new bind mount into the propagation directory. This way it will appear there read-only
912 * right-away. */
913
914 mount_outside = strjoina(propagate_path, "/XXXXXX");
70599967 915 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
916 r = mkdtemp(mount_outside) ? 0 : -errno;
917 else {
918 r = mkostemp_safe(mount_outside);
919 safe_close(r);
920 }
921 if (r < 0) {
922 log_debug_errno(r, "Cannot create propagation file or directory %s: %m", mount_outside);
923 goto finish;
924 }
925
926 mount_outside_created = true;
927
928 r = mount_nofollow_verbose(LOG_DEBUG, mount_tmp, mount_outside, NULL, MS_MOVE, NULL);
929 if (r < 0)
930 goto finish;
931
932 mount_outside_mounted = true;
933 mount_tmp_mounted = false;
934
70599967 935 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
936 (void) rmdir(mount_tmp);
937 else
938 (void) unlink(mount_tmp);
939 mount_tmp_created = false;
940
941 (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
942 mount_slave_mounted = false;
943
944 (void) rmdir(mount_slave);
945 mount_slave_created = false;
946
947 if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) {
948 log_debug_errno(errno, "Failed to create pipe: %m");
949 goto finish;
950 }
951
2338a175 952 r = namespace_fork("(sd-bindmnt)", "(sd-bindmnt-inner)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG,
98f654fd 953 pidns_fd, mntns_fd, -1, -1, root_fd, &child);
6af52c3a
LB
954 if (r < 0)
955 goto finish;
956 if (r == 0) {
03469b77 957 _cleanup_free_ char *mount_outside_fn = NULL, *mount_inside = NULL;
6af52c3a
LB
958
959 errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]);
960
6af52c3a 961 if (make_file_or_directory) {
70599967
LB
962 if (!is_image) {
963 (void) mkdir_parents(dest, 0755);
964 (void) make_mount_point_inode_from_stat(&st, dest, 0700);
965 } else
966 (void) mkdir_p(dest, 0755);
6af52c3a
LB
967 }
968
969 /* Fifth, move the mount to the right place inside */
03469b77
LP
970 r = path_extract_filename(mount_outside, &mount_outside_fn);
971 if (r < 0) {
972 log_debug_errno(r, "Failed to extract filename from propagation file or directory '%s': %m", mount_outside);
973 goto child_fail;
974 }
975
976 mount_inside = path_join(incoming_path, mount_outside_fn);
977 if (!mount_inside) {
978 r = log_oom_debug();
979 goto child_fail;
980 }
981
982 r = mount_nofollow_verbose(LOG_DEBUG, mount_inside, dest, NULL, MS_MOVE, NULL);
6af52c3a
LB
983 if (r < 0)
984 goto child_fail;
985
986 _exit(EXIT_SUCCESS);
987
988 child_fail:
989 (void) write(errno_pipe_fd[1], &r, sizeof(r));
990 errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
991
992 _exit(EXIT_FAILURE);
993 }
994
995 errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
996
997 r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0);
998 if (r < 0) {
999 log_debug_errno(r, "Failed to wait for child: %m");
1000 goto finish;
1001 }
1002 if (r != EXIT_SUCCESS) {
1003 if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r))
1004 log_debug_errno(r, "Failed to mount: %m");
1005 else
1006 log_debug("Child failed.");
1007 goto finish;
1008 }
1009
1010finish:
1011 if (mount_outside_mounted)
1012 (void) umount_verbose(LOG_DEBUG, mount_outside, UMOUNT_NOFOLLOW);
1013 if (mount_outside_created) {
70599967 1014 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
1015 (void) rmdir(mount_outside);
1016 else
1017 (void) unlink(mount_outside);
1018 }
1019
1020 if (mount_tmp_mounted)
1021 (void) umount_verbose(LOG_DEBUG, mount_tmp, UMOUNT_NOFOLLOW);
1022 if (mount_tmp_created) {
70599967 1023 if (is_image || S_ISDIR(st.st_mode))
6af52c3a
LB
1024 (void) rmdir(mount_tmp);
1025 else
1026 (void) unlink(mount_tmp);
1027 }
1028
1029 if (mount_slave_mounted)
1030 (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
1031 if (mount_slave_created)
1032 (void) rmdir(mount_slave);
1033
1034 return r;
1035}
70599967
LB
1036
/* Convenience wrapper around mount_in_namespace() for the plain bind mount case: no mount options and
 * no image policy are passed, and is_image is false. All other arguments are forwarded unchanged and
 * the result of mount_in_namespace() is returned as is. */
int bind_mount_in_namespace(
                pid_t target,
                const char *propagate_path,
                const char *incoming_path,
                const char *src,
                const char *dest,
                bool read_only,
                bool make_file_or_directory) {

        return mount_in_namespace(
                        target,
                        propagate_path,
                        incoming_path,
                        src,
                        dest,
                        read_only,
                        make_file_or_directory,
                        /* options= */ NULL,
                        /* image_policy= */ NULL,
                        /* is_image= */ false);
}
1048
1049int mount_image_in_namespace(
1050 pid_t target,
1051 const char *propagate_path,
1052 const char *incoming_path,
1053 const char *src,
1054 const char *dest,
1055 bool read_only,
1056 bool make_file_or_directory,
84be0c71
LP
1057 const MountOptions *options,
1058 const ImagePolicy *image_policy) {
70599967 1059
84be0c71 1060 return mount_in_namespace(target, propagate_path, incoming_path, src, dest, read_only, make_file_or_directory, options, image_policy, /* is_image=*/ true);
70599967 1061}
14a25e1f
LP
1062
1063int make_mount_point(const char *path) {
1064 int r;
1065
1066 assert(path);
1067
1068 /* If 'path' is already a mount point, does nothing and returns 0. If it is not it makes it one, and returns 1. */
1069
1070 r = path_is_mount_point(path, NULL, 0);
1071 if (r < 0)
1072 return log_debug_errno(r, "Failed to determine whether '%s' is a mount point: %m", path);
1073 if (r > 0)
1074 return 0;
1075
1076 r = mount_nofollow_verbose(LOG_DEBUG, path, path, NULL, MS_BIND|MS_REC, NULL);
1077 if (r < 0)
1078 return r;
1079
1080 return 1;
1081}
35fd3558 1082
17b798d9 1083int make_userns(uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping) {
254d1313 1084 _cleanup_close_ int userns_fd = -EBADF;
50ae2966 1085 _cleanup_free_ char *line = NULL;
35fd3558
LP
1086
1087 /* Allocates a userns file descriptor with the mapping we need. For this we'll fork off a child
1088 * process whose only purpose is to give us a new user namespace. It's killed when we got it. */
1089
17b798d9
LP
1090 if (!userns_shift_range_valid(uid_shift, uid_range))
1091 return -EINVAL;
1092
2b2777ed
QD
1093 if (IN_SET(idmapping, REMOUNT_IDMAPPING_NONE, REMOUNT_IDMAPPING_HOST_ROOT)) {
1094 if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0u, uid_shift, uid_range) < 0)
50ae2966 1095 return log_oom_debug();
35fd3558 1096
2b2777ed
QD
1097 /* If requested we'll include an entry in the mapping so that the host root user can make
1098 * changes to the uidmapped mount like it normally would. Specifically, we'll map the user
1099 * with UID_MAPPED_ROOT on the backing fs to UID 0. This is useful, since nspawn code wants
1100 * to create various missing inodes in the OS tree before booting into it, and this becomes
1101 * very easy and straightforward to do if it can just do it under its own regular UID. Note
1102 * that in that case the container's runtime uidmap (i.e. the one the container payload
1103 * processes run in) will leave this UID unmapped, i.e. if we accidentally leave files owned
1104 * by host root in the already uidmapped tree around they'll show up as owned by 'nobody',
1105 * which is safe. (Of course, we shouldn't leave such inodes around, but always chown() them
1106 * to the container's own UID range, but it's good to have a safety net, in case we
1107 * forget it.) */
1108 if (idmapping == REMOUNT_IDMAPPING_HOST_ROOT)
1109 if (strextendf(&line,
1110 UID_FMT " " UID_FMT " " UID_FMT "\n",
1111 UID_MAPPED_ROOT, 0u, 1u) < 0)
1112 return log_oom_debug();
1113 }
1114
1115 if (idmapping == REMOUNT_IDMAPPING_HOST_OWNER) {
1116 /* Remap the owner of the bind mounted directory to the root user within the container. This
1117 * way every file written by root within the container to the bind-mounted directory will
1118 * be owned by the original user. All other user will remain unmapped. */
1119 if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", owner, uid_shift, 1u) < 0)
1120 return log_oom_debug();
1121 }
1122
35fd3558 1123 /* We always assign the same UID and GID ranges */
979b0ff2
LP
1124 userns_fd = userns_acquire(line, line);
1125 if (userns_fd < 0)
1126 return log_debug_errno(userns_fd, "Failed to acquire new userns: %m");
35fd3558
LP
1127
1128 return TAKE_FD(userns_fd);
1129}
1130
17b798d9 1131int remount_idmap_fd(
35fd3558 1132 const char *p,
17b798d9 1133 int userns_fd) {
35fd3558 1134
17b798d9 1135 _cleanup_close_ int mount_fd = -EBADF;
35fd3558
LP
1136 int r;
1137
1138 assert(p);
17b798d9 1139 assert(userns_fd >= 0);
35fd3558
LP
1140
1141 /* Clone the mount point */
1142 mount_fd = open_tree(-1, p, OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
1143 if (mount_fd < 0)
1144 return log_debug_errno(errno, "Failed to open tree of mounted filesystem '%s': %m", p);
1145
35fd3558
LP
1146 /* Set the user namespace mapping attribute on the cloned mount point */
1147 if (mount_setattr(mount_fd, "", AT_EMPTY_PATH | AT_RECURSIVE,
1148 &(struct mount_attr) {
1149 .attr_set = MOUNT_ATTR_IDMAP,
1150 .userns_fd = userns_fd,
1151 }, sizeof(struct mount_attr)) < 0)
1152 return log_debug_errno(errno, "Failed to change bind mount attributes for '%s': %m", p);
1153
1154 /* Remove the old mount point */
1155 r = umount_verbose(LOG_DEBUG, p, UMOUNT_NOFOLLOW);
1156 if (r < 0)
1157 return r;
1158
1159 /* And place the cloned version in its place */
1160 if (move_mount(mount_fd, "", -1, p, MOVE_MOUNT_F_EMPTY_PATH) < 0)
1161 return log_debug_errno(errno, "Failed to attach UID mapped mount to '%s': %m", p);
1162
1163 return 0;
1164}
9c653536 1165
17b798d9
LP
1166int remount_idmap(const char *p, uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping) {
1167 _cleanup_close_ int userns_fd = -EBADF;
1168
1169 userns_fd = make_userns(uid_shift, uid_range, owner, idmapping);
1170 if (userns_fd < 0)
1171 return userns_fd;
1172
1173 return remount_idmap_fd(p, userns_fd);
1174}
1175
f63a2c48
YW
/* One sub-mount that was cloned so it can be re-attached later. */
typedef struct SubMount {
        char *path;   /* Mount point path; heap-allocated, released by sub_mount_clear(). */
        int mount_fd; /* File descriptor of the cloned mount; closed by sub_mount_clear(). */
} SubMount;
1180
1181static void sub_mount_clear(SubMount *s) {
1182 assert(s);
1183
1184 s->path = mfree(s->path);
1185 s->mount_fd = safe_close(s->mount_fd);
1186}
1187
1188static void sub_mount_array_free(SubMount *s, size_t n) {
1189 assert(s || n == 0);
1190
1191 for (size_t i = 0; i < n; i++)
1192 sub_mount_clear(s + i);
1193
1194 free(s);
1195}
1196
1197static int sub_mount_compare(const SubMount *a, const SubMount *b) {
1198 assert(a);
1199 assert(b);
1200 assert(a->path);
1201 assert(b->path);
1202
1203 return path_compare(a->path, b->path);
1204}
1205
1206static void sub_mount_drop(SubMount *s, size_t n) {
1207 assert(s || n == 0);
1208
1209 for (size_t m = 0, i = 1; i < n; i++) {
1210 if (path_startswith(s[i].path, s[m].path))
1211 sub_mount_clear(s + i);
1212 else
1213 m = i;
1214 }
1215}
1216
1217static int get_sub_mounts(const char *prefix, SubMount **ret_mounts, size_t *ret_n_mounts) {
1218 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
1219 _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
1220 SubMount *mounts = NULL;
1221 size_t n = 0;
1222 int r;
1223
1224 CLEANUP_ARRAY(mounts, n, sub_mount_array_free);
1225
1226 assert(prefix);
1227 assert(ret_mounts);
1228 assert(ret_n_mounts);
1229
1230 r = libmount_parse("/proc/self/mountinfo", NULL, &table, &iter);
1231 if (r < 0)
1232 return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
1233
1234 for (;;) {
1235 _cleanup_close_ int mount_fd = -EBADF;
1236 _cleanup_free_ char *p = NULL;
1237 struct libmnt_fs *fs;
1238 const char *path;
1239 int id1, id2;
1240
1241 r = mnt_table_next_fs(table, iter, &fs);
1242 if (r == 1)
1243 break; /* EOF */
1244 if (r < 0)
1245 return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
1246
1247 path = mnt_fs_get_target(fs);
1248 if (!path)
1249 continue;
1250
1251 if (isempty(path_startswith(path, prefix)))
1252 continue;
1253
1254 id1 = mnt_fs_get_id(fs);
1255 r = path_get_mnt_id(path, &id2);
1256 if (r < 0) {
1257 log_debug_errno(r, "Failed to get mount ID of '%s', ignoring: %m", path);
1258 continue;
1259 }
1260 if (id1 != id2) {
1261 /* The path may be hidden by another over-mount or already remounted. */
1262 log_debug("The mount IDs of '%s' obtained by libmount and path_get_mnt_id() are different (%i vs %i), ignoring.",
1263 path, id1, id2);
1264 continue;
1265 }
1266
1267 mount_fd = open_tree(AT_FDCWD, path, OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE);
1268 if (mount_fd < 0) {
1269 if (errno == ENOENT) /* The path may be hidden by another over-mount or already unmounted. */
1270 continue;
1271
1272 return log_debug_errno(errno, "Failed to open tree of mounted filesystem '%s': %m", path);
1273 }
1274
1275 p = strdup(path);
1276 if (!p)
1277 return log_oom_debug();
1278
1279 if (!GREEDY_REALLOC(mounts, n + 1))
1280 return log_oom_debug();
1281
1282 mounts[n++] = (SubMount) {
1283 .path = TAKE_PTR(p),
1284 .mount_fd = TAKE_FD(mount_fd),
1285 };
1286 }
1287
1288 typesafe_qsort(mounts, n, sub_mount_compare);
1289 sub_mount_drop(mounts, n);
1290
1291 *ret_mounts = TAKE_PTR(mounts);
1292 *ret_n_mounts = n;
1293 return 0;
1294}
1295
1296static int move_sub_mounts(SubMount *mounts, size_t n) {
1297 assert(mounts || n == 0);
1298
1299 for (size_t i = 0; i < n; i++) {
1300 if (!mounts[i].path || mounts[i].mount_fd < 0)
1301 continue;
1302
1303 (void) mkdir_p_label(mounts[i].path, 0755);
1304
1305 if (move_mount(mounts[i].mount_fd, "", AT_FDCWD, mounts[i].path, MOVE_MOUNT_F_EMPTY_PATH) < 0)
1306 return log_debug_errno(errno, "Failed to move mount_fd to '%s': %m", mounts[i].path);
1307 }
1308
1309 return 0;
1310}
1311
1312int remount_and_move_sub_mounts(
1313 const char *what,
1314 const char *where,
1315 const char *type,
1316 unsigned long flags,
1317 const char *options) {
1318
1319 SubMount *mounts = NULL; /* avoid false maybe-uninitialized warning */
1320 size_t n = 0; /* avoid false maybe-uninitialized warning */
1321 int r;
1322
1323 CLEANUP_ARRAY(mounts, n, sub_mount_array_free);
1324
1325 assert(where);
1326
1327 /* This is useful when creating a new network namespace. Unlike procfs, we need to remount sysfs,
1328 * otherwise properties of the network interfaces in the main network namespace are still accessible
1329 * through the old sysfs, e.g. /sys/class/net/eth0. All sub-mounts previously mounted on the sysfs
1330 * are moved onto the new sysfs mount. */
1331
1332 r = path_is_mount_point(where, NULL, 0);
1333 if (r < 0)
1334 return log_debug_errno(r, "Failed to determine if '%s' is a mountpoint: %m", where);
1335 if (r == 0)
1336 /* Shortcut. Simply mount the requested filesystem. */
1337 return mount_nofollow_verbose(LOG_DEBUG, what, where, type, flags, options);
1338
1339 /* Get the list of sub-mounts and duplicate them. */
1340 r = get_sub_mounts(where, &mounts, &n);
1341 if (r < 0)
1342 return r;
1343
1344 /* Then, remount the mount and its sub-mounts. */
1345 (void) umount_recursive(where, 0);
1346
1347 /* Remount the target filesystem. */
1348 r = mount_nofollow_verbose(LOG_DEBUG, what, where, type, flags, options);
1349 if (r < 0)
1350 return r;
1351
1352 /* Finally, move the all sub-mounts on the new target mount point. */
1353 return move_sub_mounts(mounts, n);
1354}
1355
/* Mounts a new sysfs instance (nosuid, noexec, nodev) on 'where', preserving existing sub-mounts. See
 * remount_and_move_sub_mounts() for the return value. */
int remount_sysfs(const char *where) {
        const unsigned long sysfs_flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;

        return remount_and_move_sub_mounts("sysfs", where, "sysfs", sysfs_flags, NULL);
}
1359
9c653536
ZJS
1360int make_mount_point_inode_from_stat(const struct stat *st, const char *dest, mode_t mode) {
1361 assert(st);
1362 assert(dest);
1363
1364 if (S_ISDIR(st->st_mode))
1365 return mkdir_label(dest, mode);
1366 else
b6ca2b28 1367 return RET_NERRNO(mknod(dest, S_IFREG|(mode & ~0111), 0));
9c653536
ZJS
1368}
1369
/* Like make_mount_point_inode_from_stat(), but determines the inode type by calling stat() on 'source'
 * (i.e. symlinks are followed). Returns a negative errno if stat() fails. */
int make_mount_point_inode_from_path(const char *source, const char *dest, mode_t mode) {
        struct stat source_st;

        assert(source);
        assert(dest);

        return stat(source, &source_st) < 0
                ? -errno
                : make_mount_point_inode_from_stat(&source_st, dest, mode);
}