]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn-mount.c
udev: gracefully handle ENODEV or friends in opening device node
[thirdparty/systemd.git] / src / nspawn / nspawn-mount.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
e83bebef 2
07630cea 3#include <linux/magic.h>
1cf40697 4#include <sys/mount.h>
4f18ff2e 5#include <unistd.h>
e83bebef 6
b5efdb8a 7#include "alloc-util.h"
f461a28d 8#include "chase.h"
4f5dd394 9#include "escape.h"
6ee31a53 10#include "extract-word.h"
0996ef00 11#include "fd-util.h"
ca78ad1d 12#include "format-util.h"
f4f15635 13#include "fs-util.h"
8aa304d3 14#include "log.h"
35cd0ba5 15#include "mkdir-label.h"
4349cd7c 16#include "mount-util.h"
049af8ad 17#include "mountpoint-util.h"
b71a0192 18#include "namespace-util.h"
6bedfcbb 19#include "nspawn-mount.h"
4f5dd394
LP
20#include "path-util.h"
21#include "rm-rf.h"
760877e9 22#include "sort-util.h"
8fcde012 23#include "stat-util.h"
07630cea 24#include "string-util.h"
4f5dd394 25#include "strv.h"
e4de7287 26#include "tmpfile-util.h"
e83bebef 27
88614c8a 28CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) {
223a67e5 29 CustomMount *ret;
e83bebef
LP
30
31 assert(l);
32 assert(n);
33 assert(t >= 0);
34 assert(t < _CUSTOM_MOUNT_TYPE_MAX);
35
223a67e5 36 if (!GREEDY_REALLOC(*l, *n + 1))
e83bebef
LP
37 return NULL;
38
e83bebef
LP
39 ret = *l + *n;
40 (*n)++;
41
511a8cfe
LP
42 *ret = (CustomMount) {
43 .type = t
44 };
e83bebef
LP
45
46 return ret;
47}
48
88614c8a 49void custom_mount_free_all(CustomMount *l, size_t n) {
1236f06c 50 FOREACH_ARRAY(m, l, n) {
e83bebef
LP
51 free(m->source);
52 free(m->destination);
53 free(m->options);
54
55 if (m->work_dir) {
56 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
57 free(m->work_dir);
58 }
59
c7a4890c
LP
60 if (m->rm_rf_tmpdir) {
61 (void) rm_rf(m->rm_rf_tmpdir, REMOVE_ROOT|REMOVE_PHYSICAL);
62 free(m->rm_rf_tmpdir);
63 }
64
e83bebef 65 strv_free(m->lower);
de40a303 66 free(m->type_argument);
e83bebef
LP
67 }
68
69 free(l);
70}
71
93bab288 72static int custom_mount_compare(const CustomMount *a, const CustomMount *b) {
e83bebef
LP
73 int r;
74
93bab288 75 r = path_compare(a->destination, b->destination);
e83bebef
LP
76 if (r != 0)
77 return r;
78
93bab288 79 return CMP(a->type, b->type);
e83bebef
LP
80}
81
/* Parses a mount source path. An empty string is refused with -EINVAL. A path prefixed with '+' must be
 * absolute after the prefix and is returned as a verbatim copy (prefix included). Any other path is made
 * absolute relative to the current working directory. On success the result is stored in *ret and 0 (or
 * whatever path_make_absolute_cwd() returns) is returned; on failure a negative errno-style value. */
static int source_path_parse(const char *p, char **ret) {
        char *copy;

        assert(p);
        assert(ret);

        if (isempty(p))
                return -EINVAL;

        /* The common case: no '+' prefix, simply anchor the path at the current working directory. */
        if (p[0] != '+')
                return path_make_absolute_cwd(p, ret);

        if (!path_is_absolute(p + 1))
                return -EINVAL;

        copy = strdup(p);
        if (!copy)
                return -ENOMEM;

        *ret = copy;
        return 0;
}
103
/* Like source_path_parse(), but an empty string is accepted: in that case *ret is set to NULL and 0 is
 * returned. */
static int source_path_parse_nullable(const char *p, char **ret) {
        assert(p);
        assert(ret);

        if (!isempty(p))
                return source_path_parse(p, ret);

        *ret = NULL;
        return 0;
}
115
/* Returns a newly allocated version of 'source': if it carries a '+' prefix the remainder is joined below
 * 'dest', otherwise it is duplicated as-is. Returns NULL if 'source' is NULL or if allocation fails. */
static char *resolve_source_path(const char *dest, const char *source) {
        if (!source)
                return NULL;

        return source[0] == '+' ? path_join(dest, source + 1) : strdup(source);
}
125
d0556c55 126static int allocate_temporary_source(CustomMount *m) {
bf37a69c
LP
127 int r;
128
d0556c55
LP
129 assert(m);
130 assert(!m->source);
131 assert(!m->rm_rf_tmpdir);
132
bf37a69c
LP
133 r = mkdtemp_malloc("/var/tmp/nspawn-temp-XXXXXX", &m->rm_rf_tmpdir);
134 if (r < 0)
135 return log_error_errno(r, "Failed to acquire temporary directory: %m");
d0556c55
LP
136
137 m->source = path_join(m->rm_rf_tmpdir, "src");
138 if (!m->source)
139 return log_oom();
140
141 if (mkdir(m->source, 0755) < 0)
142 return log_error_errno(errno, "Failed to create %s: %m", m->source);
143
144 return 0;
145}
146
88614c8a 147int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) {
86c0dd4a
LP
148 int r;
149
d58577d4 150 /* Prepare all custom mounts. This will make sure we know all temporary directories. This is called in the
86c0dd4a
LP
151 * parent process, so that we know the temporary directories to remove on exit before we fork off the
152 * children. */
153
154 assert(l || n == 0);
155
156 /* Order the custom mounts, and make sure we have a working directory */
93bab288 157 typesafe_qsort(l, n, custom_mount_compare);
86c0dd4a 158
d58577d4 159 FOREACH_ARRAY(m, l, n) {
de40a303
LP
160 /* /proc we mount in the inner child, i.e. when we acquired CLONE_NEWPID. All other mounts we mount
161 * already in the outer child, so that the mounts are already established before CLONE_NEWPID and in
162 * particular CLONE_NEWUSER. This also means any custom mounts below /proc also need to be mounted in
163 * the inner child, not the outer one. Determine this here. */
164 m->in_userns = path_startswith(m->destination, "/proc");
86c0dd4a 165
de40a303
LP
166 if (m->type == CUSTOM_MOUNT_BIND) {
167 if (m->source) {
168 char *s;
86c0dd4a 169
de40a303
LP
170 s = resolve_source_path(dest, m->source);
171 if (!s)
172 return log_oom();
c7a4890c 173
de40a303
LP
174 free_and_replace(m->source, s);
175 } else {
176 /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
c7a4890c 177
d0556c55
LP
178 r = allocate_temporary_source(m);
179 if (r < 0)
180 return r;
de40a303 181 }
86c0dd4a
LP
182 }
183
184 if (m->type == CUSTOM_MOUNT_OVERLAY) {
86c0dd4a
LP
185 STRV_FOREACH(j, m->lower) {
186 char *s;
187
188 s = resolve_source_path(dest, *j);
189 if (!s)
190 return log_oom();
191
10af01a5 192 free_and_replace(*j, s);
86c0dd4a
LP
193 }
194
d0556c55
LP
195 if (m->source) {
196 char *s;
197
198 s = resolve_source_path(dest, m->source);
199 if (!s)
200 return log_oom();
201
202 free_and_replace(m->source, s);
203 } else {
204 r = allocate_temporary_source(m);
205 if (r < 0)
206 return r;
207 }
208
86c0dd4a
LP
209 if (m->work_dir) {
210 char *s;
211
212 s = resolve_source_path(dest, m->work_dir);
213 if (!s)
214 return log_oom();
215
10af01a5 216 free_and_replace(m->work_dir, s);
86c0dd4a 217 } else {
86c0dd4a
LP
218 r = tempfn_random(m->source, NULL, &m->work_dir);
219 if (r < 0)
220 return log_error_errno(r, "Failed to acquire working directory: %m");
221 }
222
223 (void) mkdir_label(m->work_dir, 0700);
224 }
225 }
226
227 return 0;
228}
229
88614c8a 230int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
448f7377 231 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL, *p = NULL;
e83bebef
LP
232 CustomMount *m;
233 int r;
234
235 assert(l);
236 assert(n);
237
4f495126 238 r = extract_many_words(&s, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination);
e83bebef
LP
239 if (r < 0)
240 return r;
241 if (r == 0)
242 return -EINVAL;
e83bebef 243 if (r == 1) {
86c0dd4a 244 destination = strdup(source[0] == '+' ? source+1 : source);
e83bebef
LP
245 if (!destination)
246 return -ENOMEM;
247 }
a50947c5
DDM
248 if (r == 2 && !isempty(s)) {
249 opts = strdup(s);
e83bebef
LP
250 if (!opts)
251 return -ENOMEM;
252 }
253
448f7377
DDM
254 r = source_path_parse_nullable(source, &p);
255 if (r < 0)
256 return r;
c7a4890c 257
e83bebef
LP
258 if (!path_is_absolute(destination))
259 return -EINVAL;
260
261 m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
262 if (!m)
48cbe5f8 263 return -ENOMEM;
e83bebef 264
448f7377 265 m->source = TAKE_PTR(p);
0e636bf5 266 m->destination = TAKE_PTR(destination);
e83bebef 267 m->read_only = read_only;
0e636bf5 268 m->options = TAKE_PTR(opts);
de40a303 269
e83bebef
LP
270 return 0;
271}
272
88614c8a 273int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) {
e83bebef 274 _cleanup_free_ char *path = NULL, *opts = NULL;
99534007 275 const char *p = ASSERT_PTR(s);
e83bebef
LP
276 CustomMount *m;
277 int r;
278
279 assert(l);
280 assert(n);
e83bebef
LP
281
282 r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
283 if (r < 0)
284 return r;
285 if (r == 0)
286 return -EINVAL;
287
288 if (isempty(p))
289 opts = strdup("mode=0755");
290 else
291 opts = strdup(p);
292 if (!opts)
293 return -ENOMEM;
294
295 if (!path_is_absolute(path))
296 return -EINVAL;
297
298 m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
299 if (!m)
300 return -ENOMEM;
301
1cc6c93a
YW
302 m->destination = TAKE_PTR(path);
303 m->options = TAKE_PTR(opts);
e83bebef 304
e83bebef
LP
305 return 0;
306}
307
88614c8a 308int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
ad85779a
LP
309 _cleanup_free_ char *upper = NULL, *destination = NULL;
310 _cleanup_strv_free_ char **lower = NULL;
311 CustomMount *m;
448f7377 312 int r, k;
ad85779a 313
90e30d76 314 k = strv_split_full(&lower, s, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
86c0dd4a
LP
315 if (k < 0)
316 return k;
ad85779a
LP
317 if (k < 2)
318 return -EADDRNOTAVAIL;
319 if (k == 2) {
448f7377
DDM
320 _cleanup_free_ char *p = NULL;
321
86c0dd4a
LP
322 /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
323 * we'll also define the destination mount point the same as the upper. */
324
448f7377
DDM
325 r = source_path_parse(lower[0], &p);
326 if (r < 0)
327 return r;
328
329 free_and_replace(lower[0], p);
330
331 r = source_path_parse(lower[1], &p);
332 if (r < 0)
333 return r;
334
335 free_and_replace(lower[1], p);
86c0dd4a 336
ae2a15bc 337 upper = TAKE_PTR(lower[1]);
ad85779a 338
86c0dd4a 339 destination = strdup(upper[0] == '+' ? upper+1 : upper); /* take the destination without "+" prefix */
ad85779a
LP
340 if (!destination)
341 return -ENOMEM;
ad85779a 342 } else {
448f7377
DDM
343 _cleanup_free_ char *p = NULL;
344
86c0dd4a
LP
345 /* If more than two parameters are specified, the last one is the destination, the second to last one
346 * the "upper", and all before that the "lower" directories. */
347
ad85779a 348 destination = lower[k - 1];
ae2a15bc 349 upper = TAKE_PTR(lower[k - 2]);
86c0dd4a 350
448f7377
DDM
351 STRV_FOREACH(i, lower) {
352 r = source_path_parse(*i, &p);
353 if (r < 0)
354 return r;
355
356 free_and_replace(*i, p);
357 }
c7a4890c
LP
358
359 /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
360 * in /var/tmp */
448f7377
DDM
361 r = source_path_parse_nullable(upper, &p);
362 if (r < 0)
363 return r;
364
365 free_and_replace(upper, p);
c7a4890c 366
86c0dd4a
LP
367 if (!path_is_absolute(destination))
368 return -EINVAL;
ad85779a
LP
369 }
370
371 m = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY);
372 if (!m)
373 return -ENOMEM;
374
1cc6c93a
YW
375 m->destination = TAKE_PTR(destination);
376 m->source = TAKE_PTR(upper);
377 m->lower = TAKE_PTR(lower);
ad85779a
LP
378 m->read_only = read_only;
379
ad85779a
LP
380 return 0;
381}
382
de40a303
LP
383int inaccessible_mount_parse(CustomMount **l, size_t *n, const char *s) {
384 _cleanup_free_ char *path = NULL;
385 CustomMount *m;
386
387 assert(l);
388 assert(n);
389 assert(s);
390
391 if (!path_is_absolute(s))
392 return -EINVAL;
393
394 path = strdup(s);
395 if (!path)
396 return -ENOMEM;
397
398 m = custom_mount_add(l, n, CUSTOM_MOUNT_INACCESSIBLE);
399 if (!m)
400 return -ENOMEM;
401
402 m->destination = TAKE_PTR(path);
403 return 0;
404}
405
04029482 406int tmpfs_patch_options(
e83bebef 407 const char *options,
2fa017f1 408 uid_t uid_shift,
e83bebef
LP
409 const char *selinux_apifs_context,
410 char **ret) {
411
cfea7618 412 _cleanup_free_ char *buf = NULL;
e83bebef 413
cfea7618 414 assert(ret);
e83bebef 415
cfea7618
YW
416 if (options) {
417 buf = strdup(options);
418 if (!buf)
419 return -ENOMEM;
e83bebef
LP
420 }
421
cfea7618
YW
422 if (uid_shift != UID_INVALID)
423 if (strextendf_with_separator(&buf, ",", "uid=" UID_FMT ",gid=" UID_FMT, uid_shift, uid_shift) < 0)
e83bebef 424 return -ENOMEM;
e83bebef 425
cfea7618
YW
426#if HAVE_SELINUX
427 if (selinux_apifs_context)
d3d15594 428 if (strextendf_with_separator(&buf, ",", "context=\"%s\"", selinux_apifs_context) < 0)
0996ef00 429 return -ENOMEM;
cfea7618 430#endif
0996ef00 431
cfea7618
YW
432 *ret = TAKE_PTR(buf);
433 return !!*ret;
e83bebef
LP
434}
435
4f086aab 436int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
0af7e294 437 _cleanup_free_ char *top = NULL, *full = NULL;;
4f086aab 438 unsigned long extra_flags = 0;
0af7e294
LP
439 int r;
440
441 top = path_join(dest, "/sys");
442 if (!top)
443 return log_oom();
d8fc6a00 444
0af7e294 445 r = path_is_mount_point(top);
d1678248 446 if (r < 0)
0af7e294
LP
447 return log_error_errno(r, "Failed to determine if '%s' is a mountpoint: %m", top);
448 if (r == 0) {
449 /* If this is not a mount point yet, then mount a tmpfs there */
450 r = mount_nofollow_verbose(LOG_ERR, "tmpfs", top, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0555" TMPFS_LIMITS_SYS);
451 if (r < 0)
452 return r;
453 } else {
454 r = path_is_fs_type(top, SYSFS_MAGIC);
455 if (r < 0)
456 return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
457
458 /* /sys/ might already be mounted as sysfs by the outer child in the !netns case. In this case, it's
459 * all good. Don't touch it because we don't have the right to do so, see
460 * https://github.com/systemd/systemd/issues/1555.
461 */
462 if (r > 0)
463 return 0;
464 }
d1678248 465
0af7e294
LP
466 full = path_join(top, "/full");
467 if (!full)
468 return log_oom();
d8fc6a00 469
8f9ea89c
LP
470 if (mkdir(full, 0755) < 0 && errno != EEXIST)
471 return log_error_errno(errno, "Failed to create directory '%s': %m", full);
d8fc6a00 472
bd6609eb 473 if (FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO))
4f086aab
SU
474 extra_flags |= MS_RDONLY;
475
511a8cfe
LP
476 r = mount_nofollow_verbose(LOG_ERR, "sysfs", full, "sysfs",
477 MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL);
60e76d48
ZJS
478 if (r < 0)
479 return r;
d8fc6a00
LP
480
481 FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
482 _cleanup_free_ char *from = NULL, *to = NULL;
483
c6134d3e 484 from = path_join(full, x);
d8fc6a00
LP
485 if (!from)
486 return log_oom();
487
c6134d3e 488 to = path_join(top, x);
d8fc6a00
LP
489 if (!to)
490 return log_oom();
491
492 (void) mkdir(to, 0755);
493
511a8cfe 494 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
60e76d48
ZJS
495 if (r < 0)
496 return r;
d8fc6a00 497
511a8cfe
LP
498 r = mount_nofollow_verbose(LOG_ERR, NULL, to, NULL,
499 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
60e76d48
ZJS
500 if (r < 0)
501 return r;
d8fc6a00
LP
502 }
503
30f5d104 504 r = umount_verbose(LOG_ERR, full, UMOUNT_NOFOLLOW);
60e76d48
ZJS
505 if (r < 0)
506 return r;
d8fc6a00
LP
507
508 if (rmdir(full) < 0)
509 return log_error_errno(errno, "Failed to remove %s: %m", full);
510
0af7e294
LP
511 /* Create mountpoint for cgroups. Otherwise we are not allowed since we remount /sys/ read-only. */
512 _cleanup_free_ char *x = path_join(top, "/fs/cgroup");
513 if (!x)
514 return log_oom();
515
677a72cd 516 (void) mkdir_p(x, 0755);
d8fc6a00 517
511a8cfe
LP
518 return mount_nofollow_verbose(LOG_ERR, NULL, top, NULL,
519 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
d8fc6a00
LP
520}
521
b71a0192
CB
/* Mount flags used for the /proc entry of the mount table. */
#define PROC_DEFAULT_MOUNT_FLAGS (MS_NOSUID|MS_NOEXEC|MS_NODEV)
/* Mount flags used for the read-only /sys entry of the mount table: the same set, plus read-only. */
#define SYS_DEFAULT_MOUNT_FLAGS  (MS_RDONLY|PROC_DEFAULT_MOUNT_FLAGS)
524
e83bebef 525int mount_all(const char *dest,
4f086aab 526 MountSettingsMask mount_settings,
2fa017f1 527 uid_t uid_shift,
e83bebef
LP
528 const char *selinux_apifs_context) {
529
de40a303
LP
530#define PROC_INACCESSIBLE_REG(path) \
531 { "/run/systemd/inaccessible/reg", (path), NULL, NULL, MS_BIND, \
532 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
d4b653c5
LP
533 { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
534 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
535
536#define PROC_READ_ONLY(path) \
537 { (path), (path), NULL, NULL, MS_BIND, \
538 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
539 { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
540 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
541
e83bebef
LP
542 typedef struct MountPoint {
543 const char *what;
544 const char *where;
545 const char *type;
546 const char *options;
547 unsigned long flags;
4f086aab 548 MountSettingsMask mount_settings;
e83bebef
LP
549 } MountPoint;
550
551 static const MountPoint mount_table[] = {
0af7e294 552 /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing when we are privileged) */
b71a0192 553 { "proc", "/proc", "proc", NULL, PROC_DEFAULT_MOUNT_FLAGS,
511a8cfe 554 MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_MKDIR|MOUNT_FOLLOW_SYMLINKS }, /* we follow symlinks here since not following them requires /proc/ already being mounted, which we don't have here. */
d4b653c5
LP
555
556 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND,
557 MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
558
559 { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND,
560 MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
561
562 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
563 MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
564
565 /* Make these files inaccessible to container payloads: they potentially leak information about kernel
566 * internals or the host's execution environment to the container */
de40a303
LP
567 PROC_INACCESSIBLE_REG("/proc/kallsyms"),
568 PROC_INACCESSIBLE_REG("/proc/kcore"),
569 PROC_INACCESSIBLE_REG("/proc/keys"),
570 PROC_INACCESSIBLE_REG("/proc/sysrq-trigger"),
571 PROC_INACCESSIBLE_REG("/proc/timer_list"),
d4b653c5
LP
572
573 /* Make these directories read-only to container payloads: they show hardware information, and in some
574 * cases contain tunables the container really shouldn't have access to. */
575 PROC_READ_ONLY("/proc/acpi"),
576 PROC_READ_ONLY("/proc/apm"),
577 PROC_READ_ONLY("/proc/asound"),
578 PROC_READ_ONLY("/proc/bus"),
579 PROC_READ_ONLY("/proc/fs"),
580 PROC_READ_ONLY("/proc/irq"),
581 PROC_READ_ONLY("/proc/scsi"),
582
e1bb4b0d 583 { "mqueue", "/dev/mqueue", "mqueue", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
dcff2fa5 584 MOUNT_IN_USERNS|MOUNT_MKDIR },
849b9b85 585
0af7e294 586 /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing when we are privileged) */
9f563f27 587 { "tmpfs", "/tmp", "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
611ae598 588 MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR|MOUNT_USRQUOTA_GRACEFUL },
9f563f27 589 { "tmpfs", "/sys", "tmpfs", "mode=0555" TMPFS_LIMITS_SYS, MS_NOSUID|MS_NOEXEC|MS_NODEV,
46b7e967 590 MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR|MOUNT_UNMANAGED },
b71a0192 591 { "sysfs", "/sys", "sysfs", NULL, SYS_DEFAULT_MOUNT_FLAGS,
46b7e967 592 MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR|MOUNT_UNMANAGED }, /* skipped if above was mounted */
e1bb4b0d 593 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
46b7e967 594 MOUNT_FATAL|MOUNT_MKDIR|MOUNT_UNMANAGED }, /* skipped if above was mounted */
9f563f27 595 { "tmpfs", "/dev", "tmpfs", "mode=0755" TMPFS_LIMITS_PRIVATE_DEV, MS_NOSUID|MS_STRICTATIME,
dcff2fa5 596 MOUNT_FATAL|MOUNT_MKDIR },
9f563f27 597 { "tmpfs", "/dev/shm", "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
611ae598 598 MOUNT_FATAL|MOUNT_MKDIR|MOUNT_USRQUOTA_GRACEFUL },
9f563f27 599 { "tmpfs", "/run", "tmpfs", "mode=0755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
dcff2fa5 600 MOUNT_FATAL|MOUNT_MKDIR },
d64e32c2
LP
601 { "/run/host", "/run/host", NULL, NULL, MS_BIND,
602 MOUNT_FATAL|MOUNT_MKDIR|MOUNT_PREFIX_ROOT }, /* Prepare this so that we can make it read-only when we are done */
603 { "/etc/os-release", "/run/host/os-release", NULL, NULL, MS_BIND,
604 MOUNT_TOUCH }, /* As per kernel interface requirements, bind mount first (creating mount points) and make read-only later */
605 { "/usr/lib/os-release", "/run/host/os-release", NULL, NULL, MS_BIND,
606 MOUNT_FATAL }, /* If /etc/os-release doesn't exist use the version in /usr/lib as fallback */
607 { NULL, "/run/host/os-release", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
608 MOUNT_FATAL },
56339a10
LP
609 { NULL, "/run/host/os-release", NULL, NULL, MS_PRIVATE,
610 MOUNT_FATAL }, /* Turn off propagation (we only want that for the mount propagation tunnel dir) */
d64e32c2
LP
611 { NULL, "/run/host", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
612 MOUNT_FATAL|MOUNT_IN_USERNS },
349cc4a5 613#if HAVE_SELINUX
e1bb4b0d 614 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,
0af7e294 615 MOUNT_MKDIR|MOUNT_PRIVILEGED }, /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */
e1bb4b0d 616 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
46b7e967 617 MOUNT_UNMANAGED|MOUNT_PRIVILEGED }, /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */
56339a10 618 { NULL, "/sys/fs/selinux", NULL, NULL, MS_PRIVATE,
46b7e967 619 MOUNT_UNMANAGED|MOUNT_PRIVILEGED }, /* Turn off propagation (we only want that for the mount propagation tunnel dir) */
e83bebef
LP
620#endif
621 };
622
bd6609eb
DDM
623 bool use_userns = FLAGS_SET(mount_settings, MOUNT_USE_USERNS);
624 bool netns = FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_NETNS);
625 bool ro = FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO);
626 bool in_userns = FLAGS_SET(mount_settings, MOUNT_IN_USERNS);
627 bool tmpfs_tmp = FLAGS_SET(mount_settings, MOUNT_APPLY_TMPFS_TMP);
46b7e967 628 bool unmanaged = FLAGS_SET(mount_settings, MOUNT_UNMANAGED);
0af7e294 629 bool privileged = FLAGS_SET(mount_settings, MOUNT_PRIVILEGED);
88614c8a 630 int r;
e83bebef 631
ddb8a639 632 FOREACH_ELEMENT(m, mount_table) {
d64e32c2 633 _cleanup_free_ char *where = NULL, *options = NULL, *prefixed = NULL;
ddb8a639 634 bool fatal = FLAGS_SET(m->mount_settings, MOUNT_FATAL);
d64e32c2 635 const char *o;
4f086aab 636
46b7e967
LP
637 /* If we are in managed user namespace mode but the entry is marked for mount outside of
638 * managed user namespace mode, and to be mounted outside the user namespace, then skip it */
639 if (!unmanaged && FLAGS_SET(m->mount_settings, MOUNT_UNMANAGED) && !FLAGS_SET(m->mount_settings, MOUNT_IN_USERNS))
0af7e294
LP
640 continue;
641
ddb8a639 642 if (in_userns != FLAGS_SET(m->mount_settings, MOUNT_IN_USERNS))
4f086aab 643 continue;
e83bebef 644
ddb8a639 645 if (!netns && FLAGS_SET(m->mount_settings, MOUNT_APPLY_APIVFS_NETNS))
d1678248
ILG
646 continue;
647
ddb8a639 648 if (!ro && FLAGS_SET(m->mount_settings, MOUNT_APPLY_APIVFS_RO))
e83bebef
LP
649 continue;
650
ddb8a639 651 if (!tmpfs_tmp && FLAGS_SET(m->mount_settings, MOUNT_APPLY_TMPFS_TMP))
1099ceeb
LP
652 continue;
653
46b7e967
LP
654 if (!privileged && FLAGS_SET(m->mount_settings, MOUNT_PRIVILEGED))
655 continue;
656
ddb8a639 657 r = chase(m->where, dest, CHASE_NONEXISTENT|CHASE_PREFIX_ROOT, &where, NULL);
8ce48cf0 658 if (r < 0)
ddb8a639 659 return log_error_errno(r, "Failed to resolve %s%s: %m", strempty(dest), m->where);
e83bebef 660
e83bebef 661 /* Skip this entry if it is not a remount. */
ddb8a639 662 if (m->what) {
b409aacb 663 r = path_is_mount_point(where);
de40a303
LP
664 if (r < 0 && r != -ENOENT)
665 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
666 if (r > 0)
667 continue;
668 }
e83bebef 669
ddb8a639 670 if ((m->mount_settings & (MOUNT_MKDIR|MOUNT_TOUCH)) != 0) {
b3b1a08a 671 uid_t u = (use_userns && !in_userns) ? uid_shift : UID_INVALID;
e1bb4b0d 672
ddb8a639 673 if (FLAGS_SET(m->mount_settings, MOUNT_TOUCH))
e1bb4b0d
LB
674 r = mkdir_parents_safe(dest, where, 0755, u, u, 0);
675 else
676 r = mkdir_p_safe(dest, where, 0755, u, u, 0);
dcff2fa5
LP
677 if (r < 0 && r != -EEXIST) {
678 if (fatal && r != -EROFS)
679 return log_error_errno(r, "Failed to create directory %s: %m", where);
e83bebef 680
dcff2fa5
LP
681 log_debug_errno(r, "Failed to create directory %s: %m", where);
682
683 /* If we failed mkdir() or chown() due to the root directory being read only,
684 * attempt to mount this fs anyway and let mount_verbose log any errors */
685 if (r != -EROFS)
686 continue;
687 }
d64e32c2
LP
688 }
689
ddb8a639 690 if (FLAGS_SET(m->mount_settings, MOUNT_TOUCH)) {
d64e32c2
LP
691 r = touch(where);
692 if (r < 0 && r != -EEXIST) {
693 if (fatal && r != -EROFS)
694 return log_error_errno(r, "Failed to create file %s: %m", where);
695
696 log_debug_errno(r, "Failed to create file %s: %m", where);
697 if (r != -EROFS)
698 continue;
e1bb4b0d 699 }
e83bebef
LP
700 }
701
ddb8a639
I
702 o = m->options;
703 if (streq_ptr(m->type, "tmpfs")) {
2fa017f1 704 r = tmpfs_patch_options(o, in_userns ? 0 : uid_shift, selinux_apifs_context, &options);
e83bebef
LP
705 if (r < 0)
706 return log_oom();
707 if (r > 0)
708 o = options;
709 }
710
611ae598
LP
711 if (FLAGS_SET(m->mount_settings, MOUNT_USRQUOTA_GRACEFUL)) {
712 r = mount_option_supported(m->type, /* key= */ "usrquota", /* value= */ NULL);
713 if (r < 0)
714 log_warning_errno(r, "Failed to determine if '%s' supports 'usrquota', assuming it doesn't: %m", m->type);
715 else if (r == 0)
8fa7863a 716 log_debug("Kernel doesn't support 'usrquota' on '%s', not including in mount options for '%s'.", m->type, m->where);
611ae598
LP
717 else {
718 _cleanup_free_ char *joined = NULL;
719
720 if (!strextend_with_separator(&joined, ",", o ?: POINTER_MAX, "usrquota"))
721 return log_oom();
722
723 free_and_replace(options, joined);
724 o = options;
725 }
726 }
727
ddb8a639 728 if (FLAGS_SET(m->mount_settings, MOUNT_PREFIX_ROOT)) {
d64e32c2
LP
729 /* Optionally prefix the mount source with the root dir. This is useful in bind
730 * mounts to be created within the container image before we transition into it. Note
fcdd21ec 731 * that MOUNT_IN_USERNS is run after we transitioned hence prefixing is not necessary
d64e32c2 732 * for those. */
ddb8a639 733 r = chase(m->what, dest, CHASE_PREFIX_ROOT, &prefixed, NULL);
d64e32c2 734 if (r < 0)
ddb8a639 735 return log_error_errno(r, "Failed to resolve %s%s: %m", strempty(dest), m->what);
d64e32c2
LP
736 }
737
511a8cfe
LP
738 r = mount_verbose_full(
739 fatal ? LOG_ERR : LOG_DEBUG,
ddb8a639 740 prefixed ?: m->what,
511a8cfe 741 where,
ddb8a639
I
742 m->type,
743 m->flags,
511a8cfe 744 o,
ddb8a639 745 FLAGS_SET(m->mount_settings, MOUNT_FOLLOW_SYMLINKS));
4f086aab 746 if (r < 0 && fatal)
60e76d48 747 return r;
e83bebef
LP
748 }
749
750 return 0;
751}
752
1aa18710 753static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts, RemountIdmapping *idmapping) {
a11fd406
ILG
754 unsigned long flags = *mount_flags;
755 char *opts = NULL;
1aa18710 756 RemountIdmapping new_idmapping = *idmapping;
a11fd406
ILG
757 int r;
758
759 assert(options);
760
761 for (;;) {
762 _cleanup_free_ char *word = NULL;
763
d2b99ed7 764 r = extract_first_word(&options, &word, ",", 0);
a11fd406
ILG
765 if (r < 0)
766 return log_error_errno(r, "Failed to extract mount option: %m");
767 if (r == 0)
768 break;
769
770 if (streq(word, "rbind"))
771 flags |= MS_REC;
772 else if (streq(word, "norbind"))
773 flags &= ~MS_REC;
c0c8f718 774 else if (streq(word, "idmap"))
1aa18710 775 new_idmapping = REMOUNT_IDMAPPING_HOST_ROOT;
c0c8f718 776 else if (streq(word, "noidmap"))
1aa18710 777 new_idmapping = REMOUNT_IDMAPPING_NONE;
2b2777ed
QD
778 else if (streq(word, "rootidmap"))
779 new_idmapping = REMOUNT_IDMAPPING_HOST_OWNER;
614d09a3
BF
780 else if (streq(word, "owneridmap"))
781 new_idmapping = REMOUNT_IDMAPPING_HOST_OWNER_TO_TARGET_OWNER;
d2b99ed7 782 else
38288f0b 783 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
d2b99ed7 784 "Invalid bind mount option: %s", word);
a11fd406
ILG
785 }
786
787 *mount_flags = flags;
1aa18710 788 *idmapping = new_idmapping;
a11fd406
ILG
789 /* in the future mount_opts will hold string options for mount(2) */
790 *mount_opts = opts;
791
792 return 0;
793}
794
c0c8f718 795static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t uid_range) {
a11fd406
ILG
796 _cleanup_free_ char *mount_opts = NULL, *where = NULL;
797 unsigned long mount_flags = MS_BIND | MS_REC;
68cf43c3 798 struct stat source_st, dest_st;
614d09a3 799 uid_t dest_uid = UID_INVALID;
e83bebef 800 int r;
1aa18710 801 RemountIdmapping idmapping = REMOUNT_IDMAPPING_NONE;
e83bebef 802
86c0dd4a 803 assert(dest);
e83bebef
LP
804 assert(m);
805
a11fd406 806 if (m->options) {
1aa18710 807 r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts, &idmapping);
a11fd406
ILG
808 if (r < 0)
809 return r;
810 }
811
07bca16f
LP
812 /* If this is a bind mount from a temporary sources change ownership of the source to the container's
813 * root UID. Otherwise it would always show up as "nobody" if user namespacing is used. */
814 if (m->rm_rf_tmpdir && chown(m->source, uid_shift, uid_shift) < 0)
815 return log_error_errno(errno, "Failed to chown %s: %m", m->source);
816
e83bebef
LP
817 if (stat(m->source, &source_st) < 0)
818 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
819
f461a28d 820 r = chase(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL);
68cf43c3 821 if (r < 0)
ec57bd42 822 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
8ce48cf0
LP
823 if (r > 0) { /* Path exists already? */
824
825 if (stat(where, &dest_st) < 0)
826 return log_error_errno(errno, "Failed to stat %s: %m", where);
e83bebef 827
614d09a3
BF
828 dest_uid = dest_st.st_uid;
829
baaa35ad
ZJS
830 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode))
831 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
832 "Cannot bind mount directory %s on file %s.",
833 m->source, where);
834
835 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode))
836 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
837 "Cannot bind mount file %s on directory %s.",
838 m->source, where);
e83bebef 839
8ce48cf0 840 } else { /* Path doesn't exist yet? */
0a67965f 841 r = mkdir_parents_safe_label(dest, where, 0755, uid_shift, uid_shift, MKDIR_IGNORE_EXISTING);
e83bebef
LP
842 if (r < 0)
843 return log_error_errno(r, "Failed to make parents of %s: %m", where);
b97e83cb
BN
844
845 /* Create the mount point. Any non-directory file can be
846 * mounted on any non-directory file (regular, fifo, socket,
847 * char, block).
848 */
849 if (S_ISDIR(source_st.st_mode))
850 r = mkdir_label(where, 0755);
851 else
852 r = touch(where);
853 if (r < 0)
854 return log_error_errno(r, "Failed to create mount point %s: %m", where);
0a67965f
DDM
855
856 if (chown(where, uid_shift, uid_shift) < 0)
857 return log_error_errno(errno, "Failed to chown %s: %m", where);
614d09a3
BF
858
859 dest_uid = uid_shift;
8ce48cf0 860 }
e83bebef 861
511a8cfe 862 r = mount_nofollow_verbose(LOG_ERR, m->source, where, NULL, mount_flags, mount_opts);
60e76d48
ZJS
863 if (r < 0)
864 return r;
e83bebef
LP
865
866 if (m->read_only) {
64e82c19 867 r = bind_remount_recursive(where, MS_RDONLY, MS_RDONLY, NULL);
e83bebef
LP
868 if (r < 0)
869 return log_error_errno(r, "Read-only bind mount failed: %m");
870 }
871
1aa18710 872 if (idmapping != REMOUNT_IDMAPPING_NONE) {
614d09a3 873 r = remount_idmap(STRV_MAKE(where), uid_shift, uid_range, source_st.st_uid, dest_uid, idmapping);
c0c8f718
AV
874 if (r < 0)
875 return log_error_errno(r, "Failed to map ids for bind mount %s: %m", where);
876 }
877
e83bebef
LP
878 return 0;
879}
880
e091a5df 881static int mount_tmpfs(const char *dest, CustomMount *m, uid_t uid_shift, const char *selinux_apifs_context) {
68cf43c3
LP
882 const char *options;
883 _cleanup_free_ char *buf = NULL, *where = NULL;
e83bebef
LP
884 int r;
885
886 assert(dest);
887 assert(m);
888
f461a28d 889 r = chase(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL);
68cf43c3 890 if (r < 0)
ec57bd42 891 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
8ce48cf0
LP
892 if (r == 0) { /* Doesn't exist yet? */
893 r = mkdir_p_label(where, 0755);
894 if (r < 0)
895 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
896 }
e83bebef 897
2fa017f1 898 r = tmpfs_patch_options(m->options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
e83bebef
LP
899 if (r < 0)
900 return log_oom();
901 options = r > 0 ? buf : m->options;
902
511a8cfe 903 return mount_nofollow_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
e83bebef
LP
904}
905
86c0dd4a 906static char *joined_and_escaped_lower_dirs(char **lower) {
e83bebef
LP
907 _cleanup_strv_free_ char **sv = NULL;
908
909 sv = strv_copy(lower);
910 if (!sv)
911 return NULL;
912
913 strv_reverse(sv);
914
915 if (!strv_shell_escape(sv, ",:"))
916 return NULL;
917
918 return strv_join(sv, ":");
919}
920
921static int mount_overlay(const char *dest, CustomMount *m) {
86c0dd4a 922 _cleanup_free_ char *lower = NULL, *where = NULL, *escaped_source = NULL;
68cf43c3 923 const char *options;
e83bebef
LP
924 int r;
925
926 assert(dest);
927 assert(m);
928
f461a28d 929 r = chase(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL);
68cf43c3 930 if (r < 0)
ec57bd42 931 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
8ce48cf0
LP
932 if (r == 0) { /* Doesn't exist yet? */
933 r = mkdir_label(where, 0755);
934 if (r < 0)
935 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
936 }
e83bebef
LP
937
938 (void) mkdir_p_label(m->source, 0755);
939
940 lower = joined_and_escaped_lower_dirs(m->lower);
941 if (!lower)
942 return log_oom();
943
86c0dd4a
LP
944 escaped_source = shell_escape(m->source, ",:");
945 if (!escaped_source)
946 return log_oom();
e83bebef 947
86c0dd4a 948 if (m->read_only)
e83bebef 949 options = strjoina("lowerdir=", escaped_source, ":", lower);
86c0dd4a
LP
950 else {
951 _cleanup_free_ char *escaped_work_dir = NULL;
e83bebef 952
e83bebef
LP
953 escaped_work_dir = shell_escape(m->work_dir, ",:");
954 if (!escaped_work_dir)
955 return log_oom();
956
957 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
958 }
959
511a8cfe 960 return mount_nofollow_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options);
e83bebef
LP
961}
962
de40a303 963static int mount_inaccessible(const char *dest, CustomMount *m) {
e5f10caf 964 _cleanup_free_ char *where = NULL, *source = NULL;
de40a303
LP
965 struct stat st;
966 int r;
967
968 assert(dest);
969 assert(m);
970
f461a28d 971 r = chase_and_stat(m->destination, dest, CHASE_PREFIX_ROOT, &where, &st);
de40a303
LP
972 if (r < 0) {
973 log_full_errno(m->graceful ? LOG_DEBUG : LOG_ERR, r, "Failed to resolve %s/%s: %m", dest, m->destination);
974 return m->graceful ? 0 : r;
975 }
976
48b747fa 977 r = mode_to_inaccessible_node(NULL, st.st_mode, &source);
e5f10caf
AZ
978 if (r < 0)
979 return m->graceful ? 0 : r;
de40a303 980
511a8cfe 981 r = mount_nofollow_verbose(m->graceful ? LOG_DEBUG : LOG_ERR, source, where, NULL, MS_BIND, NULL);
de40a303
LP
982 if (r < 0)
983 return m->graceful ? 0 : r;
984
511a8cfe 985 r = mount_nofollow_verbose(m->graceful ? LOG_DEBUG : LOG_ERR, NULL, where, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, NULL);
2c9b7a7e 986 if (r < 0) {
30f5d104 987 (void) umount_verbose(m->graceful ? LOG_DEBUG : LOG_ERR, where, UMOUNT_NOFOLLOW);
de40a303 988 return m->graceful ? 0 : r;
2c9b7a7e 989 }
de40a303
LP
990
991 return 0;
992}
993
994static int mount_arbitrary(const char *dest, CustomMount *m) {
995 _cleanup_free_ char *where = NULL;
996 int r;
997
998 assert(dest);
999 assert(m);
1000
f461a28d 1001 r = chase(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL);
de40a303
LP
1002 if (r < 0)
1003 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
1004 if (r == 0) { /* Doesn't exist yet? */
1005 r = mkdir_p_label(where, 0755);
1006 if (r < 0)
1007 return log_error_errno(r, "Creating mount point for mount %s failed: %m", where);
1008 }
1009
511a8cfe 1010 return mount_nofollow_verbose(LOG_ERR, m->source, where, m->type_argument, 0, m->options);
de40a303
LP
1011}
1012
e83bebef
LP
1013int mount_custom(
1014 const char *dest,
88614c8a 1015 CustomMount *mounts, size_t n,
e091a5df 1016 uid_t uid_shift,
c0c8f718 1017 uid_t uid_range,
de40a303 1018 const char *selinux_apifs_context,
5f0a6347 1019 MountSettingsMask mount_settings) {
e83bebef
LP
1020 int r;
1021
1022 assert(dest);
1023
1236f06c 1024 FOREACH_ARRAY(m, mounts, n) {
bd6609eb 1025 if (FLAGS_SET(mount_settings, MOUNT_IN_USERNS) != m->in_userns)
5f0a6347
DDM
1026 continue;
1027
bd6609eb 1028 if (FLAGS_SET(mount_settings, MOUNT_ROOT_ONLY) && !path_equal(m->destination, "/"))
5f0a6347
DDM
1029 continue;
1030
bd6609eb 1031 if (FLAGS_SET(mount_settings, MOUNT_NON_ROOT_ONLY) && path_equal(m->destination, "/"))
de40a303
LP
1032 continue;
1033
e83bebef
LP
1034 switch (m->type) {
1035
1036 case CUSTOM_MOUNT_BIND:
c0c8f718 1037 r = mount_bind(dest, m, uid_shift, uid_range);
e83bebef
LP
1038 break;
1039
1040 case CUSTOM_MOUNT_TMPFS:
e091a5df 1041 r = mount_tmpfs(dest, m, uid_shift, selinux_apifs_context);
e83bebef
LP
1042 break;
1043
1044 case CUSTOM_MOUNT_OVERLAY:
1045 r = mount_overlay(dest, m);
1046 break;
1047
de40a303
LP
1048 case CUSTOM_MOUNT_INACCESSIBLE:
1049 r = mount_inaccessible(dest, m);
1050 break;
1051
1052 case CUSTOM_MOUNT_ARBITRARY:
1053 r = mount_arbitrary(dest, m);
1054 break;
1055
e83bebef 1056 default:
04499a70 1057 assert_not_reached();
e83bebef
LP
1058 }
1059
1060 if (r < 0)
1061 return r;
1062 }
1063
1064 return 0;
1065}
1066
bbd407ea 1067bool has_custom_root_mount(const CustomMount *mounts, size_t n) {
1236f06c
YW
1068 FOREACH_ARRAY(m, mounts, n)
1069 if (path_equal(m->destination, "/"))
bbd407ea 1070 return true;
bbd407ea
DDM
1071
1072 return false;
1073}
1074
2c2511aa 1075static int setup_volatile_state(const char *directory) {
e83bebef
LP
1076 int r;
1077
1078 assert(directory);
1079
e5b43a04 1080 /* --volatile=state means we simply overmount /var with a tmpfs, and the rest read-only. */
e83bebef 1081
2c2511aa 1082 /* First, remount the root directory. */
64e82c19 1083 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
e83bebef
LP
1084 if (r < 0)
1085 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1086
2c2511aa
YW
1087 return 0;
1088}
1089
1090static int setup_volatile_state_after_remount_idmap(const char *directory, uid_t uid_shift, const char *selinux_apifs_context) {
1091 _cleanup_free_ char *buf = NULL;
2c2511aa
YW
1092 int r;
1093
1094 assert(directory);
1095
1096 /* Then, after remount_idmap(), overmount /var/ with a tmpfs. */
1097
1fbfbe81
DDM
1098 _cleanup_free_ char *p = path_join(directory, "/var");
1099 if (!p)
1100 return log_oom();
1101
e83bebef
LP
1102 r = mkdir(p, 0755);
1103 if (r < 0 && errno != EEXIST)
1104 return log_error_errno(errno, "Failed to create %s: %m", directory);
1105
1fbfbe81 1106 const char *options = "mode=0755" TMPFS_LIMITS_VOLATILE_STATE;
2fa017f1 1107 r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
e83bebef
LP
1108 if (r < 0)
1109 return log_oom();
1110 if (r > 0)
1111 options = buf;
1112
511a8cfe 1113 return mount_nofollow_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options);
e83bebef
LP
1114}
1115
e091a5df 1116static int setup_volatile_yes(const char *directory, uid_t uid_shift, const char *selinux_apifs_context) {
e83bebef 1117 bool tmpfs_mounted = false, bind_mounted = false;
bf37a69c 1118 _cleanup_(rmdir_and_freep) char *template = NULL;
1fbfbe81 1119 _cleanup_free_ char *buf = NULL, *bindir = NULL, *f = NULL, *t = NULL;
07b9f3f0 1120 struct stat st;
e83bebef
LP
1121 int r;
1122
1123 assert(directory);
1124
07b9f3f0
LP
1125 /* --volatile=yes means we mount a tmpfs to the root dir, and the original /usr to use inside it, and
1126 * that read-only. Before we start setting this up let's validate if the image has the /usr merge
1127 * implemented, and let's output a friendly log message if it hasn't. */
1128
1129 bindir = path_join(directory, "/bin");
1130 if (!bindir)
1131 return log_oom();
1132 if (lstat(bindir, &st) < 0) {
1133 if (errno != ENOENT)
1134 return log_error_errno(errno, "Failed to stat /bin directory below image: %m");
1135
1136 /* ENOENT is fine, just means the image is probably just a naked /usr and we can create the
1137 * rest. */
1138 } else if (S_ISDIR(st.st_mode))
1139 return log_error_errno(SYNTHETIC_ERRNO(EISDIR),
1140 "Sorry, --volatile=yes mode is not supported with OS images that have not merged /bin/, /sbin/, /lib/, /lib64/ into /usr/. "
1141 "Please work with your distribution and help them adopt the merged /usr scheme.");
1142 else if (!S_ISLNK(st.st_mode))
1143 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1144 "Error starting image: if --volatile=yes is used /bin must be a symlink (for merged /usr support) or non-existent (in which case a symlink is created automatically).");
e83bebef 1145
bf37a69c
LP
1146 r = mkdtemp_malloc("/tmp/nspawn-volatile-XXXXXX", &template);
1147 if (r < 0)
1148 return log_error_errno(r, "Failed to create temporary directory: %m");
e83bebef 1149
1fbfbe81 1150 const char *options = "mode=0755" TMPFS_LIMITS_ROOTFS;
2fa017f1 1151 r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
e83bebef 1152 if (r < 0)
c55d0ae7 1153 goto fail;
e83bebef
LP
1154 if (r > 0)
1155 options = buf;
1156
511a8cfe 1157 r = mount_nofollow_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
60e76d48 1158 if (r < 0)
e83bebef 1159 goto fail;
e83bebef
LP
1160
1161 tmpfs_mounted = true;
1162
1fbfbe81
DDM
1163 f = path_join(directory, "/usr");
1164 if (!f) {
1165 r = log_oom();
1166 goto fail;
1167 }
1168
1169 t = path_join(template, "/usr");
1170 if (!t) {
1171 r = log_oom();
1172 goto fail;
1173 }
e83bebef
LP
1174
1175 r = mkdir(t, 0755);
1176 if (r < 0 && errno != EEXIST) {
1177 r = log_error_errno(errno, "Failed to create %s: %m", t);
1178 goto fail;
1179 }
1180
511a8cfe 1181 r = mount_nofollow_verbose(LOG_ERR, f, t, NULL, MS_BIND|MS_REC, NULL);
60e76d48 1182 if (r < 0)
e83bebef 1183 goto fail;
e83bebef
LP
1184
1185 bind_mounted = true;
1186
64e82c19 1187 r = bind_remount_recursive(t, MS_RDONLY, MS_RDONLY, NULL);
e83bebef
LP
1188 if (r < 0) {
1189 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1190 goto fail;
1191 }
1192
511a8cfe 1193 r = mount_nofollow_verbose(LOG_ERR, template, directory, NULL, MS_MOVE, NULL);
60e76d48 1194 if (r < 0)
e83bebef 1195 goto fail;
e83bebef
LP
1196
1197 (void) rmdir(template);
1198
1199 return 0;
1200
1201fail:
1202 if (bind_mounted)
30f5d104 1203 (void) umount_verbose(LOG_ERR, t, UMOUNT_NOFOLLOW);
e83bebef
LP
1204
1205 if (tmpfs_mounted)
30f5d104
LP
1206 (void) umount_verbose(LOG_ERR, template, UMOUNT_NOFOLLOW);
1207
e83bebef
LP
1208 return r;
1209}
b53ede69 1210
e091a5df 1211static int setup_volatile_overlay(const char *directory, uid_t uid_shift, const char *selinux_apifs_context) {
6c610aca 1212 _cleanup_free_ char *buf = NULL, *escaped_directory = NULL, *escaped_upper = NULL, *escaped_work = NULL;
bf37a69c 1213 _cleanup_(rmdir_and_freep) char *template = NULL;
6c610aca
LP
1214 const char *upper, *work, *options;
1215 bool tmpfs_mounted = false;
1216 int r;
1217
1218 assert(directory);
1219
1220 /* --volatile=overlay means we mount an overlayfs to the root dir. */
1221
bf37a69c
LP
1222 r = mkdtemp_malloc("/tmp/nspawn-volatile-XXXXXX", &template);
1223 if (r < 0)
1224 return log_error_errno(r, "Failed to create temporary directory: %m");
6c610aca 1225
9f563f27 1226 options = "mode=0755" TMPFS_LIMITS_ROOTFS;
6c610aca
LP
1227 r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
1228 if (r < 0)
1229 goto finish;
1230 if (r > 0)
1231 options = buf;
1232
511a8cfe 1233 r = mount_nofollow_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
6c610aca
LP
1234 if (r < 0)
1235 goto finish;
1236
1237 tmpfs_mounted = true;
1238
1239 upper = strjoina(template, "/upper");
1240 work = strjoina(template, "/work");
1241
1242 if (mkdir(upper, 0755) < 0) {
1243 r = log_error_errno(errno, "Failed to create %s: %m", upper);
1244 goto finish;
1245 }
1246 if (mkdir(work, 0755) < 0) {
1247 r = log_error_errno(errno, "Failed to create %s: %m", work);
1248 goto finish;
1249 }
1250
1251 /* And now, let's overmount the root dir with an overlayfs that uses the root dir as lower dir. It's kinda nice
1252 * that the kernel allows us to do that without going through some mount point rearrangements. */
1253
1254 escaped_directory = shell_escape(directory, ",:");
1255 escaped_upper = shell_escape(upper, ",:");
1256 escaped_work = shell_escape(work, ",:");
1257 if (!escaped_directory || !escaped_upper || !escaped_work) {
1258 r = -ENOMEM;
1259 goto finish;
1260 }
1261
1262 options = strjoina("lowerdir=", escaped_directory, ",upperdir=", escaped_upper, ",workdir=", escaped_work);
511a8cfe 1263 r = mount_nofollow_verbose(LOG_ERR, "overlay", directory, "overlay", 0, options);
6c610aca
LP
1264
1265finish:
1266 if (tmpfs_mounted)
30f5d104 1267 (void) umount_verbose(LOG_ERR, template, UMOUNT_NOFOLLOW);
6c610aca 1268
6c610aca
LP
1269 return r;
1270}
1271
e5b43a04
LP
1272int setup_volatile_mode(
1273 const char *directory,
1274 VolatileMode mode,
e091a5df 1275 uid_t uid_shift,
e5b43a04
LP
1276 const char *selinux_apifs_context) {
1277
1278 switch (mode) {
1279
1280 case VOLATILE_YES:
e091a5df 1281 return setup_volatile_yes(directory, uid_shift, selinux_apifs_context);
e5b43a04
LP
1282
1283 case VOLATILE_STATE:
2c2511aa 1284 return setup_volatile_state(directory);
e5b43a04 1285
6c610aca 1286 case VOLATILE_OVERLAY:
e091a5df 1287 return setup_volatile_overlay(directory, uid_shift, selinux_apifs_context);
6c610aca 1288
e5b43a04
LP
1289 default:
1290 return 0;
1291 }
1292}
1293
2c2511aa
YW
1294int setup_volatile_mode_after_remount_idmap(
1295 const char *directory,
1296 VolatileMode mode,
1297 uid_t uid_shift,
1298 const char *selinux_apifs_context) {
1299
1300 switch (mode) {
1301
1302 case VOLATILE_STATE:
1303 return setup_volatile_state_after_remount_idmap(directory, uid_shift, selinux_apifs_context);
1304
1305 default:
1306 return 0;
1307 }
1308}
1309
b53ede69
PW
1310/* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
1311int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) {
1312 _cleanup_free_ char *root_new = NULL, *root_old = NULL;
1313 const char *p = s;
1314 int r;
1315
1316 assert(pivot_root_new);
1317 assert(pivot_root_old);
1318
1319 r = extract_first_word(&p, &root_new, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1320 if (r < 0)
1321 return r;
1322 if (r == 0)
1323 return -EINVAL;
1324
1325 if (isempty(p))
1326 root_old = NULL;
1327 else {
1328 root_old = strdup(p);
1329 if (!root_old)
1330 return -ENOMEM;
1331 }
1332
1333 if (!path_is_absolute(root_new))
1334 return -EINVAL;
1335 if (root_old && !path_is_absolute(root_old))
1336 return -EINVAL;
1337
1338 free_and_replace(*pivot_root_new, root_new);
1339 free_and_replace(*pivot_root_old, root_old);
1340
1341 return 0;
1342}
1343
1344int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old) {
1345 _cleanup_free_ char *directory_pivot_root_new = NULL;
1346 _cleanup_free_ char *pivot_tmp_pivot_root_old = NULL;
bf37a69c 1347 _cleanup_(rmdir_and_freep) char *pivot_tmp = NULL;
b53ede69
PW
1348 int r;
1349
1350 assert(directory);
1351
1352 if (!pivot_root_new)
1353 return 0;
1354
1355 /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1356 * If pivot_root_old is NULL, the existing / disappears.
1357 * This requires a temporary directory, pivot_tmp, which is
1358 * not a child of either.
1359 *
32e27670
LP
1360 * This is typically used for OSTree-style containers, where the root partition contains several
1361 * sysroots which could be run. Normally, one would be chosen by the bootloader and pivoted to / by
1362 * initrd.
b53ede69
PW
1363 *
1364 * For example, for an OSTree deployment, pivot_root_new
1365 * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1366 * code doesn’t do the /var mount which OSTree expects: use
1367 * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1368 *
1369 * So in the OSTree case, we’ll end up with something like:
1370 * - directory = /tmp/nspawn-root-123456
1371 * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1372 * - pivot_root_old = /sysroot
1373 * - directory_pivot_root_new =
1374 * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1375 * - pivot_tmp = /tmp/nspawn-pivot-123456
1376 * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1377 *
1378 * Requires all file systems at directory and below to be mounted
1379 * MS_PRIVATE or MS_SLAVE so they can be moved.
1380 */
c6134d3e
LP
1381 directory_pivot_root_new = path_join(directory, pivot_root_new);
1382 if (!directory_pivot_root_new)
1383 return log_oom();
b53ede69
PW
1384
1385 /* Remount directory_pivot_root_new to make it movable. */
511a8cfe 1386 r = mount_nofollow_verbose(LOG_ERR, directory_pivot_root_new, directory_pivot_root_new, NULL, MS_BIND, NULL);
b53ede69 1387 if (r < 0)
bf37a69c 1388 return r;
b53ede69
PW
1389
1390 if (pivot_root_old) {
bf37a69c
LP
1391 r = mkdtemp_malloc("/tmp/nspawn-pivot-XXXXXX", &pivot_tmp);
1392 if (r < 0)
1393 return log_error_errno(r, "Failed to create temporary directory: %m");
b53ede69 1394
c6134d3e 1395 pivot_tmp_pivot_root_old = path_join(pivot_tmp, pivot_root_old);
bf37a69c
LP
1396 if (!pivot_tmp_pivot_root_old)
1397 return log_oom();
b53ede69 1398
511a8cfe 1399 r = mount_nofollow_verbose(LOG_ERR, directory_pivot_root_new, pivot_tmp, NULL, MS_MOVE, NULL);
b53ede69 1400 if (r < 0)
bf37a69c 1401 return r;
b53ede69 1402
511a8cfe 1403 r = mount_nofollow_verbose(LOG_ERR, directory, pivot_tmp_pivot_root_old, NULL, MS_MOVE, NULL);
b53ede69 1404 if (r < 0)
bf37a69c 1405 return r;
b53ede69 1406
511a8cfe 1407 r = mount_nofollow_verbose(LOG_ERR, pivot_tmp, directory, NULL, MS_MOVE, NULL);
bf37a69c 1408 } else
511a8cfe 1409 r = mount_nofollow_verbose(LOG_ERR, directory_pivot_root_new, directory, NULL, MS_MOVE, NULL);
b53ede69 1410
bf37a69c
LP
1411 if (r < 0)
1412 return r;
b53ede69 1413
bf37a69c 1414 return 0;
b53ede69 1415}
b71a0192
CB
1416
1417#define NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS "/run/host/proc"
1418#define NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS "/run/host/sys"
1419
b83358b8 1420int pin_fully_visible_api_fs(void) {
b71a0192
CB
1421 int r;
1422
bf1ef54d
LP
1423 log_debug("Pinning fully visible API FS");
1424
b71a0192
CB
1425 (void) mkdir_p(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, 0755);
1426 (void) mkdir_p(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, 0755);
1427
1428 r = mount_follow_verbose(LOG_ERR, "proc", NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, "proc", PROC_DEFAULT_MOUNT_FLAGS, NULL);
1429 if (r < 0)
1430 return r;
1431
1432 r = mount_follow_verbose(LOG_ERR, "sysfs", NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, "sysfs", SYS_DEFAULT_MOUNT_FLAGS, NULL);
1433 if (r < 0)
1434 return r;
1435
1436 return 0;
1437}
1438
b83358b8 1439static int do_wipe_fully_visible_api_fs(void) {
b71a0192
CB
1440 if (umount2(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, MNT_DETACH) < 0)
1441 return log_error_errno(errno, "Failed to unmount temporary proc: %m");
1442
1443 if (rmdir(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS) < 0)
1444 return log_error_errno(errno, "Failed to remove temporary proc mountpoint: %m");
1445
1446 if (umount2(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, MNT_DETACH) < 0)
1447 return log_error_errno(errno, "Failed to unmount temporary sys: %m");
1448
1449 if (rmdir(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS) < 0)
1450 return log_error_errno(errno, "Failed to remove temporary sys mountpoint: %m");
1451
1452 return 0;
1453}
1454
b83358b8 1455int wipe_fully_visible_api_fs(int mntns_fd) {
b71a0192
CB
1456 _cleanup_close_ int orig_mntns_fd = -EBADF;
1457 int r, rr;
1458
bf1ef54d
LP
1459 log_debug("Wiping fully visible API FS");
1460
8c0da3af
LP
1461 orig_mntns_fd = namespace_open_by_type(NAMESPACE_MOUNT);
1462 if (orig_mntns_fd < 0)
1463 return log_error_errno(orig_mntns_fd, "Failed to pin originating mount namespace: %m");
b71a0192 1464
d2881ef9
YW
1465 r = namespace_enter(/* pidns_fd = */ -EBADF,
1466 mntns_fd,
1467 /* netns_fd = */ -EBADF,
1468 /* userns_fd = */ -EBADF,
1469 /* root_fd = */ -EBADF);
b71a0192
CB
1470 if (r < 0)
1471 return log_error_errno(r, "Failed to enter mount namespace: %m");
1472
b83358b8 1473 rr = do_wipe_fully_visible_api_fs();
b71a0192 1474
d2881ef9
YW
1475 r = namespace_enter(/* pidns_fd = */ -EBADF,
1476 orig_mntns_fd,
1477 /* netns_fd = */ -EBADF,
1478 /* userns_fd = */ -EBADF,
1479 /* root_fd = */ -EBADF);
b71a0192
CB
1480 if (r < 0)
1481 return log_error_errno(r, "Failed to enter original mount namespace: %m");
1482
1483 return rr;
1484}