git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-mount.c
Merge pull request #3762 from poettering/sigkill-log
[thirdparty/systemd.git] / src / nspawn / nspawn-mount.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2015 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <sys/mount.h>
21 #include <linux/magic.h>
22
23 #include "alloc-util.h"
24 #include "cgroup-util.h"
25 #include "escape.h"
26 #include "fs-util.h"
27 #include "label.h"
28 #include "mkdir.h"
29 #include "mount-util.h"
30 #include "nspawn-mount.h"
31 #include "parse-util.h"
32 #include "path-util.h"
33 #include "rm-rf.h"
34 #include "set.h"
35 #include "stat-util.h"
36 #include "string-util.h"
37 #include "strv.h"
38 #include "user-util.h"
39 #include "util.h"
40
41 CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
42 CustomMount *c, *ret;
43
44 assert(l);
45 assert(n);
46 assert(t >= 0);
47 assert(t < _CUSTOM_MOUNT_TYPE_MAX);
48
49 c = realloc(*l, (*n + 1) * sizeof(CustomMount));
50 if (!c)
51 return NULL;
52
53 *l = c;
54 ret = *l + *n;
55 (*n)++;
56
57 *ret = (CustomMount) { .type = t };
58
59 return ret;
60 }
61
62 void custom_mount_free_all(CustomMount *l, unsigned n) {
63 unsigned i;
64
65 for (i = 0; i < n; i++) {
66 CustomMount *m = l + i;
67
68 free(m->source);
69 free(m->destination);
70 free(m->options);
71
72 if (m->work_dir) {
73 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
74 free(m->work_dir);
75 }
76
77 strv_free(m->lower);
78 }
79
80 free(l);
81 }
82
83 int custom_mount_compare(const void *a, const void *b) {
84 const CustomMount *x = a, *y = b;
85 int r;
86
87 r = path_compare(x->destination, y->destination);
88 if (r != 0)
89 return r;
90
91 if (x->type < y->type)
92 return -1;
93 if (x->type > y->type)
94 return 1;
95
96 return 0;
97 }
98
99 int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
100 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
101 const char *p = s;
102 CustomMount *m;
103 int r;
104
105 assert(l);
106 assert(n);
107
108 r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
109 if (r < 0)
110 return r;
111 if (r == 0)
112 return -EINVAL;
113
114 if (r == 1) {
115 destination = strdup(source);
116 if (!destination)
117 return -ENOMEM;
118 }
119
120 if (r == 2 && !isempty(p)) {
121 opts = strdup(p);
122 if (!opts)
123 return -ENOMEM;
124 }
125
126 if (!path_is_absolute(source))
127 return -EINVAL;
128
129 if (!path_is_absolute(destination))
130 return -EINVAL;
131
132 m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
133 if (!m)
134 return log_oom();
135
136 m->source = source;
137 m->destination = destination;
138 m->read_only = read_only;
139 m->options = opts;
140
141 source = destination = opts = NULL;
142 return 0;
143 }
144
145 int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
146 _cleanup_free_ char *path = NULL, *opts = NULL;
147 const char *p = s;
148 CustomMount *m;
149 int r;
150
151 assert(l);
152 assert(n);
153 assert(s);
154
155 r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
156 if (r < 0)
157 return r;
158 if (r == 0)
159 return -EINVAL;
160
161 if (isempty(p))
162 opts = strdup("mode=0755");
163 else
164 opts = strdup(p);
165 if (!opts)
166 return -ENOMEM;
167
168 if (!path_is_absolute(path))
169 return -EINVAL;
170
171 m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
172 if (!m)
173 return -ENOMEM;
174
175 m->destination = path;
176 m->options = opts;
177
178 path = opts = NULL;
179 return 0;
180 }
181
182 static int tmpfs_patch_options(
183 const char *options,
184 bool userns, uid_t uid_shift, uid_t uid_range,
185 const char *selinux_apifs_context,
186 char **ret) {
187
188 char *buf = NULL;
189
190 if (userns && uid_shift != 0) {
191 assert(uid_shift != UID_INVALID);
192
193 if (options)
194 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, uid_shift, uid_shift);
195 else
196 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, uid_shift, uid_shift);
197 if (!buf)
198 return -ENOMEM;
199
200 options = buf;
201 }
202
203 #ifdef HAVE_SELINUX
204 if (selinux_apifs_context) {
205 char *t;
206
207 if (options)
208 t = strjoin(options, ",context=\"", selinux_apifs_context, "\"", NULL);
209 else
210 t = strjoin("context=\"", selinux_apifs_context, "\"", NULL);
211 if (!t) {
212 free(buf);
213 return -ENOMEM;
214 }
215
216 free(buf);
217 buf = t;
218 }
219 #endif
220
221 *ret = buf;
222 return !!buf;
223 }
224
225 int mount_sysfs(const char *dest) {
226 const char *full, *top, *x;
227 int r;
228
229 top = prefix_roota(dest, "/sys");
230 r = path_check_fstype(top, SYSFS_MAGIC);
231 if (r < 0)
232 return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
233 /* /sys might already be mounted as sysfs by the outer child in the
234 * !netns case. In this case, it's all good. Don't touch it because we
235 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
236 */
237 if (r > 0)
238 return 0;
239
240 full = prefix_roota(top, "/full");
241
242 (void) mkdir(full, 0755);
243
244 if (mount("sysfs", full, "sysfs", MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
245 return log_error_errno(errno, "Failed to mount sysfs to %s: %m", full);
246
247 FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
248 _cleanup_free_ char *from = NULL, *to = NULL;
249
250 from = prefix_root(full, x);
251 if (!from)
252 return log_oom();
253
254 to = prefix_root(top, x);
255 if (!to)
256 return log_oom();
257
258 (void) mkdir(to, 0755);
259
260 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
261 return log_error_errno(errno, "Failed to mount /sys/%s into place: %m", x);
262
263 if (mount(NULL, to, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
264 return log_error_errno(errno, "Failed to mount /sys/%s read-only: %m", x);
265 }
266
267 if (umount(full) < 0)
268 return log_error_errno(errno, "Failed to unmount %s: %m", full);
269
270 if (rmdir(full) < 0)
271 return log_error_errno(errno, "Failed to remove %s: %m", full);
272
273 x = prefix_roota(top, "/fs/kdbus");
274 (void) mkdir(x, 0755);
275
276 if (mount(NULL, top, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
277 return log_error_errno(errno, "Failed to make %s read-only: %m", top);
278
279 return 0;
280 }
281
282 int mount_all(const char *dest,
283 bool use_userns, bool in_userns,
284 bool use_netns,
285 uid_t uid_shift, uid_t uid_range,
286 const char *selinux_apifs_context) {
287
288 typedef struct MountPoint {
289 const char *what;
290 const char *where;
291 const char *type;
292 const char *options;
293 unsigned long flags;
294 bool fatal;
295 bool in_userns;
296 bool use_netns;
297 } MountPoint;
298
299 static const MountPoint mount_table[] = {
300 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false },
301 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first ...*/
302 { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, true, true, true }, /* (except for this) */
303 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* ... then, make it r/o */
304 { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true },
305 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false },
306 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false },
307 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
308 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
309 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false, false },
310 #ifdef HAVE_SELINUX
311 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */
312 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */
313 #endif
314 };
315
316 unsigned k;
317 int r;
318
319 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
320 _cleanup_free_ char *where = NULL, *options = NULL;
321 const char *o;
322
323 if (in_userns != mount_table[k].in_userns)
324 continue;
325
326 if (!use_netns && mount_table[k].use_netns)
327 continue;
328
329 where = prefix_root(dest, mount_table[k].where);
330 if (!where)
331 return log_oom();
332
333 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
334 if (r < 0 && r != -ENOENT)
335 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
336
337 /* Skip this entry if it is not a remount. */
338 if (mount_table[k].what && r > 0)
339 continue;
340
341 r = mkdir_p(where, 0755);
342 if (r < 0) {
343 if (mount_table[k].fatal)
344 return log_error_errno(r, "Failed to create directory %s: %m", where);
345
346 log_debug_errno(r, "Failed to create directory %s: %m", where);
347 continue;
348 }
349
350 o = mount_table[k].options;
351 if (streq_ptr(mount_table[k].type, "tmpfs")) {
352 r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, selinux_apifs_context, &options);
353 if (r < 0)
354 return log_oom();
355 if (r > 0)
356 o = options;
357 }
358
359 if (mount(mount_table[k].what,
360 where,
361 mount_table[k].type,
362 mount_table[k].flags,
363 o) < 0) {
364
365 if (mount_table[k].fatal)
366 return log_error_errno(errno, "mount(%s) failed: %m", where);
367
368 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
369 }
370 }
371
372 return 0;
373 }
374
375 static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
376 const char *p = options;
377 unsigned long flags = *mount_flags;
378 char *opts = NULL;
379
380 assert(options);
381
382 for (;;) {
383 _cleanup_free_ char *word = NULL;
384 int r = extract_first_word(&p, &word, ",", 0);
385 if (r < 0)
386 return log_error_errno(r, "Failed to extract mount option: %m");
387 if (r == 0)
388 break;
389
390 if (streq(word, "rbind"))
391 flags |= MS_REC;
392 else if (streq(word, "norbind"))
393 flags &= ~MS_REC;
394 else {
395 log_error("Invalid bind mount option: %s", word);
396 return -EINVAL;
397 }
398 }
399
400 *mount_flags = flags;
401 /* in the future mount_opts will hold string options for mount(2) */
402 *mount_opts = opts;
403
404 return 0;
405 }
406
407 static int mount_bind(const char *dest, CustomMount *m) {
408 struct stat source_st, dest_st;
409 const char *where;
410 unsigned long mount_flags = MS_BIND | MS_REC;
411 _cleanup_free_ char *mount_opts = NULL;
412 int r;
413
414 assert(m);
415
416 if (m->options) {
417 r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
418 if (r < 0)
419 return r;
420 }
421
422 if (stat(m->source, &source_st) < 0)
423 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
424
425 where = prefix_roota(dest, m->destination);
426
427 if (stat(where, &dest_st) >= 0) {
428 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
429 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
430 return -EINVAL;
431 }
432
433 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
434 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
435 return -EINVAL;
436 }
437
438 } else if (errno == ENOENT) {
439 r = mkdir_parents_label(where, 0755);
440 if (r < 0)
441 return log_error_errno(r, "Failed to make parents of %s: %m", where);
442
443 /* Create the mount point. Any non-directory file can be
444 * mounted on any non-directory file (regular, fifo, socket,
445 * char, block).
446 */
447 if (S_ISDIR(source_st.st_mode))
448 r = mkdir_label(where, 0755);
449 else
450 r = touch(where);
451 if (r < 0)
452 return log_error_errno(r, "Failed to create mount point %s: %m", where);
453
454 } else {
455 return log_error_errno(errno, "Failed to stat %s: %m", where);
456 }
457
458 if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
459 return log_error_errno(errno, "mount(%s) failed: %m", where);
460
461 if (m->read_only) {
462 r = bind_remount_recursive(where, true);
463 if (r < 0)
464 return log_error_errno(r, "Read-only bind mount failed: %m");
465 }
466
467 return 0;
468 }
469
470 static int mount_tmpfs(
471 const char *dest,
472 CustomMount *m,
473 bool userns, uid_t uid_shift, uid_t uid_range,
474 const char *selinux_apifs_context) {
475
476 const char *where, *options;
477 _cleanup_free_ char *buf = NULL;
478 int r;
479
480 assert(dest);
481 assert(m);
482
483 where = prefix_roota(dest, m->destination);
484
485 r = mkdir_p_label(where, 0755);
486 if (r < 0 && r != -EEXIST)
487 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
488
489 r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
490 if (r < 0)
491 return log_oom();
492 options = r > 0 ? buf : m->options;
493
494 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
495 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
496
497 return 0;
498 }
499
500 static char *joined_and_escaped_lower_dirs(char * const *lower) {
501 _cleanup_strv_free_ char **sv = NULL;
502
503 sv = strv_copy(lower);
504 if (!sv)
505 return NULL;
506
507 strv_reverse(sv);
508
509 if (!strv_shell_escape(sv, ",:"))
510 return NULL;
511
512 return strv_join(sv, ":");
513 }
514
515 static int mount_overlay(const char *dest, CustomMount *m) {
516 _cleanup_free_ char *lower = NULL;
517 const char *where, *options;
518 int r;
519
520 assert(dest);
521 assert(m);
522
523 where = prefix_roota(dest, m->destination);
524
525 r = mkdir_label(where, 0755);
526 if (r < 0 && r != -EEXIST)
527 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
528
529 (void) mkdir_p_label(m->source, 0755);
530
531 lower = joined_and_escaped_lower_dirs(m->lower);
532 if (!lower)
533 return log_oom();
534
535 if (m->read_only) {
536 _cleanup_free_ char *escaped_source = NULL;
537
538 escaped_source = shell_escape(m->source, ",:");
539 if (!escaped_source)
540 return log_oom();
541
542 options = strjoina("lowerdir=", escaped_source, ":", lower);
543 } else {
544 _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
545
546 assert(m->work_dir);
547 (void) mkdir_label(m->work_dir, 0700);
548
549 escaped_source = shell_escape(m->source, ",:");
550 if (!escaped_source)
551 return log_oom();
552 escaped_work_dir = shell_escape(m->work_dir, ",:");
553 if (!escaped_work_dir)
554 return log_oom();
555
556 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
557 }
558
559 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
560 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
561
562 return 0;
563 }
564
565 int mount_custom(
566 const char *dest,
567 CustomMount *mounts, unsigned n,
568 bool userns, uid_t uid_shift, uid_t uid_range,
569 const char *selinux_apifs_context) {
570
571 unsigned i;
572 int r;
573
574 assert(dest);
575
576 for (i = 0; i < n; i++) {
577 CustomMount *m = mounts + i;
578
579 switch (m->type) {
580
581 case CUSTOM_MOUNT_BIND:
582 r = mount_bind(dest, m);
583 break;
584
585 case CUSTOM_MOUNT_TMPFS:
586 r = mount_tmpfs(dest, m, userns, uid_shift, uid_range, selinux_apifs_context);
587 break;
588
589 case CUSTOM_MOUNT_OVERLAY:
590 r = mount_overlay(dest, m);
591 break;
592
593 default:
594 assert_not_reached("Unknown custom mount type");
595 }
596
597 if (r < 0)
598 return r;
599 }
600
601 return 0;
602 }
603
604 static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
605 char *to;
606 int r;
607
608 to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
609
610 r = path_is_mount_point(to, 0);
611 if (r < 0 && r != -ENOENT)
612 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
613 if (r > 0)
614 return 0;
615
616 mkdir_p(to, 0755);
617
618 /* The superblock mount options of the mount point need to be
619 * identical to the hosts', and hence writable... */
620 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
621 return log_error_errno(errno, "Failed to mount to %s: %m", to);
622
623 /* ... hence let's only make the bind mount read-only, not the
624 * superblock. */
625 if (read_only) {
626 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
627 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
628 }
629 return 1;
630 }
631
632 static int mount_legacy_cgroups(
633 const char *dest,
634 bool userns, uid_t uid_shift, uid_t uid_range,
635 const char *selinux_apifs_context) {
636
637 _cleanup_set_free_free_ Set *controllers = NULL;
638 const char *cgroup_root;
639 int r;
640
641 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
642
643 (void) mkdir_p(cgroup_root, 0755);
644
645 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
646 r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
647 if (r < 0)
648 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
649 if (r == 0) {
650 _cleanup_free_ char *options = NULL;
651
652 r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, selinux_apifs_context, &options);
653 if (r < 0)
654 return log_oom();
655
656 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
657 return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
658 }
659
660 if (cg_unified() > 0)
661 goto skip_controllers;
662
663 controllers = set_new(&string_hash_ops);
664 if (!controllers)
665 return log_oom();
666
667 r = cg_kernel_controllers(controllers);
668 if (r < 0)
669 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
670
671 for (;;) {
672 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
673
674 controller = set_steal_first(controllers);
675 if (!controller)
676 break;
677
678 origin = prefix_root("/sys/fs/cgroup/", controller);
679 if (!origin)
680 return log_oom();
681
682 r = readlink_malloc(origin, &combined);
683 if (r == -EINVAL) {
684 /* Not a symbolic link, but directly a single cgroup hierarchy */
685
686 r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
687 if (r < 0)
688 return r;
689
690 } else if (r < 0)
691 return log_error_errno(r, "Failed to read link %s: %m", origin);
692 else {
693 _cleanup_free_ char *target = NULL;
694
695 target = prefix_root(dest, origin);
696 if (!target)
697 return log_oom();
698
699 /* A symbolic link, a combination of controllers in one hierarchy */
700
701 if (!filename_is_valid(combined)) {
702 log_warning("Ignoring invalid combined hierarchy %s.", combined);
703 continue;
704 }
705
706 r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
707 if (r < 0)
708 return r;
709
710 r = symlink_idempotent(combined, target);
711 if (r == -EINVAL) {
712 log_error("Invalid existing symlink for combined hierarchy");
713 return r;
714 }
715 if (r < 0)
716 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
717 }
718 }
719
720 skip_controllers:
721 r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
722 if (r < 0)
723 return r;
724
725 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
726 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
727
728 return 0;
729 }
730
731 static int mount_unified_cgroups(const char *dest) {
732 const char *p;
733 int r;
734
735 assert(dest);
736
737 p = prefix_roota(dest, "/sys/fs/cgroup");
738
739 (void) mkdir_p(p, 0755);
740
741 r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
742 if (r < 0)
743 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
744 if (r > 0) {
745 p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs");
746 if (access(p, F_OK) >= 0)
747 return 0;
748 if (errno != ENOENT)
749 return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
750
751 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
752 return -EINVAL;
753 }
754
755 if (mount("cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
756 return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
757
758 return 0;
759 }
760
/* Mounts the cgroup file systems below dest: the unified hierarchy if unified_requested is set, the
 * legacy hierarchies otherwise. Returns whatever the selected helper returns. */
int mount_cgroups(
                const char *dest,
                bool unified_requested,
                bool userns, uid_t uid_shift, uid_t uid_range,
                const char *selinux_apifs_context) {

        if (!unified_requested)
                return mount_legacy_cgroups(dest, userns, uid_shift, uid_range, selinux_apifs_context);

        return mount_unified_cgroups(dest);
}
772
773 int mount_systemd_cgroup_writable(
774 const char *dest,
775 bool unified_requested) {
776
777 _cleanup_free_ char *own_cgroup_path = NULL;
778 const char *systemd_root, *systemd_own;
779 int r;
780
781 assert(dest);
782
783 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
784 if (r < 0)
785 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
786
787 /* If we are living in the top-level, then there's nothing to do... */
788 if (path_equal(own_cgroup_path, "/"))
789 return 0;
790
791 if (unified_requested) {
792 systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
793 systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
794 } else {
795 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
796 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
797 }
798
799 /* Make our own cgroup a (writable) bind mount */
800 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
801 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
802
803 /* And then remount the systemd cgroup root read-only */
804 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
805 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
806
807 return 0;
808 }
809
810 int setup_volatile_state(
811 const char *directory,
812 VolatileMode mode,
813 bool userns, uid_t uid_shift, uid_t uid_range,
814 const char *selinux_apifs_context) {
815
816 _cleanup_free_ char *buf = NULL;
817 const char *p, *options;
818 int r;
819
820 assert(directory);
821
822 if (mode != VOLATILE_STATE)
823 return 0;
824
825 /* --volatile=state means we simply overmount /var
826 with a tmpfs, and the rest read-only. */
827
828 r = bind_remount_recursive(directory, true);
829 if (r < 0)
830 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
831
832 p = prefix_roota(directory, "/var");
833 r = mkdir(p, 0755);
834 if (r < 0 && errno != EEXIST)
835 return log_error_errno(errno, "Failed to create %s: %m", directory);
836
837 options = "mode=755";
838 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
839 if (r < 0)
840 return log_oom();
841 if (r > 0)
842 options = buf;
843
844 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
845 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
846
847 return 0;
848 }
849
850 int setup_volatile(
851 const char *directory,
852 VolatileMode mode,
853 bool userns, uid_t uid_shift, uid_t uid_range,
854 const char *selinux_apifs_context) {
855
856 bool tmpfs_mounted = false, bind_mounted = false;
857 char template[] = "/tmp/nspawn-volatile-XXXXXX";
858 _cleanup_free_ char *buf = NULL;
859 const char *f, *t, *options;
860 int r;
861
862 assert(directory);
863
864 if (mode != VOLATILE_YES)
865 return 0;
866
867 /* --volatile=yes means we mount a tmpfs to the root dir, and
868 the original /usr to use inside it, and that read-only. */
869
870 if (!mkdtemp(template))
871 return log_error_errno(errno, "Failed to create temporary directory: %m");
872
873 options = "mode=755";
874 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
875 if (r < 0)
876 return log_oom();
877 if (r > 0)
878 options = buf;
879
880 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
881 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
882 goto fail;
883 }
884
885 tmpfs_mounted = true;
886
887 f = prefix_roota(directory, "/usr");
888 t = prefix_roota(template, "/usr");
889
890 r = mkdir(t, 0755);
891 if (r < 0 && errno != EEXIST) {
892 r = log_error_errno(errno, "Failed to create %s: %m", t);
893 goto fail;
894 }
895
896 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
897 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
898 goto fail;
899 }
900
901 bind_mounted = true;
902
903 r = bind_remount_recursive(t, true);
904 if (r < 0) {
905 log_error_errno(r, "Failed to remount %s read-only: %m", t);
906 goto fail;
907 }
908
909 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
910 r = log_error_errno(errno, "Failed to move root mount: %m");
911 goto fail;
912 }
913
914 (void) rmdir(template);
915
916 return 0;
917
918 fail:
919 if (bind_mounted)
920 (void) umount(t);
921
922 if (tmpfs_mounted)
923 (void) umount(template);
924 (void) rmdir(template);
925 return r;
926 }
927
928 VolatileMode volatile_mode_from_string(const char *s) {
929 int b;
930
931 if (isempty(s))
932 return _VOLATILE_MODE_INVALID;
933
934 b = parse_boolean(s);
935 if (b > 0)
936 return VOLATILE_YES;
937 if (b == 0)
938 return VOLATILE_NO;
939
940 if (streq(s, "state"))
941 return VOLATILE_STATE;
942
943 return _VOLATILE_MODE_INVALID;
944 }