]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-mount.c
Merge pull request #3757 from poettering/efi-search
[thirdparty/systemd.git] / src / nspawn / nspawn-mount.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2015 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <sys/mount.h>
21 #include <linux/magic.h>
22
23 #include "alloc-util.h"
24 #include "cgroup-util.h"
25 #include "escape.h"
26 #include "fd-util.h"
27 #include "fileio.h"
28 #include "fs-util.h"
29 #include "label.h"
30 #include "mkdir.h"
31 #include "mount-util.h"
32 #include "nspawn-mount.h"
33 #include "parse-util.h"
34 #include "path-util.h"
35 #include "rm-rf.h"
36 #include "set.h"
37 #include "stat-util.h"
38 #include "string-util.h"
39 #include "strv.h"
40 #include "user-util.h"
41 #include "util.h"
42
43 CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
44 CustomMount *c, *ret;
45
46 assert(l);
47 assert(n);
48 assert(t >= 0);
49 assert(t < _CUSTOM_MOUNT_TYPE_MAX);
50
51 c = realloc(*l, (*n + 1) * sizeof(CustomMount));
52 if (!c)
53 return NULL;
54
55 *l = c;
56 ret = *l + *n;
57 (*n)++;
58
59 *ret = (CustomMount) { .type = t };
60
61 return ret;
62 }
63
64 void custom_mount_free_all(CustomMount *l, unsigned n) {
65 unsigned i;
66
67 for (i = 0; i < n; i++) {
68 CustomMount *m = l + i;
69
70 free(m->source);
71 free(m->destination);
72 free(m->options);
73
74 if (m->work_dir) {
75 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
76 free(m->work_dir);
77 }
78
79 strv_free(m->lower);
80 }
81
82 free(l);
83 }
84
85 int custom_mount_compare(const void *a, const void *b) {
86 const CustomMount *x = a, *y = b;
87 int r;
88
89 r = path_compare(x->destination, y->destination);
90 if (r != 0)
91 return r;
92
93 if (x->type < y->type)
94 return -1;
95 if (x->type > y->type)
96 return 1;
97
98 return 0;
99 }
100
101 int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
102 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
103 const char *p = s;
104 CustomMount *m;
105 int r;
106
107 assert(l);
108 assert(n);
109
110 r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
111 if (r < 0)
112 return r;
113 if (r == 0)
114 return -EINVAL;
115
116 if (r == 1) {
117 destination = strdup(source);
118 if (!destination)
119 return -ENOMEM;
120 }
121
122 if (r == 2 && !isempty(p)) {
123 opts = strdup(p);
124 if (!opts)
125 return -ENOMEM;
126 }
127
128 if (!path_is_absolute(source))
129 return -EINVAL;
130
131 if (!path_is_absolute(destination))
132 return -EINVAL;
133
134 m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
135 if (!m)
136 return log_oom();
137
138 m->source = source;
139 m->destination = destination;
140 m->read_only = read_only;
141 m->options = opts;
142
143 source = destination = opts = NULL;
144 return 0;
145 }
146
147 int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
148 _cleanup_free_ char *path = NULL, *opts = NULL;
149 const char *p = s;
150 CustomMount *m;
151 int r;
152
153 assert(l);
154 assert(n);
155 assert(s);
156
157 r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
158 if (r < 0)
159 return r;
160 if (r == 0)
161 return -EINVAL;
162
163 if (isempty(p))
164 opts = strdup("mode=0755");
165 else
166 opts = strdup(p);
167 if (!opts)
168 return -ENOMEM;
169
170 if (!path_is_absolute(path))
171 return -EINVAL;
172
173 m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
174 if (!m)
175 return -ENOMEM;
176
177 m->destination = path;
178 m->options = opts;
179
180 path = opts = NULL;
181 return 0;
182 }
183
184 static int tmpfs_patch_options(
185 const char *options,
186 bool userns,
187 uid_t uid_shift, uid_t uid_range,
188 bool patch_ids,
189 const char *selinux_apifs_context,
190 char **ret) {
191
192 char *buf = NULL;
193
194 if ((userns && uid_shift != 0) || patch_ids) {
195 assert(uid_shift != UID_INVALID);
196
197 if (options)
198 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, uid_shift, uid_shift);
199 else
200 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, uid_shift, uid_shift);
201 if (!buf)
202 return -ENOMEM;
203
204 options = buf;
205 }
206
207 #ifdef HAVE_SELINUX
208 if (selinux_apifs_context) {
209 char *t;
210
211 if (options)
212 t = strjoin(options, ",context=\"", selinux_apifs_context, "\"", NULL);
213 else
214 t = strjoin("context=\"", selinux_apifs_context, "\"", NULL);
215 if (!t) {
216 free(buf);
217 return -ENOMEM;
218 }
219
220 free(buf);
221 buf = t;
222 }
223 #endif
224
225 if (!buf && options) {
226 buf = strdup(options);
227 if (!buf)
228 return -ENOMEM;
229 }
230 *ret = buf;
231
232 return !!buf;
233 }
234
235 int mount_sysfs(const char *dest) {
236 const char *full, *top, *x;
237 int r;
238
239 top = prefix_roota(dest, "/sys");
240 r = path_check_fstype(top, SYSFS_MAGIC);
241 if (r < 0)
242 return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
243 /* /sys might already be mounted as sysfs by the outer child in the
244 * !netns case. In this case, it's all good. Don't touch it because we
245 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
246 */
247 if (r > 0)
248 return 0;
249
250 full = prefix_roota(top, "/full");
251
252 (void) mkdir(full, 0755);
253
254 if (mount("sysfs", full, "sysfs", MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
255 return log_error_errno(errno, "Failed to mount sysfs to %s: %m", full);
256
257 FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
258 _cleanup_free_ char *from = NULL, *to = NULL;
259
260 from = prefix_root(full, x);
261 if (!from)
262 return log_oom();
263
264 to = prefix_root(top, x);
265 if (!to)
266 return log_oom();
267
268 (void) mkdir(to, 0755);
269
270 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
271 return log_error_errno(errno, "Failed to mount /sys/%s into place: %m", x);
272
273 if (mount(NULL, to, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
274 return log_error_errno(errno, "Failed to mount /sys/%s read-only: %m", x);
275 }
276
277 if (umount(full) < 0)
278 return log_error_errno(errno, "Failed to unmount %s: %m", full);
279
280 if (rmdir(full) < 0)
281 return log_error_errno(errno, "Failed to remove %s: %m", full);
282
283 x = prefix_roota(top, "/fs/kdbus");
284 (void) mkdir_p(x, 0755);
285
286 /* Create mountpoint for cgroups. Otherwise we are not allowed since we
287 * remount /sys read-only.
288 */
289 if (cg_ns_supported()) {
290 x = prefix_roota(top, "/fs/cgroup");
291 (void) mkdir_p(x, 0755);
292 }
293
294 if (mount(NULL, top, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
295 return log_error_errno(errno, "Failed to make %s read-only: %m", top);
296
297 return 0;
298 }
299
300 int mount_all(const char *dest,
301 bool use_userns, bool in_userns,
302 bool use_netns,
303 uid_t uid_shift, uid_t uid_range,
304 const char *selinux_apifs_context) {
305
306 typedef struct MountPoint {
307 const char *what;
308 const char *where;
309 const char *type;
310 const char *options;
311 unsigned long flags;
312 bool fatal;
313 bool in_userns;
314 bool use_netns;
315 } MountPoint;
316
317 static const MountPoint mount_table[] = {
318 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false },
319 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first ...*/
320 { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, true, true, true }, /* (except for this) */
321 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* ... then, make it r/o */
322 { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true },
323 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false },
324 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false },
325 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
326 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
327 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false, false },
328 #ifdef HAVE_SELINUX
329 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */
330 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */
331 #endif
332 };
333
334 unsigned k;
335 int r;
336
337 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
338 _cleanup_free_ char *where = NULL, *options = NULL;
339 const char *o;
340
341 if (in_userns != mount_table[k].in_userns)
342 continue;
343
344 if (!use_netns && mount_table[k].use_netns)
345 continue;
346
347 where = prefix_root(dest, mount_table[k].where);
348 if (!where)
349 return log_oom();
350
351 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
352 if (r < 0 && r != -ENOENT)
353 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
354
355 /* Skip this entry if it is not a remount. */
356 if (mount_table[k].what && r > 0)
357 continue;
358
359 r = mkdir_p(where, 0755);
360 if (r < 0) {
361 if (mount_table[k].fatal)
362 return log_error_errno(r, "Failed to create directory %s: %m", where);
363
364 log_debug_errno(r, "Failed to create directory %s: %m", where);
365 continue;
366 }
367
368 o = mount_table[k].options;
369 if (streq_ptr(mount_table[k].type, "tmpfs")) {
370 r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
371 if (r < 0)
372 return log_oom();
373 if (r > 0)
374 o = options;
375 }
376
377 if (mount(mount_table[k].what,
378 where,
379 mount_table[k].type,
380 mount_table[k].flags,
381 o) < 0) {
382
383 if (mount_table[k].fatal)
384 return log_error_errno(errno, "mount(%s) failed: %m", where);
385
386 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
387 }
388 }
389
390 return 0;
391 }
392
393 static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
394 const char *p = options;
395 unsigned long flags = *mount_flags;
396 char *opts = NULL;
397
398 assert(options);
399
400 for (;;) {
401 _cleanup_free_ char *word = NULL;
402 int r = extract_first_word(&p, &word, ",", 0);
403 if (r < 0)
404 return log_error_errno(r, "Failed to extract mount option: %m");
405 if (r == 0)
406 break;
407
408 if (streq(word, "rbind"))
409 flags |= MS_REC;
410 else if (streq(word, "norbind"))
411 flags &= ~MS_REC;
412 else {
413 log_error("Invalid bind mount option: %s", word);
414 return -EINVAL;
415 }
416 }
417
418 *mount_flags = flags;
419 /* in the future mount_opts will hold string options for mount(2) */
420 *mount_opts = opts;
421
422 return 0;
423 }
424
425 static int mount_bind(const char *dest, CustomMount *m) {
426 struct stat source_st, dest_st;
427 const char *where;
428 unsigned long mount_flags = MS_BIND | MS_REC;
429 _cleanup_free_ char *mount_opts = NULL;
430 int r;
431
432 assert(m);
433
434 if (m->options) {
435 r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
436 if (r < 0)
437 return r;
438 }
439
440 if (stat(m->source, &source_st) < 0)
441 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
442
443 where = prefix_roota(dest, m->destination);
444
445 if (stat(where, &dest_st) >= 0) {
446 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
447 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
448 return -EINVAL;
449 }
450
451 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
452 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
453 return -EINVAL;
454 }
455
456 } else if (errno == ENOENT) {
457 r = mkdir_parents_label(where, 0755);
458 if (r < 0)
459 return log_error_errno(r, "Failed to make parents of %s: %m", where);
460
461 /* Create the mount point. Any non-directory file can be
462 * mounted on any non-directory file (regular, fifo, socket,
463 * char, block).
464 */
465 if (S_ISDIR(source_st.st_mode))
466 r = mkdir_label(where, 0755);
467 else
468 r = touch(where);
469 if (r < 0)
470 return log_error_errno(r, "Failed to create mount point %s: %m", where);
471
472 } else {
473 return log_error_errno(errno, "Failed to stat %s: %m", where);
474 }
475
476 if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
477 return log_error_errno(errno, "mount(%s) failed: %m", where);
478
479 if (m->read_only) {
480 r = bind_remount_recursive(where, true);
481 if (r < 0)
482 return log_error_errno(r, "Read-only bind mount failed: %m");
483 }
484
485 return 0;
486 }
487
488 static int mount_tmpfs(
489 const char *dest,
490 CustomMount *m,
491 bool userns, uid_t uid_shift, uid_t uid_range,
492 const char *selinux_apifs_context) {
493
494 const char *where, *options;
495 _cleanup_free_ char *buf = NULL;
496 int r;
497
498 assert(dest);
499 assert(m);
500
501 where = prefix_roota(dest, m->destination);
502
503 r = mkdir_p_label(where, 0755);
504 if (r < 0 && r != -EEXIST)
505 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
506
507 r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
508 if (r < 0)
509 return log_oom();
510 options = r > 0 ? buf : m->options;
511
512 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
513 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
514
515 return 0;
516 }
517
518 static char *joined_and_escaped_lower_dirs(char * const *lower) {
519 _cleanup_strv_free_ char **sv = NULL;
520
521 sv = strv_copy(lower);
522 if (!sv)
523 return NULL;
524
525 strv_reverse(sv);
526
527 if (!strv_shell_escape(sv, ",:"))
528 return NULL;
529
530 return strv_join(sv, ":");
531 }
532
533 static int mount_overlay(const char *dest, CustomMount *m) {
534 _cleanup_free_ char *lower = NULL;
535 const char *where, *options;
536 int r;
537
538 assert(dest);
539 assert(m);
540
541 where = prefix_roota(dest, m->destination);
542
543 r = mkdir_label(where, 0755);
544 if (r < 0 && r != -EEXIST)
545 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
546
547 (void) mkdir_p_label(m->source, 0755);
548
549 lower = joined_and_escaped_lower_dirs(m->lower);
550 if (!lower)
551 return log_oom();
552
553 if (m->read_only) {
554 _cleanup_free_ char *escaped_source = NULL;
555
556 escaped_source = shell_escape(m->source, ",:");
557 if (!escaped_source)
558 return log_oom();
559
560 options = strjoina("lowerdir=", escaped_source, ":", lower);
561 } else {
562 _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
563
564 assert(m->work_dir);
565 (void) mkdir_label(m->work_dir, 0700);
566
567 escaped_source = shell_escape(m->source, ",:");
568 if (!escaped_source)
569 return log_oom();
570 escaped_work_dir = shell_escape(m->work_dir, ",:");
571 if (!escaped_work_dir)
572 return log_oom();
573
574 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
575 }
576
577 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
578 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
579
580 return 0;
581 }
582
583 int mount_custom(
584 const char *dest,
585 CustomMount *mounts, unsigned n,
586 bool userns, uid_t uid_shift, uid_t uid_range,
587 const char *selinux_apifs_context) {
588
589 unsigned i;
590 int r;
591
592 assert(dest);
593
594 for (i = 0; i < n; i++) {
595 CustomMount *m = mounts + i;
596
597 switch (m->type) {
598
599 case CUSTOM_MOUNT_BIND:
600 r = mount_bind(dest, m);
601 break;
602
603 case CUSTOM_MOUNT_TMPFS:
604 r = mount_tmpfs(dest, m, userns, uid_shift, uid_range, selinux_apifs_context);
605 break;
606
607 case CUSTOM_MOUNT_OVERLAY:
608 r = mount_overlay(dest, m);
609 break;
610
611 default:
612 assert_not_reached("Unknown custom mount type");
613 }
614
615 if (r < 0)
616 return r;
617 }
618
619 return 0;
620 }
621
622 /* Retrieve existing subsystems. This function is called in a new cgroup
623 * namespace.
624 */
625 static int get_controllers(Set *subsystems) {
626 _cleanup_fclose_ FILE *f = NULL;
627 char line[LINE_MAX];
628
629 assert(subsystems);
630
631 f = fopen("/proc/self/cgroup", "re");
632 if (!f)
633 return errno == ENOENT ? -ESRCH : -errno;
634
635 FOREACH_LINE(line, f, return -errno) {
636 int r;
637 char *e, *l, *p;
638
639 truncate_nl(line);
640
641 l = strchr(line, ':');
642 if (!l)
643 continue;
644
645 l++;
646 e = strchr(l, ':');
647 if (!e)
648 continue;
649
650 *e = 0;
651
652 if (streq(l, "") || streq(l, "name=systemd"))
653 continue;
654
655 p = strdup(l);
656 r = set_consume(subsystems, p);
657 if (r < 0)
658 return r;
659 }
660
661 return 0;
662 }
663
664 static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
665 char *to;
666 int r;
667
668 to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
669
670 r = path_is_mount_point(to, 0);
671 if (r < 0 && r != -ENOENT)
672 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
673 if (r > 0)
674 return 0;
675
676 mkdir_p(to, 0755);
677
678 /* The superblock mount options of the mount point need to be
679 * identical to the hosts', and hence writable... */
680 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
681 return log_error_errno(errno, "Failed to mount to %s: %m", to);
682
683 /* ... hence let's only make the bind mount read-only, not the
684 * superblock. */
685 if (read_only) {
686 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
687 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
688 }
689 return 1;
690 }
691
692 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
693 static int mount_legacy_cgns_supported(
694 bool userns, uid_t uid_shift, uid_t uid_range,
695 const char *selinux_apifs_context) {
696 _cleanup_set_free_free_ Set *controllers = NULL;
697 const char *cgroup_root = "/sys/fs/cgroup", *c;
698 int r;
699
700 (void) mkdir_p(cgroup_root, 0755);
701
702 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
703 r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
704 if (r < 0)
705 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
706 if (r == 0) {
707 _cleanup_free_ char *options = NULL;
708
709 /* When cgroup namespaces are enabled and user namespaces are
710 * used then the mount of the cgroupfs is done *inside* the new
711 * user namespace. We're root in the new user namespace and the
712 * kernel will happily translate our uid/gid to the correct
713 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
714 * pass uid 0 and not uid_shift to tmpfs_patch_options().
715 */
716 r = tmpfs_patch_options("mode=755", userns, 0, uid_range, true, selinux_apifs_context, &options);
717 if (r < 0)
718 return log_oom();
719
720 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
721 return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
722 }
723
724 if (cg_unified() > 0)
725 goto skip_controllers;
726
727 controllers = set_new(&string_hash_ops);
728 if (!controllers)
729 return log_oom();
730
731 r = get_controllers(controllers);
732 if (r < 0)
733 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
734
735 for (;;) {
736 _cleanup_free_ const char *controller = NULL;
737
738 controller = set_steal_first(controllers);
739 if (!controller)
740 break;
741
742 r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns);
743 if (r < 0)
744 return r;
745
746 /* When multiple hierarchies are co-mounted, make their
747 * constituting individual hierarchies a symlink to the
748 * co-mount.
749 */
750 c = controller;
751 for (;;) {
752 _cleanup_free_ char *target = NULL, *tok = NULL;
753
754 r = extract_first_word(&c, &tok, ",", 0);
755 if (r < 0)
756 return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m");
757 if (r == 0)
758 break;
759
760 target = prefix_root("/sys/fs/cgroup", tok);
761 if (!target)
762 return log_oom();
763
764 if (streq(controller, tok))
765 break;
766
767 r = symlink_idempotent(controller, target);
768 if (r == -EINVAL)
769 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
770 if (r < 0)
771 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
772 }
773 }
774
775 skip_controllers:
776 r = mount_legacy_cgroup_hierarchy("", "none,name=systemd,xattr", "systemd", false);
777 if (r < 0)
778 return r;
779
780 if (!userns) {
781 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
782 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
783 }
784
785 return 0;
786 }
787
788 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
789 static int mount_legacy_cgns_unsupported(
790 const char *dest,
791 bool userns, uid_t uid_shift, uid_t uid_range,
792 const char *selinux_apifs_context) {
793 _cleanup_set_free_free_ Set *controllers = NULL;
794 const char *cgroup_root;
795 int r;
796
797 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
798
799 (void) mkdir_p(cgroup_root, 0755);
800
801 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
802 r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
803 if (r < 0)
804 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
805 if (r == 0) {
806 _cleanup_free_ char *options = NULL;
807
808 r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
809 if (r < 0)
810 return log_oom();
811
812 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
813 return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
814 }
815
816 if (cg_unified() > 0)
817 goto skip_controllers;
818
819 controllers = set_new(&string_hash_ops);
820 if (!controllers)
821 return log_oom();
822
823 r = cg_kernel_controllers(controllers);
824 if (r < 0)
825 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
826
827 for (;;) {
828 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
829
830 controller = set_steal_first(controllers);
831 if (!controller)
832 break;
833
834 origin = prefix_root("/sys/fs/cgroup/", controller);
835 if (!origin)
836 return log_oom();
837
838 r = readlink_malloc(origin, &combined);
839 if (r == -EINVAL) {
840 /* Not a symbolic link, but directly a single cgroup hierarchy */
841
842 r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
843 if (r < 0)
844 return r;
845
846 } else if (r < 0)
847 return log_error_errno(r, "Failed to read link %s: %m", origin);
848 else {
849 _cleanup_free_ char *target = NULL;
850
851 target = prefix_root(dest, origin);
852 if (!target)
853 return log_oom();
854
855 /* A symbolic link, a combination of controllers in one hierarchy */
856
857 if (!filename_is_valid(combined)) {
858 log_warning("Ignoring invalid combined hierarchy %s.", combined);
859 continue;
860 }
861
862 r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
863 if (r < 0)
864 return r;
865
866 r = symlink_idempotent(combined, target);
867 if (r == -EINVAL)
868 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
869 if (r < 0)
870 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
871 }
872 }
873
874 skip_controllers:
875 r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
876 if (r < 0)
877 return r;
878
879 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
880 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
881
882 return 0;
883 }
884
885 static int mount_unified_cgroups(const char *dest) {
886 const char *p;
887 int r;
888
889 assert(dest);
890
891 p = prefix_roota(dest, "/sys/fs/cgroup");
892
893 (void) mkdir_p(p, 0755);
894
895 r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
896 if (r < 0)
897 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
898 if (r > 0) {
899 p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs");
900 if (access(p, F_OK) >= 0)
901 return 0;
902 if (errno != ENOENT)
903 return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
904
905 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
906 return -EINVAL;
907 }
908
909 if (mount("cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
910 return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
911
912 return 0;
913 }
914
/* Mounts the cgroup hierarchies for the container below dest: the unified
 * hierarchy if unified_requested is set, otherwise the legacy hierarchies,
 * using the variant that matches whether cg_ns_supported() reports cgroup
 * namespace support. Returns whatever the selected helper returns. */
int mount_cgroups(
                const char *dest,
                bool unified_requested,
                bool userns, uid_t uid_shift, uid_t uid_range,
                const char *selinux_apifs_context) {

        if (unified_requested)
                return mount_unified_cgroups(dest);

        if (!cg_ns_supported())
                return mount_legacy_cgns_unsupported(dest, userns, uid_shift, uid_range, selinux_apifs_context);

        return mount_legacy_cgns_supported(userns, uid_shift, uid_range, selinux_apifs_context);
}
928
929 int mount_systemd_cgroup_writable(
930 const char *dest,
931 bool unified_requested) {
932
933 _cleanup_free_ char *own_cgroup_path = NULL;
934 const char *systemd_root, *systemd_own;
935 int r;
936
937 assert(dest);
938
939 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
940 if (r < 0)
941 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
942
943 /* If we are living in the top-level, then there's nothing to do... */
944 if (path_equal(own_cgroup_path, "/"))
945 return 0;
946
947 if (unified_requested) {
948 systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
949 systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
950 } else {
951 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
952 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
953 }
954
955 /* Make our own cgroup a (writable) bind mount */
956 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
957 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
958
959 /* And then remount the systemd cgroup root read-only */
960 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
961 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
962
963 return 0;
964 }
965
966 int setup_volatile_state(
967 const char *directory,
968 VolatileMode mode,
969 bool userns, uid_t uid_shift, uid_t uid_range,
970 const char *selinux_apifs_context) {
971
972 _cleanup_free_ char *buf = NULL;
973 const char *p, *options;
974 int r;
975
976 assert(directory);
977
978 if (mode != VOLATILE_STATE)
979 return 0;
980
981 /* --volatile=state means we simply overmount /var
982 with a tmpfs, and the rest read-only. */
983
984 r = bind_remount_recursive(directory, true);
985 if (r < 0)
986 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
987
988 p = prefix_roota(directory, "/var");
989 r = mkdir(p, 0755);
990 if (r < 0 && errno != EEXIST)
991 return log_error_errno(errno, "Failed to create %s: %m", directory);
992
993 options = "mode=755";
994 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
995 if (r < 0)
996 return log_oom();
997 if (r > 0)
998 options = buf;
999
1000 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
1001 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1002
1003 return 0;
1004 }
1005
1006 int setup_volatile(
1007 const char *directory,
1008 VolatileMode mode,
1009 bool userns, uid_t uid_shift, uid_t uid_range,
1010 const char *selinux_apifs_context) {
1011
1012 bool tmpfs_mounted = false, bind_mounted = false;
1013 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1014 _cleanup_free_ char *buf = NULL;
1015 const char *f, *t, *options;
1016 int r;
1017
1018 assert(directory);
1019
1020 if (mode != VOLATILE_YES)
1021 return 0;
1022
1023 /* --volatile=yes means we mount a tmpfs to the root dir, and
1024 the original /usr to use inside it, and that read-only. */
1025
1026 if (!mkdtemp(template))
1027 return log_error_errno(errno, "Failed to create temporary directory: %m");
1028
1029 options = "mode=755";
1030 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
1031 if (r < 0)
1032 return log_oom();
1033 if (r > 0)
1034 options = buf;
1035
1036 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1037 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1038 goto fail;
1039 }
1040
1041 tmpfs_mounted = true;
1042
1043 f = prefix_roota(directory, "/usr");
1044 t = prefix_roota(template, "/usr");
1045
1046 r = mkdir(t, 0755);
1047 if (r < 0 && errno != EEXIST) {
1048 r = log_error_errno(errno, "Failed to create %s: %m", t);
1049 goto fail;
1050 }
1051
1052 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1053 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
1054 goto fail;
1055 }
1056
1057 bind_mounted = true;
1058
1059 r = bind_remount_recursive(t, true);
1060 if (r < 0) {
1061 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1062 goto fail;
1063 }
1064
1065 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1066 r = log_error_errno(errno, "Failed to move root mount: %m");
1067 goto fail;
1068 }
1069
1070 (void) rmdir(template);
1071
1072 return 0;
1073
1074 fail:
1075 if (bind_mounted)
1076 (void) umount(t);
1077
1078 if (tmpfs_mounted)
1079 (void) umount(template);
1080 (void) rmdir(template);
1081 return r;
1082 }
1083
1084 VolatileMode volatile_mode_from_string(const char *s) {
1085 int b;
1086
1087 if (isempty(s))
1088 return _VOLATILE_MODE_INVALID;
1089
1090 b = parse_boolean(s);
1091 if (b > 0)
1092 return VOLATILE_YES;
1093 if (b == 0)
1094 return VOLATILE_NO;
1095
1096 if (streq(s, "state"))
1097 return VOLATILE_STATE;
1098
1099 return _VOLATILE_MODE_INVALID;
1100 }