]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn-mount.c
util-lib: split out allocation calls into alloc-util.[ch]
[thirdparty/systemd.git] / src / nspawn / nspawn-mount.c
CommitLineData
e83bebef
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2015 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
4f5dd394 22#include <sys/mount.h>
07630cea 23#include <linux/magic.h>
e83bebef 24
b5efdb8a 25#include "alloc-util.h"
4f5dd394
LP
26#include "cgroup-util.h"
27#include "escape.h"
f4f15635 28#include "fs-util.h"
e83bebef 29#include "label.h"
4f5dd394 30#include "mkdir.h"
4349cd7c 31#include "mount-util.h"
6bedfcbb
LP
32#include "nspawn-mount.h"
33#include "parse-util.h"
4f5dd394
LP
34#include "path-util.h"
35#include "rm-rf.h"
e83bebef 36#include "set.h"
8fcde012 37#include "stat-util.h"
07630cea 38#include "string-util.h"
4f5dd394 39#include "strv.h"
ee104e11 40#include "user-util.h"
4f5dd394 41#include "util.h"
e83bebef
LP
42
43CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
44 CustomMount *c, *ret;
45
46 assert(l);
47 assert(n);
48 assert(t >= 0);
49 assert(t < _CUSTOM_MOUNT_TYPE_MAX);
50
51 c = realloc(*l, (*n + 1) * sizeof(CustomMount));
52 if (!c)
53 return NULL;
54
55 *l = c;
56 ret = *l + *n;
57 (*n)++;
58
59 *ret = (CustomMount) { .type = t };
60
61 return ret;
62}
63
64void custom_mount_free_all(CustomMount *l, unsigned n) {
65 unsigned i;
66
67 for (i = 0; i < n; i++) {
68 CustomMount *m = l + i;
69
70 free(m->source);
71 free(m->destination);
72 free(m->options);
73
74 if (m->work_dir) {
75 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
76 free(m->work_dir);
77 }
78
79 strv_free(m->lower);
80 }
81
82 free(l);
83}
84
85int custom_mount_compare(const void *a, const void *b) {
86 const CustomMount *x = a, *y = b;
87 int r;
88
89 r = path_compare(x->destination, y->destination);
90 if (r != 0)
91 return r;
92
93 if (x->type < y->type)
94 return -1;
95 if (x->type > y->type)
96 return 1;
97
98 return 0;
99}
100
101int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
102 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
103 const char *p = s;
104 CustomMount *m;
105 int r;
106
107 assert(l);
108 assert(n);
109
110 r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
111 if (r < 0)
112 return r;
113 if (r == 0)
114 return -EINVAL;
115
116 if (r == 1) {
117 destination = strdup(source);
118 if (!destination)
119 return -ENOMEM;
120 }
121
122 if (r == 2 && !isempty(p)) {
123 opts = strdup(p);
124 if (!opts)
125 return -ENOMEM;
126 }
127
128 if (!path_is_absolute(source))
129 return -EINVAL;
130
131 if (!path_is_absolute(destination))
132 return -EINVAL;
133
134 m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
135 if (!m)
136 return log_oom();
137
138 m->source = source;
139 m->destination = destination;
140 m->read_only = read_only;
141 m->options = opts;
142
143 source = destination = opts = NULL;
144 return 0;
145}
146
147int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
148 _cleanup_free_ char *path = NULL, *opts = NULL;
149 const char *p = s;
150 CustomMount *m;
151 int r;
152
153 assert(l);
154 assert(n);
155 assert(s);
156
157 r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
158 if (r < 0)
159 return r;
160 if (r == 0)
161 return -EINVAL;
162
163 if (isempty(p))
164 opts = strdup("mode=0755");
165 else
166 opts = strdup(p);
167 if (!opts)
168 return -ENOMEM;
169
170 if (!path_is_absolute(path))
171 return -EINVAL;
172
173 m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
174 if (!m)
175 return -ENOMEM;
176
177 m->destination = path;
178 m->options = opts;
179
180 path = opts = NULL;
181 return 0;
182}
183
184static int tmpfs_patch_options(
185 const char *options,
186 bool userns, uid_t uid_shift, uid_t uid_range,
187 const char *selinux_apifs_context,
188 char **ret) {
189
190 char *buf = NULL;
191
192 if (userns && uid_shift != 0) {
193 assert(uid_shift != UID_INVALID);
194
195 if (options)
196 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, uid_shift, uid_shift);
197 else
198 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, uid_shift, uid_shift);
199 if (!buf)
200 return -ENOMEM;
201
202 options = buf;
203 }
204
205#ifdef HAVE_SELINUX
206 if (selinux_apifs_context) {
207 char *t;
208
209 if (options)
210 t = strjoin(options, ",context=\"", selinux_apifs_context, "\"", NULL);
211 else
212 t = strjoin("context=\"", selinux_apifs_context, "\"", NULL);
213 if (!t) {
214 free(buf);
215 return -ENOMEM;
216 }
217
218 free(buf);
219 buf = t;
220 }
221#endif
222
223 *ret = buf;
224 return !!buf;
225}
226
d8fc6a00
LP
227int mount_sysfs(const char *dest) {
228 const char *full, *top, *x;
d1678248 229 int r;
d8fc6a00
LP
230
231 top = prefix_roota(dest, "/sys");
d1678248
ILG
232 r = path_check_fstype(top, SYSFS_MAGIC);
233 if (r < 0)
234 return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
235 /* /sys might already be mounted as sysfs by the outer child in the
236 * !netns case. In this case, it's all good. Don't touch it because we
237 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
238 */
239 if (r > 0)
240 return 0;
241
d8fc6a00
LP
242 full = prefix_roota(top, "/full");
243
244 (void) mkdir(full, 0755);
245
246 if (mount("sysfs", full, "sysfs", MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
247 return log_error_errno(errno, "Failed to mount sysfs to %s: %m", full);
248
249 FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
250 _cleanup_free_ char *from = NULL, *to = NULL;
251
252 from = prefix_root(full, x);
253 if (!from)
254 return log_oom();
255
256 to = prefix_root(top, x);
257 if (!to)
258 return log_oom();
259
260 (void) mkdir(to, 0755);
261
262 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
263 return log_error_errno(errno, "Failed to mount /sys/%s into place: %m", x);
264
265 if (mount(NULL, to, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
266 return log_error_errno(errno, "Failed to mount /sys/%s read-only: %m", x);
267 }
268
269 if (umount(full) < 0)
270 return log_error_errno(errno, "Failed to unmount %s: %m", full);
271
272 if (rmdir(full) < 0)
273 return log_error_errno(errno, "Failed to remove %s: %m", full);
274
275 x = prefix_roota(top, "/fs/kdbus");
276 (void) mkdir(x, 0755);
277
278 if (mount(NULL, top, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
279 return log_error_errno(errno, "Failed to make %s read-only: %m", top);
280
281 return 0;
282}
283
e83bebef 284int mount_all(const char *dest,
403af78c 285 bool use_userns, bool in_userns,
d1678248 286 bool use_netns,
403af78c 287 uid_t uid_shift, uid_t uid_range,
e83bebef
LP
288 const char *selinux_apifs_context) {
289
290 typedef struct MountPoint {
291 const char *what;
292 const char *where;
293 const char *type;
294 const char *options;
295 unsigned long flags;
296 bool fatal;
d1678248
ILG
297 bool in_userns;
298 bool use_netns;
e83bebef
LP
299 } MountPoint;
300
301 static const MountPoint mount_table[] = {
d1678248
ILG
302 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false },
303 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first */
304 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* Then, make it r/o */
305 { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true },
306 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false },
307 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false },
308 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
309 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
310 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false, false },
e83bebef 311#ifdef HAVE_SELINUX
d1678248
ILG
312 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */
313 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */
e83bebef
LP
314#endif
315 };
316
317 unsigned k;
318 int r;
319
320 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
321 _cleanup_free_ char *where = NULL, *options = NULL;
322 const char *o;
323
d1678248
ILG
324 if (in_userns != mount_table[k].in_userns)
325 continue;
326
327 if (!use_netns && mount_table[k].use_netns)
e83bebef
LP
328 continue;
329
330 where = prefix_root(dest, mount_table[k].where);
331 if (!where)
332 return log_oom();
333
334 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
335 if (r < 0 && r != -ENOENT)
336 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
337
338 /* Skip this entry if it is not a remount. */
339 if (mount_table[k].what && r > 0)
340 continue;
341
342 r = mkdir_p(where, 0755);
343 if (r < 0) {
344 if (mount_table[k].fatal)
345 return log_error_errno(r, "Failed to create directory %s: %m", where);
346
347 log_warning_errno(r, "Failed to create directory %s: %m", where);
348 continue;
349 }
350
351 o = mount_table[k].options;
352 if (streq_ptr(mount_table[k].type, "tmpfs")) {
403af78c 353 r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, selinux_apifs_context, &options);
e83bebef
LP
354 if (r < 0)
355 return log_oom();
356 if (r > 0)
357 o = options;
358 }
359
360 if (mount(mount_table[k].what,
361 where,
362 mount_table[k].type,
363 mount_table[k].flags,
364 o) < 0) {
365
366 if (mount_table[k].fatal)
367 return log_error_errno(errno, "mount(%s) failed: %m", where);
368
369 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
370 }
371 }
372
373 return 0;
374}
375
376static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
377 const char *p = options;
378 unsigned long flags = *mount_flags;
379 char *opts = NULL;
380
381 assert(options);
382
383 for (;;) {
384 _cleanup_free_ char *word = NULL;
385 int r = extract_first_word(&p, &word, ",", 0);
386 if (r < 0)
387 return log_error_errno(r, "Failed to extract mount option: %m");
388 if (r == 0)
389 break;
390
391 if (streq(word, "rbind"))
392 flags |= MS_REC;
393 else if (streq(word, "norbind"))
394 flags &= ~MS_REC;
395 else {
396 log_error("Invalid bind mount option: %s", word);
397 return -EINVAL;
398 }
399 }
400
401 *mount_flags = flags;
402 /* in the future mount_opts will hold string options for mount(2) */
403 *mount_opts = opts;
404
405 return 0;
406}
407
408static int mount_bind(const char *dest, CustomMount *m) {
409 struct stat source_st, dest_st;
410 const char *where;
411 unsigned long mount_flags = MS_BIND | MS_REC;
412 _cleanup_free_ char *mount_opts = NULL;
413 int r;
414
415 assert(m);
416
417 if (m->options) {
418 r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
419 if (r < 0)
420 return r;
421 }
422
423 if (stat(m->source, &source_st) < 0)
424 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
425
426 where = prefix_roota(dest, m->destination);
427
428 if (stat(where, &dest_st) >= 0) {
429 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
430 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
431 return -EINVAL;
432 }
433
434 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
435 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
436 return -EINVAL;
437 }
438
439 } else if (errno == ENOENT) {
440 r = mkdir_parents_label(where, 0755);
441 if (r < 0)
442 return log_error_errno(r, "Failed to make parents of %s: %m", where);
443 } else {
444 log_error_errno(errno, "Failed to stat %s: %m", where);
445 return -errno;
446 }
447
448 /* Create the mount point. Any non-directory file can be
449 * mounted on any non-directory file (regular, fifo, socket,
450 * char, block).
451 */
452 if (S_ISDIR(source_st.st_mode))
453 r = mkdir_label(where, 0755);
454 else
455 r = touch(where);
456 if (r < 0 && r != -EEXIST)
457 return log_error_errno(r, "Failed to create mount point %s: %m", where);
458
459 if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
460 return log_error_errno(errno, "mount(%s) failed: %m", where);
461
462 if (m->read_only) {
463 r = bind_remount_recursive(where, true);
464 if (r < 0)
465 return log_error_errno(r, "Read-only bind mount failed: %m");
466 }
467
468 return 0;
469}
470
471static int mount_tmpfs(
472 const char *dest,
473 CustomMount *m,
474 bool userns, uid_t uid_shift, uid_t uid_range,
475 const char *selinux_apifs_context) {
476
477 const char *where, *options;
478 _cleanup_free_ char *buf = NULL;
479 int r;
480
481 assert(dest);
482 assert(m);
483
484 where = prefix_roota(dest, m->destination);
485
486 r = mkdir_p_label(where, 0755);
487 if (r < 0 && r != -EEXIST)
488 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
489
490 r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
491 if (r < 0)
492 return log_oom();
493 options = r > 0 ? buf : m->options;
494
495 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
496 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
497
498 return 0;
499}
500
501static char *joined_and_escaped_lower_dirs(char * const *lower) {
502 _cleanup_strv_free_ char **sv = NULL;
503
504 sv = strv_copy(lower);
505 if (!sv)
506 return NULL;
507
508 strv_reverse(sv);
509
510 if (!strv_shell_escape(sv, ",:"))
511 return NULL;
512
513 return strv_join(sv, ":");
514}
515
516static int mount_overlay(const char *dest, CustomMount *m) {
517 _cleanup_free_ char *lower = NULL;
518 const char *where, *options;
519 int r;
520
521 assert(dest);
522 assert(m);
523
524 where = prefix_roota(dest, m->destination);
525
526 r = mkdir_label(where, 0755);
527 if (r < 0 && r != -EEXIST)
528 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
529
530 (void) mkdir_p_label(m->source, 0755);
531
532 lower = joined_and_escaped_lower_dirs(m->lower);
533 if (!lower)
534 return log_oom();
535
536 if (m->read_only) {
537 _cleanup_free_ char *escaped_source = NULL;
538
539 escaped_source = shell_escape(m->source, ",:");
540 if (!escaped_source)
541 return log_oom();
542
543 options = strjoina("lowerdir=", escaped_source, ":", lower);
544 } else {
545 _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
546
547 assert(m->work_dir);
548 (void) mkdir_label(m->work_dir, 0700);
549
550 escaped_source = shell_escape(m->source, ",:");
551 if (!escaped_source)
552 return log_oom();
553 escaped_work_dir = shell_escape(m->work_dir, ",:");
554 if (!escaped_work_dir)
555 return log_oom();
556
557 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
558 }
559
560 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
561 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
562
563 return 0;
564}
565
566int mount_custom(
567 const char *dest,
568 CustomMount *mounts, unsigned n,
569 bool userns, uid_t uid_shift, uid_t uid_range,
570 const char *selinux_apifs_context) {
571
572 unsigned i;
573 int r;
574
575 assert(dest);
576
577 for (i = 0; i < n; i++) {
578 CustomMount *m = mounts + i;
579
580 switch (m->type) {
581
582 case CUSTOM_MOUNT_BIND:
583 r = mount_bind(dest, m);
584 break;
585
586 case CUSTOM_MOUNT_TMPFS:
587 r = mount_tmpfs(dest, m, userns, uid_shift, uid_range, selinux_apifs_context);
588 break;
589
590 case CUSTOM_MOUNT_OVERLAY:
591 r = mount_overlay(dest, m);
592 break;
593
594 default:
595 assert_not_reached("Unknown custom mount type");
596 }
597
598 if (r < 0)
599 return r;
600 }
601
602 return 0;
603}
604
605static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
606 char *to;
607 int r;
608
ee30f6ac 609 to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
e83bebef
LP
610
611 r = path_is_mount_point(to, 0);
612 if (r < 0 && r != -ENOENT)
613 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
614 if (r > 0)
615 return 0;
616
617 mkdir_p(to, 0755);
618
619 /* The superblock mount options of the mount point need to be
620 * identical to the hosts', and hence writable... */
621 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
622 return log_error_errno(errno, "Failed to mount to %s: %m", to);
623
624 /* ... hence let's only make the bind mount read-only, not the
625 * superblock. */
626 if (read_only) {
627 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
628 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
629 }
630 return 1;
631}
632
633static int mount_legacy_cgroups(
634 const char *dest,
635 bool userns, uid_t uid_shift, uid_t uid_range,
636 const char *selinux_apifs_context) {
637
638 _cleanup_set_free_free_ Set *controllers = NULL;
639 const char *cgroup_root;
640 int r;
641
642 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
643
d8fc6a00
LP
644 (void) mkdir_p(cgroup_root, 0755);
645
e83bebef
LP
646 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
647 r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
648 if (r < 0)
649 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
650 if (r == 0) {
651 _cleanup_free_ char *options = NULL;
652
653 r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, selinux_apifs_context, &options);
654 if (r < 0)
655 return log_oom();
656
657 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
658 return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
659 }
660
661 if (cg_unified() > 0)
662 goto skip_controllers;
663
664 controllers = set_new(&string_hash_ops);
665 if (!controllers)
666 return log_oom();
667
668 r = cg_kernel_controllers(controllers);
669 if (r < 0)
670 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
671
672 for (;;) {
673 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
674
675 controller = set_steal_first(controllers);
676 if (!controller)
677 break;
678
679 origin = prefix_root("/sys/fs/cgroup/", controller);
680 if (!origin)
681 return log_oom();
682
683 r = readlink_malloc(origin, &combined);
684 if (r == -EINVAL) {
685 /* Not a symbolic link, but directly a single cgroup hierarchy */
686
687 r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
688 if (r < 0)
689 return r;
690
691 } else if (r < 0)
692 return log_error_errno(r, "Failed to read link %s: %m", origin);
693 else {
694 _cleanup_free_ char *target = NULL;
695
696 target = prefix_root(dest, origin);
697 if (!target)
698 return log_oom();
699
700 /* A symbolic link, a combination of controllers in one hierarchy */
701
702 if (!filename_is_valid(combined)) {
703 log_warning("Ignoring invalid combined hierarchy %s.", combined);
704 continue;
705 }
706
707 r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
708 if (r < 0)
709 return r;
710
711 r = symlink_idempotent(combined, target);
712 if (r == -EINVAL) {
713 log_error("Invalid existing symlink for combined hierarchy");
714 return r;
715 }
716 if (r < 0)
717 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
718 }
719 }
720
721skip_controllers:
722 r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
723 if (r < 0)
724 return r;
725
726 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
727 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
728
729 return 0;
730}
731
732static int mount_unified_cgroups(const char *dest) {
733 const char *p;
734 int r;
735
736 assert(dest);
737
88e10572
MT
738 p = prefix_roota(dest, "/sys/fs/cgroup");
739
740 (void) mkdir_p(p, 0755);
e83bebef
LP
741
742 r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
743 if (r < 0)
744 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
745 if (r > 0) {
88e10572 746 p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs");
e83bebef
LP
747 if (access(p, F_OK) >= 0)
748 return 0;
749 if (errno != ENOENT)
750 return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
751
752 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
753 return -EINVAL;
754 }
755
756 if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0)
757 return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
758
759 return 0;
760}
761
/* Sets up /sys/fs/cgroup below dest: the unified hierarchy if
 * unified_requested is true, the legacy per-controller hierarchies otherwise.
 * Returns whatever the selected helper returns. */
int mount_cgroups(
                const char *dest,
                bool unified_requested,
                bool userns, uid_t uid_shift, uid_t uid_range,
                const char *selinux_apifs_context) {

        return unified_requested
                ? mount_unified_cgroups(dest)
                : mount_legacy_cgroups(dest, userns, uid_shift, uid_range, selinux_apifs_context);
}
773
774int mount_systemd_cgroup_writable(
775 const char *dest,
776 bool unified_requested) {
777
778 _cleanup_free_ char *own_cgroup_path = NULL;
779 const char *systemd_root, *systemd_own;
780 int r;
781
782 assert(dest);
783
784 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
785 if (r < 0)
786 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
787
788 /* If we are living in the top-level, then there's nothing to do... */
789 if (path_equal(own_cgroup_path, "/"))
790 return 0;
791
792 if (unified_requested) {
793 systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
794 systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
795 } else {
796 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
797 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
798 }
799
800 /* Make our own cgroup a (writable) bind mount */
801 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
802 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
803
804 /* And then remount the systemd cgroup root read-only */
805 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
806 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
807
808 return 0;
809}
810
811int setup_volatile_state(
812 const char *directory,
813 VolatileMode mode,
814 bool userns, uid_t uid_shift, uid_t uid_range,
815 const char *selinux_apifs_context) {
816
817 _cleanup_free_ char *buf = NULL;
818 const char *p, *options;
819 int r;
820
821 assert(directory);
822
823 if (mode != VOLATILE_STATE)
824 return 0;
825
826 /* --volatile=state means we simply overmount /var
827 with a tmpfs, and the rest read-only. */
828
829 r = bind_remount_recursive(directory, true);
830 if (r < 0)
831 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
832
833 p = prefix_roota(directory, "/var");
834 r = mkdir(p, 0755);
835 if (r < 0 && errno != EEXIST)
836 return log_error_errno(errno, "Failed to create %s: %m", directory);
837
838 options = "mode=755";
839 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
840 if (r < 0)
841 return log_oom();
842 if (r > 0)
843 options = buf;
844
845 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
846 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
847
848 return 0;
849}
850
851int setup_volatile(
852 const char *directory,
853 VolatileMode mode,
854 bool userns, uid_t uid_shift, uid_t uid_range,
855 const char *selinux_apifs_context) {
856
857 bool tmpfs_mounted = false, bind_mounted = false;
858 char template[] = "/tmp/nspawn-volatile-XXXXXX";
859 _cleanup_free_ char *buf = NULL;
860 const char *f, *t, *options;
861 int r;
862
863 assert(directory);
864
865 if (mode != VOLATILE_YES)
866 return 0;
867
868 /* --volatile=yes means we mount a tmpfs to the root dir, and
869 the original /usr to use inside it, and that read-only. */
870
871 if (!mkdtemp(template))
872 return log_error_errno(errno, "Failed to create temporary directory: %m");
873
874 options = "mode=755";
875 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
876 if (r < 0)
877 return log_oom();
878 if (r > 0)
879 options = buf;
880
881 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
882 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
883 goto fail;
884 }
885
886 tmpfs_mounted = true;
887
888 f = prefix_roota(directory, "/usr");
889 t = prefix_roota(template, "/usr");
890
891 r = mkdir(t, 0755);
892 if (r < 0 && errno != EEXIST) {
893 r = log_error_errno(errno, "Failed to create %s: %m", t);
894 goto fail;
895 }
896
897 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
898 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
899 goto fail;
900 }
901
902 bind_mounted = true;
903
904 r = bind_remount_recursive(t, true);
905 if (r < 0) {
906 log_error_errno(r, "Failed to remount %s read-only: %m", t);
907 goto fail;
908 }
909
910 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
911 r = log_error_errno(errno, "Failed to move root mount: %m");
912 goto fail;
913 }
914
915 (void) rmdir(template);
916
917 return 0;
918
919fail:
920 if (bind_mounted)
921 (void) umount(t);
922
923 if (tmpfs_mounted)
924 (void) umount(template);
925 (void) rmdir(template);
926 return r;
927}
928
929VolatileMode volatile_mode_from_string(const char *s) {
930 int b;
931
932 if (isempty(s))
933 return _VOLATILE_MODE_INVALID;
934
935 b = parse_boolean(s);
936 if (b > 0)
937 return VOLATILE_YES;
938 if (b == 0)
939 return VOLATILE_NO;
940
941 if (streq(s, "state"))
942 return VOLATILE_STATE;
943
944 return _VOLATILE_MODE_INVALID;
945}