]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn-mount.c
Add SPDX license identifiers to source files under the LGPL
[thirdparty/systemd.git] / src / nspawn / nspawn-mount.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
e83bebef
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2015 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
4f5dd394 21#include <sys/mount.h>
07630cea 22#include <linux/magic.h>
e83bebef 23
b5efdb8a 24#include "alloc-util.h"
4f5dd394 25#include "escape.h"
0996ef00
CB
26#include "fd-util.h"
27#include "fileio.h"
f4f15635 28#include "fs-util.h"
e83bebef 29#include "label.h"
4f5dd394 30#include "mkdir.h"
4349cd7c 31#include "mount-util.h"
6bedfcbb
LP
32#include "nspawn-mount.h"
33#include "parse-util.h"
4f5dd394
LP
34#include "path-util.h"
35#include "rm-rf.h"
e83bebef 36#include "set.h"
8fcde012 37#include "stat-util.h"
07630cea 38#include "string-util.h"
4f5dd394 39#include "strv.h"
ee104e11 40#include "user-util.h"
4f5dd394 41#include "util.h"
e83bebef
LP
42
43CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
44 CustomMount *c, *ret;
45
46 assert(l);
47 assert(n);
48 assert(t >= 0);
49 assert(t < _CUSTOM_MOUNT_TYPE_MAX);
50
56391931 51 c = realloc_multiply(*l, (*n + 1), sizeof(CustomMount));
e83bebef
LP
52 if (!c)
53 return NULL;
54
55 *l = c;
56 ret = *l + *n;
57 (*n)++;
58
59 *ret = (CustomMount) { .type = t };
60
61 return ret;
62}
63
64void custom_mount_free_all(CustomMount *l, unsigned n) {
65 unsigned i;
66
67 for (i = 0; i < n; i++) {
68 CustomMount *m = l + i;
69
70 free(m->source);
71 free(m->destination);
72 free(m->options);
73
74 if (m->work_dir) {
75 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
76 free(m->work_dir);
77 }
78
c7a4890c
LP
79 if (m->rm_rf_tmpdir) {
80 (void) rm_rf(m->rm_rf_tmpdir, REMOVE_ROOT|REMOVE_PHYSICAL);
81 free(m->rm_rf_tmpdir);
82 }
83
e83bebef
LP
84 strv_free(m->lower);
85 }
86
87 free(l);
88}
89
86c0dd4a 90static int custom_mount_compare(const void *a, const void *b) {
e83bebef
LP
91 const CustomMount *x = a, *y = b;
92 int r;
93
94 r = path_compare(x->destination, y->destination);
95 if (r != 0)
96 return r;
97
98 if (x->type < y->type)
99 return -1;
100 if (x->type > y->type)
101 return 1;
102
103 return 0;
104}
105
86c0dd4a
LP
/* Checks whether p is acceptable as a mount source: it must be an absolute path, optionally preceded
 * by a single "+" character. */
static bool source_path_is_valid(const char *p) {
        assert(p);

        return path_is_absolute(p[0] == '+' ? p + 1 : p);
}
114
/* Returns a newly allocated version of the source path. If source begins with "+", the remainder is
 * passed through prefix_root() together with dest; otherwise source is duplicated verbatim. Returns NULL
 * if source is NULL or if the underlying call returns NULL. */
static char *resolve_source_path(const char *dest, const char *source) {
        if (!source)
                return NULL;

        return source[0] == '+' ? prefix_root(dest, source + 1) : strdup(source);
}
125
126int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
127 unsigned i;
128 int r;
129
130 /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
131 * parent process, so that we know the temporary directories to remove on exit before we fork off the
132 * children. */
133
134 assert(l || n == 0);
135
136 /* Order the custom mounts, and make sure we have a working directory */
137 qsort_safe(l, n, sizeof(CustomMount), custom_mount_compare);
138
139 for (i = 0; i < n; i++) {
140 CustomMount *m = l + i;
141
142 if (m->source) {
143 char *s;
144
145 s = resolve_source_path(dest, m->source);
146 if (!s)
147 return log_oom();
148
149 free(m->source);
150 m->source = s;
c7a4890c
LP
151 } else {
152 /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
153
154 m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX");
155 if (!m->rm_rf_tmpdir)
156 return log_oom();
157
158 if (!mkdtemp(m->rm_rf_tmpdir)) {
159 m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir);
160 return log_error_errno(errno, "Failed to acquire temporary directory: %m");
161 }
162
163 m->source = strjoin(m->rm_rf_tmpdir, "/src");
164 if (!m->source)
165 return log_oom();
166
167 if (mkdir(m->source, 0755) < 0)
168 return log_error_errno(errno, "Failed to create %s: %m", m->source);
86c0dd4a
LP
169 }
170
171 if (m->type == CUSTOM_MOUNT_OVERLAY) {
172 char **j;
173
174 STRV_FOREACH(j, m->lower) {
175 char *s;
176
177 s = resolve_source_path(dest, *j);
178 if (!s)
179 return log_oom();
180
181 free(*j);
182 *j = s;
183 }
184
185 if (m->work_dir) {
186 char *s;
187
188 s = resolve_source_path(dest, m->work_dir);
189 if (!s)
190 return log_oom();
191
192 free(m->work_dir);
193 m->work_dir = s;
194 } else {
195 assert(m->source);
196
197 r = tempfn_random(m->source, NULL, &m->work_dir);
198 if (r < 0)
199 return log_error_errno(r, "Failed to acquire working directory: %m");
200 }
201
202 (void) mkdir_label(m->work_dir, 0700);
203 }
204 }
205
206 return 0;
207}
208
e83bebef
LP
209int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
210 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
211 const char *p = s;
212 CustomMount *m;
213 int r;
214
215 assert(l);
216 assert(n);
217
218 r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
219 if (r < 0)
220 return r;
221 if (r == 0)
222 return -EINVAL;
e83bebef 223 if (r == 1) {
86c0dd4a 224 destination = strdup(source[0] == '+' ? source+1 : source);
e83bebef
LP
225 if (!destination)
226 return -ENOMEM;
227 }
e83bebef
LP
228 if (r == 2 && !isempty(p)) {
229 opts = strdup(p);
230 if (!opts)
231 return -ENOMEM;
232 }
233
c7a4890c
LP
234 if (isempty(source))
235 source = NULL;
236 else if (!source_path_is_valid(source))
e83bebef 237 return -EINVAL;
c7a4890c 238
e83bebef
LP
239 if (!path_is_absolute(destination))
240 return -EINVAL;
241
242 m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
243 if (!m)
48cbe5f8 244 return -ENOMEM;
e83bebef
LP
245
246 m->source = source;
247 m->destination = destination;
248 m->read_only = read_only;
249 m->options = opts;
250
251 source = destination = opts = NULL;
252 return 0;
253}
254
255int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
256 _cleanup_free_ char *path = NULL, *opts = NULL;
257 const char *p = s;
258 CustomMount *m;
259 int r;
260
261 assert(l);
262 assert(n);
263 assert(s);
264
265 r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
266 if (r < 0)
267 return r;
268 if (r == 0)
269 return -EINVAL;
270
271 if (isempty(p))
272 opts = strdup("mode=0755");
273 else
274 opts = strdup(p);
275 if (!opts)
276 return -ENOMEM;
277
278 if (!path_is_absolute(path))
279 return -EINVAL;
280
281 m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
282 if (!m)
283 return -ENOMEM;
284
285 m->destination = path;
286 m->options = opts;
287
288 path = opts = NULL;
289 return 0;
290}
291
ad85779a
LP
292int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
293 _cleanup_free_ char *upper = NULL, *destination = NULL;
294 _cleanup_strv_free_ char **lower = NULL;
295 CustomMount *m;
86c0dd4a 296 int k;
ad85779a 297
86c0dd4a
LP
298 k = strv_split_extract(&lower, s, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
299 if (k < 0)
300 return k;
ad85779a
LP
301 if (k < 2)
302 return -EADDRNOTAVAIL;
303 if (k == 2) {
86c0dd4a
LP
304 /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
305 * we'll also define the destination mount point the same as the upper. */
306
307 if (!source_path_is_valid(lower[0]) ||
308 !source_path_is_valid(lower[1]))
309 return -EINVAL;
310
ad85779a
LP
311 upper = lower[1];
312 lower[1] = NULL;
313
86c0dd4a 314 destination = strdup(upper[0] == '+' ? upper+1 : upper); /* take the destination without "+" prefix */
ad85779a
LP
315 if (!destination)
316 return -ENOMEM;
ad85779a 317 } else {
c7a4890c 318 char **i;
86c0dd4a
LP
319
320 /* If more than two parameters are specified, the last one is the destination, the second to last one
321 * the "upper", and all before that the "lower" directories. */
322
ad85779a 323 destination = lower[k - 1];
86c0dd4a 324 upper = lower[k - 2];
ad85779a 325 lower[k - 2] = NULL;
86c0dd4a 326
c7a4890c
LP
327 STRV_FOREACH(i, lower)
328 if (!source_path_is_valid(*i))
329 return -EINVAL;
330
331 /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
332 * in /var/tmp */
333 if (isempty(upper))
334 upper = NULL;
335 else if (!source_path_is_valid(upper))
336 return -EINVAL;
337
86c0dd4a
LP
338 if (!path_is_absolute(destination))
339 return -EINVAL;
ad85779a
LP
340 }
341
342 m = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY);
343 if (!m)
344 return -ENOMEM;
345
346 m->destination = destination;
347 m->source = upper;
348 m->lower = lower;
349 m->read_only = read_only;
350
351 upper = destination = NULL;
352 lower = NULL;
353
354 return 0;
355}
356
e83bebef
LP
357static int tmpfs_patch_options(
358 const char *options,
0996ef00
CB
359 bool userns,
360 uid_t uid_shift, uid_t uid_range,
361 bool patch_ids,
e83bebef
LP
362 const char *selinux_apifs_context,
363 char **ret) {
364
365 char *buf = NULL;
366
0996ef00 367 if ((userns && uid_shift != 0) || patch_ids) {
e83bebef
LP
368 assert(uid_shift != UID_INVALID);
369
9aa2169e
ZJS
370 if (asprintf(&buf, "%s%suid=" UID_FMT ",gid=" UID_FMT,
371 options ?: "", options ? "," : "",
372 uid_shift, uid_shift) < 0)
e83bebef
LP
373 return -ENOMEM;
374
375 options = buf;
376 }
377
349cc4a5 378#if HAVE_SELINUX
e83bebef
LP
379 if (selinux_apifs_context) {
380 char *t;
381
9aa2169e
ZJS
382 t = strjoin(options ?: "", options ? "," : "",
383 "context=\"", selinux_apifs_context, "\"");
384 free(buf);
385 if (!t)
e83bebef 386 return -ENOMEM;
e83bebef 387
e83bebef
LP
388 buf = t;
389 }
390#endif
391
0996ef00
CB
392 if (!buf && options) {
393 buf = strdup(options);
394 if (!buf)
395 return -ENOMEM;
396 }
e83bebef 397 *ret = buf;
0996ef00 398
e83bebef
LP
399 return !!buf;
400}
401
4f086aab 402int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
d8fc6a00 403 const char *full, *top, *x;
d1678248 404 int r;
4f086aab 405 unsigned long extra_flags = 0;
d8fc6a00
LP
406
407 top = prefix_roota(dest, "/sys");
d1678248
ILG
408 r = path_check_fstype(top, SYSFS_MAGIC);
409 if (r < 0)
410 return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
411 /* /sys might already be mounted as sysfs by the outer child in the
412 * !netns case. In this case, it's all good. Don't touch it because we
413 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
414 */
415 if (r > 0)
416 return 0;
417
d8fc6a00
LP
418 full = prefix_roota(top, "/full");
419
420 (void) mkdir(full, 0755);
421
4f086aab
SU
422 if (mount_settings & MOUNT_APPLY_APIVFS_RO)
423 extra_flags |= MS_RDONLY;
424
60e76d48 425 r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
4f086aab 426 MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL);
60e76d48
ZJS
427 if (r < 0)
428 return r;
d8fc6a00
LP
429
430 FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
431 _cleanup_free_ char *from = NULL, *to = NULL;
432
433 from = prefix_root(full, x);
434 if (!from)
435 return log_oom();
436
437 to = prefix_root(top, x);
438 if (!to)
439 return log_oom();
440
441 (void) mkdir(to, 0755);
442
60e76d48
ZJS
443 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
444 if (r < 0)
445 return r;
d8fc6a00 446
60e76d48 447 r = mount_verbose(LOG_ERR, NULL, to, NULL,
4f086aab 448 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
60e76d48
ZJS
449 if (r < 0)
450 return r;
d8fc6a00
LP
451 }
452
60e76d48
ZJS
453 r = umount_verbose(full);
454 if (r < 0)
455 return r;
d8fc6a00
LP
456
457 if (rmdir(full) < 0)
458 return log_error_errno(errno, "Failed to remove %s: %m", full);
459
0996ef00
CB
460 /* Create mountpoint for cgroups. Otherwise we are not allowed since we
461 * remount /sys read-only.
462 */
463 if (cg_ns_supported()) {
464 x = prefix_roota(top, "/fs/cgroup");
465 (void) mkdir_p(x, 0755);
466 }
d8fc6a00 467
60e76d48 468 return mount_verbose(LOG_ERR, NULL, top, NULL,
4f086aab 469 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
d8fc6a00
LP
470}
471
acbbf69b 472static int mkdir_userns(const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) {
63eae723
EV
473 int r;
474
475 assert(path);
476
477 r = mkdir(path, mode);
478 if (r < 0 && errno != EEXIST)
479 return -errno;
480
acbbf69b
LP
481 if ((mask & MOUNT_USE_USERNS) == 0)
482 return 0;
483
484 if (mask & MOUNT_IN_USERNS)
485 return 0;
486
487 r = lchown(path, uid_shift, uid_shift);
488 if (r < 0)
489 return -errno;
63eae723
EV
490
491 return 0;
492}
493
acbbf69b 494static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) {
63eae723
EV
495 const char *p, *e;
496 int r;
497
498 assert(path);
499
500 if (prefix && !path_startswith(path, prefix))
501 return -ENOTDIR;
502
503 /* create every parent directory in the path, except the last component */
504 p = path + strspn(path, "/");
505 for (;;) {
506 char t[strlen(path) + 1];
507
508 e = p + strcspn(p, "/");
509 p = e + strspn(e, "/");
510
511 /* Is this the last component? If so, then we're done */
512 if (*p == 0)
513 break;
514
515 memcpy(t, path, e - path);
516 t[e-path] = 0;
517
518 if (prefix && path_startswith(prefix, t))
519 continue;
520
acbbf69b 521 r = mkdir_userns(t, mode, mask, uid_shift);
63eae723
EV
522 if (r < 0)
523 return r;
524 }
525
acbbf69b 526 return mkdir_userns(path, mode, mask, uid_shift);
63eae723
EV
527}
528
e83bebef 529int mount_all(const char *dest,
4f086aab 530 MountSettingsMask mount_settings,
403af78c 531 uid_t uid_shift, uid_t uid_range,
e83bebef
LP
532 const char *selinux_apifs_context) {
533
534 typedef struct MountPoint {
535 const char *what;
536 const char *where;
537 const char *type;
538 const char *options;
539 unsigned long flags;
4f086aab 540 MountSettingsMask mount_settings;
e83bebef
LP
541 } MountPoint;
542
543 static const MountPoint mount_table[] = {
4f086aab
SU
544 /* inner child mounts */
545 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_IN_USERNS },
13e785f7 546 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
4f086aab
SU
547 { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
548 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
13e785f7 549 { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
4f086aab 550 { NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
4f086aab
SU
551
552 /* outer child mounts */
e8a94ce8 553 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
4f086aab
SU
554 { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
555 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
556 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL }, /* skipped if above was mounted */
557
558 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL },
559 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
560 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
349cc4a5 561#if HAVE_SELINUX
4f086aab
SU
562 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, 0 }, /* Bind mount first */
563 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 }, /* Then, make it r/o */
e83bebef
LP
564#endif
565 };
566
567 unsigned k;
568 int r;
4f086aab
SU
569 bool use_userns = (mount_settings & MOUNT_USE_USERNS);
570 bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
571 bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
572 bool in_userns = (mount_settings & MOUNT_IN_USERNS);
e83bebef
LP
573
574 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
575 _cleanup_free_ char *where = NULL, *options = NULL;
576 const char *o;
4f086aab
SU
577 bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
578
579 if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
580 continue;
e83bebef 581
4f086aab 582 if (!netns && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_NETNS))
d1678248
ILG
583 continue;
584
4f086aab 585 if (!ro && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_RO))
e83bebef
LP
586 continue;
587
cb638b5e 588 r = chase_symlinks(mount_table[k].where, dest, CHASE_NONEXISTENT|CHASE_PREFIX_ROOT, &where);
8ce48cf0 589 if (r < 0)
ec57bd42 590 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
e83bebef 591
8ce48cf0 592 r = path_is_mount_point(where, NULL, 0);
e83bebef
LP
593 if (r < 0 && r != -ENOENT)
594 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
595
596 /* Skip this entry if it is not a remount. */
597 if (mount_table[k].what && r > 0)
598 continue;
599
acbbf69b 600 r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift);
920a7899 601 if (r < 0 && r != -EEXIST) {
4f13e534 602 if (fatal && r != -EROFS)
e83bebef
LP
603 return log_error_errno(r, "Failed to create directory %s: %m", where);
604
201b13c8 605 log_debug_errno(r, "Failed to create directory %s: %m", where);
4f13e534
LT
606 /* If we failed mkdir() or chown() due to the root
607 * directory being read only, attempt to mount this fs
608 * anyway and let mount_verbose log any errors */
609 if (r != -EROFS)
610 continue;
e83bebef
LP
611 }
612
613 o = mount_table[k].options;
614 if (streq_ptr(mount_table[k].type, "tmpfs")) {
8492849e
EV
615 if (in_userns)
616 r = tmpfs_patch_options(o, use_userns, 0, uid_range, true, selinux_apifs_context, &options);
617 else
618 r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
e83bebef
LP
619 if (r < 0)
620 return log_oom();
621 if (r > 0)
622 o = options;
623 }
624
4f086aab 625 r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
60e76d48
ZJS
626 mount_table[k].what,
627 where,
628 mount_table[k].type,
629 mount_table[k].flags,
630 o);
4f086aab 631 if (r < 0 && fatal)
60e76d48 632 return r;
e83bebef
LP
633 }
634
635 return 0;
636}
637
638static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
639 const char *p = options;
640 unsigned long flags = *mount_flags;
641 char *opts = NULL;
4da92e58 642 int r;
e83bebef
LP
643
644 assert(options);
645
646 for (;;) {
647 _cleanup_free_ char *word = NULL;
4da92e58
LP
648
649 r = extract_first_word(&p, &word, ",", 0);
e83bebef
LP
650 if (r < 0)
651 return log_error_errno(r, "Failed to extract mount option: %m");
652 if (r == 0)
653 break;
654
655 if (streq(word, "rbind"))
656 flags |= MS_REC;
657 else if (streq(word, "norbind"))
658 flags &= ~MS_REC;
659 else {
660 log_error("Invalid bind mount option: %s", word);
661 return -EINVAL;
662 }
663 }
664
665 *mount_flags = flags;
666 /* in the future mount_opts will hold string options for mount(2) */
667 *mount_opts = opts;
668
669 return 0;
670}
671
672static int mount_bind(const char *dest, CustomMount *m) {
68cf43c3
LP
673
674 _cleanup_free_ char *mount_opts = NULL, *where = NULL;
e83bebef 675 unsigned long mount_flags = MS_BIND | MS_REC;
68cf43c3 676 struct stat source_st, dest_st;
e83bebef
LP
677 int r;
678
86c0dd4a 679 assert(dest);
e83bebef
LP
680 assert(m);
681
682 if (m->options) {
683 r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
684 if (r < 0)
685 return r;
686 }
687
688 if (stat(m->source, &source_st) < 0)
689 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
690
cb638b5e 691 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
68cf43c3 692 if (r < 0)
ec57bd42 693 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
8ce48cf0
LP
694 if (r > 0) { /* Path exists already? */
695
696 if (stat(where, &dest_st) < 0)
697 return log_error_errno(errno, "Failed to stat %s: %m", where);
e83bebef 698
e83bebef
LP
699 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
700 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
701 return -EINVAL;
702 }
703
704 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
705 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
706 return -EINVAL;
707 }
708
8ce48cf0 709 } else { /* Path doesn't exist yet? */
e83bebef
LP
710 r = mkdir_parents_label(where, 0755);
711 if (r < 0)
712 return log_error_errno(r, "Failed to make parents of %s: %m", where);
b97e83cb
BN
713
714 /* Create the mount point. Any non-directory file can be
715 * mounted on any non-directory file (regular, fifo, socket,
716 * char, block).
717 */
718 if (S_ISDIR(source_st.st_mode))
719 r = mkdir_label(where, 0755);
720 else
721 r = touch(where);
722 if (r < 0)
723 return log_error_errno(r, "Failed to create mount point %s: %m", where);
724
8ce48cf0 725 }
e83bebef 726
60e76d48
ZJS
727 r = mount_verbose(LOG_ERR, m->source, where, NULL, mount_flags, mount_opts);
728 if (r < 0)
729 return r;
e83bebef
LP
730
731 if (m->read_only) {
6b7c9f8b 732 r = bind_remount_recursive(where, true, NULL);
e83bebef
LP
733 if (r < 0)
734 return log_error_errno(r, "Read-only bind mount failed: %m");
735 }
736
737 return 0;
738}
739
740static int mount_tmpfs(
741 const char *dest,
742 CustomMount *m,
743 bool userns, uid_t uid_shift, uid_t uid_range,
744 const char *selinux_apifs_context) {
745
68cf43c3
LP
746 const char *options;
747 _cleanup_free_ char *buf = NULL, *where = NULL;
e83bebef
LP
748 int r;
749
750 assert(dest);
751 assert(m);
752
cb638b5e 753 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
68cf43c3 754 if (r < 0)
ec57bd42 755 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
8ce48cf0
LP
756 if (r == 0) { /* Doesn't exist yet? */
757 r = mkdir_p_label(where, 0755);
758 if (r < 0)
759 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
760 }
e83bebef 761
0996ef00 762 r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
e83bebef
LP
763 if (r < 0)
764 return log_oom();
765 options = r > 0 ? buf : m->options;
766
60e76d48 767 return mount_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
e83bebef
LP
768}
769
86c0dd4a 770static char *joined_and_escaped_lower_dirs(char **lower) {
e83bebef
LP
771 _cleanup_strv_free_ char **sv = NULL;
772
773 sv = strv_copy(lower);
774 if (!sv)
775 return NULL;
776
777 strv_reverse(sv);
778
779 if (!strv_shell_escape(sv, ",:"))
780 return NULL;
781
782 return strv_join(sv, ":");
783}
784
785static int mount_overlay(const char *dest, CustomMount *m) {
68cf43c3 786
86c0dd4a 787 _cleanup_free_ char *lower = NULL, *where = NULL, *escaped_source = NULL;
68cf43c3 788 const char *options;
e83bebef
LP
789 int r;
790
791 assert(dest);
792 assert(m);
793
cb638b5e 794 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
68cf43c3 795 if (r < 0)
ec57bd42 796 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
8ce48cf0
LP
797 if (r == 0) { /* Doesn't exist yet? */
798 r = mkdir_label(where, 0755);
799 if (r < 0)
800 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
801 }
e83bebef
LP
802
803 (void) mkdir_p_label(m->source, 0755);
804
805 lower = joined_and_escaped_lower_dirs(m->lower);
806 if (!lower)
807 return log_oom();
808
86c0dd4a
LP
809 escaped_source = shell_escape(m->source, ",:");
810 if (!escaped_source)
811 return log_oom();
e83bebef 812
86c0dd4a 813 if (m->read_only)
e83bebef 814 options = strjoina("lowerdir=", escaped_source, ":", lower);
86c0dd4a
LP
815 else {
816 _cleanup_free_ char *escaped_work_dir = NULL;
e83bebef 817
e83bebef
LP
818 escaped_work_dir = shell_escape(m->work_dir, ",:");
819 if (!escaped_work_dir)
820 return log_oom();
821
822 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
823 }
824
60e76d48 825 return mount_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options);
e83bebef
LP
826}
827
828int mount_custom(
829 const char *dest,
830 CustomMount *mounts, unsigned n,
831 bool userns, uid_t uid_shift, uid_t uid_range,
832 const char *selinux_apifs_context) {
833
834 unsigned i;
835 int r;
836
837 assert(dest);
838
839 for (i = 0; i < n; i++) {
840 CustomMount *m = mounts + i;
841
842 switch (m->type) {
843
844 case CUSTOM_MOUNT_BIND:
845 r = mount_bind(dest, m);
846 break;
847
848 case CUSTOM_MOUNT_TMPFS:
849 r = mount_tmpfs(dest, m, userns, uid_shift, uid_range, selinux_apifs_context);
850 break;
851
852 case CUSTOM_MOUNT_OVERLAY:
853 r = mount_overlay(dest, m);
854 break;
855
856 default:
857 assert_not_reached("Unknown custom mount type");
858 }
859
860 if (r < 0)
861 return r;
862 }
863
864 return 0;
865}
866
0996ef00
CB
867/* Retrieve existing subsystems. This function is called in a new cgroup
868 * namespace.
869 */
870static int get_controllers(Set *subsystems) {
871 _cleanup_fclose_ FILE *f = NULL;
872 char line[LINE_MAX];
873
874 assert(subsystems);
875
876 f = fopen("/proc/self/cgroup", "re");
877 if (!f)
878 return errno == ENOENT ? -ESRCH : -errno;
879
880 FOREACH_LINE(line, f, return -errno) {
881 int r;
882 char *e, *l, *p;
883
0996ef00
CB
884 l = strchr(line, ':');
885 if (!l)
886 continue;
887
888 l++;
889 e = strchr(l, ':');
890 if (!e)
891 continue;
892
893 *e = 0;
894
2977724b 895 if (STR_IN_SET(l, "", "name=systemd", "name=unified"))
0996ef00
CB
896 continue;
897
898 p = strdup(l);
add554f4
ZJS
899 if (!p)
900 return -ENOMEM;
901
0996ef00
CB
902 r = set_consume(subsystems, p);
903 if (r < 0)
904 return r;
905 }
906
907 return 0;
908}
909
e1873695
LP
910static int mount_legacy_cgroup_hierarchy(
911 const char *dest,
912 const char *controller,
913 const char *hierarchy,
e1873695
LP
914 bool read_only) {
915
60e76d48 916 const char *to, *fstype, *opts;
e83bebef
LP
917 int r;
918
ee30f6ac 919 to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
e83bebef 920
e1873695 921 r = path_is_mount_point(to, dest, 0);
e83bebef
LP
922 if (r < 0 && r != -ENOENT)
923 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
924 if (r > 0)
925 return 0;
926
927 mkdir_p(to, 0755);
928
929 /* The superblock mount options of the mount point need to be
930 * identical to the hosts', and hence writable... */
2977724b
TH
931 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_HYBRID)) {
932 fstype = "cgroup2";
933 opts = NULL;
934 } else if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_LEGACY)) {
935 fstype = "cgroup";
936 opts = "none,name=systemd,xattr";
60e76d48
ZJS
937 } else {
938 fstype = "cgroup";
939 opts = controller;
940 }
5da38d07 941
60e76d48 942 r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
5da38d07 943 if (r < 0)
60e76d48 944 return r;
e83bebef 945
60e76d48 946 /* ... hence let's only make the bind mount read-only, not the superblock. */
e83bebef 947 if (read_only) {
60e76d48
ZJS
948 r = mount_verbose(LOG_ERR, NULL, to, NULL,
949 MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
950 if (r < 0)
951 return r;
e83bebef 952 }
60e76d48 953
e83bebef
LP
954 return 1;
955}
956
0996ef00
CB
957/* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
958static int mount_legacy_cgns_supported(
e1873695
LP
959 const char *dest,
960 CGroupUnified unified_requested,
961 bool userns,
962 uid_t uid_shift,
963 uid_t uid_range,
964 const char *selinux_apifs_context) {
965
0996ef00
CB
966 _cleanup_set_free_free_ Set *controllers = NULL;
967 const char *cgroup_root = "/sys/fs/cgroup", *c;
968 int r;
e83bebef 969
0996ef00
CB
970 (void) mkdir_p(cgroup_root, 0755);
971
972 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
e1873695 973 r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
0996ef00
CB
974 if (r < 0)
975 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
976 if (r == 0) {
977 _cleanup_free_ char *options = NULL;
978
979 /* When cgroup namespaces are enabled and user namespaces are
980 * used then the mount of the cgroupfs is done *inside* the new
981 * user namespace. We're root in the new user namespace and the
982 * kernel will happily translate our uid/gid to the correct
983 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
984 * pass uid 0 and not uid_shift to tmpfs_patch_options().
985 */
986 r = tmpfs_patch_options("mode=755", userns, 0, uid_range, true, selinux_apifs_context, &options);
987 if (r < 0)
988 return log_oom();
989
60e76d48
ZJS
990 r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
991 MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
992 if (r < 0)
993 return r;
0996ef00
CB
994 }
995
b4cccbc1
LP
996 r = cg_all_unified();
997 if (r < 0)
998 return r;
999 if (r > 0)
0996ef00
CB
1000 goto skip_controllers;
1001
1002 controllers = set_new(&string_hash_ops);
1003 if (!controllers)
1004 return log_oom();
1005
1006 r = get_controllers(controllers);
1007 if (r < 0)
1008 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1009
1010 for (;;) {
1011 _cleanup_free_ const char *controller = NULL;
1012
1013 controller = set_steal_first(controllers);
1014 if (!controller)
1015 break;
1016
2977724b 1017 r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns);
0996ef00
CB
1018 if (r < 0)
1019 return r;
1020
1021 /* When multiple hierarchies are co-mounted, make their
1022 * constituting individual hierarchies a symlink to the
1023 * co-mount.
1024 */
1025 c = controller;
1026 for (;;) {
1027 _cleanup_free_ char *target = NULL, *tok = NULL;
1028
1029 r = extract_first_word(&c, &tok, ",", 0);
1030 if (r < 0)
1031 return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m");
1032 if (r == 0)
1033 break;
1034
1035 target = prefix_root("/sys/fs/cgroup", tok);
1036 if (!target)
1037 return log_oom();
1038
1039 if (streq(controller, tok))
1040 break;
1041
1042 r = symlink_idempotent(controller, target);
1043 if (r == -EINVAL)
1044 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
1045 if (r < 0)
1046 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1047 }
1048 }
1049
1050skip_controllers:
2977724b
TH
1051 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1052 r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
1053 if (r < 0)
1054 return r;
1055 }
1056
1057 r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
0996ef00
CB
1058 if (r < 0)
1059 return r;
1060
60e76d48
ZJS
1061 if (!userns)
1062 return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
1063 MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
0996ef00
CB
1064
1065 return 0;
1066}
1067
1068/* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
1069static int mount_legacy_cgns_unsupported(
1070 const char *dest,
e1873695
LP
1071 CGroupUnified unified_requested,
1072 bool userns,
1073 uid_t uid_shift,
1074 uid_t uid_range,
0996ef00 1075 const char *selinux_apifs_context) {
e1873695 1076
e83bebef
LP
1077 _cleanup_set_free_free_ Set *controllers = NULL;
1078 const char *cgroup_root;
1079 int r;
1080
1081 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1082
d8fc6a00
LP
1083 (void) mkdir_p(cgroup_root, 0755);
1084
e83bebef 1085 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
e1873695 1086 r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
e83bebef
LP
1087 if (r < 0)
1088 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
1089 if (r == 0) {
1090 _cleanup_free_ char *options = NULL;
1091
0996ef00 1092 r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
e83bebef
LP
1093 if (r < 0)
1094 return log_oom();
1095
60e76d48
ZJS
1096 r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
1097 MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
1098 if (r < 0)
1099 return r;
e83bebef
LP
1100 }
1101
b4cccbc1
LP
1102 r = cg_all_unified();
1103 if (r < 0)
1104 return r;
1105 if (r > 0)
e83bebef
LP
1106 goto skip_controllers;
1107
1108 controllers = set_new(&string_hash_ops);
1109 if (!controllers)
1110 return log_oom();
1111
1112 r = cg_kernel_controllers(controllers);
1113 if (r < 0)
1114 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1115
1116 for (;;) {
1117 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1118
1119 controller = set_steal_first(controllers);
1120 if (!controller)
1121 break;
1122
1123 origin = prefix_root("/sys/fs/cgroup/", controller);
1124 if (!origin)
1125 return log_oom();
1126
1127 r = readlink_malloc(origin, &combined);
1128 if (r == -EINVAL) {
1129 /* Not a symbolic link, but directly a single cgroup hierarchy */
1130
2977724b 1131 r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
e83bebef
LP
1132 if (r < 0)
1133 return r;
1134
1135 } else if (r < 0)
1136 return log_error_errno(r, "Failed to read link %s: %m", origin);
1137 else {
1138 _cleanup_free_ char *target = NULL;
1139
1140 target = prefix_root(dest, origin);
1141 if (!target)
1142 return log_oom();
1143
1144 /* A symbolic link, a combination of controllers in one hierarchy */
1145
1146 if (!filename_is_valid(combined)) {
1147 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1148 continue;
1149 }
1150
2977724b 1151 r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
e83bebef
LP
1152 if (r < 0)
1153 return r;
1154
1155 r = symlink_idempotent(combined, target);
0996ef00
CB
1156 if (r == -EINVAL)
1157 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
e83bebef
LP
1158 if (r < 0)
1159 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1160 }
1161 }
1162
1163skip_controllers:
2977724b
TH
1164 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1165 r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
1166 if (r < 0)
1167 return r;
1168 }
1169
1170 r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
e83bebef
LP
1171 if (r < 0)
1172 return r;
1173
60e76d48
ZJS
1174 return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
1175 MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
e83bebef
LP
1176}
1177
1178static int mount_unified_cgroups(const char *dest) {
1179 const char *p;
1180 int r;
1181
1182 assert(dest);
1183
88e10572
MT
1184 p = prefix_roota(dest, "/sys/fs/cgroup");
1185
1186 (void) mkdir_p(p, 0755);
e83bebef 1187
e1873695 1188 r = path_is_mount_point(p, dest, AT_SYMLINK_FOLLOW);
e83bebef
LP
1189 if (r < 0)
1190 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
1191 if (r > 0) {
88e10572 1192 p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs");
e83bebef
LP
1193 if (access(p, F_OK) >= 0)
1194 return 0;
1195 if (errno != ENOENT)
1196 return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
1197
1198 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
1199 return -EINVAL;
1200 }
1201
60e76d48 1202 return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
e83bebef
LP
1203}
1204
1205int mount_cgroups(
1206 const char *dest,
5da38d07 1207 CGroupUnified unified_requested,
e1873695
LP
1208 bool userns,
1209 uid_t uid_shift,
1210 uid_t uid_range,
5a8ff0e6
CB
1211 const char *selinux_apifs_context,
1212 bool use_cgns) {
e83bebef 1213
5da38d07 1214 if (unified_requested >= CGROUP_UNIFIED_ALL)
e83bebef 1215 return mount_unified_cgroups(dest);
ada54120 1216 else if (use_cgns)
e1873695 1217 return mount_legacy_cgns_supported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
0996ef00 1218
5da38d07 1219 return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
e83bebef
LP
1220}
1221
2977724b
TH
1222static int mount_systemd_cgroup_writable_one(const char *systemd_own, const char *systemd_root)
1223{
1224 int r;
1225
1226 /* Make our own cgroup a (writable) bind mount */
1227 r = mount_verbose(LOG_ERR, systemd_own, systemd_own, NULL, MS_BIND, NULL);
1228 if (r < 0)
1229 return r;
1230
1231 /* And then remount the systemd cgroup root read-only */
1232 return mount_verbose(LOG_ERR, NULL, systemd_root, NULL,
1233 MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
1234}
1235
e83bebef
LP
1236int mount_systemd_cgroup_writable(
1237 const char *dest,
5da38d07 1238 CGroupUnified unified_requested) {
e83bebef
LP
1239
1240 _cleanup_free_ char *own_cgroup_path = NULL;
e83bebef
LP
1241 int r;
1242
1243 assert(dest);
1244
1245 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1246 if (r < 0)
1247 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1248
1249 /* If we are living in the top-level, then there's nothing to do... */
1250 if (path_equal(own_cgroup_path, "/"))
1251 return 0;
1252
2977724b
TH
1253 if (unified_requested >= CGROUP_UNIFIED_ALL)
1254 return mount_systemd_cgroup_writable_one(strjoina(dest, "/sys/fs/cgroup", own_cgroup_path),
1255 prefix_roota(dest, "/sys/fs/cgroup"));
e83bebef 1256
2977724b
TH
1257 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1258 r = mount_systemd_cgroup_writable_one(strjoina(dest, "/sys/fs/cgroup/unified", own_cgroup_path),
1259 prefix_roota(dest, "/sys/fs/cgroup/unified"));
1260 if (r < 0)
1261 return r;
1262 }
e83bebef 1263
2977724b
TH
1264 return mount_systemd_cgroup_writable_one(strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path),
1265 prefix_roota(dest, "/sys/fs/cgroup/systemd"));
e83bebef
LP
1266}
1267
1268int setup_volatile_state(
1269 const char *directory,
1270 VolatileMode mode,
1271 bool userns, uid_t uid_shift, uid_t uid_range,
1272 const char *selinux_apifs_context) {
1273
1274 _cleanup_free_ char *buf = NULL;
1275 const char *p, *options;
1276 int r;
1277
1278 assert(directory);
1279
1280 if (mode != VOLATILE_STATE)
1281 return 0;
1282
1283 /* --volatile=state means we simply overmount /var
1284 with a tmpfs, and the rest read-only. */
1285
6b7c9f8b 1286 r = bind_remount_recursive(directory, true, NULL);
e83bebef
LP
1287 if (r < 0)
1288 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1289
1290 p = prefix_roota(directory, "/var");
1291 r = mkdir(p, 0755);
1292 if (r < 0 && errno != EEXIST)
1293 return log_error_errno(errno, "Failed to create %s: %m", directory);
1294
1295 options = "mode=755";
0996ef00 1296 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
e83bebef
LP
1297 if (r < 0)
1298 return log_oom();
1299 if (r > 0)
1300 options = buf;
1301
60e76d48 1302 return mount_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options);
e83bebef
LP
1303}
1304
1305int setup_volatile(
1306 const char *directory,
1307 VolatileMode mode,
1308 bool userns, uid_t uid_shift, uid_t uid_range,
1309 const char *selinux_apifs_context) {
1310
1311 bool tmpfs_mounted = false, bind_mounted = false;
1312 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1313 _cleanup_free_ char *buf = NULL;
1314 const char *f, *t, *options;
1315 int r;
1316
1317 assert(directory);
1318
1319 if (mode != VOLATILE_YES)
1320 return 0;
1321
1322 /* --volatile=yes means we mount a tmpfs to the root dir, and
1323 the original /usr to use inside it, and that read-only. */
1324
1325 if (!mkdtemp(template))
1326 return log_error_errno(errno, "Failed to create temporary directory: %m");
1327
1328 options = "mode=755";
0996ef00 1329 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
e83bebef
LP
1330 if (r < 0)
1331 return log_oom();
1332 if (r > 0)
1333 options = buf;
1334
60e76d48
ZJS
1335 r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
1336 if (r < 0)
e83bebef 1337 goto fail;
e83bebef
LP
1338
1339 tmpfs_mounted = true;
1340
1341 f = prefix_roota(directory, "/usr");
1342 t = prefix_roota(template, "/usr");
1343
1344 r = mkdir(t, 0755);
1345 if (r < 0 && errno != EEXIST) {
1346 r = log_error_errno(errno, "Failed to create %s: %m", t);
1347 goto fail;
1348 }
1349
60e76d48
ZJS
1350 r = mount_verbose(LOG_ERR, f, t, NULL, MS_BIND|MS_REC, NULL);
1351 if (r < 0)
e83bebef 1352 goto fail;
e83bebef
LP
1353
1354 bind_mounted = true;
1355
6b7c9f8b 1356 r = bind_remount_recursive(t, true, NULL);
e83bebef
LP
1357 if (r < 0) {
1358 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1359 goto fail;
1360 }
1361
60e76d48
ZJS
1362 r = mount_verbose(LOG_ERR, template, directory, NULL, MS_MOVE, NULL);
1363 if (r < 0)
e83bebef 1364 goto fail;
e83bebef
LP
1365
1366 (void) rmdir(template);
1367
1368 return 0;
1369
1370fail:
1371 if (bind_mounted)
60e76d48 1372 (void) umount_verbose(t);
e83bebef
LP
1373
1374 if (tmpfs_mounted)
60e76d48 1375 (void) umount_verbose(template);
e83bebef
LP
1376 (void) rmdir(template);
1377 return r;
1378}
b53ede69
PW
1379
1380/* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
1381int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) {
1382 _cleanup_free_ char *root_new = NULL, *root_old = NULL;
1383 const char *p = s;
1384 int r;
1385
1386 assert(pivot_root_new);
1387 assert(pivot_root_old);
1388
1389 r = extract_first_word(&p, &root_new, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1390 if (r < 0)
1391 return r;
1392 if (r == 0)
1393 return -EINVAL;
1394
1395 if (isempty(p))
1396 root_old = NULL;
1397 else {
1398 root_old = strdup(p);
1399 if (!root_old)
1400 return -ENOMEM;
1401 }
1402
1403 if (!path_is_absolute(root_new))
1404 return -EINVAL;
1405 if (root_old && !path_is_absolute(root_old))
1406 return -EINVAL;
1407
1408 free_and_replace(*pivot_root_new, root_new);
1409 free_and_replace(*pivot_root_old, root_old);
1410
1411 return 0;
1412}
1413
1414int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old) {
1415 _cleanup_free_ char *directory_pivot_root_new = NULL;
1416 _cleanup_free_ char *pivot_tmp_pivot_root_old = NULL;
1417 char pivot_tmp[] = "/tmp/nspawn-pivot-XXXXXX";
1418 bool remove_pivot_tmp = false;
1419 int r;
1420
1421 assert(directory);
1422
1423 if (!pivot_root_new)
1424 return 0;
1425
1426 /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1427 * If pivot_root_old is NULL, the existing / disappears.
1428 * This requires a temporary directory, pivot_tmp, which is
1429 * not a child of either.
1430 *
1431 * This is typically used for OSTree-style containers, where
1432 * the root partition contains several sysroots which could be
1433 * run. Normally, one would be chosen by the bootloader and
1434 * pivoted to / by initramfs.
1435 *
1436 * For example, for an OSTree deployment, pivot_root_new
1437 * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1438 * code doesn’t do the /var mount which OSTree expects: use
1439 * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1440 *
1441 * So in the OSTree case, we’ll end up with something like:
1442 * - directory = /tmp/nspawn-root-123456
1443 * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1444 * - pivot_root_old = /sysroot
1445 * - directory_pivot_root_new =
1446 * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1447 * - pivot_tmp = /tmp/nspawn-pivot-123456
1448 * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1449 *
1450 * Requires all file systems at directory and below to be mounted
1451 * MS_PRIVATE or MS_SLAVE so they can be moved.
1452 */
1453 directory_pivot_root_new = prefix_root(directory, pivot_root_new);
1454
1455 /* Remount directory_pivot_root_new to make it movable. */
1456 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory_pivot_root_new, NULL, MS_BIND, NULL);
1457 if (r < 0)
1458 goto done;
1459
1460 if (pivot_root_old) {
1461 if (!mkdtemp(pivot_tmp)) {
1462 r = log_error_errno(errno, "Failed to create temporary directory: %m");
1463 goto done;
1464 }
1465
1466 remove_pivot_tmp = true;
1467 pivot_tmp_pivot_root_old = prefix_root(pivot_tmp, pivot_root_old);
1468
1469 r = mount_verbose(LOG_ERR, directory_pivot_root_new, pivot_tmp, NULL, MS_MOVE, NULL);
1470 if (r < 0)
1471 goto done;
1472
1473 r = mount_verbose(LOG_ERR, directory, pivot_tmp_pivot_root_old, NULL, MS_MOVE, NULL);
1474 if (r < 0)
1475 goto done;
1476
1477 r = mount_verbose(LOG_ERR, pivot_tmp, directory, NULL, MS_MOVE, NULL);
1478 if (r < 0)
1479 goto done;
1480 } else {
1481 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory, NULL, MS_MOVE, NULL);
1482 if (r < 0)
1483 goto done;
1484 }
1485
1486done:
1487 if (remove_pivot_tmp)
1488 (void) rmdir(pivot_tmp);
1489
1490 return r;
1491}