]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn-mount.c
util-lib: rename fd_check_fstype to fd_is_fs_type
[thirdparty/systemd.git] / src / nspawn / nspawn-mount.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
e83bebef
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2015 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
4f5dd394 21#include <sys/mount.h>
07630cea 22#include <linux/magic.h>
e83bebef 23
b5efdb8a 24#include "alloc-util.h"
4f5dd394 25#include "escape.h"
0996ef00
CB
26#include "fd-util.h"
27#include "fileio.h"
f4f15635 28#include "fs-util.h"
e83bebef 29#include "label.h"
4f5dd394 30#include "mkdir.h"
4349cd7c 31#include "mount-util.h"
6bedfcbb
LP
32#include "nspawn-mount.h"
33#include "parse-util.h"
4f5dd394
LP
34#include "path-util.h"
35#include "rm-rf.h"
e83bebef 36#include "set.h"
8fcde012 37#include "stat-util.h"
07630cea 38#include "string-util.h"
4f5dd394 39#include "strv.h"
ee104e11 40#include "user-util.h"
4f5dd394 41#include "util.h"
e83bebef
LP
42
43CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
44 CustomMount *c, *ret;
45
46 assert(l);
47 assert(n);
48 assert(t >= 0);
49 assert(t < _CUSTOM_MOUNT_TYPE_MAX);
50
56391931 51 c = realloc_multiply(*l, (*n + 1), sizeof(CustomMount));
e83bebef
LP
52 if (!c)
53 return NULL;
54
55 *l = c;
56 ret = *l + *n;
57 (*n)++;
58
59 *ret = (CustomMount) { .type = t };
60
61 return ret;
62}
63
64void custom_mount_free_all(CustomMount *l, unsigned n) {
65 unsigned i;
66
67 for (i = 0; i < n; i++) {
68 CustomMount *m = l + i;
69
70 free(m->source);
71 free(m->destination);
72 free(m->options);
73
74 if (m->work_dir) {
75 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
76 free(m->work_dir);
77 }
78
c7a4890c
LP
79 if (m->rm_rf_tmpdir) {
80 (void) rm_rf(m->rm_rf_tmpdir, REMOVE_ROOT|REMOVE_PHYSICAL);
81 free(m->rm_rf_tmpdir);
82 }
83
e83bebef
LP
84 strv_free(m->lower);
85 }
86
87 free(l);
88}
89
86c0dd4a 90static int custom_mount_compare(const void *a, const void *b) {
e83bebef
LP
91 const CustomMount *x = a, *y = b;
92 int r;
93
94 r = path_compare(x->destination, y->destination);
95 if (r != 0)
96 return r;
97
98 if (x->type < y->type)
99 return -1;
100 if (x->type > y->type)
101 return 1;
102
103 return 0;
104}
105
86c0dd4a
LP
/* Checks whether p is acceptable as a mount source: an absolute path, optionally preceded by a single "+"
 * marker (which resolve_source_path() interprets as "relative to the container root"). */
static bool source_path_is_valid(const char *p) {
        assert(p);

        return path_is_absolute(*p == '+' ? p + 1 : p);
}
114
/* Turns a user-specified source path into a newly allocated string. A leading "+" is stripped and the
 * remainder is prefixed with dest; anything else is duplicated verbatim. Returns NULL if source is NULL or
 * if the allocation fails. */
static char *resolve_source_path(const char *dest, const char *source) {
        if (!source)
                return NULL;

        return source[0] == '+' ? prefix_root(dest, source + 1) : strdup(source);
}
125
126int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
127 unsigned i;
128 int r;
129
130 /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
131 * parent process, so that we know the temporary directories to remove on exit before we fork off the
132 * children. */
133
134 assert(l || n == 0);
135
136 /* Order the custom mounts, and make sure we have a working directory */
137 qsort_safe(l, n, sizeof(CustomMount), custom_mount_compare);
138
139 for (i = 0; i < n; i++) {
140 CustomMount *m = l + i;
141
142 if (m->source) {
143 char *s;
144
145 s = resolve_source_path(dest, m->source);
146 if (!s)
147 return log_oom();
148
149 free(m->source);
150 m->source = s;
c7a4890c
LP
151 } else {
152 /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
153
154 m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX");
155 if (!m->rm_rf_tmpdir)
156 return log_oom();
157
158 if (!mkdtemp(m->rm_rf_tmpdir)) {
159 m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir);
160 return log_error_errno(errno, "Failed to acquire temporary directory: %m");
161 }
162
163 m->source = strjoin(m->rm_rf_tmpdir, "/src");
164 if (!m->source)
165 return log_oom();
166
167 if (mkdir(m->source, 0755) < 0)
168 return log_error_errno(errno, "Failed to create %s: %m", m->source);
86c0dd4a
LP
169 }
170
171 if (m->type == CUSTOM_MOUNT_OVERLAY) {
172 char **j;
173
174 STRV_FOREACH(j, m->lower) {
175 char *s;
176
177 s = resolve_source_path(dest, *j);
178 if (!s)
179 return log_oom();
180
181 free(*j);
182 *j = s;
183 }
184
185 if (m->work_dir) {
186 char *s;
187
188 s = resolve_source_path(dest, m->work_dir);
189 if (!s)
190 return log_oom();
191
192 free(m->work_dir);
193 m->work_dir = s;
194 } else {
195 assert(m->source);
196
197 r = tempfn_random(m->source, NULL, &m->work_dir);
198 if (r < 0)
199 return log_error_errno(r, "Failed to acquire working directory: %m");
200 }
201
202 (void) mkdir_label(m->work_dir, 0700);
203 }
204 }
205
206 return 0;
207}
208
e83bebef
LP
209int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
210 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
211 const char *p = s;
212 CustomMount *m;
213 int r;
214
215 assert(l);
216 assert(n);
217
218 r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
219 if (r < 0)
220 return r;
221 if (r == 0)
222 return -EINVAL;
e83bebef 223 if (r == 1) {
86c0dd4a 224 destination = strdup(source[0] == '+' ? source+1 : source);
e83bebef
LP
225 if (!destination)
226 return -ENOMEM;
227 }
e83bebef
LP
228 if (r == 2 && !isempty(p)) {
229 opts = strdup(p);
230 if (!opts)
231 return -ENOMEM;
232 }
233
c7a4890c
LP
234 if (isempty(source))
235 source = NULL;
236 else if (!source_path_is_valid(source))
e83bebef 237 return -EINVAL;
c7a4890c 238
e83bebef
LP
239 if (!path_is_absolute(destination))
240 return -EINVAL;
241
242 m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
243 if (!m)
48cbe5f8 244 return -ENOMEM;
e83bebef
LP
245
246 m->source = source;
247 m->destination = destination;
248 m->read_only = read_only;
249 m->options = opts;
250
251 source = destination = opts = NULL;
252 return 0;
253}
254
255int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
256 _cleanup_free_ char *path = NULL, *opts = NULL;
257 const char *p = s;
258 CustomMount *m;
259 int r;
260
261 assert(l);
262 assert(n);
263 assert(s);
264
265 r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
266 if (r < 0)
267 return r;
268 if (r == 0)
269 return -EINVAL;
270
271 if (isempty(p))
272 opts = strdup("mode=0755");
273 else
274 opts = strdup(p);
275 if (!opts)
276 return -ENOMEM;
277
278 if (!path_is_absolute(path))
279 return -EINVAL;
280
281 m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
282 if (!m)
283 return -ENOMEM;
284
285 m->destination = path;
286 m->options = opts;
287
288 path = opts = NULL;
289 return 0;
290}
291
ad85779a
LP
292int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
293 _cleanup_free_ char *upper = NULL, *destination = NULL;
294 _cleanup_strv_free_ char **lower = NULL;
295 CustomMount *m;
86c0dd4a 296 int k;
ad85779a 297
86c0dd4a
LP
298 k = strv_split_extract(&lower, s, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
299 if (k < 0)
300 return k;
ad85779a
LP
301 if (k < 2)
302 return -EADDRNOTAVAIL;
303 if (k == 2) {
86c0dd4a
LP
304 /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
305 * we'll also define the destination mount point the same as the upper. */
306
307 if (!source_path_is_valid(lower[0]) ||
308 !source_path_is_valid(lower[1]))
309 return -EINVAL;
310
ad85779a
LP
311 upper = lower[1];
312 lower[1] = NULL;
313
86c0dd4a 314 destination = strdup(upper[0] == '+' ? upper+1 : upper); /* take the destination without "+" prefix */
ad85779a
LP
315 if (!destination)
316 return -ENOMEM;
ad85779a 317 } else {
c7a4890c 318 char **i;
86c0dd4a
LP
319
320 /* If more than two parameters are specified, the last one is the destination, the second to last one
321 * the "upper", and all before that the "lower" directories. */
322
ad85779a 323 destination = lower[k - 1];
86c0dd4a 324 upper = lower[k - 2];
ad85779a 325 lower[k - 2] = NULL;
86c0dd4a 326
c7a4890c
LP
327 STRV_FOREACH(i, lower)
328 if (!source_path_is_valid(*i))
329 return -EINVAL;
330
331 /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
332 * in /var/tmp */
333 if (isempty(upper))
334 upper = NULL;
335 else if (!source_path_is_valid(upper))
336 return -EINVAL;
337
86c0dd4a
LP
338 if (!path_is_absolute(destination))
339 return -EINVAL;
ad85779a
LP
340 }
341
342 m = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY);
343 if (!m)
344 return -ENOMEM;
345
346 m->destination = destination;
347 m->source = upper;
348 m->lower = lower;
349 m->read_only = read_only;
350
351 upper = destination = NULL;
352 lower = NULL;
353
354 return 0;
355}
356
e83bebef
LP
357static int tmpfs_patch_options(
358 const char *options,
0996ef00
CB
359 bool userns,
360 uid_t uid_shift, uid_t uid_range,
361 bool patch_ids,
e83bebef
LP
362 const char *selinux_apifs_context,
363 char **ret) {
364
365 char *buf = NULL;
366
0996ef00 367 if ((userns && uid_shift != 0) || patch_ids) {
e83bebef
LP
368 assert(uid_shift != UID_INVALID);
369
9aa2169e 370 if (asprintf(&buf, "%s%suid=" UID_FMT ",gid=" UID_FMT,
87e4e28d 371 strempty(options), options ? "," : "",
9aa2169e 372 uid_shift, uid_shift) < 0)
e83bebef
LP
373 return -ENOMEM;
374
375 options = buf;
376 }
377
349cc4a5 378#if HAVE_SELINUX
e83bebef
LP
379 if (selinux_apifs_context) {
380 char *t;
381
87e4e28d 382 t = strjoin(strempty(options), options ? "," : "",
9aa2169e
ZJS
383 "context=\"", selinux_apifs_context, "\"");
384 free(buf);
385 if (!t)
e83bebef 386 return -ENOMEM;
e83bebef 387
e83bebef
LP
388 buf = t;
389 }
390#endif
391
0996ef00
CB
392 if (!buf && options) {
393 buf = strdup(options);
394 if (!buf)
395 return -ENOMEM;
396 }
e83bebef 397 *ret = buf;
0996ef00 398
e83bebef
LP
399 return !!buf;
400}
401
4f086aab 402int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
d8fc6a00 403 const char *full, *top, *x;
d1678248 404 int r;
4f086aab 405 unsigned long extra_flags = 0;
d8fc6a00
LP
406
407 top = prefix_roota(dest, "/sys");
d1678248
ILG
408 r = path_check_fstype(top, SYSFS_MAGIC);
409 if (r < 0)
410 return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
411 /* /sys might already be mounted as sysfs by the outer child in the
412 * !netns case. In this case, it's all good. Don't touch it because we
413 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
414 */
415 if (r > 0)
416 return 0;
417
d8fc6a00
LP
418 full = prefix_roota(top, "/full");
419
420 (void) mkdir(full, 0755);
421
4f086aab
SU
422 if (mount_settings & MOUNT_APPLY_APIVFS_RO)
423 extra_flags |= MS_RDONLY;
424
60e76d48 425 r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
4f086aab 426 MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL);
60e76d48
ZJS
427 if (r < 0)
428 return r;
d8fc6a00
LP
429
430 FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
431 _cleanup_free_ char *from = NULL, *to = NULL;
432
433 from = prefix_root(full, x);
434 if (!from)
435 return log_oom();
436
437 to = prefix_root(top, x);
438 if (!to)
439 return log_oom();
440
441 (void) mkdir(to, 0755);
442
60e76d48
ZJS
443 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
444 if (r < 0)
445 return r;
d8fc6a00 446
60e76d48 447 r = mount_verbose(LOG_ERR, NULL, to, NULL,
4f086aab 448 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
60e76d48
ZJS
449 if (r < 0)
450 return r;
d8fc6a00
LP
451 }
452
60e76d48
ZJS
453 r = umount_verbose(full);
454 if (r < 0)
455 return r;
d8fc6a00
LP
456
457 if (rmdir(full) < 0)
458 return log_error_errno(errno, "Failed to remove %s: %m", full);
459
0996ef00
CB
460 /* Create mountpoint for cgroups. Otherwise we are not allowed since we
461 * remount /sys read-only.
462 */
463 if (cg_ns_supported()) {
464 x = prefix_roota(top, "/fs/cgroup");
465 (void) mkdir_p(x, 0755);
466 }
d8fc6a00 467
60e76d48 468 return mount_verbose(LOG_ERR, NULL, top, NULL,
4f086aab 469 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
d8fc6a00
LP
470}
471
acbbf69b 472static int mkdir_userns(const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) {
63eae723
EV
473 int r;
474
475 assert(path);
476
477 r = mkdir(path, mode);
478 if (r < 0 && errno != EEXIST)
479 return -errno;
480
acbbf69b
LP
481 if ((mask & MOUNT_USE_USERNS) == 0)
482 return 0;
483
484 if (mask & MOUNT_IN_USERNS)
485 return 0;
486
487 r = lchown(path, uid_shift, uid_shift);
488 if (r < 0)
489 return -errno;
63eae723
EV
490
491 return 0;
492}
493
acbbf69b 494static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) {
63eae723
EV
495 const char *p, *e;
496 int r;
497
498 assert(path);
499
500 if (prefix && !path_startswith(path, prefix))
501 return -ENOTDIR;
502
503 /* create every parent directory in the path, except the last component */
504 p = path + strspn(path, "/");
505 for (;;) {
506 char t[strlen(path) + 1];
507
508 e = p + strcspn(p, "/");
509 p = e + strspn(e, "/");
510
511 /* Is this the last component? If so, then we're done */
512 if (*p == 0)
513 break;
514
515 memcpy(t, path, e - path);
516 t[e-path] = 0;
517
518 if (prefix && path_startswith(prefix, t))
519 continue;
520
acbbf69b 521 r = mkdir_userns(t, mode, mask, uid_shift);
63eae723
EV
522 if (r < 0)
523 return r;
524 }
525
acbbf69b 526 return mkdir_userns(path, mode, mask, uid_shift);
63eae723
EV
527}
528
e83bebef 529int mount_all(const char *dest,
4f086aab 530 MountSettingsMask mount_settings,
403af78c 531 uid_t uid_shift, uid_t uid_range,
e83bebef
LP
532 const char *selinux_apifs_context) {
533
534 typedef struct MountPoint {
535 const char *what;
536 const char *where;
537 const char *type;
538 const char *options;
539 unsigned long flags;
4f086aab 540 MountSettingsMask mount_settings;
e83bebef
LP
541 } MountPoint;
542
543 static const MountPoint mount_table[] = {
4f086aab
SU
544 /* inner child mounts */
545 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_IN_USERNS },
13e785f7 546 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
4f086aab
SU
547 { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
548 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
13e785f7 549 { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
4f086aab 550 { NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
4f086aab
SU
551
552 /* outer child mounts */
e8a94ce8 553 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
4f086aab
SU
554 { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
555 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
556 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL }, /* skipped if above was mounted */
557
558 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL },
559 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
560 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
349cc4a5 561#if HAVE_SELINUX
4f086aab
SU
562 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, 0 }, /* Bind mount first */
563 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 }, /* Then, make it r/o */
e83bebef
LP
564#endif
565 };
566
567 unsigned k;
568 int r;
4f086aab
SU
569 bool use_userns = (mount_settings & MOUNT_USE_USERNS);
570 bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
571 bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
572 bool in_userns = (mount_settings & MOUNT_IN_USERNS);
e83bebef
LP
573
574 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
575 _cleanup_free_ char *where = NULL, *options = NULL;
576 const char *o;
4f086aab
SU
577 bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
578
579 if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
580 continue;
e83bebef 581
4f086aab 582 if (!netns && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_NETNS))
d1678248
ILG
583 continue;
584
4f086aab 585 if (!ro && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_RO))
e83bebef
LP
586 continue;
587
cb638b5e 588 r = chase_symlinks(mount_table[k].where, dest, CHASE_NONEXISTENT|CHASE_PREFIX_ROOT, &where);
8ce48cf0 589 if (r < 0)
ec57bd42 590 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
e83bebef 591
8ce48cf0 592 r = path_is_mount_point(where, NULL, 0);
e83bebef
LP
593 if (r < 0 && r != -ENOENT)
594 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
595
596 /* Skip this entry if it is not a remount. */
597 if (mount_table[k].what && r > 0)
598 continue;
599
acbbf69b 600 r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift);
920a7899 601 if (r < 0 && r != -EEXIST) {
4f13e534 602 if (fatal && r != -EROFS)
e83bebef
LP
603 return log_error_errno(r, "Failed to create directory %s: %m", where);
604
201b13c8 605 log_debug_errno(r, "Failed to create directory %s: %m", where);
4f13e534
LT
606 /* If we failed mkdir() or chown() due to the root
607 * directory being read only, attempt to mount this fs
608 * anyway and let mount_verbose log any errors */
609 if (r != -EROFS)
610 continue;
e83bebef
LP
611 }
612
613 o = mount_table[k].options;
614 if (streq_ptr(mount_table[k].type, "tmpfs")) {
8492849e
EV
615 if (in_userns)
616 r = tmpfs_patch_options(o, use_userns, 0, uid_range, true, selinux_apifs_context, &options);
617 else
618 r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
e83bebef
LP
619 if (r < 0)
620 return log_oom();
621 if (r > 0)
622 o = options;
623 }
624
4f086aab 625 r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
60e76d48
ZJS
626 mount_table[k].what,
627 where,
628 mount_table[k].type,
629 mount_table[k].flags,
630 o);
4f086aab 631 if (r < 0 && fatal)
60e76d48 632 return r;
e83bebef
LP
633 }
634
635 return 0;
636}
637
638static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
639 const char *p = options;
640 unsigned long flags = *mount_flags;
641 char *opts = NULL;
4da92e58 642 int r;
e83bebef
LP
643
644 assert(options);
645
646 for (;;) {
647 _cleanup_free_ char *word = NULL;
4da92e58
LP
648
649 r = extract_first_word(&p, &word, ",", 0);
e83bebef
LP
650 if (r < 0)
651 return log_error_errno(r, "Failed to extract mount option: %m");
652 if (r == 0)
653 break;
654
655 if (streq(word, "rbind"))
656 flags |= MS_REC;
657 else if (streq(word, "norbind"))
658 flags &= ~MS_REC;
659 else {
660 log_error("Invalid bind mount option: %s", word);
661 return -EINVAL;
662 }
663 }
664
665 *mount_flags = flags;
666 /* in the future mount_opts will hold string options for mount(2) */
667 *mount_opts = opts;
668
669 return 0;
670}
671
672static int mount_bind(const char *dest, CustomMount *m) {
68cf43c3
LP
673
674 _cleanup_free_ char *mount_opts = NULL, *where = NULL;
e83bebef 675 unsigned long mount_flags = MS_BIND | MS_REC;
68cf43c3 676 struct stat source_st, dest_st;
e83bebef
LP
677 int r;
678
86c0dd4a 679 assert(dest);
e83bebef
LP
680 assert(m);
681
682 if (m->options) {
683 r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
684 if (r < 0)
685 return r;
686 }
687
688 if (stat(m->source, &source_st) < 0)
689 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
690
cb638b5e 691 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
68cf43c3 692 if (r < 0)
ec57bd42 693 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
8ce48cf0
LP
694 if (r > 0) { /* Path exists already? */
695
696 if (stat(where, &dest_st) < 0)
697 return log_error_errno(errno, "Failed to stat %s: %m", where);
e83bebef 698
e83bebef
LP
699 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
700 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
701 return -EINVAL;
702 }
703
704 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
705 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
706 return -EINVAL;
707 }
708
8ce48cf0 709 } else { /* Path doesn't exist yet? */
e83bebef
LP
710 r = mkdir_parents_label(where, 0755);
711 if (r < 0)
712 return log_error_errno(r, "Failed to make parents of %s: %m", where);
b97e83cb
BN
713
714 /* Create the mount point. Any non-directory file can be
715 * mounted on any non-directory file (regular, fifo, socket,
716 * char, block).
717 */
718 if (S_ISDIR(source_st.st_mode))
719 r = mkdir_label(where, 0755);
720 else
721 r = touch(where);
722 if (r < 0)
723 return log_error_errno(r, "Failed to create mount point %s: %m", where);
724
8ce48cf0 725 }
e83bebef 726
60e76d48
ZJS
727 r = mount_verbose(LOG_ERR, m->source, where, NULL, mount_flags, mount_opts);
728 if (r < 0)
729 return r;
e83bebef
LP
730
731 if (m->read_only) {
6b7c9f8b 732 r = bind_remount_recursive(where, true, NULL);
e83bebef
LP
733 if (r < 0)
734 return log_error_errno(r, "Read-only bind mount failed: %m");
735 }
736
737 return 0;
738}
739
740static int mount_tmpfs(
741 const char *dest,
742 CustomMount *m,
743 bool userns, uid_t uid_shift, uid_t uid_range,
744 const char *selinux_apifs_context) {
745
68cf43c3
LP
746 const char *options;
747 _cleanup_free_ char *buf = NULL, *where = NULL;
e83bebef
LP
748 int r;
749
750 assert(dest);
751 assert(m);
752
cb638b5e 753 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
68cf43c3 754 if (r < 0)
ec57bd42 755 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
8ce48cf0
LP
756 if (r == 0) { /* Doesn't exist yet? */
757 r = mkdir_p_label(where, 0755);
758 if (r < 0)
759 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
760 }
e83bebef 761
0996ef00 762 r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
e83bebef
LP
763 if (r < 0)
764 return log_oom();
765 options = r > 0 ? buf : m->options;
766
60e76d48 767 return mount_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
e83bebef
LP
768}
769
86c0dd4a 770static char *joined_and_escaped_lower_dirs(char **lower) {
e83bebef
LP
771 _cleanup_strv_free_ char **sv = NULL;
772
773 sv = strv_copy(lower);
774 if (!sv)
775 return NULL;
776
777 strv_reverse(sv);
778
779 if (!strv_shell_escape(sv, ",:"))
780 return NULL;
781
782 return strv_join(sv, ":");
783}
784
785static int mount_overlay(const char *dest, CustomMount *m) {
68cf43c3 786
86c0dd4a 787 _cleanup_free_ char *lower = NULL, *where = NULL, *escaped_source = NULL;
68cf43c3 788 const char *options;
e83bebef
LP
789 int r;
790
791 assert(dest);
792 assert(m);
793
cb638b5e 794 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
68cf43c3 795 if (r < 0)
ec57bd42 796 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
8ce48cf0
LP
797 if (r == 0) { /* Doesn't exist yet? */
798 r = mkdir_label(where, 0755);
799 if (r < 0)
800 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
801 }
e83bebef
LP
802
803 (void) mkdir_p_label(m->source, 0755);
804
805 lower = joined_and_escaped_lower_dirs(m->lower);
806 if (!lower)
807 return log_oom();
808
86c0dd4a
LP
809 escaped_source = shell_escape(m->source, ",:");
810 if (!escaped_source)
811 return log_oom();
e83bebef 812
86c0dd4a 813 if (m->read_only)
e83bebef 814 options = strjoina("lowerdir=", escaped_source, ":", lower);
86c0dd4a
LP
815 else {
816 _cleanup_free_ char *escaped_work_dir = NULL;
e83bebef 817
e83bebef
LP
818 escaped_work_dir = shell_escape(m->work_dir, ",:");
819 if (!escaped_work_dir)
820 return log_oom();
821
822 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
823 }
824
60e76d48 825 return mount_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options);
e83bebef
LP
826}
827
828int mount_custom(
829 const char *dest,
830 CustomMount *mounts, unsigned n,
831 bool userns, uid_t uid_shift, uid_t uid_range,
832 const char *selinux_apifs_context) {
833
834 unsigned i;
835 int r;
836
837 assert(dest);
838
839 for (i = 0; i < n; i++) {
840 CustomMount *m = mounts + i;
841
842 switch (m->type) {
843
844 case CUSTOM_MOUNT_BIND:
845 r = mount_bind(dest, m);
846 break;
847
848 case CUSTOM_MOUNT_TMPFS:
849 r = mount_tmpfs(dest, m, userns, uid_shift, uid_range, selinux_apifs_context);
850 break;
851
852 case CUSTOM_MOUNT_OVERLAY:
853 r = mount_overlay(dest, m);
854 break;
855
856 default:
857 assert_not_reached("Unknown custom mount type");
858 }
859
860 if (r < 0)
861 return r;
862 }
863
864 return 0;
865}
866
0996ef00
CB
867/* Retrieve existing subsystems. This function is called in a new cgroup
868 * namespace.
869 */
d7c9693a
LP
870static int get_process_controllers(Set **ret) {
871 _cleanup_set_free_free_ Set *controllers = NULL;
0996ef00 872 _cleanup_fclose_ FILE *f = NULL;
d7c9693a
LP
873 int r;
874
875 assert(ret);
0996ef00 876
d7c9693a
LP
877 controllers = set_new(&string_hash_ops);
878 if (!controllers)
879 return -ENOMEM;
0996ef00
CB
880
881 f = fopen("/proc/self/cgroup", "re");
882 if (!f)
883 return errno == ENOENT ? -ESRCH : -errno;
884
d7c9693a
LP
885 for (;;) {
886 _cleanup_free_ char *line = NULL;
887 char *e, *l;
888
889 r = read_line(f, LONG_LINE_MAX, &line);
890 if (r < 0)
891 return r;
892 if (r == 0)
893 break;
0996ef00 894
0996ef00
CB
895 l = strchr(line, ':');
896 if (!l)
897 continue;
898
899 l++;
900 e = strchr(l, ':');
901 if (!e)
902 continue;
903
904 *e = 0;
905
2977724b 906 if (STR_IN_SET(l, "", "name=systemd", "name=unified"))
0996ef00
CB
907 continue;
908
d7c9693a 909 r = set_put_strdup(controllers, l);
0996ef00
CB
910 if (r < 0)
911 return r;
912 }
913
d7c9693a
LP
914 *ret = controllers;
915 controllers = NULL;
916
0996ef00
CB
917 return 0;
918}
919
e1873695
LP
920static int mount_legacy_cgroup_hierarchy(
921 const char *dest,
922 const char *controller,
923 const char *hierarchy,
e1873695
LP
924 bool read_only) {
925
60e76d48 926 const char *to, *fstype, *opts;
e83bebef
LP
927 int r;
928
ee30f6ac 929 to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
e83bebef 930
e1873695 931 r = path_is_mount_point(to, dest, 0);
e83bebef
LP
932 if (r < 0 && r != -ENOENT)
933 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
934 if (r > 0)
935 return 0;
936
937 mkdir_p(to, 0755);
938
939 /* The superblock mount options of the mount point need to be
940 * identical to the hosts', and hence writable... */
2977724b
TH
941 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_HYBRID)) {
942 fstype = "cgroup2";
943 opts = NULL;
944 } else if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_LEGACY)) {
945 fstype = "cgroup";
946 opts = "none,name=systemd,xattr";
60e76d48
ZJS
947 } else {
948 fstype = "cgroup";
949 opts = controller;
950 }
5da38d07 951
60e76d48 952 r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
5da38d07 953 if (r < 0)
60e76d48 954 return r;
e83bebef 955
60e76d48 956 /* ... hence let's only make the bind mount read-only, not the superblock. */
e83bebef 957 if (read_only) {
60e76d48
ZJS
958 r = mount_verbose(LOG_ERR, NULL, to, NULL,
959 MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
960 if (r < 0)
961 return r;
e83bebef 962 }
60e76d48 963
e83bebef
LP
964 return 1;
965}
966
0996ef00
CB
967/* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
968static int mount_legacy_cgns_supported(
e1873695
LP
969 const char *dest,
970 CGroupUnified unified_requested,
971 bool userns,
972 uid_t uid_shift,
973 uid_t uid_range,
974 const char *selinux_apifs_context) {
975
0996ef00
CB
976 _cleanup_set_free_free_ Set *controllers = NULL;
977 const char *cgroup_root = "/sys/fs/cgroup", *c;
978 int r;
e83bebef 979
0996ef00
CB
980 (void) mkdir_p(cgroup_root, 0755);
981
982 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
e1873695 983 r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
0996ef00
CB
984 if (r < 0)
985 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
986 if (r == 0) {
987 _cleanup_free_ char *options = NULL;
988
989 /* When cgroup namespaces are enabled and user namespaces are
990 * used then the mount of the cgroupfs is done *inside* the new
991 * user namespace. We're root in the new user namespace and the
992 * kernel will happily translate our uid/gid to the correct
993 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
994 * pass uid 0 and not uid_shift to tmpfs_patch_options().
995 */
996 r = tmpfs_patch_options("mode=755", userns, 0, uid_range, true, selinux_apifs_context, &options);
997 if (r < 0)
998 return log_oom();
999
60e76d48
ZJS
1000 r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
1001 MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
1002 if (r < 0)
1003 return r;
0996ef00
CB
1004 }
1005
b4cccbc1
LP
1006 r = cg_all_unified();
1007 if (r < 0)
1008 return r;
1009 if (r > 0)
0996ef00
CB
1010 goto skip_controllers;
1011
d7c9693a 1012 r = get_process_controllers(&controllers);
0996ef00
CB
1013 if (r < 0)
1014 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1015
1016 for (;;) {
1017 _cleanup_free_ const char *controller = NULL;
1018
1019 controller = set_steal_first(controllers);
1020 if (!controller)
1021 break;
1022
2977724b 1023 r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns);
0996ef00
CB
1024 if (r < 0)
1025 return r;
1026
1027 /* When multiple hierarchies are co-mounted, make their
1028 * constituting individual hierarchies a symlink to the
1029 * co-mount.
1030 */
1031 c = controller;
1032 for (;;) {
1033 _cleanup_free_ char *target = NULL, *tok = NULL;
1034
1035 r = extract_first_word(&c, &tok, ",", 0);
1036 if (r < 0)
1037 return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m");
1038 if (r == 0)
1039 break;
1040
0996ef00
CB
1041 if (streq(controller, tok))
1042 break;
1043
bf516294
LP
1044 target = prefix_root("/sys/fs/cgroup/", tok);
1045 if (!target)
1046 return log_oom();
1047
0996ef00
CB
1048 r = symlink_idempotent(controller, target);
1049 if (r == -EINVAL)
1050 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
1051 if (r < 0)
1052 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1053 }
1054 }
1055
1056skip_controllers:
2977724b
TH
1057 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1058 r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
1059 if (r < 0)
1060 return r;
1061 }
1062
1063 r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
0996ef00
CB
1064 if (r < 0)
1065 return r;
1066
60e76d48
ZJS
1067 if (!userns)
1068 return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
1069 MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
0996ef00
CB
1070
1071 return 0;
1072}
1073
1074/* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
1075static int mount_legacy_cgns_unsupported(
1076 const char *dest,
e1873695
LP
1077 CGroupUnified unified_requested,
1078 bool userns,
1079 uid_t uid_shift,
1080 uid_t uid_range,
0996ef00 1081 const char *selinux_apifs_context) {
e1873695 1082
e83bebef
LP
1083 _cleanup_set_free_free_ Set *controllers = NULL;
1084 const char *cgroup_root;
1085 int r;
1086
1087 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1088
d8fc6a00
LP
1089 (void) mkdir_p(cgroup_root, 0755);
1090
e83bebef 1091 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
e1873695 1092 r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
e83bebef
LP
1093 if (r < 0)
1094 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
1095 if (r == 0) {
1096 _cleanup_free_ char *options = NULL;
1097
0996ef00 1098 r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
e83bebef
LP
1099 if (r < 0)
1100 return log_oom();
1101
60e76d48
ZJS
1102 r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
1103 MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
1104 if (r < 0)
1105 return r;
e83bebef
LP
1106 }
1107
b4cccbc1
LP
1108 r = cg_all_unified();
1109 if (r < 0)
1110 return r;
1111 if (r > 0)
e83bebef
LP
1112 goto skip_controllers;
1113
6925a0de 1114 r = cg_kernel_controllers(&controllers);
e83bebef
LP
1115 if (r < 0)
1116 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1117
1118 for (;;) {
1119 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1120
1121 controller = set_steal_first(controllers);
1122 if (!controller)
1123 break;
1124
1125 origin = prefix_root("/sys/fs/cgroup/", controller);
1126 if (!origin)
1127 return log_oom();
1128
1129 r = readlink_malloc(origin, &combined);
1130 if (r == -EINVAL) {
1131 /* Not a symbolic link, but directly a single cgroup hierarchy */
1132
2977724b 1133 r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
e83bebef
LP
1134 if (r < 0)
1135 return r;
1136
1137 } else if (r < 0)
1138 return log_error_errno(r, "Failed to read link %s: %m", origin);
1139 else {
1140 _cleanup_free_ char *target = NULL;
1141
1142 target = prefix_root(dest, origin);
1143 if (!target)
1144 return log_oom();
1145
1146 /* A symbolic link, a combination of controllers in one hierarchy */
1147
1148 if (!filename_is_valid(combined)) {
1149 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1150 continue;
1151 }
1152
2977724b 1153 r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
e83bebef
LP
1154 if (r < 0)
1155 return r;
1156
1157 r = symlink_idempotent(combined, target);
0996ef00
CB
1158 if (r == -EINVAL)
1159 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
e83bebef
LP
1160 if (r < 0)
1161 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1162 }
1163 }
1164
1165skip_controllers:
2977724b
TH
1166 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1167 r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
1168 if (r < 0)
1169 return r;
1170 }
1171
1172 r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
e83bebef
LP
1173 if (r < 0)
1174 return r;
1175
60e76d48
ZJS
1176 return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
1177 MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
e83bebef
LP
1178}
1179
1180static int mount_unified_cgroups(const char *dest) {
1181 const char *p;
1182 int r;
1183
1184 assert(dest);
1185
88e10572
MT
1186 p = prefix_roota(dest, "/sys/fs/cgroup");
1187
1188 (void) mkdir_p(p, 0755);
e83bebef 1189
e1873695 1190 r = path_is_mount_point(p, dest, AT_SYMLINK_FOLLOW);
e83bebef
LP
1191 if (r < 0)
1192 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
1193 if (r > 0) {
88e10572 1194 p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs");
e83bebef
LP
1195 if (access(p, F_OK) >= 0)
1196 return 0;
1197 if (errno != ENOENT)
1198 return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
1199
1200 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
1201 return -EINVAL;
1202 }
1203
60e76d48 1204 return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
e83bebef
LP
1205}
1206
1207int mount_cgroups(
1208 const char *dest,
5da38d07 1209 CGroupUnified unified_requested,
e1873695
LP
1210 bool userns,
1211 uid_t uid_shift,
1212 uid_t uid_range,
5a8ff0e6
CB
1213 const char *selinux_apifs_context,
1214 bool use_cgns) {
e83bebef 1215
5da38d07 1216 if (unified_requested >= CGROUP_UNIFIED_ALL)
e83bebef 1217 return mount_unified_cgroups(dest);
ea9053c5 1218 if (use_cgns)
e1873695 1219 return mount_legacy_cgns_supported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
0996ef00 1220
5da38d07 1221 return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
e83bebef
LP
1222}
1223
ea9053c5 1224static int mount_systemd_cgroup_writable_one(const char *root, const char *own) {
2977724b
TH
1225 int r;
1226
ea9053c5
LP
1227 assert(root);
1228 assert(own);
1229
2977724b 1230 /* Make our own cgroup a (writable) bind mount */
ea9053c5 1231 r = mount_verbose(LOG_ERR, own, own, NULL, MS_BIND, NULL);
2977724b
TH
1232 if (r < 0)
1233 return r;
1234
1235 /* And then remount the systemd cgroup root read-only */
ea9053c5 1236 return mount_verbose(LOG_ERR, NULL, root, NULL,
2977724b
TH
1237 MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
1238}
1239
e83bebef
LP
1240int mount_systemd_cgroup_writable(
1241 const char *dest,
5da38d07 1242 CGroupUnified unified_requested) {
e83bebef
LP
1243
1244 _cleanup_free_ char *own_cgroup_path = NULL;
ea9053c5 1245 const char *root, *own;
e83bebef
LP
1246 int r;
1247
1248 assert(dest);
1249
1250 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1251 if (r < 0)
1252 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1253
1254 /* If we are living in the top-level, then there's nothing to do... */
1255 if (path_equal(own_cgroup_path, "/"))
1256 return 0;
1257
ea9053c5 1258 if (unified_requested >= CGROUP_UNIFIED_ALL) {
e83bebef 1259
ea9053c5
LP
1260 root = prefix_roota(dest, "/sys/fs/cgroup");
1261 own = strjoina(root, own_cgroup_path);
1262
1263 } else {
1264
1265 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1266 root = prefix_roota(dest, "/sys/fs/cgroup/unified");
1267 own = strjoina(root, own_cgroup_path);
1268
1269 r = mount_systemd_cgroup_writable_one(root, own);
1270 if (r < 0)
1271 return r;
1272 }
1273
1274 root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1275 own = strjoina(root, own_cgroup_path);
2977724b 1276 }
e83bebef 1277
ea9053c5 1278 return mount_systemd_cgroup_writable_one(root, own);
e83bebef
LP
1279}
1280
1281int setup_volatile_state(
1282 const char *directory,
1283 VolatileMode mode,
1284 bool userns, uid_t uid_shift, uid_t uid_range,
1285 const char *selinux_apifs_context) {
1286
1287 _cleanup_free_ char *buf = NULL;
1288 const char *p, *options;
1289 int r;
1290
1291 assert(directory);
1292
1293 if (mode != VOLATILE_STATE)
1294 return 0;
1295
1296 /* --volatile=state means we simply overmount /var
1297 with a tmpfs, and the rest read-only. */
1298
6b7c9f8b 1299 r = bind_remount_recursive(directory, true, NULL);
e83bebef
LP
1300 if (r < 0)
1301 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1302
1303 p = prefix_roota(directory, "/var");
1304 r = mkdir(p, 0755);
1305 if (r < 0 && errno != EEXIST)
1306 return log_error_errno(errno, "Failed to create %s: %m", directory);
1307
1308 options = "mode=755";
0996ef00 1309 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
e83bebef
LP
1310 if (r < 0)
1311 return log_oom();
1312 if (r > 0)
1313 options = buf;
1314
60e76d48 1315 return mount_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options);
e83bebef
LP
1316}
1317
1318int setup_volatile(
1319 const char *directory,
1320 VolatileMode mode,
1321 bool userns, uid_t uid_shift, uid_t uid_range,
1322 const char *selinux_apifs_context) {
1323
1324 bool tmpfs_mounted = false, bind_mounted = false;
1325 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1326 _cleanup_free_ char *buf = NULL;
1327 const char *f, *t, *options;
1328 int r;
1329
1330 assert(directory);
1331
1332 if (mode != VOLATILE_YES)
1333 return 0;
1334
1335 /* --volatile=yes means we mount a tmpfs to the root dir, and
1336 the original /usr to use inside it, and that read-only. */
1337
1338 if (!mkdtemp(template))
1339 return log_error_errno(errno, "Failed to create temporary directory: %m");
1340
1341 options = "mode=755";
0996ef00 1342 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
e83bebef
LP
1343 if (r < 0)
1344 return log_oom();
1345 if (r > 0)
1346 options = buf;
1347
60e76d48
ZJS
1348 r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
1349 if (r < 0)
e83bebef 1350 goto fail;
e83bebef
LP
1351
1352 tmpfs_mounted = true;
1353
1354 f = prefix_roota(directory, "/usr");
1355 t = prefix_roota(template, "/usr");
1356
1357 r = mkdir(t, 0755);
1358 if (r < 0 && errno != EEXIST) {
1359 r = log_error_errno(errno, "Failed to create %s: %m", t);
1360 goto fail;
1361 }
1362
60e76d48
ZJS
1363 r = mount_verbose(LOG_ERR, f, t, NULL, MS_BIND|MS_REC, NULL);
1364 if (r < 0)
e83bebef 1365 goto fail;
e83bebef
LP
1366
1367 bind_mounted = true;
1368
6b7c9f8b 1369 r = bind_remount_recursive(t, true, NULL);
e83bebef
LP
1370 if (r < 0) {
1371 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1372 goto fail;
1373 }
1374
60e76d48
ZJS
1375 r = mount_verbose(LOG_ERR, template, directory, NULL, MS_MOVE, NULL);
1376 if (r < 0)
e83bebef 1377 goto fail;
e83bebef
LP
1378
1379 (void) rmdir(template);
1380
1381 return 0;
1382
1383fail:
1384 if (bind_mounted)
60e76d48 1385 (void) umount_verbose(t);
e83bebef
LP
1386
1387 if (tmpfs_mounted)
60e76d48 1388 (void) umount_verbose(template);
e83bebef
LP
1389 (void) rmdir(template);
1390 return r;
1391}
b53ede69
PW
1392
1393/* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
1394int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) {
1395 _cleanup_free_ char *root_new = NULL, *root_old = NULL;
1396 const char *p = s;
1397 int r;
1398
1399 assert(pivot_root_new);
1400 assert(pivot_root_old);
1401
1402 r = extract_first_word(&p, &root_new, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1403 if (r < 0)
1404 return r;
1405 if (r == 0)
1406 return -EINVAL;
1407
1408 if (isempty(p))
1409 root_old = NULL;
1410 else {
1411 root_old = strdup(p);
1412 if (!root_old)
1413 return -ENOMEM;
1414 }
1415
1416 if (!path_is_absolute(root_new))
1417 return -EINVAL;
1418 if (root_old && !path_is_absolute(root_old))
1419 return -EINVAL;
1420
1421 free_and_replace(*pivot_root_new, root_new);
1422 free_and_replace(*pivot_root_old, root_old);
1423
1424 return 0;
1425}
1426
1427int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old) {
1428 _cleanup_free_ char *directory_pivot_root_new = NULL;
1429 _cleanup_free_ char *pivot_tmp_pivot_root_old = NULL;
1430 char pivot_tmp[] = "/tmp/nspawn-pivot-XXXXXX";
1431 bool remove_pivot_tmp = false;
1432 int r;
1433
1434 assert(directory);
1435
1436 if (!pivot_root_new)
1437 return 0;
1438
1439 /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1440 * If pivot_root_old is NULL, the existing / disappears.
1441 * This requires a temporary directory, pivot_tmp, which is
1442 * not a child of either.
1443 *
1444 * This is typically used for OSTree-style containers, where
1445 * the root partition contains several sysroots which could be
1446 * run. Normally, one would be chosen by the bootloader and
1447 * pivoted to / by initramfs.
1448 *
1449 * For example, for an OSTree deployment, pivot_root_new
1450 * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1451 * code doesn’t do the /var mount which OSTree expects: use
1452 * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1453 *
1454 * So in the OSTree case, we’ll end up with something like:
1455 * - directory = /tmp/nspawn-root-123456
1456 * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1457 * - pivot_root_old = /sysroot
1458 * - directory_pivot_root_new =
1459 * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1460 * - pivot_tmp = /tmp/nspawn-pivot-123456
1461 * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1462 *
1463 * Requires all file systems at directory and below to be mounted
1464 * MS_PRIVATE or MS_SLAVE so they can be moved.
1465 */
1466 directory_pivot_root_new = prefix_root(directory, pivot_root_new);
1467
1468 /* Remount directory_pivot_root_new to make it movable. */
1469 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory_pivot_root_new, NULL, MS_BIND, NULL);
1470 if (r < 0)
1471 goto done;
1472
1473 if (pivot_root_old) {
1474 if (!mkdtemp(pivot_tmp)) {
1475 r = log_error_errno(errno, "Failed to create temporary directory: %m");
1476 goto done;
1477 }
1478
1479 remove_pivot_tmp = true;
1480 pivot_tmp_pivot_root_old = prefix_root(pivot_tmp, pivot_root_old);
1481
1482 r = mount_verbose(LOG_ERR, directory_pivot_root_new, pivot_tmp, NULL, MS_MOVE, NULL);
1483 if (r < 0)
1484 goto done;
1485
1486 r = mount_verbose(LOG_ERR, directory, pivot_tmp_pivot_root_old, NULL, MS_MOVE, NULL);
1487 if (r < 0)
1488 goto done;
1489
1490 r = mount_verbose(LOG_ERR, pivot_tmp, directory, NULL, MS_MOVE, NULL);
1491 if (r < 0)
1492 goto done;
1493 } else {
1494 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory, NULL, MS_MOVE, NULL);
1495 if (r < 0)
1496 goto done;
1497 }
1498
1499done:
1500 if (remove_pivot_tmp)
1501 (void) rmdir(pivot_tmp);
1502
1503 return r;
1504}