]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-cgroup.c
headers: remove unneeded includes from util.h
[thirdparty/systemd.git] / src / nspawn / nspawn-cgroup.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <sys/mount.h>
4
5 #include "alloc-util.h"
6 #include "fd-util.h"
7 #include "fileio.h"
8 #include "format-util.h"
9 #include "fs-util.h"
10 #include "mkdir.h"
11 #include "mount-util.h"
12 #include "mountpoint-util.h"
13 #include "nspawn-cgroup.h"
14 #include "nspawn-mount.h"
15 #include "path-util.h"
16 #include "rm-rf.h"
17 #include "string-util.h"
18 #include "strv.h"
19 #include "user-util.h"
20 #include "util.h"
21
22 static int chown_cgroup_path(const char *path, uid_t uid_shift) {
23 _cleanup_close_ int fd = -1;
24 const char *fn;
25
26 fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
27 if (fd < 0)
28 return -errno;
29
30 FOREACH_STRING(fn,
31 ".",
32 "cgroup.clone_children",
33 "cgroup.controllers",
34 "cgroup.events",
35 "cgroup.procs",
36 "cgroup.stat",
37 "cgroup.subtree_control",
38 "cgroup.threads",
39 "notify_on_release",
40 "tasks")
41 if (fchownat(fd, fn, uid_shift, uid_shift, 0) < 0)
42 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
43 "Failed to chown \"%s/%s\", ignoring: %m", path, fn);
44
45 return 0;
46 }
47
48 int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) {
49 _cleanup_free_ char *path = NULL, *fs = NULL;
50 int r;
51
52 r = cg_pid_get_path(NULL, pid, &path);
53 if (r < 0)
54 return log_error_errno(r, "Failed to get container cgroup path: %m");
55
56 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
57 if (r < 0)
58 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
59
60 r = chown_cgroup_path(fs, uid_shift);
61 if (r < 0)
62 return log_error_errno(r, "Failed to chown() cgroup %s: %m", fs);
63
64 if (unified_requested == CGROUP_UNIFIED_SYSTEMD || (unified_requested == CGROUP_UNIFIED_NONE && cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)) {
65 _cleanup_free_ char *lfs = NULL;
66 /* Always propagate access rights from unified to legacy controller */
67
68 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, NULL, &lfs);
69 if (r < 0)
70 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
71
72 r = chown_cgroup_path(lfs, uid_shift);
73 if (r < 0)
74 return log_error_errno(r, "Failed to chown() cgroup %s: %m", lfs);
75 }
76
77 return 0;
78 }
79
80 int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) {
81 _cleanup_free_ char *cgroup = NULL;
82 char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
83 bool undo_mount = false;
84 const char *fn;
85 int r, unified_controller;
86
87 unified_controller = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
88 if (unified_controller < 0)
89 return log_error_errno(unified_controller, "Failed to determine whether the systemd hierarchy is unified: %m");
90 if ((unified_controller > 0) == (unified_requested >= CGROUP_UNIFIED_SYSTEMD))
91 return 0;
92
93 /* When the host uses the legacy cgroup setup, but the
94 * container shall use the unified hierarchy, let's make sure
95 * we copy the path from the name=systemd hierarchy into the
96 * unified hierarchy. Similar for the reverse situation. */
97
98 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
99 if (r < 0)
100 return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid);
101
102 /* In order to access the unified hierarchy we need to mount it */
103 if (!mkdtemp(tree))
104 return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
105
106 if (unified_controller > 0)
107 r = mount_verbose(LOG_ERR, "cgroup", tree, "cgroup",
108 MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
109 else
110 r = mount_verbose(LOG_ERR, "cgroup", tree, "cgroup2",
111 MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
112 if (r < 0)
113 goto finish;
114
115 undo_mount = true;
116
117 /* If nspawn dies abruptly the cgroup hierarchy created below
118 * its unit isn't cleaned up. So, let's remove it
119 * https://github.com/systemd/systemd/pull/4223#issuecomment-252519810 */
120 fn = strjoina(tree, cgroup);
121 (void) rm_rf(fn, REMOVE_ROOT|REMOVE_ONLY_DIRECTORIES);
122
123 fn = strjoina(tree, cgroup, "/cgroup.procs");
124 (void) mkdir_parents(fn, 0755);
125
126 sprintf(pid_string, PID_FMT, pid);
127 r = write_string_file(fn, pid_string, WRITE_STRING_FILE_DISABLE_BUFFER);
128 if (r < 0) {
129 log_error_errno(r, "Failed to move process: %m");
130 goto finish;
131 }
132
133 fn = strjoina(tree, cgroup);
134 r = chown_cgroup_path(fn, uid_shift);
135 if (r < 0)
136 log_error_errno(r, "Failed to chown() cgroup %s: %m", fn);
137 finish:
138 if (undo_mount)
139 (void) umount_verbose(tree);
140
141 (void) rmdir(tree);
142 return r;
143 }
144
145 int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) {
146 _cleanup_free_ char *cgroup = NULL;
147 CGroupMask supported;
148 const char *payload;
149 int r;
150
151 assert(pid > 1);
152
153 /* In the unified hierarchy inner nodes may only contain subgroups, but not processes. Hence, if we running in
154 * the unified hierarchy and the container does the same, and we did not create a scope unit for the container
155 * move us and the container into two separate subcgroups.
156 *
157 * Moreover, container payloads such as systemd try to manage the cgroup they run in in full (i.e. including
158 * its attributes), while the host systemd will only delegate cgroups for children of the cgroup created for a
159 * delegation unit, instead of the cgroup itself. This means, if we'd pass on the cgroup allocated from the
160 * host systemd directly to the payload, the host and payload systemd might fight for the cgroup
161 * attributes. Hence, let's insert an intermediary cgroup to cover that case too.
162 *
163 * Note that we only bother with the main hierarchy here, not with any secondary ones. On the unified setup
164 * that's fine because there's only one hiearchy anyway and controllers are enabled directly on it. On the
165 * legacy setup, this is fine too, since delegation of controllers is generally not safe there, hence we won't
166 * do it. */
167
168 r = cg_mask_supported(&supported);
169 if (r < 0)
170 return log_error_errno(r, "Failed to determine supported controllers: %m");
171
172 if (keep_unit)
173 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
174 else
175 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
176 if (r < 0)
177 return log_error_errno(r, "Failed to get our control group: %m");
178
179 payload = strjoina(cgroup, "/payload");
180 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid);
181 if (r < 0)
182 return log_error_errno(r, "Failed to create %s subcgroup: %m", payload);
183
184 if (keep_unit) {
185 const char *supervisor;
186
187 supervisor = strjoina(cgroup, "/supervisor");
188 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, supervisor, 0);
189 if (r < 0)
190 return log_error_errno(r, "Failed to create %s subcgroup: %m", supervisor);
191 }
192
193 /* Try to enable as many controllers as possible for the new payload. */
194 (void) cg_enable_everywhere(supported, supported, cgroup, NULL);
195 return 0;
196 }
197
198 /* Retrieve existing subsystems. This function is called in a new cgroup
199 * namespace.
200 */
201 static int get_process_controllers(Set **ret) {
202 _cleanup_set_free_free_ Set *controllers = NULL;
203 _cleanup_fclose_ FILE *f = NULL;
204 int r;
205
206 assert(ret);
207
208 controllers = set_new(&string_hash_ops);
209 if (!controllers)
210 return -ENOMEM;
211
212 f = fopen("/proc/self/cgroup", "re");
213 if (!f)
214 return errno == ENOENT ? -ESRCH : -errno;
215
216 for (;;) {
217 _cleanup_free_ char *line = NULL;
218 char *e, *l;
219
220 r = read_line(f, LONG_LINE_MAX, &line);
221 if (r < 0)
222 return r;
223 if (r == 0)
224 break;
225
226 l = strchr(line, ':');
227 if (!l)
228 continue;
229
230 l++;
231 e = strchr(l, ':');
232 if (!e)
233 continue;
234
235 *e = 0;
236
237 if (STR_IN_SET(l, "", "name=systemd", "name=unified"))
238 continue;
239
240 r = set_put_strdup(controllers, l);
241 if (r < 0)
242 return r;
243 }
244
245 *ret = TAKE_PTR(controllers);
246
247 return 0;
248 }
249
250 static int mount_legacy_cgroup_hierarchy(
251 const char *dest,
252 const char *controller,
253 const char *hierarchy,
254 bool read_only) {
255
256 const char *to, *fstype, *opts;
257 int r;
258
259 to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
260
261 r = path_is_mount_point(to, dest, 0);
262 if (r < 0 && r != -ENOENT)
263 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
264 if (r > 0)
265 return 0;
266
267 (void) mkdir_p(to, 0755);
268
269 /* The superblock mount options of the mount point need to be
270 * identical to the hosts', and hence writable... */
271 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_HYBRID)) {
272 fstype = "cgroup2";
273 opts = NULL;
274 } else if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_LEGACY)) {
275 fstype = "cgroup";
276 opts = "none,name=systemd,xattr";
277 } else {
278 fstype = "cgroup";
279 opts = controller;
280 }
281
282 r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
283 if (r < 0)
284 return r;
285
286 /* ... hence let's only make the bind mount read-only, not the superblock. */
287 if (read_only) {
288 r = mount_verbose(LOG_ERR, NULL, to, NULL,
289 MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
290 if (r < 0)
291 return r;
292 }
293
294 return 1;
295 }
296
297 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
298 static int mount_legacy_cgns_supported(
299 const char *dest,
300 CGroupUnified unified_requested,
301 bool userns,
302 uid_t uid_shift,
303 uid_t uid_range,
304 const char *selinux_apifs_context) {
305
306 _cleanup_set_free_free_ Set *controllers = NULL;
307 const char *cgroup_root = "/sys/fs/cgroup", *c;
308 int r;
309
310 (void) mkdir_p(cgroup_root, 0755);
311
312 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
313 r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
314 if (r < 0)
315 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
316 if (r == 0) {
317 _cleanup_free_ char *options = NULL;
318
319 /* When cgroup namespaces are enabled and user namespaces are
320 * used then the mount of the cgroupfs is done *inside* the new
321 * user namespace. We're root in the new user namespace and the
322 * kernel will happily translate our uid/gid to the correct
323 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
324 * pass uid 0 and not uid_shift to tmpfs_patch_options().
325 */
326 r = tmpfs_patch_options("mode=755", 0, selinux_apifs_context, &options);
327 if (r < 0)
328 return log_oom();
329
330 r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
331 MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
332 if (r < 0)
333 return r;
334 }
335
336 r = cg_all_unified();
337 if (r < 0)
338 return r;
339 if (r > 0)
340 goto skip_controllers;
341
342 r = get_process_controllers(&controllers);
343 if (r < 0)
344 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
345
346 for (;;) {
347 _cleanup_free_ const char *controller = NULL;
348
349 controller = set_steal_first(controllers);
350 if (!controller)
351 break;
352
353 r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns);
354 if (r < 0)
355 return r;
356
357 /* When multiple hierarchies are co-mounted, make their
358 * constituting individual hierarchies a symlink to the
359 * co-mount.
360 */
361 c = controller;
362 for (;;) {
363 _cleanup_free_ char *target = NULL, *tok = NULL;
364
365 r = extract_first_word(&c, &tok, ",", 0);
366 if (r < 0)
367 return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m");
368 if (r == 0)
369 break;
370
371 if (streq(controller, tok))
372 break;
373
374 target = prefix_root("/sys/fs/cgroup/", tok);
375 if (!target)
376 return log_oom();
377
378 r = symlink_idempotent(controller, target, false);
379 if (r == -EINVAL)
380 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
381 if (r < 0)
382 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
383 }
384 }
385
386 skip_controllers:
387 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
388 r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
389 if (r < 0)
390 return r;
391 }
392
393 r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
394 if (r < 0)
395 return r;
396
397 if (!userns)
398 return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
399 MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
400
401 return 0;
402 }
403
404 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
405 static int mount_legacy_cgns_unsupported(
406 const char *dest,
407 CGroupUnified unified_requested,
408 bool userns,
409 uid_t uid_shift,
410 uid_t uid_range,
411 const char *selinux_apifs_context) {
412
413 _cleanup_set_free_free_ Set *controllers = NULL;
414 const char *cgroup_root;
415 int r;
416
417 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
418
419 (void) mkdir_p(cgroup_root, 0755);
420
421 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
422 r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
423 if (r < 0)
424 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
425 if (r == 0) {
426 _cleanup_free_ char *options = NULL;
427
428 r = tmpfs_patch_options("mode=755", uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &options);
429 if (r < 0)
430 return log_oom();
431
432 r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
433 MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
434 if (r < 0)
435 return r;
436 }
437
438 r = cg_all_unified();
439 if (r < 0)
440 return r;
441 if (r > 0)
442 goto skip_controllers;
443
444 r = cg_kernel_controllers(&controllers);
445 if (r < 0)
446 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
447
448 for (;;) {
449 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
450
451 controller = set_steal_first(controllers);
452 if (!controller)
453 break;
454
455 origin = prefix_root("/sys/fs/cgroup/", controller);
456 if (!origin)
457 return log_oom();
458
459 r = readlink_malloc(origin, &combined);
460 if (r == -EINVAL) {
461 /* Not a symbolic link, but directly a single cgroup hierarchy */
462
463 r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
464 if (r < 0)
465 return r;
466
467 } else if (r < 0)
468 return log_error_errno(r, "Failed to read link %s: %m", origin);
469 else {
470 _cleanup_free_ char *target = NULL;
471
472 target = prefix_root(dest, origin);
473 if (!target)
474 return log_oom();
475
476 /* A symbolic link, a combination of controllers in one hierarchy */
477
478 if (!filename_is_valid(combined)) {
479 log_warning("Ignoring invalid combined hierarchy %s.", combined);
480 continue;
481 }
482
483 r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
484 if (r < 0)
485 return r;
486
487 r = symlink_idempotent(combined, target, false);
488 if (r == -EINVAL)
489 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
490 if (r < 0)
491 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
492 }
493 }
494
495 skip_controllers:
496 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
497 r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
498 if (r < 0)
499 return r;
500 }
501
502 r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
503 if (r < 0)
504 return r;
505
506 return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
507 MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
508 }
509
510 static int mount_unified_cgroups(const char *dest) {
511 const char *p;
512 int r;
513
514 assert(dest);
515
516 p = prefix_roota(dest, "/sys/fs/cgroup");
517
518 (void) mkdir_p(p, 0755);
519
520 r = path_is_mount_point(p, dest, AT_SYMLINK_FOLLOW);
521 if (r < 0)
522 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
523 if (r > 0) {
524 p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs");
525 if (access(p, F_OK) >= 0)
526 return 0;
527 if (errno != ENOENT)
528 return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
529
530 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
531 "%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
532 }
533
534 return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
535 }
536
537 int mount_cgroups(
538 const char *dest,
539 CGroupUnified unified_requested,
540 bool userns,
541 uid_t uid_shift,
542 uid_t uid_range,
543 const char *selinux_apifs_context,
544 bool use_cgns) {
545
546 if (unified_requested >= CGROUP_UNIFIED_ALL)
547 return mount_unified_cgroups(dest);
548 if (use_cgns)
549 return mount_legacy_cgns_supported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
550
551 return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
552 }
553
554 static int mount_systemd_cgroup_writable_one(const char *root, const char *own) {
555 int r;
556
557 assert(root);
558 assert(own);
559
560 /* Make our own cgroup a (writable) bind mount */
561 r = mount_verbose(LOG_ERR, own, own, NULL, MS_BIND, NULL);
562 if (r < 0)
563 return r;
564
565 /* And then remount the systemd cgroup root read-only */
566 return mount_verbose(LOG_ERR, NULL, root, NULL,
567 MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
568 }
569
570 int mount_systemd_cgroup_writable(
571 const char *dest,
572 CGroupUnified unified_requested) {
573
574 _cleanup_free_ char *own_cgroup_path = NULL;
575 const char *root, *own;
576 int r;
577
578 assert(dest);
579
580 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
581 if (r < 0)
582 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
583
584 /* If we are living in the top-level, then there's nothing to do... */
585 if (path_equal(own_cgroup_path, "/"))
586 return 0;
587
588 if (unified_requested >= CGROUP_UNIFIED_ALL) {
589
590 root = prefix_roota(dest, "/sys/fs/cgroup");
591 own = strjoina(root, own_cgroup_path);
592
593 } else {
594
595 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
596 root = prefix_roota(dest, "/sys/fs/cgroup/unified");
597 own = strjoina(root, own_cgroup_path);
598
599 r = mount_systemd_cgroup_writable_one(root, own);
600 if (r < 0)
601 return r;
602 }
603
604 root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
605 own = strjoina(root, own_cgroup_path);
606 }
607
608 return mount_systemd_cgroup_writable_one(root, own);
609 }