]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/basic/cgroup-util.c
json: use secure un{base64,hex}mem for sensitive variants
[thirdparty/systemd.git] / src / basic / cgroup-util.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <limits.h>
5 #include <signal.h>
6 #include <stddef.h>
7 #include <stdlib.h>
8 #include <sys/types.h>
9 #include <sys/utsname.h>
10 #include <sys/xattr.h>
11 #include <unistd.h>
12
13 #include "alloc-util.h"
14 #include "cgroup-util.h"
15 #include "constants.h"
16 #include "dirent-util.h"
17 #include "extract-word.h"
18 #include "fd-util.h"
19 #include "fileio.h"
20 #include "format-util.h"
21 #include "fs-util.h"
22 #include "log.h"
23 #include "login-util.h"
24 #include "macro.h"
25 #include "missing_fs.h"
26 #include "missing_magic.h"
27 #include "missing_threads.h"
28 #include "mkdir.h"
29 #include "parse-util.h"
30 #include "path-util.h"
31 #include "process-util.h"
32 #include "set.h"
33 #include "special.h"
34 #include "stat-util.h"
35 #include "stdio-util.h"
36 #include "string-table.h"
37 #include "string-util.h"
38 #include "strv.h"
39 #include "unit-name.h"
40 #include "user-util.h"
41 #include "xattr-util.h"
42
43 int cg_path_open(const char *controller, const char *path) {
44 _cleanup_free_ char *fs = NULL;
45 int r;
46
47 r = cg_get_path(controller, path, /* item=*/ NULL, &fs);
48 if (r < 0)
49 return r;
50
51 return RET_NERRNO(open(fs, O_DIRECTORY|O_CLOEXEC));
52 }
53
54 int cg_cgroupid_open(int cgroupfs_fd, uint64_t id) {
55 _cleanup_close_ int fsfd = -EBADF;
56
57 if (cgroupfs_fd < 0) {
58 fsfd = open("/sys/fs/cgroup", O_CLOEXEC|O_DIRECTORY);
59 if (fsfd < 0)
60 return -errno;
61
62 cgroupfs_fd = fsfd;
63 }
64
65 cg_file_handle fh = CG_FILE_HANDLE_INIT;
66 CG_FILE_HANDLE_CGROUPID(fh) = id;
67
68 int fd = open_by_handle_at(cgroupfs_fd, &fh.file_handle, O_DIRECTORY|O_CLOEXEC);
69 if (fd < 0)
70 return -errno;
71
72 return fd;
73 }
74
75 static int cg_enumerate_items(const char *controller, const char *path, FILE **ret, const char *item) {
76 _cleanup_free_ char *fs = NULL;
77 FILE *f;
78 int r;
79
80 assert(ret);
81
82 r = cg_get_path(controller, path, item, &fs);
83 if (r < 0)
84 return r;
85
86 f = fopen(fs, "re");
87 if (!f)
88 return -errno;
89
90 *ret = f;
91 return 0;
92 }
93
/* Opens the "cgroup.procs" file of the specified cgroup for reading, so that the PIDs of all
 * member processes can be enumerated via cg_read_pid(). */
int cg_enumerate_processes(const char *controller, const char *path, FILE **ret) {
        return cg_enumerate_items(controller, path, ret, "cgroup.procs");
}
97
98 int cg_read_pid(FILE *f, pid_t *ret, CGroupFlags flags) {
99 unsigned long ul;
100
101 /* Note that the cgroup.procs might contain duplicates! See cgroups.txt for details. */
102
103 assert(f);
104 assert(ret);
105
106 for (;;) {
107 errno = 0;
108 if (fscanf(f, "%lu", &ul) != 1) {
109
110 if (feof(f)) {
111 *ret = 0;
112 return 0;
113 }
114
115 return errno_or_else(EIO);
116 }
117
118 if (ul > PID_T_MAX)
119 return -EIO;
120
121 /* In some circumstances (e.g. WSL), cgroups might contain unmappable PIDs from other
122 * contexts. These show up as zeros, and depending on the caller, can either be plain
123 * skipped over, or returned as-is. */
124 if (ul == 0 && !FLAGS_SET(flags, CGROUP_DONT_SKIP_UNMAPPED))
125 continue;
126
127 *ret = (pid_t) ul;
128 return 1;
129 }
130 }
131
/* Reads the next PID from the given cgroup attribute stream and returns it as a PidRef.
 * Returns > 0 when an entry was acquired, 0 on EOF (*ret set to PIDREF_NULL), negative errno on
 * error. Returns -EREMOTE for an unmappable (zero) PID when CGROUP_DONT_SKIP_UNMAPPED is set. */
int cg_read_pidref(FILE *f, PidRef *ret, CGroupFlags flags) {
        int r;

        assert(f);
        assert(ret);

        for (;;) {
                pid_t pid;

                r = cg_read_pid(f, &pid, flags);
                if (r < 0)
                        return log_debug_errno(r, "Failed to read pid from cgroup item: %m");
                if (r == 0) {
                        *ret = PIDREF_NULL;
                        return 0;
                }

                /* cg_read_pid() only passes zero PIDs through when CGROUP_DONT_SKIP_UNMAPPED is set */
                if (pid == 0)
                        return -EREMOTE;

                /* Caller asked us not to acquire a pidfd, return a plain PID reference. */
                if (FLAGS_SET(flags, CGROUP_NO_PIDFD)) {
                        *ret = PIDREF_MAKE_FROM_PID(pid);
                        return 1;
                }

                r = pidref_set_pid(ret, pid);
                if (r >= 0)
                        return 1;
                if (r != -ESRCH)
                        return r;

                /* ESRCH → gone by now? just skip over it, read the next */
        }
}
166
/* Reads the value of one key (e.g. "populated", "frozen") from a cgroup's "cgroup.events" file.
 * On success stores the value string in *ret. Returns -ENOENT if the key is not present,
 * -EINVAL on a malformed line, other negative errno on I/O failure. */
int cg_read_event(
                const char *controller,
                const char *path,
                const char *event,
                char **ret) {

        _cleanup_free_ char *events = NULL, *content = NULL;
        int r;

        r = cg_get_path(controller, path, "cgroup.events", &events);
        if (r < 0)
                return r;

        r = read_full_virtual_file(events, &content, NULL);
        if (r < 0)
                return r;

        /* The file consists of "<key> <value>" pairs, one per line. */
        for (const char *p = content;;) {
                _cleanup_free_ char *line = NULL, *key = NULL;
                const char *q;

                r = extract_first_word(&p, &line, "\n", 0);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -ENOENT;

                q = line;
                r = extract_first_word(&q, &key, " ", 0);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EINVAL;

                if (!streq(key, event))
                        continue;

                /* q now points at the rest of the line, i.e. the value */
                return strdup_to(ret, q);
        }
}
207
208 bool cg_ns_supported(void) {
209 static thread_local int enabled = -1;
210
211 if (enabled >= 0)
212 return enabled;
213
214 if (access("/proc/self/ns/cgroup", F_OK) < 0) {
215 if (errno != ENOENT)
216 log_debug_errno(errno, "Failed to check whether /proc/self/ns/cgroup is available, assuming not: %m");
217 enabled = false;
218 } else
219 enabled = true;
220
221 return enabled;
222 }
223
224 bool cg_freezer_supported(void) {
225 static thread_local int supported = -1;
226
227 if (supported >= 0)
228 return supported;
229
230 supported = cg_all_unified() > 0 && access("/sys/fs/cgroup/init.scope/cgroup.freeze", F_OK) == 0;
231
232 return supported;
233 }
234
235 bool cg_kill_supported(void) {
236 static thread_local int supported = -1;
237
238 if (supported >= 0)
239 return supported;
240
241 if (cg_all_unified() <= 0)
242 supported = false;
243 else if (access("/sys/fs/cgroup/init.scope/cgroup.kill", F_OK) < 0) {
244 if (errno != ENOENT)
245 log_debug_errno(errno, "Failed to check if cgroup.kill is available, assuming not: %m");
246 supported = false;
247 } else
248 supported = true;
249
250 return supported;
251 }
252
253 int cg_enumerate_subgroups(const char *controller, const char *path, DIR **ret) {
254 _cleanup_free_ char *fs = NULL;
255 DIR *d;
256 int r;
257
258 assert(ret);
259
260 /* This is not recursive! */
261
262 r = cg_get_path(controller, path, NULL, &fs);
263 if (r < 0)
264 return r;
265
266 d = opendir(fs);
267 if (!d)
268 return -errno;
269
270 *ret = d;
271 return 0;
272 }
273
274 int cg_read_subgroup(DIR *d, char **ret) {
275 assert(d);
276 assert(ret);
277
278 FOREACH_DIRENT_ALL(de, d, return -errno) {
279 if (de->d_type != DT_DIR)
280 continue;
281
282 if (dot_or_dot_dot(de->d_name))
283 continue;
284
285 return strdup_to_full(ret, de->d_name);
286 }
287
288 *ret = NULL;
289 return 0;
290 }
291
292 int cg_rmdir(const char *controller, const char *path) {
293 _cleanup_free_ char *p = NULL;
294 int r;
295
296 r = cg_get_path(controller, path, NULL, &p);
297 if (r < 0)
298 return r;
299
300 r = rmdir(p);
301 if (r < 0 && errno != ENOENT)
302 return -errno;
303
304 r = cg_hybrid_unified();
305 if (r <= 0)
306 return r;
307
308 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
309 r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
310 if (r < 0)
311 log_warning_errno(r, "Failed to remove compat systemd cgroup %s: %m", path);
312 }
313
314 return 0;
315 }
316
/* Sends `sig` to every process listed in the given cgroup attribute file ("cgroup.procs" or
 * "cgroup.threads") of `path`, repeatedly, until a pass signals nothing new (to catch processes
 * forking while we iterate). The set `s` records PIDs already signalled so none is hit twice;
 * if NULL, a temporary set is allocated. If `log_kill` is given it is invoked once per process
 * before the kill, and its return value is propagated as this function's positive result.
 * Returns > 0 if anything was signalled, 0 if nothing was, negative errno on failure. */
static int cg_kill_items(
                const char *path,
                int sig,
                CGroupFlags flags,
                Set *s,
                cg_kill_log_func_t log_kill,
                void *userdata,
                const char *item) {

        _cleanup_set_free_ Set *allocated_set = NULL;
        bool done = false;
        int r, ret = 0, ret_log_kill = 0;

        assert(sig >= 0);

        /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
         * SIGCONT on SIGKILL. */
        if (IN_SET(sig, SIGCONT, SIGKILL))
                flags &= ~CGROUP_SIGCONT;

        /* This goes through the tasks list and kills them all. This
         * is repeated until no further processes are added to the
         * tasks list, to properly handle forking processes */

        if (!s) {
                s = allocated_set = set_new(NULL);
                if (!s)
                        return -ENOMEM;
        }

        do {
                _cleanup_fclose_ FILE *f = NULL;
                done = true;

                r = cg_enumerate_items(SYSTEMD_CGROUP_CONTROLLER, path, &f, item);
                if (r == -ENOENT)
                        break; /* cgroup vanished → nothing left to kill */
                if (r < 0)
                        return RET_GATHER(ret, log_debug_errno(r, "Failed to enumerate cgroup items: %m"));

                for (;;) {
                        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;

                        r = cg_read_pidref(f, &pidref, flags);
                        if (r < 0)
                                return RET_GATHER(ret, log_debug_errno(r, "Failed to read pidref from cgroup '%s': %m", path));
                        if (r == 0)
                                break;

                        if ((flags & CGROUP_IGNORE_SELF) && pidref_is_self(&pidref))
                                continue;

                        /* Already signalled in a previous pass? */
                        if (set_get(s, PID_TO_PTR(pidref.pid)) == PID_TO_PTR(pidref.pid))
                                continue;

                        if (log_kill)
                                ret_log_kill = log_kill(&pidref, sig, userdata);

                        /* If we haven't killed this process yet, kill it */
                        r = pidref_kill(&pidref, sig);
                        if (r < 0 && r != -ESRCH)
                                RET_GATHER(ret, log_debug_errno(r, "Failed to kill process with pid " PID_FMT " from cgroup '%s': %m", pidref.pid, path));
                        if (r >= 0) {
                                if (flags & CGROUP_SIGCONT)
                                        (void) pidref_kill(&pidref, SIGCONT);

                                /* Record "something was killed" (or the log callback's result) once. */
                                if (ret == 0) {
                                        if (log_kill)
                                                ret = ret_log_kill;
                                        else
                                                ret = 1;
                                }
                        }

                        /* We saw an entry this pass, hence another pass is needed. */
                        done = false;

                        r = set_put(s, PID_TO_PTR(pidref.pid));
                        if (r < 0)
                                return RET_GATHER(ret, r);
                }

                /* To avoid racing against processes which fork quicker than we can kill them, we repeat this
                 * until no new pids need to be killed. */

        } while (!done);

        return ret;
}
405
/* Sends `sig` to all processes of the given cgroup (non-recursively). For SIGKILL on cgroup v2
 * this additionally iterates cgroup.threads to work around an old kernel bug where killed
 * non-leader threads could linger. Returns > 0 if anything was signalled, 0 otherwise,
 * negative errno on failure. */
int cg_kill(
                const char *path,
                int sig,
                CGroupFlags flags,
                Set *s,
                cg_kill_log_func_t log_kill,
                void *userdata) {

        int r, ret;

        r = cg_kill_items(path, sig, flags, s, log_kill, userdata, "cgroup.procs");
        if (r < 0)
                log_debug_errno(r, "Failed to kill processes in cgroup '%s' item cgroup.procs: %m", path);
        if (r < 0 || sig != SIGKILL)
                return r;

        ret = r;

        /* Only in case of killing with SIGKILL and when using cgroupsv2, kill remaining threads manually as
           a workaround for kernel bug. It was fixed in 5.2-rc5 (c03cd7738a83), backported to 4.19.66
           (4340d175b898) and 4.14.138 (feb6b123b7dd). */
        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r == 0)
                return ret;

        /* Opening pidfds for non thread group leaders only works from 6.9 onwards with PIDFD_THREAD. On
         * older kernels or without PIDFD_THREAD pidfd_open() fails with EINVAL. Since we might read non
         * thread group leader IDs from cgroup.threads, we set CGROUP_NO_PIDFD to avoid trying open pidfd's
         * for them and instead use the regular pid. */
        r = cg_kill_items(path, sig, flags|CGROUP_NO_PIDFD, s, log_kill, userdata, "cgroup.threads");
        if (r < 0)
                return log_debug_errno(r, "Failed to kill processes in cgroup '%s' item cgroup.threads: %m", path);

        return r > 0 || ret > 0;
}
443
444 int cg_kill_kernel_sigkill(const char *path) {
445 /* Kills the cgroup at `path` directly by writing to its cgroup.kill file. This sends SIGKILL to all
446 * processes in the cgroup and has the advantage of being completely atomic, unlike cg_kill_items(). */
447
448 _cleanup_free_ char *killfile = NULL;
449 int r;
450
451 assert(path);
452
453 if (!cg_kill_supported())
454 return -EOPNOTSUPP;
455
456 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "cgroup.kill", &killfile);
457 if (r < 0)
458 return r;
459
460 r = write_string_file(killfile, "1", WRITE_STRING_FILE_DISABLE_BUFFER);
461 if (r < 0)
462 return log_debug_errno(r, "Failed to write to cgroup.kill for cgroup '%s': %m", path);
463
464 return 0;
465 }
466
467 int cg_kill_recursive(
468 const char *path,
469 int sig,
470 CGroupFlags flags,
471 Set *s,
472 cg_kill_log_func_t log_kill,
473 void *userdata) {
474
475 int r, ret;
476
477 assert(path);
478 assert(sig >= 0);
479
480 if (sig == SIGKILL && cg_kill_supported() &&
481 !FLAGS_SET(flags, CGROUP_IGNORE_SELF) && !s && !log_kill)
482 /* ignore CGROUP_SIGCONT, since this is a no-op alongside SIGKILL */
483 ret = cg_kill_kernel_sigkill(path);
484 else {
485 _cleanup_set_free_ Set *allocated_set = NULL;
486 _cleanup_closedir_ DIR *d = NULL;
487
488 if (!s) {
489 s = allocated_set = set_new(NULL);
490 if (!s)
491 return -ENOMEM;
492 }
493
494 ret = cg_kill(path, sig, flags, s, log_kill, userdata);
495
496 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
497 if (r < 0) {
498 if (r != -ENOENT)
499 RET_GATHER(ret, log_debug_errno(r, "Failed to enumerate cgroup '%s' subgroups: %m", path));
500
501 return ret;
502 }
503
504 for (;;) {
505 _cleanup_free_ char *fn = NULL, *p = NULL;
506
507 r = cg_read_subgroup(d, &fn);
508 if (r < 0) {
509 RET_GATHER(ret, log_debug_errno(r, "Failed to read subgroup from cgroup '%s': %m", path));
510 break;
511 }
512 if (r == 0)
513 break;
514
515 p = path_join(empty_to_root(path), fn);
516 if (!p)
517 return -ENOMEM;
518
519 r = cg_kill_recursive(p, sig, flags, s, log_kill, userdata);
520 if (r < 0)
521 log_debug_errno(r, "Failed to recursively kill processes in cgroup '%s': %m", p);
522 if (r != 0 && ret >= 0)
523 ret = r;
524 }
525 }
526
527 if (FLAGS_SET(flags, CGROUP_REMOVE)) {
528 r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER, path);
529 if (!IN_SET(r, -ENOENT, -EBUSY))
530 RET_GATHER(ret, log_debug_errno(r, "Failed to remove cgroup '%s': %m", path));
531 }
532
533 return ret;
534 }
535
536 static const char *controller_to_dirname(const char *controller) {
537 assert(controller);
538
539 /* Converts a controller name to the directory name below /sys/fs/cgroup/ we want to mount it
540 * to. Effectively, this just cuts off the name= prefixed used for named hierarchies, if it is
541 * specified. */
542
543 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
544 if (cg_hybrid_unified() > 0)
545 controller = SYSTEMD_CGROUP_CONTROLLER_HYBRID;
546 else
547 controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
548 }
549
550 return startswith(controller, "name=") ?: controller;
551 }
552
/* Builds the file system path for a cgroup in a legacy (v1) hierarchy, skipping empty components. */
static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **ret) {
        char *t;

        assert(controller);
        assert(ret);

        const char *dn = controller_to_dirname(controller);

        if (isempty(path))
                t = isempty(suffix) ? path_join("/sys/fs/cgroup", dn)
                                    : path_join("/sys/fs/cgroup", dn, suffix);
        else
                t = isempty(suffix) ? path_join("/sys/fs/cgroup", dn, path)
                                    : path_join("/sys/fs/cgroup", dn, path, suffix);
        if (!t)
                return -ENOMEM;

        *ret = t;
        return 0;
}
576
/* Builds the file system path for a cgroup in the unified (v2) hierarchy, skipping empty components. */
static int join_path_unified(const char *path, const char *suffix, char **ret) {
        char *t;

        assert(ret);

        if (isempty(path))
                t = isempty(suffix) ? strdup("/sys/fs/cgroup")
                                    : path_join("/sys/fs/cgroup", suffix);
        else
                t = isempty(suffix) ? path_join("/sys/fs/cgroup", path)
                                    : path_join("/sys/fs/cgroup", path, suffix);
        if (!t)
                return -ENOMEM;

        *ret = t;
        return 0;
}
596
/* Resolves a (controller, cgroup path, item suffix) triplet to a file system path. With a NULL
 * controller only path and suffix are joined, without any /sys/fs/cgroup prefix. */
int cg_get_path(const char *controller, const char *path, const char *suffix, char **ret) {
        int r;

        assert(ret);

        if (!controller) {
                /* If no controller is specified, we return the path *below* the controllers, without any
                 * prefix. */

                if (isempty(path) && isempty(suffix))
                        return -EINVAL;

                char *t;
                if (isempty(suffix))
                        t = strdup(path);
                else if (isempty(path))
                        t = strdup(suffix);
                else
                        t = path_join(path, suffix);
                if (!t)
                        return -ENOMEM;

                *ret = path_simplify(t);
                return 0;
        }

        if (!cg_controller_is_valid(controller))
                return -EINVAL;

        r = cg_all_unified();
        if (r < 0)
                return r;

        r = r > 0 ? join_path_unified(path, suffix, ret)
                  : join_path_legacy(controller, path, suffix, ret);
        if (r < 0)
                return r;

        path_simplify(*ret);
        return 0;
}
640
/* Checks whether the given v1 controller hierarchy is usable. With a NULL root we only check
 * that the hierarchy's directory exists (F_OK); with a root we check that the subcgroup exists
 * there and that we may modify it (W_OK on its cgroup.procs). Returns 0 on success, negative
 * errno otherwise. */
static int controller_is_v1_accessible(const char *root, const char *controller) {
        const char *cpath, *dn;

        assert(controller);

        dn = controller_to_dirname(controller);

        /* If root is specified, we check that:
         * - possible subcgroup is created at root,
         * - we can modify the hierarchy. */

        /* NOTE(review): when root is NULL the argument list effectively ends there, so only
         * "/sys/fs/cgroup/" and the controller dirname are joined — confirm against strjoina(). */
        cpath = strjoina("/sys/fs/cgroup/", dn, root, root ? "/cgroup.procs" : NULL);
        return laccess(cpath, root ? W_OK : F_OK);
}
655
656 int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **ret) {
657 int r;
658
659 assert(controller);
660 assert(ret);
661
662 if (!cg_controller_is_valid(controller))
663 return -EINVAL;
664
665 r = cg_all_unified();
666 if (r < 0)
667 return r;
668 if (r > 0) {
669 /* In the unified hierarchy all controllers are considered accessible,
670 * except for the named hierarchies */
671 if (startswith(controller, "name="))
672 return -EOPNOTSUPP;
673 } else {
674 /* Check if the specified controller is actually accessible */
675 r = controller_is_v1_accessible(NULL, controller);
676 if (r < 0)
677 return r;
678 }
679
680 return cg_get_path(controller, path, suffix, ret);
681 }
682
683 int cg_set_xattr(const char *path, const char *name, const void *value, size_t size, int flags) {
684 _cleanup_free_ char *fs = NULL;
685 int r;
686
687 assert(path);
688 assert(name);
689 assert(value || size <= 0);
690
691 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
692 if (r < 0)
693 return r;
694
695 return RET_NERRNO(setxattr(fs, name, value, size, flags));
696 }
697
698 int cg_get_xattr(const char *path, const char *name, void *value, size_t size) {
699 _cleanup_free_ char *fs = NULL;
700 ssize_t n;
701 int r;
702
703 assert(path);
704 assert(name);
705
706 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
707 if (r < 0)
708 return r;
709
710 n = getxattr(fs, name, value, size);
711 if (n < 0)
712 return -errno;
713
714 return (int) n;
715 }
716
717 int cg_get_xattr_malloc(const char *path, const char *name, char **ret) {
718 _cleanup_free_ char *fs = NULL;
719 int r;
720
721 assert(path);
722 assert(name);
723
724 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
725 if (r < 0)
726 return r;
727
728 return lgetxattr_malloc(fs, name, ret);
729 }
730
731 int cg_get_xattr_bool(const char *path, const char *name) {
732 _cleanup_free_ char *fs = NULL;
733 int r;
734
735 assert(path);
736 assert(name);
737
738 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
739 if (r < 0)
740 return r;
741
742 return getxattr_at_bool(AT_FDCWD, fs, name, /* flags= */ 0);
743 }
744
745 int cg_remove_xattr(const char *path, const char *name) {
746 _cleanup_free_ char *fs = NULL;
747 int r;
748
749 assert(path);
750 assert(name);
751
752 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
753 if (r < 0)
754 return r;
755
756 return RET_NERRNO(removexattr(fs, name));
757 }
758
/* Determines the cgroup path of the given process within the specified controller hierarchy, by
 * parsing /proc/PID/cgroup. With a NULL controller the systemd hierarchy is used. Returns
 * -ESRCH if the process doesn't exist, -ENODATA if no matching line was found. */
int cg_pid_get_path(const char *controller, pid_t pid, char **ret_path) {
        _cleanup_fclose_ FILE *f = NULL;
        const char *fs, *controller_str = NULL; /* avoid false maybe-uninitialized warning */
        int unified, r;

        assert(pid >= 0);
        assert(ret_path);

        if (controller) {
                if (!cg_controller_is_valid(controller))
                        return -EINVAL;
        } else
                controller = SYSTEMD_CGROUP_CONTROLLER;

        unified = cg_unified_controller(controller);
        if (unified < 0)
                return unified;
        if (unified == 0) {
                /* On legacy, the systemd hierarchy shows up under its on-disk name. */
                if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
                        controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
                else
                        controller_str = controller;
        }

        fs = procfs_file_alloca(pid, "cgroup");
        r = fopen_unlocked(fs, "re", &f);
        if (r == -ENOENT)
                return -ESRCH;
        if (r < 0)
                return r;

        /* Each line has the form "<id>:<controller list>:<path>". */
        for (;;) {
                _cleanup_free_ char *line = NULL;
                char *e;

                r = read_line(f, LONG_LINE_MAX, &line);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -ENODATA;

                if (unified) {
                        /* The v2 entry always carries hierarchy ID 0 and an empty controller list. */
                        e = startswith(line, "0:");
                        if (!e)
                                continue;

                        e = strchr(e, ':');
                        if (!e)
                                continue;
                } else {
                        char *l;

                        /* Skip the hierarchy ID, then match our controller against the
                         * comma-separated controller list. */
                        l = strchr(line, ':');
                        if (!l)
                                continue;

                        l++;
                        e = strchr(l, ':');
                        if (!e)
                                continue;
                        *e = 0;

                        assert(controller_str);
                        r = string_contains_word(l, ",", controller_str);
                        if (r < 0)
                                return r;
                        if (r == 0)
                                continue;
                }

                /* e + 1 points at the cgroup path field. */
                char *path = strdup(e + 1);
                if (!path)
                        return -ENOMEM;

                /* Truncate suffix indicating the process is a zombie */
                e = endswith(path, " (deleted)");
                if (e)
                        *e = 0;

                *ret_path = path;
                return 0;
        }
}
842
843 int cg_pidref_get_path(const char *controller, const PidRef *pidref, char **ret_path) {
844 _cleanup_free_ char *path = NULL;
845 int r;
846
847 assert(ret_path);
848
849 if (!pidref_is_set(pidref))
850 return -ESRCH;
851
852 r = cg_pid_get_path(controller, pidref->pid, &path);
853 if (r < 0)
854 return r;
855
856 /* Before we return the path, make sure the procfs entry for this pid still matches the pidref */
857 r = pidref_verify(pidref);
858 if (r < 0)
859 return r;
860
861 *ret_path = TAKE_PTR(path);
862 return 0;
863 }
864
/* Installs `agent` as the release agent of the specified v1 hierarchy and enables release
 * notification. Returns -EEXIST if a different agent is already configured, -EOPNOTSUPP on the
 * unified hierarchy, 1 if notify_on_release was switched on by us, 0 if it already was on. */
int cg_install_release_agent(const char *controller, const char *agent) {
        _cleanup_free_ char *fs = NULL, *contents = NULL;
        const char *sc;
        int r;

        assert(agent);

        r = cg_unified_controller(controller);
        if (r < 0)
                return r;
        if (r > 0) /* doesn't apply to unified hierarchy */
                return -EOPNOTSUPP;

        r = cg_get_path(controller, NULL, "release_agent", &fs);
        if (r < 0)
                return r;

        r = read_one_line_file(fs, &contents);
        if (r < 0)
                return r;

        /* Install our agent only if none is configured yet; a different one is an error. */
        sc = strstrip(contents);
        if (isempty(sc)) {
                r = write_string_file(fs, agent, WRITE_STRING_FILE_DISABLE_BUFFER);
                if (r < 0)
                        return r;
        } else if (!path_equal(sc, agent))
                return -EEXIST;

        /* Now make sure release notification is actually switched on. */
        fs = mfree(fs);
        r = cg_get_path(controller, NULL, "notify_on_release", &fs);
        if (r < 0)
                return r;

        contents = mfree(contents);
        r = read_one_line_file(fs, &contents);
        if (r < 0)
                return r;

        sc = strstrip(contents);
        if (streq(sc, "0")) {
                r = write_string_file(fs, "1", WRITE_STRING_FILE_DISABLE_BUFFER);
                if (r < 0)
                        return r;

                return 1;
        }

        /* Anything other than "0" or "1" means the kernel gave us something unexpected. */
        if (!streq(sc, "1"))
                return -EIO;

        return 0;
}
918
919 int cg_uninstall_release_agent(const char *controller) {
920 _cleanup_free_ char *fs = NULL;
921 int r;
922
923 r = cg_unified_controller(controller);
924 if (r < 0)
925 return r;
926 if (r > 0) /* Doesn't apply to unified hierarchy */
927 return -EOPNOTSUPP;
928
929 r = cg_get_path(controller, NULL, "notify_on_release", &fs);
930 if (r < 0)
931 return r;
932
933 r = write_string_file(fs, "0", WRITE_STRING_FILE_DISABLE_BUFFER);
934 if (r < 0)
935 return r;
936
937 fs = mfree(fs);
938
939 r = cg_get_path(controller, NULL, "release_agent", &fs);
940 if (r < 0)
941 return r;
942
943 r = write_string_file(fs, "", WRITE_STRING_FILE_DISABLE_BUFFER);
944 if (r < 0)
945 return r;
946
947 return 0;
948 }
949
950 int cg_is_empty(const char *controller, const char *path) {
951 _cleanup_fclose_ FILE *f = NULL;
952 pid_t pid;
953 int r;
954
955 assert(path);
956
957 r = cg_enumerate_processes(controller, path, &f);
958 if (r == -ENOENT)
959 return true;
960 if (r < 0)
961 return r;
962
963 r = cg_read_pid(f, &pid, CGROUP_DONT_SKIP_UNMAPPED);
964 if (r < 0)
965 return r;
966
967 return r == 0;
968 }
969
/* Returns > 0 if the cgroup and all of its subgroups contain no processes, 0 otherwise, negative
 * errno on error. On cgroup v2 this uses the kernel-maintained "populated" event; on v1 it walks
 * the tree manually. */
int cg_is_empty_recursive(const char *controller, const char *path) {
        int r;

        assert(path);

        /* The root cgroup is always populated */
        if (controller && empty_or_root(path))
                return false;

        r = cg_unified_controller(controller);
        if (r < 0)
                return r;
        if (r > 0) {
                _cleanup_free_ char *t = NULL;

                /* On the unified hierarchy we can check empty state
                 * via the "populated" attribute of "cgroup.events". */

                r = cg_read_event(controller, path, "populated", &t);
                if (r == -ENOENT)
                        return true; /* gone already → empty */
                if (r < 0)
                        return r;

                return streq(t, "0");
        } else {
                _cleanup_closedir_ DIR *d = NULL;
                char *fn;

                /* First check this cgroup itself, then recurse into each child. */
                r = cg_is_empty(controller, path);
                if (r <= 0)
                        return r;

                r = cg_enumerate_subgroups(controller, path, &d);
                if (r == -ENOENT)
                        return true;
                if (r < 0)
                        return r;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        /* fn is freed immediately after being joined, on every iteration. */
                        p = path_join(path, fn);
                        free(fn);
                        if (!p)
                                return -ENOMEM;

                        r = cg_is_empty_recursive(controller, p);
                        if (r <= 0)
                                return r;
                }
                if (r < 0)
                        return r;

                return true;
        }
}
1027
/* Splits a cgroup specification into controller and path components. Accepted forms:
 *   "/some/path"            → no controller, absolute normalized path
 *   "controller:/some/path" → both parts
 *   "controller:"           → controller only
 *   "controller"            → controller only
 * Either output parameter may be NULL if the caller isn't interested; missing components are
 * returned as NULL. */
int cg_split_spec(const char *spec, char **ret_controller, char **ret_path) {
        _cleanup_free_ char *controller = NULL, *path = NULL;
        int r;

        assert(spec);

        if (*spec == '/') {
                /* Pure path form, no controller. */
                if (!path_is_normalized(spec))
                        return -EINVAL;

                if (ret_path) {
                        r = path_simplify_alloc(spec, &path);
                        if (r < 0)
                                return r;
                }

        } else {
                const char *e;

                e = strchr(spec, ':');
                if (e) {
                        /* "controller:path" form; the path part may be empty. */
                        controller = strndup(spec, e-spec);
                        if (!controller)
                                return -ENOMEM;
                        if (!cg_controller_is_valid(controller))
                                return -EINVAL;

                        if (!isempty(e + 1)) {
                                path = strdup(e+1);
                                if (!path)
                                        return -ENOMEM;

                                if (!path_is_normalized(path) ||
                                    !path_is_absolute(path))
                                        return -EINVAL;

                                path_simplify(path);
                        }

                } else {
                        /* Bare controller name. */
                        if (!cg_controller_is_valid(spec))
                                return -EINVAL;

                        if (ret_controller) {
                                controller = strdup(spec);
                                if (!controller)
                                        return -ENOMEM;
                        }
                }
        }

        if (ret_controller)
                *ret_controller = TAKE_PTR(controller);
        if (ret_path)
                *ret_path = TAKE_PTR(path);
        return 0;
}
1085
1086 int cg_mangle_path(const char *path, char **ret) {
1087 _cleanup_free_ char *c = NULL, *p = NULL;
1088 int r;
1089
1090 assert(path);
1091 assert(ret);
1092
1093 /* First, check if it already is a filesystem path */
1094 if (path_startswith(path, "/sys/fs/cgroup"))
1095 return path_simplify_alloc(path, ret);
1096
1097 /* Otherwise, treat it as cg spec */
1098 r = cg_split_spec(path, &c, &p);
1099 if (r < 0)
1100 return r;
1101
1102 return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, ret);
1103 }
1104
1105 int cg_get_root_path(char **ret_path) {
1106 char *p, *e;
1107 int r;
1108
1109 assert(ret_path);
1110
1111 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &p);
1112 if (r < 0)
1113 return r;
1114
1115 e = endswith(p, "/" SPECIAL_INIT_SCOPE);
1116 if (!e)
1117 e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */
1118 if (!e)
1119 e = endswith(p, "/system"); /* even more legacy */
1120 if (e)
1121 *e = 0;
1122
1123 *ret_path = p;
1124 return 0;
1125 }
1126
1127 int cg_shift_path(const char *cgroup, const char *root, const char **ret_shifted) {
1128 _cleanup_free_ char *rt = NULL;
1129 char *p;
1130 int r;
1131
1132 assert(cgroup);
1133 assert(ret_shifted);
1134
1135 if (!root) {
1136 /* If the root was specified let's use that, otherwise
1137 * let's determine it from PID 1 */
1138
1139 r = cg_get_root_path(&rt);
1140 if (r < 0)
1141 return r;
1142
1143 root = rt;
1144 }
1145
1146 p = path_startswith(cgroup, root);
1147 if (p && p > cgroup)
1148 *ret_shifted = p - 1;
1149 else
1150 *ret_shifted = cgroup;
1151
1152 return 0;
1153 }
1154
1155 int cg_pid_get_path_shifted(pid_t pid, const char *root, char **ret_cgroup) {
1156 _cleanup_free_ char *raw = NULL;
1157 const char *c;
1158 int r;
1159
1160 assert(pid >= 0);
1161 assert(ret_cgroup);
1162
1163 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &raw);
1164 if (r < 0)
1165 return r;
1166
1167 r = cg_shift_path(raw, root, &c);
1168 if (r < 0)
1169 return r;
1170
1171 if (c == raw) {
1172 *ret_cgroup = TAKE_PTR(raw);
1173 return 0;
1174 }
1175
1176 return strdup_to(ret_cgroup, c);
1177 }
1178
/* Decodes the first component of a cgroup path into a unit name: the component is unescaped and
 * validated as a plain or instance unit name. Returns -ENXIO if it doesn't look like one. */
int cg_path_decode_unit(const char *cgroup, char **ret_unit) {
        assert(cgroup);
        assert(ret_unit);

        /* Shortest valid unit names are 3 characters ("x.y" style); reject anything shorter. */
        size_t n = strcspn(cgroup, "/");
        if (n < 3)
                return -ENXIO;

        /* Work on a stack copy; cg_unescape() appears to operate in place here (it is only ever
         * fed mutable copies in this file). */
        char *c = strndupa_safe(cgroup, n);
        c = cg_unescape(c);

        if (!unit_name_is_valid(c, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
                return -ENXIO;

        return strdup_to(ret_unit, c);
}
1195
/* Returns true if the first n bytes of p form a valid slice unit name (e.g. "foo.slice"). */
static bool valid_slice_name(const char *p, size_t n) {

        if (!p)
                return false;

        /* "x.slice" is the shortest possible slice name. */
        if (n < STRLEN("x.slice"))
                return false;

        /* 6 == strlen(".slice"); check the suffix first, then validate the unescaped name. */
        if (memcmp(p + n - 6, ".slice", 6) == 0) {
                char buf[n+1], *c;

                memcpy(buf, p, n);
                buf[n] = 0;

                c = cg_unescape(buf);

                return unit_name_is_valid(c, UNIT_NAME_PLAIN);
        }

        return false;
}
1217
static const char *skip_slices(const char *p) {
        assert(p);

        /* Skips over all leading slice path components, returning a pointer to the first
         * component that is not a slice. */

        for (;;) {
                p += strspn(p, "/");

                size_t n = strcspn(p, "/");
                if (!valid_slice_name(p, n))
                        return p;

                p += n;
        }
}
1235
1236 int cg_path_get_unit(const char *path, char **ret) {
1237 _cleanup_free_ char *unit = NULL;
1238 const char *e;
1239 int r;
1240
1241 assert(path);
1242 assert(ret);
1243
1244 e = skip_slices(path);
1245
1246 r = cg_path_decode_unit(e, &unit);
1247 if (r < 0)
1248 return r;
1249
1250 /* We skipped over the slices, don't accept any now */
1251 if (endswith(unit, ".slice"))
1252 return -ENXIO;
1253
1254 *ret = TAKE_PTR(unit);
1255 return 0;
1256 }
1257
/* Returns a copy of the cgroup path truncated directly after the unit name component, i.e.
 * keeping the leading slices and the unit itself but dropping everything below it.
 * Returns -ENXIO if the component after the slices is not a valid unit name. */
int cg_path_get_unit_path(const char *path, char **ret) {
        _cleanup_free_ char *path_copy = NULL;
        char *unit_name;

        assert(path);
        assert(ret);

        path_copy = strdup(path);
        if (!path_copy)
                return -ENOMEM;

        /* Casting away const is fine here: unit_name points into our own mutable copy. */
        unit_name = (char *)skip_slices(path_copy);
        /* Cut the copy right after the unit name component. */
        unit_name[strcspn(unit_name, "/")] = 0;

        if (!unit_name_is_valid(cg_unescape(unit_name), UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
                return -ENXIO;

        *ret = TAKE_PTR(path_copy);

        return 0;
}
1279
1280 int cg_pid_get_unit(pid_t pid, char **ret_unit) {
1281 _cleanup_free_ char *cgroup = NULL;
1282 int r;
1283
1284 assert(ret_unit);
1285
1286 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1287 if (r < 0)
1288 return r;
1289
1290 return cg_path_get_unit(cgroup, ret_unit);
1291 }
1292
1293 int cg_pidref_get_unit(const PidRef *pidref, char **ret) {
1294 _cleanup_free_ char *unit = NULL;
1295 int r;
1296
1297 assert(ret);
1298
1299 if (!pidref_is_set(pidref))
1300 return -ESRCH;
1301
1302 r = cg_pid_get_unit(pidref->pid, &unit);
1303 if (r < 0)
1304 return r;
1305
1306 r = pidref_verify(pidref);
1307 if (r < 0)
1308 return r;
1309
1310 *ret = TAKE_PTR(unit);
1311 return 0;
1312 }
1313
/**
 * Skip session-*.scope, but require it to be there. Returns a pointer past the component (and
 * any following slashes), or NULL if the next component is not a valid session scope.
 */
static const char *skip_session(const char *p) {
        size_t n;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        n = strcspn(p, "/");
        if (n < STRLEN("session-x.scope"))
                return NULL;

        /* 8 == strlen("session-"), 6 == strlen(".scope"); validate prefix and suffix, then the
         * session ID in between. */
        if (memcmp(p, "session-", 8) == 0 && memcmp(p + n - 6, ".scope", 6) == 0) {
                char buf[n - 8 - 6 + 1];

                memcpy(buf, p + 8, n - 8 - 6);
                buf[n - 8 - 6] = 0;

                /* Note that session scopes never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                if (!session_id_valid(buf))
                        return NULL;

                p += n;
                p += strspn(p, "/");
                return p;
        }

        return NULL;
}
1350
/**
 * Skip user@*.service, but require it to be there. Returns a pointer past the component (and
 * any following slashes), or NULL if the next component is not a valid user manager service.
 */
static const char *skip_user_manager(const char *p) {
        size_t n;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        n = strcspn(p, "/");
        if (n < STRLEN("user@x.service"))
                return NULL;

        /* 5 == strlen("user@"), 8 == strlen(".service"); validate prefix and suffix, then the
         * UID in between. */
        if (memcmp(p, "user@", 5) == 0 && memcmp(p + n - 8, ".service", 8) == 0) {
                char buf[n - 5 - 8 + 1];

                memcpy(buf, p + 5, n - 5 - 8);
                buf[n - 5 - 8] = 0;

                /* Note that user manager services never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                if (parse_uid(buf, NULL) < 0)
                        return NULL;

                p += n;
                p += strspn(p, "/");

                return p;
        }

        return NULL;
}
1388
/* Skip the slice components plus either the user manager service or a session scope that
 * must follow them; returns NULL if neither does. */
static const char *skip_user_prefix(const char *path) {
        const char *after_slices;

        assert(path);

        /* Skip slices, if there are any. */
        after_slices = skip_slices(path);

        /* Then a user manager unit must follow — or, alternatively, a session scope. */
        const char *t = skip_user_manager(after_slices);
        return t ?: skip_session(after_slices);
}
1405
/* Extract the unit name of a user unit from a cgroup path. */
int cg_path_get_user_unit(const char *path, char **ret) {
        const char *rest;

        assert(path);
        assert(ret);

        rest = skip_user_prefix(path);
        if (!rest)
                return -ENXIO;

        /* What remains looks just like a system unit path, hence reuse that parser. */
        return cg_path_get_unit(rest, ret);
}
1420
1421 int cg_pid_get_user_unit(pid_t pid, char **ret_unit) {
1422 _cleanup_free_ char *cgroup = NULL;
1423 int r;
1424
1425 assert(ret_unit);
1426
1427 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1428 if (r < 0)
1429 return r;
1430
1431 return cg_path_get_user_unit(cgroup, ret_unit);
1432 }
1433
1434 int cg_path_get_machine_name(const char *path, char **ret_machine) {
1435 _cleanup_free_ char *u = NULL;
1436 const char *sl;
1437 int r;
1438
1439 r = cg_path_get_unit(path, &u);
1440 if (r < 0)
1441 return r;
1442
1443 sl = strjoina("/run/systemd/machines/unit:", u);
1444 return readlink_malloc(sl, ret_machine);
1445 }
1446
1447 int cg_pid_get_machine_name(pid_t pid, char **ret_machine) {
1448 _cleanup_free_ char *cgroup = NULL;
1449 int r;
1450
1451 assert(ret_machine);
1452
1453 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1454 if (r < 0)
1455 return r;
1456
1457 return cg_path_get_machine_name(cgroup, ret_machine);
1458 }
1459
1460 int cg_path_get_cgroupid(const char *path, uint64_t *ret) {
1461 cg_file_handle fh = CG_FILE_HANDLE_INIT;
1462 int mnt_id;
1463
1464 assert(path);
1465 assert(ret);
1466
1467 /* This is cgroupfs so we know the size of the handle, thus no need to loop around like
1468 * name_to_handle_at_loop() does in mountpoint-util.c */
1469 if (name_to_handle_at(AT_FDCWD, path, &fh.file_handle, &mnt_id, 0) < 0)
1470 return -errno;
1471
1472 *ret = CG_FILE_HANDLE_CGROUPID(fh);
1473 return 0;
1474 }
1475
1476 int cg_fd_get_cgroupid(int fd, uint64_t *ret) {
1477 cg_file_handle fh = CG_FILE_HANDLE_INIT;
1478 int mnt_id = -1;
1479
1480 assert(fd >= 0);
1481 assert(ret);
1482
1483 if (name_to_handle_at(fd, "", &fh.file_handle, &mnt_id, AT_EMPTY_PATH) < 0)
1484 return -errno;
1485
1486 *ret = CG_FILE_HANDLE_CGROUPID(fh);
1487 return 0;
1488 }
1489
1490 int cg_path_get_session(const char *path, char **ret_session) {
1491 _cleanup_free_ char *unit = NULL;
1492 char *start, *end;
1493 int r;
1494
1495 assert(path);
1496
1497 r = cg_path_get_unit(path, &unit);
1498 if (r < 0)
1499 return r;
1500
1501 start = startswith(unit, "session-");
1502 if (!start)
1503 return -ENXIO;
1504 end = endswith(start, ".scope");
1505 if (!end)
1506 return -ENXIO;
1507
1508 *end = 0;
1509 if (!session_id_valid(start))
1510 return -ENXIO;
1511
1512 if (!ret_session)
1513 return 0;
1514
1515 return strdup_to(ret_session, start);
1516 }
1517
1518 int cg_pid_get_session(pid_t pid, char **ret_session) {
1519 _cleanup_free_ char *cgroup = NULL;
1520 int r;
1521
1522 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1523 if (r < 0)
1524 return r;
1525
1526 return cg_path_get_session(cgroup, ret_session);
1527 }
1528
1529 int cg_path_get_owner_uid(const char *path, uid_t *ret_uid) {
1530 _cleanup_free_ char *slice = NULL;
1531 char *start, *end;
1532 int r;
1533
1534 assert(path);
1535
1536 r = cg_path_get_slice(path, &slice);
1537 if (r < 0)
1538 return r;
1539
1540 start = startswith(slice, "user-");
1541 if (!start)
1542 return -ENXIO;
1543
1544 end = endswith(start, ".slice");
1545 if (!end)
1546 return -ENXIO;
1547
1548 *end = 0;
1549 if (parse_uid(start, ret_uid) < 0)
1550 return -ENXIO;
1551
1552 return 0;
1553 }
1554
1555 int cg_pid_get_owner_uid(pid_t pid, uid_t *ret_uid) {
1556 _cleanup_free_ char *cgroup = NULL;
1557 int r;
1558
1559 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1560 if (r < 0)
1561 return r;
1562
1563 return cg_path_get_owner_uid(cgroup, ret_uid);
1564 }
1565
/* Extract the inner-most slice unit from a cgroup path. Returns the root slice if the
 * path contains no slice component at all. */
int cg_path_get_slice(const char *p, char **ret_slice) {
        const char *e = NULL;

        assert(p);
        assert(ret_slice);

        /* Finds the right-most slice unit from the beginning, but stops before we come to
         * the first non-slice unit. */

        for (;;) {
                const char *s;
                int n;

                /* n is the length of the next path component; s points at its first byte. */
                n = path_find_first_component(&p, /* accept_dot_dot = */ false, &s);
                if (n < 0)
                        return n;
                if (!valid_slice_name(s, n))
                        break;

                /* Remember the deepest slice component seen so far. */
                e = s;
        }

        if (e)
                return cg_path_decode_unit(e, ret_slice);

        /* No slice component found → report the root slice. */
        return strdup_to(ret_slice, SPECIAL_ROOT_SLICE);
}
1593
1594 int cg_pid_get_slice(pid_t pid, char **ret_slice) {
1595 _cleanup_free_ char *cgroup = NULL;
1596 int r;
1597
1598 assert(ret_slice);
1599
1600 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1601 if (r < 0)
1602 return r;
1603
1604 return cg_path_get_slice(cgroup, ret_slice);
1605 }
1606
/* Extract the inner-most slice from the user-specific part of a cgroup path. */
int cg_path_get_user_slice(const char *p, char **ret_slice) {
        const char *rest;

        assert(p);
        assert(ret_slice);

        rest = skip_user_prefix(p);
        if (!rest)
                return -ENXIO;

        /* What remains parses just like a system slice path, hence reuse that parser. */
        return cg_path_get_slice(rest, ret_slice);
}
1620
1621 int cg_pid_get_user_slice(pid_t pid, char **ret_slice) {
1622 _cleanup_free_ char *cgroup = NULL;
1623 int r;
1624
1625 assert(ret_slice);
1626
1627 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1628 if (r < 0)
1629 return r;
1630
1631 return cg_path_get_user_slice(cgroup, ret_slice);
1632 }
1633
bool cg_needs_escape(const char *p) {

        /* Checks if the specified path is a valid cgroup name by our rules, or if it must be escaped. Note
         * that we consider escaped cgroup names invalid here, as they need to be escaped a second time if
         * they shall be used. Also note that various names cannot be made valid by escaping even if we
         * return true here (because too long, or contain the forbidden character "/"). */

        if (!filename_is_valid(p))
                return true;

        /* '_' is our own escape character, '.' would collide with the kernel's attribute files. */
        if (IN_SET(p[0], '_', '.'))
                return true;

        /* Reserved file names that cgroupfs (v1) places in every cgroup directory. */
        if (STR_IN_SET(p, "notify_on_release", "release_agent", "tasks"))
                return true;

        /* All "cgroup.*" attribute files are owned by the kernel. */
        if (startswith(p, "cgroup."))
                return true;

        /* Likewise, "<controller>.*" files belong to the respective controller. */
        for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                const char *q;

                q = startswith(p, cgroup_controller_to_string(c));
                if (!q)
                        continue;

                if (q[0] == '.')
                        return true;
        }

        return false;
}
1666
1667 int cg_escape(const char *p, char **ret) {
1668 _cleanup_free_ char *n = NULL;
1669
1670 /* This implements very minimal escaping for names to be used as file names in the cgroup tree: any
1671 * name which might conflict with a kernel name or is prefixed with '_' is prefixed with a '_'. That
1672 * way, when reading cgroup names it is sufficient to remove a single prefixing underscore if there
1673 * is one. */
1674
1675 /* The return value of this function (unlike cg_unescape()) needs free()! */
1676
1677 if (cg_needs_escape(p)) {
1678 n = strjoin("_", p);
1679 if (!n)
1680 return -ENOMEM;
1681
1682 if (!filename_is_valid(n)) /* became invalid due to the prefixing? Or contained things like a slash that cannot be fixed by prefixing? */
1683 return -EINVAL;
1684 } else {
1685 n = strdup(p);
1686 if (!n)
1687 return -ENOMEM;
1688 }
1689
1690 *ret = TAKE_PTR(n);
1691 return 0;
1692 }
1693
/* Undo cg_escape(): strip a single leading '_' if present. The returned pointer aliases
 * the input string; unlike the result of cg_escape(), it must NOT be free()d. */
char *cg_unescape(const char *p) {
        assert(p);

        return (char*) (*p == '_' ? p + 1 : p);
}
1705
/* The set of characters permitted in a cgroup controller name: alphanumerics plus '_'. */
#define CONTROLLER_VALID                        \
        DIGITS LETTERS                          \
        "_"
1709
1710 bool cg_controller_is_valid(const char *p) {
1711 const char *t, *s;
1712
1713 if (!p)
1714 return false;
1715
1716 if (streq(p, SYSTEMD_CGROUP_CONTROLLER))
1717 return true;
1718
1719 s = startswith(p, "name=");
1720 if (s)
1721 p = s;
1722
1723 if (IN_SET(*p, 0, '_'))
1724 return false;
1725
1726 for (t = p; *t; t++)
1727 if (!strchr(CONTROLLER_VALID, *t))
1728 return false;
1729
1730 if (t - p > NAME_MAX)
1731 return false;
1732
1733 return true;
1734 }
1735
/* Convert a slice unit name (e.g. "foo-bar.slice") into the corresponding cgroup path
 * ("foo.slice/foo-bar.slice"), escaping each component on the way. The root slice maps to
 * the empty string. */
int cg_slice_to_path(const char *unit, char **ret) {
        _cleanup_free_ char *p = NULL, *s = NULL, *e = NULL;
        const char *dash;
        int r;

        assert(unit);
        assert(ret);

        if (streq(unit, SPECIAL_ROOT_SLICE))
                return strdup_to(ret, "");

        if (!unit_name_is_valid(unit, UNIT_NAME_PLAIN))
                return -EINVAL;

        if (!endswith(unit, ".slice"))
                return -EINVAL;

        r = unit_name_to_prefix(unit, &p);
        if (r < 0)
                return r;

        dash = strchr(p, '-');

        /* Don't allow initial dashes */
        if (dash == p)
                return -EINVAL;

        /* Each dash introduces one more ancestor slice: "foo-bar-baz.slice" nests under
         * "foo.slice/foo-bar.slice". Walk the dashes and emit one path component each. */
        while (dash) {
                _cleanup_free_ char *escaped = NULL;
                /* VLA sized for the prefix up to this dash plus the ".slice" suffix + NUL. */
                char n[dash - p + sizeof(".slice")];

#if HAS_FEATURE_MEMORY_SANITIZER
                /* msan doesn't instrument stpncpy, so it thinks
                 * n is later used uninitialized:
                 * https://github.com/google/sanitizers/issues/926
                 */
                zero(n);
#endif

                /* Don't allow trailing or double dashes */
                if (IN_SET(dash[1], 0, '-'))
                        return -EINVAL;

                strcpy(stpncpy(n, p, dash - p), ".slice");
                if (!unit_name_is_valid(n, UNIT_NAME_PLAIN))
                        return -EINVAL;

                r = cg_escape(n, &escaped);
                if (r < 0)
                        return r;

                if (!strextend(&s, escaped, "/"))
                        return -ENOMEM;

                dash = strchr(dash+1, '-');
        }

        /* Finally append the (escaped) full unit name itself as the leaf component. */
        r = cg_escape(unit, &e);
        if (r < 0)
                return r;

        if (!strextend(&s, e))
                return -ENOMEM;

        *ret = TAKE_PTR(s);
        return 0;
}
1803
1804 int cg_is_threaded(const char *path) {
1805 _cleanup_free_ char *fs = NULL, *contents = NULL;
1806 _cleanup_strv_free_ char **v = NULL;
1807 int r;
1808
1809 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "cgroup.type", &fs);
1810 if (r < 0)
1811 return r;
1812
1813 r = read_full_virtual_file(fs, &contents, NULL);
1814 if (r == -ENOENT)
1815 return false; /* Assume no. */
1816 if (r < 0)
1817 return r;
1818
1819 v = strv_split(contents, NULL);
1820 if (!v)
1821 return -ENOMEM;
1822
1823 /* If the cgroup is in the threaded mode, it contains "threaded".
1824 * If one of the parents or siblings is in the threaded mode, it may contain "invalid". */
1825 return strv_contains(v, "threaded") || strv_contains(v, "invalid");
1826 }
1827
1828 int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value) {
1829 _cleanup_free_ char *p = NULL;
1830 int r;
1831
1832 r = cg_get_path(controller, path, attribute, &p);
1833 if (r < 0)
1834 return r;
1835
1836 return write_string_file(p, value, WRITE_STRING_FILE_DISABLE_BUFFER);
1837 }
1838
1839 int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
1840 _cleanup_free_ char *p = NULL;
1841 int r;
1842
1843 r = cg_get_path(controller, path, attribute, &p);
1844 if (r < 0)
1845 return r;
1846
1847 return read_one_line_file(p, ret);
1848 }
1849
1850 int cg_get_attribute_as_uint64(const char *controller, const char *path, const char *attribute, uint64_t *ret) {
1851 _cleanup_free_ char *value = NULL;
1852 uint64_t v;
1853 int r;
1854
1855 assert(ret);
1856
1857 r = cg_get_attribute(controller, path, attribute, &value);
1858 if (r == -ENOENT)
1859 return -ENODATA;
1860 if (r < 0)
1861 return r;
1862
1863 if (streq(value, "max")) {
1864 *ret = CGROUP_LIMIT_MAX;
1865 return 0;
1866 }
1867
1868 r = safe_atou64(value, &v);
1869 if (r < 0)
1870 return r;
1871
1872 *ret = v;
1873 return 0;
1874 }
1875
1876 int cg_get_attribute_as_bool(const char *controller, const char *path, const char *attribute, bool *ret) {
1877 _cleanup_free_ char *value = NULL;
1878 int r;
1879
1880 assert(ret);
1881
1882 r = cg_get_attribute(controller, path, attribute, &value);
1883 if (r == -ENOENT)
1884 return -ENODATA;
1885 if (r < 0)
1886 return r;
1887
1888 r = parse_boolean(value);
1889 if (r < 0)
1890 return r;
1891
1892 *ret = r;
1893 return 0;
1894 }
1895
1896 int cg_get_owner(const char *path, uid_t *ret_uid) {
1897 _cleanup_free_ char *f = NULL;
1898 struct stat stats;
1899 int r;
1900
1901 assert(ret_uid);
1902
1903 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &f);
1904 if (r < 0)
1905 return r;
1906
1907 if (stat(f, &stats) < 0)
1908 return -errno;
1909
1910 r = stat_verify_directory(&stats);
1911 if (r < 0)
1912 return r;
1913
1914 *ret_uid = stats.st_uid;
1915 return 0;
1916 }
1917
int cg_get_keyed_attribute_full(
                const char *controller,
                const char *path,
                const char *attribute,
                char **keys,
                char **ret_values,
                CGroupKeyMode mode) {

        _cleanup_free_ char *filename = NULL, *contents = NULL;
        const char *p;
        size_t n, i, n_done = 0;
        char **v;
        int r;

        /* Reads one or more fields of a cgroup v2 keyed attribute file. The 'keys' parameter should be an strv with
         * all keys to retrieve. The 'ret_values' parameter should be passed as a string array with the same number of
         * entries as 'keys'. On success each entry will be set to the value of the matching key.
         *
         * If the attribute file doesn't exist at all returns ENOENT, if any key is not found returns ENXIO. If mode
         * is set to CG_KEY_MODE_GRACEFUL we ignore missing keys and return those that were parsed successfully. */

        r = cg_get_path(controller, path, attribute, &filename);
        if (r < 0)
                return r;

        r = read_full_file(filename, &contents, NULL);
        if (r < 0)
                return r;

        n = strv_length(keys);
        if (n == 0) /* No keys to retrieve? That's easy, we are done then */
                return 0;

        /* Let's build this up in a temporary array for now in order not to clobber the return parameter on failure */
        v = newa0(char*, n);

        /* Walk the file line by line; each line is "<key> <value>". */
        for (p = contents; *p;) {
                const char *w = NULL;

                /* Check whether this line starts with any key we still miss. */
                for (i = 0; i < n; i++)
                        if (!v[i]) {
                                w = first_word(p, keys[i]);
                                if (w)
                                        break;
                        }

                if (w) {
                        size_t l;

                        /* w points past the key; take the rest of the line as the value. */
                        l = strcspn(w, NEWLINE);
                        v[i] = strndup(w, l);
                        if (!v[i]) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        n_done++;
                        if (n_done >= n)
                                goto done;

                        p = w + l;
                } else
                        p += strcspn(p, NEWLINE);

                p += strspn(p, NEWLINE);
        }

        if (mode & CG_KEY_MODE_GRACEFUL)
                goto done;

        r = -ENXIO;

fail:
        free_many_charp(v, n);
        return r;

done:
        memcpy(ret_values, v, sizeof(char*) * n);
        if (mode & CG_KEY_MODE_GRACEFUL)
                return n_done;

        return 0;
}
2001
/* Render a controller mask as a space-separated string of controller names. An empty mask
 * yields a NULL string (and success). The returned string needs free(). */
int cg_mask_to_string(CGroupMask mask, char **ret) {
        _cleanup_free_ char *s = NULL;
        bool space = false;
        CGroupController c;
        size_t n = 0;

        assert(ret);

        if (mask == 0) {
                *ret = NULL;
                return 0;
        }

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                const char *k;
                size_t l;

                if (!FLAGS_SET(mask, CGROUP_CONTROLLER_TO_MASK(c)))
                        continue;

                k = cgroup_controller_to_string(c);
                l = strlen(k);

                /* Reserve room for an optional separator, the name and the trailing NUL. */
                if (!GREEDY_REALLOC(s, n + space + l + 1))
                        return -ENOMEM;

                /* Separate entries with a space, except before the first one. */
                if (space)
                        s[n] = ' ';
                memcpy(s + n + space, k, l);
                n += space + l;

                space = true;
        }

        /* mask was non-zero, hence at least one name must have been appended. */
        assert(s);

        s[n] = 0;
        *ret = TAKE_PTR(s);

        return 0;
}
2043
2044 int cg_mask_from_string(const char *value, CGroupMask *ret) {
2045 CGroupMask m = 0;
2046
2047 assert(ret);
2048 assert(value);
2049
2050 for (;;) {
2051 _cleanup_free_ char *n = NULL;
2052 CGroupController v;
2053 int r;
2054
2055 r = extract_first_word(&value, &n, NULL, 0);
2056 if (r < 0)
2057 return r;
2058 if (r == 0)
2059 break;
2060
2061 v = cgroup_controller_from_string(n);
2062 if (v < 0)
2063 continue;
2064
2065 m |= CGROUP_CONTROLLER_TO_MASK(v);
2066 }
2067
2068 *ret = m;
2069 return 0;
2070 }
2071
int cg_mask_supported_subtree(const char *root, CGroupMask *ret) {
        CGroupMask mask;
        int r;

        /* Determines the mask of supported cgroup controllers. Only includes controllers we can make sense of and that
         * are actually accessible. Only covers real controllers, i.e. not the CGROUP_CONTROLLER_BPF_xyz
         * pseudo-controllers. */

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                _cleanup_free_ char *controllers = NULL, *path = NULL;

                /* In the unified hierarchy we can read the supported and accessible controllers from
                 * the top-level cgroup attribute */

                r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, root, "cgroup.controllers", &path);
                if (r < 0)
                        return r;

                r = read_one_line_file(path, &controllers);
                if (r < 0)
                        return r;

                r = cg_mask_from_string(controllers, &mask);
                if (r < 0)
                        return r;

                /* Mask controllers that are not supported in unified hierarchy. */
                mask &= CGROUP_MASK_V2;

        } else {
                CGroupController c;

                /* In the legacy hierarchy, we check which hierarchies are accessible. */

                mask = 0;
                for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                        CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                        const char *n;

                        /* Skip the BPF pseudo-controllers, which have no v1 hierarchy. */
                        if (!FLAGS_SET(CGROUP_MASK_V1, bit))
                                continue;

                        n = cgroup_controller_to_string(c);
                        if (controller_is_v1_accessible(root, n) >= 0)
                                mask |= bit;
                }
        }

        *ret = mask;
        return 0;
}
2126
2127 int cg_mask_supported(CGroupMask *ret) {
2128 _cleanup_free_ char *root = NULL;
2129 int r;
2130
2131 r = cg_get_root_path(&root);
2132 if (r < 0)
2133 return r;
2134
2135 return cg_mask_supported_subtree(root, ret);
2136 }
2137
2138 int cg_kernel_controllers(Set **ret) {
2139 _cleanup_set_free_ Set *controllers = NULL;
2140 _cleanup_fclose_ FILE *f = NULL;
2141 int r;
2142
2143 assert(ret);
2144
2145 /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support
2146 * and controllers that aren't currently accessible (because not mounted). This does not include "name="
2147 * pseudo-controllers. */
2148
2149 r = fopen_unlocked("/proc/cgroups", "re", &f);
2150 if (r == -ENOENT) {
2151 *ret = NULL;
2152 return 0;
2153 }
2154 if (r < 0)
2155 return r;
2156
2157 /* Ignore the header line */
2158 (void) read_line(f, SIZE_MAX, NULL);
2159
2160 for (;;) {
2161 _cleanup_free_ char *controller = NULL;
2162 int enabled = 0;
2163
2164 if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
2165
2166 if (ferror(f))
2167 return -errno;
2168
2169 if (feof(f))
2170 break;
2171
2172 return -EBADMSG;
2173 }
2174
2175 if (!enabled)
2176 continue;
2177
2178 if (!cg_controller_is_valid(controller))
2179 return -EBADMSG;
2180
2181 r = set_ensure_consume(&controllers, &string_hash_ops_free, TAKE_PTR(controller));
2182 if (r < 0)
2183 return r;
2184 }
2185
2186 *ret = TAKE_PTR(controllers);
2187
2188 return 0;
2189 }
2190
/* The hybrid mode was initially implemented in v232 and simply mounted cgroup2 on
 * /sys/fs/cgroup/systemd. This unfortunately broke other tools (such as docker) which expected the v1
 * "name=systemd" hierarchy on /sys/fs/cgroup/systemd. From v233 and on, the hybrid mode mounts v2 on
 * /sys/fs/cgroup/unified and maintains "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility
 * with other tools.
 *
 * To keep live upgrade working, we detect and support v232 layout. When v232 layout is detected, to keep
 * cgroup v2 process management but disable the compat dual layout, we return true on
 * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and false on cg_hybrid_unified().
 */
/* Set by cg_unified_cached() when the v232 layout is detected; thread-local like the
 * cache in that function, so each thread probes the layout for itself. */
static thread_local bool unified_systemd_v232;
2202
2203 int cg_unified_cached(bool flush) {
2204 static thread_local CGroupUnified unified_cache = CGROUP_UNIFIED_UNKNOWN;
2205
2206 struct statfs fs;
2207
2208 /* Checks if we support the unified hierarchy. Returns an
2209 * error when the cgroup hierarchies aren't mounted yet or we
2210 * have any other trouble determining if the unified hierarchy
2211 * is supported. */
2212
2213 if (flush)
2214 unified_cache = CGROUP_UNIFIED_UNKNOWN;
2215 else if (unified_cache >= CGROUP_UNIFIED_NONE)
2216 return unified_cache;
2217
2218 if (statfs("/sys/fs/cgroup/", &fs) < 0)
2219 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/\") failed: %m");
2220
2221 if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2222 log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
2223 unified_cache = CGROUP_UNIFIED_ALL;
2224 } else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2225 if (statfs("/sys/fs/cgroup/unified/", &fs) == 0 &&
2226 F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2227 log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller");
2228 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2229 unified_systemd_v232 = false;
2230 } else {
2231 if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0) {
2232 if (errno == ENOENT) {
2233 /* Some other software may have set up /sys/fs/cgroup in a configuration we do not recognize. */
2234 log_debug_errno(errno, "Unsupported cgroupsv1 setup detected: name=systemd hierarchy not found.");
2235 return -ENOMEDIUM;
2236 }
2237 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/systemd\" failed: %m");
2238 }
2239
2240 if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2241 log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2242 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2243 unified_systemd_v232 = true;
2244 } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)) {
2245 log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2246 unified_cache = CGROUP_UNIFIED_NONE;
2247 } else {
2248 log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
2249 (unsigned long long) fs.f_type);
2250 unified_cache = CGROUP_UNIFIED_NONE;
2251 }
2252 }
2253 } else if (F_TYPE_EQUAL(fs.f_type, SYSFS_MAGIC)) {
2254 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
2255 "No filesystem is currently mounted on /sys/fs/cgroup.");
2256 } else
2257 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
2258 "Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
2259 (unsigned long long)fs.f_type);
2260
2261 return unified_cache;
2262 }
2263
2264 int cg_unified_controller(const char *controller) {
2265 int r;
2266
2267 r = cg_unified_cached(false);
2268 if (r < 0)
2269 return r;
2270
2271 if (r == CGROUP_UNIFIED_NONE)
2272 return false;
2273
2274 if (r >= CGROUP_UNIFIED_ALL)
2275 return true;
2276
2277 return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER);
2278 }
2279
2280 int cg_all_unified(void) {
2281 int r;
2282
2283 r = cg_unified_cached(false);
2284 if (r < 0)
2285 return r;
2286
2287 return r >= CGROUP_UNIFIED_ALL;
2288 }
2289
2290 int cg_hybrid_unified(void) {
2291 int r;
2292
2293 r = cg_unified_cached(false);
2294 if (r < 0)
2295 return r;
2296
2297 return r == CGROUP_UNIFIED_SYSTEMD && !unified_systemd_v232;
2298 }
2299
2300 int cg_is_delegated(const char *path) {
2301 int r;
2302
2303 assert(path);
2304
2305 r = cg_get_xattr_bool(path, "trusted.delegate");
2306 if (!ERRNO_IS_NEG_XATTR_ABSENT(r))
2307 return r;
2308
2309 /* If the trusted xattr isn't set (preferred), then check the untrusted one. Under the assumption
2310 * that whoever is trusted enough to own the cgroup, is also trusted enough to decide if it is
2311 * delegated or not this should be safe. */
2312 r = cg_get_xattr_bool(path, "user.delegate");
2313 return ERRNO_IS_NEG_XATTR_ABSENT(r) ? false : r;
2314 }
2315
2316 int cg_is_delegated_fd(int fd) {
2317 int r;
2318
2319 assert(fd >= 0);
2320
2321 r = getxattr_at_bool(fd, /* path= */ NULL, "trusted.delegate", /* flags= */ 0);
2322 if (!ERRNO_IS_NEG_XATTR_ABSENT(r))
2323 return r;
2324
2325 r = getxattr_at_bool(fd, /* path= */ NULL, "user.delegate", /* flags= */ 0);
2326 return ERRNO_IS_NEG_XATTR_ABSENT(r) ? false : r;
2327 }
2328
2329 int cg_has_coredump_receive(const char *path) {
2330 int r;
2331
2332 assert(path);
2333
2334 r = cg_get_xattr_bool(path, "user.coredump_receive");
2335 if (ERRNO_IS_NEG_XATTR_ABSENT(r))
2336 return false;
2337
2338 return r;
2339 }
2340
/* Default for each IO limit is "no limit". */
const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = {
        [CGROUP_IO_RBPS_MAX]    = CGROUP_LIMIT_MAX,
        [CGROUP_IO_WBPS_MAX]    = CGROUP_LIMIT_MAX,
        [CGROUP_IO_RIOPS_MAX]   = CGROUP_LIMIT_MAX,
        [CGROUP_IO_WIOPS_MAX]   = CGROUP_LIMIT_MAX,
};

/* Unit-file setting names corresponding to each IO limit type. */
static const char* const cgroup_io_limit_type_table[_CGROUP_IO_LIMIT_TYPE_MAX] = {
        [CGROUP_IO_RBPS_MAX]    = "IOReadBandwidthMax",
        [CGROUP_IO_WBPS_MAX]    = "IOWriteBandwidthMax",
        [CGROUP_IO_RIOPS_MAX]   = "IOReadIOPSMax",
        [CGROUP_IO_WIOPS_MAX]   = "IOWriteIOPSMax",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type, CGroupIOLimitType);
2356
2357 bool is_cgroup_fs(const struct statfs *s) {
2358 return is_fs_type(s, CGROUP_SUPER_MAGIC) ||
2359 is_fs_type(s, CGROUP2_SUPER_MAGIC);
2360 }
2361
/* Check whether the fd refers to a cgroup (v1 or v2) file system.
 *
 * Fix: this is a bool function, but previously returned -errno on fstatfs() failure —
 * any non-zero value converts to true in bool context, so errors were misreported as
 * "yes, it is a cgroup fs". Report false instead when we cannot tell. */
bool fd_is_cgroup_fs(int fd) {
        struct statfs s;

        if (fstatfs(fd, &s) < 0)
                return false;

        return is_cgroup_fs(&s);
}
2370
/* Canonical names for all controllers we know, including our BPF pseudo-controllers
 * (which have no kernel hierarchy of their own). */
static const char *const cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
        [CGROUP_CONTROLLER_CPU] = "cpu",
        [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
        [CGROUP_CONTROLLER_CPUSET] = "cpuset",
        [CGROUP_CONTROLLER_IO] = "io",
        [CGROUP_CONTROLLER_BLKIO] = "blkio",
        [CGROUP_CONTROLLER_MEMORY] = "memory",
        [CGROUP_CONTROLLER_DEVICES] = "devices",
        [CGROUP_CONTROLLER_PIDS] = "pids",
        [CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall",
        [CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices",
        [CGROUP_CONTROLLER_BPF_FOREIGN] = "bpf-foreign",
        [CGROUP_CONTROLLER_BPF_SOCKET_BIND] = "bpf-socket-bind",
        [CGROUP_CONTROLLER_BPF_RESTRICT_NETWORK_INTERFACES] = "bpf-restrict-network-interfaces",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);
2388
CGroupMask get_cpu_accounting_mask(void) {
        /* Lazily computed cache; (CGroupMask) -1 marks "not determined yet".
         * NOTE(review): not atomic — concurrent first calls may both compute it, which
         * looks harmless since the result is deterministic, but confirm if this is ever
         * called from multiple threads before first initialization. */
        static CGroupMask needed_mask = (CGroupMask) -1;

        /* On kernel ≥4.15 with unified hierarchy, cpu.stat's usage_usec is
         * provided externally from the CPU controller, which means we don't
         * need to enable the CPU controller just to get metrics. This is good,
         * because enabling the CPU controller comes at a minor performance
         * hit, especially when it's propagated deep into large hierarchies.
         * There's also no separate CPU accounting controller available within
         * a unified hierarchy.
         *
         * This combination of factors results in the desired cgroup mask to
         * enable for CPU accounting varying as follows:
         *
         *                   ╔═════════════════════╤═════════════════════╗
         *                   ║     Linux ≥4.15     │     Linux <4.15     ║
         *   ╔═══════════════╬═════════════════════╪═════════════════════╣
         *   ║ Unified       ║ nothing             │ CGROUP_MASK_CPU     ║
         *   ╟───────────────╫─────────────────────┼─────────────────────╢
         *   ║ Hybrid/Legacy ║ CGROUP_MASK_CPUACCT │ CGROUP_MASK_CPUACCT ║
         *   ╚═══════════════╩═════════════════════╧═════════════════════╝
         *
         * We check kernel version here instead of manually checking whether
         * cpu.stat is present for every cgroup, as that check in itself would
         * already be fairly expensive.
         *
         * Kernels where this patch has been backported will therefore have the
         * CPU controller enabled unnecessarily. This is more expensive than
         * necessary, but harmless. ☺️
         */

        if (needed_mask == (CGroupMask) -1) {
                if (cg_all_unified()) {
                        struct utsname u;
                        assert_se(uname(&u) >= 0);

                        if (strverscmp_improved(u.release, "4.15") < 0)
                                needed_mask = CGROUP_MASK_CPU;
                        else
                                needed_mask = 0;
                } else
                        needed_mask = CGROUP_MASK_CPUACCT;
        }

        return needed_mask;
}
2435
2436 bool cpu_accounting_is_cheap(void) {
2437 return get_cpu_accounting_mask() == 0;
2438 }
2439
/* String representations of the ManagedOOM* unit settings, as consumed by systemd-oomd. */
static const char* const managed_oom_mode_table[_MANAGED_OOM_MODE_MAX] = {
        [MANAGED_OOM_AUTO] = "auto",
        [MANAGED_OOM_KILL] = "kill",
};

DEFINE_STRING_TABLE_LOOKUP(managed_oom_mode, ManagedOOMMode);

/* Preference values controlling which cgroups oomd targets first (or never). */
static const char* const managed_oom_preference_table[_MANAGED_OOM_PREFERENCE_MAX] = {
        [MANAGED_OOM_PREFERENCE_NONE] = "none",
        [MANAGED_OOM_PREFERENCE_AVOID] = "avoid",
        [MANAGED_OOM_PREFERENCE_OMIT] = "omit",
};

DEFINE_STRING_TABLE_LOOKUP(managed_oom_preference, ManagedOOMPreference);