]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/basic/cgroup-util.c
c0d0fe6f1484a5924a21cd3ade9fc2e80e133e0c
[thirdparty/systemd.git] / src / basic / cgroup-util.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <limits.h>
5 #include <signal.h>
6 #include <stddef.h>
7 #include <stdlib.h>
8 #include <sys/types.h>
9 #include <sys/utsname.h>
10 #include <sys/xattr.h>
11 #include <unistd.h>
12
13 #include "alloc-util.h"
14 #include "cgroup-util.h"
15 #include "constants.h"
16 #include "dirent-util.h"
17 #include "extract-word.h"
18 #include "fd-util.h"
19 #include "fileio.h"
20 #include "format-util.h"
21 #include "fs-util.h"
22 #include "log.h"
23 #include "login-util.h"
24 #include "macro.h"
25 #include "missing_fs.h"
26 #include "missing_magic.h"
27 #include "missing_threads.h"
28 #include "mkdir.h"
29 #include "parse-util.h"
30 #include "path-util.h"
31 #include "process-util.h"
32 #include "set.h"
33 #include "special.h"
34 #include "stat-util.h"
35 #include "stdio-util.h"
36 #include "string-table.h"
37 #include "string-util.h"
38 #include "strv.h"
39 #include "unit-name.h"
40 #include "user-util.h"
41 #include "xattr-util.h"
42
43 int cg_path_open(const char *controller, const char *path) {
44 _cleanup_free_ char *fs = NULL;
45 int r;
46
47 r = cg_get_path(controller, path, /* item=*/ NULL, &fs);
48 if (r < 0)
49 return r;
50
51 return RET_NERRNO(open(fs, O_DIRECTORY|O_CLOEXEC));
52 }
53
/* Opens the cgroup with the given kernel cgroup ID as an O_DIRECTORY fd, using
 * open_by_handle_at(). If 'cgroupfs_fd' is negative, a temporary fd to /sys/fs/cgroup
 * is opened (and closed again on return). Returns the new fd on success, a negative
 * errno-style error on failure. */
int cg_cgroupid_open(int cgroupfs_fd, uint64_t id) {
        _cleanup_close_ int fsfd = -EBADF;

        if (cgroupfs_fd < 0) {
                fsfd = open("/sys/fs/cgroup", O_CLOEXEC|O_DIRECTORY);
                if (fsfd < 0)
                        return -errno;

                cgroupfs_fd = fsfd;
        }

        /* Encode the cgroup ID into a kernel file handle; cgroupfs can resolve these */
        cg_file_handle fh = CG_FILE_HANDLE_INIT;
        CG_FILE_HANDLE_CGROUPID(fh) = id;

        int fd = open_by_handle_at(cgroupfs_fd, &fh.file_handle, O_DIRECTORY|O_CLOEXEC);
        if (fd < 0)
                return -errno;

        return fd;
}
74
75 static int cg_enumerate_items(const char *controller, const char *path, FILE **ret, const char *item) {
76 _cleanup_free_ char *fs = NULL;
77 FILE *f;
78 int r;
79
80 assert(ret);
81
82 r = cg_get_path(controller, path, item, &fs);
83 if (r < 0)
84 return r;
85
86 f = fopen(fs, "re");
87 if (!f)
88 return -errno;
89
90 *ret = f;
91 return 0;
92 }
93
/* Opens the "cgroup.procs" attribute of the given cgroup for reading. */
int cg_enumerate_processes(const char *controller, const char *path, FILE **ret) {
        return cg_enumerate_items(controller, path, ret, "cgroup.procs");
}
97
/* Reads the next PID from an already-opened "cgroup.procs" stream.
 * Returns > 0 and stores the PID in *ret when one was read, 0 (with *ret = 0) at EOF,
 * or a negative errno-style error on read failure. */
int cg_read_pid(FILE *f, pid_t *ret, CGroupFlags flags) {
        unsigned long ul;

        /* Note that the cgroup.procs might contain duplicates! See cgroups.txt for details. */

        assert(f);
        assert(ret);

        for (;;) {
                /* Clear errno first so we can distinguish scan failure from EOF below */
                errno = 0;
                if (fscanf(f, "%lu", &ul) != 1) {

                        if (feof(f)) {
                                *ret = 0;
                                return 0;
                        }

                        return errno_or_else(EIO);
                }

                /* Reject values not representable as pid_t */
                if (ul > PID_T_MAX)
                        return -EIO;

                /* In some circumstances (e.g. WSL), cgroups might contain unmappable PIDs from other
                 * contexts. These show up as zeros, and depending on the caller, can either be plain
                 * skipped over, or returned as-is. */
                if (ul == 0 && !FLAGS_SET(flags, CGROUP_DONT_SKIP_UNMAPPED))
                        continue;

                *ret = (pid_t) ul;
                return 1;
        }
}
131
/* Like cg_read_pid(), but pins the process into a PidRef. Unmappable (zero) PIDs that
 * the caller asked not to skip are reported as -EREMOTE; PIDs that vanished between
 * reading and pinning are silently skipped. Returns > 0 on success, 0 (with *ret set
 * to PIDREF_NULL) at EOF, negative errno-style error otherwise. */
int cg_read_pidref(FILE *f, PidRef *ret, CGroupFlags flags) {
        int r;

        assert(f);
        assert(ret);

        for (;;) {
                pid_t pid;

                r = cg_read_pid(f, &pid, flags);
                if (r < 0)
                        return r;
                if (r == 0) {
                        *ret = PIDREF_NULL;
                        return 0;
                }

                /* A zero PID here means an unmappable PID from a foreign context (only
                 * returned with CGROUP_DONT_SKIP_UNMAPPED) — it cannot be pinned */
                if (pid == 0)
                        return -EREMOTE;

                r = pidref_set_pid(ret, pid);
                if (r >= 0)
                        return 1;
                if (r != -ESRCH)
                        return r;

                /* ESRCH → gone by now? just skip over it, read the next */
        }
}
161
/* Looks up the value of the given key (e.g. "populated") in the "cgroup.events"
 * attribute of the specified cgroup and returns it (duplicated) in *ret.
 * Returns -ENOENT if the key is not present, -EINVAL on a malformed line. */
int cg_read_event(
                const char *controller,
                const char *path,
                const char *event,
                char **ret) {

        _cleanup_free_ char *events = NULL, *content = NULL;
        int r;

        r = cg_get_path(controller, path, "cgroup.events", &events);
        if (r < 0)
                return r;

        r = read_full_virtual_file(events, &content, NULL);
        if (r < 0)
                return r;

        /* The file consists of "key value" lines; scan them one by one */
        for (const char *p = content;;) {
                _cleanup_free_ char *line = NULL, *key = NULL;
                const char *q;

                r = extract_first_word(&p, &line, "\n", 0);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -ENOENT;

                q = line;
                r = extract_first_word(&q, &key, " ", 0);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EINVAL;

                if (!streq(key, event))
                        continue;

                /* 'q' now points at the remainder of the line, i.e. the value */
                return strdup_to(ret, q);
        }
}
202
203 bool cg_ns_supported(void) {
204 static thread_local int enabled = -1;
205
206 if (enabled >= 0)
207 return enabled;
208
209 if (access("/proc/self/ns/cgroup", F_OK) < 0) {
210 if (errno != ENOENT)
211 log_debug_errno(errno, "Failed to check whether /proc/self/ns/cgroup is available, assuming not: %m");
212 enabled = false;
213 } else
214 enabled = true;
215
216 return enabled;
217 }
218
219 bool cg_freezer_supported(void) {
220 static thread_local int supported = -1;
221
222 if (supported >= 0)
223 return supported;
224
225 supported = cg_all_unified() > 0 && access("/sys/fs/cgroup/init.scope/cgroup.freeze", F_OK) == 0;
226
227 return supported;
228 }
229
/* Checks (and caches per-thread) whether the kernel supports the cgroup v2
 * "cgroup.kill" attribute. Only available on the unified hierarchy. */
bool cg_kill_supported(void) {
        static thread_local int supported = -1;

        if (supported >= 0)
                return supported;

        if (cg_all_unified() <= 0)
                supported = false;
        else if (access("/sys/fs/cgroup/init.scope/cgroup.kill", F_OK) < 0) {
                if (errno != ENOENT)
                        log_debug_errno(errno, "Failed to check if cgroup.kill is available, assuming not: %m");
                supported = false;
        } else
                supported = true;

        return supported;
}
247
248 int cg_enumerate_subgroups(const char *controller, const char *path, DIR **ret) {
249 _cleanup_free_ char *fs = NULL;
250 DIR *d;
251 int r;
252
253 assert(ret);
254
255 /* This is not recursive! */
256
257 r = cg_get_path(controller, path, NULL, &fs);
258 if (r < 0)
259 return r;
260
261 d = opendir(fs);
262 if (!d)
263 return -errno;
264
265 *ret = d;
266 return 0;
267 }
268
/* Returns the name of the next subdirectory (i.e. child cgroup) read from 'd',
 * skipping "." and "..". On success returns the result of strdup_to_full() with the
 * duplicated name stored in *ret (caller frees); returns 0 with *ret = NULL when the
 * directory is exhausted, negative errno-style error on read failure. */
int cg_read_subgroup(DIR *d, char **ret) {
        assert(d);
        assert(ret);

        FOREACH_DIRENT_ALL(de, d, return -errno) {
                /* Only directories can be child cgroups */
                if (de->d_type != DT_DIR)
                        continue;

                if (dot_or_dot_dot(de->d_name))
                        continue;

                return strdup_to_full(ret, de->d_name);
        }

        *ret = NULL;
        return 0;
}
286
/* Removes the (empty) cgroup directory for 'path' in the given controller's
 * hierarchy. A missing directory is not considered an error. In hybrid mode,
 * removal of a systemd-hierarchy cgroup is mirrored on the legacy compat
 * hierarchy, best-effort (failures are only logged). */
int cg_rmdir(const char *controller, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        r = cg_get_path(controller, path, NULL, &p);
        if (r < 0)
                return r;

        r = rmdir(p);
        if (r < 0 && errno != ENOENT)
                return -errno;

        /* Only continue if we are in hybrid mode (i.e. a legacy compat hierarchy exists) */
        r = cg_hybrid_unified();
        if (r <= 0)
                return r;

        if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
                if (r < 0)
                        log_warning_errno(r, "Failed to remove compat systemd cgroup %s: %m", path);
        }

        return 0;
}
311
/* Sends 'sig' to every PID listed in the given attribute ('item', i.e. "cgroup.procs"
 * or "cgroup.threads") of the cgroup at 'path', non-recursively. PIDs already present
 * in the set 's' are skipped, and every PID processed is added to 's', so repeated
 * calls with the same set won't signal the same process twice. Iterates until no new
 * PIDs show up, to catch processes that fork while we kill. If 'log_kill' is given it
 * is invoked for each process about to be killed. Returns > 0 if at least one process
 * was signalled (or the last log_kill() result), 0 if none, the first error otherwise
 * (errors are gathered, not fatal per-PID). */
static int cg_kill_items(
                const char *path,
                int sig,
                CGroupFlags flags,
                Set *s,
                cg_kill_log_func_t log_kill,
                void *userdata,
                const char *item) {

        _cleanup_set_free_ Set *allocated_set = NULL;
        bool done = false;
        int r, ret = 0, ret_log_kill = 0;

        assert(sig >= 0);

        /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
         * SIGCONT on SIGKILL. */
        if (IN_SET(sig, SIGCONT, SIGKILL))
                flags &= ~CGROUP_SIGCONT;

        /* This goes through the tasks list and kills them all. This
         * is repeated until no further processes are added to the
         * tasks list, to properly handle forking processes */

        if (!s) {
                /* No caller-provided dedup set: use a temporary one for this call only */
                s = allocated_set = set_new(NULL);
                if (!s)
                        return -ENOMEM;
        }

        do {
                _cleanup_fclose_ FILE *f = NULL;
                done = true;

                r = cg_enumerate_items(SYSTEMD_CGROUP_CONTROLLER, path, &f, item);
                if (r == -ENOENT)
                        break; /* cgroup is gone already — nothing left to kill */
                if (r < 0)
                        return RET_GATHER(ret, r);

                for (;;) {
                        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;

                        r = cg_read_pidref(f, &pidref, /* flags = */ 0);
                        if (r < 0)
                                return RET_GATHER(ret, r);
                        if (r == 0)
                                break;

                        if ((flags & CGROUP_IGNORE_SELF) && pidref_is_self(&pidref))
                                continue;

                        /* Skip PIDs we already signalled in an earlier pass */
                        if (set_get(s, PID_TO_PTR(pidref.pid)) == PID_TO_PTR(pidref.pid))
                                continue;

                        if (log_kill)
                                ret_log_kill = log_kill(&pidref, sig, userdata);

                        /* If we haven't killed this process yet, kill it */
                        r = pidref_kill(&pidref, sig);
                        if (r < 0 && r != -ESRCH)
                                RET_GATHER(ret, r);
                        if (r >= 0) {
                                if (flags & CGROUP_SIGCONT)
                                        (void) pidref_kill(&pidref, SIGCONT);

                                /* Record (once) that we actually killed something */
                                if (ret == 0) {
                                        if (log_kill)
                                                ret = ret_log_kill;
                                        else
                                                ret = 1;
                                }
                        }

                        done = false;

                        r = set_put(s, PID_TO_PTR(pidref.pid));
                        if (r < 0)
                                return RET_GATHER(ret, r);
                }

                /* To avoid racing against processes which fork quicker than we can kill them, we repeat this
                 * until no new pids need to be killed. */

        } while (!done);

        return ret;
}
400
/* Sends 'sig' to all processes in the cgroup at 'path' (non-recursively). For SIGKILL
 * on the unified hierarchy, additionally signals "cgroup.threads" entries to work
 * around an old kernel bug (see comment below). Returns > 0 if anything was killed. */
int cg_kill(
                const char *path,
                int sig,
                CGroupFlags flags,
                Set *s,
                cg_kill_log_func_t log_kill,
                void *userdata) {

        int r, ret;

        r = cg_kill_items(path, sig, flags, s, log_kill, userdata, "cgroup.procs");
        if (r < 0 || sig != SIGKILL)
                return r;

        ret = r;

        /* Only in case of killing with SIGKILL and when using cgroupsv2, kill remaining threads manually as
           a workaround for kernel bug. It was fixed in 5.2-rc5 (c03cd7738a83), backported to 4.19.66
           (4340d175b898) and 4.14.138 (feb6b123b7dd). */
        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r == 0)
                return ret;

        r = cg_kill_items(path, sig, flags, s, log_kill, userdata, "cgroup.threads");
        if (r < 0)
                return r;

        return r > 0 || ret > 0;
}
432
433 int cg_kill_kernel_sigkill(const char *path) {
434 /* Kills the cgroup at `path` directly by writing to its cgroup.kill file. This sends SIGKILL to all
435 * processes in the cgroup and has the advantage of being completely atomic, unlike cg_kill_items(). */
436
437 _cleanup_free_ char *killfile = NULL;
438 int r;
439
440 assert(path);
441
442 if (!cg_kill_supported())
443 return -EOPNOTSUPP;
444
445 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "cgroup.kill", &killfile);
446 if (r < 0)
447 return r;
448
449 r = write_string_file(killfile, "1", WRITE_STRING_FILE_DISABLE_BUFFER);
450 if (r < 0)
451 return r;
452
453 return 0;
454 }
455
/* Sends 'sig' to all processes in the cgroup at 'path' and all of its descendants.
 * Uses the atomic cgroup.kill attribute when possible (SIGKILL, no dedup set, no
 * logging callback, self not excluded), otherwise falls back to recursive
 * enumeration via cg_kill(). With CGROUP_REMOVE set, also removes the cgroup
 * afterwards (ENOENT/EBUSY from removal are tolerated). Errors are gathered; the
 * first one encountered is returned, but the traversal continues where safe. */
int cg_kill_recursive(
                const char *path,
                int sig,
                CGroupFlags flags,
                Set *s,
                cg_kill_log_func_t log_kill,
                void *userdata) {

        int r, ret;

        assert(path);
        assert(sig >= 0);

        if (sig == SIGKILL && cg_kill_supported() &&
            !FLAGS_SET(flags, CGROUP_IGNORE_SELF) && !s && !log_kill)
                /* ignore CGROUP_SIGCONT, since this is a no-op alongside SIGKILL */
                ret = cg_kill_kernel_sigkill(path);
        else {
                _cleanup_set_free_ Set *allocated_set = NULL;
                _cleanup_closedir_ DIR *d = NULL;

                /* Share one dedup set across the whole recursion if the caller gave none */
                if (!s) {
                        s = allocated_set = set_new(NULL);
                        if (!s)
                                return -ENOMEM;
                }

                ret = cg_kill(path, sig, flags, s, log_kill, userdata);

                r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
                if (r < 0) {
                        if (r != -ENOENT)
                                RET_GATHER(ret, r);

                        return ret;
                }

                /* Recurse into each child cgroup */
                for (;;) {
                        _cleanup_free_ char *fn = NULL, *p = NULL;

                        r = cg_read_subgroup(d, &fn);
                        if (r < 0) {
                                RET_GATHER(ret, r);
                                break;
                        }
                        if (r == 0)
                                break;

                        p = path_join(empty_to_root(path), fn);
                        if (!p)
                                return -ENOMEM;

                        r = cg_kill_recursive(p, sig, flags, s, log_kill, userdata);
                        if (r != 0 && ret >= 0)
                                ret = r;
                }
        }

        if (FLAGS_SET(flags, CGROUP_REMOVE)) {
                r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER, path);
                if (!IN_SET(r, -ENOENT, -EBUSY))
                        RET_GATHER(ret, r);
        }

        return ret;
}
522
523 static const char *controller_to_dirname(const char *controller) {
524 assert(controller);
525
526 /* Converts a controller name to the directory name below /sys/fs/cgroup/ we want to mount it
527 * to. Effectively, this just cuts off the name= prefixed used for named hierarchies, if it is
528 * specified. */
529
530 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
531 if (cg_hybrid_unified() > 0)
532 controller = SYSTEMD_CGROUP_CONTROLLER_HYBRID;
533 else
534 controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
535 }
536
537 return startswith(controller, "name=") ?: controller;
538 }
539
/* Builds the full cgroupfs path for a legacy (v1) hierarchy:
 * /sys/fs/cgroup/<controller-dir>[/<path>][/<suffix>]. */
static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **ret) {
        const char *dn;
        char *joined;

        assert(ret);
        assert(controller);

        dn = controller_to_dirname(controller);

        if (isempty(path))
                joined = isempty(suffix) ? path_join("/sys/fs/cgroup", dn)
                                         : path_join("/sys/fs/cgroup", dn, suffix);
        else
                joined = isempty(suffix) ? path_join("/sys/fs/cgroup", dn, path)
                                         : path_join("/sys/fs/cgroup", dn, path, suffix);
        if (!joined)
                return -ENOMEM;

        *ret = joined;
        return 0;
}
563
/* Builds the full cgroupfs path on the unified (v2) hierarchy:
 * /sys/fs/cgroup[/<path>][/<suffix>]. */
static int join_path_unified(const char *path, const char *suffix, char **ret) {
        char *joined;

        assert(ret);

        if (isempty(path))
                joined = isempty(suffix) ? strdup("/sys/fs/cgroup")
                                         : path_join("/sys/fs/cgroup", suffix);
        else
                joined = isempty(suffix) ? path_join("/sys/fs/cgroup", path)
                                         : path_join("/sys/fs/cgroup", path, suffix);
        if (!joined)
                return -ENOMEM;

        *ret = joined;
        return 0;
}
583
/* Composes the cgroupfs filesystem path for the given controller, cgroup path and
 * attribute file ('suffix'). With a NULL controller, just joins path and suffix
 * without any /sys/fs/cgroup prefix. The result is path-simplified and returned in
 * *ret (caller frees). */
int cg_get_path(const char *controller, const char *path, const char *suffix, char **ret) {
        int r;

        assert(ret);

        if (!controller) {
                char *t;

                /* If no controller is specified, we return the path *below* the controllers, without any
                 * prefix. */

                if (isempty(path) && isempty(suffix))
                        return -EINVAL;

                if (isempty(suffix))
                        t = strdup(path);
                else if (isempty(path))
                        t = strdup(suffix);
                else
                        t = path_join(path, suffix);
                if (!t)
                        return -ENOMEM;

                *ret = path_simplify(t);
                return 0;
        }

        if (!cg_controller_is_valid(controller))
                return -EINVAL;

        /* Pick the joiner matching the cgroup setup (v2 unified vs. v1/hybrid) */
        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                r = join_path_unified(path, suffix, ret);
        else
                r = join_path_legacy(controller, path, suffix, ret);
        if (r < 0)
                return r;

        path_simplify(*ret);
        return 0;
}
627
/* Checks whether the given v1 controller hierarchy is mounted and accessible.
 * With a non-NULL 'root', additionally checks that the subcgroup exists there and
 * that we may modify it (write access to its cgroup.procs). Returns 0 on success,
 * negative errno-style error otherwise. */
static int controller_is_v1_accessible(const char *root, const char *controller) {
        const char *cpath, *dn;

        assert(controller);

        dn = controller_to_dirname(controller);

        /* If root if specified, we check that:
         * - possible subcgroup is created at root,
         * - we can modify the hierarchy. */

        /* Note: with root == NULL the trailing NULL terminates strjoina()'s argument
         * list early, so cpath is just the hierarchy mount point */
        cpath = strjoina("/sys/fs/cgroup/", dn, root, root ? "/cgroup.procs" : NULL);
        return laccess(cpath, root ? W_OK : F_OK);
}
642
/* Like cg_get_path(), but first verifies that the controller is actually usable:
 * on the unified hierarchy named hierarchies are rejected with -EOPNOTSUPP, on
 * legacy setups the controller's v1 hierarchy must be accessible. */
int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **ret) {
        int r;

        assert(controller);
        assert(ret);

        if (!cg_controller_is_valid(controller))
                return -EINVAL;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                /* In the unified hierarchy all controllers are considered accessible,
                 * except for the named hierarchies */
                if (startswith(controller, "name="))
                        return -EOPNOTSUPP;
        } else {
                /* Check if the specified controller is actually accessible */
                r = controller_is_v1_accessible(NULL, controller);
                if (r < 0)
                        return r;
        }

        return cg_get_path(controller, path, suffix, ret);
}
669
670 int cg_set_xattr(const char *path, const char *name, const void *value, size_t size, int flags) {
671 _cleanup_free_ char *fs = NULL;
672 int r;
673
674 assert(path);
675 assert(name);
676 assert(value || size <= 0);
677
678 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
679 if (r < 0)
680 return r;
681
682 return RET_NERRNO(setxattr(fs, name, value, size, flags));
683 }
684
685 int cg_get_xattr(const char *path, const char *name, void *value, size_t size) {
686 _cleanup_free_ char *fs = NULL;
687 ssize_t n;
688 int r;
689
690 assert(path);
691 assert(name);
692
693 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
694 if (r < 0)
695 return r;
696
697 n = getxattr(fs, name, value, size);
698 if (n < 0)
699 return -errno;
700
701 return (int) n;
702 }
703
704 int cg_get_xattr_malloc(const char *path, const char *name, char **ret) {
705 _cleanup_free_ char *fs = NULL;
706 int r;
707
708 assert(path);
709 assert(name);
710
711 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
712 if (r < 0)
713 return r;
714
715 return lgetxattr_malloc(fs, name, ret);
716 }
717
718 int cg_get_xattr_bool(const char *path, const char *name) {
719 _cleanup_free_ char *fs = NULL;
720 int r;
721
722 assert(path);
723 assert(name);
724
725 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
726 if (r < 0)
727 return r;
728
729 return getxattr_at_bool(AT_FDCWD, fs, name, /* flags= */ 0);
730 }
731
732 int cg_remove_xattr(const char *path, const char *name) {
733 _cleanup_free_ char *fs = NULL;
734 int r;
735
736 assert(path);
737 assert(name);
738
739 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
740 if (r < 0)
741 return r;
742
743 return RET_NERRNO(removexattr(fs, name));
744 }
745
/* Determines the cgroup path of the given process in the given controller's
 * hierarchy, by parsing /proc/<pid>/cgroup. A NULL controller means the systemd
 * hierarchy. Returns -ESRCH if the process does not exist, -ENODATA if no matching
 * hierarchy line was found. The path is returned in *ret_path (caller frees). */
int cg_pid_get_path(const char *controller, pid_t pid, char **ret_path) {
        _cleanup_fclose_ FILE *f = NULL;
        const char *fs, *controller_str = NULL; /* avoid false maybe-uninitialized warning */
        int unified, r;

        assert(pid >= 0);
        assert(ret_path);

        if (controller) {
                if (!cg_controller_is_valid(controller))
                        return -EINVAL;
        } else
                controller = SYSTEMD_CGROUP_CONTROLLER;

        unified = cg_unified_controller(controller);
        if (unified < 0)
                return unified;
        if (unified == 0) {
                /* On legacy, the controller name as it appears in /proc/<pid>/cgroup */
                if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
                        controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
                else
                        controller_str = controller;
        }

        fs = procfs_file_alloca(pid, "cgroup");
        r = fopen_unlocked(fs, "re", &f);
        if (r == -ENOENT)
                return -ESRCH;
        if (r < 0)
                return r;

        for (;;) {
                _cleanup_free_ char *line = NULL;
                char *e;

                r = read_line(f, LONG_LINE_MAX, &line);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -ENODATA;

                if (unified) {
                        /* Unified format: "0::/path" */
                        e = startswith(line, "0:");
                        if (!e)
                                continue;

                        e = strchr(e, ':');
                        if (!e)
                                continue;
                } else {
                        /* Legacy format: "<id>:<controllers>:/path" */
                        char *l;

                        l = strchr(line, ':');
                        if (!l)
                                continue;

                        l++;
                        e = strchr(l, ':');
                        if (!e)
                                continue;
                        *e = 0;

                        /* Only take the line listing our controller (comma-separated list) */
                        assert(controller_str);
                        r = string_contains_word(l, ",", controller_str);
                        if (r < 0)
                                return r;
                        if (r == 0)
                                continue;
                }

                char *path = strdup(e + 1);
                if (!path)
                        return -ENOMEM;

                /* Truncate suffix indicating the process is a zombie */
                e = endswith(path, " (deleted)");
                if (e)
                        *e = 0;

                *ret_path = path;
                return 0;
        }
}
829
830 int cg_pidref_get_path(const char *controller, const PidRef *pidref, char **ret_path) {
831 _cleanup_free_ char *path = NULL;
832 int r;
833
834 assert(ret_path);
835
836 if (!pidref_is_set(pidref))
837 return -ESRCH;
838
839 r = cg_pid_get_path(controller, pidref->pid, &path);
840 if (r < 0)
841 return r;
842
843 /* Before we return the path, make sure the procfs entry for this pid still matches the pidref */
844 r = pidref_verify(pidref);
845 if (r < 0)
846 return r;
847
848 *ret_path = TAKE_PTR(path);
849 return 0;
850 }
851
/* Installs 'agent' as the release agent of the given legacy (v1) hierarchy and turns
 * on release notifications. Returns 1 if notifications were newly enabled, 0 if
 * everything was already in place, -EEXIST if a different agent is installed,
 * -EOPNOTSUPP on the unified hierarchy, -EIO on an unexpected notify_on_release
 * value. */
int cg_install_release_agent(const char *controller, const char *agent) {
        _cleanup_free_ char *fs = NULL, *contents = NULL;
        const char *sc;
        int r;

        assert(agent);

        r = cg_unified_controller(controller);
        if (r < 0)
                return r;
        if (r > 0) /* doesn't apply to unified hierarchy */
                return -EOPNOTSUPP;

        r = cg_get_path(controller, NULL, "release_agent", &fs);
        if (r < 0)
                return r;

        r = read_one_line_file(fs, &contents);
        if (r < 0)
                return r;

        /* Install our agent only if none is set yet; a different one is a conflict */
        sc = strstrip(contents);
        if (isempty(sc)) {
                r = write_string_file(fs, agent, WRITE_STRING_FILE_DISABLE_BUFFER);
                if (r < 0)
                        return r;
        } else if (!path_equal(sc, agent))
                return -EEXIST;

        /* Now make sure release notifications are enabled */
        fs = mfree(fs);
        r = cg_get_path(controller, NULL, "notify_on_release", &fs);
        if (r < 0)
                return r;

        contents = mfree(contents);
        r = read_one_line_file(fs, &contents);
        if (r < 0)
                return r;

        sc = strstrip(contents);
        if (streq(sc, "0")) {
                r = write_string_file(fs, "1", WRITE_STRING_FILE_DISABLE_BUFFER);
                if (r < 0)
                        return r;

                return 1;
        }

        if (!streq(sc, "1"))
                return -EIO;

        return 0;
}
905
906 int cg_uninstall_release_agent(const char *controller) {
907 _cleanup_free_ char *fs = NULL;
908 int r;
909
910 r = cg_unified_controller(controller);
911 if (r < 0)
912 return r;
913 if (r > 0) /* Doesn't apply to unified hierarchy */
914 return -EOPNOTSUPP;
915
916 r = cg_get_path(controller, NULL, "notify_on_release", &fs);
917 if (r < 0)
918 return r;
919
920 r = write_string_file(fs, "0", WRITE_STRING_FILE_DISABLE_BUFFER);
921 if (r < 0)
922 return r;
923
924 fs = mfree(fs);
925
926 r = cg_get_path(controller, NULL, "release_agent", &fs);
927 if (r < 0)
928 return r;
929
930 r = write_string_file(fs, "", WRITE_STRING_FILE_DISABLE_BUFFER);
931 if (r < 0)
932 return r;
933
934 return 0;
935 }
936
937 int cg_is_empty(const char *controller, const char *path) {
938 _cleanup_fclose_ FILE *f = NULL;
939 pid_t pid;
940 int r;
941
942 assert(path);
943
944 r = cg_enumerate_processes(controller, path, &f);
945 if (r == -ENOENT)
946 return true;
947 if (r < 0)
948 return r;
949
950 r = cg_read_pid(f, &pid, CGROUP_DONT_SKIP_UNMAPPED);
951 if (r < 0)
952 return r;
953
954 return r == 0;
955 }
956
/* Returns > 0 if the cgroup at 'path' and all of its descendants contain no
 * processes, 0 otherwise, negative errno-style error on failure. On the unified
 * hierarchy this uses the cheap "populated" event; on legacy hierarchies it walks
 * the tree. */
int cg_is_empty_recursive(const char *controller, const char *path) {
        int r;

        assert(path);

        /* The root cgroup is always populated */
        if (controller && empty_or_root(path))
                return false;

        r = cg_unified_controller(controller);
        if (r < 0)
                return r;
        if (r > 0) {
                _cleanup_free_ char *t = NULL;

                /* On the unified hierarchy we can check empty state
                 * via the "populated" attribute of "cgroup.events". */

                r = cg_read_event(controller, path, "populated", &t);
                if (r == -ENOENT)
                        return true;
                if (r < 0)
                        return r;

                return streq(t, "0");
        } else {
                _cleanup_closedir_ DIR *d = NULL;
                char *fn;

                /* Legacy: check this level first, then recurse into children */
                r = cg_is_empty(controller, path);
                if (r <= 0)
                        return r;

                r = cg_enumerate_subgroups(controller, path, &d);
                if (r == -ENOENT)
                        return true;
                if (r < 0)
                        return r;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        /* fn is owned by us; hand it to p's cleanup via path_join,
                         * freeing the original immediately */
                        p = path_join(path, fn);
                        free(fn);
                        if (!p)
                                return -ENOMEM;

                        r = cg_is_empty_recursive(controller, p);
                        if (r <= 0)
                                return r;
                }
                if (r < 0)
                        return r;

                return true;
        }
}
1014
/* Splits a cgroup specification into its controller and path components. Accepted
 * forms: an absolute path ("/foo/bar"), "controller:/path", or a bare controller
 * name. Either output parameter may be NULL if the caller is not interested in that
 * component; components absent from the spec are returned as NULL. */
int cg_split_spec(const char *spec, char **ret_controller, char **ret_path) {
        _cleanup_free_ char *controller = NULL, *path = NULL;
        int r;

        assert(spec);

        if (*spec == '/') {
                /* Plain absolute path, no controller part */
                if (!path_is_normalized(spec))
                        return -EINVAL;

                if (ret_path) {
                        r = path_simplify_alloc(spec, &path);
                        if (r < 0)
                                return r;
                }

        } else {
                const char *e;

                e = strchr(spec, ':');
                if (e) {
                        /* "controller:path" form */
                        controller = strndup(spec, e-spec);
                        if (!controller)
                                return -ENOMEM;
                        if (!cg_controller_is_valid(controller))
                                return -EINVAL;

                        if (!isempty(e + 1)) {
                                path = strdup(e+1);
                                if (!path)
                                        return -ENOMEM;

                                if (!path_is_normalized(path) ||
                                    !path_is_absolute(path))
                                        return -EINVAL;

                                path_simplify(path);
                        }

                } else {
                        /* Bare controller name */
                        if (!cg_controller_is_valid(spec))
                                return -EINVAL;

                        if (ret_controller) {
                                controller = strdup(spec);
                                if (!controller)
                                        return -ENOMEM;
                        }
                }
        }

        if (ret_controller)
                *ret_controller = TAKE_PTR(controller);
        if (ret_path)
                *ret_path = TAKE_PTR(path);
        return 0;
}
1072
1073 int cg_mangle_path(const char *path, char **ret) {
1074 _cleanup_free_ char *c = NULL, *p = NULL;
1075 int r;
1076
1077 assert(path);
1078 assert(ret);
1079
1080 /* First, check if it already is a filesystem path */
1081 if (path_startswith(path, "/sys/fs/cgroup"))
1082 return path_simplify_alloc(path, ret);
1083
1084 /* Otherwise, treat it as cg spec */
1085 r = cg_split_spec(path, &c, &p);
1086 if (r < 0)
1087 return r;
1088
1089 return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, ret);
1090 }
1091
/* Determines the cgroup root of the running systemd instance, by looking at the
 * cgroup PID 1 runs in and stripping its init scope (or older equivalents) from the
 * end. The result is returned in *ret_path (caller frees). */
int cg_get_root_path(char **ret_path) {
        char *p, *e;
        int r;

        assert(ret_path);

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &p);
        if (r < 0)
                return r;

        /* Strip the suffix PID 1 lives in, trying newest naming first */
        e = endswith(p, "/" SPECIAL_INIT_SCOPE);
        if (!e)
                e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */
        if (!e)
                e = endswith(p, "/system"); /* even more legacy */
        if (e)
                *e = 0;

        *ret_path = p;
        return 0;
}
1113
1114 int cg_shift_path(const char *cgroup, const char *root, const char **ret_shifted) {
1115 _cleanup_free_ char *rt = NULL;
1116 char *p;
1117 int r;
1118
1119 assert(cgroup);
1120 assert(ret_shifted);
1121
1122 if (!root) {
1123 /* If the root was specified let's use that, otherwise
1124 * let's determine it from PID 1 */
1125
1126 r = cg_get_root_path(&rt);
1127 if (r < 0)
1128 return r;
1129
1130 root = rt;
1131 }
1132
1133 p = path_startswith(cgroup, root);
1134 if (p && p > cgroup)
1135 *ret_shifted = p - 1;
1136 else
1137 *ret_shifted = cgroup;
1138
1139 return 0;
1140 }
1141
/* Returns the cgroup path of the given process in the systemd hierarchy, shifted to
 * be relative to 'root' (or to the detected cgroup root when root is NULL). */
int cg_pid_get_path_shifted(pid_t pid, const char *root, char **ret_cgroup) {
        _cleanup_free_ char *raw = NULL;
        const char *c;
        int r;

        assert(pid >= 0);
        assert(ret_cgroup);

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &raw);
        if (r < 0)
                return r;

        r = cg_shift_path(raw, root, &c);
        if (r < 0)
                return r;

        /* If nothing was shifted off, avoid a copy and hand over the original buffer */
        if (c == raw) {
                *ret_cgroup = TAKE_PTR(raw);
                return 0;
        }

        return strdup_to(ret_cgroup, c);
}
1165
/* Extracts the first path component of 'cgroup', unescapes it and validates it as a
 * unit name (plain or instance). Returns the unit name in *ret_unit, or -ENXIO if
 * the component is too short or not a valid unit name. */
int cg_path_decode_unit(const char *cgroup, char **ret_unit) {
        assert(cgroup);
        assert(ret_unit);

        size_t n = strcspn(cgroup, "/");
        if (n < 3)
                return -ENXIO;

        /* Copy on the stack — components are short by construction (checked above) */
        char *c = strndupa_safe(cgroup, n);
        c = cg_unescape(c);

        if (!unit_name_is_valid(c, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
                return -ENXIO;

        return strdup_to(ret_unit, c);
}
1182
/* Checks whether the first n bytes of p name a valid slice unit, i.e. end in
 * ".slice" and unescape to a valid plain unit name. */
static bool valid_slice_name(const char *p, size_t n) {

        if (!p)
                return false;

        /* Shortest possible slice name is "x.slice" */
        if (n < STRLEN("x.slice"))
                return false;

        if (memcmp(p + n - 6, ".slice", 6) == 0) {
                /* NUL-terminate the component on the stack so we can unescape/validate it */
                char buf[n+1], *c;

                memcpy(buf, p, n);
                buf[n] = 0;

                c = cg_unescape(buf);

                return unit_name_is_valid(c, UNIT_NAME_PLAIN);
        }

        return false;
}
1204
/* Returns a pointer into 'p' past all leading slice components (and the slashes
 * between them), i.e. at the first component that is not a valid slice name. */
static const char *skip_slices(const char *p) {
        assert(p);

        /* Skips over all slice assignments */

        for (;;) {
                size_t n;

                /* Skip any number of slashes before the next component */
                p += strspn(p, "/");

                n = strcspn(p, "/");
                if (!valid_slice_name(p, n))
                        return p;

                p += n;
        }
}
1222
1223 int cg_path_get_unit(const char *path, char **ret) {
1224 _cleanup_free_ char *unit = NULL;
1225 const char *e;
1226 int r;
1227
1228 assert(path);
1229 assert(ret);
1230
1231 e = skip_slices(path);
1232
1233 r = cg_path_decode_unit(e, &unit);
1234 if (r < 0)
1235 return r;
1236
1237 /* We skipped over the slices, don't accept any now */
1238 if (endswith(unit, ".slice"))
1239 return -ENXIO;
1240
1241 *ret = TAKE_PTR(unit);
1242 return 0;
1243 }
1244
/* Returns the prefix of 'path' up to and including the first non-slice unit
 * component, i.e. the cgroup path of the unit itself without any sub-cgroups below
 * it. Returns -ENXIO if that component is not a valid unit name. */
int cg_path_get_unit_path(const char *path, char **ret) {
        _cleanup_free_ char *path_copy = NULL;
        char *unit_name;

        assert(path);
        assert(ret);

        path_copy = strdup(path);
        if (!path_copy)
                return -ENOMEM;

        /* Truncate the copy in place right after the unit component */
        unit_name = (char *)skip_slices(path_copy);
        unit_name[strcspn(unit_name, "/")] = 0;

        if (!unit_name_is_valid(cg_unescape(unit_name), UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
                return -ENXIO;

        *ret = TAKE_PTR(path_copy);

        return 0;
}
1266
1267 int cg_pid_get_unit(pid_t pid, char **ret_unit) {
1268 _cleanup_free_ char *cgroup = NULL;
1269 int r;
1270
1271 assert(ret_unit);
1272
1273 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1274 if (r < 0)
1275 return r;
1276
1277 return cg_path_get_unit(cgroup, ret_unit);
1278 }
1279
1280 int cg_pidref_get_unit(const PidRef *pidref, char **ret) {
1281 _cleanup_free_ char *unit = NULL;
1282 int r;
1283
1284 assert(ret);
1285
1286 if (!pidref_is_set(pidref))
1287 return -ESRCH;
1288
1289 r = cg_pid_get_unit(pidref->pid, &unit);
1290 if (r < 0)
1291 return r;
1292
1293 r = pidref_verify(pidref);
1294 if (r < 0)
1295 return r;
1296
1297 *ret = TAKE_PTR(unit);
1298 return 0;
1299 }
1300
/**
 * Skip session-*.scope, but require it to be there.
 * Returns a pointer past the scope component (and any trailing slashes), or NULL
 * if the next component is not a valid session scope.
 */
static const char *skip_session(const char *p) {
        size_t n;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        /* Shortest possible match is "session-x.scope", so shorter components can't qualify */
        n = strcspn(p, "/");
        if (n < STRLEN("session-x.scope"))
                return NULL;

        /* 8 == strlen("session-"), 6 == strlen(".scope") */
        if (memcmp(p, "session-", 8) == 0 && memcmp(p + n - 6, ".scope", 6) == 0) {
                char buf[n - 8 - 6 + 1];

                /* Copy just the session ID between the prefix and suffix */
                memcpy(buf, p + 8, n - 8 - 6);
                buf[n - 8 - 6] = 0;

                /* Note that session scopes never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                if (!session_id_valid(buf))
                        return NULL;

                p += n;
                p += strspn(p, "/");
                return p;
        }

        return NULL;
}
1337
/**
 * Skip user@*.service, but require it to be there.
 * Returns a pointer past the service component (and any trailing slashes), or NULL
 * if the next component is not a valid user manager service.
 */
static const char *skip_user_manager(const char *p) {
        size_t n;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        /* Shortest possible match is "user@x.service" */
        n = strcspn(p, "/");
        if (n < STRLEN("user@x.service"))
                return NULL;

        /* 5 == strlen("user@"), 8 == strlen(".service") */
        if (memcmp(p, "user@", 5) == 0 && memcmp(p + n - 8, ".service", 8) == 0) {
                char buf[n - 5 - 8 + 1];

                /* Copy just the UID between the prefix and suffix */
                memcpy(buf, p + 5, n - 5 - 8);
                buf[n - 5 - 8] = 0;

                /* Note that user manager services never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                if (parse_uid(buf, NULL) < 0)
                        return NULL;

                p += n;
                p += strspn(p, "/");

                return p;
        }

        return NULL;
}
1375
/* Skips over the slice components plus the user manager service or session scope at
 * the start of a user cgroup path. Returns NULL if neither is present. */
static const char *skip_user_prefix(const char *path) {
        const char *after_slices, *after_manager;

        assert(path);

        /* Skip slices, if there are any */
        after_slices = skip_slices(path);

        /* Skip the user manager, if it's in the path now... */
        after_manager = skip_user_manager(after_slices);
        if (after_manager)
                return after_manager;

        /* Alternatively skip the user session if it is in the path... */
        return skip_session(after_slices);
}
1392
/* Determines the user unit a cgroup path belongs to. Returns -ENXIO if the path
 * carries no user manager/session prefix at all. */
int cg_path_get_user_unit(const char *path, char **ret) {
        const char *rest;

        assert(path);
        assert(ret);

        rest = skip_user_prefix(path);
        if (!rest)
                return -ENXIO;

        /* And from here on it looks pretty much the same as for a system unit, hence let's use the same
         * parser. */
        return cg_path_get_unit(rest, ret);
}
1407
1408 int cg_pid_get_user_unit(pid_t pid, char **ret_unit) {
1409 _cleanup_free_ char *cgroup = NULL;
1410 int r;
1411
1412 assert(ret_unit);
1413
1414 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1415 if (r < 0)
1416 return r;
1417
1418 return cg_path_get_user_unit(cgroup, ret_unit);
1419 }
1420
1421 int cg_path_get_machine_name(const char *path, char **ret_machine) {
1422 _cleanup_free_ char *u = NULL;
1423 const char *sl;
1424 int r;
1425
1426 r = cg_path_get_unit(path, &u);
1427 if (r < 0)
1428 return r;
1429
1430 sl = strjoina("/run/systemd/machines/unit:", u);
1431 return readlink_malloc(sl, ret_machine);
1432 }
1433
1434 int cg_pid_get_machine_name(pid_t pid, char **ret_machine) {
1435 _cleanup_free_ char *cgroup = NULL;
1436 int r;
1437
1438 assert(ret_machine);
1439
1440 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1441 if (r < 0)
1442 return r;
1443
1444 return cg_path_get_machine_name(cgroup, ret_machine);
1445 }
1446
1447 int cg_path_get_cgroupid(const char *path, uint64_t *ret) {
1448 cg_file_handle fh = CG_FILE_HANDLE_INIT;
1449 int mnt_id;
1450
1451 assert(path);
1452 assert(ret);
1453
1454 /* This is cgroupfs so we know the size of the handle, thus no need to loop around like
1455 * name_to_handle_at_loop() does in mountpoint-util.c */
1456 if (name_to_handle_at(AT_FDCWD, path, &fh.file_handle, &mnt_id, 0) < 0)
1457 return -errno;
1458
1459 *ret = CG_FILE_HANDLE_CGROUPID(fh);
1460 return 0;
1461 }
1462
1463 int cg_fd_get_cgroupid(int fd, uint64_t *ret) {
1464 cg_file_handle fh = CG_FILE_HANDLE_INIT;
1465 int mnt_id = -1;
1466
1467 assert(fd >= 0);
1468 assert(ret);
1469
1470 if (name_to_handle_at(fd, "", &fh.file_handle, &mnt_id, AT_EMPTY_PATH) < 0)
1471 return -errno;
1472
1473 *ret = CG_FILE_HANDLE_CGROUPID(fh);
1474 return 0;
1475 }
1476
1477 int cg_path_get_session(const char *path, char **ret_session) {
1478 _cleanup_free_ char *unit = NULL;
1479 char *start, *end;
1480 int r;
1481
1482 assert(path);
1483
1484 r = cg_path_get_unit(path, &unit);
1485 if (r < 0)
1486 return r;
1487
1488 start = startswith(unit, "session-");
1489 if (!start)
1490 return -ENXIO;
1491 end = endswith(start, ".scope");
1492 if (!end)
1493 return -ENXIO;
1494
1495 *end = 0;
1496 if (!session_id_valid(start))
1497 return -ENXIO;
1498
1499 if (!ret_session)
1500 return 0;
1501
1502 return strdup_to(ret_session, start);
1503 }
1504
1505 int cg_pid_get_session(pid_t pid, char **ret_session) {
1506 _cleanup_free_ char *cgroup = NULL;
1507 int r;
1508
1509 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1510 if (r < 0)
1511 return r;
1512
1513 return cg_path_get_session(cgroup, ret_session);
1514 }
1515
1516 int cg_path_get_owner_uid(const char *path, uid_t *ret_uid) {
1517 _cleanup_free_ char *slice = NULL;
1518 char *start, *end;
1519 int r;
1520
1521 assert(path);
1522
1523 r = cg_path_get_slice(path, &slice);
1524 if (r < 0)
1525 return r;
1526
1527 start = startswith(slice, "user-");
1528 if (!start)
1529 return -ENXIO;
1530
1531 end = endswith(start, ".slice");
1532 if (!end)
1533 return -ENXIO;
1534
1535 *end = 0;
1536 if (parse_uid(start, ret_uid) < 0)
1537 return -ENXIO;
1538
1539 return 0;
1540 }
1541
1542 int cg_pid_get_owner_uid(pid_t pid, uid_t *ret_uid) {
1543 _cleanup_free_ char *cgroup = NULL;
1544 int r;
1545
1546 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1547 if (r < 0)
1548 return r;
1549
1550 return cg_path_get_owner_uid(cgroup, ret_uid);
1551 }
1552
1553 int cg_path_get_slice(const char *p, char **ret_slice) {
1554 const char *e = NULL;
1555
1556 assert(p);
1557 assert(ret_slice);
1558
1559 /* Finds the right-most slice unit from the beginning, but stops before we come to
1560 * the first non-slice unit. */
1561
1562 for (;;) {
1563 const char *s;
1564 int n;
1565
1566 n = path_find_first_component(&p, /* accept_dot_dot = */ false, &s);
1567 if (n < 0)
1568 return n;
1569 if (!valid_slice_name(s, n))
1570 break;
1571
1572 e = s;
1573 }
1574
1575 if (e)
1576 return cg_path_decode_unit(e, ret_slice);
1577
1578 return strdup_to(ret_slice, SPECIAL_ROOT_SLICE);
1579 }
1580
1581 int cg_pid_get_slice(pid_t pid, char **ret_slice) {
1582 _cleanup_free_ char *cgroup = NULL;
1583 int r;
1584
1585 assert(ret_slice);
1586
1587 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1588 if (r < 0)
1589 return r;
1590
1591 return cg_path_get_slice(cgroup, ret_slice);
1592 }
1593
int cg_path_get_user_slice(const char *p, char **ret_slice) {
        const char *rest;

        assert(p);
        assert(ret_slice);

        /* Strip the per-user prefix (slices + user manager/session unit)... */
        rest = skip_user_prefix(p);
        if (!rest)
                return -ENXIO;

        /* ...after which this parses just like a system slice, so reuse that parser. */
        return cg_path_get_slice(rest, ret_slice);
}
1607
1608 int cg_pid_get_user_slice(pid_t pid, char **ret_slice) {
1609 _cleanup_free_ char *cgroup = NULL;
1610 int r;
1611
1612 assert(ret_slice);
1613
1614 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1615 if (r < 0)
1616 return r;
1617
1618 return cg_path_get_user_slice(cgroup, ret_slice);
1619 }
1620
1621 bool cg_needs_escape(const char *p) {
1622
1623 /* Checks if the specified path is a valid cgroup name by our rules, or if it must be escaped. Note
1624 * that we consider escaped cgroup names invalid here, as they need to be escaped a second time if
1625 * they shall be used. Also note that various names cannot be made valid by escaping even if we
1626 * return true here (because too long, or contain the forbidden character "/"). */
1627
1628 if (!filename_is_valid(p))
1629 return true;
1630
1631 if (IN_SET(p[0], '_', '.'))
1632 return true;
1633
1634 if (STR_IN_SET(p, "notify_on_release", "release_agent", "tasks"))
1635 return true;
1636
1637 if (startswith(p, "cgroup."))
1638 return true;
1639
1640 for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1641 const char *q;
1642
1643 q = startswith(p, cgroup_controller_to_string(c));
1644 if (!q)
1645 continue;
1646
1647 if (q[0] == '.')
1648 return true;
1649 }
1650
1651 return false;
1652 }
1653
1654 int cg_escape(const char *p, char **ret) {
1655 _cleanup_free_ char *n = NULL;
1656
1657 /* This implements very minimal escaping for names to be used as file names in the cgroup tree: any
1658 * name which might conflict with a kernel name or is prefixed with '_' is prefixed with a '_'. That
1659 * way, when reading cgroup names it is sufficient to remove a single prefixing underscore if there
1660 * is one. */
1661
1662 /* The return value of this function (unlike cg_unescape()) needs free()! */
1663
1664 if (cg_needs_escape(p)) {
1665 n = strjoin("_", p);
1666 if (!n)
1667 return -ENOMEM;
1668
1669 if (!filename_is_valid(n)) /* became invalid due to the prefixing? Or contained things like a slash that cannot be fixed by prefixing? */
1670 return -EINVAL;
1671 } else {
1672 n = strdup(p);
1673 if (!n)
1674 return -ENOMEM;
1675 }
1676
1677 *ret = TAKE_PTR(n);
1678 return 0;
1679 }
1680
char *cg_unescape(const char *p) {
        assert(p);

        /* Undoes cg_escape() by stripping at most one leading '_'. Unlike cg_escape()'s result,
         * the returned pointer points into the input string and doesn't need free()! */

        return (char*) (p[0] == '_' ? p + 1 : p);
}
1692
1693 #define CONTROLLER_VALID \
1694 DIGITS LETTERS \
1695 "_"
1696
1697 bool cg_controller_is_valid(const char *p) {
1698 const char *t, *s;
1699
1700 if (!p)
1701 return false;
1702
1703 if (streq(p, SYSTEMD_CGROUP_CONTROLLER))
1704 return true;
1705
1706 s = startswith(p, "name=");
1707 if (s)
1708 p = s;
1709
1710 if (IN_SET(*p, 0, '_'))
1711 return false;
1712
1713 for (t = p; *t; t++)
1714 if (!strchr(CONTROLLER_VALID, *t))
1715 return false;
1716
1717 if (t - p > NAME_MAX)
1718 return false;
1719
1720 return true;
1721 }
1722
/* Converts a slice unit name into its cgroup path, e.g. "foo-bar.slice" →
 * "foo.slice/foo-bar.slice". Each dash in the name introduces one level of nesting; every
 * synthesized parent slice name and the unit itself are escaped with cg_escape(). Returns
 * -EINVAL for names that aren't valid slice units (including leading, trailing or double
 * dashes), and the new path (without leading "/") in *ret on success. */
int cg_slice_to_path(const char *unit, char **ret) {
        _cleanup_free_ char *p = NULL, *s = NULL, *e = NULL;
        const char *dash;
        int r;

        assert(unit);
        assert(ret);

        /* The root slice maps to the cgroup root, i.e. the empty path */
        if (streq(unit, SPECIAL_ROOT_SLICE))
                return strdup_to(ret, "");

        if (!unit_name_is_valid(unit, UNIT_NAME_PLAIN))
                return -EINVAL;

        if (!endswith(unit, ".slice"))
                return -EINVAL;

        /* p is the unit name without the ".slice" suffix */
        r = unit_name_to_prefix(unit, &p);
        if (r < 0)
                return r;

        dash = strchr(p, '-');

        /* Don't allow initial dashes */
        if (dash == p)
                return -EINVAL;

        /* For each dash-separated prefix, synthesize the parent slice name ("<prefix>.slice")
         * and append its escaped form plus "/" to the result path. */
        while (dash) {
                _cleanup_free_ char *escaped = NULL;
                char n[dash - p + sizeof(".slice")];

#if HAS_FEATURE_MEMORY_SANITIZER
                /* msan doesn't instrument stpncpy, so it thinks
                 * n is later used uninitialized:
                 * https://github.com/google/sanitizers/issues/926
                 */
                zero(n);
#endif

                /* Don't allow trailing or double dashes */
                if (IN_SET(dash[1], 0, '-'))
                        return -EINVAL;

                strcpy(stpncpy(n, p, dash - p), ".slice");
                if (!unit_name_is_valid(n, UNIT_NAME_PLAIN))
                        return -EINVAL;

                r = cg_escape(n, &escaped);
                if (r < 0)
                        return r;

                if (!strextend(&s, escaped, "/"))
                        return -ENOMEM;

                dash = strchr(dash+1, '-');
        }

        /* Finally, append the escaped full unit name itself */
        r = cg_escape(unit, &e);
        if (r < 0)
                return r;

        if (!strextend(&s, e))
                return -ENOMEM;

        *ret = TAKE_PTR(s);
        return 0;
}
1790
1791 int cg_is_threaded(const char *path) {
1792 _cleanup_free_ char *fs = NULL, *contents = NULL;
1793 _cleanup_strv_free_ char **v = NULL;
1794 int r;
1795
1796 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "cgroup.type", &fs);
1797 if (r < 0)
1798 return r;
1799
1800 r = read_full_virtual_file(fs, &contents, NULL);
1801 if (r == -ENOENT)
1802 return false; /* Assume no. */
1803 if (r < 0)
1804 return r;
1805
1806 v = strv_split(contents, NULL);
1807 if (!v)
1808 return -ENOMEM;
1809
1810 /* If the cgroup is in the threaded mode, it contains "threaded".
1811 * If one of the parents or siblings is in the threaded mode, it may contain "invalid". */
1812 return strv_contains(v, "threaded") || strv_contains(v, "invalid");
1813 }
1814
1815 int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value) {
1816 _cleanup_free_ char *p = NULL;
1817 int r;
1818
1819 r = cg_get_path(controller, path, attribute, &p);
1820 if (r < 0)
1821 return r;
1822
1823 return write_string_file(p, value, WRITE_STRING_FILE_DISABLE_BUFFER);
1824 }
1825
1826 int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
1827 _cleanup_free_ char *p = NULL;
1828 int r;
1829
1830 r = cg_get_path(controller, path, attribute, &p);
1831 if (r < 0)
1832 return r;
1833
1834 return read_one_line_file(p, ret);
1835 }
1836
1837 int cg_get_attribute_as_uint64(const char *controller, const char *path, const char *attribute, uint64_t *ret) {
1838 _cleanup_free_ char *value = NULL;
1839 uint64_t v;
1840 int r;
1841
1842 assert(ret);
1843
1844 r = cg_get_attribute(controller, path, attribute, &value);
1845 if (r == -ENOENT)
1846 return -ENODATA;
1847 if (r < 0)
1848 return r;
1849
1850 if (streq(value, "max")) {
1851 *ret = CGROUP_LIMIT_MAX;
1852 return 0;
1853 }
1854
1855 r = safe_atou64(value, &v);
1856 if (r < 0)
1857 return r;
1858
1859 *ret = v;
1860 return 0;
1861 }
1862
1863 int cg_get_attribute_as_bool(const char *controller, const char *path, const char *attribute, bool *ret) {
1864 _cleanup_free_ char *value = NULL;
1865 int r;
1866
1867 assert(ret);
1868
1869 r = cg_get_attribute(controller, path, attribute, &value);
1870 if (r == -ENOENT)
1871 return -ENODATA;
1872 if (r < 0)
1873 return r;
1874
1875 r = parse_boolean(value);
1876 if (r < 0)
1877 return r;
1878
1879 *ret = r;
1880 return 0;
1881 }
1882
1883 int cg_get_owner(const char *path, uid_t *ret_uid) {
1884 _cleanup_free_ char *f = NULL;
1885 struct stat stats;
1886 int r;
1887
1888 assert(ret_uid);
1889
1890 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &f);
1891 if (r < 0)
1892 return r;
1893
1894 if (stat(f, &stats) < 0)
1895 return -errno;
1896
1897 r = stat_verify_directory(&stats);
1898 if (r < 0)
1899 return r;
1900
1901 *ret_uid = stats.st_uid;
1902 return 0;
1903 }
1904
int cg_get_keyed_attribute_full(
                const char *controller,
                const char *path,
                const char *attribute,
                char **keys,
                char **ret_values,
                CGroupKeyMode mode) {

        _cleanup_free_ char *filename = NULL, *contents = NULL;
        const char *p;
        size_t n, i, n_done = 0;
        char **v;
        int r;

        /* Reads one or more fields of a cgroup v2 keyed attribute file. The 'keys' parameter should be an strv with
         * all keys to retrieve. The 'ret_values' parameter should be passed as a string array with the same number of
         * entries as 'keys'. On success each entry will be set to the value of the matching key.
         *
         * If the attribute file doesn't exist at all returns ENOENT, if any key is not found returns ENXIO. If mode
         * is set to CG_KEY_MODE_GRACEFUL we ignore missing keys and return those that were parsed successfully. */

        r = cg_get_path(controller, path, attribute, &filename);
        if (r < 0)
                return r;

        r = read_full_file(filename, &contents, NULL);
        if (r < 0)
                return r;

        n = strv_length(keys);
        if (n == 0) /* No keys to retrieve? That's easy, we are done then */
                return 0;

        /* Let's build this up in a temporary array for now in order not to clobber the return parameter on failure */
        v = newa0(char*, n);

        /* Walk the file line by line; for each line check whether it starts with one of the keys we
         * haven't found yet. */
        for (p = contents; *p;) {
                const char *w = NULL;

                for (i = 0; i < n; i++)
                        if (!v[i]) {
                                w = first_word(p, keys[i]);
                                if (w)
                                        break;
                        }

                if (w) {
                        size_t l;

                        /* Matched key i: the rest of the line (up to the newline) is its value */
                        l = strcspn(w, NEWLINE);
                        v[i] = strndup(w, l);
                        if (!v[i]) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        n_done++;
                        if (n_done >= n)
                                goto done;

                        p = w + l;
                } else
                        p += strcspn(p, NEWLINE);

                p += strspn(p, NEWLINE);
        }

        /* Some keys were not found: that's OK in graceful mode, an error otherwise */
        if (mode & CG_KEY_MODE_GRACEFUL)
                goto done;

        r = -ENXIO;

fail:
        free_many_charp(v, n);
        return r;

done:
        memcpy(ret_values, v, sizeof(char*) * n);
        if (mode & CG_KEY_MODE_GRACEFUL)
                return n_done;

        return 0;
}
1988
/* Renders a controller mask as a space-separated list of controller names. An empty mask is
 * reported as NULL rather than as an empty string. The returned string needs free(). */
int cg_mask_to_string(CGroupMask mask, char **ret) {
        _cleanup_free_ char *s = NULL;
        bool space = false;
        CGroupController c;
        size_t n = 0;

        assert(ret);

        if (mask == 0) {
                *ret = NULL;
                return 0;
        }

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                const char *k;
                size_t l;

                if (!FLAGS_SET(mask, CGROUP_CONTROLLER_TO_MASK(c)))
                        continue;

                k = cgroup_controller_to_string(c);
                l = strlen(k);

                /* Reserve room for an optional separator space, the name, and the trailing NUL */
                if (!GREEDY_REALLOC(s, n + space + l + 1))
                        return -ENOMEM;

                if (space)
                        s[n] = ' ';
                memcpy(s + n + space, k, l);
                n += space + l;

                space = true;
        }

        /* mask was non-zero, hence at least one controller must have been appended */
        assert(s);

        s[n] = 0;
        *ret = TAKE_PTR(s);

        return 0;
}
2030
2031 int cg_mask_from_string(const char *value, CGroupMask *ret) {
2032 CGroupMask m = 0;
2033
2034 assert(ret);
2035 assert(value);
2036
2037 for (;;) {
2038 _cleanup_free_ char *n = NULL;
2039 CGroupController v;
2040 int r;
2041
2042 r = extract_first_word(&value, &n, NULL, 0);
2043 if (r < 0)
2044 return r;
2045 if (r == 0)
2046 break;
2047
2048 v = cgroup_controller_from_string(n);
2049 if (v < 0)
2050 continue;
2051
2052 m |= CGROUP_CONTROLLER_TO_MASK(v);
2053 }
2054
2055 *ret = m;
2056 return 0;
2057 }
2058
int cg_mask_supported_subtree(const char *root, CGroupMask *ret) {
        CGroupMask mask;
        int r;

        /* Determines the mask of supported cgroup controllers. Only includes controllers we can make sense of and that
         * are actually accessible. Only covers real controllers, i.e. not the CGROUP_CONTROLLER_BPF_xyz
         * pseudo-controllers. */

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                _cleanup_free_ char *controllers = NULL, *path = NULL;

                /* In the unified hierarchy we can read the supported and accessible controllers from
                 * the top-level cgroup attribute */

                r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, root, "cgroup.controllers", &path);
                if (r < 0)
                        return r;

                r = read_one_line_file(path, &controllers);
                if (r < 0)
                        return r;

                r = cg_mask_from_string(controllers, &mask);
                if (r < 0)
                        return r;

                /* Mask controllers that are not supported in unified hierarchy. */
                mask &= CGROUP_MASK_V2;

        } else {
                CGroupController c;

                /* In the legacy hierarchy, we check which hierarchies are accessible. */

                mask = 0;
                for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                        CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                        const char *n;

                        /* Skip controllers that don't exist in the v1 world */
                        if (!FLAGS_SET(CGROUP_MASK_V1, bit))
                                continue;

                        n = cgroup_controller_to_string(c);
                        if (controller_is_v1_accessible(root, n) >= 0)
                                mask |= bit;
                }
        }

        *ret = mask;
        return 0;
}
2113
2114 int cg_mask_supported(CGroupMask *ret) {
2115 _cleanup_free_ char *root = NULL;
2116 int r;
2117
2118 r = cg_get_root_path(&root);
2119 if (r < 0)
2120 return r;
2121
2122 return cg_mask_supported_subtree(root, ret);
2123 }
2124
2125 int cg_kernel_controllers(Set **ret) {
2126 _cleanup_set_free_ Set *controllers = NULL;
2127 _cleanup_fclose_ FILE *f = NULL;
2128 int r;
2129
2130 assert(ret);
2131
2132 /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support
2133 * and controllers that aren't currently accessible (because not mounted). This does not include "name="
2134 * pseudo-controllers. */
2135
2136 r = fopen_unlocked("/proc/cgroups", "re", &f);
2137 if (r == -ENOENT) {
2138 *ret = NULL;
2139 return 0;
2140 }
2141 if (r < 0)
2142 return r;
2143
2144 /* Ignore the header line */
2145 (void) read_line(f, SIZE_MAX, NULL);
2146
2147 for (;;) {
2148 _cleanup_free_ char *controller = NULL;
2149 int enabled = 0;
2150
2151 if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
2152
2153 if (ferror(f))
2154 return -errno;
2155
2156 if (feof(f))
2157 break;
2158
2159 return -EBADMSG;
2160 }
2161
2162 if (!enabled)
2163 continue;
2164
2165 if (!cg_controller_is_valid(controller))
2166 return -EBADMSG;
2167
2168 r = set_ensure_consume(&controllers, &string_hash_ops_free, TAKE_PTR(controller));
2169 if (r < 0)
2170 return r;
2171 }
2172
2173 *ret = TAKE_PTR(controllers);
2174
2175 return 0;
2176 }
2177
2178 /* The hybrid mode was initially implemented in v232 and simply mounted cgroup2 on
2179 * /sys/fs/cgroup/systemd. This unfortunately broke other tools (such as docker) which expected the v1
2180 * "name=systemd" hierarchy on /sys/fs/cgroup/systemd. From v233 and on, the hybrid mode mounts v2 on
2181 * /sys/fs/cgroup/unified and maintains "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility
2182 * with other tools.
2183 *
2184 * To keep live upgrade working, we detect and support v232 layout. When v232 layout is detected, to keep
2185 * cgroup v2 process management but disable the compat dual layout, we return true on
2186 * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and false on cg_hybrid_unified().
2187 */
2188 static thread_local bool unified_systemd_v232;
2189
2190 int cg_unified_cached(bool flush) {
2191 static thread_local CGroupUnified unified_cache = CGROUP_UNIFIED_UNKNOWN;
2192
2193 struct statfs fs;
2194
2195 /* Checks if we support the unified hierarchy. Returns an
2196 * error when the cgroup hierarchies aren't mounted yet or we
2197 * have any other trouble determining if the unified hierarchy
2198 * is supported. */
2199
2200 if (flush)
2201 unified_cache = CGROUP_UNIFIED_UNKNOWN;
2202 else if (unified_cache >= CGROUP_UNIFIED_NONE)
2203 return unified_cache;
2204
2205 if (statfs("/sys/fs/cgroup/", &fs) < 0)
2206 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/\") failed: %m");
2207
2208 if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2209 log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
2210 unified_cache = CGROUP_UNIFIED_ALL;
2211 } else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2212 if (statfs("/sys/fs/cgroup/unified/", &fs) == 0 &&
2213 F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2214 log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller");
2215 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2216 unified_systemd_v232 = false;
2217 } else {
2218 if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0) {
2219 if (errno == ENOENT) {
2220 /* Some other software may have set up /sys/fs/cgroup in a configuration we do not recognize. */
2221 log_debug_errno(errno, "Unsupported cgroupsv1 setup detected: name=systemd hierarchy not found.");
2222 return -ENOMEDIUM;
2223 }
2224 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/systemd\" failed: %m");
2225 }
2226
2227 if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2228 log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2229 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2230 unified_systemd_v232 = true;
2231 } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)) {
2232 log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2233 unified_cache = CGROUP_UNIFIED_NONE;
2234 } else {
2235 log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
2236 (unsigned long long) fs.f_type);
2237 unified_cache = CGROUP_UNIFIED_NONE;
2238 }
2239 }
2240 } else if (F_TYPE_EQUAL(fs.f_type, SYSFS_MAGIC)) {
2241 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
2242 "No filesystem is currently mounted on /sys/fs/cgroup.");
2243 } else
2244 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
2245 "Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
2246 (unsigned long long)fs.f_type);
2247
2248 return unified_cache;
2249 }
2250
2251 int cg_unified_controller(const char *controller) {
2252 int r;
2253
2254 r = cg_unified_cached(false);
2255 if (r < 0)
2256 return r;
2257
2258 if (r == CGROUP_UNIFIED_NONE)
2259 return false;
2260
2261 if (r >= CGROUP_UNIFIED_ALL)
2262 return true;
2263
2264 return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER);
2265 }
2266
2267 int cg_all_unified(void) {
2268 int r;
2269
2270 r = cg_unified_cached(false);
2271 if (r < 0)
2272 return r;
2273
2274 return r >= CGROUP_UNIFIED_ALL;
2275 }
2276
2277 int cg_hybrid_unified(void) {
2278 int r;
2279
2280 r = cg_unified_cached(false);
2281 if (r < 0)
2282 return r;
2283
2284 return r == CGROUP_UNIFIED_SYSTEMD && !unified_systemd_v232;
2285 }
2286
2287 int cg_is_delegated(const char *path) {
2288 int r;
2289
2290 assert(path);
2291
2292 r = cg_get_xattr_bool(path, "trusted.delegate");
2293 if (!ERRNO_IS_NEG_XATTR_ABSENT(r))
2294 return r;
2295
2296 /* If the trusted xattr isn't set (preferred), then check the untrusted one. Under the assumption
2297 * that whoever is trusted enough to own the cgroup, is also trusted enough to decide if it is
2298 * delegated or not this should be safe. */
2299 r = cg_get_xattr_bool(path, "user.delegate");
2300 return ERRNO_IS_NEG_XATTR_ABSENT(r) ? false : r;
2301 }
2302
2303 int cg_is_delegated_fd(int fd) {
2304 int r;
2305
2306 assert(fd >= 0);
2307
2308 r = getxattr_at_bool(fd, /* path= */ NULL, "trusted.delegate", /* flags= */ 0);
2309 if (!ERRNO_IS_NEG_XATTR_ABSENT(r))
2310 return r;
2311
2312 r = getxattr_at_bool(fd, /* path= */ NULL, "user.delegate", /* flags= */ 0);
2313 return ERRNO_IS_NEG_XATTR_ABSENT(r) ? false : r;
2314 }
2315
2316 int cg_has_coredump_receive(const char *path) {
2317 int r;
2318
2319 assert(path);
2320
2321 r = cg_get_xattr_bool(path, "user.coredump_receive");
2322 if (ERRNO_IS_NEG_XATTR_ABSENT(r))
2323 return false;
2324
2325 return r;
2326 }
2327
/* Default values for the four IO limit attributes: no limit */
const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = {
        [CGROUP_IO_RBPS_MAX] = CGROUP_LIMIT_MAX,
        [CGROUP_IO_WBPS_MAX] = CGROUP_LIMIT_MAX,
        [CGROUP_IO_RIOPS_MAX] = CGROUP_LIMIT_MAX,
        [CGROUP_IO_WIOPS_MAX] = CGROUP_LIMIT_MAX,
};

/* Unit-file setting names corresponding to the IO limit types */
static const char* const cgroup_io_limit_type_table[_CGROUP_IO_LIMIT_TYPE_MAX] = {
        [CGROUP_IO_RBPS_MAX] = "IOReadBandwidthMax",
        [CGROUP_IO_WBPS_MAX] = "IOWriteBandwidthMax",
        [CGROUP_IO_RIOPS_MAX] = "IOReadIOPSMax",
        [CGROUP_IO_WIOPS_MAX] = "IOWriteIOPSMax",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type, CGroupIOLimitType);
2343
2344 bool is_cgroup_fs(const struct statfs *s) {
2345 return is_fs_type(s, CGROUP_SUPER_MAGIC) ||
2346 is_fs_type(s, CGROUP2_SUPER_MAGIC);
2347 }
2348
bool fd_is_cgroup_fs(int fd) {
        struct statfs s;

        /* Returns true if the file system the fd refers to is cgroupfs (v1 or v2).
         *
         * Note: this previously returned -errno on fstatfs() failure, which in a bool-returning
         * function silently converts to 'true' for any error — i.e. a failed check was reported as
         * "is a cgroup fs". If we cannot stat the fs, report false instead. */
        if (fstatfs(fd, &s) < 0)
                return false;

        return is_cgroup_fs(&s);
}
2357
/* Names for all controllers we track, including the BPF-based pseudo-controllers that exist only on
 * the systemd side and have no kernel cgroup controller counterpart. */
static const char *const cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
        [CGROUP_CONTROLLER_CPU] = "cpu",
        [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
        [CGROUP_CONTROLLER_CPUSET] = "cpuset",
        [CGROUP_CONTROLLER_IO] = "io",
        [CGROUP_CONTROLLER_BLKIO] = "blkio",
        [CGROUP_CONTROLLER_MEMORY] = "memory",
        [CGROUP_CONTROLLER_DEVICES] = "devices",
        [CGROUP_CONTROLLER_PIDS] = "pids",
        [CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall",
        [CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices",
        [CGROUP_CONTROLLER_BPF_FOREIGN] = "bpf-foreign",
        [CGROUP_CONTROLLER_BPF_SOCKET_BIND] = "bpf-socket-bind",
        [CGROUP_CONTROLLER_BPF_RESTRICT_NETWORK_INTERFACES] = "bpf-restrict-network-interfaces",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);
2375
CGroupMask get_cpu_accounting_mask(void) {
        /* Computed once, then cached for the lifetime of the process */
        static CGroupMask needed_mask = (CGroupMask) -1;

        /* On kernel ≥4.15 with unified hierarchy, cpu.stat's usage_usec is
         * provided externally from the CPU controller, which means we don't
         * need to enable the CPU controller just to get metrics. This is good,
         * because enabling the CPU controller comes at a minor performance
         * hit, especially when it's propagated deep into large hierarchies.
         * There's also no separate CPU accounting controller available within
         * a unified hierarchy.
         *
         * This combination of factors results in the desired cgroup mask to
         * enable for CPU accounting varying as follows:
         *
         *                   ╔═════════════════════╤═════════════════════╗
         *                   ║ Linux ≥4.15         │ Linux <4.15         ║
         *   ╔═══════════════╬═════════════════════╪═════════════════════╣
         *   ║ Unified       ║ nothing             │ CGROUP_MASK_CPU     ║
         *   ╟───────────────╫─────────────────────┼─────────────────────╢
         *   ║ Hybrid/Legacy ║ CGROUP_MASK_CPUACCT │ CGROUP_MASK_CPUACCT ║
         *   ╚═══════════════╩═════════════════════╧═════════════════════╝
         *
         * We check kernel version here instead of manually checking whether
         * cpu.stat is present for every cgroup, as that check in itself would
         * already be fairly expensive.
         *
         * Kernels where this patch has been backported will therefore have the
         * CPU controller enabled unnecessarily. This is more expensive than
         * necessary, but harmless. ☺️
         */

        if (needed_mask == (CGroupMask) -1) {
                /* NOTE(review): cg_all_unified() can return a negative errno, which this truthiness
                 * test treats like "unified" — presumably acceptable since an error only occurs before
                 * cgroupfs is mounted; confirm against callers. */
                if (cg_all_unified()) {
                        struct utsname u;
                        assert_se(uname(&u) >= 0);

                        if (strverscmp_improved(u.release, "4.15") < 0)
                                needed_mask = CGROUP_MASK_CPU;
                        else
                                needed_mask = 0;
                } else
                        needed_mask = CGROUP_MASK_CPUACCT;
        }

        return needed_mask;
}
2422
bool cpu_accounting_is_cheap(void) {
        /* CPU accounting is "cheap" when no controller has to be enabled for it */
        return get_cpu_accounting_mask() == 0;
}
2426
/* Modes for systemd-oomd managed OOM handling */
static const char* const managed_oom_mode_table[_MANAGED_OOM_MODE_MAX] = {
        [MANAGED_OOM_AUTO] = "auto",
        [MANAGED_OOM_KILL] = "kill",
};

DEFINE_STRING_TABLE_LOOKUP(managed_oom_mode, ManagedOOMMode);

/* Per-unit preference for how systemd-oomd should treat the unit when picking kill candidates */
static const char* const managed_oom_preference_table[_MANAGED_OOM_PREFERENCE_MAX] = {
        [MANAGED_OOM_PREFERENCE_NONE] = "none",
        [MANAGED_OOM_PREFERENCE_AVOID] = "avoid",
        [MANAGED_OOM_PREFERENCE_OMIT] = "omit",
};

DEFINE_STRING_TABLE_LOOKUP(managed_oom_preference, ManagedOOMPreference);