]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/basic/cgroup-util.c
830a63c1850a6bbc9d2414f0ac0fbe134c422a6f
[thirdparty/systemd.git] / src / basic / cgroup-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <dirent.h>
4 #include <errno.h>
5 #include <ftw.h>
6 #include <limits.h>
7 #include <signal.h>
8 #include <stddef.h>
9 #include <stdio_ext.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <sys/stat.h>
13 #include <sys/statfs.h>
14 #include <sys/types.h>
15 #include <sys/utsname.h>
16 #include <sys/xattr.h>
17 #include <unistd.h>
18
19 #include "alloc-util.h"
20 #include "cgroup-util.h"
21 #include "def.h"
22 #include "dirent-util.h"
23 #include "extract-word.h"
24 #include "fd-util.h"
25 #include "fileio.h"
26 #include "format-util.h"
27 #include "fs-util.h"
28 #include "log.h"
29 #include "login-util.h"
30 #include "macro.h"
31 #include "missing.h"
32 #include "mkdir.h"
33 #include "parse-util.h"
34 #include "path-util.h"
35 #include "proc-cmdline.h"
36 #include "process-util.h"
37 #include "set.h"
38 #include "special.h"
39 #include "stat-util.h"
40 #include "stdio-util.h"
41 #include "string-table.h"
42 #include "string-util.h"
43 #include "strv.h"
44 #include "unit-name.h"
45 #include "user-util.h"
46
47 int cg_enumerate_processes(const char *controller, const char *path, FILE **_f) {
48 _cleanup_free_ char *fs = NULL;
49 FILE *f;
50 int r;
51
52 assert(_f);
53
54 r = cg_get_path(controller, path, "cgroup.procs", &fs);
55 if (r < 0)
56 return r;
57
58 f = fopen(fs, "re");
59 if (!f)
60 return -errno;
61
62 *_f = f;
63 return 0;
64 }
65
66 int cg_read_pid(FILE *f, pid_t *_pid) {
67 unsigned long ul;
68
69 /* Note that the cgroup.procs might contain duplicates! See
70 * cgroups.txt for details. */
71
72 assert(f);
73 assert(_pid);
74
75 errno = 0;
76 if (fscanf(f, "%lu", &ul) != 1) {
77
78 if (feof(f))
79 return 0;
80
81 return errno > 0 ? -errno : -EIO;
82 }
83
84 if (ul <= 0)
85 return -EIO;
86
87 *_pid = (pid_t) ul;
88 return 1;
89 }
90
91 int cg_read_event(
92 const char *controller,
93 const char *path,
94 const char *event,
95 char **val) {
96
97 _cleanup_free_ char *events = NULL, *content = NULL;
98 char *p, *line;
99 int r;
100
101 r = cg_get_path(controller, path, "cgroup.events", &events);
102 if (r < 0)
103 return r;
104
105 r = read_full_file(events, &content, NULL);
106 if (r < 0)
107 return r;
108
109 p = content;
110 while ((line = strsep(&p, "\n"))) {
111 char *key;
112
113 key = strsep(&line, " ");
114 if (!key || !line)
115 return -EINVAL;
116
117 if (strcmp(key, event))
118 continue;
119
120 *val = strdup(line);
121 return 0;
122 }
123
124 return -ENOENT;
125 }
126
127 bool cg_ns_supported(void) {
128 static thread_local int enabled = -1;
129
130 if (enabled >= 0)
131 return enabled;
132
133 if (access("/proc/self/ns/cgroup", F_OK) < 0) {
134 if (errno != ENOENT)
135 log_debug_errno(errno, "Failed to check whether /proc/self/ns/cgroup is available, assuming not: %m");
136 enabled = false;
137 } else
138 enabled = true;
139
140 return enabled;
141 }
142
143 int cg_enumerate_subgroups(const char *controller, const char *path, DIR **_d) {
144 _cleanup_free_ char *fs = NULL;
145 int r;
146 DIR *d;
147
148 assert(_d);
149
150 /* This is not recursive! */
151
152 r = cg_get_path(controller, path, NULL, &fs);
153 if (r < 0)
154 return r;
155
156 d = opendir(fs);
157 if (!d)
158 return -errno;
159
160 *_d = d;
161 return 0;
162 }
163
164 int cg_read_subgroup(DIR *d, char **fn) {
165 struct dirent *de;
166
167 assert(d);
168 assert(fn);
169
170 FOREACH_DIRENT_ALL(de, d, return -errno) {
171 char *b;
172
173 if (de->d_type != DT_DIR)
174 continue;
175
176 if (dot_or_dot_dot(de->d_name))
177 continue;
178
179 b = strdup(de->d_name);
180 if (!b)
181 return -ENOMEM;
182
183 *fn = b;
184 return 1;
185 }
186
187 return 0;
188 }
189
190 int cg_rmdir(const char *controller, const char *path) {
191 _cleanup_free_ char *p = NULL;
192 int r;
193
194 r = cg_get_path(controller, path, NULL, &p);
195 if (r < 0)
196 return r;
197
198 r = rmdir(p);
199 if (r < 0 && errno != ENOENT)
200 return -errno;
201
202 r = cg_hybrid_unified();
203 if (r <= 0)
204 return r;
205
206 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
207 r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
208 if (r < 0)
209 log_warning_errno(r, "Failed to remove compat systemd cgroup %s: %m", path);
210 }
211
212 return 0;
213 }
214
/* Sends signal @sig to all processes in the cgroup @path (non-recursively).
 * Already-signalled PIDs are tracked in @s (or in a temporary set if @s is
 * NULL) so each process is signalled at most once; @log_kill, if non-NULL, is
 * invoked for every PID before it is signalled. Returns > 0 if at least one
 * process was signalled, 0 if none, negative errno-style code on failure. */
int cg_kill(
                const char *controller,
                const char *path,
                int sig,
                CGroupFlags flags,
                Set *s,
                cg_kill_log_func_t log_kill,
                void *userdata) {

        _cleanup_set_free_ Set *allocated_set = NULL;
        bool done = false;
        int r, ret = 0;
        pid_t my_pid;

        assert(sig >= 0);

        /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
         * SIGCONT on SIGKILL. */
        if (IN_SET(sig, SIGCONT, SIGKILL))
                flags &= ~CGROUP_SIGCONT;

        /* This goes through the tasks list and kills them all. This
         * is repeated until no further processes are added to the
         * tasks list, to properly handle forking processes */

        if (!s) {
                s = allocated_set = set_new(NULL);
                if (!s)
                        return -ENOMEM;
        }

        my_pid = getpid_cached();

        do {
                _cleanup_fclose_ FILE *f = NULL;
                pid_t pid = 0;
                done = true;

                r = cg_enumerate_processes(controller, path, &f);
                if (r < 0) {
                        /* A cgroup that vanished counts as fully processed */
                        if (ret >= 0 && r != -ENOENT)
                                return r;

                        return ret;
                }

                while ((r = cg_read_pid(f, &pid)) > 0) {

                        if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
                                continue;

                        /* Skip PIDs we already signalled in an earlier round */
                        if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
                                continue;

                        if (log_kill)
                                log_kill(pid, sig, userdata);

                        /* If we haven't killed this process yet, kill
                         * it */
                        if (kill(pid, sig) < 0) {
                                /* ESRCH: the process exited in the meantime — not an error */
                                if (ret >= 0 && errno != ESRCH)
                                        ret = -errno;
                        } else {
                                if (flags & CGROUP_SIGCONT)
                                        (void) kill(pid, SIGCONT);

                                if (ret == 0)
                                        ret = 1;
                        }

                        done = false;

                        r = set_put(s, PID_TO_PTR(pid));
                        if (r < 0) {
                                if (ret >= 0)
                                        return r;

                                return ret;
                        }
                }

                if (r < 0) {
                        if (ret >= 0)
                                return r;

                        return ret;
                }

                /* To avoid racing against processes which fork
                 * quicker than we can kill them we repeat this until
                 * no new pids need to be killed. */

        } while (!done);

        return ret;
}
311
/* Like cg_kill(), but also signals all processes in all child cgroups,
 * depth-first. With CGROUP_REMOVE in @flags the emptied cgroups are removed
 * afterwards (ENOENT/EBUSY during removal are tolerated). */
int cg_kill_recursive(
                const char *controller,
                const char *path,
                int sig,
                CGroupFlags flags,
                Set *s,
                cg_kill_log_func_t log_kill,
                void *userdata) {

        _cleanup_set_free_ Set *allocated_set = NULL;
        _cleanup_closedir_ DIR *d = NULL;
        int r, ret;
        char *fn;

        assert(path);
        assert(sig >= 0);

        /* The shared PID set guarantees each process is signalled only once
         * across the whole subtree */
        if (!s) {
                s = allocated_set = set_new(NULL);
                if (!s)
                        return -ENOMEM;
        }

        ret = cg_kill(controller, path, sig, flags, s, log_kill, userdata);

        r = cg_enumerate_subgroups(controller, path, &d);
        if (r < 0) {
                /* A cgroup that vanished counts as fully processed */
                if (ret >= 0 && r != -ENOENT)
                        return r;

                return ret;
        }

        while ((r = cg_read_subgroup(d, &fn)) > 0) {
                _cleanup_free_ char *p = NULL;

                p = strjoin(path, "/", fn);
                free(fn);
                if (!p)
                        return -ENOMEM;

                /* Recurse into the child; remember the first error but keep
                 * processing the remaining siblings */
                r = cg_kill_recursive(controller, p, sig, flags, s, log_kill, userdata);
                if (r != 0 && ret >= 0)
                        ret = r;
        }
        if (ret >= 0 && r < 0)
                ret = r;

        if (flags & CGROUP_REMOVE) {
                r = cg_rmdir(controller, path);
                if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
                        return r;
        }

        return ret;
}
368
/* Moves all processes from cgroup @pfrom of controller @cfrom into cgroup
 * @pto of controller @cto (non-recursively). Repeats until no new processes
 * show up, to cope with forking. Returns > 0 if at least one process was
 * migrated, 0 if none, negative errno-style code on failure. */
int cg_migrate(
                const char *cfrom,
                const char *pfrom,
                const char *cto,
                const char *pto,
                CGroupFlags flags) {

        bool done = false;
        _cleanup_set_free_ Set *s = NULL;
        int r, ret = 0;
        pid_t my_pid;

        assert(cfrom);
        assert(pfrom);
        assert(cto);
        assert(pto);

        /* Tracks PIDs we already moved, since cgroup.procs may list entries
         * repeatedly */
        s = set_new(NULL);
        if (!s)
                return -ENOMEM;

        my_pid = getpid_cached();

        do {
                _cleanup_fclose_ FILE *f = NULL;
                pid_t pid = 0;
                done = true;

                r = cg_enumerate_processes(cfrom, pfrom, &f);
                if (r < 0) {
                        /* A vanished source cgroup counts as fully migrated */
                        if (ret >= 0 && r != -ENOENT)
                                return r;

                        return ret;
                }

                while ((r = cg_read_pid(f, &pid)) > 0) {

                        /* This might do weird stuff if we aren't a
                         * single-threaded program. However, we
                         * luckily know we are not */
                        if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
                                continue;

                        if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
                                continue;

                        /* Ignore kernel threads. Since they can only
                         * exist in the root cgroup, we only check for
                         * them there. */
                        if (cfrom &&
                            empty_or_root(pfrom) &&
                            is_kernel_thread(pid) > 0)
                                continue;

                        r = cg_attach(cto, pto, pid);
                        if (r < 0) {
                                /* ESRCH: the process exited meanwhile — ignore */
                                if (ret >= 0 && r != -ESRCH)
                                        ret = r;
                        } else if (ret == 0)
                                ret = 1;

                        done = false;

                        r = set_put(s, PID_TO_PTR(pid));
                        if (r < 0) {
                                if (ret >= 0)
                                        return r;

                                return ret;
                        }
                }

                if (r < 0) {
                        if (ret >= 0)
                                return r;

                        return ret;
                }
        } while (!done);

        return ret;
}
452
/* Like cg_migrate(), but also moves the processes of all child cgroups,
 * depth-first. With CGROUP_REMOVE in @flags the emptied source cgroup is
 * removed afterwards (ENOENT/EBUSY during removal are tolerated). */
int cg_migrate_recursive(
                const char *cfrom,
                const char *pfrom,
                const char *cto,
                const char *pto,
                CGroupFlags flags) {

        _cleanup_closedir_ DIR *d = NULL;
        int r, ret = 0;
        char *fn;

        assert(cfrom);
        assert(pfrom);
        assert(cto);
        assert(pto);

        ret = cg_migrate(cfrom, pfrom, cto, pto, flags);

        r = cg_enumerate_subgroups(cfrom, pfrom, &d);
        if (r < 0) {
                /* A vanished source cgroup is not an error */
                if (ret >= 0 && r != -ENOENT)
                        return r;

                return ret;
        }

        while ((r = cg_read_subgroup(d, &fn)) > 0) {
                _cleanup_free_ char *p = NULL;

                p = strjoin(pfrom, "/", fn);
                free(fn);
                if (!p)
                        return -ENOMEM;

                /* Recurse; remember the first error but continue with the
                 * remaining siblings */
                r = cg_migrate_recursive(cfrom, p, cto, pto, flags);
                if (r != 0 && ret >= 0)
                        ret = r;
        }

        if (r < 0 && ret >= 0)
                ret = r;

        if (flags & CGROUP_REMOVE) {
                r = cg_rmdir(cfrom, pfrom);
                if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
                        return r;
        }

        return ret;
}
503
504 int cg_migrate_recursive_fallback(
505 const char *cfrom,
506 const char *pfrom,
507 const char *cto,
508 const char *pto,
509 CGroupFlags flags) {
510
511 int r;
512
513 assert(cfrom);
514 assert(pfrom);
515 assert(cto);
516 assert(pto);
517
518 r = cg_migrate_recursive(cfrom, pfrom, cto, pto, flags);
519 if (r < 0) {
520 char prefix[strlen(pto) + 1];
521
522 /* This didn't work? Then let's try all prefixes of the destination */
523
524 PATH_FOREACH_PREFIX(prefix, pto) {
525 int q;
526
527 q = cg_migrate_recursive(cfrom, pfrom, cto, prefix, flags);
528 if (q >= 0)
529 return q;
530 }
531 }
532
533 return r;
534 }
535
536 static const char *controller_to_dirname(const char *controller) {
537 const char *e;
538
539 assert(controller);
540
541 /* Converts a controller name to the directory name below
542 * /sys/fs/cgroup/ we want to mount it to. Effectively, this
543 * just cuts off the name= prefixed used for named
544 * hierarchies, if it is specified. */
545
546 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
547 if (cg_hybrid_unified() > 0)
548 controller = SYSTEMD_CGROUP_CONTROLLER_HYBRID;
549 else
550 controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
551 }
552
553 e = startswith(controller, "name=");
554 if (e)
555 return e;
556
557 return controller;
558 }
559
/* Builds "/sys/fs/cgroup/<dir>[/<path>][/<suffix>]" for a legacy (v1)
 * hierarchy, where <dir> is derived from the controller name. */
static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **fs) {
        const char *dn;
        char *joined;

        assert(fs);
        assert(controller);

        dn = controller_to_dirname(controller);

        if (isempty(path) && isempty(suffix))
                joined = strappend("/sys/fs/cgroup/", dn);
        else if (isempty(path))
                joined = strjoin("/sys/fs/cgroup/", dn, "/", suffix);
        else if (isempty(suffix))
                joined = strjoin("/sys/fs/cgroup/", dn, "/", path);
        else
                joined = strjoin("/sys/fs/cgroup/", dn, "/", path, "/", suffix);
        if (!joined)
                return -ENOMEM;

        *fs = joined;
        return 0;
}
583
/* Builds "/sys/fs/cgroup[/<path>][/<suffix>]" for the unified (v2) hierarchy,
 * where all controllers live under a single mount point. */
static int join_path_unified(const char *path, const char *suffix, char **fs) {
        char *joined;

        assert(fs);

        if (isempty(path) && isempty(suffix))
                joined = strdup("/sys/fs/cgroup");
        else if (isempty(path))
                joined = strappend("/sys/fs/cgroup/", suffix);
        else if (isempty(suffix))
                joined = strappend("/sys/fs/cgroup/", path);
        else
                joined = strjoin("/sys/fs/cgroup/", path, "/", suffix);
        if (!joined)
                return -ENOMEM;

        *fs = joined;
        return 0;
}
603
/* Builds the filesystem path of the specified cgroup (and optional attribute
 * file @suffix), picking the unified or legacy layout as appropriate. With a
 * NULL @controller, only the normalized path below the controllers is
 * returned. The result is stored in *fs (caller frees). */
int cg_get_path(const char *controller, const char *path, const char *suffix, char **fs) {
        int r;

        assert(fs);

        if (!controller) {
                char *t;

                /* If no controller is specified, we return the path
                 * *below* the controllers, without any prefix. */

                if (!path && !suffix)
                        return -EINVAL;

                if (!suffix)
                        t = strdup(path);
                else if (!path)
                        t = strdup(suffix);
                else
                        t = strjoin(path, "/", suffix);
                if (!t)
                        return -ENOMEM;

                *fs = path_simplify(t, false);
                return 0;
        }

        if (!cg_controller_is_valid(controller))
                return -EINVAL;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                r = join_path_unified(path, suffix, fs);
        else
                r = join_path_legacy(controller, path, suffix, fs);
        if (r < 0)
                return r;

        /* Collapse duplicate slashes and the like, in place */
        path_simplify(*fs, false);
        return 0;
}
647
/* Returns 0 if the specified controller is accessible, negative errno-style
 * code otherwise (e.g. -EOPNOTSUPP for named hierarchies on unified). */
static int controller_is_accessible(const char *controller) {
        int r;

        assert(controller);

        /* Checks whether a specific controller is accessible,
         * i.e. its hierarchy mounted. In the unified hierarchy all
         * controllers are considered accessible, except for the named
         * hierarchies */

        if (!cg_controller_is_valid(controller))
                return -EINVAL;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                /* We don't support named hierarchies if we are using
                 * the unified hierarchy. */

                if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
                        return 0;

                if (startswith(controller, "name="))
                        return -EOPNOTSUPP;

        } else {
                const char *cc, *dn;

                /* Legacy: probe whether the per-controller mount point exists */
                dn = controller_to_dirname(controller);
                cc = strjoina("/sys/fs/cgroup/", dn);

                if (laccess(cc, F_OK) < 0)
                        return -errno;
        }

        return 0;
}
686
/* Same as cg_get_path(), but first verifies that the controller's hierarchy
 * is actually mounted and accessible. */
int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **fs) {
        int r;

        assert(controller);
        assert(fs);

        r = controller_is_accessible(controller);
        if (r < 0)
                return r;

        return cg_get_path(controller, path, suffix, fs);
}
700
701 static int trim_cb(const char *path, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
702 assert(path);
703 assert(sb);
704 assert(ftwbuf);
705
706 if (typeflag != FTW_DP)
707 return 0;
708
709 if (ftwbuf->level < 1)
710 return 0;
711
712 (void) rmdir(path);
713 return 0;
714 }
715
/* Recursively removes all empty subgroups of the specified cgroup, and the
 * cgroup itself too if @delete_root is true. On a hybrid setup the compat copy
 * in the legacy hierarchy is trimmed as well (best effort). */
int cg_trim(const char *controller, const char *path, bool delete_root) {
        _cleanup_free_ char *fs = NULL;
        int r = 0, q;

        assert(path);

        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        errno = 0;
        /* FTW_DEPTH walks post-order, so children are removed before their
         * parents; trim_cb() ignores rmdir() failures on non-empty groups */
        if (nftw(fs, trim_cb, 64, FTW_DEPTH|FTW_MOUNT|FTW_PHYS) != 0) {
                if (errno == ENOENT)
                        r = 0;
                else if (errno > 0)
                        r = -errno;
                else
                        r = -EIO;
        }

        if (delete_root) {
                if (rmdir(fs) < 0 && errno != ENOENT)
                        return -errno;
        }

        q = cg_hybrid_unified();
        if (q < 0)
                return q;
        if (q > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                q = cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, delete_root);
                if (q < 0)
                        log_warning_errno(q, "Failed to trim compat systemd cgroup %s: %m", path);
        }

        return r;
}
752
753 /* Create a cgroup in the hierarchy of controller.
754 * Returns 0 if the group already existed, 1 on success, negative otherwise.
755 */
756 int cg_create(const char *controller, const char *path) {
757 _cleanup_free_ char *fs = NULL;
758 int r;
759
760 r = cg_get_path_and_check(controller, path, NULL, &fs);
761 if (r < 0)
762 return r;
763
764 r = mkdir_parents(fs, 0755);
765 if (r < 0)
766 return r;
767
768 r = mkdir_errno_wrapper(fs, 0755);
769 if (r == -EEXIST)
770 return 0;
771 if (r < 0)
772 return r;
773
774 r = cg_hybrid_unified();
775 if (r < 0)
776 return r;
777
778 if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
779 r = cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
780 if (r < 0)
781 log_warning_errno(r, "Failed to create compat systemd cgroup %s: %m", path);
782 }
783
784 return 1;
785 }
786
/* Creates the specified cgroup (if necessary) and moves @pid into it. Note
 * that the group is not removed again if attaching fails. Returns the result
 * of the creation step (0 = existed, 1 = created) on success. */
int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
        int created, q;

        assert(pid >= 0);

        created = cg_create(controller, path);
        if (created < 0)
                return created;

        q = cg_attach(controller, path, pid);
        if (q < 0)
                return q;

        return created;
}
803
804 int cg_attach(const char *controller, const char *path, pid_t pid) {
805 _cleanup_free_ char *fs = NULL;
806 char c[DECIMAL_STR_MAX(pid_t) + 2];
807 int r;
808
809 assert(path);
810 assert(pid >= 0);
811
812 r = cg_get_path_and_check(controller, path, "cgroup.procs", &fs);
813 if (r < 0)
814 return r;
815
816 if (pid == 0)
817 pid = getpid_cached();
818
819 xsprintf(c, PID_FMT "\n", pid);
820
821 r = write_string_file(fs, c, WRITE_STRING_FILE_DISABLE_BUFFER);
822 if (r < 0)
823 return r;
824
825 r = cg_hybrid_unified();
826 if (r < 0)
827 return r;
828
829 if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
830 r = cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, pid);
831 if (r < 0)
832 log_warning_errno(r, "Failed to attach "PID_FMT" to compat systemd cgroup %s: %m", pid, path);
833 }
834
835 return 0;
836 }
837
838 int cg_attach_fallback(const char *controller, const char *path, pid_t pid) {
839 int r;
840
841 assert(controller);
842 assert(path);
843 assert(pid >= 0);
844
845 r = cg_attach(controller, path, pid);
846 if (r < 0) {
847 char prefix[strlen(path) + 1];
848
849 /* This didn't work? Then let's try all prefixes of
850 * the destination */
851
852 PATH_FOREACH_PREFIX(prefix, path) {
853 int q;
854
855 q = cg_attach(controller, prefix, pid);
856 if (q >= 0)
857 return q;
858 }
859 }
860
861 return r;
862 }
863
/* Changes ownership (and fixes up the mode) of the specified cgroup and its
 * essential attribute files, so that an unprivileged user can manage it.
 * Attributes marked non-fatal may legitimately be missing (kernel version
 * dependent) and failures on them are only logged. */
int cg_set_access(
                const char *controller,
                const char *path,
                uid_t uid,
                gid_t gid) {

        struct Attribute {
                const char *name;   /* attribute file below the cgroup dir */
                bool fatal;         /* propagate chown/chmod failure? */
        };

        /* cgroupsv1, aka legacy/non-unified */
        static const struct Attribute legacy_attributes[] = {
                { "cgroup.procs", true },
                { "tasks", false },
                { "cgroup.clone_children", false },
                {},
        };

        /* cgroupsv2, aka unified */
        static const struct Attribute unified_attributes[] = {
                { "cgroup.procs", true },
                { "cgroup.subtree_control", true },
                { "cgroup.threads", false },
                {},
        };

        /* Indexed by the boolean result of cg_unified_controller() */
        static const struct Attribute* const attributes[] = {
                [false] = legacy_attributes,
                [true] = unified_attributes,
        };

        _cleanup_free_ char *fs = NULL;
        const struct Attribute *i;
        int r, unified;

        assert(path);

        if (uid == UID_INVALID && gid == GID_INVALID)
                return 0;

        unified = cg_unified_controller(controller);
        if (unified < 0)
                return unified;

        /* Configure access to the cgroup itself */
        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        r = chmod_and_chown(fs, 0755, uid, gid);
        if (r < 0)
                return r;

        /* Configure access to the cgroup's attributes */
        for (i = attributes[unified]; i->name; i++) {
                fs = mfree(fs);

                r = cg_get_path(controller, path, i->name, &fs);
                if (r < 0)
                        return r;

                r = chmod_and_chown(fs, 0644, uid, gid);
                if (r < 0) {
                        if (i->fatal)
                                return r;

                        log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", fs);
                }
        }

        if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                r = cg_hybrid_unified();
                if (r < 0)
                        return r;
                if (r > 0) {
                        /* Always propagate access mode from unified to legacy controller */
                        r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, uid, gid);
                        if (r < 0)
                                log_debug_errno(r, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path);
                }
        }

        return 0;
}
949
950 int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags) {
951 _cleanup_free_ char *fs = NULL;
952 int r;
953
954 assert(path);
955 assert(name);
956 assert(value || size <= 0);
957
958 r = cg_get_path(controller, path, NULL, &fs);
959 if (r < 0)
960 return r;
961
962 if (setxattr(fs, name, value, size, flags) < 0)
963 return -errno;
964
965 return 0;
966 }
967
968 int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size) {
969 _cleanup_free_ char *fs = NULL;
970 ssize_t n;
971 int r;
972
973 assert(path);
974 assert(name);
975
976 r = cg_get_path(controller, path, NULL, &fs);
977 if (r < 0)
978 return r;
979
980 n = getxattr(fs, name, value, size);
981 if (n < 0)
982 return -errno;
983
984 return (int) n;
985 }
986
/* Determines the cgroup path of the specified process in the hierarchy of the
 * given controller (or the systemd controller, if @controller is NULL), by
 * parsing /proc/<pid>/cgroup. The path is stored in *path (caller frees).
 * Returns -ESRCH if the process does not exist, -ENODATA if no matching
 * hierarchy entry was found. */
int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
        _cleanup_fclose_ FILE *f = NULL;
        const char *fs, *controller_str;
        int unified, r;
        size_t cs = 0;

        assert(path);
        assert(pid >= 0);

        if (controller) {
                if (!cg_controller_is_valid(controller))
                        return -EINVAL;
        } else
                controller = SYSTEMD_CGROUP_CONTROLLER;

        unified = cg_unified_controller(controller);
        if (unified < 0)
                return unified;
        if (unified == 0) {
                /* Legacy: we need to match the controller name in the second
                 * field of each /proc/<pid>/cgroup line */
                if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
                        controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
                else
                        controller_str = controller;

                cs = strlen(controller_str);
        }

        fs = procfs_file_alloca(pid, "cgroup");
        f = fopen(fs, "re");
        if (!f)
                return errno == ENOENT ? -ESRCH : -errno;

        (void) __fsetlocking(f, FSETLOCKING_BYCALLER);

        for (;;) {
                _cleanup_free_ char *line = NULL;
                char *e, *p;

                r = read_line(f, LONG_LINE_MAX, &line);
                if (r < 0)
                        return r;
                if (r == 0)
                        break;

                if (unified) {
                        /* Unified: the one relevant line is "0::<path>" */
                        e = startswith(line, "0:");
                        if (!e)
                                continue;

                        e = strchr(e, ':');
                        if (!e)
                                continue;
                } else {
                        char *l;
                        size_t k;
                        const char *word, *state;
                        bool found = false;

                        /* Legacy: lines are "<id>:<ctrl1>,<ctrl2>,…:<path>";
                         * look for our controller in the comma-separated list */
                        l = strchr(line, ':');
                        if (!l)
                                continue;

                        l++;
                        e = strchr(l, ':');
                        if (!e)
                                continue;

                        *e = 0;
                        FOREACH_WORD_SEPARATOR(word, k, l, ",", state)
                                if (k == cs && memcmp(word, controller_str, cs) == 0) {
                                        found = true;
                                        break;
                                }
                        if (!found)
                                continue;
                }

                p = strdup(e + 1);
                if (!p)
                        return -ENOMEM;

                /* Truncate suffix indicating the process is a zombie */
                e = endswith(p, " (deleted)");
                if (e)
                        *e = 0;

                *path = p;
                return 0;
        }

        return -ENODATA;
}
1079
/* Installs @agent as release agent of the specified (legacy) hierarchy and
 * enables release notification. Returns 1 if notify_on_release had to be
 * turned on, 0 if everything was already configured, -EEXIST if a different
 * agent is installed, -EOPNOTSUPP on the unified hierarchy. */
int cg_install_release_agent(const char *controller, const char *agent) {
        _cleanup_free_ char *fs = NULL, *contents = NULL;
        const char *sc;
        int r;

        assert(agent);

        r = cg_unified_controller(controller);
        if (r < 0)
                return r;
        if (r > 0) /* doesn't apply to unified hierarchy */
                return -EOPNOTSUPP;

        r = cg_get_path(controller, NULL, "release_agent", &fs);
        if (r < 0)
                return r;

        r = read_one_line_file(fs, &contents);
        if (r < 0)
                return r;

        /* Install the agent only if none is configured yet; an already
         * matching agent is fine, a different one is an error */
        sc = strstrip(contents);
        if (isempty(sc)) {
                r = write_string_file(fs, agent, WRITE_STRING_FILE_DISABLE_BUFFER);
                if (r < 0)
                        return r;
        } else if (!path_equal(sc, agent))
                return -EEXIST;

        fs = mfree(fs);
        r = cg_get_path(controller, NULL, "notify_on_release", &fs);
        if (r < 0)
                return r;

        contents = mfree(contents);
        r = read_one_line_file(fs, &contents);
        if (r < 0)
                return r;

        sc = strstrip(contents);
        if (streq(sc, "0")) {
                r = write_string_file(fs, "1", WRITE_STRING_FILE_DISABLE_BUFFER);
                if (r < 0)
                        return r;

                return 1;
        }

        /* Anything other than "0" or "1" means the attribute is in an
         * unexpected state */
        if (!streq(sc, "1"))
                return -EIO;

        return 0;
}
1133
1134 int cg_uninstall_release_agent(const char *controller) {
1135 _cleanup_free_ char *fs = NULL;
1136 int r;
1137
1138 r = cg_unified_controller(controller);
1139 if (r < 0)
1140 return r;
1141 if (r > 0) /* Doesn't apply to unified hierarchy */
1142 return -EOPNOTSUPP;
1143
1144 r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1145 if (r < 0)
1146 return r;
1147
1148 r = write_string_file(fs, "0", WRITE_STRING_FILE_DISABLE_BUFFER);
1149 if (r < 0)
1150 return r;
1151
1152 fs = mfree(fs);
1153
1154 r = cg_get_path(controller, NULL, "release_agent", &fs);
1155 if (r < 0)
1156 return r;
1157
1158 r = write_string_file(fs, "", WRITE_STRING_FILE_DISABLE_BUFFER);
1159 if (r < 0)
1160 return r;
1161
1162 return 0;
1163 }
1164
1165 int cg_is_empty(const char *controller, const char *path) {
1166 _cleanup_fclose_ FILE *f = NULL;
1167 pid_t pid;
1168 int r;
1169
1170 assert(path);
1171
1172 r = cg_enumerate_processes(controller, path, &f);
1173 if (r == -ENOENT)
1174 return true;
1175 if (r < 0)
1176 return r;
1177
1178 r = cg_read_pid(f, &pid);
1179 if (r < 0)
1180 return r;
1181
1182 return r == 0;
1183 }
1184
/* Returns > 0 if the specified cgroup and all of its children contain no
 * processes, 0 otherwise, negative errno-style code on error. */
int cg_is_empty_recursive(const char *controller, const char *path) {
        int r;

        assert(path);

        /* The root cgroup is always populated */
        if (controller && empty_or_root(path))
                return false;

        r = cg_unified_controller(controller);
        if (r < 0)
                return r;
        if (r > 0) {
                _cleanup_free_ char *t = NULL;

                /* On the unified hierarchy we can check empty state
                 * via the "populated" attribute of "cgroup.events". */

                r = cg_read_event(controller, path, "populated", &t);
                if (r == -ENOENT)
                        return true;
                if (r < 0)
                        return r;

                return streq(t, "0");
        } else {
                _cleanup_closedir_ DIR *d = NULL;
                char *fn;

                /* Legacy: check this group, then recurse into every child */
                r = cg_is_empty(controller, path);
                if (r <= 0)
                        return r;

                r = cg_enumerate_subgroups(controller, path, &d);
                if (r == -ENOENT)
                        return true;
                if (r < 0)
                        return r;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn);
                        free(fn);
                        if (!p)
                                return -ENOMEM;

                        /* Stop at the first non-empty (or failing) child */
                        r = cg_is_empty_recursive(controller, p);
                        if (r <= 0)
                                return r;
                }
                if (r < 0)
                        return r;

                return true;
        }
}
1242
/* Splits a cgroup specification into controller and path. Accepted forms:
 * an absolute path (controller is NULL), a bare controller name (path is
 * NULL), or "controller:path". Either output pointer may be NULL if the
 * caller is not interested; returned strings are freshly allocated. */
int cg_split_spec(const char *spec, char **controller, char **path) {
        char *t = NULL, *u = NULL;
        const char *e;

        assert(spec);

        if (*spec == '/') {
                /* Plain absolute path, no controller part */
                if (!path_is_normalized(spec))
                        return -EINVAL;

                if (path) {
                        t = strdup(spec);
                        if (!t)
                                return -ENOMEM;

                        *path = path_simplify(t, false);
                }

                if (controller)
                        *controller = NULL;

                return 0;
        }

        e = strchr(spec, ':');
        if (!e) {
                /* No colon: the whole spec is a controller name */
                if (!cg_controller_is_valid(spec))
                        return -EINVAL;

                if (controller) {
                        t = strdup(spec);
                        if (!t)
                                return -ENOMEM;

                        *controller = t;
                }

                if (path)
                        *path = NULL;

                return 0;
        }

        /* "controller:path" form; validate both halves */
        t = strndup(spec, e-spec);
        if (!t)
                return -ENOMEM;
        if (!cg_controller_is_valid(t)) {
                free(t);
                return -EINVAL;
        }

        if (isempty(e+1))
                u = NULL;
        else {
                u = strdup(e+1);
                if (!u) {
                        free(t);
                        return -ENOMEM;
                }

                if (!path_is_normalized(u) ||
                    !path_is_absolute(u)) {
                        free(t);
                        free(u);
                        return -EINVAL;
                }

                path_simplify(u, false);
        }

        /* Hand over ownership only for the outputs the caller asked for */
        if (controller)
                *controller = t;
        else
                free(t);

        if (path)
                *path = u;
        else
                free(u);

        return 0;
}
1325
1326 int cg_mangle_path(const char *path, char **result) {
1327 _cleanup_free_ char *c = NULL, *p = NULL;
1328 char *t;
1329 int r;
1330
1331 assert(path);
1332 assert(result);
1333
1334 /* First, check if it already is a filesystem path */
1335 if (path_startswith(path, "/sys/fs/cgroup")) {
1336
1337 t = strdup(path);
1338 if (!t)
1339 return -ENOMEM;
1340
1341 *result = path_simplify(t, false);
1342 return 0;
1343 }
1344
1345 /* Otherwise, treat it as cg spec */
1346 r = cg_split_spec(path, &c, &p);
1347 if (r < 0)
1348 return r;
1349
1350 return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, result);
1351 }
1352
1353 int cg_get_root_path(char **path) {
1354 char *p, *e;
1355 int r;
1356
1357 assert(path);
1358
1359 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &p);
1360 if (r < 0)
1361 return r;
1362
1363 e = endswith(p, "/" SPECIAL_INIT_SCOPE);
1364 if (!e)
1365 e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */
1366 if (!e)
1367 e = endswith(p, "/system"); /* even more legacy */
1368 if (e)
1369 *e = 0;
1370
1371 *path = p;
1372 return 0;
1373 }
1374
1375 int cg_shift_path(const char *cgroup, const char *root, const char **shifted) {
1376 _cleanup_free_ char *rt = NULL;
1377 char *p;
1378 int r;
1379
1380 assert(cgroup);
1381 assert(shifted);
1382
1383 if (!root) {
1384 /* If the root was specified let's use that, otherwise
1385 * let's determine it from PID 1 */
1386
1387 r = cg_get_root_path(&rt);
1388 if (r < 0)
1389 return r;
1390
1391 root = rt;
1392 }
1393
1394 p = path_startswith(cgroup, root);
1395 if (p && p > cgroup)
1396 *shifted = p - 1;
1397 else
1398 *shifted = cgroup;
1399
1400 return 0;
1401 }
1402
1403 int cg_pid_get_path_shifted(pid_t pid, const char *root, char **cgroup) {
1404 _cleanup_free_ char *raw = NULL;
1405 const char *c;
1406 int r;
1407
1408 assert(pid >= 0);
1409 assert(cgroup);
1410
1411 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &raw);
1412 if (r < 0)
1413 return r;
1414
1415 r = cg_shift_path(raw, root, &c);
1416 if (r < 0)
1417 return r;
1418
1419 if (c == raw)
1420 *cgroup = TAKE_PTR(raw);
1421 else {
1422 char *n;
1423
1424 n = strdup(c);
1425 if (!n)
1426 return -ENOMEM;
1427
1428 *cgroup = n;
1429 }
1430
1431 return 0;
1432 }
1433
1434 int cg_path_decode_unit(const char *cgroup, char **unit) {
1435 char *c, *s;
1436 size_t n;
1437
1438 assert(cgroup);
1439 assert(unit);
1440
1441 n = strcspn(cgroup, "/");
1442 if (n < 3)
1443 return -ENXIO;
1444
1445 c = strndupa(cgroup, n);
1446 c = cg_unescape(c);
1447
1448 if (!unit_name_is_valid(c, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
1449 return -ENXIO;
1450
1451 s = strdup(c);
1452 if (!s)
1453 return -ENOMEM;
1454
1455 *unit = s;
1456 return 0;
1457 }
1458
/* Checks whether the first @n bytes of @p name a valid slice unit, i.e. a
 * string ending in ".slice" that unescapes to a valid plain unit name. */
static bool valid_slice_name(const char *p, size_t n) {

        if (!p)
                return false;

        /* Shortest possible slice name is "x.slice" */
        if (n < STRLEN("x.slice"))
                return false;

        if (memcmp(p + n - 6, ".slice", 6) == 0) {
                char buf[n+1], *c;

                /* NUL-terminate the component so it can be unescaped/validated */
                memcpy(buf, p, n);
                buf[n] = 0;

                c = cg_unescape(buf);

                return unit_name_is_valid(c, UNIT_NAME_PLAIN);
        }

        return false;
}
1480
/* Skips over all leading slice unit components of a cgroup path, returning a
 * pointer to the first non-slice component (or the trailing NUL). */
static const char *skip_slices(const char *p) {
        size_t n;

        assert(p);

        p += strspn(p, "/");
        while (valid_slice_name(p, (n = strcspn(p, "/")))) {
                p += n;
                p += strspn(p, "/");
        }

        return p;
}
1498
/* Extracts the unit name from a cgroup path: skips all leading slices and
 * decodes the next component. Slices themselves are rejected with -ENXIO. */
int cg_path_get_unit(const char *path, char **ret) {
        const char *after_slices;
        char *unit;
        int r;

        assert(path);
        assert(ret);

        after_slices = skip_slices(path);

        r = cg_path_decode_unit(after_slices, &unit);
        if (r < 0)
                return r;

        /* We skipped over the slices, don't accept any now */
        if (endswith(unit, ".slice")) {
                free(unit);
                return -ENXIO;
        }

        *ret = unit;
        return 0;
}
1522
1523 int cg_pid_get_unit(pid_t pid, char **unit) {
1524 _cleanup_free_ char *cgroup = NULL;
1525 int r;
1526
1527 assert(unit);
1528
1529 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1530 if (r < 0)
1531 return r;
1532
1533 return cg_path_get_unit(cgroup, unit);
1534 }
1535
1536 /**
1537 * Skip session-*.scope, but require it to be there.
1538 */
1539 static const char *skip_session(const char *p) {
1540 size_t n;
1541
1542 if (isempty(p))
1543 return NULL;
1544
1545 p += strspn(p, "/");
1546
1547 n = strcspn(p, "/");
1548 if (n < STRLEN("session-x.scope"))
1549 return NULL;
1550
1551 if (memcmp(p, "session-", 8) == 0 && memcmp(p + n - 6, ".scope", 6) == 0) {
1552 char buf[n - 8 - 6 + 1];
1553
1554 memcpy(buf, p + 8, n - 8 - 6);
1555 buf[n - 8 - 6] = 0;
1556
1557 /* Note that session scopes never need unescaping,
1558 * since they cannot conflict with the kernel's own
1559 * names, hence we don't need to call cg_unescape()
1560 * here. */
1561
1562 if (!session_id_valid(buf))
1563 return false;
1564
1565 p += n;
1566 p += strspn(p, "/");
1567 return p;
1568 }
1569
1570 return NULL;
1571 }
1572
1573 /**
1574 * Skip user@*.service, but require it to be there.
1575 */
1576 static const char *skip_user_manager(const char *p) {
1577 size_t n;
1578
1579 if (isempty(p))
1580 return NULL;
1581
1582 p += strspn(p, "/");
1583
1584 n = strcspn(p, "/");
1585 if (n < STRLEN("user@x.service"))
1586 return NULL;
1587
1588 if (memcmp(p, "user@", 5) == 0 && memcmp(p + n - 8, ".service", 8) == 0) {
1589 char buf[n - 5 - 8 + 1];
1590
1591 memcpy(buf, p + 5, n - 5 - 8);
1592 buf[n - 5 - 8] = 0;
1593
1594 /* Note that user manager services never need unescaping,
1595 * since they cannot conflict with the kernel's own
1596 * names, hence we don't need to call cg_unescape()
1597 * here. */
1598
1599 if (parse_uid(buf, NULL) < 0)
1600 return NULL;
1601
1602 p += n;
1603 p += strspn(p, "/");
1604
1605 return p;
1606 }
1607
1608 return NULL;
1609 }
1610
static const char *skip_user_prefix(const char *path) {
        const char *e, *t;

        assert(path);

        /* First get past any slice units. */
        e = skip_slices(path);

        /* Then try the user manager service, falling back to a session scope. */
        t = skip_user_manager(e);
        return t ?: skip_session(e);
}
1627
1628 int cg_path_get_user_unit(const char *path, char **ret) {
1629 const char *t;
1630
1631 assert(path);
1632 assert(ret);
1633
1634 t = skip_user_prefix(path);
1635 if (!t)
1636 return -ENXIO;
1637
1638 /* And from here on it looks pretty much the same as for a
1639 * system unit, hence let's use the same parser from here
1640 * on. */
1641 return cg_path_get_unit(t, ret);
1642 }
1643
1644 int cg_pid_get_user_unit(pid_t pid, char **unit) {
1645 _cleanup_free_ char *cgroup = NULL;
1646 int r;
1647
1648 assert(unit);
1649
1650 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1651 if (r < 0)
1652 return r;
1653
1654 return cg_path_get_user_unit(cgroup, unit);
1655 }
1656
1657 int cg_path_get_machine_name(const char *path, char **machine) {
1658 _cleanup_free_ char *u = NULL;
1659 const char *sl;
1660 int r;
1661
1662 r = cg_path_get_unit(path, &u);
1663 if (r < 0)
1664 return r;
1665
1666 sl = strjoina("/run/systemd/machines/unit:", u);
1667 return readlink_malloc(sl, machine);
1668 }
1669
1670 int cg_pid_get_machine_name(pid_t pid, char **machine) {
1671 _cleanup_free_ char *cgroup = NULL;
1672 int r;
1673
1674 assert(machine);
1675
1676 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1677 if (r < 0)
1678 return r;
1679
1680 return cg_path_get_machine_name(cgroup, machine);
1681 }
1682
1683 int cg_path_get_session(const char *path, char **session) {
1684 _cleanup_free_ char *unit = NULL;
1685 char *start, *end;
1686 int r;
1687
1688 assert(path);
1689
1690 r = cg_path_get_unit(path, &unit);
1691 if (r < 0)
1692 return r;
1693
1694 start = startswith(unit, "session-");
1695 if (!start)
1696 return -ENXIO;
1697 end = endswith(start, ".scope");
1698 if (!end)
1699 return -ENXIO;
1700
1701 *end = 0;
1702 if (!session_id_valid(start))
1703 return -ENXIO;
1704
1705 if (session) {
1706 char *rr;
1707
1708 rr = strdup(start);
1709 if (!rr)
1710 return -ENOMEM;
1711
1712 *session = rr;
1713 }
1714
1715 return 0;
1716 }
1717
1718 int cg_pid_get_session(pid_t pid, char **session) {
1719 _cleanup_free_ char *cgroup = NULL;
1720 int r;
1721
1722 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1723 if (r < 0)
1724 return r;
1725
1726 return cg_path_get_session(cgroup, session);
1727 }
1728
1729 int cg_path_get_owner_uid(const char *path, uid_t *uid) {
1730 _cleanup_free_ char *slice = NULL;
1731 char *start, *end;
1732 int r;
1733
1734 assert(path);
1735
1736 r = cg_path_get_slice(path, &slice);
1737 if (r < 0)
1738 return r;
1739
1740 start = startswith(slice, "user-");
1741 if (!start)
1742 return -ENXIO;
1743 end = endswith(start, ".slice");
1744 if (!end)
1745 return -ENXIO;
1746
1747 *end = 0;
1748 if (parse_uid(start, uid) < 0)
1749 return -ENXIO;
1750
1751 return 0;
1752 }
1753
1754 int cg_pid_get_owner_uid(pid_t pid, uid_t *uid) {
1755 _cleanup_free_ char *cgroup = NULL;
1756 int r;
1757
1758 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1759 if (r < 0)
1760 return r;
1761
1762 return cg_path_get_owner_uid(cgroup, uid);
1763 }
1764
int cg_path_get_slice(const char *p, char **slice) {
        const char *e = NULL;

        assert(p);
        assert(slice);

        /* Finds the right-most slice unit from the beginning, but
         * stops before we come to the first non-slice unit. */

        for (;;) {
                size_t n;

                /* Skip path separators, then measure the next component. */
                p += strspn(p, "/");

                n = strcspn(p, "/");
                if (!valid_slice_name(p, n)) {

                        /* No slice seen at all so far? Then the path is in
                         * the root slice. */
                        if (!e) {
                                char *s;

                                s = strdup(SPECIAL_ROOT_SLICE);
                                if (!s)
                                        return -ENOMEM;

                                *slice = s;
                                return 0;
                        }

                        /* 'e' points at the innermost slice component we
                         * encountered; decode it into a unit name. */
                        return cg_path_decode_unit(e, slice);
                }

                /* Remember this slice and keep scanning deeper. */
                e = p;
                p += n;
        }
}
1800
1801 int cg_pid_get_slice(pid_t pid, char **slice) {
1802 _cleanup_free_ char *cgroup = NULL;
1803 int r;
1804
1805 assert(slice);
1806
1807 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1808 if (r < 0)
1809 return r;
1810
1811 return cg_path_get_slice(cgroup, slice);
1812 }
1813
1814 int cg_path_get_user_slice(const char *p, char **slice) {
1815 const char *t;
1816 assert(p);
1817 assert(slice);
1818
1819 t = skip_user_prefix(p);
1820 if (!t)
1821 return -ENXIO;
1822
1823 /* And now it looks pretty much the same as for a system
1824 * slice, so let's just use the same parser from here on. */
1825 return cg_path_get_slice(t, slice);
1826 }
1827
1828 int cg_pid_get_user_slice(pid_t pid, char **slice) {
1829 _cleanup_free_ char *cgroup = NULL;
1830 int r;
1831
1832 assert(slice);
1833
1834 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1835 if (r < 0)
1836 return r;
1837
1838 return cg_path_get_user_slice(cgroup, slice);
1839 }
1840
char *cg_escape(const char *p) {
        bool need_prefix = false;

        /* This implements very minimal escaping for names to be used
         * as file names in the cgroup tree: any name which might
         * conflict with a kernel name or is prefixed with '_' is
         * prefixed with a '_'. That way, when reading cgroup names it
         * is sufficient to remove a single prefixing underscore if
         * there is one. */

        /* The return value of this function (unlike cg_unescape())
         * needs free()! */

        if (IN_SET(p[0], 0, '_', '.') ||
            STR_IN_SET(p, "notify_on_release", "release_agent", "tasks") ||
            startswith(p, "cgroup."))
                need_prefix = true;
        else {
                const char *dot;

                /* Also escape names of the form "<controller>.<anything>",
                 * since those could collide with per-controller attribute
                 * files in the cgroup directory. */
                dot = strrchr(p, '.');
                if (dot) {
                        CGroupController c;
                        size_t l = dot - p;

                        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                                const char *n;

                                n = cgroup_controller_to_string(c);

                                if (l != strlen(n))
                                        continue;

                                if (memcmp(p, n, l) != 0)
                                        continue;

                                need_prefix = true;
                                break;
                        }
                }
        }

        if (need_prefix)
                return strappend("_", p);

        return strdup(p);
}
1888
char *cg_unescape(const char *p) {
        assert(p);

        /* Undoes cg_escape(): drops a single leading '_' if present. Unlike
         * cg_escape() the result points into the input string and must not
         * be free()d. */

        return (char*) (p[0] == '_' ? p + 1 : p);
}
1900
1901 #define CONTROLLER_VALID \
1902 DIGITS LETTERS \
1903 "_"
1904
1905 bool cg_controller_is_valid(const char *p) {
1906 const char *t, *s;
1907
1908 if (!p)
1909 return false;
1910
1911 if (streq(p, SYSTEMD_CGROUP_CONTROLLER))
1912 return true;
1913
1914 s = startswith(p, "name=");
1915 if (s)
1916 p = s;
1917
1918 if (IN_SET(*p, 0, '_'))
1919 return false;
1920
1921 for (t = p; *t; t++)
1922 if (!strchr(CONTROLLER_VALID, *t))
1923 return false;
1924
1925 if (t - p > FILENAME_MAX)
1926 return false;
1927
1928 return true;
1929 }
1930
int cg_slice_to_path(const char *unit, char **ret) {
        _cleanup_free_ char *p = NULL, *s = NULL, *e = NULL;
        const char *dash;
        int r;

        /* Converts a slice unit name (e.g. "foo-bar.slice") into the cgroup
         * path it maps to (e.g. "foo.slice/foo-bar.slice"), escaping each
         * path component on the way. */

        assert(unit);
        assert(ret);

        /* The root slice maps to the top of the hierarchy. */
        if (streq(unit, SPECIAL_ROOT_SLICE)) {
                char *x;

                x = strdup("");
                if (!x)
                        return -ENOMEM;
                *ret = x;
                return 0;
        }

        if (!unit_name_is_valid(unit, UNIT_NAME_PLAIN))
                return -EINVAL;

        if (!endswith(unit, ".slice"))
                return -EINVAL;

        r = unit_name_to_prefix(unit, &p);
        if (r < 0)
                return r;

        dash = strchr(p, '-');

        /* Don't allow initial dashes */
        if (dash == p)
                return -EINVAL;

        /* For each dash, emit the corresponding parent slice: a prefix of
         * "foo-bar" yields "foo.slice/" here; the full name follows below. */
        while (dash) {
                _cleanup_free_ char *escaped = NULL;
                char n[dash - p + sizeof(".slice")];

#if HAS_FEATURE_MEMORY_SANITIZER
                /* msan doesn't instrument stpncpy, so it thinks
                 * n is later used uninitialized:
                 * https://github.com/google/sanitizers/issues/926
                 */
                zero(n);
#endif

                /* Don't allow trailing or double dashes */
                if (IN_SET(dash[1], 0, '-'))
                        return -EINVAL;

                /* Build "<prefix-up-to-dash>.slice" in the stack buffer. */
                strcpy(stpncpy(n, p, dash - p), ".slice");
                if (!unit_name_is_valid(n, UNIT_NAME_PLAIN))
                        return -EINVAL;

                escaped = cg_escape(n);
                if (!escaped)
                        return -ENOMEM;

                if (!strextend(&s, escaped, "/", NULL))
                        return -ENOMEM;

                dash = strchr(dash+1, '-');
        }

        /* Finally, append the (escaped) full unit name itself. */
        e = cg_escape(unit);
        if (!e)
                return -ENOMEM;

        if (!strextend(&s, e, NULL))
                return -ENOMEM;

        *ret = TAKE_PTR(s);

        return 0;
}
2006
2007 int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value) {
2008 _cleanup_free_ char *p = NULL;
2009 int r;
2010
2011 r = cg_get_path(controller, path, attribute, &p);
2012 if (r < 0)
2013 return r;
2014
2015 return write_string_file(p, value, WRITE_STRING_FILE_DISABLE_BUFFER);
2016 }
2017
2018 int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
2019 _cleanup_free_ char *p = NULL;
2020 int r;
2021
2022 r = cg_get_path(controller, path, attribute, &p);
2023 if (r < 0)
2024 return r;
2025
2026 return read_one_line_file(p, ret);
2027 }
2028
int cg_get_keyed_attribute(
                const char *controller,
                const char *path,
                const char *attribute,
                char **keys,
                char **ret_values) {

        _cleanup_free_ char *filename = NULL, *contents = NULL;
        const char *p;
        size_t n, i, n_done = 0;
        char **v;
        int r;

        /* Reads one or more fields of a cgroupsv2 keyed attribute file. The 'keys' parameter should be an strv with
         * all keys to retrieve. The 'ret_values' parameter should be passed as a string array with the same number of
         * entries as 'keys'. On success each entry will be set to the value of the matching key.
         *
         * If the attribute file doesn't exist at all returns ENOENT, if any key is not found returns ENXIO. */

        r = cg_get_path(controller, path, attribute, &filename);
        if (r < 0)
                return r;

        r = read_full_file(filename, &contents, NULL);
        if (r < 0)
                return r;

        n = strv_length(keys);
        if (n == 0) /* No keys to retrieve? That's easy, we are done then */
                return 0;

        /* Let's build this up in a temporary array for now in order not to clobber the return parameter on failure */
        v = newa0(char*, n);

        for (p = contents; *p;) {
                const char *w = NULL;

                /* Does this line start with one of the still-unresolved keys? */
                for (i = 0; i < n; i++)
                        if (!v[i]) {
                                w = first_word(p, keys[i]);
                                if (w)
                                        break;
                        }

                if (w) {
                        size_t l;

                        /* Grab the rest of the line as the value for key i. */
                        l = strcspn(w, NEWLINE);
                        v[i] = strndup(w, l);
                        if (!v[i]) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        n_done++;
                        if (n_done >= n)
                                goto done;

                        p = w + l;
                } else
                        p += strcspn(p, NEWLINE);

                p += strspn(p, NEWLINE);
        }

        /* Ran out of input with at least one key still unresolved. */
        r = -ENXIO;

fail:
        for (i = 0; i < n; i++)
                free(v[i]);

        return r;

done:
        memcpy(ret_values, v, sizeof(char*) * n);
        return 0;

}
2107
int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) {
        CGroupController c;
        CGroupMask done;
        bool created;
        int r;

        /* This one will create a cgroup in our private tree, but also
         * duplicate it in the trees specified in mask, and remove it
         * in all others.
         *
         * Returns 0 if the group already existed in the systemd hierarchy,
         * 1 on success, negative otherwise.
         */

        /* First create the cgroup in our own hierarchy. */
        r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path);
        if (r < 0)
                return r;
        created = r;

        /* If we are in the unified hierarchy, we are done now */
        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                return created;

        /* Only real v1 controllers remain relevant here; joined controllers
         * share one hierarchy, so track what was handled in 'done'. */
        supported &= CGROUP_MASK_V1;
        mask = CGROUP_MASK_EXTEND_JOINED(mask);
        done = 0;

        /* Otherwise, do the same in the other hierarchies */
        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *n;

                if (!FLAGS_SET(supported, bit))
                        continue;

                if (FLAGS_SET(done, bit))
                        continue;

                n = cgroup_controller_to_string(c);
                if (FLAGS_SET(mask, bit))
                        (void) cg_create(n, path);
                else
                        /* Not wanted in this hierarchy: prune it there instead. */
                        (void) cg_trim(n, path, true);

                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return created;
}
2161
int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
        CGroupController c;
        CGroupMask done;
        int r;

        /* Attaches the PID to 'path' in the systemd hierarchy, and to the
         * corresponding (possibly callback-remapped) path in each supported
         * legacy controller hierarchy. */

        r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid);
        if (r < 0)
                return r;

        /* On a fully unified setup the single attach above suffices. */
        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                return 0;

        supported &= CGROUP_MASK_V1;
        done = 0;

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *p = NULL;

                if (!FLAGS_SET(supported, bit))
                        continue;

                if (FLAGS_SET(done, bit))
                        continue;

                /* Let the caller override the target path per controller. */
                if (path_callback)
                        p = path_callback(bit, userdata);
                if (!p)
                        p = path;

                /* Best effort: failures in the legacy hierarchies are ignored. */
                (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid);
                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return 0;
}
2201
2202 int cg_attach_many_everywhere(CGroupMask supported, const char *path, Set* pids, cg_migrate_callback_t path_callback, void *userdata) {
2203 Iterator i;
2204 void *pidp;
2205 int r = 0;
2206
2207 SET_FOREACH(pidp, pids, i) {
2208 pid_t pid = PTR_TO_PID(pidp);
2209 int q;
2210
2211 q = cg_attach_everywhere(supported, path, pid, path_callback, userdata);
2212 if (q < 0 && r >= 0)
2213 r = q;
2214 }
2215
2216 return r;
2217 }
2218
int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to, cg_migrate_callback_t to_callback, void *userdata) {
        CGroupController c;
        CGroupMask done;
        int r = 0, q;

        /* Moves all processes from 'from' to 'to' in the systemd hierarchy,
         * then pulls them into the matching paths of the supported legacy
         * controller hierarchies as well. */

        if (!path_equal(from, to)) {
                r = cg_migrate_recursive(SYSTEMD_CGROUP_CONTROLLER, from, SYSTEMD_CGROUP_CONTROLLER, to, CGROUP_REMOVE);
                if (r < 0)
                        return r;
        }

        /* Fully unified? Then the single migration above was all there is. */
        q = cg_all_unified();
        if (q < 0)
                return q;
        if (q > 0)
                return r;

        supported &= CGROUP_MASK_V1;
        done = 0;

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *p = NULL;

                if (!FLAGS_SET(supported, bit))
                        continue;

                if (FLAGS_SET(done, bit))
                        continue;

                /* Let the caller pick a different target path per controller. */
                if (to_callback)
                        p = to_callback(bit, userdata);
                if (!p)
                        p = to;

                /* Migrate what now sits in 'to' of the systemd hierarchy into
                 * path 'p' of this legacy controller; best effort. */
                (void) cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, to, cgroup_controller_to_string(c), p, 0);
                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return r;
}
2260
2261 int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) {
2262 CGroupController c;
2263 CGroupMask done;
2264 int r, q;
2265
2266 r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root);
2267 if (r < 0)
2268 return r;
2269
2270 q = cg_all_unified();
2271 if (q < 0)
2272 return q;
2273 if (q > 0)
2274 return r;
2275
2276 supported &= CGROUP_MASK_V1;
2277 done = 0;
2278
2279 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2280 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2281
2282 if (!FLAGS_SET(supported, bit))
2283 continue;
2284
2285 if (FLAGS_SET(done, bit))
2286 continue;
2287
2288 (void) cg_trim(cgroup_controller_to_string(c), path, delete_root);
2289 done |= CGROUP_MASK_EXTEND_JOINED(bit);
2290 }
2291
2292 return r;
2293 }
2294
int cg_mask_to_string(CGroupMask mask, char **ret) {
        _cleanup_free_ char *s = NULL;
        size_t n = 0, allocated = 0;
        bool space = false;
        CGroupController c;

        /* Formats the mask as a space-separated list of controller names.
         * An empty mask yields a NULL string. The result needs free(). */

        assert(ret);

        if (mask == 0) {
                *ret = NULL;
                return 0;
        }

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                const char *k;
                size_t l;

                if (!FLAGS_SET(mask, CGROUP_CONTROLLER_TO_MASK(c)))
                        continue;

                k = cgroup_controller_to_string(c);
                l = strlen(k);

                /* Reserve room for separator, name and trailing NUL. */
                if (!GREEDY_REALLOC(s, allocated, n + space + l + 1))
                        return -ENOMEM;

                if (space)
                        s[n] = ' ';
                memcpy(s + n + space, k, l);
                n += space + l;

                /* From the second entry on, prepend a space. */
                space = true;
        }

        /* mask was non-zero, hence at least one name was appended. */
        assert(s);

        s[n] = 0;
        *ret = TAKE_PTR(s);

        return 0;
}
2336
2337 int cg_mask_from_string(const char *value, CGroupMask *ret) {
2338 CGroupMask m = 0;
2339
2340 assert(ret);
2341 assert(value);
2342
2343 for (;;) {
2344 _cleanup_free_ char *n = NULL;
2345 CGroupController v;
2346 int r;
2347
2348 r = extract_first_word(&value, &n, NULL, 0);
2349 if (r < 0)
2350 return r;
2351 if (r == 0)
2352 break;
2353
2354 v = cgroup_controller_from_string(n);
2355 if (v < 0)
2356 continue;
2357
2358 m |= CGROUP_CONTROLLER_TO_MASK(v);
2359 }
2360
2361 *ret = m;
2362 return 0;
2363 }
2364
int cg_mask_supported(CGroupMask *ret) {
        CGroupMask mask;
        int r;

        /* Determines the mask of supported cgroup controllers. Only includes controllers we can make sense of and that
         * are actually accessible. Only covers real controllers, i.e. not the CGROUP_CONTROLLER_BPF_xyz
         * pseudo-controllers. */

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                _cleanup_free_ char *root = NULL, *controllers = NULL, *path = NULL;

                /* In the unified hierarchy we can read the supported
                 * and accessible controllers from the top-level
                 * cgroup attribute */

                r = cg_get_root_path(&root);
                if (r < 0)
                        return r;

                r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, root, "cgroup.controllers", &path);
                if (r < 0)
                        return r;

                r = read_one_line_file(path, &controllers);
                if (r < 0)
                        return r;

                r = cg_mask_from_string(controllers, &mask);
                if (r < 0)
                        return r;

                /* Currently, we support the cpu, memory, io and pids controller in the unified hierarchy, mask
                 * everything else off. */
                mask &= CGROUP_MASK_V2;

        } else {
                CGroupController c;

                /* In the legacy hierarchy, we check which hierarchies are mounted. */

                mask = 0;
                for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                        CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                        const char *n;

                        /* Skip everything that is not a real v1 controller. */
                        if (!FLAGS_SET(CGROUP_MASK_V1, bit))
                                continue;

                        n = cgroup_controller_to_string(c);
                        if (controller_is_accessible(n) >= 0)
                                mask |= bit;
                }
        }

        *ret = mask;
        return 0;
}
2425
2426 int cg_kernel_controllers(Set **ret) {
2427 _cleanup_set_free_free_ Set *controllers = NULL;
2428 _cleanup_fclose_ FILE *f = NULL;
2429 int r;
2430
2431 assert(ret);
2432
2433 /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support
2434 * and controllers that aren't currently accessible (because not mounted). This does not include "name="
2435 * pseudo-controllers. */
2436
2437 controllers = set_new(&string_hash_ops);
2438 if (!controllers)
2439 return -ENOMEM;
2440
2441 f = fopen("/proc/cgroups", "re");
2442 if (!f) {
2443 if (errno == ENOENT) {
2444 *ret = NULL;
2445 return 0;
2446 }
2447
2448 return -errno;
2449 }
2450
2451 (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
2452
2453 /* Ignore the header line */
2454 (void) read_line(f, (size_t) -1, NULL);
2455
2456 for (;;) {
2457 char *controller;
2458 int enabled = 0;
2459
2460 errno = 0;
2461 if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
2462
2463 if (feof(f))
2464 break;
2465
2466 if (ferror(f) && errno > 0)
2467 return -errno;
2468
2469 return -EBADMSG;
2470 }
2471
2472 if (!enabled) {
2473 free(controller);
2474 continue;
2475 }
2476
2477 if (!cg_controller_is_valid(controller)) {
2478 free(controller);
2479 return -EBADMSG;
2480 }
2481
2482 r = set_consume(controllers, controller);
2483 if (r < 0)
2484 return r;
2485 }
2486
2487 *ret = TAKE_PTR(controllers);
2488
2489 return 0;
2490 }
2491
2492 static thread_local CGroupUnified unified_cache = CGROUP_UNIFIED_UNKNOWN;
2493
2494 /* The hybrid mode was initially implemented in v232 and simply mounted cgroup v2 on /sys/fs/cgroup/systemd. This
2495 * unfortunately broke other tools (such as docker) which expected the v1 "name=systemd" hierarchy on
 * /sys/fs/cgroup/systemd. From v233 and on, the hybrid mode mounts v2 on /sys/fs/cgroup/unified and maintains
2497 * "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility with other tools.
2498 *
2499 * To keep live upgrade working, we detect and support v232 layout. When v232 layout is detected, to keep cgroup v2
2500 * process management but disable the compat dual layout, we return %true on
2501 * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and %false on cg_hybrid_unified().
2502 */
2503 static thread_local bool unified_systemd_v232;
2504
2505 static int cg_unified_update(void) {
2506
2507 struct statfs fs;
2508
2509 /* Checks if we support the unified hierarchy. Returns an
2510 * error when the cgroup hierarchies aren't mounted yet or we
2511 * have any other trouble determining if the unified hierarchy
2512 * is supported. */
2513
2514 if (unified_cache >= CGROUP_UNIFIED_NONE)
2515 return 0;
2516
2517 if (statfs("/sys/fs/cgroup/", &fs) < 0)
2518 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/\") failed: %m");
2519
2520 if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2521 log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
2522 unified_cache = CGROUP_UNIFIED_ALL;
2523 } else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2524 if (statfs("/sys/fs/cgroup/unified/", &fs) == 0 &&
2525 F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2526 log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller");
2527 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2528 unified_systemd_v232 = false;
2529 } else {
2530 if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0)
2531 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/systemd\" failed: %m");
2532
2533 if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2534 log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2535 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2536 unified_systemd_v232 = true;
2537 } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)) {
2538 log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2539 unified_cache = CGROUP_UNIFIED_NONE;
2540 } else {
2541 log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
2542 (unsigned long long) fs.f_type);
2543 unified_cache = CGROUP_UNIFIED_NONE;
2544 }
2545 }
2546 } else
2547 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
2548 "Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
2549 (unsigned long long)fs.f_type);
2550
2551 return 0;
2552 }
2553
2554 int cg_unified_controller(const char *controller) {
2555 int r;
2556
2557 r = cg_unified_update();
2558 if (r < 0)
2559 return r;
2560
2561 if (unified_cache == CGROUP_UNIFIED_NONE)
2562 return false;
2563
2564 if (unified_cache >= CGROUP_UNIFIED_ALL)
2565 return true;
2566
2567 return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER);
2568 }
2569
2570 int cg_all_unified(void) {
2571 int r;
2572
2573 r = cg_unified_update();
2574 if (r < 0)
2575 return r;
2576
2577 return unified_cache >= CGROUP_UNIFIED_ALL;
2578 }
2579
2580 int cg_hybrid_unified(void) {
2581 int r;
2582
2583 r = cg_unified_update();
2584 if (r < 0)
2585 return r;
2586
2587 return unified_cache == CGROUP_UNIFIED_SYSTEMD && !unified_systemd_v232;
2588 }
2589
2590 int cg_unified_flush(void) {
2591 unified_cache = CGROUP_UNIFIED_UNKNOWN;
2592
2593 return cg_unified_update();
2594 }
2595
int cg_enable_everywhere(
                CGroupMask supported,
                CGroupMask mask,
                const char *p,
                CGroupMask *ret_result_mask) {

        _cleanup_fclose_ FILE *f = NULL;
        _cleanup_free_ char *fs = NULL;
        CGroupController c;
        CGroupMask ret = 0;
        int r;

        /* Enables (or disables) the controllers in 'mask' via the
         * cgroup.subtree_control attribute of 'p', reporting the controllers
         * effectively enabled afterwards in *ret_result_mask. */

        assert(p);

        if (supported == 0) {
                if (ret_result_mask)
                        *ret_result_mask = 0;
                return 0;
        }

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r == 0) {
                /* On the legacy hierarchy there's no concept of "enabling" controllers in cgroups defined. Let's claim
                 * complete success right away. (If you wonder why we return the full mask here, rather than zero: the
                 * caller tends to use the returned mask later on to compare if all controllers where properly joined,
                 * and if not requeues realization. This use is the primary purpose of the return value, hence let's
                 * minimize surprises here and reduce triggers for re-realization by always saying we fully
                 * succeeded.) */
                if (ret_result_mask)
                        *ret_result_mask = mask & supported & CGROUP_MASK_V2; /* If you wonder why we mask this with
                                                                               * CGROUP_MASK_V2: The 'supported' mask
                                                                               * might contain pure-V1 or BPF
                                                                               * controllers, and we never want to
                                                                               * claim that we could enable those with
                                                                               * cgroup.subtree_control */
                return 0;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs);
        if (r < 0)
                return r;

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *n;

                /* Only real v2 controllers can be toggled here. */
                if (!FLAGS_SET(CGROUP_MASK_V2, bit))
                        continue;

                if (!FLAGS_SET(supported, bit))
                        continue;

                n = cgroup_controller_to_string(c);
                {
                        /* Build "+name" or "-name" for the write below. */
                        char s[1 + strlen(n) + 1];

                        s[0] = FLAGS_SET(mask, bit) ? '+' : '-';
                        strcpy(s + 1, n);

                        /* Open the attribute file lazily, on first use. */
                        if (!f) {
                                f = fopen(fs, "we");
                                if (!f)
                                        return log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p);
                        }

                        r = write_string_stream(f, s, WRITE_STRING_FILE_DISABLE_BUFFER);
                        if (r < 0) {
                                log_debug_errno(r, "Failed to %s controller %s for %s (%s): %m",
                                                FLAGS_SET(mask, bit) ? "enable" : "disable", n, p, fs);
                                clearerr(f);

                                /* If we can't turn off a controller, leave it on in the reported resulting mask. This
                                 * happens for example when we attempt to turn off a controller up in the tree that is
                                 * used down in the tree. */
                                if (!FLAGS_SET(mask, bit) && r == -EBUSY) /* You might wonder why we check for EBUSY
                                                                           * only here, and not follow the same logic
                                                                           * for other errors such as EINVAL or
                                                                           * EOPNOTSUPP or anything else. That's
                                                                           * because EBUSY indicates that the
                                                                           * controllers is currently enabled and
                                                                           * cannot be disabled because something down
                                                                           * the hierarchy is still using it. Any other
                                                                           * error most likely means something like "I
                                                                           * never heard of this controller" or
                                                                           * similar. In the former case it's hence
                                                                           * safe to assume the controller is still on
                                                                           * after the failed operation, while in the
                                                                           * latter case it's safer to assume the
                                                                           * controller is unknown and hence certainly
                                                                           * not enabled. */
                                        ret |= bit;
                        } else {
                                /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */
                                if (FLAGS_SET(mask, bit))
                                        ret |= bit;
                        }
                }
        }

        /* Let's return the precise set of controllers now enabled for the cgroup. */
        if (ret_result_mask)
                *ret_result_mask = ret;

        return 0;
}
2703
2704 bool cg_is_unified_wanted(void) {
2705 static thread_local int wanted = -1;
2706 int r;
2707 bool b;
2708 const bool is_default = DEFAULT_HIERARCHY == CGROUP_UNIFIED_ALL;
2709 _cleanup_free_ char *c = NULL;
2710
2711 /* If we have a cached value, return that. */
2712 if (wanted >= 0)
2713 return wanted;
2714
2715 /* If the hierarchy is already mounted, then follow whatever
2716 * was chosen for it. */
2717 if (cg_unified_flush() >= 0)
2718 return (wanted = unified_cache >= CGROUP_UNIFIED_ALL);
2719
2720 /* If we were explicitly passed systemd.unified_cgroup_hierarchy,
2721 * respect that. */
2722 r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b);
2723 if (r > 0)
2724 return (wanted = b);
2725
2726 /* If we passed cgroup_no_v1=all with no other instructions, it seems
2727 * highly unlikely that we want to use hybrid or legacy hierarchy. */
2728 r = proc_cmdline_get_key("cgroup_no_v1", 0, &c);
2729 if (r > 0 && streq_ptr(c, "all"))
2730 return (wanted = true);
2731
2732 return (wanted = is_default);
2733 }
2734
2735 bool cg_is_legacy_wanted(void) {
2736 static thread_local int wanted = -1;
2737
2738 /* If we have a cached value, return that. */
2739 if (wanted >= 0)
2740 return wanted;
2741
2742 /* Check if we have cgroups2 already mounted. */
2743 if (cg_unified_flush() >= 0 &&
2744 unified_cache == CGROUP_UNIFIED_ALL)
2745 return (wanted = false);
2746
2747 /* Otherwise, assume that at least partial legacy is wanted,
2748 * since cgroups2 should already be mounted at this point. */
2749 return (wanted = true);
2750 }
2751
2752 bool cg_is_hybrid_wanted(void) {
2753 static thread_local int wanted = -1;
2754 int r;
2755 bool b;
2756 const bool is_default = DEFAULT_HIERARCHY >= CGROUP_UNIFIED_SYSTEMD;
2757 /* We default to true if the default is "hybrid", obviously,
2758 * but also when the default is "unified", because if we get
2759 * called, it means that unified hierarchy was not mounted. */
2760
2761 /* If we have a cached value, return that. */
2762 if (wanted >= 0)
2763 return wanted;
2764
2765 /* If the hierarchy is already mounted, then follow whatever
2766 * was chosen for it. */
2767 if (cg_unified_flush() >= 0 &&
2768 unified_cache == CGROUP_UNIFIED_ALL)
2769 return (wanted = false);
2770
2771 /* Otherwise, let's see what the kernel command line has to say.
2772 * Since checking is expensive, cache a non-error result. */
2773 r = proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", &b);
2774
2775 /* The meaning of the kernel option is reversed wrt. to the return value
2776 * of this function, hence the negation. */
2777 return (wanted = r > 0 ? !b : is_default);
2778 }
2779
2780 int cg_weight_parse(const char *s, uint64_t *ret) {
2781 uint64_t u;
2782 int r;
2783
2784 if (isempty(s)) {
2785 *ret = CGROUP_WEIGHT_INVALID;
2786 return 0;
2787 }
2788
2789 r = safe_atou64(s, &u);
2790 if (r < 0)
2791 return r;
2792
2793 if (u < CGROUP_WEIGHT_MIN || u > CGROUP_WEIGHT_MAX)
2794 return -ERANGE;
2795
2796 *ret = u;
2797 return 0;
2798 }
2799
/* Default values for the per-type IO limits, indexed by CGroupIOLimitType:
 * every limit defaults to CGROUP_LIMIT_MAX, i.e. "no limit". */
const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = {
        [CGROUP_IO_RBPS_MAX] = CGROUP_LIMIT_MAX,
        [CGROUP_IO_WBPS_MAX] = CGROUP_LIMIT_MAX,
        [CGROUP_IO_RIOPS_MAX] = CGROUP_LIMIT_MAX,
        [CGROUP_IO_WIOPS_MAX] = CGROUP_LIMIT_MAX,
};
2806
/* Human-readable setting names for each IO limit type (as used in unit files),
 * wired up to from/to-string converters by the macro below. */
static const char* const cgroup_io_limit_type_table[_CGROUP_IO_LIMIT_TYPE_MAX] = {
        [CGROUP_IO_RBPS_MAX] = "IOReadBandwidthMax",
        [CGROUP_IO_WBPS_MAX] = "IOWriteBandwidthMax",
        [CGROUP_IO_RIOPS_MAX] = "IOReadIOPSMax",
        [CGROUP_IO_WIOPS_MAX] = "IOWriteIOPSMax",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type, CGroupIOLimitType);
2815
2816 int cg_cpu_shares_parse(const char *s, uint64_t *ret) {
2817 uint64_t u;
2818 int r;
2819
2820 if (isempty(s)) {
2821 *ret = CGROUP_CPU_SHARES_INVALID;
2822 return 0;
2823 }
2824
2825 r = safe_atou64(s, &u);
2826 if (r < 0)
2827 return r;
2828
2829 if (u < CGROUP_CPU_SHARES_MIN || u > CGROUP_CPU_SHARES_MAX)
2830 return -ERANGE;
2831
2832 *ret = u;
2833 return 0;
2834 }
2835
2836 int cg_blkio_weight_parse(const char *s, uint64_t *ret) {
2837 uint64_t u;
2838 int r;
2839
2840 if (isempty(s)) {
2841 *ret = CGROUP_BLKIO_WEIGHT_INVALID;
2842 return 0;
2843 }
2844
2845 r = safe_atou64(s, &u);
2846 if (r < 0)
2847 return r;
2848
2849 if (u < CGROUP_BLKIO_WEIGHT_MIN || u > CGROUP_BLKIO_WEIGHT_MAX)
2850 return -ERANGE;
2851
2852 *ret = u;
2853 return 0;
2854 }
2855
2856 bool is_cgroup_fs(const struct statfs *s) {
2857 return is_fs_type(s, CGROUP_SUPER_MAGIC) ||
2858 is_fs_type(s, CGROUP2_SUPER_MAGIC);
2859 }
2860
2861 bool fd_is_cgroup_fs(int fd) {
2862 struct statfs s;
2863
2864 if (fstatfs(fd, &s) < 0)
2865 return -errno;
2866
2867 return is_cgroup_fs(&s);
2868 }
2869
/* Names for each CGroupController bit, wired up to from/to-string converters
 * by the macro below. The "bpf-*" entries carry names of our own choosing
 * rather than kernel controller names — presumably they denote BPF-based
 * pseudo-controllers managed by systemd itself (verify against
 * cgroup-util.h / the BPF firewall/device code). */
static const char *cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
        [CGROUP_CONTROLLER_CPU] = "cpu",
        [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
        [CGROUP_CONTROLLER_IO] = "io",
        [CGROUP_CONTROLLER_BLKIO] = "blkio",
        [CGROUP_CONTROLLER_MEMORY] = "memory",
        [CGROUP_CONTROLLER_DEVICES] = "devices",
        [CGROUP_CONTROLLER_PIDS] = "pids",
        [CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall",
        [CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);
2883
2884 CGroupMask get_cpu_accounting_mask(void) {
2885 static CGroupMask needed_mask = (CGroupMask) -1;
2886
2887 /* On kernel ≥4.15 with unified hierarchy, cpu.stat's usage_usec is
2888 * provided externally from the CPU controller, which means we don't
2889 * need to enable the CPU controller just to get metrics. This is good,
2890 * because enabling the CPU controller comes at a minor performance
2891 * hit, especially when it's propagated deep into large hierarchies.
2892 * There's also no separate CPU accounting controller available within
2893 * a unified hierarchy.
2894 *
2895 * This combination of factors results in the desired cgroup mask to
2896 * enable for CPU accounting varying as follows:
2897 *
2898 * ╔═════════════════════╤═════════════════════╗
2899 * ║ Linux ≥4.15 │ Linux <4.15 ║
2900 * ╔═══════════════╬═════════════════════╪═════════════════════╣
2901 * ║ Unified ║ nothing │ CGROUP_MASK_CPU ║
2902 * ╟───────────────╫─────────────────────┼─────────────────────╢
2903 * ║ Hybrid/Legacy ║ CGROUP_MASK_CPUACCT │ CGROUP_MASK_CPUACCT ║
2904 * ╚═══════════════╩═════════════════════╧═════════════════════╝
2905 *
2906 * We check kernel version here instead of manually checking whether
2907 * cpu.stat is present for every cgroup, as that check in itself would
2908 * already be fairly expensive.
2909 *
2910 * Kernels where this patch has been backported will therefore have the
2911 * CPU controller enabled unnecessarily. This is more expensive than
2912 * necessary, but harmless. ☺️
2913 */
2914
2915 if (needed_mask == (CGroupMask) -1) {
2916 if (cg_all_unified()) {
2917 struct utsname u;
2918 assert_se(uname(&u) >= 0);
2919
2920 if (str_verscmp(u.release, "4.15") < 0)
2921 needed_mask = CGROUP_MASK_CPU;
2922 else
2923 needed_mask = 0;
2924 } else
2925 needed_mask = CGROUP_MASK_CPUACCT;
2926 }
2927
2928 return needed_mask;
2929 }
2930
2931 bool cpu_accounting_is_cheap(void) {
2932 return get_cpu_accounting_mask() == 0;
2933 }