]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/basic/cgroup-util.c
Merge pull request #13022 from keszybz/coverity-cleanups
[thirdparty/systemd.git] / src / basic / cgroup-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <dirent.h>
4 #include <errno.h>
5 #include <ftw.h>
6 #include <limits.h>
7 #include <signal.h>
8 #include <stddef.h>
9 #include <stdlib.h>
10 #include <string.h>
11 #include <sys/stat.h>
12 #include <sys/statfs.h>
13 #include <sys/types.h>
14 #include <sys/utsname.h>
15 #include <sys/xattr.h>
16 #include <unistd.h>
17
18 #include "alloc-util.h"
19 #include "cgroup-util.h"
20 #include "def.h"
21 #include "dirent-util.h"
22 #include "extract-word.h"
23 #include "fd-util.h"
24 #include "fileio.h"
25 #include "format-util.h"
26 #include "fs-util.h"
27 #include "log.h"
28 #include "login-util.h"
29 #include "macro.h"
30 #include "missing.h"
31 #include "mkdir.h"
32 #include "parse-util.h"
33 #include "path-util.h"
34 #include "proc-cmdline.h"
35 #include "process-util.h"
36 #include "set.h"
37 #include "special.h"
38 #include "stat-util.h"
39 #include "stdio-util.h"
40 #include "string-table.h"
41 #include "string-util.h"
42 #include "strv.h"
43 #include "unit-name.h"
44 #include "user-util.h"
45
46 static int cg_enumerate_items(const char *controller, const char *path, FILE **_f, const char *item) {
47 _cleanup_free_ char *fs = NULL;
48 FILE *f;
49 int r;
50
51 assert(_f);
52
53 r = cg_get_path(controller, path, item, &fs);
54 if (r < 0)
55 return r;
56
57 f = fopen(fs, "re");
58 if (!f)
59 return -errno;
60
61 *_f = f;
62 return 0;
63 }
64
/* Opens the "cgroup.procs" attribute of the specified cgroup for reading; on success returns the open
 * stream in *_f. Iterate over it with cg_read_pid(). */
int cg_enumerate_processes(const char *controller, const char *path, FILE **_f) {
        return cg_enumerate_items(controller, path, _f, "cgroup.procs");
}
68
/* Reads one PID from a stream opened via cg_enumerate_processes(). Returns > 0 when a PID was read (stored
 * in *_pid), 0 on EOF, negative errno-style error otherwise. */
int cg_read_pid(FILE *f, pid_t *_pid) {
        unsigned long ul;

        /* Note that the cgroup.procs might contain duplicates! See
         * cgroups.txt for details. */

        assert(f);
        assert(_pid);

        errno = 0;
        if (fscanf(f, "%lu", &ul) != 1) {

                if (feof(f))
                        return 0;

                return errno_or_else(EIO);
        }

        /* PID 0 is never valid. (ul is unsigned, hence the old "<= 0" check could only ever catch zero.) */
        if (ul == 0)
                return -EIO;

        /* Refuse values that don't survive the conversion to pid_t, instead of silently truncating them */
        if ((unsigned long) (pid_t) ul != ul)
                return -EIO;

        *_pid = (pid_t) ul;
        return 1;
}
93
/* Reads the value of one field ("event", e.g. "populated") from the "cgroup.events" attribute of the
 * specified cgroup. On success stores the newly allocated value string in *ret. Returns -ENOENT if the
 * field does not occur in the file. Only meaningful on the unified (v2) hierarchy. */
int cg_read_event(
                const char *controller,
                const char *path,
                const char *event,
                char **ret) {

        _cleanup_free_ char *events = NULL, *content = NULL;
        int r;

        r = cg_get_path(controller, path, "cgroup.events", &events);
        if (r < 0)
                return r;

        r = read_full_file(events, &content, NULL);
        if (r < 0)
                return r;

        /* The file consists of "<key> <value>" lines; scan them for the requested key */
        for (const char *p = content;;) {
                _cleanup_free_ char *line = NULL, *key = NULL, *val = NULL;
                const char *q;

                r = extract_first_word(&p, &line, "\n", 0);
                if (r < 0)
                        return r;
                if (r == 0) /* Ran out of lines without finding the event */
                        return -ENOENT;

                q = line;
                r = extract_first_word(&q, &key, " ", 0);
                if (r < 0)
                        return r;
                if (r == 0) /* Empty line, i.e. no key at all */
                        return -EINVAL;

                if (!streq(key, event))
                        continue;

                /* Everything after the key is the value */
                val = strdup(q);
                if (!val)
                        return -ENOMEM;

                *ret = TAKE_PTR(val);
                return 0;
        }
}
139
140 bool cg_ns_supported(void) {
141 static thread_local int enabled = -1;
142
143 if (enabled >= 0)
144 return enabled;
145
146 if (access("/proc/self/ns/cgroup", F_OK) < 0) {
147 if (errno != ENOENT)
148 log_debug_errno(errno, "Failed to check whether /proc/self/ns/cgroup is available, assuming not: %m");
149 enabled = false;
150 } else
151 enabled = true;
152
153 return enabled;
154 }
155
156 int cg_enumerate_subgroups(const char *controller, const char *path, DIR **_d) {
157 _cleanup_free_ char *fs = NULL;
158 int r;
159 DIR *d;
160
161 assert(_d);
162
163 /* This is not recursive! */
164
165 r = cg_get_path(controller, path, NULL, &fs);
166 if (r < 0)
167 return r;
168
169 d = opendir(fs);
170 if (!d)
171 return -errno;
172
173 *_d = d;
174 return 0;
175 }
176
/* Iterates over the subgroups of a directory opened via cg_enumerate_subgroups(). Returns > 0 with a newly
 * allocated name stored in *fn (caller frees) while entries remain, 0 when exhausted, negative on error. */
int cg_read_subgroup(DIR *d, char **fn) {
        struct dirent *de;

        assert(d);
        assert(fn);

        FOREACH_DIRENT_ALL(de, d, return -errno) {
                char *b;

                /* Only directories can be subgroups */
                if (de->d_type != DT_DIR)
                        continue;

                /* Skip "." and ".." */
                if (dot_or_dot_dot(de->d_name))
                        continue;

                b = strdup(de->d_name);
                if (!b)
                        return -ENOMEM;

                *fn = b;
                return 1;
        }

        return 0;
}
202
203 int cg_rmdir(const char *controller, const char *path) {
204 _cleanup_free_ char *p = NULL;
205 int r;
206
207 r = cg_get_path(controller, path, NULL, &p);
208 if (r < 0)
209 return r;
210
211 r = rmdir(p);
212 if (r < 0 && errno != ENOENT)
213 return -errno;
214
215 r = cg_hybrid_unified();
216 if (r <= 0)
217 return r;
218
219 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
220 r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
221 if (r < 0)
222 log_warning_errno(r, "Failed to remove compat systemd cgroup %s: %m", path);
223 }
224
225 return 0;
226 }
227
/* Sends the specified signal to every PID listed in the given cgroup attribute ("item", i.e. cgroup.procs
 * or cgroup.threads). PIDs already present in the set 's' are skipped; every PID signalled is added to it,
 * so the set doubles as the "already handled" record across iterations (and across recursive calls, when a
 * caller passes a shared set). Returns > 0 if anything was killed (or the log_kill callback's result),
 * 0 if nothing was, negative on error. */
static int cg_kill_items(
                const char *controller,
                const char *path,
                int sig,
                CGroupFlags flags,
                Set *s,
                cg_kill_log_func_t log_kill,
                void *userdata,
                const char *item) {

        _cleanup_set_free_ Set *allocated_set = NULL;
        bool done = false;
        int r, ret = 0, ret_log_kill = 0;
        pid_t my_pid;

        assert(sig >= 0);

        /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
         * SIGCONT on SIGKILL. */
        if (IN_SET(sig, SIGCONT, SIGKILL))
                flags &= ~CGROUP_SIGCONT;

        /* This goes through the tasks list and kills them all. This
         * is repeated until no further processes are added to the
         * tasks list, to properly handle forking processes */

        if (!s) {
                /* Caller didn't supply a tracking set — use a local one */
                s = allocated_set = set_new(NULL);
                if (!s)
                        return -ENOMEM;
        }

        my_pid = getpid_cached();

        do {
                _cleanup_fclose_ FILE *f = NULL;
                pid_t pid = 0;
                done = true;

                r = cg_enumerate_items(controller, path, &f, item);
                if (r < 0) {
                        /* A vanished cgroup is fine; other errors are reported unless we already
                         * have a negative result to propagate */
                        if (ret >= 0 && r != -ENOENT)
                                return r;

                        return ret;
                }

                while ((r = cg_read_pid(f, &pid)) > 0) {

                        if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
                                continue;

                        if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
                                continue;

                        if (log_kill)
                                ret_log_kill = log_kill(pid, sig, userdata);

                        /* If we haven't killed this process yet, kill
                         * it */
                        if (kill(pid, sig) < 0) {
                                if (ret >= 0 && errno != ESRCH)
                                        ret = -errno;
                        } else {
                                if (flags & CGROUP_SIGCONT)
                                        (void) kill(pid, SIGCONT);

                                if (ret == 0) {
                                        if (log_kill)
                                                ret = ret_log_kill;
                                        else
                                                ret = 1;
                                }
                        }

                        done = false;

                        r = set_put(s, PID_TO_PTR(pid));
                        if (r < 0) {
                                if (ret >= 0)
                                        return r;

                                return ret;
                        }
                }

                if (r < 0) {
                        if (ret >= 0)
                                return r;

                        return ret;
                }

                /* To avoid racing against processes which fork
                 * quicker than we can kill them we repeat this until
                 * no new pids need to be killed. */

        } while (!done);

        return ret;
}
329
330 int cg_kill(
331 const char *controller,
332 const char *path,
333 int sig,
334 CGroupFlags flags,
335 Set *s,
336 cg_kill_log_func_t log_kill,
337 void *userdata) {
338 int r;
339
340 r = cg_kill_items(controller, path, sig, flags, s, log_kill, userdata, "cgroup.procs");
341 if (r < 0 || sig != SIGKILL)
342 return r;
343
344 /* Only in case of killing with SIGKILL and when using cgroupsv2, kill remaining threads manually as
345 a workaround for kernel bug. It was fixed in 5.2-rc5 (c03cd7738a83). */
346 r = cg_unified_controller(controller);
347 if (r < 0)
348 return r;
349 if (r == 0) /* doesn't apply to legacy hierarchy */
350 return 0;
351
352 return cg_kill_items(controller, path, sig, flags, s, log_kill, userdata, "cgroup.threads");
353 }
354
/* Like cg_kill(), but descends into all subgroups of the cgroup as well. The set 's' carries the PIDs
 * already signalled across the whole recursion. With CGROUP_REMOVE in 'flags' each visited cgroup is also
 * removed afterwards. */
int cg_kill_recursive(
                const char *controller,
                const char *path,
                int sig,
                CGroupFlags flags,
                Set *s,
                cg_kill_log_func_t log_kill,
                void *userdata) {

        _cleanup_set_free_ Set *allocated_set = NULL;
        _cleanup_closedir_ DIR *d = NULL;
        int r, ret;
        char *fn;

        assert(path);
        assert(sig >= 0);

        if (!s) {
                /* Allocate a shared tracking set once at the top of the recursion */
                s = allocated_set = set_new(NULL);
                if (!s)
                        return -ENOMEM;
        }

        ret = cg_kill(controller, path, sig, flags, s, log_kill, userdata);

        r = cg_enumerate_subgroups(controller, path, &d);
        if (r < 0) {
                if (ret >= 0 && r != -ENOENT)
                        return r;

                return ret;
        }

        while ((r = cg_read_subgroup(d, &fn)) > 0) {
                _cleanup_free_ char *p = NULL;

                p = path_join(path, fn);
                free(fn); /* cg_read_subgroup() hands us ownership of fn */
                if (!p)
                        return -ENOMEM;

                r = cg_kill_recursive(controller, p, sig, flags, s, log_kill, userdata);
                if (r != 0 && ret >= 0)
                        ret = r;
        }
        if (ret >= 0 && r < 0) /* Propagate enumeration errors, unless we already failed */
                ret = r;

        if (flags & CGROUP_REMOVE) {
                r = cg_rmdir(controller, path);
                /* ENOENT/EBUSY are expected here (gone already, or still populated) */
                if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
                        return r;
        }

        return ret;
}
411
/* Moves all processes from one cgroup (cfrom:pfrom) into another (cto:pto). Iterates until no new PIDs show
 * up in the source, to catch processes forking during the migration. Returns > 0 if anything was moved,
 * 0 if nothing was, negative on error. */
int cg_migrate(
                const char *cfrom,
                const char *pfrom,
                const char *cto,
                const char *pto,
                CGroupFlags flags) {

        bool done = false;
        _cleanup_set_free_ Set *s = NULL;
        int r, ret = 0;
        pid_t my_pid;

        assert(cfrom);
        assert(pfrom);
        assert(cto);
        assert(pto);

        /* Tracks PIDs we already attempted to move, so each is handled only once */
        s = set_new(NULL);
        if (!s)
                return -ENOMEM;

        my_pid = getpid_cached();

        do {
                _cleanup_fclose_ FILE *f = NULL;
                pid_t pid = 0;
                done = true;

                r = cg_enumerate_processes(cfrom, pfrom, &f);
                if (r < 0) {
                        if (ret >= 0 && r != -ENOENT)
                                return r;

                        return ret;
                }

                while ((r = cg_read_pid(f, &pid)) > 0) {

                        /* This might do weird stuff if we aren't a
                         * single-threaded program. However, we
                         * luckily know we are not */
                        if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
                                continue;

                        if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
                                continue;

                        /* Ignore kernel threads. Since they can only
                         * exist in the root cgroup, we only check for
                         * them there. (NOTE(review): cfrom is asserted non-NULL above, so the
                         * "cfrom &&" test here is redundant but harmless.) */
                        if (cfrom &&
                            empty_or_root(pfrom) &&
                            is_kernel_thread(pid) > 0)
                                continue;

                        r = cg_attach(cto, pto, pid);
                        if (r < 0) {
                                /* ESRCH just means the process died in the meantime */
                                if (ret >= 0 && r != -ESRCH)
                                        ret = r;
                        } else if (ret == 0)
                                ret = 1;

                        done = false;

                        r = set_put(s, PID_TO_PTR(pid));
                        if (r < 0) {
                                if (ret >= 0)
                                        return r;

                                return ret;
                        }
                }

                if (r < 0) {
                        if (ret >= 0)
                                return r;

                        return ret;
                }
        } while (!done);

        return ret;
}
495
/* Like cg_migrate(), but also migrates the processes of all subgroups of the source cgroup into the (flat)
 * destination. With CGROUP_REMOVE in 'flags', the emptied source cgroups are removed afterwards. */
int cg_migrate_recursive(
                const char *cfrom,
                const char *pfrom,
                const char *cto,
                const char *pto,
                CGroupFlags flags) {

        _cleanup_closedir_ DIR *d = NULL;
        int r, ret = 0;
        char *fn;

        assert(cfrom);
        assert(pfrom);
        assert(cto);
        assert(pto);

        ret = cg_migrate(cfrom, pfrom, cto, pto, flags);

        r = cg_enumerate_subgroups(cfrom, pfrom, &d);
        if (r < 0) {
                if (ret >= 0 && r != -ENOENT)
                        return r;

                return ret;
        }

        while ((r = cg_read_subgroup(d, &fn)) > 0) {
                _cleanup_free_ char *p = NULL;

                p = path_join(pfrom, fn);
                free(fn); /* cg_read_subgroup() hands us ownership of fn */
                if (!p)
                        return -ENOMEM;

                r = cg_migrate_recursive(cfrom, p, cto, pto, flags);
                if (r != 0 && ret >= 0)
                        ret = r;
        }

        if (r < 0 && ret >= 0) /* Propagate enumeration errors, unless we already failed */
                ret = r;

        if (flags & CGROUP_REMOVE) {
                r = cg_rmdir(cfrom, pfrom);
                /* ENOENT/EBUSY are expected here (gone already, or still populated) */
                if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
                        return r;
        }

        return ret;
}
546
547 int cg_migrate_recursive_fallback(
548 const char *cfrom,
549 const char *pfrom,
550 const char *cto,
551 const char *pto,
552 CGroupFlags flags) {
553
554 int r;
555
556 assert(cfrom);
557 assert(pfrom);
558 assert(cto);
559 assert(pto);
560
561 r = cg_migrate_recursive(cfrom, pfrom, cto, pto, flags);
562 if (r < 0) {
563 char prefix[strlen(pto) + 1];
564
565 /* This didn't work? Then let's try all prefixes of the destination */
566
567 PATH_FOREACH_PREFIX(prefix, pto) {
568 int q;
569
570 q = cg_migrate_recursive(cfrom, pfrom, cto, prefix, flags);
571 if (q >= 0)
572 return q;
573 }
574 }
575
576 return r;
577 }
578
579 static const char *controller_to_dirname(const char *controller) {
580 const char *e;
581
582 assert(controller);
583
584 /* Converts a controller name to the directory name below
585 * /sys/fs/cgroup/ we want to mount it to. Effectively, this
586 * just cuts off the name= prefixed used for named
587 * hierarchies, if it is specified. */
588
589 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
590 if (cg_hybrid_unified() > 0)
591 controller = SYSTEMD_CGROUP_CONTROLLER_HYBRID;
592 else
593 controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
594 }
595
596 e = startswith(controller, "name=");
597 if (e)
598 return e;
599
600 return controller;
601 }
602
static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **fs) {
        const char *dn;
        char *t;

        assert(fs);
        assert(controller);

        /* Builds the filesystem path of a cgroup on a legacy (v1) hierarchy, leaving out empty path/suffix
         * components. */

        dn = controller_to_dirname(controller);

        if (isempty(path)) {
                if (isempty(suffix))
                        t = path_join("/sys/fs/cgroup", dn);
                else
                        t = path_join("/sys/fs/cgroup", dn, suffix);
        } else if (isempty(suffix))
                t = path_join("/sys/fs/cgroup", dn, path);
        else
                t = path_join("/sys/fs/cgroup", dn, path, suffix);
        if (!t)
                return -ENOMEM;

        *fs = t;
        return 0;
}
626
static int join_path_unified(const char *path, const char *suffix, char **fs) {
        char *t;

        assert(fs);

        /* Builds the filesystem path of a cgroup on the unified (v2) hierarchy, leaving out empty
         * path/suffix components. No controller directory level exists here. */

        if (isempty(path)) {
                if (isempty(suffix))
                        t = strdup("/sys/fs/cgroup");
                else
                        t = path_join("/sys/fs/cgroup", suffix);
        } else if (isempty(suffix))
                t = path_join("/sys/fs/cgroup", path);
        else
                t = path_join("/sys/fs/cgroup", path, suffix);
        if (!t)
                return -ENOMEM;

        *fs = t;
        return 0;
}
646
/* Converts a (controller, cgroup path, suffix) triple into a normalized filesystem path, stored newly
 * allocated in *fs. With a NULL controller only path and suffix are joined, without any /sys/fs/cgroup
 * prefix. */
int cg_get_path(const char *controller, const char *path, const char *suffix, char **fs) {
        int r;

        assert(fs);

        if (!controller) {
                char *t;

                /* If no controller is specified, we return the path
                 * *below* the controllers, without any prefix. */

                if (!path && !suffix)
                        return -EINVAL;

                if (!suffix)
                        t = strdup(path);
                else if (!path)
                        t = strdup(suffix);
                else
                        t = path_join(path, suffix);
                if (!t)
                        return -ENOMEM;

                *fs = path_simplify(t, false);
                return 0;
        }

        if (!cg_controller_is_valid(controller))
                return -EINVAL;

        /* Pick the unified or legacy layout depending on how cgroups are mounted on this system */
        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                r = join_path_unified(path, suffix, fs);
        else
                r = join_path_legacy(controller, path, suffix, fs);
        if (r < 0)
                return r;

        path_simplify(*fs, false);
        return 0;
}
690
/* Returns 0 if the specified controller is accessible, a negative error otherwise (-EOPNOTSUPP for named
 * hierarchies on the unified hierarchy, -errno if the legacy mount point can't be accessed). */
static int controller_is_accessible(const char *controller) {
        int r;

        assert(controller);

        /* Checks whether a specific controller is accessible,
         * i.e. its hierarchy mounted. In the unified hierarchy all
         * controllers are considered accessible, except for the named
         * hierarchies */

        if (!cg_controller_is_valid(controller))
                return -EINVAL;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                /* We don't support named hierarchies if we are using
                 * the unified hierarchy. */

                if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
                        return 0;

                if (startswith(controller, "name="))
                        return -EOPNOTSUPP;

        } else {
                const char *cc, *dn;

                /* On legacy, probe whether the controller's mount point actually exists */
                dn = controller_to_dirname(controller);
                cc = strjoina("/sys/fs/cgroup/", dn);

                if (laccess(cc, F_OK) < 0)
                        return -errno;
        }

        return 0;
}
729
int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **fs) {
        int r;

        /* Like cg_get_path(), but first verifies that the specified controller's hierarchy is actually
         * mounted and accessible. */

        assert(controller);
        assert(fs);

        r = controller_is_accessible(controller);

        return r < 0 ? r : cg_get_path(controller, path, suffix, fs);
}
743
744 static int trim_cb(const char *path, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
745 assert(path);
746 assert(sb);
747 assert(ftwbuf);
748
749 if (typeflag != FTW_DP)
750 return 0;
751
752 if (ftwbuf->level < 1)
753 return 0;
754
755 (void) rmdir(path);
756 return 0;
757 }
758
/* Removes all empty subgroups of the specified cgroup (depth-first), and the cgroup itself if delete_root
 * is true. In hybrid mode the compat legacy hierarchy is trimmed too (failures there are only logged). */
int cg_trim(const char *controller, const char *path, bool delete_root) {
        _cleanup_free_ char *fs = NULL;
        int r = 0, q;

        assert(path);

        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        /* nftw() with FTW_DEPTH visits directories bottom-up, so children are removed before parents */
        errno = 0;
        if (nftw(fs, trim_cb, 64, FTW_DEPTH|FTW_MOUNT|FTW_PHYS) != 0) {
                if (errno == ENOENT)
                        r = 0;
                else
                        r = errno_or_else(EIO);
        }

        if (delete_root) {
                if (rmdir(fs) < 0 && errno != ENOENT)
                        return -errno;
        }

        q = cg_hybrid_unified();
        if (q < 0)
                return q;
        if (q > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                q = cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, delete_root);
                if (q < 0)
                        log_warning_errno(q, "Failed to trim compat systemd cgroup %s: %m", path);
        }

        return r;
}
793
/* Create a cgroup in the hierarchy of controller.
 * Returns 0 if the group already existed, 1 on success, negative otherwise.
 * Missing parent directories are created as needed. In hybrid mode, the cgroup is mirrored in the legacy
 * systemd hierarchy as well (failure there is only logged). */
int cg_create(const char *controller, const char *path) {
        _cleanup_free_ char *fs = NULL;
        int r;

        r = cg_get_path_and_check(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        r = mkdir_parents(fs, 0755);
        if (r < 0)
                return r;

        r = mkdir_errno_wrapper(fs, 0755);
        if (r == -EEXIST) /* Group already existed — not an error */
                return 0;
        if (r < 0)
                return r;

        r = cg_hybrid_unified();
        if (r < 0)
                return r;

        if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                r = cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
                if (r < 0)
                        log_warning_errno(r, "Failed to create compat systemd cgroup %s: %m", path);
        }

        return 1;
}
827
int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
        int r, q;

        /* Creates the cgroup (if it doesn't exist yet) and moves the specified process into it. Returns
         * > 0 if the group was newly created. Note: the cgroup is not removed again when attaching fails. */

        assert(pid >= 0);

        r = cg_create(controller, path);
        if (r < 0)
                return r;

        q = cg_attach(controller, path, pid);

        return q < 0 ? q : r;
}
844
/* Moves the specified process (or the calling process, if pid is 0) into the specified cgroup, by writing
 * its PID to the cgroup's "cgroup.procs" attribute. In hybrid mode the process is also attached to the
 * mirrored group in the legacy systemd hierarchy (failure there is only logged). */
int cg_attach(const char *controller, const char *path, pid_t pid) {
        _cleanup_free_ char *fs = NULL;
        char c[DECIMAL_STR_MAX(pid_t) + 2];
        int r;

        assert(path);
        assert(pid >= 0);

        r = cg_get_path_and_check(controller, path, "cgroup.procs", &fs);
        if (r < 0)
                return r;

        if (pid == 0)
                pid = getpid_cached();

        xsprintf(c, PID_FMT "\n", pid);

        /* Unbuffered, so the kernel sees exactly one write */
        r = write_string_file(fs, c, WRITE_STRING_FILE_DISABLE_BUFFER);
        if (r < 0)
                return r;

        r = cg_hybrid_unified();
        if (r < 0)
                return r;

        if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                r = cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, pid);
                if (r < 0)
                        log_warning_errno(r, "Failed to attach "PID_FMT" to compat systemd cgroup %s: %m", pid, path);
        }

        return 0;
}
878
879 int cg_attach_fallback(const char *controller, const char *path, pid_t pid) {
880 int r;
881
882 assert(controller);
883 assert(path);
884 assert(pid >= 0);
885
886 r = cg_attach(controller, path, pid);
887 if (r < 0) {
888 char prefix[strlen(path) + 1];
889
890 /* This didn't work? Then let's try all prefixes of
891 * the destination */
892
893 PATH_FOREACH_PREFIX(prefix, path) {
894 int q;
895
896 q = cg_attach(controller, prefix, pid);
897 if (q >= 0)
898 return q;
899 }
900 }
901
902 return r;
903 }
904
/* Changes ownership (uid/gid) and access mode of the specified cgroup directory and of the attribute files
 * processes need in order to be delegated the cgroup. Attributes marked non-fatal below may be missing
 * depending on kernel version, so failures on them are only logged. */
int cg_set_access(
                const char *controller,
                const char *path,
                uid_t uid,
                gid_t gid) {

        struct Attribute {
                const char *name;  /* attribute file below the cgroup directory */
                bool fatal;        /* whether a chmod/chown failure is propagated */
        };

        /* cgroup v1, aka legacy/non-unified */
        static const struct Attribute legacy_attributes[] = {
                { "cgroup.procs", true },
                { "tasks", false },
                { "cgroup.clone_children", false },
                {},
        };

        /* cgroup v2, aka unified */
        static const struct Attribute unified_attributes[] = {
                { "cgroup.procs", true },
                { "cgroup.subtree_control", true },
                { "cgroup.threads", false },
                {},
        };

        /* Indexed by the boolean result of cg_unified_controller() */
        static const struct Attribute* const attributes[] = {
                [false] = legacy_attributes,
                [true] = unified_attributes,
        };

        _cleanup_free_ char *fs = NULL;
        const struct Attribute *i;
        int r, unified;

        assert(path);

        if (uid == UID_INVALID && gid == GID_INVALID)
                return 0;

        unified = cg_unified_controller(controller);
        if (unified < 0)
                return unified;

        /* Configure access to the cgroup itself */
        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        r = chmod_and_chown(fs, 0755, uid, gid);
        if (r < 0)
                return r;

        /* Configure access to the cgroup's attributes */
        for (i = attributes[unified]; i->name; i++) {
                fs = mfree(fs);

                r = cg_get_path(controller, path, i->name, &fs);
                if (r < 0)
                        return r;

                r = chmod_and_chown(fs, 0644, uid, gid);
                if (r < 0) {
                        if (i->fatal)
                                return r;

                        log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", fs);
                }
        }

        if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                r = cg_hybrid_unified();
                if (r < 0)
                        return r;
                if (r > 0) {
                        /* Always propagate access mode from unified to legacy controller */
                        r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, uid, gid);
                        if (r < 0)
                                log_debug_errno(r, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path);
                }
        }

        return 0;
}
990
991 int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags) {
992 _cleanup_free_ char *fs = NULL;
993 int r;
994
995 assert(path);
996 assert(name);
997 assert(value || size <= 0);
998
999 r = cg_get_path(controller, path, NULL, &fs);
1000 if (r < 0)
1001 return r;
1002
1003 if (setxattr(fs, name, value, size, flags) < 0)
1004 return -errno;
1005
1006 return 0;
1007 }
1008
1009 int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size) {
1010 _cleanup_free_ char *fs = NULL;
1011 ssize_t n;
1012 int r;
1013
1014 assert(path);
1015 assert(name);
1016
1017 r = cg_get_path(controller, path, NULL, &fs);
1018 if (r < 0)
1019 return r;
1020
1021 n = getxattr(fs, name, value, size);
1022 if (n < 0)
1023 return -errno;
1024
1025 return (int) n;
1026 }
1027
/* Determines the cgroup path of the specified process within the given controller's hierarchy, by parsing
 * /proc/<pid>/cgroup. With a NULL controller, the systemd hierarchy is used. On success the path (newly
 * allocated) is stored in *path. Returns -ESRCH if the process doesn't exist, -ENODATA if no matching
 * hierarchy line was found. */
int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
        _cleanup_fclose_ FILE *f = NULL;
        const char *fs, *controller_str;
        int unified, r;
        size_t cs = 0;

        assert(path);
        assert(pid >= 0);

        if (controller) {
                if (!cg_controller_is_valid(controller))
                        return -EINVAL;
        } else
                controller = SYSTEMD_CGROUP_CONTROLLER;

        unified = cg_unified_controller(controller);
        if (unified < 0)
                return unified;
        if (unified == 0) {
                /* On legacy we match against the real controller name as it appears in the file */
                if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
                        controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
                else
                        controller_str = controller;

                cs = strlen(controller_str);
        }

        fs = procfs_file_alloca(pid, "cgroup");
        r = fopen_unlocked(fs, "re", &f);
        if (r == -ENOENT) /* No such /proc entry means the process is gone */
                return -ESRCH;
        if (r < 0)
                return r;

        /* Each line has the form "<id>:<controller-list>:<path>" */
        for (;;) {
                _cleanup_free_ char *line = NULL;
                char *e, *p;

                r = read_line(f, LONG_LINE_MAX, &line);
                if (r < 0)
                        return r;
                if (r == 0)
                        break;

                if (unified) {
                        /* The unified hierarchy is always hierarchy ID 0 */
                        e = startswith(line, "0:");
                        if (!e)
                                continue;

                        e = strchr(e, ':');
                        if (!e)
                                continue;
                } else {
                        char *l;
                        size_t k;
                        const char *word, *state;
                        bool found = false;

                        l = strchr(line, ':');
                        if (!l)
                                continue;

                        l++;
                        e = strchr(l, ':');
                        if (!e)
                                continue;

                        /* The controller list is comma-separated; look for an exact match */
                        *e = 0;
                        FOREACH_WORD_SEPARATOR(word, k, l, ",", state)
                                if (k == cs && memcmp(word, controller_str, cs) == 0) {
                                        found = true;
                                        break;
                                }
                        if (!found)
                                continue;
                }

                p = strdup(e + 1);
                if (!p)
                        return -ENOMEM;

                /* Truncate suffix indicating the process is a zombie */
                e = endswith(p, " (deleted)");
                if (e)
                        *e = 0;

                *path = p;
                return 0;
        }

        return -ENODATA;
}
1120
/* Installs the specified binary as release agent of the controller's hierarchy and enables
 * notify_on_release. Only supported on legacy (v1) hierarchies: returns -EOPNOTSUPP on the unified
 * hierarchy, -EEXIST if a different agent is already installed, > 0 if notification had to be enabled,
 * 0 if everything was already configured. */
int cg_install_release_agent(const char *controller, const char *agent) {
        _cleanup_free_ char *fs = NULL, *contents = NULL;
        const char *sc;
        int r;

        assert(agent);

        r = cg_unified_controller(controller);
        if (r < 0)
                return r;
        if (r > 0) /* doesn't apply to unified hierarchy */
                return -EOPNOTSUPP;

        r = cg_get_path(controller, NULL, "release_agent", &fs);
        if (r < 0)
                return r;

        r = read_one_line_file(fs, &contents);
        if (r < 0)
                return r;

        /* Install the agent only if none is set yet; a different one already installed is an error */
        sc = strstrip(contents);
        if (isempty(sc)) {
                r = write_string_file(fs, agent, WRITE_STRING_FILE_DISABLE_BUFFER);
                if (r < 0)
                        return r;
        } else if (!path_equal(sc, agent))
                return -EEXIST;

        fs = mfree(fs);
        r = cg_get_path(controller, NULL, "notify_on_release", &fs);
        if (r < 0)
                return r;

        contents = mfree(contents);
        r = read_one_line_file(fs, &contents);
        if (r < 0)
                return r;

        sc = strstrip(contents);
        if (streq(sc, "0")) {
                /* Notification was off — turn it on and report that we changed something */
                r = write_string_file(fs, "1", WRITE_STRING_FILE_DISABLE_BUFFER);
                if (r < 0)
                        return r;

                return 1;
        }

        if (!streq(sc, "1")) /* Anything but "0"/"1" is unexpected */
                return -EIO;

        return 0;
}
1174
1175 int cg_uninstall_release_agent(const char *controller) {
1176 _cleanup_free_ char *fs = NULL;
1177 int r;
1178
1179 r = cg_unified_controller(controller);
1180 if (r < 0)
1181 return r;
1182 if (r > 0) /* Doesn't apply to unified hierarchy */
1183 return -EOPNOTSUPP;
1184
1185 r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1186 if (r < 0)
1187 return r;
1188
1189 r = write_string_file(fs, "0", WRITE_STRING_FILE_DISABLE_BUFFER);
1190 if (r < 0)
1191 return r;
1192
1193 fs = mfree(fs);
1194
1195 r = cg_get_path(controller, NULL, "release_agent", &fs);
1196 if (r < 0)
1197 return r;
1198
1199 r = write_string_file(fs, "", WRITE_STRING_FILE_DISABLE_BUFFER);
1200 if (r < 0)
1201 return r;
1202
1203 return 0;
1204 }
1205
1206 int cg_is_empty(const char *controller, const char *path) {
1207 _cleanup_fclose_ FILE *f = NULL;
1208 pid_t pid;
1209 int r;
1210
1211 assert(path);
1212
1213 r = cg_enumerate_processes(controller, path, &f);
1214 if (r == -ENOENT)
1215 return true;
1216 if (r < 0)
1217 return r;
1218
1219 r = cg_read_pid(f, &pid);
1220 if (r < 0)
1221 return r;
1222
1223 return r == 0;
1224 }
1225
/* Returns > 0 if the specified cgroup and all of its subgroups contain no processes. The root cgroup is
 * always considered populated. */
int cg_is_empty_recursive(const char *controller, const char *path) {
        int r;

        assert(path);

        /* The root cgroup is always populated */
        if (controller && empty_or_root(path))
                return false;

        r = cg_unified_controller(controller);
        if (r < 0)
                return r;
        if (r > 0) {
                _cleanup_free_ char *t = NULL;

                /* On the unified hierarchy we can check empty state
                 * via the "populated" attribute of "cgroup.events". */

                r = cg_read_event(controller, path, "populated", &t);
                if (r == -ENOENT) /* Gone already — counts as empty */
                        return true;
                if (r < 0)
                        return r;

                return streq(t, "0");
        } else {
                _cleanup_closedir_ DIR *d = NULL;
                char *fn;

                /* On legacy we have to check the cgroup itself and then recurse into all subgroups */
                r = cg_is_empty(controller, path);
                if (r <= 0)
                        return r;

                r = cg_enumerate_subgroups(controller, path, &d);
                if (r == -ENOENT)
                        return true;
                if (r < 0)
                        return r;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = path_join(path, fn);
                        free(fn); /* cg_read_subgroup() hands us ownership of fn */
                        if (!p)
                                return -ENOMEM;

                        r = cg_is_empty_recursive(controller, p);
                        if (r <= 0) /* Populated or error — either way we are done */
                                return r;
                }
                if (r < 0)
                        return r;

                return true;
        }
}
1283
1284 int cg_split_spec(const char *spec, char **controller, char **path) {
1285 char *t = NULL, *u = NULL;
1286 const char *e;
1287
1288 assert(spec);
1289
1290 if (*spec == '/') {
1291 if (!path_is_normalized(spec))
1292 return -EINVAL;
1293
1294 if (path) {
1295 t = strdup(spec);
1296 if (!t)
1297 return -ENOMEM;
1298
1299 *path = path_simplify(t, false);
1300 }
1301
1302 if (controller)
1303 *controller = NULL;
1304
1305 return 0;
1306 }
1307
1308 e = strchr(spec, ':');
1309 if (!e) {
1310 if (!cg_controller_is_valid(spec))
1311 return -EINVAL;
1312
1313 if (controller) {
1314 t = strdup(spec);
1315 if (!t)
1316 return -ENOMEM;
1317
1318 *controller = t;
1319 }
1320
1321 if (path)
1322 *path = NULL;
1323
1324 return 0;
1325 }
1326
1327 t = strndup(spec, e-spec);
1328 if (!t)
1329 return -ENOMEM;
1330 if (!cg_controller_is_valid(t)) {
1331 free(t);
1332 return -EINVAL;
1333 }
1334
1335 if (isempty(e+1))
1336 u = NULL;
1337 else {
1338 u = strdup(e+1);
1339 if (!u) {
1340 free(t);
1341 return -ENOMEM;
1342 }
1343
1344 if (!path_is_normalized(u) ||
1345 !path_is_absolute(u)) {
1346 free(t);
1347 free(u);
1348 return -EINVAL;
1349 }
1350
1351 path_simplify(u, false);
1352 }
1353
1354 if (controller)
1355 *controller = t;
1356 else
1357 free(t);
1358
1359 if (path)
1360 *path = u;
1361 else
1362 free(u);
1363
1364 return 0;
1365 }
1366
1367 int cg_mangle_path(const char *path, char **result) {
1368 _cleanup_free_ char *c = NULL, *p = NULL;
1369 char *t;
1370 int r;
1371
1372 assert(path);
1373 assert(result);
1374
1375 /* First, check if it already is a filesystem path */
1376 if (path_startswith(path, "/sys/fs/cgroup")) {
1377
1378 t = strdup(path);
1379 if (!t)
1380 return -ENOMEM;
1381
1382 *result = path_simplify(t, false);
1383 return 0;
1384 }
1385
1386 /* Otherwise, treat it as cg spec */
1387 r = cg_split_spec(path, &c, &p);
1388 if (r < 0)
1389 return r;
1390
1391 return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, result);
1392 }
1393
/* Determines the cgroup root systemd operates in, by looking up the cgroup of PID 1 and chopping off the
 * scope/slice suffixes PID 1 itself may be placed in. On success stores the newly allocated path in *path. */
int cg_get_root_path(char **path) {
        char *p, *e;
        int r;

        assert(path);

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &p);
        if (r < 0)
                return r;

        e = endswith(p, "/" SPECIAL_INIT_SCOPE);
        if (!e)
                e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */
        if (!e)
                e = endswith(p, "/system"); /* even more legacy */
        if (e)
                *e = 0;

        *path = p;
        return 0;
}
1415
1416 int cg_shift_path(const char *cgroup, const char *root, const char **shifted) {
1417 _cleanup_free_ char *rt = NULL;
1418 char *p;
1419 int r;
1420
1421 assert(cgroup);
1422 assert(shifted);
1423
1424 if (!root) {
1425 /* If the root was specified let's use that, otherwise
1426 * let's determine it from PID 1 */
1427
1428 r = cg_get_root_path(&rt);
1429 if (r < 0)
1430 return r;
1431
1432 root = rt;
1433 }
1434
1435 p = path_startswith(cgroup, root);
1436 if (p && p > cgroup)
1437 *shifted = p - 1;
1438 else
1439 *shifted = cgroup;
1440
1441 return 0;
1442 }
1443
1444 int cg_pid_get_path_shifted(pid_t pid, const char *root, char **cgroup) {
1445 _cleanup_free_ char *raw = NULL;
1446 const char *c;
1447 int r;
1448
1449 assert(pid >= 0);
1450 assert(cgroup);
1451
1452 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &raw);
1453 if (r < 0)
1454 return r;
1455
1456 r = cg_shift_path(raw, root, &c);
1457 if (r < 0)
1458 return r;
1459
1460 if (c == raw)
1461 *cgroup = TAKE_PTR(raw);
1462 else {
1463 char *n;
1464
1465 n = strdup(c);
1466 if (!n)
1467 return -ENOMEM;
1468
1469 *cgroup = n;
1470 }
1471
1472 return 0;
1473 }
1474
/* Decodes the first component of a cgroup path into a unit name, storing it newly allocated in *unit.
 * Returns -ENXIO if the component isn't a valid escaped unit name. */
int cg_path_decode_unit(const char *cgroup, char **unit) {
        char *c, *s;
        size_t n;

        assert(cgroup);
        assert(unit);

        /* Length of the first path component; shortest valid unit names are 3 chars ("x.y" style) */
        n = strcspn(cgroup, "/");
        if (n < 3)
                return -ENXIO;

        /* Work on a stack copy (strndupa), since cg_unescape() modifies in place */
        c = strndupa(cgroup, n);
        c = cg_unescape(c);

        if (!unit_name_is_valid(c, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
                return -ENXIO;

        s = strdup(c);
        if (!s)
                return -ENOMEM;

        *unit = s;
        return 0;
}
1499
1500 static bool valid_slice_name(const char *p, size_t n) {
1501
1502 if (!p)
1503 return false;
1504
1505 if (n < STRLEN("x.slice"))
1506 return false;
1507
1508 if (memcmp(p + n - 6, ".slice", 6) == 0) {
1509 char buf[n+1], *c;
1510
1511 memcpy(buf, p, n);
1512 buf[n] = 0;
1513
1514 c = cg_unescape(buf);
1515
1516 return unit_name_is_valid(c, UNIT_NAME_PLAIN);
1517 }
1518
1519 return false;
1520 }
1521
static const char *skip_slices(const char *p) {
        assert(p);

        /* Advances past any leading run of slice units in the path,
         * returning a pointer to the first non-slice component. */

        for (;;) {
                size_t len;

                p += strspn(p, "/");

                len = strcspn(p, "/");
                if (!valid_slice_name(p, len))
                        break;

                p += len;
        }

        return p;
}
1539
1540 int cg_path_get_unit(const char *path, char **ret) {
1541 const char *e;
1542 char *unit;
1543 int r;
1544
1545 assert(path);
1546 assert(ret);
1547
1548 e = skip_slices(path);
1549
1550 r = cg_path_decode_unit(e, &unit);
1551 if (r < 0)
1552 return r;
1553
1554 /* We skipped over the slices, don't accept any now */
1555 if (endswith(unit, ".slice")) {
1556 free(unit);
1557 return -ENXIO;
1558 }
1559
1560 *ret = unit;
1561 return 0;
1562 }
1563
1564 int cg_pid_get_unit(pid_t pid, char **unit) {
1565 _cleanup_free_ char *cgroup = NULL;
1566 int r;
1567
1568 assert(unit);
1569
1570 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1571 if (r < 0)
1572 return r;
1573
1574 return cg_path_get_unit(cgroup, unit);
1575 }
1576
/**
 * Skip session-*.scope, but require it to be there.
 *
 * Returns a pointer past the scope component (and any following
 * slashes), or NULL if the next path component is not a valid
 * session scope.
 */
static const char *skip_session(const char *p) {
        size_t n;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        n = strcspn(p, "/");
        if (n < STRLEN("session-x.scope"))
                return NULL;

        if (memcmp(p, "session-", 8) == 0 && memcmp(p + n - 6, ".scope", 6) == 0) {
                char buf[n - 8 - 6 + 1];

                memcpy(buf, p + 8, n - 8 - 6);
                buf[n - 8 - 6] = 0;

                /* Note that session scopes never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                if (!session_id_valid(buf))
                        return NULL; /* Previously 'return false;' — wrong constant kind for a
                                      * pointer-returning function (same value, cleaner type). */

                p += n;
                p += strspn(p, "/");
                return p;
        }

        return NULL;
}
1613
/**
 * Skip user@*.service, but require it to be there.
 */
static const char *skip_user_manager(const char *p) {
        size_t len;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        len = strcspn(p, "/");
        if (len < STRLEN("user@x.service"))
                return NULL;

        /* Require both the "user@" prefix and the ".service" suffix */
        if (memcmp(p, "user@", 5) != 0 || memcmp(p + len - 8, ".service", 8) != 0)
                return NULL;

        {
                char uid_str[len - 5 - 8 + 1];

                memcpy(uid_str, p + 5, len - 5 - 8);
                uid_str[len - 5 - 8] = 0;

                /* Note that user manager services never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                if (parse_uid(uid_str, NULL) < 0)
                        return NULL;
        }

        p += len;
        p += strspn(p, "/");

        return p;
}
1651
static const char *skip_user_prefix(const char *path) {
        const char *after_slices, *after_manager;

        assert(path);

        /* First get past any slice units */
        after_slices = skip_slices(path);

        /* Then require either the user manager service ... */
        after_manager = skip_user_manager(after_slices);
        if (after_manager)
                return after_manager;

        /* ... or a session scope to follow */
        return skip_session(after_slices);
}
1668
int cg_path_get_user_unit(const char *path, char **ret) {
        const char *rest;

        assert(path);
        assert(ret);

        /* A user unit is only valid below the per-user prefix
         * (slices + user manager or session scope) */
        rest = skip_user_prefix(path);
        if (!rest)
                return -ENXIO;

        /* From this point on the path looks just like a system unit
         * path, hence reuse that parser. */
        return cg_path_get_unit(rest, ret);
}
1684
1685 int cg_pid_get_user_unit(pid_t pid, char **unit) {
1686 _cleanup_free_ char *cgroup = NULL;
1687 int r;
1688
1689 assert(unit);
1690
1691 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1692 if (r < 0)
1693 return r;
1694
1695 return cg_path_get_user_unit(cgroup, unit);
1696 }
1697
1698 int cg_path_get_machine_name(const char *path, char **machine) {
1699 _cleanup_free_ char *u = NULL;
1700 const char *sl;
1701 int r;
1702
1703 r = cg_path_get_unit(path, &u);
1704 if (r < 0)
1705 return r;
1706
1707 sl = strjoina("/run/systemd/machines/unit:", u);
1708 return readlink_malloc(sl, machine);
1709 }
1710
1711 int cg_pid_get_machine_name(pid_t pid, char **machine) {
1712 _cleanup_free_ char *cgroup = NULL;
1713 int r;
1714
1715 assert(machine);
1716
1717 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1718 if (r < 0)
1719 return r;
1720
1721 return cg_path_get_machine_name(cgroup, machine);
1722 }
1723
1724 int cg_path_get_session(const char *path, char **session) {
1725 _cleanup_free_ char *unit = NULL;
1726 char *start, *end;
1727 int r;
1728
1729 assert(path);
1730
1731 r = cg_path_get_unit(path, &unit);
1732 if (r < 0)
1733 return r;
1734
1735 start = startswith(unit, "session-");
1736 if (!start)
1737 return -ENXIO;
1738 end = endswith(start, ".scope");
1739 if (!end)
1740 return -ENXIO;
1741
1742 *end = 0;
1743 if (!session_id_valid(start))
1744 return -ENXIO;
1745
1746 if (session) {
1747 char *rr;
1748
1749 rr = strdup(start);
1750 if (!rr)
1751 return -ENOMEM;
1752
1753 *session = rr;
1754 }
1755
1756 return 0;
1757 }
1758
1759 int cg_pid_get_session(pid_t pid, char **session) {
1760 _cleanup_free_ char *cgroup = NULL;
1761 int r;
1762
1763 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1764 if (r < 0)
1765 return r;
1766
1767 return cg_path_get_session(cgroup, session);
1768 }
1769
1770 int cg_path_get_owner_uid(const char *path, uid_t *uid) {
1771 _cleanup_free_ char *slice = NULL;
1772 char *start, *end;
1773 int r;
1774
1775 assert(path);
1776
1777 r = cg_path_get_slice(path, &slice);
1778 if (r < 0)
1779 return r;
1780
1781 start = startswith(slice, "user-");
1782 if (!start)
1783 return -ENXIO;
1784 end = endswith(start, ".slice");
1785 if (!end)
1786 return -ENXIO;
1787
1788 *end = 0;
1789 if (parse_uid(start, uid) < 0)
1790 return -ENXIO;
1791
1792 return 0;
1793 }
1794
1795 int cg_pid_get_owner_uid(pid_t pid, uid_t *uid) {
1796 _cleanup_free_ char *cgroup = NULL;
1797 int r;
1798
1799 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1800 if (r < 0)
1801 return r;
1802
1803 return cg_path_get_owner_uid(cgroup, uid);
1804 }
1805
int cg_path_get_slice(const char *p, char **slice) {
        const char *e = NULL;

        assert(p);
        assert(slice);

        /* Finds the right-most slice unit from the beginning, but
         * stops before we come to the first non-slice unit. */

        for (;;) {
                size_t n;

                /* Skip over path separators */
                p += strspn(p, "/");

                /* Length of the next path component */
                n = strcspn(p, "/");
                if (!valid_slice_name(p, n)) {

                        /* No slice seen at all → the path lives directly
                         * below the root slice */
                        if (!e) {
                                char *s;

                                s = strdup(SPECIAL_ROOT_SLICE);
                                if (!s)
                                        return -ENOMEM;

                                *slice = s;
                                return 0;
                        }

                        /* 'e' points at the innermost slice component seen */
                        return cg_path_decode_unit(e, slice);
                }

                /* Remember this slice and keep scanning deeper */
                e = p;
                p += n;
        }
}
1841
1842 int cg_pid_get_slice(pid_t pid, char **slice) {
1843 _cleanup_free_ char *cgroup = NULL;
1844 int r;
1845
1846 assert(slice);
1847
1848 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1849 if (r < 0)
1850 return r;
1851
1852 return cg_path_get_slice(cgroup, slice);
1853 }
1854
int cg_path_get_user_slice(const char *p, char **slice) {
        const char *rest;

        assert(p);
        assert(slice);

        /* Only paths below the per-user prefix carry a user slice */
        rest = skip_user_prefix(p);
        if (!rest)
                return -ENXIO;

        /* Below the prefix the layout matches the system case, so the
         * system slice parser applies unchanged. */
        return cg_path_get_slice(rest, slice);
}
1868
1869 int cg_pid_get_user_slice(pid_t pid, char **slice) {
1870 _cleanup_free_ char *cgroup = NULL;
1871 int r;
1872
1873 assert(slice);
1874
1875 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1876 if (r < 0)
1877 return r;
1878
1879 return cg_path_get_user_slice(cgroup, slice);
1880 }
1881
char *cg_escape(const char *p) {
        bool need_prefix = false;

        /* This implements very minimal escaping for names to be used
         * as file names in the cgroup tree: any name which might
         * conflict with a kernel name or is prefixed with '_' is
         * prefixed with a '_'. That way, when reading cgroup names it
         * is sufficient to remove a single prefixing underscore if
         * there is one. */

        /* The return value of this function (unlike cg_unescape())
         * needs free()! */

        /* Empty names, names starting with '_' or '.', and the names of
         * the kernel's own attribute files must carry the prefix */
        if (IN_SET(p[0], 0, '_', '.') ||
            STR_IN_SET(p, "notify_on_release", "release_agent", "tasks") ||
            startswith(p, "cgroup."))
                need_prefix = true;
        else {
                const char *dot;

                /* Also escape names of the form "<controller>.<anything>",
                 * since the kernel creates attribute files named like that */
                dot = strrchr(p, '.');
                if (dot) {
                        CGroupController c;
                        size_t l = dot - p;

                        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                                const char *n;

                                n = cgroup_controller_to_string(c);

                                /* Compare the part before the final dot
                                 * against each known controller name */
                                if (l != strlen(n))
                                        continue;

                                if (memcmp(p, n, l) != 0)
                                        continue;

                                need_prefix = true;
                                break;
                        }
                }
        }

        if (need_prefix)
                return strjoin("_", p);

        return strdup(p);
}
1929
char *cg_unescape(const char *p) {
        assert(p);

        /* Undoes cg_escape(): drops a single leading '_' if present.
         * The returned pointer aliases the input and (unlike the
         * return value of cg_escape()) must not be free()d. */

        return (char*) (p[0] == '_' ? p + 1 : p);
}
1941
1942 #define CONTROLLER_VALID \
1943 DIGITS LETTERS \
1944 "_"
1945
bool cg_controller_is_valid(const char *p) {
        const char *t, *s;

        if (!p)
                return false;

        /* Our own pseudo-controller is always acceptable */
        if (streq(p, SYSTEMD_CGROUP_CONTROLLER))
                return true;

        /* Named hierarchies are validated by their payload name only */
        s = startswith(p, "name=");
        if (s)
                p = s;

        /* Must be non-empty and must not begin with '_', which is
         * reserved for cg_escape() */
        if (IN_SET(*p, 0, '_'))
                return false;

        /* Only characters from CONTROLLER_VALID (alphanumerics and
         * underscore) are permitted */
        for (t = p; *t; t++)
                if (!strchr(CONTROLLER_VALID, *t))
                        return false;

        /* Controller names end up as directory names, hence must obey
         * the file name length limit */
        if (t - p > FILENAME_MAX)
                return false;

        return true;
}
1971
int cg_slice_to_path(const char *unit, char **ret) {
        _cleanup_free_ char *p = NULL, *s = NULL, *e = NULL;
        const char *dash;
        int r;

        assert(unit);
        assert(ret);

        /* Turns a slice unit name into its cgroup path, expanding each
         * dash-separated component into a nested directory, e.g.
         * "foo-bar.slice" → "foo.slice/foo-bar.slice". */

        /* The root slice maps to the cgroup root itself */
        if (streq(unit, SPECIAL_ROOT_SLICE)) {
                char *x;

                x = strdup("");
                if (!x)
                        return -ENOMEM;
                *ret = x;
                return 0;
        }

        if (!unit_name_is_valid(unit, UNIT_NAME_PLAIN))
                return -EINVAL;

        if (!endswith(unit, ".slice"))
                return -EINVAL;

        r = unit_name_to_prefix(unit, &p);
        if (r < 0)
                return r;

        dash = strchr(p, '-');

        /* Don't allow initial dashes */
        if (dash == p)
                return -EINVAL;

        /* For each dash, append the escaped name of the corresponding
         * ancestor slice plus a '/' separator */
        while (dash) {
                _cleanup_free_ char *escaped = NULL;
                char n[dash - p + sizeof(".slice")];

#if HAS_FEATURE_MEMORY_SANITIZER
                /* msan doesn't instrument stpncpy, so it thinks
                 * n is later used uninitialized:
                 * https://github.com/google/sanitizers/issues/926
                 */
                zero(n);
#endif

                /* Don't allow trailing or double dashes */
                if (IN_SET(dash[1], 0, '-'))
                        return -EINVAL;

                /* Compose "<prefix-up-to-dash>.slice" and validate it */
                strcpy(stpncpy(n, p, dash - p), ".slice");
                if (!unit_name_is_valid(n, UNIT_NAME_PLAIN))
                        return -EINVAL;

                escaped = cg_escape(n);
                if (!escaped)
                        return -ENOMEM;

                if (!strextend(&s, escaped, "/", NULL))
                        return -ENOMEM;

                dash = strchr(dash+1, '-');
        }

        /* Finally append the full slice name itself */
        e = cg_escape(unit);
        if (!e)
                return -ENOMEM;

        if (!strextend(&s, e, NULL))
                return -ENOMEM;

        *ret = TAKE_PTR(s);

        return 0;
}
2047
2048 int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value) {
2049 _cleanup_free_ char *p = NULL;
2050 int r;
2051
2052 r = cg_get_path(controller, path, attribute, &p);
2053 if (r < 0)
2054 return r;
2055
2056 return write_string_file(p, value, WRITE_STRING_FILE_DISABLE_BUFFER);
2057 }
2058
2059 int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
2060 _cleanup_free_ char *p = NULL;
2061 int r;
2062
2063 r = cg_get_path(controller, path, attribute, &p);
2064 if (r < 0)
2065 return r;
2066
2067 return read_one_line_file(p, ret);
2068 }
2069
int cg_get_keyed_attribute(
                const char *controller,
                const char *path,
                const char *attribute,
                char **keys,
                char **ret_values) {

        _cleanup_free_ char *filename = NULL, *contents = NULL;
        const char *p;
        size_t n, i, n_done = 0;
        char **v;
        int r;

        /* Reads one or more fields of a cgroup v2 keyed attribute file. The 'keys' parameter should be an strv with
         * all keys to retrieve. The 'ret_values' parameter should be passed as a string array with the same number of
         * entries as 'keys'. On success each entry will be set to the value of the matching key.
         *
         * If the attribute file doesn't exist at all returns ENOENT, if any key is not found returns ENXIO. */

        r = cg_get_path(controller, path, attribute, &filename);
        if (r < 0)
                return r;

        r = read_full_file(filename, &contents, NULL);
        if (r < 0)
                return r;

        n = strv_length(keys);
        if (n == 0) /* No keys to retrieve? That's easy, we are done then */
                return 0;

        /* Let's build this up in a temporary array for now in order not to clobber the return parameter on failure */
        v = newa0(char*, n);

        for (p = contents; *p;) {
                const char *w = NULL;

                /* Does this line start with one of the not-yet-found keys? */
                for (i = 0; i < n; i++)
                        if (!v[i]) {
                                w = first_word(p, keys[i]);
                                if (w)
                                        break;
                        }

                if (w) {
                        size_t l;

                        /* Grab the remainder of the line as this key's value */
                        l = strcspn(w, NEWLINE);
                        v[i] = strndup(w, l);
                        if (!v[i]) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        n_done++;
                        if (n_done >= n)
                                goto done;

                        p = w + l;
                } else
                        p += strcspn(p, NEWLINE);

                p += strspn(p, NEWLINE);
        }

        /* At least one requested key never appeared in the file */
        r = -ENXIO;

fail:
        for (i = 0; i < n; i++)
                free(v[i]);

        return r;

done:
        memcpy(ret_values, v, sizeof(char*) * n);
        return 0;

}
2148
2149 int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) {
2150 CGroupController c;
2151 CGroupMask done;
2152 bool created;
2153 int r;
2154
2155 /* This one will create a cgroup in our private tree, but also
2156 * duplicate it in the trees specified in mask, and remove it
2157 * in all others.
2158 *
2159 * Returns 0 if the group already existed in the systemd hierarchy,
2160 * 1 on success, negative otherwise.
2161 */
2162
2163 /* First create the cgroup in our own hierarchy. */
2164 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path);
2165 if (r < 0)
2166 return r;
2167 created = r;
2168
2169 /* If we are in the unified hierarchy, we are done now */
2170 r = cg_all_unified();
2171 if (r < 0)
2172 return r;
2173 if (r > 0)
2174 return created;
2175
2176 supported &= CGROUP_MASK_V1;
2177 mask = CGROUP_MASK_EXTEND_JOINED(mask);
2178 done = 0;
2179
2180 /* Otherwise, do the same in the other hierarchies */
2181 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2182 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2183 const char *n;
2184
2185 if (!FLAGS_SET(supported, bit))
2186 continue;
2187
2188 if (FLAGS_SET(done, bit))
2189 continue;
2190
2191 n = cgroup_controller_to_string(c);
2192 if (FLAGS_SET(mask, bit))
2193 (void) cg_create(n, path);
2194 else
2195 (void) cg_trim(n, path, true);
2196
2197 done |= CGROUP_MASK_EXTEND_JOINED(bit);
2198 }
2199
2200 return created;
2201 }
2202
2203 int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
2204 CGroupController c;
2205 CGroupMask done;
2206 int r;
2207
2208 r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid);
2209 if (r < 0)
2210 return r;
2211
2212 r = cg_all_unified();
2213 if (r < 0)
2214 return r;
2215 if (r > 0)
2216 return 0;
2217
2218 supported &= CGROUP_MASK_V1;
2219 done = 0;
2220
2221 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2222 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2223 const char *p = NULL;
2224
2225 if (!FLAGS_SET(supported, bit))
2226 continue;
2227
2228 if (FLAGS_SET(done, bit))
2229 continue;
2230
2231 if (path_callback)
2232 p = path_callback(bit, userdata);
2233 if (!p)
2234 p = path;
2235
2236 (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid);
2237 done |= CGROUP_MASK_EXTEND_JOINED(bit);
2238 }
2239
2240 return 0;
2241 }
2242
2243 int cg_attach_many_everywhere(CGroupMask supported, const char *path, Set* pids, cg_migrate_callback_t path_callback, void *userdata) {
2244 Iterator i;
2245 void *pidp;
2246 int r = 0;
2247
2248 SET_FOREACH(pidp, pids, i) {
2249 pid_t pid = PTR_TO_PID(pidp);
2250 int q;
2251
2252 q = cg_attach_everywhere(supported, path, pid, path_callback, userdata);
2253 if (q < 0 && r >= 0)
2254 r = q;
2255 }
2256
2257 return r;
2258 }
2259
int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to, cg_migrate_callback_t to_callback, void *userdata) {
        CGroupController c;
        CGroupMask done;
        int r = 0, q;

        /* Migrates all processes from 'from' to 'to' in the systemd
         * hierarchy, then mirrors the move in every supported v1
         * controller hierarchy. */

        if (!path_equal(from, to)) {
                r = cg_migrate_recursive(SYSTEMD_CGROUP_CONTROLLER, from, SYSTEMD_CGROUP_CONTROLLER, to, CGROUP_REMOVE);
                if (r < 0)
                        return r;
        }

        /* On a pure unified setup there are no per-controller trees */
        q = cg_all_unified();
        if (q < 0)
                return q;
        if (q > 0)
                return r;

        supported &= CGROUP_MASK_V1;
        done = 0;

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *p = NULL;

                if (!FLAGS_SET(supported, bit))
                        continue;

                if (FLAGS_SET(done, bit))
                        continue;

                /* The callback may redirect this controller elsewhere */
                if (to_callback)
                        p = to_callback(bit, userdata);
                if (!p)
                        p = to;

                /* Note: the migration source here is the systemd
                 * hierarchy's 'to' group, i.e. the processes moved
                 * above — presumably intentional, as they already live
                 * there at this point. */
                (void) cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, to, cgroup_controller_to_string(c), p, 0);
                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return r;
}
2301
2302 int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) {
2303 CGroupController c;
2304 CGroupMask done;
2305 int r, q;
2306
2307 r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root);
2308 if (r < 0)
2309 return r;
2310
2311 q = cg_all_unified();
2312 if (q < 0)
2313 return q;
2314 if (q > 0)
2315 return r;
2316
2317 supported &= CGROUP_MASK_V1;
2318 done = 0;
2319
2320 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2321 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2322
2323 if (!FLAGS_SET(supported, bit))
2324 continue;
2325
2326 if (FLAGS_SET(done, bit))
2327 continue;
2328
2329 (void) cg_trim(cgroup_controller_to_string(c), path, delete_root);
2330 done |= CGROUP_MASK_EXTEND_JOINED(bit);
2331 }
2332
2333 return r;
2334 }
2335
int cg_mask_to_string(CGroupMask mask, char **ret) {
        _cleanup_free_ char *s = NULL;
        size_t n = 0, allocated = 0;
        bool space = false;
        CGroupController c;

        assert(ret);

        /* Renders a controller mask as a space-separated list of
         * controller names; an empty mask yields a NULL string. */

        if (mask == 0) {
                *ret = NULL;
                return 0;
        }

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                const char *k;
                size_t l;

                if (!FLAGS_SET(mask, CGROUP_CONTROLLER_TO_MASK(c)))
                        continue;

                k = cgroup_controller_to_string(c);
                l = strlen(k);

                /* Reserve room for an optional separator, the name and
                 * the trailing NUL byte */
                if (!GREEDY_REALLOC(s, allocated, n + space + l + 1))
                        return -ENOMEM;

                /* Separate from the previous entry, if any */
                if (space)
                        s[n] = ' ';
                memcpy(s + n + space, k, l);
                n += space + l;

                space = true;
        }

        /* mask was non-zero, so at least one name must have matched */
        assert(s);

        s[n] = 0;
        *ret = TAKE_PTR(s);

        return 0;
}
2377
2378 int cg_mask_from_string(const char *value, CGroupMask *ret) {
2379 CGroupMask m = 0;
2380
2381 assert(ret);
2382 assert(value);
2383
2384 for (;;) {
2385 _cleanup_free_ char *n = NULL;
2386 CGroupController v;
2387 int r;
2388
2389 r = extract_first_word(&value, &n, NULL, 0);
2390 if (r < 0)
2391 return r;
2392 if (r == 0)
2393 break;
2394
2395 v = cgroup_controller_from_string(n);
2396 if (v < 0)
2397 continue;
2398
2399 m |= CGROUP_CONTROLLER_TO_MASK(v);
2400 }
2401
2402 *ret = m;
2403 return 0;
2404 }
2405
int cg_mask_supported(CGroupMask *ret) {
        CGroupMask mask;
        int r;

        /* Determines the mask of supported cgroup controllers. Only includes controllers we can make sense of and that
         * are actually accessible. Only covers real controllers, i.e. not the CGROUP_CONTROLLER_BPF_xyz
         * pseudo-controllers. */

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                _cleanup_free_ char *root = NULL, *controllers = NULL, *path = NULL;

                /* In the unified hierarchy we can read the supported
                 * and accessible controllers from the top-level
                 * cgroup attribute */

                r = cg_get_root_path(&root);
                if (r < 0)
                        return r;

                r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, root, "cgroup.controllers", &path);
                if (r < 0)
                        return r;

                r = read_one_line_file(path, &controllers);
                if (r < 0)
                        return r;

                r = cg_mask_from_string(controllers, &mask);
                if (r < 0)
                        return r;

                /* Currently, we support the cpu, memory, io and pids controller in the unified hierarchy, mask
                 * everything else off. */
                mask &= CGROUP_MASK_V2;

        } else {
                CGroupController c;

                /* In the legacy hierarchy, we check which hierarchies are mounted. */

                mask = 0;
                for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                        CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                        const char *n;

                        /* Skip anything that is not a v1 controller */
                        if (!FLAGS_SET(CGROUP_MASK_V1, bit))
                                continue;

                        /* Accessible here means its hierarchy is mounted */
                        n = cgroup_controller_to_string(c);
                        if (controller_is_accessible(n) >= 0)
                                mask |= bit;
                }
        }

        *ret = mask;
        return 0;
}
2466
2467 int cg_kernel_controllers(Set **ret) {
2468 _cleanup_set_free_free_ Set *controllers = NULL;
2469 _cleanup_fclose_ FILE *f = NULL;
2470 int r;
2471
2472 assert(ret);
2473
2474 /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support
2475 * and controllers that aren't currently accessible (because not mounted). This does not include "name="
2476 * pseudo-controllers. */
2477
2478 controllers = set_new(&string_hash_ops);
2479 if (!controllers)
2480 return -ENOMEM;
2481
2482 r = fopen_unlocked("/proc/cgroups", "re", &f);
2483 if (r == -ENOENT) {
2484 *ret = NULL;
2485 return 0;
2486 }
2487 if (r < 0)
2488 return r;
2489
2490 /* Ignore the header line */
2491 (void) read_line(f, (size_t) -1, NULL);
2492
2493 for (;;) {
2494 char *controller;
2495 int enabled = 0;
2496
2497 errno = 0;
2498 if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
2499
2500 if (feof(f))
2501 break;
2502
2503 if (ferror(f))
2504 return errno_or_else(EIO);
2505
2506 return -EBADMSG;
2507 }
2508
2509 if (!enabled) {
2510 free(controller);
2511 continue;
2512 }
2513
2514 if (!cg_controller_is_valid(controller)) {
2515 free(controller);
2516 return -EBADMSG;
2517 }
2518
2519 r = set_consume(controllers, controller);
2520 if (r < 0)
2521 return r;
2522 }
2523
2524 *ret = TAKE_PTR(controllers);
2525
2526 return 0;
2527 }
2528
2529 static thread_local CGroupUnified unified_cache = CGROUP_UNIFIED_UNKNOWN;
2530
2531 /* The hybrid mode was initially implemented in v232 and simply mounted cgroup2 on /sys/fs/cgroup/systemd. This
2532 * unfortunately broke other tools (such as docker) which expected the v1 "name=systemd" hierarchy on
 * /sys/fs/cgroup/systemd. From v233 and on, the hybrid mode mounts v2 on /sys/fs/cgroup/unified and maintains
2534 * "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility with other tools.
2535 *
2536 * To keep live upgrade working, we detect and support v232 layout. When v232 layout is detected, to keep cgroup v2
2537 * process management but disable the compat dual layout, we return %true on
2538 * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and %false on cg_hybrid_unified().
2539 */
2540 static thread_local bool unified_systemd_v232;
2541
2542 static int cg_unified_update(void) {
2543
2544 struct statfs fs;
2545
2546 /* Checks if we support the unified hierarchy. Returns an
2547 * error when the cgroup hierarchies aren't mounted yet or we
2548 * have any other trouble determining if the unified hierarchy
2549 * is supported. */
2550
2551 if (unified_cache >= CGROUP_UNIFIED_NONE)
2552 return 0;
2553
2554 if (statfs("/sys/fs/cgroup/", &fs) < 0)
2555 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/\") failed: %m");
2556
2557 if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2558 log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
2559 unified_cache = CGROUP_UNIFIED_ALL;
2560 } else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2561 if (statfs("/sys/fs/cgroup/unified/", &fs) == 0 &&
2562 F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2563 log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller");
2564 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2565 unified_systemd_v232 = false;
2566 } else {
2567 if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0)
2568 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/systemd\" failed: %m");
2569
2570 if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2571 log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2572 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2573 unified_systemd_v232 = true;
2574 } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)) {
2575 log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2576 unified_cache = CGROUP_UNIFIED_NONE;
2577 } else {
2578 log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
2579 (unsigned long long) fs.f_type);
2580 unified_cache = CGROUP_UNIFIED_NONE;
2581 }
2582 }
2583 } else
2584 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
2585 "Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
2586 (unsigned long long)fs.f_type);
2587
2588 return 0;
2589 }
2590
2591 int cg_unified_controller(const char *controller) {
2592 int r;
2593
2594 r = cg_unified_update();
2595 if (r < 0)
2596 return r;
2597
2598 if (unified_cache == CGROUP_UNIFIED_NONE)
2599 return false;
2600
2601 if (unified_cache >= CGROUP_UNIFIED_ALL)
2602 return true;
2603
2604 return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER);
2605 }
2606
2607 int cg_all_unified(void) {
2608 int r;
2609
2610 r = cg_unified_update();
2611 if (r < 0)
2612 return r;
2613
2614 return unified_cache >= CGROUP_UNIFIED_ALL;
2615 }
2616
2617 int cg_hybrid_unified(void) {
2618 int r;
2619
2620 r = cg_unified_update();
2621 if (r < 0)
2622 return r;
2623
2624 return unified_cache == CGROUP_UNIFIED_SYSTEMD && !unified_systemd_v232;
2625 }
2626
int cg_unified_flush(void) {
        /* Invalidates the cached hierarchy layout and re-detects it */
        unified_cache = CGROUP_UNIFIED_UNKNOWN;

        return cg_unified_update();
}
2632
int cg_enable_everywhere(
                CGroupMask supported,
                CGroupMask mask,
                const char *p,
                CGroupMask *ret_result_mask) {

        _cleanup_fclose_ FILE *f = NULL;
        _cleanup_free_ char *fs = NULL;
        CGroupController c;
        CGroupMask ret = 0;
        int r;

        /* Enables (or disables) the cgroup v2 controllers listed in 'mask' for the children of cgroup 'p', by writing
         * to its cgroup.subtree_control file. On return *ret_result_mask (if given) holds the set of controllers
         * believed enabled afterwards. */

        assert(p);

        if (supported == 0) {
                if (ret_result_mask)
                        *ret_result_mask = 0;
                return 0;
        }

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r == 0) {
                /* On the legacy hierarchy there's no concept of "enabling" controllers in cgroups defined. Let's claim
                 * complete success right away. (If you wonder why we return the full mask here, rather than zero: the
                 * caller tends to use the returned mask later on to compare if all controllers where properly joined,
                 * and if not requeues realization. This use is the primary purpose of the return value, hence let's
                 * minimize surprises here and reduce triggers for re-realization by always saying we fully
                 * succeeded.) */
                if (ret_result_mask)
                        *ret_result_mask = mask & supported & CGROUP_MASK_V2; /* If you wonder why we mask this with
                                                                               * CGROUP_MASK_V2: The 'supported' mask
                                                                               * might contain pure-V1 or BPF
                                                                               * controllers, and we never want to
                                                                               * claim that we could enable those with
                                                                               * cgroup.subtree_control */
                return 0;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs);
        if (r < 0)
                return r;

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *n;

                /* Only real v2 controllers can be toggled this way */
                if (!FLAGS_SET(CGROUP_MASK_V2, bit))
                        continue;

                if (!FLAGS_SET(supported, bit))
                        continue;

                n = cgroup_controller_to_string(c);
                {
                        /* Build "+name" or "-name" for the subtree_control file */
                        char s[1 + strlen(n) + 1];

                        s[0] = FLAGS_SET(mask, bit) ? '+' : '-';
                        strcpy(s + 1, n);

                        /* Open the control file lazily, on first use */
                        if (!f) {
                                f = fopen(fs, "we");
                                if (!f)
                                        return log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p);
                        }

                        r = write_string_stream(f, s, WRITE_STRING_FILE_DISABLE_BUFFER);
                        if (r < 0) {
                                log_debug_errno(r, "Failed to %s controller %s for %s (%s): %m",
                                                FLAGS_SET(mask, bit) ? "enable" : "disable", n, p, fs);
                                clearerr(f);

                                /* If we can't turn off a controller, leave it on in the reported resulting mask. This
                                 * happens for example when we attempt to turn off a controller up in the tree that is
                                 * used down in the tree. */
                                if (!FLAGS_SET(mask, bit) && r == -EBUSY) /* You might wonder why we check for EBUSY
                                                                           * only here, and not follow the same logic
                                                                           * for other errors such as EINVAL or
                                                                           * EOPNOTSUPP or anything else. That's
                                                                           * because EBUSY indicates that the
                                                                           * controllers is currently enabled and
                                                                           * cannot be disabled because something down
                                                                           * the hierarchy is still using it. Any other
                                                                           * error most likely means something like "I
                                                                           * never heard of this controller" or
                                                                           * similar. In the former case it's hence
                                                                           * safe to assume the controller is still on
                                                                           * after the failed operation, while in the
                                                                           * latter case it's safer to assume the
                                                                           * controller is unknown and hence certainly
                                                                           * not enabled. */
                                        ret |= bit;
                        } else {
                                /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */
                                if (FLAGS_SET(mask, bit))
                                        ret |= bit;
                        }
                }
        }

        /* Let's return the precise set of controllers now enabled for the cgroup. */
        if (ret_result_mask)
                *ret_result_mask = ret;

        return 0;
}
2740
2741 bool cg_is_unified_wanted(void) {
2742 static thread_local int wanted = -1;
2743 int r;
2744 bool b;
2745 const bool is_default = DEFAULT_HIERARCHY == CGROUP_UNIFIED_ALL;
2746 _cleanup_free_ char *c = NULL;
2747
2748 /* If we have a cached value, return that. */
2749 if (wanted >= 0)
2750 return wanted;
2751
2752 /* If the hierarchy is already mounted, then follow whatever
2753 * was chosen for it. */
2754 if (cg_unified_flush() >= 0)
2755 return (wanted = unified_cache >= CGROUP_UNIFIED_ALL);
2756
2757 /* If we were explicitly passed systemd.unified_cgroup_hierarchy,
2758 * respect that. */
2759 r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b);
2760 if (r > 0)
2761 return (wanted = b);
2762
2763 /* If we passed cgroup_no_v1=all with no other instructions, it seems
2764 * highly unlikely that we want to use hybrid or legacy hierarchy. */
2765 r = proc_cmdline_get_key("cgroup_no_v1", 0, &c);
2766 if (r > 0 && streq_ptr(c, "all"))
2767 return (wanted = true);
2768
2769 return (wanted = is_default);
2770 }
2771
2772 bool cg_is_legacy_wanted(void) {
2773 static thread_local int wanted = -1;
2774
2775 /* If we have a cached value, return that. */
2776 if (wanted >= 0)
2777 return wanted;
2778
2779 /* Check if we have cgroup v2 already mounted. */
2780 if (cg_unified_flush() >= 0 &&
2781 unified_cache == CGROUP_UNIFIED_ALL)
2782 return (wanted = false);
2783
2784 /* Otherwise, assume that at least partial legacy is wanted,
2785 * since cgroup v2 should already be mounted at this point. */
2786 return (wanted = true);
2787 }
2788
2789 bool cg_is_hybrid_wanted(void) {
2790 static thread_local int wanted = -1;
2791 int r;
2792 bool b;
2793 const bool is_default = DEFAULT_HIERARCHY >= CGROUP_UNIFIED_SYSTEMD;
2794 /* We default to true if the default is "hybrid", obviously,
2795 * but also when the default is "unified", because if we get
2796 * called, it means that unified hierarchy was not mounted. */
2797
2798 /* If we have a cached value, return that. */
2799 if (wanted >= 0)
2800 return wanted;
2801
2802 /* If the hierarchy is already mounted, then follow whatever
2803 * was chosen for it. */
2804 if (cg_unified_flush() >= 0 &&
2805 unified_cache == CGROUP_UNIFIED_ALL)
2806 return (wanted = false);
2807
2808 /* Otherwise, let's see what the kernel command line has to say.
2809 * Since checking is expensive, cache a non-error result. */
2810 r = proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", &b);
2811
2812 /* The meaning of the kernel option is reversed wrt. to the return value
2813 * of this function, hence the negation. */
2814 return (wanted = r > 0 ? !b : is_default);
2815 }
2816
2817 int cg_weight_parse(const char *s, uint64_t *ret) {
2818 uint64_t u;
2819 int r;
2820
2821 if (isempty(s)) {
2822 *ret = CGROUP_WEIGHT_INVALID;
2823 return 0;
2824 }
2825
2826 r = safe_atou64(s, &u);
2827 if (r < 0)
2828 return r;
2829
2830 if (u < CGROUP_WEIGHT_MIN || u > CGROUP_WEIGHT_MAX)
2831 return -ERANGE;
2832
2833 *ret = u;
2834 return 0;
2835 }
2836
/* Default IO limits: every limit type starts out unrestricted. */
const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = {
        [CGROUP_IO_RBPS_MAX] = CGROUP_LIMIT_MAX,
        [CGROUP_IO_WBPS_MAX] = CGROUP_LIMIT_MAX,
        [CGROUP_IO_RIOPS_MAX] = CGROUP_LIMIT_MAX,
        [CGROUP_IO_WIOPS_MAX] = CGROUP_LIMIT_MAX,
};

/* Human-readable names (unit file property spellings) for the IO limit types. */
static const char* const cgroup_io_limit_type_table[_CGROUP_IO_LIMIT_TYPE_MAX] = {
        [CGROUP_IO_RBPS_MAX] = "IOReadBandwidthMax",
        [CGROUP_IO_WBPS_MAX] = "IOWriteBandwidthMax",
        [CGROUP_IO_RIOPS_MAX] = "IOReadIOPSMax",
        [CGROUP_IO_WIOPS_MAX] = "IOWriteIOPSMax",
};

/* Generates cgroup_io_limit_type_to_string()/cgroup_io_limit_type_from_string()
 * from the table above. */
DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type, CGroupIOLimitType);
2852
2853 int cg_cpu_shares_parse(const char *s, uint64_t *ret) {
2854 uint64_t u;
2855 int r;
2856
2857 if (isempty(s)) {
2858 *ret = CGROUP_CPU_SHARES_INVALID;
2859 return 0;
2860 }
2861
2862 r = safe_atou64(s, &u);
2863 if (r < 0)
2864 return r;
2865
2866 if (u < CGROUP_CPU_SHARES_MIN || u > CGROUP_CPU_SHARES_MAX)
2867 return -ERANGE;
2868
2869 *ret = u;
2870 return 0;
2871 }
2872
2873 int cg_blkio_weight_parse(const char *s, uint64_t *ret) {
2874 uint64_t u;
2875 int r;
2876
2877 if (isempty(s)) {
2878 *ret = CGROUP_BLKIO_WEIGHT_INVALID;
2879 return 0;
2880 }
2881
2882 r = safe_atou64(s, &u);
2883 if (r < 0)
2884 return r;
2885
2886 if (u < CGROUP_BLKIO_WEIGHT_MIN || u > CGROUP_BLKIO_WEIGHT_MAX)
2887 return -ERANGE;
2888
2889 *ret = u;
2890 return 0;
2891 }
2892
2893 bool is_cgroup_fs(const struct statfs *s) {
2894 return is_fs_type(s, CGROUP_SUPER_MAGIC) ||
2895 is_fs_type(s, CGROUP2_SUPER_MAGIC);
2896 }
2897
/* Returns true if fd refers to a cgroup (v1 or v2) file system.
 *
 * Since the return type is bool we cannot propagate an error code: the
 * previous "return -errno" on fstatfs() failure implicitly converted any
 * (non-zero, negative) error into 'true', misreporting failures as "is a
 * cgroup fs". Treat errors as false instead. */
bool fd_is_cgroup_fs(int fd) {
        struct statfs s;

        if (fstatfs(fd, &s) < 0)
                return false;

        return is_cgroup_fs(&s);
}
2906
/* Canonical controller names, as used in cgroup mount options and in
 * cgroup.controllers/cgroup.subtree_control files. The BPF pseudo-controllers
 * are systemd-internal and have no kernel-side name. */
static const char *const cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
        [CGROUP_CONTROLLER_CPU] = "cpu",
        [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
        [CGROUP_CONTROLLER_IO] = "io",
        [CGROUP_CONTROLLER_BLKIO] = "blkio",
        [CGROUP_CONTROLLER_MEMORY] = "memory",
        [CGROUP_CONTROLLER_DEVICES] = "devices",
        [CGROUP_CONTROLLER_PIDS] = "pids",
        [CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall",
        [CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices",
};

/* Generates cgroup_controller_to_string()/cgroup_controller_from_string() from
 * the table above. */
DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);
2920
2921 CGroupMask get_cpu_accounting_mask(void) {
2922 static CGroupMask needed_mask = (CGroupMask) -1;
2923
2924 /* On kernel ≥4.15 with unified hierarchy, cpu.stat's usage_usec is
2925 * provided externally from the CPU controller, which means we don't
2926 * need to enable the CPU controller just to get metrics. This is good,
2927 * because enabling the CPU controller comes at a minor performance
2928 * hit, especially when it's propagated deep into large hierarchies.
2929 * There's also no separate CPU accounting controller available within
2930 * a unified hierarchy.
2931 *
2932 * This combination of factors results in the desired cgroup mask to
2933 * enable for CPU accounting varying as follows:
2934 *
2935 * ╔═════════════════════╤═════════════════════╗
2936 * ║ Linux ≥4.15 │ Linux <4.15 ║
2937 * ╔═══════════════╬═════════════════════╪═════════════════════╣
2938 * ║ Unified ║ nothing │ CGROUP_MASK_CPU ║
2939 * ╟───────────────╫─────────────────────┼─────────────────────╢
2940 * ║ Hybrid/Legacy ║ CGROUP_MASK_CPUACCT │ CGROUP_MASK_CPUACCT ║
2941 * ╚═══════════════╩═════════════════════╧═════════════════════╝
2942 *
2943 * We check kernel version here instead of manually checking whether
2944 * cpu.stat is present for every cgroup, as that check in itself would
2945 * already be fairly expensive.
2946 *
2947 * Kernels where this patch has been backported will therefore have the
2948 * CPU controller enabled unnecessarily. This is more expensive than
2949 * necessary, but harmless. ☺️
2950 */
2951
2952 if (needed_mask == (CGroupMask) -1) {
2953 if (cg_all_unified()) {
2954 struct utsname u;
2955 assert_se(uname(&u) >= 0);
2956
2957 if (str_verscmp(u.release, "4.15") < 0)
2958 needed_mask = CGROUP_MASK_CPU;
2959 else
2960 needed_mask = 0;
2961 } else
2962 needed_mask = CGROUP_MASK_CPUACCT;
2963 }
2964
2965 return needed_mask;
2966 }
2967
/* CPU accounting is "cheap" when no controller at all has to be enabled for it. */
bool cpu_accounting_is_cheap(void) {
        return !get_cpu_accounting_mask();
}