1 /* SPDX-License-Identifier: LGPL-2.1+ */
12 #include <sys/statfs.h>
13 #include <sys/types.h>
14 #include <sys/utsname.h>
15 #include <sys/xattr.h>
18 #include "alloc-util.h"
19 #include "cgroup-util.h"
21 #include "dirent-util.h"
22 #include "extract-word.h"
25 #include "format-util.h"
28 #include "login-util.h"
32 #include "parse-util.h"
33 #include "path-util.h"
34 #include "proc-cmdline.h"
35 #include "process-util.h"
38 #include "stat-util.h"
39 #include "stdio-util.h"
40 #include "string-table.h"
41 #include "string-util.h"
43 #include "unit-name.h"
44 #include "user-util.h"
46 static int cg_enumerate_items(const char *controller
, const char *path
, FILE **_f
, const char *item
) {
47 _cleanup_free_
char *fs
= NULL
;
53 r
= cg_get_path(controller
, path
, item
, &fs
);
/* Opens the "cgroup.procs" attribute of the given cgroup for reading and
 * returns the open stream via *_f. Thin convenience wrapper around
 * cg_enumerate_items() with the item name fixed to "cgroup.procs". */
int cg_enumerate_processes(const char *controller, const char *path, FILE **_f) {
        return cg_enumerate_items(controller, path, _f, "cgroup.procs");
}
69 int cg_read_pid(FILE *f
, pid_t
*_pid
) {
72 /* Note that the cgroup.procs might contain duplicates! See
73 * cgroups.txt for details. */
79 if (fscanf(f
, "%lu", &ul
) != 1) {
84 return errno_or_else(EIO
);
95 const char *controller
,
100 _cleanup_free_
char *events
= NULL
, *content
= NULL
;
103 r
= cg_get_path(controller
, path
, "cgroup.events", &events
);
107 r
= read_full_file(events
, &content
, NULL
);
111 for (const char *p
= content
;;) {
112 _cleanup_free_
char *line
= NULL
, *key
= NULL
, *val
= NULL
;
115 r
= extract_first_word(&p
, &line
, "\n", 0);
122 r
= extract_first_word(&q
, &key
, " ", 0);
128 if (!streq(key
, event
))
135 *ret
= TAKE_PTR(val
);
140 bool cg_ns_supported(void) {
141 static thread_local
int enabled
= -1;
146 if (access("/proc/self/ns/cgroup", F_OK
) < 0) {
148 log_debug_errno(errno
, "Failed to check whether /proc/self/ns/cgroup is available, assuming not: %m");
156 int cg_enumerate_subgroups(const char *controller
, const char *path
, DIR **_d
) {
157 _cleanup_free_
char *fs
= NULL
;
163 /* This is not recursive! */
165 r
= cg_get_path(controller
, path
, NULL
, &fs
);
177 int cg_read_subgroup(DIR *d
, char **fn
) {
183 FOREACH_DIRENT_ALL(de
, d
, return -errno
) {
186 if (de
->d_type
!= DT_DIR
)
189 if (dot_or_dot_dot(de
->d_name
))
192 b
= strdup(de
->d_name
);
203 int cg_rmdir(const char *controller
, const char *path
) {
204 _cleanup_free_
char *p
= NULL
;
207 r
= cg_get_path(controller
, path
, NULL
, &p
);
212 if (r
< 0 && errno
!= ENOENT
)
215 r
= cg_hybrid_unified();
219 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
220 r
= cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
);
222 log_warning_errno(r
, "Failed to remove compat systemd cgroup %s: %m", path
);
228 static int cg_kill_items(
229 const char *controller
,
234 cg_kill_log_func_t log_kill
,
238 _cleanup_set_free_ Set
*allocated_set
= NULL
;
240 int r
, ret
= 0, ret_log_kill
= 0;
245 /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
246 * SIGCONT on SIGKILL. */
247 if (IN_SET(sig
, SIGCONT
, SIGKILL
))
248 flags
&= ~CGROUP_SIGCONT
;
250 /* This goes through the tasks list and kills them all. This
251 * is repeated until no further processes are added to the
252 * tasks list, to properly handle forking processes */
255 s
= allocated_set
= set_new(NULL
);
260 my_pid
= getpid_cached();
263 _cleanup_fclose_
FILE *f
= NULL
;
267 r
= cg_enumerate_items(controller
, path
, &f
, item
);
269 if (ret
>= 0 && r
!= -ENOENT
)
275 while ((r
= cg_read_pid(f
, &pid
)) > 0) {
277 if ((flags
& CGROUP_IGNORE_SELF
) && pid
== my_pid
)
280 if (set_get(s
, PID_TO_PTR(pid
)) == PID_TO_PTR(pid
))
284 ret_log_kill
= log_kill(pid
, sig
, userdata
);
286 /* If we haven't killed this process yet, kill
288 if (kill(pid
, sig
) < 0) {
289 if (ret
>= 0 && errno
!= ESRCH
)
292 if (flags
& CGROUP_SIGCONT
)
293 (void) kill(pid
, SIGCONT
);
305 r
= set_put(s
, PID_TO_PTR(pid
));
321 /* To avoid racing against processes which fork
322 * quicker than we can kill them we repeat this until
323 * no new pids need to be killed. */
331 const char *controller
,
336 cg_kill_log_func_t log_kill
,
340 r
= cg_kill_items(controller
, path
, sig
, flags
, s
, log_kill
, userdata
, "cgroup.procs");
341 if (r
< 0 || sig
!= SIGKILL
)
344 /* Only in case of killing with SIGKILL and when using cgroupsv2, kill remaining threads manually as
345 a workaround for kernel bug. It was fixed in 5.2-rc5 (c03cd7738a83). */
346 r
= cg_unified_controller(controller
);
349 if (r
== 0) /* doesn't apply to legacy hierarchy */
352 return cg_kill_items(controller
, path
, sig
, flags
, s
, log_kill
, userdata
, "cgroup.threads");
355 int cg_kill_recursive(
356 const char *controller
,
361 cg_kill_log_func_t log_kill
,
364 _cleanup_set_free_ Set
*allocated_set
= NULL
;
365 _cleanup_closedir_
DIR *d
= NULL
;
373 s
= allocated_set
= set_new(NULL
);
378 ret
= cg_kill(controller
, path
, sig
, flags
, s
, log_kill
, userdata
);
380 r
= cg_enumerate_subgroups(controller
, path
, &d
);
382 if (ret
>= 0 && r
!= -ENOENT
)
388 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
389 _cleanup_free_
char *p
= NULL
;
391 p
= path_join(path
, fn
);
396 r
= cg_kill_recursive(controller
, p
, sig
, flags
, s
, log_kill
, userdata
);
397 if (r
!= 0 && ret
>= 0)
400 if (ret
>= 0 && r
< 0)
403 if (flags
& CGROUP_REMOVE
) {
404 r
= cg_rmdir(controller
, path
);
405 if (r
< 0 && ret
>= 0 && !IN_SET(r
, -ENOENT
, -EBUSY
))
420 _cleanup_set_free_ Set
*s
= NULL
;
433 my_pid
= getpid_cached();
436 _cleanup_fclose_
FILE *f
= NULL
;
440 r
= cg_enumerate_processes(cfrom
, pfrom
, &f
);
442 if (ret
>= 0 && r
!= -ENOENT
)
448 while ((r
= cg_read_pid(f
, &pid
)) > 0) {
450 /* This might do weird stuff if we aren't a
451 * single-threaded program. However, we
452 * luckily know we are not */
453 if ((flags
& CGROUP_IGNORE_SELF
) && pid
== my_pid
)
456 if (set_get(s
, PID_TO_PTR(pid
)) == PID_TO_PTR(pid
))
459 /* Ignore kernel threads. Since they can only
460 * exist in the root cgroup, we only check for
463 empty_or_root(pfrom
) &&
464 is_kernel_thread(pid
) > 0)
467 r
= cg_attach(cto
, pto
, pid
);
469 if (ret
>= 0 && r
!= -ESRCH
)
476 r
= set_put(s
, PID_TO_PTR(pid
));
496 int cg_migrate_recursive(
503 _cleanup_closedir_
DIR *d
= NULL
;
512 ret
= cg_migrate(cfrom
, pfrom
, cto
, pto
, flags
);
514 r
= cg_enumerate_subgroups(cfrom
, pfrom
, &d
);
516 if (ret
>= 0 && r
!= -ENOENT
)
522 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
523 _cleanup_free_
char *p
= NULL
;
525 p
= path_join(pfrom
, fn
);
530 r
= cg_migrate_recursive(cfrom
, p
, cto
, pto
, flags
);
531 if (r
!= 0 && ret
>= 0)
535 if (r
< 0 && ret
>= 0)
538 if (flags
& CGROUP_REMOVE
) {
539 r
= cg_rmdir(cfrom
, pfrom
);
540 if (r
< 0 && ret
>= 0 && !IN_SET(r
, -ENOENT
, -EBUSY
))
547 int cg_migrate_recursive_fallback(
561 r
= cg_migrate_recursive(cfrom
, pfrom
, cto
, pto
, flags
);
563 char prefix
[strlen(pto
) + 1];
565 /* This didn't work? Then let's try all prefixes of the destination */
567 PATH_FOREACH_PREFIX(prefix
, pto
) {
570 q
= cg_migrate_recursive(cfrom
, pfrom
, cto
, prefix
, flags
);
579 static const char *controller_to_dirname(const char *controller
) {
584 /* Converts a controller name to the directory name below
585 * /sys/fs/cgroup/ we want to mount it to. Effectively, this
586 * just cuts off the name= prefix used for named
587 * hierarchies, if it is specified. */
589 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
590 if (cg_hybrid_unified() > 0)
591 controller
= SYSTEMD_CGROUP_CONTROLLER_HYBRID
;
593 controller
= SYSTEMD_CGROUP_CONTROLLER_LEGACY
;
596 e
= startswith(controller
, "name=");
603 static int join_path_legacy(const char *controller
, const char *path
, const char *suffix
, char **fs
) {
610 dn
= controller_to_dirname(controller
);
612 if (isempty(path
) && isempty(suffix
))
613 t
= path_join("/sys/fs/cgroup", dn
);
614 else if (isempty(path
))
615 t
= path_join("/sys/fs/cgroup", dn
, suffix
);
616 else if (isempty(suffix
))
617 t
= path_join("/sys/fs/cgroup", dn
, path
);
619 t
= path_join("/sys/fs/cgroup", dn
, path
, suffix
);
627 static int join_path_unified(const char *path
, const char *suffix
, char **fs
) {
632 if (isempty(path
) && isempty(suffix
))
633 t
= strdup("/sys/fs/cgroup");
634 else if (isempty(path
))
635 t
= path_join("/sys/fs/cgroup", suffix
);
636 else if (isempty(suffix
))
637 t
= path_join("/sys/fs/cgroup", path
);
639 t
= path_join("/sys/fs/cgroup", path
, suffix
);
647 int cg_get_path(const char *controller
, const char *path
, const char *suffix
, char **fs
) {
655 /* If no controller is specified, we return the path
656 * *below* the controllers, without any prefix. */
658 if (!path
&& !suffix
)
666 t
= path_join(path
, suffix
);
670 *fs
= path_simplify(t
, false);
674 if (!cg_controller_is_valid(controller
))
677 r
= cg_all_unified();
681 r
= join_path_unified(path
, suffix
, fs
);
683 r
= join_path_legacy(controller
, path
, suffix
, fs
);
687 path_simplify(*fs
, false);
691 static int controller_is_accessible(const char *controller
) {
696 /* Checks whether a specific controller is accessible,
697 * i.e. its hierarchy mounted. In the unified hierarchy all
698 * controllers are considered accessible, except for the named
701 if (!cg_controller_is_valid(controller
))
704 r
= cg_all_unified();
708 /* We don't support named hierarchies if we are using
709 * the unified hierarchy. */
711 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
))
714 if (startswith(controller
, "name="))
720 dn
= controller_to_dirname(controller
);
721 cc
= strjoina("/sys/fs/cgroup/", dn
);
723 if (laccess(cc
, F_OK
) < 0)
730 int cg_get_path_and_check(const char *controller
, const char *path
, const char *suffix
, char **fs
) {
736 /* Check if the specified controller is actually accessible */
737 r
= controller_is_accessible(controller
);
741 return cg_get_path(controller
, path
, suffix
, fs
);
744 static int trim_cb(const char *path
, const struct stat
*sb
, int typeflag
, struct FTW
*ftwbuf
) {
749 if (typeflag
!= FTW_DP
)
752 if (ftwbuf
->level
< 1)
759 int cg_trim(const char *controller
, const char *path
, bool delete_root
) {
760 _cleanup_free_
char *fs
= NULL
;
765 r
= cg_get_path(controller
, path
, NULL
, &fs
);
770 if (nftw(fs
, trim_cb
, 64, FTW_DEPTH
|FTW_MOUNT
|FTW_PHYS
) != 0) {
774 r
= errno_or_else(EIO
);
778 if (rmdir(fs
) < 0 && errno
!= ENOENT
)
782 q
= cg_hybrid_unified();
785 if (q
> 0 && streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
786 q
= cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, delete_root
);
788 log_warning_errno(q
, "Failed to trim compat systemd cgroup %s: %m", path
);
794 /* Create a cgroup in the hierarchy of controller.
795 * Returns 0 if the group already existed, 1 on success, negative otherwise.
797 int cg_create(const char *controller
, const char *path
) {
798 _cleanup_free_
char *fs
= NULL
;
801 r
= cg_get_path_and_check(controller
, path
, NULL
, &fs
);
805 r
= mkdir_parents(fs
, 0755);
809 r
= mkdir_errno_wrapper(fs
, 0755);
815 r
= cg_hybrid_unified();
819 if (r
> 0 && streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
820 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
);
822 log_warning_errno(r
, "Failed to create compat systemd cgroup %s: %m", path
);
828 int cg_create_and_attach(const char *controller
, const char *path
, pid_t pid
) {
833 r
= cg_create(controller
, path
);
837 q
= cg_attach(controller
, path
, pid
);
841 /* This does not remove the cgroup on failure */
845 int cg_attach(const char *controller
, const char *path
, pid_t pid
) {
846 _cleanup_free_
char *fs
= NULL
;
847 char c
[DECIMAL_STR_MAX(pid_t
) + 2];
853 r
= cg_get_path_and_check(controller
, path
, "cgroup.procs", &fs
);
858 pid
= getpid_cached();
860 xsprintf(c
, PID_FMT
"\n", pid
);
862 r
= write_string_file(fs
, c
, WRITE_STRING_FILE_DISABLE_BUFFER
);
866 r
= cg_hybrid_unified();
870 if (r
> 0 && streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
871 r
= cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, pid
);
873 log_warning_errno(r
, "Failed to attach "PID_FMT
" to compat systemd cgroup %s: %m", pid
, path
);
879 int cg_attach_fallback(const char *controller
, const char *path
, pid_t pid
) {
886 r
= cg_attach(controller
, path
, pid
);
888 char prefix
[strlen(path
) + 1];
890 /* This didn't work? Then let's try all prefixes of
893 PATH_FOREACH_PREFIX(prefix
, path
) {
896 q
= cg_attach(controller
, prefix
, pid
);
906 const char *controller
,
916 /* cgroup v1, aka legacy/non-unified */
917 static const struct Attribute legacy_attributes
[] = {
918 { "cgroup.procs", true },
920 { "cgroup.clone_children", false },
924 /* cgroup v2, aka unified */
925 static const struct Attribute unified_attributes
[] = {
926 { "cgroup.procs", true },
927 { "cgroup.subtree_control", true },
928 { "cgroup.threads", false },
932 static const struct Attribute
* const attributes
[] = {
933 [false] = legacy_attributes
,
934 [true] = unified_attributes
,
937 _cleanup_free_
char *fs
= NULL
;
938 const struct Attribute
*i
;
943 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
946 unified
= cg_unified_controller(controller
);
950 /* Configure access to the cgroup itself */
951 r
= cg_get_path(controller
, path
, NULL
, &fs
);
955 r
= chmod_and_chown(fs
, 0755, uid
, gid
);
959 /* Configure access to the cgroup's attributes */
960 for (i
= attributes
[unified
]; i
->name
; i
++) {
963 r
= cg_get_path(controller
, path
, i
->name
, &fs
);
967 r
= chmod_and_chown(fs
, 0644, uid
, gid
);
972 log_debug_errno(r
, "Failed to set access on cgroup %s, ignoring: %m", fs
);
976 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
977 r
= cg_hybrid_unified();
981 /* Always propagate access mode from unified to legacy controller */
982 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, uid
, gid
);
984 log_debug_errno(r
, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path
);
991 int cg_set_xattr(const char *controller
, const char *path
, const char *name
, const void *value
, size_t size
, int flags
) {
992 _cleanup_free_
char *fs
= NULL
;
997 assert(value
|| size
<= 0);
999 r
= cg_get_path(controller
, path
, NULL
, &fs
);
1003 if (setxattr(fs
, name
, value
, size
, flags
) < 0)
1009 int cg_get_xattr(const char *controller
, const char *path
, const char *name
, void *value
, size_t size
) {
1010 _cleanup_free_
char *fs
= NULL
;
1017 r
= cg_get_path(controller
, path
, NULL
, &fs
);
1021 n
= getxattr(fs
, name
, value
, size
);
1028 int cg_pid_get_path(const char *controller
, pid_t pid
, char **path
) {
1029 _cleanup_fclose_
FILE *f
= NULL
;
1030 const char *fs
, *controller_str
;
1038 if (!cg_controller_is_valid(controller
))
1041 controller
= SYSTEMD_CGROUP_CONTROLLER
;
1043 unified
= cg_unified_controller(controller
);
1047 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
))
1048 controller_str
= SYSTEMD_CGROUP_CONTROLLER_LEGACY
;
1050 controller_str
= controller
;
1052 cs
= strlen(controller_str
);
1055 fs
= procfs_file_alloca(pid
, "cgroup");
1056 r
= fopen_unlocked(fs
, "re", &f
);
1063 _cleanup_free_
char *line
= NULL
;
1066 r
= read_line(f
, LONG_LINE_MAX
, &line
);
1073 e
= startswith(line
, "0:");
1083 const char *word
, *state
;
1086 l
= strchr(line
, ':');
1096 FOREACH_WORD_SEPARATOR(word
, k
, l
, ",", state
)
1097 if (k
== cs
&& memcmp(word
, controller_str
, cs
) == 0) {
1109 /* Truncate suffix indicating the process is a zombie */
1110 e
= endswith(p
, " (deleted)");
1121 int cg_install_release_agent(const char *controller
, const char *agent
) {
1122 _cleanup_free_
char *fs
= NULL
, *contents
= NULL
;
1128 r
= cg_unified_controller(controller
);
1131 if (r
> 0) /* doesn't apply to unified hierarchy */
1134 r
= cg_get_path(controller
, NULL
, "release_agent", &fs
);
1138 r
= read_one_line_file(fs
, &contents
);
1142 sc
= strstrip(contents
);
1144 r
= write_string_file(fs
, agent
, WRITE_STRING_FILE_DISABLE_BUFFER
);
1147 } else if (!path_equal(sc
, agent
))
1151 r
= cg_get_path(controller
, NULL
, "notify_on_release", &fs
);
1155 contents
= mfree(contents
);
1156 r
= read_one_line_file(fs
, &contents
);
1160 sc
= strstrip(contents
);
1161 if (streq(sc
, "0")) {
1162 r
= write_string_file(fs
, "1", WRITE_STRING_FILE_DISABLE_BUFFER
);
1169 if (!streq(sc
, "1"))
1175 int cg_uninstall_release_agent(const char *controller
) {
1176 _cleanup_free_
char *fs
= NULL
;
1179 r
= cg_unified_controller(controller
);
1182 if (r
> 0) /* Doesn't apply to unified hierarchy */
1185 r
= cg_get_path(controller
, NULL
, "notify_on_release", &fs
);
1189 r
= write_string_file(fs
, "0", WRITE_STRING_FILE_DISABLE_BUFFER
);
1195 r
= cg_get_path(controller
, NULL
, "release_agent", &fs
);
1199 r
= write_string_file(fs
, "", WRITE_STRING_FILE_DISABLE_BUFFER
);
1206 int cg_is_empty(const char *controller
, const char *path
) {
1207 _cleanup_fclose_
FILE *f
= NULL
;
1213 r
= cg_enumerate_processes(controller
, path
, &f
);
1219 r
= cg_read_pid(f
, &pid
);
1226 int cg_is_empty_recursive(const char *controller
, const char *path
) {
1231 /* The root cgroup is always populated */
1232 if (controller
&& empty_or_root(path
))
1235 r
= cg_unified_controller(controller
);
1239 _cleanup_free_
char *t
= NULL
;
1241 /* On the unified hierarchy we can check empty state
1242 * via the "populated" attribute of "cgroup.events". */
1244 r
= cg_read_event(controller
, path
, "populated", &t
);
1250 return streq(t
, "0");
1252 _cleanup_closedir_
DIR *d
= NULL
;
1255 r
= cg_is_empty(controller
, path
);
1259 r
= cg_enumerate_subgroups(controller
, path
, &d
);
1265 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
1266 _cleanup_free_
char *p
= NULL
;
1268 p
= path_join(path
, fn
);
1273 r
= cg_is_empty_recursive(controller
, p
);
1284 int cg_split_spec(const char *spec
, char **controller
, char **path
) {
1285 char *t
= NULL
, *u
= NULL
;
1291 if (!path_is_normalized(spec
))
1299 *path
= path_simplify(t
, false);
1308 e
= strchr(spec
, ':');
1310 if (!cg_controller_is_valid(spec
))
1327 t
= strndup(spec
, e
-spec
);
1330 if (!cg_controller_is_valid(t
)) {
1344 if (!path_is_normalized(u
) ||
1345 !path_is_absolute(u
)) {
1351 path_simplify(u
, false);
1367 int cg_mangle_path(const char *path
, char **result
) {
1368 _cleanup_free_
char *c
= NULL
, *p
= NULL
;
1375 /* First, check if it already is a filesystem path */
1376 if (path_startswith(path
, "/sys/fs/cgroup")) {
1382 *result
= path_simplify(t
, false);
1386 /* Otherwise, treat it as cg spec */
1387 r
= cg_split_spec(path
, &c
, &p
);
1391 return cg_get_path(c
?: SYSTEMD_CGROUP_CONTROLLER
, p
?: "/", NULL
, result
);
1394 int cg_get_root_path(char **path
) {
1400 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 1, &p
);
1404 e
= endswith(p
, "/" SPECIAL_INIT_SCOPE
);
1406 e
= endswith(p
, "/" SPECIAL_SYSTEM_SLICE
); /* legacy */
1408 e
= endswith(p
, "/system"); /* even more legacy */
1416 int cg_shift_path(const char *cgroup
, const char *root
, const char **shifted
) {
1417 _cleanup_free_
char *rt
= NULL
;
1425 /* If the root was specified let's use that, otherwise
1426 * let's determine it from PID 1 */
1428 r
= cg_get_root_path(&rt
);
1435 p
= path_startswith(cgroup
, root
);
1436 if (p
&& p
> cgroup
)
1444 int cg_pid_get_path_shifted(pid_t pid
, const char *root
, char **cgroup
) {
1445 _cleanup_free_
char *raw
= NULL
;
1452 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &raw
);
1456 r
= cg_shift_path(raw
, root
, &c
);
1461 *cgroup
= TAKE_PTR(raw
);
1475 int cg_path_decode_unit(const char *cgroup
, char **unit
) {
1482 n
= strcspn(cgroup
, "/");
1486 c
= strndupa(cgroup
, n
);
1489 if (!unit_name_is_valid(c
, UNIT_NAME_PLAIN
|UNIT_NAME_INSTANCE
))
1500 static bool valid_slice_name(const char *p
, size_t n
) {
1505 if (n
< STRLEN("x.slice"))
1508 if (memcmp(p
+ n
- 6, ".slice", 6) == 0) {
1514 c
= cg_unescape(buf
);
1516 return unit_name_is_valid(c
, UNIT_NAME_PLAIN
);
1522 static const char *skip_slices(const char *p
) {
1525 /* Skips over all slice assignments */
1530 p
+= strspn(p
, "/");
1532 n
= strcspn(p
, "/");
1533 if (!valid_slice_name(p
, n
))
1540 int cg_path_get_unit(const char *path
, char **ret
) {
1548 e
= skip_slices(path
);
1550 r
= cg_path_decode_unit(e
, &unit
);
1554 /* We skipped over the slices, don't accept any now */
1555 if (endswith(unit
, ".slice")) {
1564 int cg_pid_get_unit(pid_t pid
, char **unit
) {
1565 _cleanup_free_
char *cgroup
= NULL
;
1570 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1574 return cg_path_get_unit(cgroup
, unit
);
1578 * Skip session-*.scope, but require it to be there.
1580 static const char *skip_session(const char *p
) {
1586 p
+= strspn(p
, "/");
1588 n
= strcspn(p
, "/");
1589 if (n
< STRLEN("session-x.scope"))
1592 if (memcmp(p
, "session-", 8) == 0 && memcmp(p
+ n
- 6, ".scope", 6) == 0) {
1593 char buf
[n
- 8 - 6 + 1];
1595 memcpy(buf
, p
+ 8, n
- 8 - 6);
1598 /* Note that session scopes never need unescaping,
1599 * since they cannot conflict with the kernel's own
1600 * names, hence we don't need to call cg_unescape()
1603 if (!session_id_valid(buf
))
1607 p
+= strspn(p
, "/");
1615 * Skip user@*.service, but require it to be there.
1617 static const char *skip_user_manager(const char *p
) {
1623 p
+= strspn(p
, "/");
1625 n
= strcspn(p
, "/");
1626 if (n
< STRLEN("user@x.service"))
1629 if (memcmp(p
, "user@", 5) == 0 && memcmp(p
+ n
- 8, ".service", 8) == 0) {
1630 char buf
[n
- 5 - 8 + 1];
1632 memcpy(buf
, p
+ 5, n
- 5 - 8);
1635 /* Note that user manager services never need unescaping,
1636 * since they cannot conflict with the kernel's own
1637 * names, hence we don't need to call cg_unescape()
1640 if (parse_uid(buf
, NULL
) < 0)
1644 p
+= strspn(p
, "/");
1652 static const char *skip_user_prefix(const char *path
) {
1657 /* Skip slices, if there are any */
1658 e
= skip_slices(path
);
1660 /* Skip the user manager, if it's in the path now... */
1661 t
= skip_user_manager(e
);
1665 /* Alternatively skip the user session if it is in the path... */
1666 return skip_session(e
);
1669 int cg_path_get_user_unit(const char *path
, char **ret
) {
1675 t
= skip_user_prefix(path
);
1679 /* And from here on it looks pretty much the same as for a
1680 * system unit, hence let's use the same parser from here
1682 return cg_path_get_unit(t
, ret
);
1685 int cg_pid_get_user_unit(pid_t pid
, char **unit
) {
1686 _cleanup_free_
char *cgroup
= NULL
;
1691 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1695 return cg_path_get_user_unit(cgroup
, unit
);
1698 int cg_path_get_machine_name(const char *path
, char **machine
) {
1699 _cleanup_free_
char *u
= NULL
;
1703 r
= cg_path_get_unit(path
, &u
);
1707 sl
= strjoina("/run/systemd/machines/unit:", u
);
1708 return readlink_malloc(sl
, machine
);
1711 int cg_pid_get_machine_name(pid_t pid
, char **machine
) {
1712 _cleanup_free_
char *cgroup
= NULL
;
1717 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1721 return cg_path_get_machine_name(cgroup
, machine
);
1724 int cg_path_get_session(const char *path
, char **session
) {
1725 _cleanup_free_
char *unit
= NULL
;
1731 r
= cg_path_get_unit(path
, &unit
);
1735 start
= startswith(unit
, "session-");
1738 end
= endswith(start
, ".scope");
1743 if (!session_id_valid(start
))
1759 int cg_pid_get_session(pid_t pid
, char **session
) {
1760 _cleanup_free_
char *cgroup
= NULL
;
1763 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1767 return cg_path_get_session(cgroup
, session
);
1770 int cg_path_get_owner_uid(const char *path
, uid_t
*uid
) {
1771 _cleanup_free_
char *slice
= NULL
;
1777 r
= cg_path_get_slice(path
, &slice
);
1781 start
= startswith(slice
, "user-");
1784 end
= endswith(start
, ".slice");
1789 if (parse_uid(start
, uid
) < 0)
1795 int cg_pid_get_owner_uid(pid_t pid
, uid_t
*uid
) {
1796 _cleanup_free_
char *cgroup
= NULL
;
1799 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1803 return cg_path_get_owner_uid(cgroup
, uid
);
1806 int cg_path_get_slice(const char *p
, char **slice
) {
1807 const char *e
= NULL
;
1812 /* Finds the right-most slice unit from the beginning, but
1813 * stops before we come to the first non-slice unit. */
1818 p
+= strspn(p
, "/");
1820 n
= strcspn(p
, "/");
1821 if (!valid_slice_name(p
, n
)) {
1826 s
= strdup(SPECIAL_ROOT_SLICE
);
1834 return cg_path_decode_unit(e
, slice
);
1842 int cg_pid_get_slice(pid_t pid
, char **slice
) {
1843 _cleanup_free_
char *cgroup
= NULL
;
1848 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1852 return cg_path_get_slice(cgroup
, slice
);
1855 int cg_path_get_user_slice(const char *p
, char **slice
) {
1860 t
= skip_user_prefix(p
);
1864 /* And now it looks pretty much the same as for a system
1865 * slice, so let's just use the same parser from here on. */
1866 return cg_path_get_slice(t
, slice
);
1869 int cg_pid_get_user_slice(pid_t pid
, char **slice
) {
1870 _cleanup_free_
char *cgroup
= NULL
;
1875 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1879 return cg_path_get_user_slice(cgroup
, slice
);
1882 char *cg_escape(const char *p
) {
1883 bool need_prefix
= false;
1885 /* This implements very minimal escaping for names to be used
1886 * as file names in the cgroup tree: any name which might
1887 * conflict with a kernel name or is prefixed with '_' is
1888 * prefixed with a '_'. That way, when reading cgroup names it
1889 * is sufficient to remove a single prefixing underscore if
1892 /* The return value of this function (unlike cg_unescape())
1895 if (IN_SET(p
[0], 0, '_', '.') ||
1896 STR_IN_SET(p
, "notify_on_release", "release_agent", "tasks") ||
1897 startswith(p
, "cgroup."))
1902 dot
= strrchr(p
, '.');
1907 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
1910 n
= cgroup_controller_to_string(c
);
1915 if (memcmp(p
, n
, l
) != 0)
1925 return strjoin("_", p
);
1930 char *cg_unescape(const char *p
) {
1933 /* The return value of this function (unlike cg_escape())
1934 * doesn't need free()! */
1942 #define CONTROLLER_VALID \
1946 bool cg_controller_is_valid(const char *p
) {
1952 if (streq(p
, SYSTEMD_CGROUP_CONTROLLER
))
1955 s
= startswith(p
, "name=");
1959 if (IN_SET(*p
, 0, '_'))
1962 for (t
= p
; *t
; t
++)
1963 if (!strchr(CONTROLLER_VALID
, *t
))
1966 if (t
- p
> FILENAME_MAX
)
1972 int cg_slice_to_path(const char *unit
, char **ret
) {
1973 _cleanup_free_
char *p
= NULL
, *s
= NULL
, *e
= NULL
;
1980 if (streq(unit
, SPECIAL_ROOT_SLICE
)) {
1990 if (!unit_name_is_valid(unit
, UNIT_NAME_PLAIN
))
1993 if (!endswith(unit
, ".slice"))
1996 r
= unit_name_to_prefix(unit
, &p
);
2000 dash
= strchr(p
, '-');
2002 /* Don't allow initial dashes */
2007 _cleanup_free_
char *escaped
= NULL
;
2008 char n
[dash
- p
+ sizeof(".slice")];
2010 #if HAS_FEATURE_MEMORY_SANITIZER
2011 /* msan doesn't instrument stpncpy, so it thinks
2012 * n is later used uninitialized:
2013 * https://github.com/google/sanitizers/issues/926
2018 /* Don't allow trailing or double dashes */
2019 if (IN_SET(dash
[1], 0, '-'))
2022 strcpy(stpncpy(n
, p
, dash
- p
), ".slice");
2023 if (!unit_name_is_valid(n
, UNIT_NAME_PLAIN
))
2026 escaped
= cg_escape(n
);
2030 if (!strextend(&s
, escaped
, "/", NULL
))
2033 dash
= strchr(dash
+1, '-');
2036 e
= cg_escape(unit
);
2040 if (!strextend(&s
, e
, NULL
))
2048 int cg_set_attribute(const char *controller
, const char *path
, const char *attribute
, const char *value
) {
2049 _cleanup_free_
char *p
= NULL
;
2052 r
= cg_get_path(controller
, path
, attribute
, &p
);
2056 return write_string_file(p
, value
, WRITE_STRING_FILE_DISABLE_BUFFER
);
2059 int cg_get_attribute(const char *controller
, const char *path
, const char *attribute
, char **ret
) {
2060 _cleanup_free_
char *p
= NULL
;
2063 r
= cg_get_path(controller
, path
, attribute
, &p
);
2067 return read_one_line_file(p
, ret
);
2070 int cg_get_keyed_attribute(
2071 const char *controller
,
2073 const char *attribute
,
2075 char **ret_values
) {
2077 _cleanup_free_
char *filename
= NULL
, *contents
= NULL
;
2079 size_t n
, i
, n_done
= 0;
2083 /* Reads one or more fields of a cgroup v2 keyed attribute file. The 'keys' parameter should be an strv with
2084 * all keys to retrieve. The 'ret_values' parameter should be passed as a string array with the same number of
2085 * entries as 'keys'. On success each entry will be set to the value of the matching key.
2087 * If the attribute file doesn't exist at all returns ENOENT, if any key is not found returns ENXIO. */
2089 r
= cg_get_path(controller
, path
, attribute
, &filename
);
2093 r
= read_full_file(filename
, &contents
, NULL
);
2097 n
= strv_length(keys
);
2098 if (n
== 0) /* No keys to retrieve? That's easy, we are done then */
2101 /* Let's build this up in a temporary array for now in order not to clobber the return parameter on failure */
2102 v
= newa0(char*, n
);
2104 for (p
= contents
; *p
;) {
2105 const char *w
= NULL
;
2107 for (i
= 0; i
< n
; i
++)
2109 w
= first_word(p
, keys
[i
]);
2117 l
= strcspn(w
, NEWLINE
);
2118 v
[i
] = strndup(w
, l
);
2130 p
+= strcspn(p
, NEWLINE
);
2132 p
+= strspn(p
, NEWLINE
);
2138 for (i
= 0; i
< n
; i
++)
2144 memcpy(ret_values
, v
, sizeof(char*) * n
);
2149 int cg_create_everywhere(CGroupMask supported
, CGroupMask mask
, const char *path
) {
2155 /* This one will create a cgroup in our private tree, but also
2156 * duplicate it in the trees specified in mask, and remove it
2159 * Returns 0 if the group already existed in the systemd hierarchy,
2160 * 1 on success, negative otherwise.
2163 /* First create the cgroup in our own hierarchy. */
2164 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER
, path
);
2169 /* If we are in the unified hierarchy, we are done now */
2170 r
= cg_all_unified();
2176 supported
&= CGROUP_MASK_V1
;
2177 mask
= CGROUP_MASK_EXTEND_JOINED(mask
);
2180 /* Otherwise, do the same in the other hierarchies */
2181 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
2182 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
2185 if (!FLAGS_SET(supported
, bit
))
2188 if (FLAGS_SET(done
, bit
))
2191 n
= cgroup_controller_to_string(c
);
2192 if (FLAGS_SET(mask
, bit
))
2193 (void) cg_create(n
, path
);
2195 (void) cg_trim(n
, path
, true);
2197 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
2203 int cg_attach_everywhere(CGroupMask supported
, const char *path
, pid_t pid
, cg_migrate_callback_t path_callback
, void *userdata
) {
2208 r
= cg_attach(SYSTEMD_CGROUP_CONTROLLER
, path
, pid
);
2212 r
= cg_all_unified();
2218 supported
&= CGROUP_MASK_V1
;
2221 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
2222 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
2223 const char *p
= NULL
;
2225 if (!FLAGS_SET(supported
, bit
))
2228 if (FLAGS_SET(done
, bit
))
2232 p
= path_callback(bit
, userdata
);
2236 (void) cg_attach_fallback(cgroup_controller_to_string(c
), p
, pid
);
2237 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
2243 int cg_attach_many_everywhere(CGroupMask supported
, const char *path
, Set
* pids
, cg_migrate_callback_t path_callback
, void *userdata
) {
2248 SET_FOREACH(pidp
, pids
, i
) {
2249 pid_t pid
= PTR_TO_PID(pidp
);
2252 q
= cg_attach_everywhere(supported
, path
, pid
, path_callback
, userdata
);
2253 if (q
< 0 && r
>= 0)
2260 int cg_migrate_everywhere(CGroupMask supported
, const char *from
, const char *to
, cg_migrate_callback_t to_callback
, void *userdata
) {
2265 if (!path_equal(from
, to
)) {
2266 r
= cg_migrate_recursive(SYSTEMD_CGROUP_CONTROLLER
, from
, SYSTEMD_CGROUP_CONTROLLER
, to
, CGROUP_REMOVE
);
2271 q
= cg_all_unified();
2277 supported
&= CGROUP_MASK_V1
;
2280 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
2281 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
2282 const char *p
= NULL
;
2284 if (!FLAGS_SET(supported
, bit
))
2287 if (FLAGS_SET(done
, bit
))
2291 p
= to_callback(bit
, userdata
);
2295 (void) cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER
, to
, cgroup_controller_to_string(c
), p
, 0);
2296 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
2302 int cg_trim_everywhere(CGroupMask supported
, const char *path
, bool delete_root
) {
2307 r
= cg_trim(SYSTEMD_CGROUP_CONTROLLER
, path
, delete_root
);
2311 q
= cg_all_unified();
2317 supported
&= CGROUP_MASK_V1
;
2320 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
2321 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
2323 if (!FLAGS_SET(supported
, bit
))
2326 if (FLAGS_SET(done
, bit
))
2329 (void) cg_trim(cgroup_controller_to_string(c
), path
, delete_root
);
2330 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
/* cg_mask_to_string(): renders the set of controllers in 'mask' as a single
 * space-separated, newly-allocated string returned via 'ret'. NOTE(review):
 * elided extract — the NUL-termination, 'space' toggling and final TAKE_PTR
 * between numbered lines are not visible here. */
2336 int cg_mask_to_string(CGroupMask mask
, char **ret
) {
2337 _cleanup_free_
char *s
= NULL
;
/* n = bytes used so far, allocated = current buffer capacity (GREEDY_REALLOC bookkeeping). */
2338 size_t n
= 0, allocated
= 0;
2349 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
2353 if (!FLAGS_SET(mask
, CGROUP_CONTROLLER_TO_MASK(c
)))
2356 k
= cgroup_controller_to_string(c
);
/* Grow buffer for: optional separator space + controller name + NUL. */
2359 if (!GREEDY_REALLOC(s
, allocated
, n
+ space
+ l
+ 1))
2364 memcpy(s
+ n
+ space
, k
, l
);
/* cg_mask_from_string(): parses a whitespace-separated list of controller names
 * (e.g. the contents of cgroup.controllers) into a CGroupMask returned via 'ret'.
 * NOTE(review): elided extract — loop construct, handling of unknown names and
 * the final assignment to *ret are not visible between the numbered lines. */
2378 int cg_mask_from_string(const char *value
, CGroupMask
*ret
) {
2385 _cleanup_free_
char *n
= NULL
;
/* Pull the next token off 'value'; NULL separators = default whitespace. */
2389 r
= extract_first_word(&value
, &n
, NULL
, 0);
2395 v
= cgroup_controller_from_string(n
);
/* Accumulate the recognized controller into the result mask. */
2399 m
|= CGROUP_CONTROLLER_TO_MASK(v
);
/* cg_mask_supported(): computes the mask of controllers usable on this system,
 * returned via 'ret'. Unified: read from the root cgroup.controllers attribute;
 * legacy: probe each v1 controller's mount. NOTE(review): elided extract —
 * error checks and the final *ret assignment are not visible. */
2406 int cg_mask_supported(CGroupMask
*ret
) {
2410 /* Determines the mask of supported cgroup controllers. Only includes controllers we can make sense of and that
2411 * are actually accessible. Only covers real controllers, i.e. not the CGROUP_CONTROLLER_BPF_xyz
2412 * pseudo-controllers. */
2414 r
= cg_all_unified();
2418 _cleanup_free_
char *root
= NULL
, *controllers
= NULL
, *path
= NULL
;
2420 /* In the unified hierarchy we can read the supported
2421 * and accessible controllers from the top-level
2422 * cgroup attribute */
2424 r
= cg_get_root_path(&root
);
2428 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, root
, "cgroup.controllers", &path
);
2432 r
= read_one_line_file(path
, &controllers
);
2436 r
= cg_mask_from_string(controllers
, &mask
);
2440 /* Currently, we support the cpu, memory, io and pids controller in the unified hierarchy, mask
2441 * everything else off. */
2442 mask
&= CGROUP_MASK_V2
;
2447 /* In the legacy hierarchy, we check which hierarchies are mounted. */
2450 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
2451 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
/* Only real v1 controllers can exist as legacy mounts; skip v2-only/pseudo ones. */
2454 if (!FLAGS_SET(CGROUP_MASK_V1
, bit
))
2457 n
= cgroup_controller_to_string(c
);
/* controller_is_accessible() >= 0 means the v1 hierarchy is mounted and usable. */
2458 if (controller_is_accessible(n
) >= 0)
/* cg_kernel_controllers(): parses /proc/cgroups and returns (via 'ret') the set
 * of controller names the running kernel knows about. NOTE(review): elided
 * extract — the read loop, EOF handling and frees between numbered lines are
 * not visible here. */
2467 int cg_kernel_controllers(Set
**ret
) {
2468 _cleanup_set_free_free_ Set
*controllers
= NULL
;
2469 _cleanup_fclose_
FILE *f
= NULL
;
2474 /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support
2475 * and controllers that aren't currently accessible (because not mounted). This does not include "name="
2476 * pseudo-controllers. */
2478 controllers
= set_new(&string_hash_ops
);
2482 r
= fopen_unlocked("/proc/cgroups", "re", &f
);
2490 /* Ignore the header line */
2491 (void) read_line(f
, (size_t) -1, NULL
);
/* Each data line: "<name> <hierarchy-id> <num-cgroups> <enabled>"; we only need
 * the name (%ms allocates it) and the enabled flag. */
2498 if (fscanf(f
, "%ms %*i %*i %i", &controller
, &enabled
) != 2) {
2504 return errno_or_else(EIO
);
2514 if (!cg_controller_is_valid(controller
)) {
/* set_consume() takes ownership of 'controller' even on failure. */
2519 r
= set_consume(controllers
, controller
);
2524 *ret
= TAKE_PTR(controllers
);
/* Per-thread cache of the detected cgroup layout; filled in lazily by
 * cg_unified_update() and reset by cg_unified_flush(). */
2529 static thread_local CGroupUnified unified_cache
= CGROUP_UNIFIED_UNKNOWN
;
2531 /* The hybrid mode was initially implemented in v232 and simply mounted cgroup2 on /sys/fs/cgroup/systemd. This
2532 * unfortunately broke other tools (such as docker) which expected the v1 "name=systemd" hierarchy on
2533 * /sys/fs/cgroup/systemd. From v233 and on, the hybrid mode mounts v2 on /sys/fs/cgroup/unified and maintains
2534 * "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility with other tools.
2536 * To keep live upgrade working, we detect and support v232 layout. When v232 layout is detected, to keep cgroup v2
2537 * process management but disable the compat dual layout, we return %true on
2538 * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and %false on cg_hybrid_unified().
/* True when the legacy v232-style hybrid layout (cgroup2 mounted directly on
 * /sys/fs/cgroup/systemd) was detected by cg_unified_update(). */
2540 static thread_local
bool unified_systemd_v232
;
2542 static int cg_unified_update(void) {
2546 /* Checks if we support the unified hierarchy. Returns an
2547 * error when the cgroup hierarchies aren't mounted yet or we
2548 * have any other trouble determining if the unified hierarchy
2551 if (unified_cache
>= CGROUP_UNIFIED_NONE
)
2554 if (statfs("/sys/fs/cgroup/", &fs
) < 0)
2555 return log_debug_errno(errno
, "statfs(\"/sys/fs/cgroup/\") failed: %m");
2557 if (F_TYPE_EQUAL(fs
.f_type
, CGROUP2_SUPER_MAGIC
)) {
2558 log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
2559 unified_cache
= CGROUP_UNIFIED_ALL
;
2560 } else if (F_TYPE_EQUAL(fs
.f_type
, TMPFS_MAGIC
)) {
2561 if (statfs("/sys/fs/cgroup/unified/", &fs
) == 0 &&
2562 F_TYPE_EQUAL(fs
.f_type
, CGROUP2_SUPER_MAGIC
)) {
2563 log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller");
2564 unified_cache
= CGROUP_UNIFIED_SYSTEMD
;
2565 unified_systemd_v232
= false;
2567 if (statfs("/sys/fs/cgroup/systemd/", &fs
) < 0)
2568 return log_debug_errno(errno
, "statfs(\"/sys/fs/cgroup/systemd\" failed: %m");
2570 if (F_TYPE_EQUAL(fs
.f_type
, CGROUP2_SUPER_MAGIC
)) {
2571 log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2572 unified_cache
= CGROUP_UNIFIED_SYSTEMD
;
2573 unified_systemd_v232
= true;
2574 } else if (F_TYPE_EQUAL(fs
.f_type
, CGROUP_SUPER_MAGIC
)) {
2575 log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2576 unified_cache
= CGROUP_UNIFIED_NONE
;
2578 log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
2579 (unsigned long long) fs
.f_type
);
2580 unified_cache
= CGROUP_UNIFIED_NONE
;
2584 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM
),
2585 "Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
2586 (unsigned long long)fs
.f_type
);
/* cg_unified_controller(): returns > 0 if the given controller is managed via
 * cgroup v2 on this system, 0 if via v1, negative errno on detection failure.
 * NOTE(review): elided extract — the returns inside the two 'if's are not
 * visible between the numbered lines. */
2591 int cg_unified_controller(const char *controller
) {
2594 r
= cg_unified_update();
2598 if (unified_cache
== CGROUP_UNIFIED_NONE
)
2601 if (unified_cache
>= CGROUP_UNIFIED_ALL
)
/* Hybrid mode: only the systemd (named) controller lives on cgroup v2. */
2604 return streq_ptr(controller
, SYSTEMD_CGROUP_CONTROLLER
);
/* cg_all_unified(): > 0 if the fully unified (v2-only) hierarchy is in use,
 * 0 otherwise, negative errno if detection failed. NOTE(review): the error
 * propagation for r is elided from this extract. */
2607 int cg_all_unified(void) {
2610 r
= cg_unified_update();
2614 return unified_cache
>= CGROUP_UNIFIED_ALL
;
/* cg_hybrid_unified(): > 0 only for the modern (v233+) hybrid layout — cgroup2
 * on /sys/fs/cgroup/unified plus v1 hierarchies. Deliberately false for the
 * v232 variant (see the comment above unified_systemd_v232). NOTE(review):
 * error propagation for r is elided from this extract. */
2617 int cg_hybrid_unified(void) {
2620 r
= cg_unified_update();
2624 return unified_cache
== CGROUP_UNIFIED_SYSTEMD
&& !unified_systemd_v232
;
2627 int cg_unified_flush(void) {
2628 unified_cache
= CGROUP_UNIFIED_UNKNOWN
;
2630 return cg_unified_update();
/* cg_enable_everywhere(): enables/disables cgroup v2 controllers for a cgroup by
 * writing "+name"/"-name" tokens to its cgroup.subtree_control file, and reports
 * (via ret_result_mask) the mask of controllers actually enabled afterwards.
 * NOTE(review): elided extract — several returns, 'continue's and the 'mask'/'p'
 * parameters' declarations fall between the numbered lines and are not visible. */
2633 int cg_enable_everywhere(
2634 CGroupMask supported
,
2637 CGroupMask
*ret_result_mask
) {
2639 _cleanup_fclose_
FILE *f
= NULL
;
2640 _cleanup_free_
char *fs
= NULL
;
/* Nothing supported at all: trivially done. */
2647 if (supported
== 0) {
2648 if (ret_result_mask
)
2649 *ret_result_mask
= 0;
2653 r
= cg_all_unified();
2657 /* On the legacy hierarchy there's no concept of "enabling" controllers in cgroups defined. Let's claim
2658 * complete success right away. (If you wonder why we return the full mask here, rather than zero: the
2659 * caller tends to use the returned mask later on to compare if all controllers were properly joined,
2660 * and if not requeues realization. This use is the primary purpose of the return value, hence let's
2661 * minimize surprises here and reduce triggers for re-realization by always saying we fully
2663 if (ret_result_mask
)
2664 *ret_result_mask
= mask
& supported
& CGROUP_MASK_V2
; /* If you wonder why we mask this with
2665 * CGROUP_MASK_V2: The 'supported' mask
2666 * might contain pure-V1 or BPF
2667 * controllers, and we never want to
2668 * claim that we could enable those with
2669 * cgroup.subtree_control */
2673 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, p
, "cgroup.subtree_control", &fs
);
2677 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
2678 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
/* Only v2-capable and supported controllers can be toggled here. */
2681 if (!FLAGS_SET(CGROUP_MASK_V2
, bit
))
2684 if (!FLAGS_SET(supported
, bit
))
2687 n
= cgroup_controller_to_string(c
);
/* Build the "+name" or "-name" token on the stack: sign + name + NUL. */
2689 char s
[1 + strlen(n
) + 1];
2691 s
[0] = FLAGS_SET(mask
, bit
) ? '+' : '-';
/* The file is (re)opened lazily and reused across loop iterations. */
2695 f
= fopen(fs
, "we");
2697 return log_debug_errno(errno
, "Failed to open cgroup.subtree_control file of %s: %m", p
);
2700 r
= write_string_stream(f
, s
, WRITE_STRING_FILE_DISABLE_BUFFER
);
2702 log_debug_errno(r
, "Failed to %s controller %s for %s (%s): %m",
2703 FLAGS_SET(mask
, bit
) ? "enable" : "disable", n
, p
, fs
);
2706 /* If we can't turn off a controller, leave it on in the reported resulting mask. This
2707 * happens for example when we attempt to turn off a controller up in the tree that is
2708 * used down in the tree. */
2709 if (!FLAGS_SET(mask
, bit
) && r
== -EBUSY
) /* You might wonder why we check for EBUSY
2710 * only here, and not follow the same logic
2711 * for other errors such as EINVAL or
2712 * EOPNOTSUPP or anything else. That's
2713 * because EBUSY indicates that the
2714 * controller is currently enabled and
2715 * cannot be disabled because something down
2716 * the hierarchy is still using it. Any other
2717 * error most likely means something like "I
2718 * never heard of this controller" or
2719 * similar. In the former case it's hence
2720 * safe to assume the controller is still on
2721 * after the failed operation, while in the
2722 * latter case it's safer to assume the
2723 * controller is unknown and hence certainly
2727 /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */
2728 if (FLAGS_SET(mask
, bit
))
2734 /* Let's return the precise set of controllers now enabled for the cgroup. */
2735 if (ret_result_mask
)
2736 *ret_result_mask
= ret
;
/* cg_is_unified_wanted(): decides (and caches per-thread) whether the fully
 * unified hierarchy should be used: follows an existing mount, then the
 * systemd.unified_cgroup_hierarchy= and cgroup_no_v1= kernel command line
 * options, then the compile-time default. NOTE(review): elided extract — the
 * cache-hit return and some declarations are not visible between numbered lines. */
2741 bool cg_is_unified_wanted(void) {
2742 static thread_local
int wanted
= -1;
2745 const bool is_default
= DEFAULT_HIERARCHY
== CGROUP_UNIFIED_ALL
;
2746 _cleanup_free_
char *c
= NULL
;
2748 /* If we have a cached value, return that. */
2752 /* If the hierarchy is already mounted, then follow whatever
2753 * was chosen for it. */
2754 if (cg_unified_flush() >= 0)
2755 return (wanted
= unified_cache
>= CGROUP_UNIFIED_ALL
);
2757 /* If we were explicitly passed systemd.unified_cgroup_hierarchy,
2759 r
= proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b
);
2761 return (wanted
= b
);
2763 /* If we passed cgroup_no_v1=all with no other instructions, it seems
2764 * highly unlikely that we want to use hybrid or legacy hierarchy. */
2765 r
= proc_cmdline_get_key("cgroup_no_v1", 0, &c
);
2766 if (r
> 0 && streq_ptr(c
, "all"))
2767 return (wanted
= true);
2769 return (wanted
= is_default
);
/* cg_is_legacy_wanted(): true when at least some v1 (legacy) hierarchies should
 * be used, i.e. whenever full unified was neither mounted nor wanted. Result is
 * cached per-thread. NOTE(review): the cache-hit return is elided here. */
2772 bool cg_is_legacy_wanted(void) {
2773 static thread_local
int wanted
= -1;
2775 /* If we have a cached value, return that. */
2779 /* Check if we have cgroup v2 already mounted. */
2780 if (cg_unified_flush() >= 0 &&
2781 unified_cache
== CGROUP_UNIFIED_ALL
)
2782 return (wanted
= false);
2784 /* Otherwise, assume that at least partial legacy is wanted,
2785 * since cgroup v2 should already be mounted at this point. */
2786 return (wanted
= true);
/* cg_is_hybrid_wanted(): decides (and caches per-thread) whether the hybrid
 * layout should be used when legacy is wanted at all — governed by an existing
 * mount, the systemd.legacy_systemd_cgroup_controller= kernel option (negated),
 * and the compile-time default. NOTE(review): elided extract — cache-hit return
 * and declarations of r/b are not visible between the numbered lines. */
2789 bool cg_is_hybrid_wanted(void) {
2790 static thread_local
int wanted
= -1;
2793 const bool is_default
= DEFAULT_HIERARCHY
>= CGROUP_UNIFIED_SYSTEMD
;
2794 /* We default to true if the default is "hybrid", obviously,
2795 * but also when the default is "unified", because if we get
2796 * called, it means that unified hierarchy was not mounted. */
2798 /* If we have a cached value, return that. */
2802 /* If the hierarchy is already mounted, then follow whatever
2803 * was chosen for it. */
2804 if (cg_unified_flush() >= 0 &&
2805 unified_cache
== CGROUP_UNIFIED_ALL
)
2806 return (wanted
= false);
2808 /* Otherwise, let's see what the kernel command line has to say.
2809 * Since checking is expensive, cache a non-error result. */
2810 r
= proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", &b
);
2812 /* The meaning of the kernel option is reversed wrt. to the return value
2813 * of this function, hence the negation. */
2814 return (wanted
= r
> 0 ? !b
: is_default
);
/* cg_weight_parse(): parses a cgroup v2 weight value from string 's' into *ret.
 * An empty string yields CGROUP_WEIGHT_INVALID; otherwise the number must lie in
 * [CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX]. NOTE(review): elided extract — the
 * isempty() branch, error returns and final *ret store are not visible here. */
2817 int cg_weight_parse(const char *s
, uint64_t *ret
) {
2822 *ret
= CGROUP_WEIGHT_INVALID
;
2826 r
= safe_atou64(s
, &u
);
/* Reject out-of-range values (presumably with -ERANGE — elided). */
2830 if (u
< CGROUP_WEIGHT_MIN
|| u
> CGROUP_WEIGHT_MAX
)
/* Default (i.e. "no limit") values for each IO limit type: everything starts
 * out unrestricted at CGROUP_LIMIT_MAX. */
2837 const uint64_t cgroup_io_limit_defaults
[_CGROUP_IO_LIMIT_TYPE_MAX
] = {
2838 [CGROUP_IO_RBPS_MAX
] = CGROUP_LIMIT_MAX
,
2839 [CGROUP_IO_WBPS_MAX
] = CGROUP_LIMIT_MAX
,
2840 [CGROUP_IO_RIOPS_MAX
] = CGROUP_LIMIT_MAX
,
2841 [CGROUP_IO_WIOPS_MAX
] = CGROUP_LIMIT_MAX
,
/* Maps each CGroupIOLimitType to its unit-file property name; the macro below
 * generates cgroup_io_limit_type_to_string()/_from_string() from this table. */
2844 static const char* const cgroup_io_limit_type_table
[_CGROUP_IO_LIMIT_TYPE_MAX
] = {
2845 [CGROUP_IO_RBPS_MAX
] = "IOReadBandwidthMax",
2846 [CGROUP_IO_WBPS_MAX
] = "IOWriteBandwidthMax",
2847 [CGROUP_IO_RIOPS_MAX
] = "IOReadIOPSMax",
2848 [CGROUP_IO_WIOPS_MAX
] = "IOWriteIOPSMax",
2851 DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type
, CGroupIOLimitType
);
/* cg_cpu_shares_parse(): parses a legacy (v1) CPU shares value from 's' into
 * *ret. Empty string yields CGROUP_CPU_SHARES_INVALID; otherwise the value must
 * lie in [CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX]. NOTE(review): elided
 * extract — the isempty() branch and error/success returns are not visible. */
2853 int cg_cpu_shares_parse(const char *s
, uint64_t *ret
) {
2858 *ret
= CGROUP_CPU_SHARES_INVALID
;
2862 r
= safe_atou64(s
, &u
);
2866 if (u
< CGROUP_CPU_SHARES_MIN
|| u
> CGROUP_CPU_SHARES_MAX
)
/* cg_blkio_weight_parse(): parses a legacy (v1) blkio weight from 's' into *ret.
 * Empty string yields CGROUP_BLKIO_WEIGHT_INVALID; otherwise the value must lie
 * in [CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX]. NOTE(review): elided
 * extract — the isempty() branch and error/success returns are not visible. */
2873 int cg_blkio_weight_parse(const char *s
, uint64_t *ret
) {
2878 *ret
= CGROUP_BLKIO_WEIGHT_INVALID
;
2882 r
= safe_atou64(s
, &u
);
2886 if (u
< CGROUP_BLKIO_WEIGHT_MIN
|| u
> CGROUP_BLKIO_WEIGHT_MAX
)
2893 bool is_cgroup_fs(const struct statfs
*s
) {
2894 return is_fs_type(s
, CGROUP_SUPER_MAGIC
) ||
2895 is_fs_type(s
, CGROUP2_SUPER_MAGIC
);
/* Returns true if 'fd' refers to a file on a cgroup (v1 or v2) file system.
 * A failing fstatfs() is treated as "not a cgroup fs". */
bool fd_is_cgroup_fs(int fd) {
        struct statfs s;

        if (fstatfs(fd, &s) < 0)
                return false;

        return is_cgroup_fs(&s);
}
/* Maps each CGroupController enum value to its canonical name, including the
 * BPF pseudo-controllers; the macro below generates the to/from-string pair. */
2907 static const char *const cgroup_controller_table
[_CGROUP_CONTROLLER_MAX
] = {
2908 [CGROUP_CONTROLLER_CPU
] = "cpu",
2909 [CGROUP_CONTROLLER_CPUACCT
] = "cpuacct",
2910 [CGROUP_CONTROLLER_IO
] = "io",
2911 [CGROUP_CONTROLLER_BLKIO
] = "blkio",
2912 [CGROUP_CONTROLLER_MEMORY
] = "memory",
2913 [CGROUP_CONTROLLER_DEVICES
] = "devices",
2914 [CGROUP_CONTROLLER_PIDS
] = "pids",
2915 [CGROUP_CONTROLLER_BPF_FIREWALL
] = "bpf-firewall",
2916 [CGROUP_CONTROLLER_BPF_DEVICES
] = "bpf-devices",
2919 DEFINE_STRING_TABLE_LOOKUP(cgroup_controller
, CGroupController
);
/* get_cpu_accounting_mask(): returns the controller mask that must be enabled to
 * obtain CPU accounting, computed once and cached in a function-local static.
 * NOTE(review): elided extract — the struct utsname declaration, the "nothing
 * needed" branch for unified kernels >= 4.15, and the final return are not
 * visible between the numbered lines. */
2921 CGroupMask
get_cpu_accounting_mask(void) {
/* (CGroupMask) -1 is the "not yet computed" sentinel for the cache. */
2922 static CGroupMask needed_mask
= (CGroupMask
) -1;
2924 /* On kernel ≥4.15 with unified hierarchy, cpu.stat's usage_usec is
2925 * provided externally from the CPU controller, which means we don't
2926 * need to enable the CPU controller just to get metrics. This is good,
2927 * because enabling the CPU controller comes at a minor performance
2928 * hit, especially when it's propagated deep into large hierarchies.
2929 * There's also no separate CPU accounting controller available within
2930 * a unified hierarchy.
2932 * This combination of factors results in the desired cgroup mask to
2933 * enable for CPU accounting varying as follows:
2935 * ╔═════════════════════╤═════════════════════╗
2936 * ║ Linux ≥4.15 │ Linux <4.15 ║
2937 * ╔═══════════════╬═════════════════════╪═════════════════════╣
2938 * ║ Unified ║ nothing │ CGROUP_MASK_CPU ║
2939 * ╟───────────────╫─────────────────────┼─────────────────────╢
2940 * ║ Hybrid/Legacy ║ CGROUP_MASK_CPUACCT │ CGROUP_MASK_CPUACCT ║
2941 * ╚═══════════════╩═════════════════════╧═════════════════════╝
2943 * We check kernel version here instead of manually checking whether
2944 * cpu.stat is present for every cgroup, as that check in itself would
2945 * already be fairly expensive.
2947 * Kernels where this patch has been backported will therefore have the
2948 * CPU controller enabled unnecessarily. This is more expensive than
2949 * necessary, but harmless. ☺️
2952 if (needed_mask
== (CGroupMask
) -1) {
2953 if (cg_all_unified()) {
2955 assert_se(uname(&u
) >= 0);
2957 if (str_verscmp(u
.release
, "4.15") < 0)
2958 needed_mask
= CGROUP_MASK_CPU
;
/* Hybrid/legacy: the separate cpuacct v1 controller provides accounting. */
2962 needed_mask
= CGROUP_MASK_CPUACCT
;
2968 bool cpu_accounting_is_cheap(void) {
2969 return get_cpu_accounting_mask() == 0;