1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
10 #include <sys/utsname.h>
11 #include <sys/xattr.h>
14 #include "alloc-util.h"
15 #include "cgroup-util.h"
17 #include "dirent-util.h"
18 #include "extract-word.h"
21 #include "format-util.h"
24 #include "login-util.h"
26 #include "missing_magic.h"
28 #include "parse-util.h"
29 #include "path-util.h"
30 #include "process-util.h"
33 #include "stat-util.h"
34 #include "stdio-util.h"
35 #include "string-table.h"
36 #include "string-util.h"
38 #include "unit-name.h"
39 #include "user-util.h"
40 #include "xattr-util.h"
/* Opens the named per-cgroup attribute file ("item") of controller/path for
 * reading, resolving the filesystem location via cg_get_path().
 * NOTE(review): parts of this function are elided in this view. */
42 static int cg_enumerate_items(const char *controller
, const char *path
, FILE **_f
, const char *item
) {
43 _cleanup_free_
char *fs
= NULL
;
49 r
= cg_get_path(controller
, path
, item
, &fs
);
/* Convenience wrapper: enumerates the member PIDs of a cgroup by opening
 * its "cgroup.procs" attribute file. */
61 int cg_enumerate_processes(const char *controller
, const char *path
, FILE **_f
) {
62 return cg_enumerate_items(controller
, path
, _f
, "cgroup.procs");
/* Reads the next PID from a stream opened with cg_enumerate_processes().
 * Presumably returns > 0 while entries remain and 0 on EOF — the tail of the
 * function is elided here; confirm against the full source. */
65 int cg_read_pid(FILE *f
, pid_t
*_pid
) {
68 /* Note that the cgroup.procs might contain duplicates! See
69 * cgroups.txt for details. */
75 if (fscanf(f
, "%lu", &ul
) != 1) {
80 return errno_or_else(EIO
);
/* NOTE(review): the start of this function's signature is elided in this
 * view; judging by the body it reads a cgroup's "cgroup.events" file and
 * returns (via *ret) the value associated with the requested "event" key. */
91 const char *controller
,
96 _cleanup_free_
char *events
= NULL
, *content
= NULL
;
99 r
= cg_get_path(controller
, path
, "cgroup.events", &events
);
/* Slurp the whole attribute file, then parse it line by line. */
103 r
= read_full_file(events
, &content
, NULL
);
107 for (const char *p
= content
;;) {
108 _cleanup_free_
char *line
= NULL
, *key
= NULL
, *val
= NULL
;
111 r
= extract_first_word(&p
, &line
, "\n", 0);
/* Each line has the shape "<key> <value>"; split off the key first. */
118 r
= extract_first_word(&q
, &key
, " ", 0);
124 if (!streq(key
, event
))
/* Matching key found: hand the value over to the caller. */
131 *ret
= TAKE_PTR(val
);
/* Returns whether cgroup namespaces are available, by probing
 * /proc/self/ns/cgroup once; the result is cached per thread. */
136 bool cg_ns_supported(void) {
137 static thread_local
int enabled
= -1;
142 if (access("/proc/self/ns/cgroup", F_OK
) < 0) {
144 log_debug_errno(errno
, "Failed to check whether /proc/self/ns/cgroup is available, assuming not: %m");
/* Returns whether the cgroup v2 freezer is usable: requires the fully
 * unified hierarchy plus an accessible init.scope cgroup.freeze attribute.
 * Cached per thread as well. */
152 bool cg_freezer_supported(void) {
153 static thread_local
int supported
= -1;
158 supported
= cg_all_unified() > 0 && access("/sys/fs/cgroup/init.scope/cgroup.freeze", F_OK
) == 0;
/* Opens the cgroup's own directory so its immediate children can be
 * iterated with cg_read_subgroup(). */
163 int cg_enumerate_subgroups(const char *controller
, const char *path
, DIR **_d
) {
164 _cleanup_free_
char *fs
= NULL
;
170 /* This is not recursive! */
172 r
= cg_get_path(controller
, path
, NULL
, &fs
);
/* Returns (in *fn) the next child cgroup name from a directory stream
 * opened by cg_enumerate_subgroups(), skipping anything that is not a
 * directory as well as "." and "..". */
184 int cg_read_subgroup(DIR *d
, char **fn
) {
190 FOREACH_DIRENT_ALL(de
, d
, return -errno
) {
193 if (de
->d_type
!= DT_DIR
)
196 if (dot_or_dot_dot(de
->d_name
))
/* Duplicate the entry name; ownership passes to the caller via *fn. */
199 b
= strdup(de
->d_name
);
/* Removes the given (empty) cgroup directory. When running in hybrid mode
 * and the cgroup belongs to the systemd controller, the compat copy in the
 * legacy "name=systemd" hierarchy is removed too, best-effort. */
210 int cg_rmdir(const char *controller
, const char *path
) {
211 _cleanup_free_
char *p
= NULL
;
214 r
= cg_get_path(controller
, path
, NULL
, &p
);
219 if (r
< 0 && errno
!= ENOENT
)
222 r
= cg_hybrid_unified();
226 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
227 r
= cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
);
/* Best effort only: failure to drop the compat cgroup is logged, not fatal. */
229 log_warning_errno(r
, "Failed to remove compat systemd cgroup %s: %m", path
);
/* Sends the given signal to every PID listed in the cgroup's "item" file
 * (e.g. cgroup.procs), repeating until no new PIDs appear, so that forking
 * processes cannot escape. The set 's' remembers PIDs already signalled;
 * if the caller passes none, a temporary set is allocated locally.
 * NOTE(review): several parameters and statements are elided in this view. */
235 static int cg_kill_items(
236 const char *controller
,
241 cg_kill_log_func_t log_kill
,
245 _cleanup_set_free_ Set
*allocated_set
= NULL
;
247 int r
, ret
= 0, ret_log_kill
= 0;
252 /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
253 * SIGCONT on SIGKILL. */
254 if (IN_SET(sig
, SIGCONT
, SIGKILL
))
255 flags
&= ~CGROUP_SIGCONT
;
257 /* This goes through the tasks list and kills them all. This
258 * is repeated until no further processes are added to the
259 * tasks list, to properly handle forking processes */
262 s
= allocated_set
= set_new(NULL
);
267 my_pid
= getpid_cached();
270 _cleanup_fclose_
FILE *f
= NULL
;
274 r
= cg_enumerate_items(controller
, path
, &f
, item
);
276 if (ret
>= 0 && r
!= -ENOENT
)
282 while ((r
= cg_read_pid(f
, &pid
)) > 0) {
/* Optionally skip ourselves, and skip PIDs we already signalled. */
284 if ((flags
& CGROUP_IGNORE_SELF
) && pid
== my_pid
)
287 if (set_get(s
, PID_TO_PTR(pid
)) == PID_TO_PTR(pid
))
291 ret_log_kill
= log_kill(pid
, sig
, userdata
);
293 /* If we haven't killed this process yet, kill
295 if (kill(pid
, sig
) < 0) {
/* ESRCH just means the process exited in the meantime — not an error. */
296 if (ret
>= 0 && errno
!= ESRCH
)
299 if (flags
& CGROUP_SIGCONT
)
300 (void) kill(pid
, SIGCONT
);
/* Remember this PID so we don't signal it again on the next pass. */
312 r
= set_put(s
, PID_TO_PTR(pid
));
328 /* To avoid racing against processes which fork
329 * quicker than we can kill them we repeat this until
330 * no new pids need to be killed. */
/* NOTE(review): the start of this function's signature is elided in this
 * view; it forwards to cg_kill_items() for "cgroup.procs" and, for SIGKILL
 * on the unified hierarchy, additionally sweeps "cgroup.threads". */
338 const char *controller
,
343 cg_kill_log_func_t log_kill
,
347 r
= cg_kill_items(controller
, path
, sig
, flags
, s
, log_kill
, userdata
, "cgroup.procs");
348 if (r
< 0 || sig
!= SIGKILL
)
351 /* Only in case of killing with SIGKILL and when using cgroupsv2, kill remaining threads manually as
352 a workaround for kernel bug. It was fixed in 5.2-rc5 (c03cd7738a83), backported to 4.19.66
353 (4340d175b898) and 4.14.138 (feb6b123b7dd). */
354 r
= cg_unified_controller(controller
);
358 return cg_kill_items(controller
, path
, sig
, flags
, s
, log_kill
, userdata
, "cgroup.threads");
/* Kills the cgroup's processes and then recurses into every child cgroup,
 * optionally removing the (now empty) directories when CGROUP_REMOVE is
 * set. The shared set 's' prevents signalling the same PID twice across
 * the recursion. NOTE(review): some statements are elided in this view. */
361 int cg_kill_recursive(
362 const char *controller
,
367 cg_kill_log_func_t log_kill
,
370 _cleanup_set_free_ Set
*allocated_set
= NULL
;
371 _cleanup_closedir_
DIR *d
= NULL
;
379 s
= allocated_set
= set_new(NULL
);
/* First kill the processes of this cgroup itself... */
384 ret
= cg_kill(controller
, path
, sig
, flags
, s
, log_kill
, userdata
);
/* ...then walk the children. A missing directory is not an error. */
386 r
= cg_enumerate_subgroups(controller
, path
, &d
);
388 if (ret
>= 0 && r
!= -ENOENT
)
394 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
395 _cleanup_free_
char *p
= NULL
;
397 p
= path_join(empty_to_root(path
), fn
);
402 r
= cg_kill_recursive(controller
, p
, sig
, flags
, s
, log_kill
, userdata
);
403 if (r
!= 0 && ret
>= 0)
406 if (ret
>= 0 && r
< 0)
409 if (flags
& CGROUP_REMOVE
) {
410 r
= cg_rmdir(controller
, path
);
/* ENOENT/EBUSY while removing are tolerated (already gone / still populated). */
411 if (r
< 0 && ret
>= 0 && !IN_SET(r
, -ENOENT
, -EBUSY
))
/* Maps a controller name to its mount directory name below /sys/fs/cgroup. */
418 static const char *controller_to_dirname(const char *controller
) {
423 /* Converts a controller name to the directory name below
424 * /sys/fs/cgroup/ we want to mount it to. Effectively, this
425 * just cuts off the name= prefixed used for named
426 * hierarchies, if it is specified. */
/* The systemd controller itself lives under a different directory name
 * depending on whether we run in hybrid or pure legacy mode. */
428 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
429 if (cg_hybrid_unified() > 0)
430 controller
= SYSTEMD_CGROUP_CONTROLLER_HYBRID
;
432 controller
= SYSTEMD_CGROUP_CONTROLLER_LEGACY
;
435 e
= startswith(controller
, "name=");
/* Builds a cgroup v1 filesystem path: /sys/fs/cgroup/<dir>/<path>/<suffix>,
 * omitting whichever of path/suffix is empty. Result returned via *fs. */
442 static int join_path_legacy(const char *controller
, const char *path
, const char *suffix
, char **fs
) {
449 dn
= controller_to_dirname(controller
);
451 if (isempty(path
) && isempty(suffix
))
452 t
= path_join("/sys/fs/cgroup", dn
);
453 else if (isempty(path
))
454 t
= path_join("/sys/fs/cgroup", dn
, suffix
);
455 else if (isempty(suffix
))
456 t
= path_join("/sys/fs/cgroup", dn
, path
);
458 t
= path_join("/sys/fs/cgroup", dn
, path
, suffix
);
/* Same as join_path_legacy(), but for the unified (v2) hierarchy where all
 * controllers share the single /sys/fs/cgroup mount. */
466 static int join_path_unified(const char *path
, const char *suffix
, char **fs
) {
471 if (isempty(path
) && isempty(suffix
))
472 t
= strdup("/sys/fs/cgroup");
473 else if (isempty(path
))
474 t
= path_join("/sys/fs/cgroup", suffix
);
475 else if (isempty(suffix
))
476 t
= path_join("/sys/fs/cgroup", path
);
478 t
= path_join("/sys/fs/cgroup", path
, suffix
);
/* Resolves controller/path/suffix to an absolute filesystem path below
 * /sys/fs/cgroup, dispatching to the unified or legacy layout depending on
 * the detected hierarchy mode. The simplified path is returned via *fs. */
486 int cg_get_path(const char *controller
, const char *path
, const char *suffix
, char **fs
) {
494 /* If no controller is specified, we return the path
495 * *below* the controllers, without any prefix. */
497 if (!path
&& !suffix
)
505 t
= path_join(path
, suffix
);
509 *fs
= path_simplify(t
, false);
513 if (!cg_controller_is_valid(controller
))
516 r
= cg_all_unified();
520 r
= join_path_unified(path
, suffix
, fs
);
522 r
= join_path_legacy(controller
, path
, suffix
, fs
);
/* Normalize the final path (collapse duplicate slashes etc.). */
526 path_simplify(*fs
, false);
/* Checks whether the given v1 controller hierarchy is mounted and usable:
 * with a root specified we additionally require write access to its
 * cgroup.procs, otherwise mere existence suffices. */
530 static int controller_is_v1_accessible(const char *root
, const char *controller
) {
531 const char *cpath
, *dn
;
535 dn
= controller_to_dirname(controller
);
536 cpath
= strjoina("/sys/fs/cgroup/", dn
);
539 * - possible subcgroup is created at root,
540 * - we can modify the hierarchy.
541 * "Leak" cpath on stack */
542 cpath
= strjoina(cpath
, root
, "/cgroup.procs");
544 if (laccess(cpath
, root
? W_OK
: F_OK
) < 0)
/* Like cg_get_path(), but first verifies that the requested controller is
 * actually accessible in the current hierarchy mode. */
550 int cg_get_path_and_check(const char *controller
, const char *path
, const char *suffix
, char **fs
) {
556 if (!cg_controller_is_valid(controller
))
559 r
= cg_all_unified();
563 /* In the unified hierarchy all controllers are considered accessible,
564 * except for the named hierarchies */
565 if (startswith(controller
, "name="))
568 /* Check if the specified controller is actually accessible */
569 r
= controller_is_v1_accessible(NULL
, controller
);
574 return cg_get_path(controller
, path
, suffix
, fs
);
/* Sets an extended attribute on the cgroup directory itself. */
577 int cg_set_xattr(const char *controller
, const char *path
, const char *name
, const void *value
, size_t size
, int flags
) {
578 _cleanup_free_
char *fs
= NULL
;
583 assert(value
|| size
<= 0);
585 r
= cg_get_path(controller
, path
, NULL
, &fs
);
589 if (setxattr(fs
, name
, value
, size
, flags
) < 0)
/* Reads an extended attribute of the cgroup directory into a caller-supplied
 * buffer of the given size. */
595 int cg_get_xattr(const char *controller
, const char *path
, const char *name
, void *value
, size_t size
) {
596 _cleanup_free_
char *fs
= NULL
;
603 r
= cg_get_path(controller
, path
, NULL
, &fs
);
607 n
= getxattr(fs
, name
, value
, size
);
/* Like cg_get_xattr(), but allocates the result buffer for the caller. */
614 int cg_get_xattr_malloc(const char *controller
, const char *path
, const char *name
, char **ret
) {
615 _cleanup_free_
char *fs
= NULL
;
621 r
= cg_get_path(controller
, path
, NULL
, &fs
);
625 r
= getxattr_malloc(fs
, name
, ret
, false);
/* Reads an extended attribute and parses it as a boolean string. */
632 int cg_get_xattr_bool(const char *controller
, const char *path
, const char *name
) {
633 _cleanup_free_
char *val
= NULL
;
639 r
= cg_get_xattr_malloc(controller
, path
, name
, &val
);
643 return parse_boolean(val
);
/* Removes an extended attribute from the cgroup directory. */
646 int cg_remove_xattr(const char *controller
, const char *path
, const char *name
) {
647 _cleanup_free_
char *fs
= NULL
;
653 r
= cg_get_path(controller
, path
, NULL
, &fs
);
657 if (removexattr(fs
, name
) < 0)
/* Determines the cgroup path of the given process within the specified
 * controller's hierarchy by parsing /proc/<pid>/cgroup.
 * NOTE(review): several statements of this parser are elided in this view. */
663 int cg_pid_get_path(const char *controller
, pid_t pid
, char **ret_path
) {
664 _cleanup_fclose_
FILE *f
= NULL
;
665 const char *fs
, *controller_str
;
672 if (!cg_controller_is_valid(controller
))
/* A NULL controller presumably defaults to the systemd controller. */
675 controller
= SYSTEMD_CGROUP_CONTROLLER
;
677 unified
= cg_unified_controller(controller
);
/* On the legacy hierarchy, the systemd controller shows up in
 * /proc/<pid>/cgroup under its "name=systemd" legacy name. */
681 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
))
682 controller_str
= SYSTEMD_CGROUP_CONTROLLER_LEGACY
;
684 controller_str
= controller
;
687 fs
= procfs_file_alloca(pid
, "cgroup");
688 r
= fopen_unlocked(fs
, "re", &f
);
695 _cleanup_free_
char *line
= NULL
;
698 r
= read_line(f
, LONG_LINE_MAX
, &line
);
/* Unified entries use hierarchy ID 0 ("0:"), legacy ones "<id>:<ctrls>:". */
705 e
= startswith(line
, "0:");
715 l
= strchr(line
, ':');
725 r
= string_contains_word(l
, ",", controller_str
);
732 char *path
= strdup(e
+ 1);
736 /* Truncate suffix indicating the process is a zombie */
737 e
= endswith(path
, " (deleted)");
/* Installs the given release agent binary for a legacy (v1) controller
 * hierarchy and enables notify_on_release, writing either attribute only
 * when it does not already hold the desired value. No-op on v2. */
746 int cg_install_release_agent(const char *controller
, const char *agent
) {
747 _cleanup_free_
char *fs
= NULL
, *contents
= NULL
;
753 r
= cg_unified_controller(controller
);
756 if (r
> 0) /* doesn't apply to unified hierarchy */
759 r
= cg_get_path(controller
, NULL
, "release_agent", &fs
);
763 r
= read_one_line_file(fs
, &contents
);
767 sc
= strstrip(contents
);
769 r
= write_string_file(fs
, agent
, WRITE_STRING_FILE_DISABLE_BUFFER
);
/* A different agent being installed already is treated as a conflict. */
772 } else if (!path_equal(sc
, agent
))
776 r
= cg_get_path(controller
, NULL
, "notify_on_release", &fs
);
780 contents
= mfree(contents
);
781 r
= read_one_line_file(fs
, &contents
);
785 sc
= strstrip(contents
);
786 if (streq(sc
, "0")) {
787 r
= write_string_file(fs
, "1", WRITE_STRING_FILE_DISABLE_BUFFER
);
/* Reverts cg_install_release_agent(): disables notify_on_release and
 * clears the release_agent attribute of a legacy hierarchy. No-op on v2. */
800 int cg_uninstall_release_agent(const char *controller
) {
801 _cleanup_free_
char *fs
= NULL
;
804 r
= cg_unified_controller(controller
);
807 if (r
> 0) /* Doesn't apply to unified hierarchy */
810 r
= cg_get_path(controller
, NULL
, "notify_on_release", &fs
);
814 r
= write_string_file(fs
, "0", WRITE_STRING_FILE_DISABLE_BUFFER
);
820 r
= cg_get_path(controller
, NULL
, "release_agent", &fs
);
824 r
= write_string_file(fs
, "", WRITE_STRING_FILE_DISABLE_BUFFER
);
/* Checks whether the cgroup itself (not its children) contains any
 * process, by attempting to read one PID from cgroup.procs. */
831 int cg_is_empty(const char *controller
, const char *path
) {
832 _cleanup_fclose_
FILE *f
= NULL
;
838 r
= cg_enumerate_processes(controller
, path
, &f
);
844 r
= cg_read_pid(f
, &pid
);
/* Checks whether the cgroup and all of its descendants are empty. On the
 * unified hierarchy this is a single read of the "populated" key of
 * cgroup.events; on legacy we must walk the subtree recursively. */
851 int cg_is_empty_recursive(const char *controller
, const char *path
) {
856 /* The root cgroup is always populated */
857 if (controller
&& empty_or_root(path
))
860 r
= cg_unified_controller(controller
);
864 _cleanup_free_
char *t
= NULL
;
866 /* On the unified hierarchy we can check empty state
867 * via the "populated" attribute of "cgroup.events". */
869 r
= cg_read_event(controller
, path
, "populated", &t
);
875 return streq(t
, "0");
877 _cleanup_closedir_
DIR *d
= NULL
;
880 r
= cg_is_empty(controller
, path
);
884 r
= cg_enumerate_subgroups(controller
, path
, &d
);
890 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
891 _cleanup_free_
char *p
= NULL
;
893 p
= path_join(path
, fn
);
898 r
= cg_is_empty_recursive(controller
, p
);
/* Splits a cgroup specification of the form "[controller]:[path]" (or a
 * bare path / bare controller name) into its two parts, validating and
 * normalizing each. Either output parameter may be NULL.
 * NOTE(review): some branches are elided in this view. */
909 int cg_split_spec(const char *spec
, char **ret_controller
, char **ret_path
) {
910 _cleanup_free_
char *controller
= NULL
, *path
= NULL
;
915 if (!path_is_normalized(spec
))
923 path_simplify(path
, false);
/* A ':' marks the controller/path separator. */
929 e
= strchr(spec
, ':');
931 controller
= strndup(spec
, e
-spec
);
934 if (!cg_controller_is_valid(controller
))
937 if (!isempty(e
+ 1)) {
942 if (!path_is_normalized(path
) ||
943 !path_is_absolute(path
))
946 path_simplify(path
, false);
/* No ':' at all: the spec must then be a bare controller name. */
950 if (!cg_controller_is_valid(spec
))
953 if (ret_controller
) {
954 controller
= strdup(spec
);
962 *ret_controller
= TAKE_PTR(controller
);
964 *ret_path
= TAKE_PTR(path
);
/* Converts either a literal /sys/fs/cgroup path or a "controller:path"
 * spec into a canonical filesystem path, returned via *result. */
968 int cg_mangle_path(const char *path
, char **result
) {
969 _cleanup_free_
char *c
= NULL
, *p
= NULL
;
976 /* First, check if it already is a filesystem path */
977 if (path_startswith(path
, "/sys/fs/cgroup")) {
983 *result
= path_simplify(t
, false);
987 /* Otherwise, treat it as cg spec */
988 r
= cg_split_spec(path
, &c
, &p
);
/* Missing parts default to the systemd controller and the root path. */
992 return cg_get_path(c
?: SYSTEMD_CGROUP_CONTROLLER
, p
?: "/", NULL
, result
);
/* Determines the cgroup root systemd operates in, from PID 1's own cgroup,
 * stripping trailing init.scope / system.slice / system compat suffixes. */
995 int cg_get_root_path(char **path
) {
1001 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 1, &p
);
1005 e
= endswith(p
, "/" SPECIAL_INIT_SCOPE
);
1007 e
= endswith(p
, "/" SPECIAL_SYSTEM_SLICE
); /* legacy */
1009 e
= endswith(p
, "/system"); /* even more legacy */
/* Makes a raw cgroup path relative to the given root (or, if root is NULL,
 * to the root determined via cg_get_root_path()); the shifted pointer
 * points into the original string. */
1017 int cg_shift_path(const char *cgroup
, const char *root
, const char **shifted
) {
1018 _cleanup_free_
char *rt
= NULL
;
1026 /* If the root was specified let's use that, otherwise
1027 * let's determine it from PID 1 */
1029 r
= cg_get_root_path(&rt
);
1036 p
= path_startswith(cgroup
, root
);
1037 if (p
&& p
> cgroup
)
/* Convenience helper: reads the process's cgroup path and shifts it
 * relative to the given (or detected) root in one step. */
1045 int cg_pid_get_path_shifted(pid_t pid
, const char *root
, char **cgroup
) {
1046 _cleanup_free_
char *raw
= NULL
;
1053 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &raw
);
1057 r
= cg_shift_path(raw
, root
, &c
);
1062 *cgroup
= TAKE_PTR(raw
);
/* Extracts and unescapes the first path component of a cgroup path and
 * validates it as a plain or instantiated unit name, returned via *unit.
 * NOTE(review): the unescaping step is elided in this view. */
1076 int cg_path_decode_unit(const char *cgroup
, char **unit
) {
1083 n
= strcspn(cgroup
, "/");
1087 c
= strndupa(cgroup
, n
);
1090 if (!unit_name_is_valid(c
, UNIT_NAME_PLAIN
|UNIT_NAME_INSTANCE
))
/* Returns true if the first n bytes of p form a valid slice unit name
 * (i.e. end in ".slice" and unescape to a valid plain unit name). */
1101 static bool valid_slice_name(const char *p
, size_t n
) {
1106 if (n
< STRLEN("x.slice"))
1109 if (memcmp(p
+ n
- 6, ".slice", 6) == 0) {
1115 c
= cg_unescape(buf
);
1117 return unit_name_is_valid(c
, UNIT_NAME_PLAIN
);
/* Advances past every leading slice component of a cgroup path, returning
 * a pointer to the first non-slice component. */
1123 static const char *skip_slices(const char *p
) {
1126 /* Skips over all slice assignments */
1131 p
+= strspn(p
, "/");
1133 n
= strcspn(p
, "/");
1134 if (!valid_slice_name(p
, n
))
/* Determines the system unit a cgroup path belongs to: skips the leading
 * slice components and decodes the next component as a unit name. */
1141 int cg_path_get_unit(const char *path
, char **ret
) {
1142 _cleanup_free_
char *unit
= NULL
;
1149 e
= skip_slices(path
);
1151 r
= cg_path_decode_unit(e
, &unit
);
1155 /* We skipped over the slices, don't accept any now */
1156 if (endswith(unit
, ".slice"))
1159 *ret
= TAKE_PTR(unit
);
/* Convenience wrapper: looks up the system unit of a process by its PID. */
1163 int cg_pid_get_unit(pid_t pid
, char **unit
) {
1164 _cleanup_free_
char *cgroup
= NULL
;
1169 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1173 return cg_path_get_unit(cgroup
, unit
);
1177 * Skip session-*.scope, but require it to be there.
/* Advances past a "session-<id>.scope" path component; presumably returns
 * NULL when the component is absent or malformed — tail elided here. */
1179 static const char *skip_session(const char *p
) {
1185 p
+= strspn(p
, "/");
1187 n
= strcspn(p
, "/");
1188 if (n
< STRLEN("session-x.scope"))
1191 if (memcmp(p
, "session-", 8) == 0 && memcmp(p
+ n
- 6, ".scope", 6) == 0) {
1192 char buf
[n
- 8 - 6 + 1];
1194 memcpy(buf
, p
+ 8, n
- 8 - 6);
1197 /* Note that session scopes never need unescaping,
1198 * since they cannot conflict with the kernel's own
1199 * names, hence we don't need to call cg_unescape()
1202 if (!session_id_valid(buf
))
1206 p
+= strspn(p
, "/");
1214 * Skip user@*.service, but require it to be there.
/* Advances past a "user@<uid>.service" path component, validating the
 * embedded UID; same contract as skip_session(). */
1216 static const char *skip_user_manager(const char *p
) {
1222 p
+= strspn(p
, "/");
1224 n
= strcspn(p
, "/");
1225 if (n
< STRLEN("user@x.service"))
1228 if (memcmp(p
, "user@", 5) == 0 && memcmp(p
+ n
- 8, ".service", 8) == 0) {
1229 char buf
[n
- 5 - 8 + 1];
1231 memcpy(buf
, p
+ 5, n
- 5 - 8);
1234 /* Note that user manager services never need unescaping,
1235 * since they cannot conflict with the kernel's own
1236 * names, hence we don't need to call cg_unescape()
1239 if (parse_uid(buf
, NULL
) < 0)
1243 p
+= strspn(p
, "/");
/* Skips the per-user prefix of a cgroup path: leading slices followed by
 * either the user manager service or a session scope. */
1251 static const char *skip_user_prefix(const char *path
) {
1256 /* Skip slices, if there are any */
1257 e
= skip_slices(path
);
1259 /* Skip the user manager, if it's in the path now... */
1260 t
= skip_user_manager(e
);
1264 /* Alternatively skip the user session if it is in the path... */
1265 return skip_session(e
);
/* Determines the user (per-session) unit a cgroup path belongs to, by
 * stripping the user prefix and then reusing the system-unit parser. */
1268 int cg_path_get_user_unit(const char *path
, char **ret
) {
1274 t
= skip_user_prefix(path
);
1278 /* And from here on it looks pretty much the same as for a system unit, hence let's use the same
1280 return cg_path_get_unit(t
, ret
);
/* Convenience wrapper: looks up the user unit of a process by its PID. */
1283 int cg_pid_get_user_unit(pid_t pid
, char **unit
) {
1284 _cleanup_free_
char *cgroup
= NULL
;
1289 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1293 return cg_path_get_user_unit(cgroup
, unit
);
/* Resolves the machine name a cgroup path belongs to, by mapping its unit
 * through the /run/systemd/machines/unit:<unit> symlink. */
1296 int cg_path_get_machine_name(const char *path
, char **machine
) {
1297 _cleanup_free_
char *u
= NULL
;
1301 r
= cg_path_get_unit(path
, &u
);
1305 sl
= strjoina("/run/systemd/machines/unit:", u
);
1306 return readlink_malloc(sl
, machine
);
/* Convenience wrapper: machine name of a process, looked up by PID. */
1309 int cg_pid_get_machine_name(pid_t pid
, char **machine
) {
1310 _cleanup_free_
char *cgroup
= NULL
;
1315 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1319 return cg_path_get_machine_name(cgroup
, machine
);
/* Extracts the login session ID from a cgroup path by locating its
 * "session-<id>.scope" unit and validating the embedded ID. */
1322 int cg_path_get_session(const char *path
, char **session
) {
1323 _cleanup_free_
char *unit
= NULL
;
1329 r
= cg_path_get_unit(path
, &unit
);
1333 start
= startswith(unit
, "session-");
1336 end
= endswith(start
, ".scope");
1341 if (!session_id_valid(start
))
/* Convenience wrapper: session ID of a process, looked up by PID. */
1357 int cg_pid_get_session(pid_t pid
, char **session
) {
1358 _cleanup_free_
char *cgroup
= NULL
;
1361 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1365 return cg_path_get_session(cgroup
, session
);
/* Determines the UID owning a cgroup path by parsing the numeric part of
 * its enclosing "user-<uid>.slice" unit. */
1368 int cg_path_get_owner_uid(const char *path
, uid_t
*uid
) {
1369 _cleanup_free_
char *slice
= NULL
;
1375 r
= cg_path_get_slice(path
, &slice
);
1379 start
= startswith(slice
, "user-");
1382 end
= endswith(start
, ".slice");
1387 if (parse_uid(start
, uid
) < 0)
/* Convenience wrapper: owning UID of a process, looked up by PID. */
1393 int cg_pid_get_owner_uid(pid_t pid
, uid_t
*uid
) {
1394 _cleanup_free_
char *cgroup
= NULL
;
1397 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1401 return cg_path_get_owner_uid(cgroup
, uid
);
/* Determines the innermost slice unit a cgroup path lies in; paths with no
 * slice component map to the root slice. */
1404 int cg_path_get_slice(const char *p
, char **slice
) {
1405 const char *e
= NULL
;
1410 /* Finds the right-most slice unit from the beginning, but
1411 * stops before we come to the first non-slice unit. */
1416 p
+= strspn(p
, "/");
1418 n
= strcspn(p
, "/");
1419 if (!valid_slice_name(p
, n
)) {
/* No slice seen at all: report the root slice. */
1424 s
= strdup(SPECIAL_ROOT_SLICE
);
1432 return cg_path_decode_unit(e
, slice
);
/* Convenience wrapper: slice of a process, looked up by PID. */
1440 int cg_pid_get_slice(pid_t pid
, char **slice
) {
1441 _cleanup_free_
char *cgroup
= NULL
;
1446 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1450 return cg_path_get_slice(cgroup
, slice
);
/* Like cg_path_get_slice(), but first strips the per-user prefix so the
 * slice within the user's own subtree is reported. */
1453 int cg_path_get_user_slice(const char *p
, char **slice
) {
1458 t
= skip_user_prefix(p
);
1462 /* And now it looks pretty much the same as for a system
1463 * slice, so let's just use the same parser from here on. */
1464 return cg_path_get_slice(t
, slice
);
/* Convenience wrapper: user slice of a process, looked up by PID. */
1467 int cg_pid_get_user_slice(pid_t pid
, char **slice
) {
1468 _cleanup_free_
char *cgroup
= NULL
;
1473 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1477 return cg_path_get_user_slice(cgroup
, slice
);
/* Escapes a name for use as a cgroup directory entry by prefixing it with
 * '_' whenever it could collide with a kernel-owned file name or an
 * already-escaped name. Allocates the result. */
1480 char *cg_escape(const char *p
) {
1481 bool need_prefix
= false;
1483 /* This implements very minimal escaping for names to be used
1484 * as file names in the cgroup tree: any name which might
1485 * conflict with a kernel name or is prefixed with '_' is
1486 * prefixed with a '_'. That way, when reading cgroup names it
1487 * is sufficient to remove a single prefixing underscore if
1490 /* The return value of this function (unlike cg_unescape())
1493 if (IN_SET(p
[0], 0, '_', '.') ||
1494 STR_IN_SET(p
, "notify_on_release", "release_agent", "tasks") ||
1495 startswith(p
, "cgroup."))
/* Also escape names that look like "<controller>.<something>" for any
 * controller we know, since the kernel owns those file names too. */
1500 dot
= strrchr(p
, '.');
1505 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
1508 n
= cgroup_controller_to_string(c
);
1513 if (memcmp(p
, n
, l
) != 0)
1523 return strjoin("_", p
);
/* Reverses cg_escape(): skips one leading '_' if present. Unlike
 * cg_escape() the result points into the input and needs no free(). */
1528 char *cg_unescape(const char *p
) {
1531 /* The return value of this function (unlike cg_escape())
1532 * doesn't need free()! */
1540 #define CONTROLLER_VALID \
/* Validates a controller name: the systemd pseudo-controller, or an
 * optionally "name="-prefixed identifier made only of CONTROLLER_VALID
 * characters, not starting with '_' and at most NAME_MAX long. */
1544 bool cg_controller_is_valid(const char *p
) {
1550 if (streq(p
, SYSTEMD_CGROUP_CONTROLLER
))
1553 s
= startswith(p
, "name=");
1557 if (IN_SET(*p
, 0, '_'))
1560 for (t
= p
; *t
; t
++)
1561 if (!strchr(CONTROLLER_VALID
, *t
))
1564 if (t
- p
> NAME_MAX
)
/* Converts a slice unit name (e.g. "foo-bar.slice") into its cgroup path
 * ("foo.slice/foo-bar.slice/..."), expanding each dash-separated prefix
 * into a parent slice component and cg_escape()-ing each component. */
1570 int cg_slice_to_path(const char *unit
, char **ret
) {
1571 _cleanup_free_
char *p
= NULL
, *s
= NULL
, *e
= NULL
;
/* The root slice maps to the cgroup root itself. */
1578 if (streq(unit
, SPECIAL_ROOT_SLICE
)) {
1588 if (!unit_name_is_valid(unit
, UNIT_NAME_PLAIN
))
1591 if (!endswith(unit
, ".slice"))
1594 r
= unit_name_to_prefix(unit
, &p
);
1598 dash
= strchr(p
, '-');
1600 /* Don't allow initial dashes */
1605 _cleanup_free_
char *escaped
= NULL
;
1606 char n
[dash
- p
+ sizeof(".slice")];
1608 #if HAS_FEATURE_MEMORY_SANITIZER
1609 /* msan doesn't instrument stpncpy, so it thinks
1610 * n is later used uninitialized:
1611 * https://github.com/google/sanitizers/issues/926
1616 /* Don't allow trailing or double dashes */
1617 if (IN_SET(dash
[1], 0, '-'))
/* Build the parent slice name "<prefix>.slice" for this dash level. */
1620 strcpy(stpncpy(n
, p
, dash
- p
), ".slice");
1621 if (!unit_name_is_valid(n
, UNIT_NAME_PLAIN
))
1624 escaped
= cg_escape(n
);
1628 if (!strextend(&s
, escaped
, "/"))
1631 dash
= strchr(dash
+1, '-');
/* Finally append the full (escaped) unit name itself. */
1634 e
= cg_escape(unit
);
1638 if (!strextend(&s
, e
))
/* Writes a value to the named cgroup attribute file. */
1646 int cg_set_attribute(const char *controller
, const char *path
, const char *attribute
, const char *value
) {
1647 _cleanup_free_
char *p
= NULL
;
1650 r
= cg_get_path(controller
, path
, attribute
, &p
);
1654 return write_string_file(p
, value
, WRITE_STRING_FILE_DISABLE_BUFFER
);
/* Reads the first line of the named cgroup attribute file into *ret. */
1657 int cg_get_attribute(const char *controller
, const char *path
, const char *attribute
, char **ret
) {
1658 _cleanup_free_
char *p
= NULL
;
1661 r
= cg_get_path(controller
, path
, attribute
, &p
);
1665 return read_one_line_file(p
, ret
);
/* Reads a cgroup attribute and parses it as uint64_t; the literal "max"
 * maps to CGROUP_LIMIT_MAX. */
1668 int cg_get_attribute_as_uint64(const char *controller
, const char *path
, const char *attribute
, uint64_t *ret
) {
1669 _cleanup_free_
char *value
= NULL
;
1675 r
= cg_get_attribute(controller
, path
, attribute
, &value
);
1681 if (streq(value
, "max")) {
1682 *ret
= CGROUP_LIMIT_MAX
;
1686 r
= safe_atou64(value
, &v
);
/* Reads a cgroup attribute and parses it as a boolean. */
1694 int cg_get_attribute_as_bool(const char *controller
, const char *path
, const char *attribute
, bool *ret
) {
1695 _cleanup_free_
char *value
= NULL
;
1700 r
= cg_get_attribute(controller
, path
, attribute
, &value
);
1706 r
= parse_boolean(value
);
/* Returns (via *ret_uid) the UID owning the cgroup directory, taken from
 * its stat() information. */
1714 int cg_get_owner(const char *controller
, const char *path
, uid_t
*ret_uid
) {
1715 _cleanup_free_
char *f
= NULL
;
1721 r
= cg_get_path(controller
, path
, NULL
, &f
);
1725 r
= stat(f
, &stats
);
1729 *ret_uid
= stats
.st_uid
;
/* Retrieves the values of multiple keys from a keyed cgroup v2 attribute
 * file (lines of "<key> <value...>"); see the comment block below for the
 * exact contract. NOTE(review): several statements are elided here. */
1733 int cg_get_keyed_attribute_full(
1734 const char *controller
,
1736 const char *attribute
,
1739 CGroupKeyMode mode
) {
1741 _cleanup_free_
char *filename
= NULL
, *contents
= NULL
;
1743 size_t n
, i
, n_done
= 0;
1747 /* Reads one or more fields of a cgroup v2 keyed attribute file. The 'keys' parameter should be an strv with
1748 * all keys to retrieve. The 'ret_values' parameter should be passed as string size with the same number of
1749 * entries as 'keys'. On success each entry will be set to the value of the matching key.
1751 * If the attribute file doesn't exist at all returns ENOENT, if any key is not found returns ENXIO. If mode
1752 * is set to GG_KEY_MODE_GRACEFUL we ignore missing keys and return those that were parsed successfully. */
1754 r
= cg_get_path(controller
, path
, attribute
, &filename
);
1758 r
= read_full_file(filename
, &contents
, NULL
);
1762 n
= strv_length(keys
);
1763 if (n
== 0) /* No keys to retrieve? That's easy, we are done then */
1766 /* Let's build this up in a temporary array for now in order not to clobber the return parameter on failure */
1767 v
= newa0(char*, n
);
1769 for (p
= contents
; *p
;) {
1770 const char *w
= NULL
;
/* Try every still-missing key against the current line. */
1772 for (i
= 0; i
< n
; i
++)
1774 w
= first_word(p
, keys
[i
]);
/* Key matched: the rest of the line (up to newline) is its value. */
1782 l
= strcspn(w
, NEWLINE
);
1783 v
[i
] = strndup(w
, l
);
/* Advance to the beginning of the next line. */
1795 p
+= strcspn(p
, NEWLINE
);
1797 p
+= strspn(p
, NEWLINE
);
1800 if (mode
& CG_KEY_MODE_GRACEFUL
)
1806 for (i
= 0; i
< n
; i
++)
/* All keys resolved (or graceful mode): publish the values. */
1812 memcpy(ret_values
, v
, sizeof(char*) * n
);
1813 if (mode
& CG_KEY_MODE_GRACEFUL
)
, char **ret
) {
1820 _cleanup_free_
char *s
= NULL
;
1821 size_t n
= 0, allocated
= 0;
1832 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
1836 if (!FLAGS_SET(mask
, CGROUP_CONTROLLER_TO_MASK(c
)))
1839 k
= cgroup_controller_to_string(c
);
1842 if (!GREEDY_REALLOC(s
, allocated
, n
+ space
+ l
+ 1))
1847 memcpy(s
+ n
+ space
, k
, l
);
1861 int cg_mask_from_string(const char *value
, CGroupMask
*ret
) {
1868 _cleanup_free_
char *n
= NULL
;
1872 r
= extract_first_word(&value
, &n
, NULL
, 0);
1878 v
= cgroup_controller_from_string(n
);
1882 m
|= CGROUP_CONTROLLER_TO_MASK(v
);
/* Computes the mask of real controllers usable below the given root: on
 * the unified hierarchy from the root's cgroup.controllers attribute, on
 * legacy by probing each v1 hierarchy for accessibility. */
1889 int cg_mask_supported_subtree(const char *root
, CGroupMask
*ret
) {
1893 /* Determines the mask of supported cgroup controllers. Only includes controllers we can make sense of and that
1894 * are actually accessible. Only covers real controllers, i.e. not the CGROUP_CONTROLLER_BPF_xyz
1895 * pseudo-controllers. */
1897 r
= cg_all_unified();
1901 _cleanup_free_
char *controllers
= NULL
, *path
= NULL
;
1903 /* In the unified hierarchy we can read the supported and accessible controllers from
1904 * the top-level cgroup attribute */
1906 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, root
, "cgroup.controllers", &path
);
1910 r
= read_one_line_file(path
, &controllers
);
1914 r
= cg_mask_from_string(controllers
, &mask
);
1918 /* Mask controllers that are not supported in unified hierarchy. */
1919 mask
&= CGROUP_MASK_V2
;
1924 /* In the legacy hierarchy, we check which hierarchies are accessible. */
1927 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
1928 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
1931 if (!FLAGS_SET(CGROUP_MASK_V1
, bit
))
1934 n
= cgroup_controller_to_string(c
);
1935 if (controller_is_v1_accessible(root
, n
) >= 0)
/* Convenience wrapper: supported-controller mask below our own cgroup
 * root as determined by cg_get_root_path(). */
1944 int cg_mask_supported(CGroupMask
*ret
) {
1945 _cleanup_free_
char *root
= NULL
;
1948 r
= cg_get_root_path(&root
);
1952 return cg_mask_supported_subtree(root
, ret
);
/* Builds the set of controller names the running kernel knows about, by
 * parsing /proc/cgroups; ownership of the set passes to the caller. */
1955 int cg_kernel_controllers(Set
**ret
) {
1956 _cleanup_set_free_free_ Set
*controllers
= NULL
;
1957 _cleanup_fclose_
FILE *f
= NULL
;
1962 /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support
1963 * and controllers that aren't currently accessible (because not mounted). This does not include "name="
1964 * pseudo-controllers. */
1966 controllers
= set_new(&string_hash_ops
);
1970 r
= fopen_unlocked("/proc/cgroups", "re", &f
);
1978 /* Ignore the header line */
1979 (void) read_line(f
, SIZE_MAX
, NULL
);
/* Each remaining line: "<name> <hierarchy> <num_cgroups> <enabled>". */
1986 if (fscanf(f
, "%ms %*i %*i %i", &controller
, &enabled
) != 2) {
1992 return errno_or_else(EIO
);
2002 if (!cg_controller_is_valid(controller
)) {
/* set_consume() takes ownership of the scanned string on success. */
2007 r
= set_consume(controllers
, controller
);
2012 *ret
= TAKE_PTR(controllers
);
2017 /* The hybrid mode was initially implemented in v232 and simply mounted cgroup2 on
2018 * /sys/fs/cgroup/systemd. This unfortunately broke other tools (such as docker) which expected the v1
2019 * "name=systemd" hierarchy on /sys/fs/cgroup/systemd. From v233 and on, the hybrid mode mounts v2 on
2020 * /sys/fs/cgroup/unified and maintains "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility
2023 * To keep live upgrade working, we detect and support v232 layout. When v232 layout is detected, to keep
2024 * cgroup v2 process management but disable the compat dual layout, we return true on
2025 * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and false on cg_hybrid_unified().
/* Per-thread flag: set by cg_unified_cached() when the v232 hybrid layout
 * (cgroup2 directly on /sys/fs/cgroup/systemd) was detected. */
2027 static thread_local
bool unified_systemd_v232
;
/* Detects which cgroup setup the system runs (full unified, hybrid v233+,
 * hybrid v232, or pure legacy) by probing the filesystem types mounted at
 * /sys/fs/cgroup and its subdirectories. The verdict is cached per thread;
 * pass flush=true to force re-detection. */
2029 int cg_unified_cached(bool flush
) {
2030 static thread_local CGroupUnified unified_cache
= CGROUP_UNIFIED_UNKNOWN
;
2034 /* Checks if we support the unified hierarchy. Returns an
2035 * error when the cgroup hierarchies aren't mounted yet or we
2036 * have any other trouble determining if the unified hierarchy
2040 unified_cache
= CGROUP_UNIFIED_UNKNOWN
;
2041 else if (unified_cache
>= CGROUP_UNIFIED_NONE
)
2042 return unified_cache
;
2044 if (statfs("/sys/fs/cgroup/", &fs
) < 0)
2045 return log_debug_errno(errno
, "statfs(\"/sys/fs/cgroup/\") failed: %m");
/* cgroup2 mounted directly on /sys/fs/cgroup: fully unified. */
2047 if (F_TYPE_EQUAL(fs
.f_type
, CGROUP2_SUPER_MAGIC
)) {
2048 log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
2049 unified_cache
= CGROUP_UNIFIED_ALL
;
/* tmpfs on /sys/fs/cgroup: hybrid or legacy; inspect subdirectories. */
2050 } else if (F_TYPE_EQUAL(fs
.f_type
, TMPFS_MAGIC
)) {
2051 if (statfs("/sys/fs/cgroup/unified/", &fs
) == 0 &&
2052 F_TYPE_EQUAL(fs
.f_type
, CGROUP2_SUPER_MAGIC
)) {
2053 log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller");
2054 unified_cache
= CGROUP_UNIFIED_SYSTEMD
;
2055 unified_systemd_v232
= false;
2057 if (statfs("/sys/fs/cgroup/systemd/", &fs
) < 0)
2058 return log_debug_errno(errno
, "statfs(\"/sys/fs/cgroup/systemd\" failed: %m");
/* v232-style hybrid: cgroup2 mounted straight on .../systemd. */
2060 if (F_TYPE_EQUAL(fs
.f_type
, CGROUP2_SUPER_MAGIC
)) {
2061 log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2062 unified_cache
= CGROUP_UNIFIED_SYSTEMD
;
2063 unified_systemd_v232
= true;
2064 } else if (F_TYPE_EQUAL(fs
.f_type
, CGROUP_SUPER_MAGIC
)) {
2065 log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2066 unified_cache
= CGROUP_UNIFIED_NONE
;
2068 log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
2069 (unsigned long long) fs
.f_type
);
2070 unified_cache
= CGROUP_UNIFIED_NONE
;
/* Bare sysfs means nothing is mounted on /sys/fs/cgroup at all. */
2073 } else if (F_TYPE_EQUAL(fs
.f_type
, SYSFS_MAGIC
)) {
2074 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM
),
2075 "No filesystem is currently mounted on /sys/fs/cgroup.");
2077 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM
),
2078 "Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
2079 (unsigned long long)fs
.f_type
);
2081 return unified_cache
;
/* Returns whether the given controller is managed on the v2 (unified)
 * hierarchy under the current setup. */
2084 int cg_unified_controller(const char *controller
) {
2087 r
= cg_unified_cached(false);
2091 if (r
== CGROUP_UNIFIED_NONE
)
2094 if (r
>= CGROUP_UNIFIED_ALL
)
/* Hybrid mode: only the systemd controller itself is on v2. */
2097 return streq_ptr(controller
, SYSTEMD_CGROUP_CONTROLLER
);
/* Returns whether all controllers run on the unified hierarchy. */
2100 int cg_all_unified(void) {
2103 r
= cg_unified_cached(false);
2107 return r
>= CGROUP_UNIFIED_ALL
;
/* Returns whether we run the (v233+) hybrid layout, i.e. systemd on v2
 * with a "name=systemd" v1 compat hierarchy alongside. */
2110 int cg_hybrid_unified(void) {
2113 r
= cg_unified_cached(false);
2117 return r
== CGROUP_UNIFIED_SYSTEMD
&& !unified_systemd_v232
;
/* Default value for every IO limit type: unlimited. */
2120 const uint64_t cgroup_io_limit_defaults
[_CGROUP_IO_LIMIT_TYPE_MAX
] = {
2121 [CGROUP_IO_RBPS_MAX
] = CGROUP_LIMIT_MAX
,
2122 [CGROUP_IO_WBPS_MAX
] = CGROUP_LIMIT_MAX
,
2123 [CGROUP_IO_RIOPS_MAX
] = CGROUP_LIMIT_MAX
,
2124 [CGROUP_IO_WIOPS_MAX
] = CGROUP_LIMIT_MAX
,
/* Human-readable (unit-file property) names of the IO limit types. */
2127 static const char* const cgroup_io_limit_type_table
[_CGROUP_IO_LIMIT_TYPE_MAX
] = {
2128 [CGROUP_IO_RBPS_MAX
] = "IOReadBandwidthMax",
2129 [CGROUP_IO_WBPS_MAX
] = "IOWriteBandwidthMax",
2130 [CGROUP_IO_RIOPS_MAX
] = "IOReadIOPSMax",
2131 [CGROUP_IO_WIOPS_MAX
] = "IOWriteIOPSMax",
2134 DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type
, CGroupIOLimitType
);
/* Returns whether the statfs result describes a cgroup (v1 or v2) mount. */
2136 bool is_cgroup_fs(const struct statfs
*s
) {
2137 return is_fs_type(s
, CGROUP_SUPER_MAGIC
) ||
2138 is_fs_type(s
, CGROUP2_SUPER_MAGIC
);
/* Returns whether the filesystem backing the given fd is a cgroup fs;
 * fstatfs() failure is treated as "no". */
2141 bool fd_is_cgroup_fs(int fd
) {
2144 if (fstatfs(fd
, &s
) < 0)
2147 return is_cgroup_fs(&s
);
/* String names for all known controllers (including the BPF
 * pseudo-controllers), used by the generated lookup functions below. */
2150 static const char *const cgroup_controller_table
[_CGROUP_CONTROLLER_MAX
] = {
2151 [CGROUP_CONTROLLER_CPU
] = "cpu",
2152 [CGROUP_CONTROLLER_CPUACCT
] = "cpuacct",
2153 [CGROUP_CONTROLLER_CPUSET
] = "cpuset",
2154 [CGROUP_CONTROLLER_IO
] = "io",
2155 [CGROUP_CONTROLLER_BLKIO
] = "blkio",
2156 [CGROUP_CONTROLLER_MEMORY
] = "memory",
2157 [CGROUP_CONTROLLER_DEVICES
] = "devices",
2158 [CGROUP_CONTROLLER_PIDS
] = "pids",
2159 [CGROUP_CONTROLLER_BPF_FIREWALL
] = "bpf-firewall",
2160 [CGROUP_CONTROLLER_BPF_DEVICES
] = "bpf-devices",
2163 DEFINE_STRING_TABLE_LOOKUP(cgroup_controller
, CGroupController
);
/* Returns the controller mask needed for CPU accounting on this system
 * (possibly empty); computed once and cached — see the table below. */
2165 CGroupMask
get_cpu_accounting_mask(void) {
2166 static CGroupMask needed_mask
= (CGroupMask
) -1;
2168 /* On kernel ≥4.15 with unified hierarchy, cpu.stat's usage_usec is
2169 * provided externally from the CPU controller, which means we don't
2170 * need to enable the CPU controller just to get metrics. This is good,
2171 * because enabling the CPU controller comes at a minor performance
2172 * hit, especially when it's propagated deep into large hierarchies.
2173 * There's also no separate CPU accounting controller available within
2174 * a unified hierarchy.
2176 * This combination of factors results in the desired cgroup mask to
2177 * enable for CPU accounting varying as follows:
2179 * ╔═════════════════════╤═════════════════════╗
2180 * ║ Linux ≥4.15 │ Linux <4.15 ║
2181 * ╔═══════════════╬═════════════════════╪═════════════════════╣
2182 * ║ Unified ║ nothing │ CGROUP_MASK_CPU ║
2183 * ╟───────────────╫─────────────────────┼─────────────────────╢
2184 * ║ Hybrid/Legacy ║ CGROUP_MASK_CPUACCT │ CGROUP_MASK_CPUACCT ║
2185 * ╚═══════════════╩═════════════════════╧═════════════════════╝
2187 * We check kernel version here instead of manually checking whether
2188 * cpu.stat is present for every cgroup, as that check in itself would
2189 * already be fairly expensive.
2191 * Kernels where this patch has been backported will therefore have the
2192 * CPU controller enabled unnecessarily. This is more expensive than
2193 * necessary, but harmless. ☺️
2196 if (needed_mask
== (CGroupMask
) -1) {
2197 if (cg_all_unified()) {
2199 assert_se(uname(&u
) >= 0);
2201 if (strverscmp_improved(u
.release
, "4.15") < 0)
2202 needed_mask
= CGROUP_MASK_CPU
;
2206 needed_mask
= CGROUP_MASK_CPUACCT
;
/* CPU accounting is "cheap" when no extra controller must be enabled. */
2212 bool cpu_accounting_is_cheap(void) {
2213 return get_cpu_accounting_mask() == 0;
/* String names of the ManagedOOM operation modes. */
2216 static const char* const managed_oom_mode_table
[_MANAGED_OOM_MODE_MAX
] = {
2217 [MANAGED_OOM_AUTO
] = "auto",
2218 [MANAGED_OOM_KILL
] = "kill",
2221 DEFINE_STRING_TABLE_LOOKUP(managed_oom_mode
, ManagedOOMMode
);
/* String names of the ManagedOOM kill-preference values. */
2223 static const char* const managed_oom_preference_table
[_MANAGED_OOM_PREFERENCE_MAX
] = {
2224 [MANAGED_OOM_PREFERENCE_NONE
] = "none",
2225 [MANAGED_OOM_PREFERENCE_AVOID
] = "avoid",
2226 [MANAGED_OOM_PREFERENCE_OMIT
] = "omit",
2229 DEFINE_STRING_TABLE_LOOKUP(managed_oom_preference
, ManagedOOMPreference
);