1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
9 #include <sys/utsname.h>
10 #include <sys/xattr.h>
13 #include "alloc-util.h"
14 #include "cgroup-util.h"
15 #include "constants.h"
16 #include "dirent-util.h"
17 #include "extract-word.h"
20 #include "format-util.h"
23 #include "login-util.h"
25 #include "missing_fs.h"
26 #include "missing_magic.h"
27 #include "missing_threads.h"
29 #include "parse-util.h"
30 #include "path-util.h"
31 #include "process-util.h"
34 #include "stat-util.h"
35 #include "stdio-util.h"
36 #include "string-table.h"
37 #include "string-util.h"
39 #include "unit-name.h"
40 #include "user-util.h"
41 #include "xattr-util.h"
43 int cg_path_open(const char *controller
, const char *path
) {
44 _cleanup_free_
char *fs
= NULL
;
47 r
= cg_get_path(controller
, path
, /* item=*/ NULL
, &fs
);
51 return RET_NERRNO(open(fs
, O_DIRECTORY
|O_CLOEXEC
));
54 int cg_cgroupid_open(int cgroupfs_fd
, uint64_t id
) {
55 _cleanup_close_
int fsfd
= -EBADF
;
57 if (cgroupfs_fd
< 0) {
58 fsfd
= open("/sys/fs/cgroup", O_CLOEXEC
|O_DIRECTORY
);
65 cg_file_handle fh
= CG_FILE_HANDLE_INIT
;
66 CG_FILE_HANDLE_CGROUPID(fh
) = id
;
68 int fd
= open_by_handle_at(cgroupfs_fd
, &fh
.file_handle
, O_DIRECTORY
|O_CLOEXEC
);
75 static int cg_enumerate_items(const char *controller
, const char *path
, FILE **ret
, const char *item
) {
76 _cleanup_free_
char *fs
= NULL
;
82 r
= cg_get_path(controller
, path
, item
, &fs
);
int cg_enumerate_processes(const char *controller, const char *path, FILE **ret) {
        /* Opens the "cgroup.procs" attribute of the specified cgroup for reading, so that the
         * caller can iterate through the PIDs contained in it (e.g. via cg_read_pid()). */
        return cg_enumerate_items(controller, path, ret, "cgroup.procs");
}
98 int cg_read_pid(FILE *f
, pid_t
*ret
, CGroupFlags flags
) {
101 /* Note that the cgroup.procs might contain duplicates! See cgroups.txt for details. */
108 if (fscanf(f
, "%lu", &ul
) != 1) {
115 return errno_or_else(EIO
);
121 /* In some circumstances (e.g. WSL), cgroups might contain unmappable PIDs from other
122 * contexts. These show up as zeros, and depending on the caller, can either be plain
123 * skipped over, or returned as-is. */
124 if (ul
== 0 && !FLAGS_SET(flags
, CGROUP_DONT_SKIP_UNMAPPED
))
132 int cg_read_pidref(FILE *f
, PidRef
*ret
, CGroupFlags flags
) {
141 r
= cg_read_pid(f
, &pid
, flags
);
143 return log_debug_errno(r
, "Failed to read pid from cgroup item: %m");
152 if (FLAGS_SET(flags
, CGROUP_NO_PIDFD
)) {
153 *ret
= PIDREF_MAKE_FROM_PID(pid
);
157 r
= pidref_set_pid(ret
, pid
);
163 /* ESRCH → gone by now? just skip over it, read the next */
168 const char *controller
,
173 _cleanup_free_
char *events
= NULL
, *content
= NULL
;
176 r
= cg_get_path(controller
, path
, "cgroup.events", &events
);
180 r
= read_full_virtual_file(events
, &content
, NULL
);
184 for (const char *p
= content
;;) {
185 _cleanup_free_
char *line
= NULL
, *key
= NULL
;
188 r
= extract_first_word(&p
, &line
, "\n", 0);
195 r
= extract_first_word(&q
, &key
, " ", 0);
201 if (!streq(key
, event
))
204 return strdup_to(ret
, q
);
208 bool cg_ns_supported(void) {
209 static thread_local
int enabled
= -1;
214 if (access("/proc/self/ns/cgroup", F_OK
) < 0) {
216 log_debug_errno(errno
, "Failed to check whether /proc/self/ns/cgroup is available, assuming not: %m");
224 bool cg_freezer_supported(void) {
225 static thread_local
int supported
= -1;
230 supported
= cg_all_unified() > 0 && access("/sys/fs/cgroup/init.scope/cgroup.freeze", F_OK
) == 0;
235 bool cg_kill_supported(void) {
236 static thread_local
int supported
= -1;
241 if (cg_all_unified() <= 0)
243 else if (access("/sys/fs/cgroup/init.scope/cgroup.kill", F_OK
) < 0) {
245 log_debug_errno(errno
, "Failed to check if cgroup.kill is available, assuming not: %m");
253 int cg_enumerate_subgroups(const char *controller
, const char *path
, DIR **ret
) {
254 _cleanup_free_
char *fs
= NULL
;
260 /* This is not recursive! */
262 r
= cg_get_path(controller
, path
, NULL
, &fs
);
274 int cg_read_subgroup(DIR *d
, char **ret
) {
278 FOREACH_DIRENT_ALL(de
, d
, return -errno
) {
279 if (de
->d_type
!= DT_DIR
)
282 if (dot_or_dot_dot(de
->d_name
))
285 return strdup_to_full(ret
, de
->d_name
);
292 int cg_rmdir(const char *controller
, const char *path
) {
293 _cleanup_free_
char *p
= NULL
;
296 r
= cg_get_path(controller
, path
, NULL
, &p
);
301 if (r
< 0 && errno
!= ENOENT
)
304 r
= cg_hybrid_unified();
308 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
309 r
= cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
);
311 log_warning_errno(r
, "Failed to remove compat systemd cgroup %s: %m", path
);
317 static int cg_kill_items(
322 cg_kill_log_func_t log_kill
,
326 _cleanup_set_free_ Set
*allocated_set
= NULL
;
328 int r
, ret
= 0, ret_log_kill
= 0;
332 /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
333 * SIGCONT on SIGKILL. */
334 if (IN_SET(sig
, SIGCONT
, SIGKILL
))
335 flags
&= ~CGROUP_SIGCONT
;
337 /* This goes through the tasks list and kills them all. This
338 * is repeated until no further processes are added to the
339 * tasks list, to properly handle forking processes */
342 s
= allocated_set
= set_new(NULL
);
348 _cleanup_fclose_
FILE *f
= NULL
;
351 r
= cg_enumerate_items(SYSTEMD_CGROUP_CONTROLLER
, path
, &f
, item
);
355 return RET_GATHER(ret
, log_debug_errno(r
, "Failed to enumerate cgroup items: %m"));
358 _cleanup_(pidref_done
) PidRef pidref
= PIDREF_NULL
;
360 r
= cg_read_pidref(f
, &pidref
, flags
);
362 return RET_GATHER(ret
, log_debug_errno(r
, "Failed to read pidref from cgroup '%s': %m", path
));
366 if ((flags
& CGROUP_IGNORE_SELF
) && pidref_is_self(&pidref
))
369 if (set_get(s
, PID_TO_PTR(pidref
.pid
)) == PID_TO_PTR(pidref
.pid
))
373 ret_log_kill
= log_kill(&pidref
, sig
, userdata
);
375 /* If we haven't killed this process yet, kill it */
376 r
= pidref_kill(&pidref
, sig
);
377 if (r
< 0 && r
!= -ESRCH
)
378 RET_GATHER(ret
, log_debug_errno(r
, "Failed to kill process with pid " PID_FMT
" from cgroup '%s': %m", pidref
.pid
, path
));
380 if (flags
& CGROUP_SIGCONT
)
381 (void) pidref_kill(&pidref
, SIGCONT
);
393 r
= set_put(s
, PID_TO_PTR(pidref
.pid
));
395 return RET_GATHER(ret
, r
);
398 /* To avoid racing against processes which fork quicker than we can kill them, we repeat this
399 * until no new pids need to be killed. */
411 cg_kill_log_func_t log_kill
,
416 r
= cg_kill_items(path
, sig
, flags
, s
, log_kill
, userdata
, "cgroup.procs");
418 log_debug_errno(r
, "Failed to kill processes in cgroup '%s' item cgroup.procs: %m", path
);
419 if (r
< 0 || sig
!= SIGKILL
)
424 /* Only in case of killing with SIGKILL and when using cgroupsv2, kill remaining threads manually as
425 a workaround for kernel bug. It was fixed in 5.2-rc5 (c03cd7738a83), backported to 4.19.66
426 (4340d175b898) and 4.14.138 (feb6b123b7dd). */
427 r
= cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
);
433 /* Opening pidfds for non thread group leaders only works from 6.9 onwards with PIDFD_THREAD. On
434 * older kernels or without PIDFD_THREAD pidfd_open() fails with EINVAL. Since we might read non
435 * thread group leader IDs from cgroup.threads, we set CGROUP_NO_PIDFD to avoid trying open pidfd's
436 * for them and instead use the regular pid. */
437 r
= cg_kill_items(path
, sig
, flags
|CGROUP_NO_PIDFD
, s
, log_kill
, userdata
, "cgroup.threads");
439 return log_debug_errno(r
, "Failed to kill processes in cgroup '%s' item cgroup.threads: %m", path
);
441 return r
> 0 || ret
> 0;
444 int cg_kill_kernel_sigkill(const char *path
) {
445 /* Kills the cgroup at `path` directly by writing to its cgroup.kill file. This sends SIGKILL to all
446 * processes in the cgroup and has the advantage of being completely atomic, unlike cg_kill_items(). */
448 _cleanup_free_
char *killfile
= NULL
;
453 if (!cg_kill_supported())
456 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, "cgroup.kill", &killfile
);
460 r
= write_string_file(killfile
, "1", WRITE_STRING_FILE_DISABLE_BUFFER
);
462 return log_debug_errno(r
, "Failed to write to cgroup.kill for cgroup '%s': %m", path
);
467 int cg_kill_recursive(
472 cg_kill_log_func_t log_kill
,
480 if (sig
== SIGKILL
&& cg_kill_supported() &&
481 !FLAGS_SET(flags
, CGROUP_IGNORE_SELF
) && !s
&& !log_kill
)
482 /* ignore CGROUP_SIGCONT, since this is a no-op alongside SIGKILL */
483 ret
= cg_kill_kernel_sigkill(path
);
485 _cleanup_set_free_ Set
*allocated_set
= NULL
;
486 _cleanup_closedir_
DIR *d
= NULL
;
489 s
= allocated_set
= set_new(NULL
);
494 ret
= cg_kill(path
, sig
, flags
, s
, log_kill
, userdata
);
496 r
= cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER
, path
, &d
);
499 RET_GATHER(ret
, log_debug_errno(r
, "Failed to enumerate cgroup '%s' subgroups: %m", path
));
505 _cleanup_free_
char *fn
= NULL
, *p
= NULL
;
507 r
= cg_read_subgroup(d
, &fn
);
509 RET_GATHER(ret
, log_debug_errno(r
, "Failed to read subgroup from cgroup '%s': %m", path
));
515 p
= path_join(empty_to_root(path
), fn
);
519 r
= cg_kill_recursive(p
, sig
, flags
, s
, log_kill
, userdata
);
521 log_debug_errno(r
, "Failed to recursively kill processes in cgroup '%s': %m", p
);
522 if (r
!= 0 && ret
>= 0)
527 if (FLAGS_SET(flags
, CGROUP_REMOVE
)) {
528 r
= cg_rmdir(SYSTEMD_CGROUP_CONTROLLER
, path
);
529 if (!IN_SET(r
, -ENOENT
, -EBUSY
))
530 RET_GATHER(ret
, log_debug_errno(r
, "Failed to remove cgroup '%s': %m", path
));
536 static const char *controller_to_dirname(const char *controller
) {
539 /* Converts a controller name to the directory name below /sys/fs/cgroup/ we want to mount it
540 * to. Effectively, this just cuts off the name= prefixed used for named hierarchies, if it is
543 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
544 if (cg_hybrid_unified() > 0)
545 controller
= SYSTEMD_CGROUP_CONTROLLER_HYBRID
;
547 controller
= SYSTEMD_CGROUP_CONTROLLER_LEGACY
;
550 return startswith(controller
, "name=") ?: controller
;
static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **ret) {
        const char *dn;
        char *t;

        /* Builds a legacy (cgroup v1) filesystem path of the form
         * /sys/fs/cgroup/<controller-dir>[/<path>][/<suffix>], with empty path/suffix components
         * omitted. Returns 0 on success, -ENOMEM on allocation failure. */

        assert(controller);
        assert(ret);

        dn = controller_to_dirname(controller);

        if (isempty(path) && isempty(suffix))
                t = path_join("/sys/fs/cgroup", dn);
        else if (isempty(path))
                t = path_join("/sys/fs/cgroup", dn, suffix);
        else if (isempty(suffix))
                t = path_join("/sys/fs/cgroup", dn, path);
        else
                t = path_join("/sys/fs/cgroup", dn, path, suffix);
        if (!t)
                return -ENOMEM;

        *ret = t;
        return 0;
}
static int join_path_unified(const char *path, const char *suffix, char **ret) {
        char *t;

        /* Builds a unified (cgroup v2) filesystem path of the form
         * /sys/fs/cgroup[/<path>][/<suffix>], with empty components omitted. Returns 0 on
         * success, -ENOMEM on allocation failure. */

        assert(ret);

        if (isempty(path) && isempty(suffix))
                t = strdup("/sys/fs/cgroup");
        else if (isempty(path))
                t = path_join("/sys/fs/cgroup", suffix);
        else if (isempty(suffix))
                t = path_join("/sys/fs/cgroup", path);
        else
                t = path_join("/sys/fs/cgroup", path, suffix);
        if (!t)
                return -ENOMEM;

        *ret = t;
        return 0;
}
597 int cg_get_path(const char *controller
, const char *path
, const char *suffix
, char **ret
) {
605 /* If no controller is specified, we return the path *below* the controllers, without any
608 if (isempty(path
) && isempty(suffix
))
613 else if (isempty(path
))
616 t
= path_join(path
, suffix
);
620 *ret
= path_simplify(t
);
624 if (!cg_controller_is_valid(controller
))
627 r
= cg_all_unified();
631 r
= join_path_unified(path
, suffix
, ret
);
633 r
= join_path_legacy(controller
, path
, suffix
, ret
);
641 static int controller_is_v1_accessible(const char *root
, const char *controller
) {
642 const char *cpath
, *dn
;
646 dn
= controller_to_dirname(controller
);
648 /* If root if specified, we check that:
649 * - possible subcgroup is created at root,
650 * - we can modify the hierarchy. */
652 cpath
= strjoina("/sys/fs/cgroup/", dn
, root
, root
? "/cgroup.procs" : NULL
);
653 return laccess(cpath
, root
? W_OK
: F_OK
);
656 int cg_get_path_and_check(const char *controller
, const char *path
, const char *suffix
, char **ret
) {
662 if (!cg_controller_is_valid(controller
))
665 r
= cg_all_unified();
669 /* In the unified hierarchy all controllers are considered accessible,
670 * except for the named hierarchies */
671 if (startswith(controller
, "name="))
674 /* Check if the specified controller is actually accessible */
675 r
= controller_is_v1_accessible(NULL
, controller
);
680 return cg_get_path(controller
, path
, suffix
, ret
);
683 int cg_set_xattr(const char *path
, const char *name
, const void *value
, size_t size
, int flags
) {
684 _cleanup_free_
char *fs
= NULL
;
689 assert(value
|| size
<= 0);
691 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
695 return RET_NERRNO(setxattr(fs
, name
, value
, size
, flags
));
698 int cg_get_xattr(const char *path
, const char *name
, void *value
, size_t size
) {
699 _cleanup_free_
char *fs
= NULL
;
706 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
710 n
= getxattr(fs
, name
, value
, size
);
717 int cg_get_xattr_malloc(const char *path
, const char *name
, char **ret
) {
718 _cleanup_free_
char *fs
= NULL
;
724 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
728 return lgetxattr_malloc(fs
, name
, ret
);
731 int cg_get_xattr_bool(const char *path
, const char *name
) {
732 _cleanup_free_
char *fs
= NULL
;
738 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
742 return getxattr_at_bool(AT_FDCWD
, fs
, name
, /* flags= */ 0);
745 int cg_remove_xattr(const char *path
, const char *name
) {
746 _cleanup_free_
char *fs
= NULL
;
752 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
756 return RET_NERRNO(removexattr(fs
, name
));
759 int cg_pid_get_path(const char *controller
, pid_t pid
, char **ret_path
) {
760 _cleanup_fclose_
FILE *f
= NULL
;
761 const char *fs
, *controller_str
= NULL
; /* avoid false maybe-uninitialized warning */
768 if (!cg_controller_is_valid(controller
))
771 controller
= SYSTEMD_CGROUP_CONTROLLER
;
773 unified
= cg_unified_controller(controller
);
777 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
))
778 controller_str
= SYSTEMD_CGROUP_CONTROLLER_LEGACY
;
780 controller_str
= controller
;
783 fs
= procfs_file_alloca(pid
, "cgroup");
784 r
= fopen_unlocked(fs
, "re", &f
);
791 _cleanup_free_
char *line
= NULL
;
794 r
= read_line(f
, LONG_LINE_MAX
, &line
);
801 e
= startswith(line
, "0:");
811 l
= strchr(line
, ':');
821 assert(controller_str
);
822 r
= string_contains_word(l
, ",", controller_str
);
829 char *path
= strdup(e
+ 1);
833 /* Truncate suffix indicating the process is a zombie */
834 e
= endswith(path
, " (deleted)");
843 int cg_pidref_get_path(const char *controller
, const PidRef
*pidref
, char **ret_path
) {
844 _cleanup_free_
char *path
= NULL
;
849 if (!pidref_is_set(pidref
))
852 r
= cg_pid_get_path(controller
, pidref
->pid
, &path
);
856 /* Before we return the path, make sure the procfs entry for this pid still matches the pidref */
857 r
= pidref_verify(pidref
);
861 *ret_path
= TAKE_PTR(path
);
865 int cg_install_release_agent(const char *controller
, const char *agent
) {
866 _cleanup_free_
char *fs
= NULL
, *contents
= NULL
;
872 r
= cg_unified_controller(controller
);
875 if (r
> 0) /* doesn't apply to unified hierarchy */
878 r
= cg_get_path(controller
, NULL
, "release_agent", &fs
);
882 r
= read_one_line_file(fs
, &contents
);
886 sc
= strstrip(contents
);
888 r
= write_string_file(fs
, agent
, WRITE_STRING_FILE_DISABLE_BUFFER
);
891 } else if (!path_equal(sc
, agent
))
895 r
= cg_get_path(controller
, NULL
, "notify_on_release", &fs
);
899 contents
= mfree(contents
);
900 r
= read_one_line_file(fs
, &contents
);
904 sc
= strstrip(contents
);
905 if (streq(sc
, "0")) {
906 r
= write_string_file(fs
, "1", WRITE_STRING_FILE_DISABLE_BUFFER
);
919 int cg_uninstall_release_agent(const char *controller
) {
920 _cleanup_free_
char *fs
= NULL
;
923 r
= cg_unified_controller(controller
);
926 if (r
> 0) /* Doesn't apply to unified hierarchy */
929 r
= cg_get_path(controller
, NULL
, "notify_on_release", &fs
);
933 r
= write_string_file(fs
, "0", WRITE_STRING_FILE_DISABLE_BUFFER
);
939 r
= cg_get_path(controller
, NULL
, "release_agent", &fs
);
943 r
= write_string_file(fs
, "", WRITE_STRING_FILE_DISABLE_BUFFER
);
950 int cg_is_empty(const char *controller
, const char *path
) {
951 _cleanup_fclose_
FILE *f
= NULL
;
957 r
= cg_enumerate_processes(controller
, path
, &f
);
963 r
= cg_read_pid(f
, &pid
, CGROUP_DONT_SKIP_UNMAPPED
);
970 int cg_is_empty_recursive(const char *controller
, const char *path
) {
975 /* The root cgroup is always populated */
976 if (controller
&& empty_or_root(path
))
979 r
= cg_unified_controller(controller
);
983 _cleanup_free_
char *t
= NULL
;
985 /* On the unified hierarchy we can check empty state
986 * via the "populated" attribute of "cgroup.events". */
988 r
= cg_read_event(controller
, path
, "populated", &t
);
994 return streq(t
, "0");
996 _cleanup_closedir_
DIR *d
= NULL
;
999 r
= cg_is_empty(controller
, path
);
1003 r
= cg_enumerate_subgroups(controller
, path
, &d
);
1009 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
1010 _cleanup_free_
char *p
= NULL
;
1012 p
= path_join(path
, fn
);
1017 r
= cg_is_empty_recursive(controller
, p
);
1028 int cg_split_spec(const char *spec
, char **ret_controller
, char **ret_path
) {
1029 _cleanup_free_
char *controller
= NULL
, *path
= NULL
;
1035 if (!path_is_normalized(spec
))
1039 r
= path_simplify_alloc(spec
, &path
);
1047 e
= strchr(spec
, ':');
1049 controller
= strndup(spec
, e
-spec
);
1052 if (!cg_controller_is_valid(controller
))
1055 if (!isempty(e
+ 1)) {
1060 if (!path_is_normalized(path
) ||
1061 !path_is_absolute(path
))
1064 path_simplify(path
);
1068 if (!cg_controller_is_valid(spec
))
1071 if (ret_controller
) {
1072 controller
= strdup(spec
);
1080 *ret_controller
= TAKE_PTR(controller
);
1082 *ret_path
= TAKE_PTR(path
);
1086 int cg_mangle_path(const char *path
, char **ret
) {
1087 _cleanup_free_
char *c
= NULL
, *p
= NULL
;
1093 /* First, check if it already is a filesystem path */
1094 if (path_startswith(path
, "/sys/fs/cgroup"))
1095 return path_simplify_alloc(path
, ret
);
1097 /* Otherwise, treat it as cg spec */
1098 r
= cg_split_spec(path
, &c
, &p
);
1102 return cg_get_path(c
?: SYSTEMD_CGROUP_CONTROLLER
, p
?: "/", NULL
, ret
);
1105 int cg_get_root_path(char **ret_path
) {
1111 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 1, &p
);
1115 e
= endswith(p
, "/" SPECIAL_INIT_SCOPE
);
1117 e
= endswith(p
, "/" SPECIAL_SYSTEM_SLICE
); /* legacy */
1119 e
= endswith(p
, "/system"); /* even more legacy */
1127 int cg_shift_path(const char *cgroup
, const char *root
, const char **ret_shifted
) {
1128 _cleanup_free_
char *rt
= NULL
;
1133 assert(ret_shifted
);
1136 /* If the root was specified let's use that, otherwise
1137 * let's determine it from PID 1 */
1139 r
= cg_get_root_path(&rt
);
1146 p
= path_startswith(cgroup
, root
);
1147 if (p
&& p
> cgroup
)
1148 *ret_shifted
= p
- 1;
1150 *ret_shifted
= cgroup
;
1155 int cg_pid_get_path_shifted(pid_t pid
, const char *root
, char **ret_cgroup
) {
1156 _cleanup_free_
char *raw
= NULL
;
1163 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &raw
);
1167 r
= cg_shift_path(raw
, root
, &c
);
1172 *ret_cgroup
= TAKE_PTR(raw
);
1176 return strdup_to(ret_cgroup
, c
);
1179 int cg_path_decode_unit(const char *cgroup
, char **ret_unit
) {
1183 size_t n
= strcspn(cgroup
, "/");
1187 char *c
= strndupa_safe(cgroup
, n
);
1190 if (!unit_name_is_valid(c
, UNIT_NAME_PLAIN
|UNIT_NAME_INSTANCE
))
1193 return strdup_to(ret_unit
, c
);
1196 static bool valid_slice_name(const char *p
, size_t n
) {
1201 if (n
< STRLEN("x.slice"))
1204 if (memcmp(p
+ n
- 6, ".slice", 6) == 0) {
1210 c
= cg_unescape(buf
);
1212 return unit_name_is_valid(c
, UNIT_NAME_PLAIN
);
1218 static const char *skip_slices(const char *p
) {
1221 /* Skips over all slice assignments */
1226 p
+= strspn(p
, "/");
1228 n
= strcspn(p
, "/");
1229 if (!valid_slice_name(p
, n
))
1236 int cg_path_get_unit(const char *path
, char **ret
) {
1237 _cleanup_free_
char *unit
= NULL
;
1244 e
= skip_slices(path
);
1246 r
= cg_path_decode_unit(e
, &unit
);
1250 /* We skipped over the slices, don't accept any now */
1251 if (endswith(unit
, ".slice"))
1254 *ret
= TAKE_PTR(unit
);
1258 int cg_path_get_unit_path(const char *path
, char **ret
) {
1259 _cleanup_free_
char *path_copy
= NULL
;
1265 path_copy
= strdup(path
);
1269 unit_name
= (char *)skip_slices(path_copy
);
1270 unit_name
[strcspn(unit_name
, "/")] = 0;
1272 if (!unit_name_is_valid(cg_unescape(unit_name
), UNIT_NAME_PLAIN
|UNIT_NAME_INSTANCE
))
1275 *ret
= TAKE_PTR(path_copy
);
1280 int cg_pid_get_unit(pid_t pid
, char **ret_unit
) {
1281 _cleanup_free_
char *cgroup
= NULL
;
1286 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1290 return cg_path_get_unit(cgroup
, ret_unit
);
1293 int cg_pidref_get_unit(const PidRef
*pidref
, char **ret
) {
1294 _cleanup_free_
char *unit
= NULL
;
1299 if (!pidref_is_set(pidref
))
1302 r
= cg_pid_get_unit(pidref
->pid
, &unit
);
1306 r
= pidref_verify(pidref
);
1310 *ret
= TAKE_PTR(unit
);
1315 * Skip session-*.scope, but require it to be there.
1317 static const char *skip_session(const char *p
) {
1323 p
+= strspn(p
, "/");
1325 n
= strcspn(p
, "/");
1326 if (n
< STRLEN("session-x.scope"))
1329 if (memcmp(p
, "session-", 8) == 0 && memcmp(p
+ n
- 6, ".scope", 6) == 0) {
1330 char buf
[n
- 8 - 6 + 1];
1332 memcpy(buf
, p
+ 8, n
- 8 - 6);
1335 /* Note that session scopes never need unescaping,
1336 * since they cannot conflict with the kernel's own
1337 * names, hence we don't need to call cg_unescape()
1340 if (!session_id_valid(buf
))
1344 p
+= strspn(p
, "/");
1352 * Skip user@*.service, but require it to be there.
1354 static const char *skip_user_manager(const char *p
) {
1360 p
+= strspn(p
, "/");
1362 n
= strcspn(p
, "/");
1363 if (n
< STRLEN("user@x.service"))
1366 if (memcmp(p
, "user@", 5) == 0 && memcmp(p
+ n
- 8, ".service", 8) == 0) {
1367 char buf
[n
- 5 - 8 + 1];
1369 memcpy(buf
, p
+ 5, n
- 5 - 8);
1372 /* Note that user manager services never need unescaping,
1373 * since they cannot conflict with the kernel's own
1374 * names, hence we don't need to call cg_unescape()
1377 if (parse_uid(buf
, NULL
) < 0)
1381 p
+= strspn(p
, "/");
1389 static const char *skip_user_prefix(const char *path
) {
1394 /* Skip slices, if there are any */
1395 e
= skip_slices(path
);
1397 /* Skip the user manager, if it's in the path now... */
1398 t
= skip_user_manager(e
);
1402 /* Alternatively skip the user session if it is in the path... */
1403 return skip_session(e
);
1406 int cg_path_get_user_unit(const char *path
, char **ret
) {
1412 t
= skip_user_prefix(path
);
1416 /* And from here on it looks pretty much the same as for a system unit, hence let's use the same
1418 return cg_path_get_unit(t
, ret
);
1421 int cg_pid_get_user_unit(pid_t pid
, char **ret_unit
) {
1422 _cleanup_free_
char *cgroup
= NULL
;
1427 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1431 return cg_path_get_user_unit(cgroup
, ret_unit
);
1434 int cg_path_get_machine_name(const char *path
, char **ret_machine
) {
1435 _cleanup_free_
char *u
= NULL
;
1439 r
= cg_path_get_unit(path
, &u
);
1443 sl
= strjoina("/run/systemd/machines/unit:", u
);
1444 return readlink_malloc(sl
, ret_machine
);
1447 int cg_pid_get_machine_name(pid_t pid
, char **ret_machine
) {
1448 _cleanup_free_
char *cgroup
= NULL
;
1451 assert(ret_machine
);
1453 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1457 return cg_path_get_machine_name(cgroup
, ret_machine
);
1460 int cg_path_get_cgroupid(const char *path
, uint64_t *ret
) {
1461 cg_file_handle fh
= CG_FILE_HANDLE_INIT
;
1467 /* This is cgroupfs so we know the size of the handle, thus no need to loop around like
1468 * name_to_handle_at_loop() does in mountpoint-util.c */
1469 if (name_to_handle_at(AT_FDCWD
, path
, &fh
.file_handle
, &mnt_id
, 0) < 0)
1472 *ret
= CG_FILE_HANDLE_CGROUPID(fh
);
1476 int cg_fd_get_cgroupid(int fd
, uint64_t *ret
) {
1477 cg_file_handle fh
= CG_FILE_HANDLE_INIT
;
1483 if (name_to_handle_at(fd
, "", &fh
.file_handle
, &mnt_id
, AT_EMPTY_PATH
) < 0)
1486 *ret
= CG_FILE_HANDLE_CGROUPID(fh
);
1490 int cg_path_get_session(const char *path
, char **ret_session
) {
1491 _cleanup_free_
char *unit
= NULL
;
1497 r
= cg_path_get_unit(path
, &unit
);
1501 start
= startswith(unit
, "session-");
1504 end
= endswith(start
, ".scope");
1509 if (!session_id_valid(start
))
1515 return strdup_to(ret_session
, start
);
1518 int cg_pid_get_session(pid_t pid
, char **ret_session
) {
1519 _cleanup_free_
char *cgroup
= NULL
;
1522 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1526 return cg_path_get_session(cgroup
, ret_session
);
1529 int cg_path_get_owner_uid(const char *path
, uid_t
*ret_uid
) {
1530 _cleanup_free_
char *slice
= NULL
;
1536 r
= cg_path_get_slice(path
, &slice
);
1540 start
= startswith(slice
, "user-");
1544 end
= endswith(start
, ".slice");
1549 if (parse_uid(start
, ret_uid
) < 0)
1555 int cg_pid_get_owner_uid(pid_t pid
, uid_t
*ret_uid
) {
1556 _cleanup_free_
char *cgroup
= NULL
;
1559 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1563 return cg_path_get_owner_uid(cgroup
, ret_uid
);
1566 int cg_path_get_slice(const char *p
, char **ret_slice
) {
1567 const char *e
= NULL
;
1572 /* Finds the right-most slice unit from the beginning, but stops before we come to
1573 * the first non-slice unit. */
1579 n
= path_find_first_component(&p
, /* accept_dot_dot = */ false, &s
);
1582 if (!valid_slice_name(s
, n
))
1589 return cg_path_decode_unit(e
, ret_slice
);
1591 return strdup_to(ret_slice
, SPECIAL_ROOT_SLICE
);
1594 int cg_pid_get_slice(pid_t pid
, char **ret_slice
) {
1595 _cleanup_free_
char *cgroup
= NULL
;
1600 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1604 return cg_path_get_slice(cgroup
, ret_slice
);
1607 int cg_path_get_user_slice(const char *p
, char **ret_slice
) {
1612 t
= skip_user_prefix(p
);
1616 /* And now it looks pretty much the same as for a system slice, so let's just use the same parser
1618 return cg_path_get_slice(t
, ret_slice
);
1621 int cg_pid_get_user_slice(pid_t pid
, char **ret_slice
) {
1622 _cleanup_free_
char *cgroup
= NULL
;
1627 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1631 return cg_path_get_user_slice(cgroup
, ret_slice
);
1634 bool cg_needs_escape(const char *p
) {
1636 /* Checks if the specified path is a valid cgroup name by our rules, or if it must be escaped. Note
1637 * that we consider escaped cgroup names invalid here, as they need to be escaped a second time if
1638 * they shall be used. Also note that various names cannot be made valid by escaping even if we
1639 * return true here (because too long, or contain the forbidden character "/"). */
1641 if (!filename_is_valid(p
))
1644 if (IN_SET(p
[0], '_', '.'))
1647 if (STR_IN_SET(p
, "notify_on_release", "release_agent", "tasks"))
1650 if (startswith(p
, "cgroup."))
1653 for (CGroupController c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
1656 q
= startswith(p
, cgroup_controller_to_string(c
));
1667 int cg_escape(const char *p
, char **ret
) {
1668 _cleanup_free_
char *n
= NULL
;
1670 /* This implements very minimal escaping for names to be used as file names in the cgroup tree: any
1671 * name which might conflict with a kernel name or is prefixed with '_' is prefixed with a '_'. That
1672 * way, when reading cgroup names it is sufficient to remove a single prefixing underscore if there
1675 /* The return value of this function (unlike cg_unescape()) needs free()! */
1677 if (cg_needs_escape(p
)) {
1678 n
= strjoin("_", p
);
1682 if (!filename_is_valid(n
)) /* became invalid due to the prefixing? Or contained things like a slash that cannot be fixed by prefixing? */
1694 char *cg_unescape(const char *p
) {
1697 /* The return value of this function (unlike cg_escape())
1698 * doesn't need free()! */
1706 #define CONTROLLER_VALID \
1710 bool cg_controller_is_valid(const char *p
) {
1716 if (streq(p
, SYSTEMD_CGROUP_CONTROLLER
))
1719 s
= startswith(p
, "name=");
1723 if (IN_SET(*p
, 0, '_'))
1726 for (t
= p
; *t
; t
++)
1727 if (!strchr(CONTROLLER_VALID
, *t
))
1730 if (t
- p
> NAME_MAX
)
1736 int cg_slice_to_path(const char *unit
, char **ret
) {
1737 _cleanup_free_
char *p
= NULL
, *s
= NULL
, *e
= NULL
;
1744 if (streq(unit
, SPECIAL_ROOT_SLICE
))
1745 return strdup_to(ret
, "");
1747 if (!unit_name_is_valid(unit
, UNIT_NAME_PLAIN
))
1750 if (!endswith(unit
, ".slice"))
1753 r
= unit_name_to_prefix(unit
, &p
);
1757 dash
= strchr(p
, '-');
1759 /* Don't allow initial dashes */
1764 _cleanup_free_
char *escaped
= NULL
;
1765 char n
[dash
- p
+ sizeof(".slice")];
1767 #if HAS_FEATURE_MEMORY_SANITIZER
1768 /* msan doesn't instrument stpncpy, so it thinks
1769 * n is later used uninitialized:
1770 * https://github.com/google/sanitizers/issues/926
1775 /* Don't allow trailing or double dashes */
1776 if (IN_SET(dash
[1], 0, '-'))
1779 strcpy(stpncpy(n
, p
, dash
- p
), ".slice");
1780 if (!unit_name_is_valid(n
, UNIT_NAME_PLAIN
))
1783 r
= cg_escape(n
, &escaped
);
1787 if (!strextend(&s
, escaped
, "/"))
1790 dash
= strchr(dash
+1, '-');
1793 r
= cg_escape(unit
, &e
);
1797 if (!strextend(&s
, e
))
1804 int cg_is_threaded(const char *path
) {
1805 _cleanup_free_
char *fs
= NULL
, *contents
= NULL
;
1806 _cleanup_strv_free_
char **v
= NULL
;
1809 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, "cgroup.type", &fs
);
1813 r
= read_full_virtual_file(fs
, &contents
, NULL
);
1815 return false; /* Assume no. */
1819 v
= strv_split(contents
, NULL
);
1823 /* If the cgroup is in the threaded mode, it contains "threaded".
1824 * If one of the parents or siblings is in the threaded mode, it may contain "invalid". */
1825 return strv_contains(v
, "threaded") || strv_contains(v
, "invalid");
1828 int cg_set_attribute(const char *controller
, const char *path
, const char *attribute
, const char *value
) {
1829 _cleanup_free_
char *p
= NULL
;
1832 r
= cg_get_path(controller
, path
, attribute
, &p
);
1836 return write_string_file(p
, value
, WRITE_STRING_FILE_DISABLE_BUFFER
);
1839 int cg_get_attribute(const char *controller
, const char *path
, const char *attribute
, char **ret
) {
1840 _cleanup_free_
char *p
= NULL
;
1843 r
= cg_get_path(controller
, path
, attribute
, &p
);
1847 return read_one_line_file(p
, ret
);
1850 int cg_get_attribute_as_uint64(const char *controller
, const char *path
, const char *attribute
, uint64_t *ret
) {
1851 _cleanup_free_
char *value
= NULL
;
1857 r
= cg_get_attribute(controller
, path
, attribute
, &value
);
1863 if (streq(value
, "max")) {
1864 *ret
= CGROUP_LIMIT_MAX
;
1868 r
= safe_atou64(value
, &v
);
1876 int cg_get_attribute_as_bool(const char *controller
, const char *path
, const char *attribute
, bool *ret
) {
1877 _cleanup_free_
char *value
= NULL
;
1882 r
= cg_get_attribute(controller
, path
, attribute
, &value
);
1888 r
= parse_boolean(value
);
1896 int cg_get_owner(const char *path
, uid_t
*ret_uid
) {
1897 _cleanup_free_
char *f
= NULL
;
1903 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &f
);
1907 if (stat(f
, &stats
) < 0)
1910 r
= stat_verify_directory(&stats
);
1914 *ret_uid
= stats
.st_uid
;
1918 int cg_get_keyed_attribute_full(
1919 const char *controller
,
1921 const char *attribute
,
1924 CGroupKeyMode mode
) {
1926 _cleanup_free_
char *filename
= NULL
, *contents
= NULL
;
1928 size_t n
, i
, n_done
= 0;
1932 /* Reads one or more fields of a cgroup v2 keyed attribute file. The 'keys' parameter should be an strv with
1933 * all keys to retrieve. The 'ret_values' parameter should be passed as string size with the same number of
1934 * entries as 'keys'. On success each entry will be set to the value of the matching key.
1936 * If the attribute file doesn't exist at all returns ENOENT, if any key is not found returns ENXIO. If mode
1937 * is set to GG_KEY_MODE_GRACEFUL we ignore missing keys and return those that were parsed successfully. */
1939 r
= cg_get_path(controller
, path
, attribute
, &filename
);
1943 r
= read_full_file(filename
, &contents
, NULL
);
1947 n
= strv_length(keys
);
1948 if (n
== 0) /* No keys to retrieve? That's easy, we are done then */
1951 /* Let's build this up in a temporary array for now in order not to clobber the return parameter on failure */
1952 v
= newa0(char*, n
);
1954 for (p
= contents
; *p
;) {
1955 const char *w
= NULL
;
1957 for (i
= 0; i
< n
; i
++)
1959 w
= first_word(p
, keys
[i
]);
1967 l
= strcspn(w
, NEWLINE
);
1968 v
[i
] = strndup(w
, l
);
1980 p
+= strcspn(p
, NEWLINE
);
1982 p
+= strspn(p
, NEWLINE
);
1985 if (mode
& CG_KEY_MODE_GRACEFUL
)
1991 free_many_charp(v
, n
);
1995 memcpy(ret_values
, v
, sizeof(char*) * n
);
1996 if (mode
& CG_KEY_MODE_GRACEFUL
)
2002 int cg_mask_to_string(CGroupMask mask
, char **ret
) {
2003 _cleanup_free_
char *s
= NULL
;
2015 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
2019 if (!FLAGS_SET(mask
, CGROUP_CONTROLLER_TO_MASK(c
)))
2022 k
= cgroup_controller_to_string(c
);
2025 if (!GREEDY_REALLOC(s
, n
+ space
+ l
+ 1))
2030 memcpy(s
+ n
+ space
, k
, l
);
2044 int cg_mask_from_string(const char *value
, CGroupMask
*ret
) {
2051 _cleanup_free_
char *n
= NULL
;
2055 r
= extract_first_word(&value
, &n
, NULL
, 0);
2061 v
= cgroup_controller_from_string(n
);
2065 m
|= CGROUP_CONTROLLER_TO_MASK(v
);
2072 int cg_mask_supported_subtree(const char *root
, CGroupMask
*ret
) {
2076 /* Determines the mask of supported cgroup controllers. Only includes controllers we can make sense of and that
2077 * are actually accessible. Only covers real controllers, i.e. not the CGROUP_CONTROLLER_BPF_xyz
2078 * pseudo-controllers. */
2080 r
= cg_all_unified();
2084 _cleanup_free_
char *controllers
= NULL
, *path
= NULL
;
2086 /* In the unified hierarchy we can read the supported and accessible controllers from
2087 * the top-level cgroup attribute */
2089 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, root
, "cgroup.controllers", &path
);
2093 r
= read_one_line_file(path
, &controllers
);
2097 r
= cg_mask_from_string(controllers
, &mask
);
2101 /* Mask controllers that are not supported in unified hierarchy. */
2102 mask
&= CGROUP_MASK_V2
;
2107 /* In the legacy hierarchy, we check which hierarchies are accessible. */
2110 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
2111 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
2114 if (!FLAGS_SET(CGROUP_MASK_V1
, bit
))
2117 n
= cgroup_controller_to_string(c
);
2118 if (controller_is_v1_accessible(root
, n
) >= 0)
2127 int cg_mask_supported(CGroupMask
*ret
) {
2128 _cleanup_free_
char *root
= NULL
;
2131 r
= cg_get_root_path(&root
);
2135 return cg_mask_supported_subtree(root
, ret
);
2138 int cg_kernel_controllers(Set
**ret
) {
2139 _cleanup_set_free_ Set
*controllers
= NULL
;
2140 _cleanup_fclose_
FILE *f
= NULL
;
2145 /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support
2146 * and controllers that aren't currently accessible (because not mounted). This does not include "name="
2147 * pseudo-controllers. */
2149 r
= fopen_unlocked("/proc/cgroups", "re", &f
);
2157 /* Ignore the header line */
2158 (void) read_line(f
, SIZE_MAX
, NULL
);
2161 _cleanup_free_
char *controller
= NULL
;
2164 if (fscanf(f
, "%ms %*i %*i %i", &controller
, &enabled
) != 2) {
2178 if (!cg_controller_is_valid(controller
))
2181 r
= set_ensure_consume(&controllers
, &string_hash_ops_free
, TAKE_PTR(controller
));
2186 *ret
= TAKE_PTR(controllers
);
2191 /* The hybrid mode was initially implemented in v232 and simply mounted cgroup2 on
2192 * /sys/fs/cgroup/systemd. This unfortunately broke other tools (such as docker) which expected the v1
2193 * "name=systemd" hierarchy on /sys/fs/cgroup/systemd. From v233 and on, the hybrid mode mounts v2 on
2194 * /sys/fs/cgroup/unified and maintains "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility
2197 * To keep live upgrade working, we detect and support v232 layout. When v232 layout is detected, to keep
2198 * cgroup v2 process management but disable the compat dual layout, we return true on
2199 * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and false on cg_hybrid_unified().
/* Set by cg_unified_cached() when the v232-style hybrid layout (cgroup2 mounted
 * directly on /sys/fs/cgroup/systemd) is detected; read by cg_hybrid_unified(). */
static thread_local bool unified_systemd_v232;
2203 int cg_unified_cached(bool flush
) {
2204 static thread_local CGroupUnified unified_cache
= CGROUP_UNIFIED_UNKNOWN
;
2208 /* Checks if we support the unified hierarchy. Returns an
2209 * error when the cgroup hierarchies aren't mounted yet or we
2210 * have any other trouble determining if the unified hierarchy
2214 unified_cache
= CGROUP_UNIFIED_UNKNOWN
;
2215 else if (unified_cache
>= CGROUP_UNIFIED_NONE
)
2216 return unified_cache
;
2218 if (statfs("/sys/fs/cgroup/", &fs
) < 0)
2219 return log_debug_errno(errno
, "statfs(\"/sys/fs/cgroup/\") failed: %m");
2221 if (F_TYPE_EQUAL(fs
.f_type
, CGROUP2_SUPER_MAGIC
)) {
2222 log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
2223 unified_cache
= CGROUP_UNIFIED_ALL
;
2224 } else if (F_TYPE_EQUAL(fs
.f_type
, TMPFS_MAGIC
)) {
2225 if (statfs("/sys/fs/cgroup/unified/", &fs
) == 0 &&
2226 F_TYPE_EQUAL(fs
.f_type
, CGROUP2_SUPER_MAGIC
)) {
2227 log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller");
2228 unified_cache
= CGROUP_UNIFIED_SYSTEMD
;
2229 unified_systemd_v232
= false;
2231 if (statfs("/sys/fs/cgroup/systemd/", &fs
) < 0) {
2232 if (errno
== ENOENT
) {
2233 /* Some other software may have set up /sys/fs/cgroup in a configuration we do not recognize. */
2234 log_debug_errno(errno
, "Unsupported cgroupsv1 setup detected: name=systemd hierarchy not found.");
2237 return log_debug_errno(errno
, "statfs(\"/sys/fs/cgroup/systemd\" failed: %m");
2240 if (F_TYPE_EQUAL(fs
.f_type
, CGROUP2_SUPER_MAGIC
)) {
2241 log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2242 unified_cache
= CGROUP_UNIFIED_SYSTEMD
;
2243 unified_systemd_v232
= true;
2244 } else if (F_TYPE_EQUAL(fs
.f_type
, CGROUP_SUPER_MAGIC
)) {
2245 log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2246 unified_cache
= CGROUP_UNIFIED_NONE
;
2248 log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
2249 (unsigned long long) fs
.f_type
);
2250 unified_cache
= CGROUP_UNIFIED_NONE
;
2253 } else if (F_TYPE_EQUAL(fs
.f_type
, SYSFS_MAGIC
)) {
2254 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM
),
2255 "No filesystem is currently mounted on /sys/fs/cgroup.");
2257 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM
),
2258 "Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
2259 (unsigned long long)fs
.f_type
);
2261 return unified_cache
;
2264 int cg_unified_controller(const char *controller
) {
2267 r
= cg_unified_cached(false);
2271 if (r
== CGROUP_UNIFIED_NONE
)
2274 if (r
>= CGROUP_UNIFIED_ALL
)
2277 return streq_ptr(controller
, SYSTEMD_CGROUP_CONTROLLER
);
2280 int cg_all_unified(void) {
2283 r
= cg_unified_cached(false);
2287 return r
>= CGROUP_UNIFIED_ALL
;
2290 int cg_hybrid_unified(void) {
2293 r
= cg_unified_cached(false);
2297 return r
== CGROUP_UNIFIED_SYSTEMD
&& !unified_systemd_v232
;
/* Checks whether the cgroup at 'path' is marked as delegated via xattr. */
int cg_is_delegated(const char *path) {
        int r;

        r = cg_get_xattr_bool(path, "trusted.delegate");
        if (!ERRNO_IS_NEG_XATTR_ABSENT(r))
                return r;

        /* If the trusted xattr isn't set (preferred), then check the untrusted one. Under the assumption
         * that whoever is trusted enough to own the cgroup, is also trusted enough to decide if it is
         * delegated or not this should be safe. */
        r = cg_get_xattr_bool(path, "user.delegate");
        return ERRNO_IS_NEG_XATTR_ABSENT(r) ? false : r;
}
2316 int cg_is_delegated_fd(int fd
) {
2321 r
= getxattr_at_bool(fd
, /* path= */ NULL
, "trusted.delegate", /* flags= */ 0);
2322 if (!ERRNO_IS_NEG_XATTR_ABSENT(r
))
2325 r
= getxattr_at_bool(fd
, /* path= */ NULL
, "user.delegate", /* flags= */ 0);
2326 return ERRNO_IS_NEG_XATTR_ABSENT(r
) ? false : r
;
/* Checks whether the "user.coredump_receive" xattr is set on the cgroup at
 * 'path'; an absent xattr counts as false. */
int cg_has_coredump_receive(const char *path) {
        int r;

        r = cg_get_xattr_bool(path, "user.coredump_receive");
        if (ERRNO_IS_NEG_XATTR_ABSENT(r))
                return false;

        return r;
}
2341 const uint64_t cgroup_io_limit_defaults
[_CGROUP_IO_LIMIT_TYPE_MAX
] = {
2342 [CGROUP_IO_RBPS_MAX
] = CGROUP_LIMIT_MAX
,
2343 [CGROUP_IO_WBPS_MAX
] = CGROUP_LIMIT_MAX
,
2344 [CGROUP_IO_RIOPS_MAX
] = CGROUP_LIMIT_MAX
,
2345 [CGROUP_IO_WIOPS_MAX
] = CGROUP_LIMIT_MAX
,
2348 static const char* const cgroup_io_limit_type_table
[_CGROUP_IO_LIMIT_TYPE_MAX
] = {
2349 [CGROUP_IO_RBPS_MAX
] = "IOReadBandwidthMax",
2350 [CGROUP_IO_WBPS_MAX
] = "IOWriteBandwidthMax",
2351 [CGROUP_IO_RIOPS_MAX
] = "IOReadIOPSMax",
2352 [CGROUP_IO_WIOPS_MAX
] = "IOWriteIOPSMax",
2355 DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type
, CGroupIOLimitType
);
2357 bool is_cgroup_fs(const struct statfs
*s
) {
2358 return is_fs_type(s
, CGROUP_SUPER_MAGIC
) ||
2359 is_fs_type(s
, CGROUP2_SUPER_MAGIC
);
/* True if the filesystem backing 'fd' is a cgroup filesystem (v1 or v2);
 * returns false as well if fstatfs() fails. */
bool fd_is_cgroup_fs(int fd) {
        struct statfs sfs;

        if (fstatfs(fd, &sfs) < 0)
                return false;

        return is_cgroup_fs(&sfs);
}
2371 static const char *const cgroup_controller_table
[_CGROUP_CONTROLLER_MAX
] = {
2372 [CGROUP_CONTROLLER_CPU
] = "cpu",
2373 [CGROUP_CONTROLLER_CPUACCT
] = "cpuacct",
2374 [CGROUP_CONTROLLER_CPUSET
] = "cpuset",
2375 [CGROUP_CONTROLLER_IO
] = "io",
2376 [CGROUP_CONTROLLER_BLKIO
] = "blkio",
2377 [CGROUP_CONTROLLER_MEMORY
] = "memory",
2378 [CGROUP_CONTROLLER_DEVICES
] = "devices",
2379 [CGROUP_CONTROLLER_PIDS
] = "pids",
2380 [CGROUP_CONTROLLER_BPF_FIREWALL
] = "bpf-firewall",
2381 [CGROUP_CONTROLLER_BPF_DEVICES
] = "bpf-devices",
2382 [CGROUP_CONTROLLER_BPF_FOREIGN
] = "bpf-foreign",
2383 [CGROUP_CONTROLLER_BPF_SOCKET_BIND
] = "bpf-socket-bind",
2384 [CGROUP_CONTROLLER_BPF_RESTRICT_NETWORK_INTERFACES
] = "bpf-restrict-network-interfaces",
2387 DEFINE_STRING_TABLE_LOOKUP(cgroup_controller
, CGroupController
);
2389 CGroupMask
get_cpu_accounting_mask(void) {
2390 static CGroupMask needed_mask
= (CGroupMask
) -1;
2392 /* On kernel ≥4.15 with unified hierarchy, cpu.stat's usage_usec is
2393 * provided externally from the CPU controller, which means we don't
2394 * need to enable the CPU controller just to get metrics. This is good,
2395 * because enabling the CPU controller comes at a minor performance
2396 * hit, especially when it's propagated deep into large hierarchies.
2397 * There's also no separate CPU accounting controller available within
2398 * a unified hierarchy.
2400 * This combination of factors results in the desired cgroup mask to
2401 * enable for CPU accounting varying as follows:
2403 * ╔═════════════════════╤═════════════════════╗
2404 * ║ Linux ≥4.15 │ Linux <4.15 ║
2405 * ╔═══════════════╬═════════════════════╪═════════════════════╣
2406 * ║ Unified ║ nothing │ CGROUP_MASK_CPU ║
2407 * ╟───────────────╫─────────────────────┼─────────────────────╢
2408 * ║ Hybrid/Legacy ║ CGROUP_MASK_CPUACCT │ CGROUP_MASK_CPUACCT ║
2409 * ╚═══════════════╩═════════════════════╧═════════════════════╝
2411 * We check kernel version here instead of manually checking whether
2412 * cpu.stat is present for every cgroup, as that check in itself would
2413 * already be fairly expensive.
2415 * Kernels where this patch has been backported will therefore have the
2416 * CPU controller enabled unnecessarily. This is more expensive than
2417 * necessary, but harmless. ☺️
2420 if (needed_mask
== (CGroupMask
) -1) {
2421 if (cg_all_unified()) {
2423 assert_se(uname(&u
) >= 0);
2425 if (strverscmp_improved(u
.release
, "4.15") < 0)
2426 needed_mask
= CGROUP_MASK_CPU
;
2430 needed_mask
= CGROUP_MASK_CPUACCT
;
/* CPU accounting is "cheap" when no controller needs to be enabled for it. */
bool cpu_accounting_is_cheap(void) {
        return get_cpu_accounting_mask() == 0;
}
2440 static const char* const managed_oom_mode_table
[_MANAGED_OOM_MODE_MAX
] = {
2441 [MANAGED_OOM_AUTO
] = "auto",
2442 [MANAGED_OOM_KILL
] = "kill",
2445 DEFINE_STRING_TABLE_LOOKUP(managed_oom_mode
, ManagedOOMMode
);
2447 static const char* const managed_oom_preference_table
[_MANAGED_OOM_PREFERENCE_MAX
] = {
2448 [MANAGED_OOM_PREFERENCE_NONE
] = "none",
2449 [MANAGED_OOM_PREFERENCE_AVOID
] = "avoid",
2450 [MANAGED_OOM_PREFERENCE_OMIT
] = "omit",
2453 DEFINE_STRING_TABLE_LOOKUP(managed_oom_preference
, ManagedOOMPreference
);