2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
29 #include <sys/statfs.h>
30 #include <sys/types.h>
31 #include <sys/xattr.h>
34 #include "alloc-util.h"
35 #include "cgroup-util.h"
37 #include "dirent-util.h"
38 #include "extract-word.h"
41 #include "format-util.h"
44 #include "login-util.h"
48 #include "parse-util.h"
49 #include "path-util.h"
50 #include "proc-cmdline.h"
51 #include "process-util.h"
54 #include "stat-util.h"
55 #include "stdio-util.h"
56 #include "string-table.h"
57 #include "string-util.h"
58 #include "unit-name.h"
59 #include "user-util.h"
61 int cg_enumerate_processes(const char *controller
, const char *path
, FILE **_f
) {
62 _cleanup_free_
char *fs
= NULL
;
68 r
= cg_get_path(controller
, path
, "cgroup.procs", &fs
);
80 int cg_read_pid(FILE *f
, pid_t
*_pid
) {
83 /* Note that the cgroup.procs might contain duplicates! See
84 * cgroups.txt for details. */
90 if (fscanf(f
, "%lu", &ul
) != 1) {
95 return errno
> 0 ? -errno
: -EIO
;
105 int cg_read_event(const char *controller
, const char *path
, const char *event
,
108 _cleanup_free_
char *events
= NULL
, *content
= NULL
;
112 r
= cg_get_path(controller
, path
, "cgroup.events", &events
);
116 r
= read_full_file(events
, &content
, NULL
);
121 while ((line
= strsep(&p
, "\n"))) {
124 key
= strsep(&line
, " ");
128 if (strcmp(key
, event
))
138 bool cg_ns_supported(void) {
139 static thread_local
int enabled
= -1;
144 if (access("/proc/self/ns/cgroup", F_OK
) == 0)
152 int cg_enumerate_subgroups(const char *controller
, const char *path
, DIR **_d
) {
153 _cleanup_free_
char *fs
= NULL
;
159 /* This is not recursive! */
161 r
= cg_get_path(controller
, path
, NULL
, &fs
);
173 int cg_read_subgroup(DIR *d
, char **fn
) {
179 FOREACH_DIRENT_ALL(de
, d
, return -errno
) {
182 if (de
->d_type
!= DT_DIR
)
185 if (dot_or_dot_dot(de
->d_name
))
188 b
= strdup(de
->d_name
);
199 int cg_rmdir(const char *controller
, const char *path
) {
200 _cleanup_free_
char *p
= NULL
;
203 r
= cg_get_path(controller
, path
, NULL
, &p
);
208 if (r
< 0 && errno
!= ENOENT
)
211 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
) && cg_hybrid_unified()) {
212 r
= cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
);
214 log_warning_errno(r
, "Failed to remove compat systemd cgroup %s: %m", path
);
221 const char *controller
,
226 cg_kill_log_func_t log_kill
,
229 _cleanup_set_free_ Set
*allocated_set
= NULL
;
236 /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
237 * SIGCONT on SIGKILL. */
238 if (IN_SET(sig
, SIGCONT
, SIGKILL
))
239 flags
&= ~CGROUP_SIGCONT
;
241 /* This goes through the tasks list and kills them all. This
242 * is repeated until no further processes are added to the
243 * tasks list, to properly handle forking processes */
246 s
= allocated_set
= set_new(NULL
);
254 _cleanup_fclose_
FILE *f
= NULL
;
258 r
= cg_enumerate_processes(controller
, path
, &f
);
260 if (ret
>= 0 && r
!= -ENOENT
)
266 while ((r
= cg_read_pid(f
, &pid
)) > 0) {
268 if ((flags
& CGROUP_IGNORE_SELF
) && pid
== my_pid
)
271 if (set_get(s
, PID_TO_PTR(pid
)) == PID_TO_PTR(pid
))
275 log_kill(pid
, sig
, userdata
);
277 /* If we haven't killed this process yet, kill
279 if (kill(pid
, sig
) < 0) {
280 if (ret
>= 0 && errno
!= ESRCH
)
283 if (flags
& CGROUP_SIGCONT
)
284 (void) kill(pid
, SIGCONT
);
292 r
= set_put(s
, PID_TO_PTR(pid
));
308 /* To avoid racing against processes which fork
309 * quicker than we can kill them we repeat this until
310 * no new pids need to be killed. */
317 int cg_kill_recursive(
318 const char *controller
,
323 cg_kill_log_func_t log_kill
,
326 _cleanup_set_free_ Set
*allocated_set
= NULL
;
327 _cleanup_closedir_
DIR *d
= NULL
;
335 s
= allocated_set
= set_new(NULL
);
340 ret
= cg_kill(controller
, path
, sig
, flags
, s
, log_kill
, userdata
);
342 r
= cg_enumerate_subgroups(controller
, path
, &d
);
344 if (ret
>= 0 && r
!= -ENOENT
)
350 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
351 _cleanup_free_
char *p
= NULL
;
353 p
= strjoin(path
, "/", fn
);
358 r
= cg_kill_recursive(controller
, p
, sig
, flags
, s
, log_kill
, userdata
);
359 if (r
!= 0 && ret
>= 0)
362 if (ret
>= 0 && r
< 0)
365 if (flags
& CGROUP_REMOVE
) {
366 r
= cg_rmdir(controller
, path
);
367 if (r
< 0 && ret
>= 0 && r
!= -ENOENT
&& r
!= -EBUSY
)
382 _cleanup_set_free_ Set
*s
= NULL
;
398 _cleanup_fclose_
FILE *f
= NULL
;
402 r
= cg_enumerate_processes(cfrom
, pfrom
, &f
);
404 if (ret
>= 0 && r
!= -ENOENT
)
410 while ((r
= cg_read_pid(f
, &pid
)) > 0) {
412 /* This might do weird stuff if we aren't a
413 * single-threaded program. However, we
414 * luckily know we are not */
415 if ((flags
& CGROUP_IGNORE_SELF
) && pid
== my_pid
)
418 if (set_get(s
, PID_TO_PTR(pid
)) == PID_TO_PTR(pid
))
421 /* Ignore kernel threads. Since they can only
422 * exist in the root cgroup, we only check for
425 (isempty(pfrom
) || path_equal(pfrom
, "/")) &&
426 is_kernel_thread(pid
) > 0)
429 r
= cg_attach(cto
, pto
, pid
);
431 if (ret
>= 0 && r
!= -ESRCH
)
438 r
= set_put(s
, PID_TO_PTR(pid
));
458 int cg_migrate_recursive(
465 _cleanup_closedir_
DIR *d
= NULL
;
474 ret
= cg_migrate(cfrom
, pfrom
, cto
, pto
, flags
);
476 r
= cg_enumerate_subgroups(cfrom
, pfrom
, &d
);
478 if (ret
>= 0 && r
!= -ENOENT
)
484 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
485 _cleanup_free_
char *p
= NULL
;
487 p
= strjoin(pfrom
, "/", fn
);
492 r
= cg_migrate_recursive(cfrom
, p
, cto
, pto
, flags
);
493 if (r
!= 0 && ret
>= 0)
497 if (r
< 0 && ret
>= 0)
500 if (flags
& CGROUP_REMOVE
) {
501 r
= cg_rmdir(cfrom
, pfrom
);
502 if (r
< 0 && ret
>= 0 && r
!= -ENOENT
&& r
!= -EBUSY
)
509 int cg_migrate_recursive_fallback(
523 r
= cg_migrate_recursive(cfrom
, pfrom
, cto
, pto
, flags
);
525 char prefix
[strlen(pto
) + 1];
527 /* This didn't work? Then let's try all prefixes of the destination */
529 PATH_FOREACH_PREFIX(prefix
, pto
) {
532 q
= cg_migrate_recursive(cfrom
, pfrom
, cto
, prefix
, flags
);
541 static const char *controller_to_dirname(const char *controller
) {
546 /* Converts a controller name to the directory name below
547 * /sys/fs/cgroup/ we want to mount it to. Effectively, this
548 * just cuts off the name= prefix used for named
549 * hierarchies, if it is specified. */
551 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
552 if (cg_hybrid_unified())
553 controller
= SYSTEMD_CGROUP_CONTROLLER_HYBRID
;
555 controller
= SYSTEMD_CGROUP_CONTROLLER_LEGACY
;
558 e
= startswith(controller
, "name=");
565 static int join_path_legacy(const char *controller
, const char *path
, const char *suffix
, char **fs
) {
572 dn
= controller_to_dirname(controller
);
574 if (isempty(path
) && isempty(suffix
))
575 t
= strappend("/sys/fs/cgroup/", dn
);
576 else if (isempty(path
))
577 t
= strjoin("/sys/fs/cgroup/", dn
, "/", suffix
);
578 else if (isempty(suffix
))
579 t
= strjoin("/sys/fs/cgroup/", dn
, "/", path
);
581 t
= strjoin("/sys/fs/cgroup/", dn
, "/", path
, "/", suffix
);
589 static int join_path_unified(const char *path
, const char *suffix
, char **fs
) {
594 if (isempty(path
) && isempty(suffix
))
595 t
= strdup("/sys/fs/cgroup");
596 else if (isempty(path
))
597 t
= strappend("/sys/fs/cgroup/", suffix
);
598 else if (isempty(suffix
))
599 t
= strappend("/sys/fs/cgroup/", path
);
601 t
= strjoin("/sys/fs/cgroup/", path
, "/", suffix
);
609 int cg_get_path(const char *controller
, const char *path
, const char *suffix
, char **fs
) {
617 /* If no controller is specified, we return the path
618 * *below* the controllers, without any prefix. */
620 if (!path
&& !suffix
)
628 t
= strjoin(path
, "/", suffix
);
632 *fs
= path_kill_slashes(t
);
636 if (!cg_controller_is_valid(controller
))
639 if (cg_all_unified())
640 r
= join_path_unified(path
, suffix
, fs
);
642 r
= join_path_legacy(controller
, path
, suffix
, fs
);
646 path_kill_slashes(*fs
);
650 static int controller_is_accessible(const char *controller
) {
654 /* Checks whether a specific controller is accessible,
655 * i.e. its hierarchy mounted. In the unified hierarchy all
656 * controllers are considered accessible, except for the named
659 if (!cg_controller_is_valid(controller
))
662 if (cg_all_unified()) {
663 /* We don't support named hierarchies if we are using
664 * the unified hierarchy. */
666 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
))
669 if (startswith(controller
, "name="))
675 dn
= controller_to_dirname(controller
);
676 cc
= strjoina("/sys/fs/cgroup/", dn
);
678 if (laccess(cc
, F_OK
) < 0)
685 int cg_get_path_and_check(const char *controller
, const char *path
, const char *suffix
, char **fs
) {
691 /* Check if the specified controller is actually accessible */
692 r
= controller_is_accessible(controller
);
696 return cg_get_path(controller
, path
, suffix
, fs
);
699 static int trim_cb(const char *path
, const struct stat
*sb
, int typeflag
, struct FTW
*ftwbuf
) {
704 if (typeflag
!= FTW_DP
)
707 if (ftwbuf
->level
< 1)
714 int cg_trim(const char *controller
, const char *path
, bool delete_root
) {
715 _cleanup_free_
char *fs
= NULL
;
720 r
= cg_get_path(controller
, path
, NULL
, &fs
);
725 if (nftw(fs
, trim_cb
, 64, FTW_DEPTH
|FTW_MOUNT
|FTW_PHYS
) != 0) {
735 if (rmdir(fs
) < 0 && errno
!= ENOENT
)
739 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
) && cg_hybrid_unified()) {
740 q
= cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, delete_root
);
742 log_warning_errno(q
, "Failed to trim compat systemd cgroup %s: %m", path
);
748 int cg_create(const char *controller
, const char *path
) {
749 _cleanup_free_
char *fs
= NULL
;
752 r
= cg_get_path_and_check(controller
, path
, NULL
, &fs
);
756 r
= mkdir_parents(fs
, 0755);
760 if (mkdir(fs
, 0755) < 0) {
768 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
) && cg_hybrid_unified()) {
769 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
);
771 log_warning_errno(r
, "Failed to create compat systemd cgroup %s: %m", path
);
777 int cg_create_and_attach(const char *controller
, const char *path
, pid_t pid
) {
782 r
= cg_create(controller
, path
);
786 q
= cg_attach(controller
, path
, pid
);
790 /* This does not remove the cgroup on failure */
794 int cg_attach(const char *controller
, const char *path
, pid_t pid
) {
795 _cleanup_free_
char *fs
= NULL
;
796 char c
[DECIMAL_STR_MAX(pid_t
) + 2];
802 r
= cg_get_path_and_check(controller
, path
, "cgroup.procs", &fs
);
809 xsprintf(c
, PID_FMT
"\n", pid
);
811 r
= write_string_file(fs
, c
, 0);
815 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
) && cg_hybrid_unified()) {
816 r
= cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, pid
);
818 log_warning_errno(r
, "Failed to attach %d to compat systemd cgroup %s: %m", pid
, path
);
824 int cg_attach_fallback(const char *controller
, const char *path
, pid_t pid
) {
831 r
= cg_attach(controller
, path
, pid
);
833 char prefix
[strlen(path
) + 1];
835 /* This didn't work? Then let's try all prefixes of
838 PATH_FOREACH_PREFIX(prefix
, path
) {
841 q
= cg_attach(controller
, prefix
, pid
);
850 int cg_set_group_access(
851 const char *controller
,
857 _cleanup_free_
char *fs
= NULL
;
860 if (mode
== MODE_INVALID
&& uid
== UID_INVALID
&& gid
== GID_INVALID
)
863 if (mode
!= MODE_INVALID
)
866 r
= cg_get_path(controller
, path
, NULL
, &fs
);
870 r
= chmod_and_chown(fs
, mode
, uid
, gid
);
874 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
) && cg_hybrid_unified()) {
875 r
= cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, mode
, uid
, gid
);
877 log_warning_errno(r
, "Failed to set group access on compat systemd cgroup %s: %m", path
);
883 int cg_set_task_access(
884 const char *controller
,
890 _cleanup_free_
char *fs
= NULL
, *procs
= NULL
;
895 if (mode
== MODE_INVALID
&& uid
== UID_INVALID
&& gid
== GID_INVALID
)
898 if (mode
!= MODE_INVALID
)
901 r
= cg_get_path(controller
, path
, "cgroup.procs", &fs
);
905 r
= chmod_and_chown(fs
, mode
, uid
, gid
);
909 if (!cg_unified(controller
)) {
910 /* Compatibility, Always keep values for "tasks" in sync with
912 if (cg_get_path(controller
, path
, "tasks", &procs
) >= 0)
913 (void) chmod_and_chown(procs
, mode
, uid
, gid
);
916 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
) && cg_hybrid_unified()) {
917 r
= cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, mode
, uid
, gid
);
919 log_warning_errno(r
, "Failed to set task access on compat systemd cgroup %s: %m", path
);
925 int cg_set_xattr(const char *controller
, const char *path
, const char *name
, const void *value
, size_t size
, int flags
) {
926 _cleanup_free_
char *fs
= NULL
;
931 assert(value
|| size
<= 0);
933 r
= cg_get_path(controller
, path
, NULL
, &fs
);
937 if (setxattr(fs
, name
, value
, size
, flags
) < 0)
943 int cg_get_xattr(const char *controller
, const char *path
, const char *name
, void *value
, size_t size
) {
944 _cleanup_free_
char *fs
= NULL
;
951 r
= cg_get_path(controller
, path
, NULL
, &fs
);
955 n
= getxattr(fs
, name
, value
, size
);
962 int cg_pid_get_path(const char *controller
, pid_t pid
, char **path
) {
963 _cleanup_fclose_
FILE *f
= NULL
;
965 const char *fs
, *controller_str
;
973 if (!cg_controller_is_valid(controller
))
976 controller
= SYSTEMD_CGROUP_CONTROLLER
;
978 unified
= cg_unified(controller
);
980 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
))
981 controller_str
= SYSTEMD_CGROUP_CONTROLLER_LEGACY
;
983 controller_str
= controller
;
985 cs
= strlen(controller_str
);
988 fs
= procfs_file_alloca(pid
, "cgroup");
991 return errno
== ENOENT
? -ESRCH
: -errno
;
993 FOREACH_LINE(line
, f
, return -errno
) {
999 e
= startswith(line
, "0:");
1009 const char *word
, *state
;
1012 l
= strchr(line
, ':');
1022 FOREACH_WORD_SEPARATOR(word
, k
, l
, ",", state
) {
1023 if (k
== cs
&& memcmp(word
, controller_str
, cs
) == 0) {
1044 int cg_install_release_agent(const char *controller
, const char *agent
) {
1045 _cleanup_free_
char *fs
= NULL
, *contents
= NULL
;
1051 if (cg_unified(controller
)) /* doesn't apply to unified hierarchy */
1054 r
= cg_get_path(controller
, NULL
, "release_agent", &fs
);
1058 r
= read_one_line_file(fs
, &contents
);
1062 sc
= strstrip(contents
);
1064 r
= write_string_file(fs
, agent
, 0);
1067 } else if (!path_equal(sc
, agent
))
1071 r
= cg_get_path(controller
, NULL
, "notify_on_release", &fs
);
1075 contents
= mfree(contents
);
1076 r
= read_one_line_file(fs
, &contents
);
1080 sc
= strstrip(contents
);
1081 if (streq(sc
, "0")) {
1082 r
= write_string_file(fs
, "1", 0);
1089 if (!streq(sc
, "1"))
1095 int cg_uninstall_release_agent(const char *controller
) {
1096 _cleanup_free_
char *fs
= NULL
;
1099 if (cg_unified(controller
)) /* Doesn't apply to unified hierarchy */
1102 r
= cg_get_path(controller
, NULL
, "notify_on_release", &fs
);
1106 r
= write_string_file(fs
, "0", 0);
1112 r
= cg_get_path(controller
, NULL
, "release_agent", &fs
);
1116 r
= write_string_file(fs
, "", 0);
1123 int cg_is_empty(const char *controller
, const char *path
) {
1124 _cleanup_fclose_
FILE *f
= NULL
;
1130 r
= cg_enumerate_processes(controller
, path
, &f
);
1136 r
= cg_read_pid(f
, &pid
);
1143 int cg_is_empty_recursive(const char *controller
, const char *path
) {
1148 /* The root cgroup is always populated */
1149 if (controller
&& (isempty(path
) || path_equal(path
, "/")))
1152 if (cg_unified(controller
)) {
1153 _cleanup_free_
char *t
= NULL
;
1155 /* On the unified hierarchy we can check empty state
1156 * via the "populated" attribute of "cgroup.events". */
1158 r
= cg_read_event(controller
, path
, "populated", &t
);
1162 return streq(t
, "0");
1164 _cleanup_closedir_
DIR *d
= NULL
;
1167 r
= cg_is_empty(controller
, path
);
1171 r
= cg_enumerate_subgroups(controller
, path
, &d
);
1177 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
1178 _cleanup_free_
char *p
= NULL
;
1180 p
= strjoin(path
, "/", fn
);
1185 r
= cg_is_empty_recursive(controller
, p
);
1196 int cg_split_spec(const char *spec
, char **controller
, char **path
) {
1197 char *t
= NULL
, *u
= NULL
;
1203 if (!path_is_safe(spec
))
1211 *path
= path_kill_slashes(t
);
1220 e
= strchr(spec
, ':');
1222 if (!cg_controller_is_valid(spec
))
1239 t
= strndup(spec
, e
-spec
);
1242 if (!cg_controller_is_valid(t
)) {
1256 if (!path_is_safe(u
) ||
1257 !path_is_absolute(u
)) {
1263 path_kill_slashes(u
);
1279 int cg_mangle_path(const char *path
, char **result
) {
1280 _cleanup_free_
char *c
= NULL
, *p
= NULL
;
1287 /* First, check if it already is a filesystem path */
1288 if (path_startswith(path
, "/sys/fs/cgroup")) {
1294 *result
= path_kill_slashes(t
);
1298 /* Otherwise, treat it as cg spec */
1299 r
= cg_split_spec(path
, &c
, &p
);
1303 return cg_get_path(c
?: SYSTEMD_CGROUP_CONTROLLER
, p
?: "/", NULL
, result
);
1306 int cg_get_root_path(char **path
) {
1312 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 1, &p
);
1316 e
= endswith(p
, "/" SPECIAL_INIT_SCOPE
);
1318 e
= endswith(p
, "/" SPECIAL_SYSTEM_SLICE
); /* legacy */
1320 e
= endswith(p
, "/system"); /* even more legacy */
1328 int cg_shift_path(const char *cgroup
, const char *root
, const char **shifted
) {
1329 _cleanup_free_
char *rt
= NULL
;
1337 /* If the root was specified let's use that, otherwise
1338 * let's determine it from PID 1 */
1340 r
= cg_get_root_path(&rt
);
1347 p
= path_startswith(cgroup
, root
);
1348 if (p
&& p
> cgroup
)
1356 int cg_pid_get_path_shifted(pid_t pid
, const char *root
, char **cgroup
) {
1357 _cleanup_free_
char *raw
= NULL
;
1364 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &raw
);
1368 r
= cg_shift_path(raw
, root
, &c
);
1388 int cg_path_decode_unit(const char *cgroup
, char **unit
) {
1395 n
= strcspn(cgroup
, "/");
1399 c
= strndupa(cgroup
, n
);
1402 if (!unit_name_is_valid(c
, UNIT_NAME_PLAIN
|UNIT_NAME_INSTANCE
))
1413 static bool valid_slice_name(const char *p
, size_t n
) {
1418 if (n
< strlen("x.slice"))
1421 if (memcmp(p
+ n
- 6, ".slice", 6) == 0) {
1427 c
= cg_unescape(buf
);
1429 return unit_name_is_valid(c
, UNIT_NAME_PLAIN
);
1435 static const char *skip_slices(const char *p
) {
1438 /* Skips over all slice assignments */
1443 p
+= strspn(p
, "/");
1445 n
= strcspn(p
, "/");
1446 if (!valid_slice_name(p
, n
))
1453 int cg_path_get_unit(const char *path
, char **ret
) {
1461 e
= skip_slices(path
);
1463 r
= cg_path_decode_unit(e
, &unit
);
1467 /* We skipped over the slices, don't accept any now */
1468 if (endswith(unit
, ".slice")) {
1477 int cg_pid_get_unit(pid_t pid
, char **unit
) {
1478 _cleanup_free_
char *cgroup
= NULL
;
1483 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1487 return cg_path_get_unit(cgroup
, unit
);
1491 * Skip session-*.scope, but require it to be there.
1493 static const char *skip_session(const char *p
) {
1499 p
+= strspn(p
, "/");
1501 n
= strcspn(p
, "/");
1502 if (n
< strlen("session-x.scope"))
1505 if (memcmp(p
, "session-", 8) == 0 && memcmp(p
+ n
- 6, ".scope", 6) == 0) {
1506 char buf
[n
- 8 - 6 + 1];
1508 memcpy(buf
, p
+ 8, n
- 8 - 6);
1511 /* Note that session scopes never need unescaping,
1512 * since they cannot conflict with the kernel's own
1513 * names, hence we don't need to call cg_unescape()
1516 if (!session_id_valid(buf
))
1520 p
+= strspn(p
, "/");
1528 * Skip user@*.service, but require it to be there.
1530 static const char *skip_user_manager(const char *p
) {
1536 p
+= strspn(p
, "/");
1538 n
= strcspn(p
, "/");
1539 if (n
< strlen("user@x.service"))
1542 if (memcmp(p
, "user@", 5) == 0 && memcmp(p
+ n
- 8, ".service", 8) == 0) {
1543 char buf
[n
- 5 - 8 + 1];
1545 memcpy(buf
, p
+ 5, n
- 5 - 8);
1548 /* Note that user manager services never need unescaping,
1549 * since they cannot conflict with the kernel's own
1550 * names, hence we don't need to call cg_unescape()
1553 if (parse_uid(buf
, NULL
) < 0)
1557 p
+= strspn(p
, "/");
1565 static const char *skip_user_prefix(const char *path
) {
1570 /* Skip slices, if there are any */
1571 e
= skip_slices(path
);
1573 /* Skip the user manager, if it's in the path now... */
1574 t
= skip_user_manager(e
);
1578 /* Alternatively skip the user session if it is in the path... */
1579 return skip_session(e
);
1582 int cg_path_get_user_unit(const char *path
, char **ret
) {
1588 t
= skip_user_prefix(path
);
1592 /* And from here on it looks pretty much the same as for a
1593 * system unit, hence let's use the same parser from here
1595 return cg_path_get_unit(t
, ret
);
1598 int cg_pid_get_user_unit(pid_t pid
, char **unit
) {
1599 _cleanup_free_
char *cgroup
= NULL
;
1604 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1608 return cg_path_get_user_unit(cgroup
, unit
);
1611 int cg_path_get_machine_name(const char *path
, char **machine
) {
1612 _cleanup_free_
char *u
= NULL
;
1616 r
= cg_path_get_unit(path
, &u
);
1620 sl
= strjoina("/run/systemd/machines/unit:", u
);
1621 return readlink_malloc(sl
, machine
);
1624 int cg_pid_get_machine_name(pid_t pid
, char **machine
) {
1625 _cleanup_free_
char *cgroup
= NULL
;
1630 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1634 return cg_path_get_machine_name(cgroup
, machine
);
1637 int cg_path_get_session(const char *path
, char **session
) {
1638 _cleanup_free_
char *unit
= NULL
;
1644 r
= cg_path_get_unit(path
, &unit
);
1648 start
= startswith(unit
, "session-");
1651 end
= endswith(start
, ".scope");
1656 if (!session_id_valid(start
))
1672 int cg_pid_get_session(pid_t pid
, char **session
) {
1673 _cleanup_free_
char *cgroup
= NULL
;
1676 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1680 return cg_path_get_session(cgroup
, session
);
1683 int cg_path_get_owner_uid(const char *path
, uid_t
*uid
) {
1684 _cleanup_free_
char *slice
= NULL
;
1690 r
= cg_path_get_slice(path
, &slice
);
1694 start
= startswith(slice
, "user-");
1697 end
= endswith(start
, ".slice");
1702 if (parse_uid(start
, uid
) < 0)
1708 int cg_pid_get_owner_uid(pid_t pid
, uid_t
*uid
) {
1709 _cleanup_free_
char *cgroup
= NULL
;
1712 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1716 return cg_path_get_owner_uid(cgroup
, uid
);
1719 int cg_path_get_slice(const char *p
, char **slice
) {
1720 const char *e
= NULL
;
1725 /* Finds the right-most slice unit from the beginning, but
1726 * stops before we come to the first non-slice unit. */
1731 p
+= strspn(p
, "/");
1733 n
= strcspn(p
, "/");
1734 if (!valid_slice_name(p
, n
)) {
1739 s
= strdup(SPECIAL_ROOT_SLICE
);
1747 return cg_path_decode_unit(e
, slice
);
1755 int cg_pid_get_slice(pid_t pid
, char **slice
) {
1756 _cleanup_free_
char *cgroup
= NULL
;
1761 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1765 return cg_path_get_slice(cgroup
, slice
);
1768 int cg_path_get_user_slice(const char *p
, char **slice
) {
1773 t
= skip_user_prefix(p
);
1777 /* And now it looks pretty much the same as for a system
1778 * slice, so let's just use the same parser from here on. */
1779 return cg_path_get_slice(t
, slice
);
1782 int cg_pid_get_user_slice(pid_t pid
, char **slice
) {
1783 _cleanup_free_
char *cgroup
= NULL
;
1788 r
= cg_pid_get_path_shifted(pid
, NULL
, &cgroup
);
1792 return cg_path_get_user_slice(cgroup
, slice
);
1795 char *cg_escape(const char *p
) {
1796 bool need_prefix
= false;
1798 /* This implements very minimal escaping for names to be used
1799 * as file names in the cgroup tree: any name which might
1800 * conflict with a kernel name or is prefixed with '_' is
1801 * prefixed with a '_'. That way, when reading cgroup names it
1802 * is sufficient to remove a single prefixing underscore if
1805 /* The return value of this function (unlike cg_unescape())
1811 streq(p
, "notify_on_release") ||
1812 streq(p
, "release_agent") ||
1813 streq(p
, "tasks") ||
1814 startswith(p
, "cgroup."))
1819 dot
= strrchr(p
, '.');
1824 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
1827 n
= cgroup_controller_to_string(c
);
1832 if (memcmp(p
, n
, l
) != 0)
1842 return strappend("_", p
);
1847 char *cg_unescape(const char *p
) {
1850 /* The return value of this function (unlike cg_escape())
1851 * doesn't need free()! */
1859 #define CONTROLLER_VALID \
1863 bool cg_controller_is_valid(const char *p
) {
1869 if (streq(p
, SYSTEMD_CGROUP_CONTROLLER
))
1872 s
= startswith(p
, "name=");
1876 if (*p
== 0 || *p
== '_')
1879 for (t
= p
; *t
; t
++)
1880 if (!strchr(CONTROLLER_VALID
, *t
))
1883 if (t
- p
> FILENAME_MAX
)
1889 int cg_slice_to_path(const char *unit
, char **ret
) {
1890 _cleanup_free_
char *p
= NULL
, *s
= NULL
, *e
= NULL
;
1897 if (streq(unit
, SPECIAL_ROOT_SLICE
)) {
1907 if (!unit_name_is_valid(unit
, UNIT_NAME_PLAIN
))
1910 if (!endswith(unit
, ".slice"))
1913 r
= unit_name_to_prefix(unit
, &p
);
1917 dash
= strchr(p
, '-');
1919 /* Don't allow initial dashes */
1924 _cleanup_free_
char *escaped
= NULL
;
1925 char n
[dash
- p
+ sizeof(".slice")];
1927 /* Don't allow trailing or double dashes */
1928 if (dash
[1] == 0 || dash
[1] == '-')
1931 strcpy(stpncpy(n
, p
, dash
- p
), ".slice");
1932 if (!unit_name_is_valid(n
, UNIT_NAME_PLAIN
))
1935 escaped
= cg_escape(n
);
1939 if (!strextend(&s
, escaped
, "/", NULL
))
1942 dash
= strchr(dash
+1, '-');
1945 e
= cg_escape(unit
);
1949 if (!strextend(&s
, e
, NULL
))
1958 int cg_set_attribute(const char *controller
, const char *path
, const char *attribute
, const char *value
) {
1959 _cleanup_free_
char *p
= NULL
;
1962 r
= cg_get_path(controller
, path
, attribute
, &p
);
1966 return write_string_file(p
, value
, 0);
1969 int cg_get_attribute(const char *controller
, const char *path
, const char *attribute
, char **ret
) {
1970 _cleanup_free_
char *p
= NULL
;
1973 r
= cg_get_path(controller
, path
, attribute
, &p
);
1977 return read_one_line_file(p
, ret
);
1980 int cg_get_keyed_attribute(const char *controller
, const char *path
, const char *attribute
, const char **keys
, char **values
) {
1981 _cleanup_free_
char *filename
= NULL
, *content
= NULL
;
1985 for (i
= 0; keys
[i
]; i
++)
1988 r
= cg_get_path(controller
, path
, attribute
, &filename
);
1992 r
= read_full_file(filename
, &content
, NULL
);
1997 while ((line
= strsep(&p
, "\n"))) {
2000 key
= strsep(&line
, " ");
2002 for (i
= 0; keys
[i
]; i
++) {
2003 if (streq(key
, keys
[i
])) {
2004 values
[i
] = strdup(line
);
2010 for (i
= 0; keys
[i
]; i
++) {
2012 for (i
= 0; keys
[i
]; i
++) {
2023 int cg_create_everywhere(CGroupMask supported
, CGroupMask mask
, const char *path
) {
2027 /* This one will create a cgroup in our private tree, but also
2028 * duplicate it in the trees specified in mask, and remove it
2031 /* First create the cgroup in our own hierarchy. */
2032 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER
, path
);
2036 /* If we are in the unified hierarchy, we are done now */
2037 if (cg_all_unified())
2040 /* Otherwise, do the same in the other hierarchies */
2041 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
2042 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
2045 n
= cgroup_controller_to_string(c
);
2048 (void) cg_create(n
, path
);
2049 else if (supported
& bit
)
2050 (void) cg_trim(n
, path
, true);
2056 int cg_attach_everywhere(CGroupMask supported
, const char *path
, pid_t pid
, cg_migrate_callback_t path_callback
, void *userdata
) {
2060 r
= cg_attach(SYSTEMD_CGROUP_CONTROLLER
, path
, pid
);
2064 if (cg_all_unified())
2067 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
2068 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
2069 const char *p
= NULL
;
2071 if (!(supported
& bit
))
2075 p
= path_callback(bit
, userdata
);
2080 (void) cg_attach_fallback(cgroup_controller_to_string(c
), p
, pid
);
2086 int cg_attach_many_everywhere(CGroupMask supported
, const char *path
, Set
* pids
, cg_migrate_callback_t path_callback
, void *userdata
) {
2091 SET_FOREACH(pidp
, pids
, i
) {
2092 pid_t pid
= PTR_TO_PID(pidp
);
2095 q
= cg_attach_everywhere(supported
, path
, pid
, path_callback
, userdata
);
2096 if (q
< 0 && r
>= 0)
2103 int cg_migrate_everywhere(CGroupMask supported
, const char *from
, const char *to
, cg_migrate_callback_t to_callback
, void *userdata
) {
2107 if (!path_equal(from
, to
)) {
2108 r
= cg_migrate_recursive(SYSTEMD_CGROUP_CONTROLLER
, from
, SYSTEMD_CGROUP_CONTROLLER
, to
, CGROUP_REMOVE
);
2113 if (cg_all_unified())
2116 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
2117 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
2118 const char *p
= NULL
;
2120 if (!(supported
& bit
))
2124 p
= to_callback(bit
, userdata
);
2129 (void) cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER
, to
, cgroup_controller_to_string(c
), p
, 0);
2135 int cg_trim_everywhere(CGroupMask supported
, const char *path
, bool delete_root
) {
2139 r
= cg_trim(SYSTEMD_CGROUP_CONTROLLER
, path
, delete_root
);
2143 if (cg_all_unified())
2146 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
2147 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
2149 if (!(supported
& bit
))
2152 (void) cg_trim(cgroup_controller_to_string(c
), path
, delete_root
);
2158 int cg_mask_supported(CGroupMask
*ret
) {
2159 CGroupMask mask
= 0;
2162 /* Determines the mask of supported cgroup controllers. Only
2163 * includes controllers we can make sense of and that are
2164 * actually accessible. */
2166 if (cg_all_unified()) {
2167 _cleanup_free_
char *root
= NULL
, *controllers
= NULL
, *path
= NULL
;
2170 /* In the unified hierarchy we can read the supported
2171 * and accessible controllers from the top-level
2172 * cgroup attribute */
2174 r
= cg_get_root_path(&root
);
2178 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, root
, "cgroup.controllers", &path
);
2182 r
= read_one_line_file(path
, &controllers
);
2188 _cleanup_free_
char *n
= NULL
;
2191 r
= extract_first_word(&c
, &n
, NULL
, 0);
2197 v
= cgroup_controller_from_string(n
);
2201 mask
|= CGROUP_CONTROLLER_TO_MASK(v
);
2204 /* Currently, we support the cpu, memory, io and pids
2205 * controllers in the unified hierarchy, mask
2206 * everything else off. */
2207 mask
&= CGROUP_MASK_CPU
| CGROUP_MASK_MEMORY
| CGROUP_MASK_IO
| CGROUP_MASK_PIDS
;
2212 /* In the legacy hierarchy, we check which
2213 * hierarchies are mounted. */
2215 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
2218 n
= cgroup_controller_to_string(c
);
2219 if (controller_is_accessible(n
) >= 0)
2220 mask
|= CGROUP_CONTROLLER_TO_MASK(c
);
2228 int cg_kernel_controllers(Set
*controllers
) {
2229 _cleanup_fclose_
FILE *f
= NULL
;
2233 assert(controllers
);
2235 /* Determines the full list of kernel-known controllers. Might
2236 * include controllers we don't actually support, arbitrary
2237 * named hierarchies and controllers that aren't currently
2238 * accessible (because not mounted). */
2240 f
= fopen("/proc/cgroups", "re");
2242 if (errno
== ENOENT
)
2247 /* Ignore the header line */
2248 (void) fgets(buf
, sizeof(buf
), f
);
2255 if (fscanf(f
, "%ms %*i %*i %i", &controller
, &enabled
) != 2) {
2260 if (ferror(f
) && errno
> 0)
2271 if (!cg_controller_is_valid(controller
)) {
2276 r
= set_consume(controllers
, controller
);
/* Per-thread cache of the detected cgroup hierarchy layout. Starts out as
 * CGROUP_UNIFIED_UNKNOWN; cg_update_unified() fills it in with
 * CGROUP_UNIFIED_ALL, CGROUP_UNIFIED_SYSTEMD or CGROUP_UNIFIED_NONE based on
 * statfs() of the /sys/fs/cgroup mount points, and cg_unified_flush() resets
 * it to CGROUP_UNIFIED_UNKNOWN before triggering a fresh detection. */
2284 static thread_local CGroupUnified unified_cache
= CGROUP_UNIFIED_UNKNOWN
;
2286 /* The hybrid mode was initially implemented in v232 and simply mounted
2287 * cgroup v2 on /sys/fs/cgroup/systemd. This unfortunately broke other
2288 * tools (such as docker) which expected the v1 "name=systemd" hierarchy
2289 * on /sys/fs/cgroup/systemd. From v233 and on, the hybrid mode mounts
2290 * v2 on /sys/fs/cgroup/unified and maintains "name=systemd" hierarchy
2291 * on /sys/fs/cgroup/systemd for compatibility with other tools.
2293 * To keep live upgrade working, we detect and support v232 layout. When
2294 * v232 layout is detected, to keep cgroup v2 process management but
2295 * disable the compat dual layout, we return %true on
2296 * cg_unified(SYSTEMD_CGROUP_CONTROLLER) and %false on cg_hybrid_unified().
/* Per-thread flag written by cg_update_unified() when it detects
 * CGROUP_UNIFIED_SYSTEMD: true if cgroup2 was found on
 * /sys/fs/cgroup/systemd/, false if it was found on /sys/fs/cgroup/unified/.
 * cg_hybrid_unified() only reports true while this flag is false. */
2298 static thread_local
bool unified_systemd_v232
;
/* Probes the mounted cgroup layout with statfs() and stores the result in
 * unified_cache (and unified_systemd_v232).
 *
 * Nothing is probed when unified_cache already holds CGROUP_UNIFIED_NONE or
 * anything greater. Otherwise the file system type of /sys/fs/cgroup/
 * decides:
 *   - cgroup2: CGROUP_UNIFIED_ALL.
 *   - tmpfs, with cgroup2 on /sys/fs/cgroup/unified/: CGROUP_UNIFIED_SYSTEMD,
 *     unified_systemd_v232 = false.
 *   - tmpfs, with cgroup2 on /sys/fs/cgroup/systemd/: CGROUP_UNIFIED_SYSTEMD,
 *     unified_systemd_v232 = true.
 *   - tmpfs, with /sys/fs/cgroup/systemd/ being a cgroup (v1) file system:
 *     CGROUP_UNIFIED_NONE.
 *
 * Callers in this file treat a negative return value as failure.
 * NOTE(review): the return statements, including the error codes used for
 * failed statfs() calls and unexpected file system types, are not visible in
 * this excerpt -- confirm against the full source. */
2300 static int cg_update_unified(void) {
2304 /* Checks if we support the unified hierarchy. Returns an
2305 * error when the cgroup hierarchies aren't mounted yet or we
2306 * have any other trouble determining if the unified hierarchy
2309 if (unified_cache
>= CGROUP_UNIFIED_NONE
)
2312 if (statfs("/sys/fs/cgroup/", &fs
) < 0)
2315 if (F_TYPE_EQUAL(fs
.f_type
, CGROUP2_SUPER_MAGIC
))
2316 unified_cache
= CGROUP_UNIFIED_ALL
;
2317 else if (F_TYPE_EQUAL(fs
.f_type
, TMPFS_MAGIC
)) {
2318 if (statfs("/sys/fs/cgroup/unified/", &fs
) == 0 &&
2319 F_TYPE_EQUAL(fs
.f_type
, CGROUP2_SUPER_MAGIC
)) {
2320 unified_cache
= CGROUP_UNIFIED_SYSTEMD
;
2321 unified_systemd_v232
= false;
2322 } else if (statfs("/sys/fs/cgroup/systemd/", &fs
) == 0 &&
2323 F_TYPE_EQUAL(fs
.f_type
, CGROUP2_SUPER_MAGIC
)) {
2324 unified_cache
= CGROUP_UNIFIED_SYSTEMD
;
2325 unified_systemd_v232
= true;
2327 if (statfs("/sys/fs/cgroup/systemd/", &fs
) < 0)
2329 if (!F_TYPE_EQUAL(fs
.f_type
, CGROUP_SUPER_MAGIC
))
2331 unified_cache
= CGROUP_UNIFIED_NONE
;
2339 bool cg_unified(const char *controller
) {
2341 assert(cg_update_unified() >= 0);
2343 if (streq_ptr(controller
, SYSTEMD_CGROUP_CONTROLLER
))
2344 return unified_cache
>= CGROUP_UNIFIED_SYSTEMD
;
2346 return unified_cache
>= CGROUP_UNIFIED_ALL
;
/* Tells whether the unified hierarchy is in use across the board, i.e. the
 * answer cg_unified() gives when no particular controller is named. */
bool cg_all_unified(void) {
        const bool everything_unified = cg_unified(NULL);

        return everything_unified;
}
2354 bool cg_hybrid_unified(void) {
2356 assert(cg_update_unified() >= 0);
2358 return unified_cache
== CGROUP_UNIFIED_SYSTEMD
&& !unified_systemd_v232
;
2361 int cg_unified_flush(void) {
2362 unified_cache
= CGROUP_UNIFIED_UNKNOWN
;
2364 return cg_update_unified();
/* Writes controller enable/disable requests to the "cgroup.subtree_control"
 * attribute of cgroup `p` (path resolved via cg_get_path() for
 * SYSTEMD_CGROUP_CONTROLLER).
 *
 * supported: controllers to consider at all; bits not set here are skipped.
 * mask:      for each considered controller, a set bit requests "+name",
 *            a cleared bit requests "-name".
 * p:         cgroup path.
 *
 * NOTE(review): local declarations, the copy of the controller name into `s`,
 * the error checks and the return statements are not visible in this
 * excerpt -- confirm the return value contract against the full source. */
2367 int cg_enable_everywhere(CGroupMask supported
, CGroupMask mask
, const char *p
) {
2368 _cleanup_free_
char *fs
= NULL
;
2377 if (!cg_all_unified()) /* on the legacy hierarchy there's no joining of controllers defined */
2380 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, p
, "cgroup.subtree_control", &fs
);
2384 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
2385 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
2388 if (!(supported
& bit
))
2391 n
= cgroup_controller_to_string(c
);
/* Sized for one sign character, the controller name and the
 * terminating NUL. */
2393 char s
[1 + strlen(n
) + 1];
/* The sign is '+' when the controller's bit is set in `mask` and '-'
 * otherwise. */
2395 s
[0] = mask
& bit
? '+' : '-';
2398 r
= write_string_file(fs
, s
, 0);
/* Reported at debug level only; presumably this is reached just for a
 * failed write -- the guarding condition is not visible here. */
2400 log_debug_errno(r
, "Failed to enable controller %s for %s (%s): %m", n
, p
, fs
);
/* Decides whether the fully unified hierarchy should be used.
 *
 * If re-probing the mounted layout via cg_unified_flush() succeeds, the
 * answer is cg_all_unified(). Otherwise the boolean kernel command line
 * option "systemd.unified_cgroup_hierarchy" is consulted: the parsed value is
 * used when proc_cmdline_get_bool() returns a positive value, false
 * otherwise. That result is stored in the static `wanted`.
 *
 * NOTE(review): the local declarations and the code between the second
 * comment and the proc_cmdline_get_bool() call (presumably the check of the
 * cached `wanted` value) are not visible in this excerpt. */
2407 bool cg_is_unified_wanted(void) {
/* Per-thread cache of the command line based decision; -1 is the
 * initial value. */
2408 static thread_local
int wanted
= -1;
2412 /* If the hierarchy is already mounted, then follow whatever
2413 * was chosen for it. */
2414 if (cg_unified_flush() >= 0)
2415 return cg_all_unified();
2417 /* Otherwise, let's see what the kernel command line has to
2418 * say. Since checking that is expensive, let's cache the
2423 r
= proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b
);
2427 return (wanted
= r
> 0 ? b
: false);
/* The legacy hierarchy is wanted exactly when the fully unified one is not. */
bool cg_is_legacy_wanted(void) {
        if (cg_is_unified_wanted())
                return false;

        return true;
}
/* Decides whether the systemd controller hierarchy should be unified.
 *
 * The checks run in this order: cg_is_unified_wanted() first (its outcome
 * handling is not visible here), then, if re-probing the mounted layout via
 * cg_unified_flush() succeeds, cg_unified(SYSTEMD_CGROUP_CONTROLLER) decides.
 * Otherwise the boolean kernel command line option
 * "systemd.legacy_systemd_cgroup_controller" is consulted: the negated parsed
 * value is used when proc_cmdline_get_bool() returns a positive value, false
 * otherwise. That result is stored in the static `wanted`.
 *
 * NOTE(review): the local declarations, the statement executed when
 * cg_is_unified_wanted() is true and the code just before the
 * proc_cmdline_get_bool() call are not visible in this excerpt. */
2434 bool cg_is_unified_systemd_controller_wanted(void) {
/* Per-thread cache of the command line based decision; -1 is the
 * initial value. */
2435 static thread_local
int wanted
= -1;
2439 /* If the unified hierarchy is requested in full, no need to
2440 * bother with this. */
2441 if (cg_is_unified_wanted())
2444 /* If the hierarchy is already mounted, then follow whatever
2445 * was chosen for it. */
2446 if (cg_unified_flush() >= 0)
2447 return cg_unified(SYSTEMD_CGROUP_CONTROLLER
);
2449 /* Otherwise, let's see what the kernel command line has to
2450 * say. Since checking that is expensive, let's cache the
2455 r
= proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", &b
);
2459 /* The meaning of the kernel option is reversed wrt. to the return value
2460 * of this function, hence the negation. */
2461 return (wanted
= r
> 0 ? !b
: false);
/* Parses a cgroup weight from the string `s` into *ret.
 *
 * The string is converted with safe_atou64() and the value is checked
 * against the range CGROUP_WEIGHT_MIN..CGROUP_WEIGHT_MAX.
 *
 * NOTE(review): the local declarations, the condition guarding the
 * CGROUP_WEIGHT_INVALID assignment (presumably an empty-string check), the
 * error checks and all return statements are not visible in this excerpt. */
2464 int cg_weight_parse(const char *s
, uint64_t *ret
) {
/* On this path the output is set to the "invalid" marker value. */
2469 *ret
= CGROUP_WEIGHT_INVALID
;
2473 r
= safe_atou64(s
, &u
);
/* Values outside the permitted weight range are singled out here. */
2477 if (u
< CGROUP_WEIGHT_MIN
|| u
> CGROUP_WEIGHT_MAX
)
2484 const uint64_t cgroup_io_limit_defaults
[_CGROUP_IO_LIMIT_TYPE_MAX
] = {
2485 [CGROUP_IO_RBPS_MAX
] = CGROUP_LIMIT_MAX
,
2486 [CGROUP_IO_WBPS_MAX
] = CGROUP_LIMIT_MAX
,
2487 [CGROUP_IO_RIOPS_MAX
] = CGROUP_LIMIT_MAX
,
2488 [CGROUP_IO_WIOPS_MAX
] = CGROUP_LIMIT_MAX
,
2491 static const char* const cgroup_io_limit_type_table
[_CGROUP_IO_LIMIT_TYPE_MAX
] = {
2492 [CGROUP_IO_RBPS_MAX
] = "IOReadBandwidthMax",
2493 [CGROUP_IO_WBPS_MAX
] = "IOWriteBandwidthMax",
2494 [CGROUP_IO_RIOPS_MAX
] = "IOReadIOPSMax",
2495 [CGROUP_IO_WIOPS_MAX
] = "IOWriteIOPSMax",
2498 DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type
, CGroupIOLimitType
);
/* Parses a CPU shares value from the string `s` into *ret.
 *
 * The string is converted with safe_atou64() and the value is checked
 * against the range CGROUP_CPU_SHARES_MIN..CGROUP_CPU_SHARES_MAX.
 *
 * NOTE(review): the local declarations, the condition guarding the
 * CGROUP_CPU_SHARES_INVALID assignment (presumably an empty-string check),
 * the error checks and all return statements are not visible in this
 * excerpt. */
2500 int cg_cpu_shares_parse(const char *s
, uint64_t *ret
) {
/* On this path the output is set to the "invalid" marker value. */
2505 *ret
= CGROUP_CPU_SHARES_INVALID
;
2509 r
= safe_atou64(s
, &u
);
/* Values outside the permitted shares range are singled out here. */
2513 if (u
< CGROUP_CPU_SHARES_MIN
|| u
> CGROUP_CPU_SHARES_MAX
)
/* Parses a blkio weight from the string `s` into *ret.
 *
 * The string is converted with safe_atou64() and the value is checked
 * against the range CGROUP_BLKIO_WEIGHT_MIN..CGROUP_BLKIO_WEIGHT_MAX.
 *
 * NOTE(review): the local declarations, the condition guarding the
 * CGROUP_BLKIO_WEIGHT_INVALID assignment (presumably an empty-string check),
 * the error checks and all return statements are not visible in this
 * excerpt. */
2520 int cg_blkio_weight_parse(const char *s
, uint64_t *ret
) {
/* On this path the output is set to the "invalid" marker value. */
2525 *ret
= CGROUP_BLKIO_WEIGHT_INVALID
;
2529 r
= safe_atou64(s
, &u
);
/* Values outside the permitted weight range are singled out here. */
2533 if (u
< CGROUP_BLKIO_WEIGHT_MIN
|| u
> CGROUP_BLKIO_WEIGHT_MAX
)
2540 bool is_cgroup_fs(const struct statfs
*s
) {
2541 return is_fs_type(s
, CGROUP_SUPER_MAGIC
) ||
2542 is_fs_type(s
, CGROUP2_SUPER_MAGIC
);
/* Checks whether the file descriptor `fd` refers to something on a cgroup
 * file system: the descriptor is queried with fstatfs() and the result is
 * passed to is_cgroup_fs().
 *
 * NOTE(review): the statement executed when fstatfs() fails is not visible
 * in this excerpt. Since the return type is bool, verify that it returns
 * false and not a negative errno value, which would convert to true. */
2545 bool fd_is_cgroup_fs(int fd
) {
2548 if (fstatfs(fd
, &s
) < 0)
2551 return is_cgroup_fs(&s
);
2554 static const char *cgroup_controller_table
[_CGROUP_CONTROLLER_MAX
] = {
2555 [CGROUP_CONTROLLER_CPU
] = "cpu",
2556 [CGROUP_CONTROLLER_CPUACCT
] = "cpuacct",
2557 [CGROUP_CONTROLLER_IO
] = "io",
2558 [CGROUP_CONTROLLER_BLKIO
] = "blkio",
2559 [CGROUP_CONTROLLER_MEMORY
] = "memory",
2560 [CGROUP_CONTROLLER_DEVICES
] = "devices",
2561 [CGROUP_CONTROLLER_PIDS
] = "pids",
2564 DEFINE_STRING_TABLE_LOOKUP(cgroup_controller
, CGroupController
);