1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

#include "cgroup-setup.h"
#include "cgroup-util.h"
#include "errno-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "missing_threads.h"
#include "mkdir.h"
#include "parse-util.h"
#include "path-util.h"
#include "proc-cmdline.h"
#include "process-util.h"
#include "recurse-dir.h"
#include "set.h"
#include "stdio-util.h"
#include "string-util.h"
#include "user-util.h"
23 static int cg_any_controller_used_for_v1(void) {
24 _cleanup_free_
char *buf
= NULL
;
25 _cleanup_strv_free_
char **lines
= NULL
;
28 r
= read_full_virtual_file("/proc/cgroups", &buf
, NULL
);
30 return log_debug_errno(r
, "Could not read /proc/cgroups, ignoring: %m");
32 r
= strv_split_newlines_full(&lines
, buf
, 0);
36 /* The intention of this is to check if the fully unified cgroup tree setup is possible, meaning all
37 * enabled kernel cgroup controllers are currently not in use by cgroup1. For reference:
38 * https://systemd.io/CGROUP_DELEGATION/#three-different-tree-setups-
40 * Note that this is typically only useful to check inside a container where we don't know what
41 * cgroup tree setup is in use by the host; if the host is using legacy or hybrid, we can't use
42 * unified since some or all controllers would be missing. This is not the best way to detect this,
43 * as whatever container manager created our container should have mounted /sys/fs/cgroup
44 * appropriately, but in case that wasn't done, we try to detect if it's possible for us to use
46 STRV_FOREACH(line
, lines
) {
47 _cleanup_free_
char *name
= NULL
, *hierarchy_id
= NULL
, *num
= NULL
, *enabled
= NULL
;
49 /* Skip header line */
50 if (startswith(*line
, "#"))
53 const char *p
= *line
;
54 r
= extract_many_words(&p
, NULL
, 0, &name
, &hierarchy_id
, &num
, &enabled
, NULL
);
56 return log_debug_errno(r
, "Error parsing /proc/cgroups line, ignoring: %m");
58 log_debug("Invalid /proc/cgroups line, ignoring.");
62 /* Ignore disabled controllers. */
63 if (streq(enabled
, "0"))
66 /* Ignore controllers we don't care about. */
67 if (cgroup_controller_from_string(name
) < 0)
70 /* Since the unified cgroup doesn't use multiple hierarchies, if any controller has a
71 * non-zero hierarchy_id that means it's in use already in a legacy (or hybrid) cgroup v1
72 * hierarchy, and can't be used in a unified cgroup. */
73 if (!streq(hierarchy_id
, "0")) {
74 log_debug("Cgroup controller %s in use by legacy v1 hierarchy.", name
);
82 bool cg_is_unified_wanted(void) {
83 static thread_local
int wanted
= -1;
85 const bool is_default
= DEFAULT_HIERARCHY
== CGROUP_UNIFIED_ALL
;
86 _cleanup_free_
char *c
= NULL
;
89 /* If we have a cached value, return that. */
93 /* If the hierarchy is already mounted, then follow whatever was chosen for it. */
94 r
= cg_unified_cached(true);
96 return (wanted
= r
>= CGROUP_UNIFIED_ALL
);
98 /* If we were explicitly passed systemd.unified_cgroup_hierarchy, respect that. */
99 r
= proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", /* flags = */ 0, &b
);
103 /* If we passed cgroup_no_v1=all with no other instructions, it seems highly unlikely that we want to
104 * use hybrid or legacy hierarchy. */
105 r
= proc_cmdline_get_key("cgroup_no_v1", 0, &c
);
106 if (r
> 0 && streq_ptr(c
, "all"))
107 return (wanted
= true);
109 /* If any controller is in use as v1, don't use unified. */
110 if (cg_any_controller_used_for_v1() > 0)
111 return (wanted
= false);
113 return (wanted
= is_default
);
116 bool cg_is_legacy_wanted(void) {
117 static thread_local
int wanted
= -1;
119 /* If we have a cached value, return that. */
123 /* Check if we have cgroup v2 already mounted. */
124 if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL
)
125 return (wanted
= false);
127 /* Otherwise, assume that at least partial legacy is wanted,
128 * since cgroup v2 should already be mounted at this point. */
129 return (wanted
= true);
132 bool cg_is_hybrid_wanted(void) {
133 static thread_local
int wanted
= -1;
136 const bool is_default
= DEFAULT_HIERARCHY
>= CGROUP_UNIFIED_SYSTEMD
;
137 /* We default to true if the default is "hybrid", obviously, but also when the default is "unified",
138 * because if we get called, it means that unified hierarchy was not mounted. */
140 /* If we have a cached value, return that. */
144 /* If the hierarchy is already mounted, then follow whatever was chosen for it. */
145 if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL
)
146 return (wanted
= false);
148 /* Otherwise, let's see what the kernel command line has to say. Since checking is expensive, cache
149 * a non-error result. */
150 r
= proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", /* flags = */ 0, &b
);
152 /* The meaning of the kernel option is reversed wrt. to the return value of this function, hence the
154 return (wanted
= r
> 0 ? !b
: is_default
);
157 int cg_weight_parse(const char *s
, uint64_t *ret
) {
162 *ret
= CGROUP_WEIGHT_INVALID
;
166 r
= safe_atou64(s
, &u
);
170 if (u
< CGROUP_WEIGHT_MIN
|| u
> CGROUP_WEIGHT_MAX
)
177 int cg_cpu_weight_parse(const char *s
, uint64_t *ret
) {
178 if (streq_ptr(s
, "idle"))
179 return *ret
= CGROUP_WEIGHT_IDLE
;
180 return cg_weight_parse(s
, ret
);
183 int cg_cpu_shares_parse(const char *s
, uint64_t *ret
) {
188 *ret
= CGROUP_CPU_SHARES_INVALID
;
192 r
= safe_atou64(s
, &u
);
196 if (u
< CGROUP_CPU_SHARES_MIN
|| u
> CGROUP_CPU_SHARES_MAX
)
203 int cg_blkio_weight_parse(const char *s
, uint64_t *ret
) {
208 *ret
= CGROUP_BLKIO_WEIGHT_INVALID
;
212 r
= safe_atou64(s
, &u
);
216 if (u
< CGROUP_BLKIO_WEIGHT_MIN
|| u
> CGROUP_BLKIO_WEIGHT_MAX
)
224 RecurseDirEvent event
,
228 const struct dirent
*de
,
229 const struct statx
*sx
,
232 /* Failures to delete inner cgroup we ignore (but debug log in case error code is unexpected) */
233 if (event
== RECURSE_DIR_LEAVE
&&
234 de
->d_type
== DT_DIR
&&
235 unlinkat(dir_fd
, de
->d_name
, AT_REMOVEDIR
) < 0 &&
236 !IN_SET(errno
, ENOENT
, ENOTEMPTY
, EBUSY
))
237 log_debug_errno(errno
, "Failed to trim inner cgroup %s, ignoring: %m", path
);
239 return RECURSE_DIR_CONTINUE
;
242 int cg_trim(const char *controller
, const char *path
, bool delete_root
) {
243 _cleanup_free_
char *fs
= NULL
;
249 r
= cg_get_path(controller
, path
, NULL
, &fs
);
257 /* n_depth_max= */ UINT_MAX
,
258 RECURSE_DIR_ENSURE_TYPE
,
261 if (r
== -ENOENT
) /* non-existing is the ultimate trimming, hence no error */
264 log_debug_errno(r
, "Failed to iterate through cgroup %s: %m", path
);
266 /* If we shall delete the top-level cgroup, then propagate the failure to do so (except if it is
267 * already gone anyway). Also, let's debug log about this failure, except if the error code is an
269 if (delete_root
&& !empty_or_root(path
) &&
270 rmdir(fs
) < 0 && errno
!= ENOENT
) {
271 if (!IN_SET(errno
, ENOTEMPTY
, EBUSY
))
272 log_debug_errno(errno
, "Failed to trim cgroup %s: %m", path
);
277 q
= cg_hybrid_unified();
280 if (q
> 0 && streq(controller
, SYSTEMD_CGROUP_CONTROLLER
))
281 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, delete_root
);
286 /* Create a cgroup in the hierarchy of controller.
287 * Returns 0 if the group already existed, 1 on success, negative otherwise.
289 int cg_create(const char *controller
, const char *path
) {
290 _cleanup_free_
char *fs
= NULL
;
293 r
= cg_get_path_and_check(controller
, path
, NULL
, &fs
);
297 r
= mkdir_parents(fs
, 0755);
301 r
= RET_NERRNO(mkdir(fs
, 0755));
307 r
= cg_hybrid_unified();
311 if (r
> 0 && streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
312 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
);
314 log_warning_errno(r
, "Failed to create compat systemd cgroup %s: %m", path
);
/* Creates the cgroup and attaches the given PID to it. Returns cg_create()'s result (0 if the
 * group already existed, 1 if it was created) on success, negative on error. */
int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
        int r, q;

        assert(pid >= 0);

        r = cg_create(controller, path);
        if (r < 0)
                return r;

        q = cg_attach(controller, path, pid);
        if (q < 0)
                return q;

        /* This does not remove the cgroup on failure */

        return r;
}
337 int cg_attach(const char *controller
, const char *path
, pid_t pid
) {
338 _cleanup_free_
char *fs
= NULL
;
339 char c
[DECIMAL_STR_MAX(pid_t
) + 2];
345 r
= cg_get_path_and_check(controller
, path
, "cgroup.procs", &fs
);
350 pid
= getpid_cached();
352 xsprintf(c
, PID_FMT
"\n", pid
);
354 r
= write_string_file(fs
, c
, WRITE_STRING_FILE_DISABLE_BUFFER
);
355 if (r
== -EOPNOTSUPP
&& cg_is_threaded(path
) > 0)
356 /* When the threaded mode is used, we cannot read/write the file. Let's return recognizable error. */
361 r
= cg_hybrid_unified();
365 if (r
> 0 && streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
366 r
= cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, pid
);
368 log_warning_errno(r
, "Failed to attach "PID_FMT
" to compat systemd cgroup %s: %m", pid
, path
);
374 int cg_attach_fallback(const char *controller
, const char *path
, pid_t pid
) {
381 r
= cg_attach(controller
, path
, pid
);
383 char prefix
[strlen(path
) + 1];
385 /* This didn't work? Then let's try all prefixes of
388 PATH_FOREACH_PREFIX(prefix
, path
) {
391 q
= cg_attach(controller
, prefix
, pid
);
401 const char *controller
,
411 /* cgroup v1, aka legacy/non-unified */
412 static const struct Attribute legacy_attributes
[] = {
413 { "cgroup.procs", true },
415 { "cgroup.clone_children", false },
419 /* cgroup v2, aka unified */
420 static const struct Attribute unified_attributes
[] = {
421 { "cgroup.procs", true },
422 { "cgroup.subtree_control", true },
423 { "cgroup.threads", false },
427 static const struct Attribute
* const attributes
[] = {
428 [false] = legacy_attributes
,
429 [true] = unified_attributes
,
432 _cleanup_free_
char *fs
= NULL
;
433 const struct Attribute
*i
;
438 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
441 unified
= cg_unified_controller(controller
);
445 /* Configure access to the cgroup itself */
446 r
= cg_get_path(controller
, path
, NULL
, &fs
);
450 r
= chmod_and_chown(fs
, 0755, uid
, gid
);
454 /* Configure access to the cgroup's attributes */
455 for (i
= attributes
[unified
]; i
->name
; i
++) {
458 r
= cg_get_path(controller
, path
, i
->name
, &fs
);
462 r
= chmod_and_chown(fs
, 0644, uid
, gid
);
467 log_debug_errno(r
, "Failed to set access on cgroup %s, ignoring: %m", fs
);
471 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
472 r
= cg_hybrid_unified();
476 /* Always propagate access mode from unified to legacy controller */
477 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, uid
, gid
);
479 log_debug_errno(r
, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path
);
/* Parameter/result bag for access_callback(); fields inferred from the d->uid/d->gid/d->error
 * uses in the callback — confirm against upstream. 'error' records the last chown errno. */
struct access_callback_data {
        uid_t uid;
        gid_t gid;
        int error;
};
492 static int access_callback(
493 RecurseDirEvent event
,
497 const struct dirent
*de
,
498 const struct statx
*sx
,
501 struct access_callback_data
*d
= ASSERT_PTR(userdata
);
503 if (!IN_SET(event
, RECURSE_DIR_ENTER
, RECURSE_DIR_ENTRY
))
504 return RECURSE_DIR_CONTINUE
;
506 assert(inode_fd
>= 0);
508 /* fchown() doesn't support O_PATH fds, hence we use the /proc/self/fd/ trick */
509 if (chown(FORMAT_PROC_FD_PATH(inode_fd
), d
->uid
, d
->gid
) < 0) {
510 log_debug_errno(errno
, "Failed to change ownership of '%s', ignoring: %m", ASSERT_PTR(path
));
512 if (d
->error
== 0) /* Return last error to caller */
516 return RECURSE_DIR_CONTINUE
;
519 int cg_set_access_recursive(
520 const char *controller
,
525 _cleanup_close_
int fd
= -EBADF
;
526 _cleanup_free_
char *fs
= NULL
;
529 /* A recursive version of cg_set_access(). But note that this one changes ownership of *all* files,
530 * not just the allowlist that cg_set_access() uses. Use cg_set_access() on the cgroup you want to
531 * delegate, and cg_set_access_recursive() for any subcrgoups you might want to create below it. */
533 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
536 r
= cg_get_path(controller
, path
, NULL
, &fs
);
540 fd
= open(fs
, O_DIRECTORY
|O_CLOEXEC
|O_RDONLY
);
544 struct access_callback_data d
= {
552 /* n_depth_max= */ UINT_MAX
,
553 RECURSE_DIR_SAME_MOUNT
|RECURSE_DIR_INODE_FD
|RECURSE_DIR_TOPLEVEL
,
570 _cleanup_set_free_ Set
*s
= NULL
;
583 my_pid
= getpid_cached();
586 _cleanup_fclose_
FILE *f
= NULL
;
590 r
= cg_enumerate_processes(cfrom
, pfrom
, &f
);
592 if (ret
>= 0 && r
!= -ENOENT
)
598 while ((r
= cg_read_pid(f
, &pid
)) > 0) {
600 /* This might do weird stuff if we aren't a
601 * single-threaded program. However, we
602 * luckily know we are not */
603 if ((flags
& CGROUP_IGNORE_SELF
) && pid
== my_pid
)
606 if (set_get(s
, PID_TO_PTR(pid
)) == PID_TO_PTR(pid
))
609 /* Ignore kernel threads. Since they can only
610 * exist in the root cgroup, we only check for
613 empty_or_root(pfrom
) &&
614 is_kernel_thread(pid
) > 0)
617 r
= cg_attach(cto
, pto
, pid
);
619 if (ret
>= 0 && r
!= -ESRCH
)
626 r
= set_put(s
, PID_TO_PTR(pid
));
646 int cg_migrate_recursive(
653 _cleanup_closedir_
DIR *d
= NULL
;
662 ret
= cg_migrate(cfrom
, pfrom
, cto
, pto
, flags
);
664 r
= cg_enumerate_subgroups(cfrom
, pfrom
, &d
);
666 if (ret
>= 0 && r
!= -ENOENT
)
672 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
673 _cleanup_free_
char *p
= NULL
;
675 p
= path_join(empty_to_root(pfrom
), fn
);
680 r
= cg_migrate_recursive(cfrom
, p
, cto
, pto
, flags
);
681 if (r
!= 0 && ret
>= 0)
685 if (r
< 0 && ret
>= 0)
688 if (flags
& CGROUP_REMOVE
) {
689 r
= cg_rmdir(cfrom
, pfrom
);
690 if (r
< 0 && ret
>= 0 && !IN_SET(r
, -ENOENT
, -EBUSY
))
697 int cg_migrate_recursive_fallback(
711 r
= cg_migrate_recursive(cfrom
, pfrom
, cto
, pto
, flags
);
713 char prefix
[strlen(pto
) + 1];
715 /* This didn't work? Then let's try all prefixes of the destination */
717 PATH_FOREACH_PREFIX(prefix
, pto
) {
720 q
= cg_migrate_recursive(cfrom
, pfrom
, cto
, prefix
, flags
);
729 int cg_create_everywhere(CGroupMask supported
, CGroupMask mask
, const char *path
) {
735 /* This one will create a cgroup in our private tree, but also
736 * duplicate it in the trees specified in mask, and remove it
739 * Returns 0 if the group already existed in the systemd hierarchy,
740 * 1 on success, negative otherwise.
743 /* First create the cgroup in our own hierarchy. */
744 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER
, path
);
749 /* If we are in the unified hierarchy, we are done now */
750 r
= cg_all_unified();
756 supported
&= CGROUP_MASK_V1
;
757 mask
= CGROUP_MASK_EXTEND_JOINED(mask
);
760 /* Otherwise, do the same in the other hierarchies */
761 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
762 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
765 if (!FLAGS_SET(supported
, bit
))
768 if (FLAGS_SET(done
, bit
))
771 n
= cgroup_controller_to_string(c
);
772 if (FLAGS_SET(mask
, bit
))
773 (void) cg_create(n
, path
);
775 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
781 int cg_attach_everywhere(CGroupMask supported
, const char *path
, pid_t pid
, cg_migrate_callback_t path_callback
, void *userdata
) {
784 r
= cg_attach(SYSTEMD_CGROUP_CONTROLLER
, path
, pid
);
788 r
= cg_all_unified();
794 supported
&= CGROUP_MASK_V1
;
797 for (CGroupController c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
798 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
799 const char *p
= NULL
;
801 if (!FLAGS_SET(supported
, bit
))
804 if (FLAGS_SET(done
, bit
))
808 p
= path_callback(bit
, userdata
);
812 (void) cg_attach_fallback(cgroup_controller_to_string(c
), p
, pid
);
813 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
819 int cg_migrate_v1_controllers(CGroupMask supported
, CGroupMask mask
, const char *from
, cg_migrate_callback_t to_callback
, void *userdata
) {
826 supported
&= CGROUP_MASK_V1
;
827 mask
= CGROUP_MASK_EXTEND_JOINED(mask
);
830 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
831 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
832 const char *to
= NULL
;
834 if (!FLAGS_SET(supported
, bit
))
837 if (FLAGS_SET(done
, bit
))
840 if (!FLAGS_SET(mask
, bit
))
843 to
= to_callback(bit
, userdata
);
845 /* Remember first error and try continuing */
846 q
= cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER
, from
, cgroup_controller_to_string(c
), to
, 0);
849 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
855 int cg_trim_everywhere(CGroupMask supported
, const char *path
, bool delete_root
) {
858 r
= cg_trim(SYSTEMD_CGROUP_CONTROLLER
, path
, delete_root
);
862 q
= cg_all_unified();
868 return cg_trim_v1_controllers(supported
, _CGROUP_MASK_ALL
, path
, delete_root
);
871 int cg_trim_v1_controllers(CGroupMask supported
, CGroupMask mask
, const char *path
, bool delete_root
) {
876 supported
&= CGROUP_MASK_V1
;
877 mask
= CGROUP_MASK_EXTEND_JOINED(mask
);
880 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
881 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
883 if (!FLAGS_SET(supported
, bit
))
886 if (FLAGS_SET(done
, bit
))
889 if (FLAGS_SET(mask
, bit
)) {
890 /* Remember first error and try continuing */
891 q
= cg_trim(cgroup_controller_to_string(c
), path
, delete_root
);
894 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
900 int cg_enable_everywhere(
901 CGroupMask supported
,
904 CGroupMask
*ret_result_mask
) {
906 _cleanup_fclose_
FILE *f
= NULL
;
907 _cleanup_free_
char *fs
= NULL
;
914 if (supported
== 0) {
916 *ret_result_mask
= 0;
920 r
= cg_all_unified();
924 /* On the legacy hierarchy there's no concept of "enabling" controllers in cgroups defined. Let's claim
925 * complete success right away. (If you wonder why we return the full mask here, rather than zero: the
926 * caller tends to use the returned mask later on to compare if all controllers where properly joined,
927 * and if not requeues realization. This use is the primary purpose of the return value, hence let's
928 * minimize surprises here and reduce triggers for re-realization by always saying we fully
931 *ret_result_mask
= mask
& supported
& CGROUP_MASK_V2
; /* If you wonder why we mask this with
932 * CGROUP_MASK_V2: The 'supported' mask
933 * might contain pure-V1 or BPF
934 * controllers, and we never want to
935 * claim that we could enable those with
936 * cgroup.subtree_control */
940 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, p
, "cgroup.subtree_control", &fs
);
944 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
945 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
948 if (!FLAGS_SET(CGROUP_MASK_V2
, bit
))
951 if (!FLAGS_SET(supported
, bit
))
954 n
= cgroup_controller_to_string(c
);
956 char s
[1 + strlen(n
) + 1];
958 s
[0] = FLAGS_SET(mask
, bit
) ? '+' : '-';
964 return log_debug_errno(errno
, "Failed to open cgroup.subtree_control file of %s: %m", p
);
967 r
= write_string_stream(f
, s
, WRITE_STRING_FILE_DISABLE_BUFFER
);
969 log_debug_errno(r
, "Failed to %s controller %s for %s (%s): %m",
970 FLAGS_SET(mask
, bit
) ? "enable" : "disable", n
, p
, fs
);
973 /* If we can't turn off a controller, leave it on in the reported resulting mask. This
974 * happens for example when we attempt to turn off a controller up in the tree that is
975 * used down in the tree. */
976 if (!FLAGS_SET(mask
, bit
) && r
== -EBUSY
) /* You might wonder why we check for EBUSY
977 * only here, and not follow the same logic
978 * for other errors such as EINVAL or
979 * EOPNOTSUPP or anything else. That's
980 * because EBUSY indicates that the
981 * controllers is currently enabled and
982 * cannot be disabled because something down
983 * the hierarchy is still using it. Any other
984 * error most likely means something like "I
985 * never heard of this controller" or
986 * similar. In the former case it's hence
987 * safe to assume the controller is still on
988 * after the failed operation, while in the
989 * latter case it's safer to assume the
990 * controller is unknown and hence certainly
994 /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */
995 if (FLAGS_SET(mask
, bit
))
1001 /* Let's return the precise set of controllers now enabled for the cgroup. */
1002 if (ret_result_mask
)
1003 *ret_result_mask
= ret
;