1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
5 #include "cgroup-setup.h"
6 #include "cgroup-util.h"
7 #include "errno-util.h"
12 #include "parse-util.h"
13 #include "path-util.h"
14 #include "proc-cmdline.h"
15 #include "process-util.h"
16 #include "recurse-dir.h"
17 #include "stdio-util.h"
18 #include "string-util.h"
19 #include "user-util.h"
22 static int cg_any_controller_used_for_v1(void) {
23 _cleanup_free_
char *buf
= NULL
;
24 _cleanup_strv_free_
char **lines
= NULL
;
28 r
= read_full_virtual_file("/proc/cgroups", &buf
, NULL
);
30 return log_debug_errno(r
, "Could not read /proc/cgroups, ignoring: %m");
32 r
= strv_split_newlines_full(&lines
, buf
, 0);
36 /* The intention of this is to check if the fully unified cgroup tree setup is possible, meaning all
37 * enabled kernel cgroup controllers are currently not in use by cgroup1. For reference:
38 * https://systemd.io/CGROUP_DELEGATION/#three-different-tree-setups-
40 * Note that this is typically only useful to check inside a container where we don't know what
41 * cgroup tree setup is in use by the host; if the host is using legacy or hybrid, we can't use
42 * unified since some or all controllers would be missing. This is not the best way to detect this,
43 * as whatever container manager created our container should have mounted /sys/fs/cgroup
44 * appropriately, but in case that wasn't done, we try to detect if it's possible for us to use
46 STRV_FOREACH(line
, lines
) {
47 _cleanup_free_
char *name
= NULL
, *hierarchy_id
= NULL
, *num
= NULL
, *enabled
= NULL
;
49 /* Skip header line */
50 if (startswith(*line
, "#"))
53 const char *p
= *line
;
54 r
= extract_many_words(&p
, NULL
, 0, &name
, &hierarchy_id
, &num
, &enabled
, NULL
);
56 return log_debug_errno(r
, "Error parsing /proc/cgroups line, ignoring: %m");
58 log_debug("Invalid /proc/cgroups line, ignoring.");
62 /* Ignore disabled controllers. */
63 if (streq(enabled
, "0"))
66 /* Ignore controllers we don't care about. */
67 if (cgroup_controller_from_string(name
) < 0)
70 /* Since the unified cgroup doesn't use multiple hierarchies, if any controller has a
71 * non-zero hierarchy_id that means it's in use already in a legacy (or hybrid) cgroup v1
72 * hierarchy, and can't be used in a unified cgroup. */
73 if (!streq(hierarchy_id
, "0")) {
74 log_debug("Cgroup controller %s in use by legacy v1 hierarchy.", name
);
82 bool cg_is_unified_wanted(void) {
83 static thread_local
int wanted
= -1;
85 const bool is_default
= DEFAULT_HIERARCHY
== CGROUP_UNIFIED_ALL
;
86 _cleanup_free_
char *c
= NULL
;
89 /* If we have a cached value, return that. */
93 /* If the hierarchy is already mounted, then follow whatever was chosen for it. */
94 r
= cg_unified_cached(true);
96 return (wanted
= r
>= CGROUP_UNIFIED_ALL
);
98 /* If we were explicitly passed systemd.unified_cgroup_hierarchy, respect that. */
99 r
= proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b
);
103 /* If we passed cgroup_no_v1=all with no other instructions, it seems highly unlikely that we want to
104 * use hybrid or legacy hierarchy. */
105 r
= proc_cmdline_get_key("cgroup_no_v1", 0, &c
);
106 if (r
> 0 && streq_ptr(c
, "all"))
107 return (wanted
= true);
109 /* If any controller is in use as v1, don't use unified. */
110 if (cg_any_controller_used_for_v1() > 0)
111 return (wanted
= false);
113 return (wanted
= is_default
);
116 bool cg_is_legacy_wanted(void) {
117 static thread_local
int wanted
= -1;
119 /* If we have a cached value, return that. */
123 /* Check if we have cgroup v2 already mounted. */
124 if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL
)
125 return (wanted
= false);
127 /* Otherwise, assume that at least partial legacy is wanted,
128 * since cgroup v2 should already be mounted at this point. */
129 return (wanted
= true);
132 bool cg_is_hybrid_wanted(void) {
133 static thread_local
int wanted
= -1;
136 const bool is_default
= DEFAULT_HIERARCHY
>= CGROUP_UNIFIED_SYSTEMD
;
137 /* We default to true if the default is "hybrid", obviously, but also when the default is "unified",
138 * because if we get called, it means that unified hierarchy was not mounted. */
140 /* If we have a cached value, return that. */
144 /* If the hierarchy is already mounted, then follow whatever was chosen for it. */
145 if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL
)
146 return (wanted
= false);
148 /* Otherwise, let's see what the kernel command line has to say. Since checking is expensive, cache
149 * a non-error result. */
150 r
= proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", &b
);
152 /* The meaning of the kernel option is reversed wrt. to the return value of this function, hence the
154 return (wanted
= r
> 0 ? !b
: is_default
);
157 int cg_weight_parse(const char *s
, uint64_t *ret
) {
162 *ret
= CGROUP_WEIGHT_INVALID
;
166 r
= safe_atou64(s
, &u
);
170 if (u
< CGROUP_WEIGHT_MIN
|| u
> CGROUP_WEIGHT_MAX
)
177 int cg_cpu_shares_parse(const char *s
, uint64_t *ret
) {
182 *ret
= CGROUP_CPU_SHARES_INVALID
;
186 r
= safe_atou64(s
, &u
);
190 if (u
< CGROUP_CPU_SHARES_MIN
|| u
> CGROUP_CPU_SHARES_MAX
)
197 int cg_blkio_weight_parse(const char *s
, uint64_t *ret
) {
202 *ret
= CGROUP_BLKIO_WEIGHT_INVALID
;
206 r
= safe_atou64(s
, &u
);
210 if (u
< CGROUP_BLKIO_WEIGHT_MIN
|| u
> CGROUP_BLKIO_WEIGHT_MAX
)
218 RecurseDirEvent event
,
222 const struct dirent
*de
,
223 const struct statx
*sx
,
226 /* Failures to delete inner cgroup we ignore (but debug log in case error code is unexpected) */
227 if (event
== RECURSE_DIR_LEAVE
&&
228 de
->d_type
== DT_DIR
&&
229 unlinkat(dir_fd
, de
->d_name
, AT_REMOVEDIR
) < 0 &&
230 !IN_SET(errno
, ENOENT
, ENOTEMPTY
, EBUSY
))
231 log_debug_errno(errno
, "Failed to trim inner cgroup %s, ignoring: %m", path
);
233 return RECURSE_DIR_CONTINUE
;
236 int cg_trim(const char *controller
, const char *path
, bool delete_root
) {
237 _cleanup_free_
char *fs
= NULL
;
243 r
= cg_get_path(controller
, path
, NULL
, &fs
);
251 /* n_depth_max= */ UINT_MAX
,
252 RECURSE_DIR_ENSURE_TYPE
,
255 if (r
== -ENOENT
) /* non-existing is the ultimate trimming, hence no error */
258 log_debug_errno(r
, "Failed to iterate through cgroup %s: %m", path
);
260 /* If we shall delete the top-level cgroup, then propagate the faiure to do so (except if it is
261 * already gone anyway). Also, let's debug log about this failure, except if the error code is an
263 if (delete_root
&& !empty_or_root(path
) &&
264 rmdir(fs
) < 0 && errno
!= ENOENT
) {
265 if (!IN_SET(errno
, ENOTEMPTY
, EBUSY
))
266 log_debug_errno(errno
, "Failed to trim cgroup %s: %m", path
);
271 q
= cg_hybrid_unified();
274 if (q
> 0 && streq(controller
, SYSTEMD_CGROUP_CONTROLLER
))
275 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, delete_root
);
280 /* Create a cgroup in the hierarchy of controller.
281 * Returns 0 if the group already existed, 1 on success, negative otherwise.
283 int cg_create(const char *controller
, const char *path
) {
284 _cleanup_free_
char *fs
= NULL
;
287 r
= cg_get_path_and_check(controller
, path
, NULL
, &fs
);
291 r
= mkdir_parents(fs
, 0755);
295 r
= RET_NERRNO(mkdir(fs
, 0755));
301 r
= cg_hybrid_unified();
305 if (r
> 0 && streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
306 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
);
308 log_warning_errno(r
, "Failed to create compat systemd cgroup %s: %m", path
);
/* Create the cgroup at path in the given controller's hierarchy and attach pid to it.
 * Returns cg_create()'s result (0 if the group already existed, 1 if it was created) on success,
 * negative errno-style error otherwise. */
int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
        int r, q;

        assert(pid >= 0);

        r = cg_create(controller, path);
        if (r < 0)
                return r;

        q = cg_attach(controller, path, pid);
        if (q < 0)
                return q;

        /* This does not remove the cgroup on failure */
        return r;
}
331 int cg_attach(const char *controller
, const char *path
, pid_t pid
) {
332 _cleanup_free_
char *fs
= NULL
;
333 char c
[DECIMAL_STR_MAX(pid_t
) + 2];
339 r
= cg_get_path_and_check(controller
, path
, "cgroup.procs", &fs
);
344 pid
= getpid_cached();
346 xsprintf(c
, PID_FMT
"\n", pid
);
348 r
= write_string_file(fs
, c
, WRITE_STRING_FILE_DISABLE_BUFFER
);
349 if (r
== -EOPNOTSUPP
&& cg_is_threaded(controller
, path
) > 0)
350 /* When the threaded mode is used, we cannot read/write the file. Let's return recognizable error. */
355 r
= cg_hybrid_unified();
359 if (r
> 0 && streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
360 r
= cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, pid
);
362 log_warning_errno(r
, "Failed to attach "PID_FMT
" to compat systemd cgroup %s: %m", pid
, path
);
368 int cg_attach_fallback(const char *controller
, const char *path
, pid_t pid
) {
375 r
= cg_attach(controller
, path
, pid
);
377 char prefix
[strlen(path
) + 1];
379 /* This didn't work? Then let's try all prefixes of
382 PATH_FOREACH_PREFIX(prefix
, path
) {
385 q
= cg_attach(controller
, prefix
, pid
);
395 const char *controller
,
405 /* cgroup v1, aka legacy/non-unified */
406 static const struct Attribute legacy_attributes
[] = {
407 { "cgroup.procs", true },
409 { "cgroup.clone_children", false },
413 /* cgroup v2, aka unified */
414 static const struct Attribute unified_attributes
[] = {
415 { "cgroup.procs", true },
416 { "cgroup.subtree_control", true },
417 { "cgroup.threads", false },
421 static const struct Attribute
* const attributes
[] = {
422 [false] = legacy_attributes
,
423 [true] = unified_attributes
,
426 _cleanup_free_
char *fs
= NULL
;
427 const struct Attribute
*i
;
432 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
435 unified
= cg_unified_controller(controller
);
439 /* Configure access to the cgroup itself */
440 r
= cg_get_path(controller
, path
, NULL
, &fs
);
444 r
= chmod_and_chown(fs
, 0755, uid
, gid
);
448 /* Configure access to the cgroup's attributes */
449 for (i
= attributes
[unified
]; i
->name
; i
++) {
452 r
= cg_get_path(controller
, path
, i
->name
, &fs
);
456 r
= chmod_and_chown(fs
, 0644, uid
, gid
);
461 log_debug_errno(r
, "Failed to set access on cgroup %s, ignoring: %m", fs
);
465 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
466 r
= cg_hybrid_unified();
470 /* Always propagate access mode from unified to legacy controller */
471 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, uid
, gid
);
473 log_debug_errno(r
, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path
);
488 _cleanup_set_free_ Set
*s
= NULL
;
501 my_pid
= getpid_cached();
504 _cleanup_fclose_
FILE *f
= NULL
;
508 r
= cg_enumerate_processes(cfrom
, pfrom
, &f
);
510 if (ret
>= 0 && r
!= -ENOENT
)
516 while ((r
= cg_read_pid(f
, &pid
)) > 0) {
518 /* This might do weird stuff if we aren't a
519 * single-threaded program. However, we
520 * luckily know we are not */
521 if ((flags
& CGROUP_IGNORE_SELF
) && pid
== my_pid
)
524 if (set_get(s
, PID_TO_PTR(pid
)) == PID_TO_PTR(pid
))
527 /* Ignore kernel threads. Since they can only
528 * exist in the root cgroup, we only check for
531 empty_or_root(pfrom
) &&
532 is_kernel_thread(pid
) > 0)
535 r
= cg_attach(cto
, pto
, pid
);
537 if (ret
>= 0 && r
!= -ESRCH
)
544 r
= set_put(s
, PID_TO_PTR(pid
));
564 int cg_migrate_recursive(
571 _cleanup_closedir_
DIR *d
= NULL
;
580 ret
= cg_migrate(cfrom
, pfrom
, cto
, pto
, flags
);
582 r
= cg_enumerate_subgroups(cfrom
, pfrom
, &d
);
584 if (ret
>= 0 && r
!= -ENOENT
)
590 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
591 _cleanup_free_
char *p
= NULL
;
593 p
= path_join(empty_to_root(pfrom
), fn
);
598 r
= cg_migrate_recursive(cfrom
, p
, cto
, pto
, flags
);
599 if (r
!= 0 && ret
>= 0)
603 if (r
< 0 && ret
>= 0)
606 if (flags
& CGROUP_REMOVE
) {
607 r
= cg_rmdir(cfrom
, pfrom
);
608 if (r
< 0 && ret
>= 0 && !IN_SET(r
, -ENOENT
, -EBUSY
))
615 int cg_migrate_recursive_fallback(
629 r
= cg_migrate_recursive(cfrom
, pfrom
, cto
, pto
, flags
);
631 char prefix
[strlen(pto
) + 1];
633 /* This didn't work? Then let's try all prefixes of the destination */
635 PATH_FOREACH_PREFIX(prefix
, pto
) {
638 q
= cg_migrate_recursive(cfrom
, pfrom
, cto
, prefix
, flags
);
647 int cg_create_everywhere(CGroupMask supported
, CGroupMask mask
, const char *path
) {
653 /* This one will create a cgroup in our private tree, but also
654 * duplicate it in the trees specified in mask, and remove it
657 * Returns 0 if the group already existed in the systemd hierarchy,
658 * 1 on success, negative otherwise.
661 /* First create the cgroup in our own hierarchy. */
662 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER
, path
);
667 /* If we are in the unified hierarchy, we are done now */
668 r
= cg_all_unified();
674 supported
&= CGROUP_MASK_V1
;
675 mask
= CGROUP_MASK_EXTEND_JOINED(mask
);
678 /* Otherwise, do the same in the other hierarchies */
679 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
680 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
683 if (!FLAGS_SET(supported
, bit
))
686 if (FLAGS_SET(done
, bit
))
689 n
= cgroup_controller_to_string(c
);
690 if (FLAGS_SET(mask
, bit
))
691 (void) cg_create(n
, path
);
693 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
699 int cg_attach_everywhere(CGroupMask supported
, const char *path
, pid_t pid
, cg_migrate_callback_t path_callback
, void *userdata
) {
702 r
= cg_attach(SYSTEMD_CGROUP_CONTROLLER
, path
, pid
);
706 r
= cg_all_unified();
712 supported
&= CGROUP_MASK_V1
;
715 for (CGroupController c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
716 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
717 const char *p
= NULL
;
719 if (!FLAGS_SET(supported
, bit
))
722 if (FLAGS_SET(done
, bit
))
726 p
= path_callback(bit
, userdata
);
730 (void) cg_attach_fallback(cgroup_controller_to_string(c
), p
, pid
);
731 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
737 int cg_migrate_v1_controllers(CGroupMask supported
, CGroupMask mask
, const char *from
, cg_migrate_callback_t to_callback
, void *userdata
) {
744 supported
&= CGROUP_MASK_V1
;
745 mask
= CGROUP_MASK_EXTEND_JOINED(mask
);
748 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
749 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
750 const char *to
= NULL
;
752 if (!FLAGS_SET(supported
, bit
))
755 if (FLAGS_SET(done
, bit
))
758 if (!FLAGS_SET(mask
, bit
))
761 to
= to_callback(bit
, userdata
);
763 /* Remember first error and try continuing */
764 q
= cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER
, from
, cgroup_controller_to_string(c
), to
, 0);
771 int cg_trim_everywhere(CGroupMask supported
, const char *path
, bool delete_root
) {
774 r
= cg_trim(SYSTEMD_CGROUP_CONTROLLER
, path
, delete_root
);
778 q
= cg_all_unified();
784 return cg_trim_v1_controllers(supported
, _CGROUP_MASK_ALL
, path
, delete_root
);
787 int cg_trim_v1_controllers(CGroupMask supported
, CGroupMask mask
, const char *path
, bool delete_root
) {
792 supported
&= CGROUP_MASK_V1
;
793 mask
= CGROUP_MASK_EXTEND_JOINED(mask
);
796 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
797 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
799 if (!FLAGS_SET(supported
, bit
))
802 if (FLAGS_SET(done
, bit
))
805 if (FLAGS_SET(mask
, bit
)) {
806 /* Remember first error and try continuing */
807 q
= cg_trim(cgroup_controller_to_string(c
), path
, delete_root
);
810 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
816 int cg_enable_everywhere(
817 CGroupMask supported
,
820 CGroupMask
*ret_result_mask
) {
822 _cleanup_fclose_
FILE *f
= NULL
;
823 _cleanup_free_
char *fs
= NULL
;
830 if (supported
== 0) {
832 *ret_result_mask
= 0;
836 r
= cg_all_unified();
840 /* On the legacy hierarchy there's no concept of "enabling" controllers in cgroups defined. Let's claim
841 * complete success right away. (If you wonder why we return the full mask here, rather than zero: the
842 * caller tends to use the returned mask later on to compare if all controllers where properly joined,
843 * and if not requeues realization. This use is the primary purpose of the return value, hence let's
844 * minimize surprises here and reduce triggers for re-realization by always saying we fully
847 *ret_result_mask
= mask
& supported
& CGROUP_MASK_V2
; /* If you wonder why we mask this with
848 * CGROUP_MASK_V2: The 'supported' mask
849 * might contain pure-V1 or BPF
850 * controllers, and we never want to
851 * claim that we could enable those with
852 * cgroup.subtree_control */
856 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, p
, "cgroup.subtree_control", &fs
);
860 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
861 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
864 if (!FLAGS_SET(CGROUP_MASK_V2
, bit
))
867 if (!FLAGS_SET(supported
, bit
))
870 n
= cgroup_controller_to_string(c
);
872 char s
[1 + strlen(n
) + 1];
874 s
[0] = FLAGS_SET(mask
, bit
) ? '+' : '-';
880 return log_debug_errno(errno
, "Failed to open cgroup.subtree_control file of %s: %m", p
);
883 r
= write_string_stream(f
, s
, WRITE_STRING_FILE_DISABLE_BUFFER
);
885 log_debug_errno(r
, "Failed to %s controller %s for %s (%s): %m",
886 FLAGS_SET(mask
, bit
) ? "enable" : "disable", n
, p
, fs
);
889 /* If we can't turn off a controller, leave it on in the reported resulting mask. This
890 * happens for example when we attempt to turn off a controller up in the tree that is
891 * used down in the tree. */
892 if (!FLAGS_SET(mask
, bit
) && r
== -EBUSY
) /* You might wonder why we check for EBUSY
893 * only here, and not follow the same logic
894 * for other errors such as EINVAL or
895 * EOPNOTSUPP or anything else. That's
896 * because EBUSY indicates that the
897 * controllers is currently enabled and
898 * cannot be disabled because something down
899 * the hierarchy is still using it. Any other
900 * error most likely means something like "I
901 * never heard of this controller" or
902 * similar. In the former case it's hence
903 * safe to assume the controller is still on
904 * after the failed operation, while in the
905 * latter case it's safer to assume the
906 * controller is unknown and hence certainly
910 /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */
911 if (FLAGS_SET(mask
, bit
))
917 /* Let's return the precise set of controllers now enabled for the cgroup. */
919 *ret_result_mask
= ret
;