1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
6 #include "cgroup-setup.h"
7 #include "cgroup-util.h"
8 #include "errno-util.h"
9 #include "parse-util.h"
10 #include "path-util.h"
11 #include "proc-cmdline.h"
12 #include "stdio-util.h"
13 #include "string-util.h"
16 #include "process-util.h"
18 #include "user-util.h"
21 bool cg_is_unified_wanted(void) {
22 static thread_local
int wanted
= -1;
24 const bool is_default
= DEFAULT_HIERARCHY
== CGROUP_UNIFIED_ALL
;
25 _cleanup_free_
char *c
= NULL
;
28 /* If we have a cached value, return that. */
32 /* If the hierarchy is already mounted, then follow whatever was chosen for it. */
33 r
= cg_unified_cached(true);
35 return (wanted
= r
>= CGROUP_UNIFIED_ALL
);
37 /* If we were explicitly passed systemd.unified_cgroup_hierarchy, respect that. */
38 r
= proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b
);
42 /* If we passed cgroup_no_v1=all with no other instructions, it seems highly unlikely that we want to
43 * use hybrid or legacy hierarchy. */
44 r
= proc_cmdline_get_key("cgroup_no_v1", 0, &c
);
45 if (r
> 0 && streq_ptr(c
, "all"))
46 return (wanted
= true);
48 return (wanted
= is_default
);
51 bool cg_is_legacy_wanted(void) {
52 static thread_local
int wanted
= -1;
54 /* If we have a cached value, return that. */
58 /* Check if we have cgroup v2 already mounted. */
59 if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL
)
60 return (wanted
= false);
62 /* Otherwise, assume that at least partial legacy is wanted,
63 * since cgroup v2 should already be mounted at this point. */
64 return (wanted
= true);
67 bool cg_is_hybrid_wanted(void) {
68 static thread_local
int wanted
= -1;
71 const bool is_default
= DEFAULT_HIERARCHY
>= CGROUP_UNIFIED_SYSTEMD
;
72 /* We default to true if the default is "hybrid", obviously, but also when the default is "unified",
73 * because if we get called, it means that unified hierarchy was not mounted. */
75 /* If we have a cached value, return that. */
79 /* If the hierarchy is already mounted, then follow whatever was chosen for it. */
80 if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL
)
81 return (wanted
= false);
83 /* Otherwise, let's see what the kernel command line has to say. Since checking is expensive, cache
84 * a non-error result. */
85 r
= proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", &b
);
87 /* The meaning of the kernel option is reversed wrt. to the return value of this function, hence the
89 return (wanted
= r
> 0 ? !b
: is_default
);
92 int cg_weight_parse(const char *s
, uint64_t *ret
) {
97 *ret
= CGROUP_WEIGHT_INVALID
;
101 r
= safe_atou64(s
, &u
);
105 if (u
< CGROUP_WEIGHT_MIN
|| u
> CGROUP_WEIGHT_MAX
)
112 int cg_cpu_shares_parse(const char *s
, uint64_t *ret
) {
117 *ret
= CGROUP_CPU_SHARES_INVALID
;
121 r
= safe_atou64(s
, &u
);
125 if (u
< CGROUP_CPU_SHARES_MIN
|| u
> CGROUP_CPU_SHARES_MAX
)
132 int cg_blkio_weight_parse(const char *s
, uint64_t *ret
) {
137 *ret
= CGROUP_BLKIO_WEIGHT_INVALID
;
141 r
= safe_atou64(s
, &u
);
145 if (u
< CGROUP_BLKIO_WEIGHT_MIN
|| u
> CGROUP_BLKIO_WEIGHT_MAX
)
152 static int trim_cb(const char *path
, const struct stat
*sb
, int typeflag
, struct FTW
*ftwbuf
) {
157 if (typeflag
!= FTW_DP
)
160 if (ftwbuf
->level
< 1)
167 int cg_trim(const char *controller
, const char *path
, bool delete_root
) {
168 _cleanup_free_
char *fs
= NULL
;
173 r
= cg_get_path(controller
, path
, NULL
, &fs
);
178 if (nftw(fs
, trim_cb
, 64, FTW_DEPTH
|FTW_MOUNT
|FTW_PHYS
) != 0) {
182 r
= errno_or_else(EIO
);
186 if (rmdir(fs
) < 0 && errno
!= ENOENT
)
190 q
= cg_hybrid_unified();
193 if (q
> 0 && streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
194 q
= cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, delete_root
);
196 log_warning_errno(q
, "Failed to trim compat systemd cgroup %s: %m", path
);
202 /* Create a cgroup in the hierarchy of controller.
203 * Returns 0 if the group already existed, 1 on success, negative otherwise.
205 int cg_create(const char *controller
, const char *path
) {
206 _cleanup_free_
char *fs
= NULL
;
209 r
= cg_get_path_and_check(controller
, path
, NULL
, &fs
);
213 r
= mkdir_parents(fs
, 0755);
217 r
= mkdir_errno_wrapper(fs
, 0755);
223 r
= cg_hybrid_unified();
227 if (r
> 0 && streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
228 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
);
230 log_warning_errno(r
, "Failed to create compat systemd cgroup %s: %m", path
);
/* Create the cgroup 'path' in 'controller' and attach 'pid' to it.
 * Returns cg_create()'s result (0 if it existed, 1 if newly created)
 * on success, negative errno-style error otherwise. */
int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
        int r, q;

        assert(pid >= 0);

        r = cg_create(controller, path);
        if (r < 0)
                return r;

        q = cg_attach(controller, path, pid);
        if (q < 0)
                return q;

        /* This does not remove the cgroup on failure */
        return r;
}
253 int cg_attach(const char *controller
, const char *path
, pid_t pid
) {
254 _cleanup_free_
char *fs
= NULL
;
255 char c
[DECIMAL_STR_MAX(pid_t
) + 2];
261 r
= cg_get_path_and_check(controller
, path
, "cgroup.procs", &fs
);
266 pid
= getpid_cached();
268 xsprintf(c
, PID_FMT
"\n", pid
);
270 r
= write_string_file(fs
, c
, WRITE_STRING_FILE_DISABLE_BUFFER
);
274 r
= cg_hybrid_unified();
278 if (r
> 0 && streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
279 r
= cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, pid
);
281 log_warning_errno(r
, "Failed to attach "PID_FMT
" to compat systemd cgroup %s: %m", pid
, path
);
287 int cg_attach_fallback(const char *controller
, const char *path
, pid_t pid
) {
294 r
= cg_attach(controller
, path
, pid
);
296 char prefix
[strlen(path
) + 1];
298 /* This didn't work? Then let's try all prefixes of
301 PATH_FOREACH_PREFIX(prefix
, path
) {
304 q
= cg_attach(controller
, prefix
, pid
);
/* NOTE(review): fragment of cg_set_access() — the function header (before
 * original line 314) is not visible in this chunk, so the code below is left
 * byte-identical. The function appears to chown/chmod a cgroup directory and
 * a per-hierarchy table of attribute files — confirm against full source. */
314 const char *controller
,
324 /* cgroup v1, aka legacy/non-unified */
325 static const struct Attribute legacy_attributes
[] = {
326 { "cgroup.procs", true },
328 { "cgroup.clone_children", false },
332 /* cgroup v2, aka unified */
333 static const struct Attribute unified_attributes
[] = {
334 { "cgroup.procs", true },
335 { "cgroup.subtree_control", true },
336 { "cgroup.threads", false },
/* Table indexed by "is unified?" to pick the right attribute list. */
340 static const struct Attribute
* const attributes
[] = {
341 [false] = legacy_attributes
,
342 [true] = unified_attributes
,
345 _cleanup_free_
char *fs
= NULL
;
346 const struct Attribute
*i
;
/* Nothing to do when neither a uid nor a gid was given. */
351 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
354 unified
= cg_unified_controller(controller
);
358 /* Configure access to the cgroup itself */
359 r
= cg_get_path(controller
, path
, NULL
, &fs
);
363 r
= chmod_and_chown(fs
, 0755, uid
, gid
);
367 /* Configure access to the cgroup's attributes */
368 for (i
= attributes
[unified
]; i
->name
; i
++) {
371 r
= cg_get_path(controller
, path
, i
->name
, &fs
);
375 r
= chmod_and_chown(fs
, 0644, uid
, gid
);
380 log_debug_errno(r
, "Failed to set access on cgroup %s, ignoring: %m", fs
);
/* In hybrid mode, mirror the access change onto the legacy hierarchy. */
384 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
385 r
= cg_hybrid_unified();
389 /* Always propagate access mode from unified to legacy controller */
390 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, uid
, gid
);
392 log_debug_errno(r
, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path
);
/* NOTE(review): fragment of cg_migrate() — the function header/parameter list
 * (before original line 407) is not visible in this chunk, so the code below
 * is left byte-identical. It enumerates PIDs in the source cgroup and
 * cg_attach()es them to the destination, tracking already-moved PIDs in a
 * Set — confirm details against full source. */
407 _cleanup_set_free_ Set
*s
= NULL
;
420 my_pid
= getpid_cached();
423 _cleanup_fclose_
FILE *f
= NULL
;
427 r
= cg_enumerate_processes(cfrom
, pfrom
, &f
);
429 if (ret
>= 0 && r
!= -ENOENT
)
435 while ((r
= cg_read_pid(f
, &pid
)) > 0) {
437 /* This might do weird stuff if we aren't a
438 * single-threaded program. However, we
439 * luckily know we are not */
440 if ((flags
& CGROUP_IGNORE_SELF
) && pid
== my_pid
)
/* Skip PIDs we already migrated in an earlier pass. */
443 if (set_get(s
, PID_TO_PTR(pid
)) == PID_TO_PTR(pid
))
446 /* Ignore kernel threads. Since they can only
447 * exist in the root cgroup, we only check for
450 empty_or_root(pfrom
) &&
451 is_kernel_thread(pid
) > 0)
454 r
= cg_attach(cto
, pto
, pid
);
/* -ESRCH: the process exited meanwhile — not an error. */
456 if (ret
>= 0 && r
!= -ESRCH
)
463 r
= set_put(s
, PID_TO_PTR(pid
));
483 int cg_migrate_recursive(
490 _cleanup_closedir_
DIR *d
= NULL
;
499 ret
= cg_migrate(cfrom
, pfrom
, cto
, pto
, flags
);
501 r
= cg_enumerate_subgroups(cfrom
, pfrom
, &d
);
503 if (ret
>= 0 && r
!= -ENOENT
)
509 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
510 _cleanup_free_
char *p
= NULL
;
512 p
= path_join(empty_to_root(pfrom
), fn
);
517 r
= cg_migrate_recursive(cfrom
, p
, cto
, pto
, flags
);
518 if (r
!= 0 && ret
>= 0)
522 if (r
< 0 && ret
>= 0)
525 if (flags
& CGROUP_REMOVE
) {
526 r
= cg_rmdir(cfrom
, pfrom
);
527 if (r
< 0 && ret
>= 0 && !IN_SET(r
, -ENOENT
, -EBUSY
))
534 int cg_migrate_recursive_fallback(
548 r
= cg_migrate_recursive(cfrom
, pfrom
, cto
, pto
, flags
);
550 char prefix
[strlen(pto
) + 1];
552 /* This didn't work? Then let's try all prefixes of the destination */
554 PATH_FOREACH_PREFIX(prefix
, pto
) {
557 q
= cg_migrate_recursive(cfrom
, pfrom
, cto
, prefix
, flags
);
566 int cg_create_everywhere(CGroupMask supported
, CGroupMask mask
, const char *path
) {
572 /* This one will create a cgroup in our private tree, but also
573 * duplicate it in the trees specified in mask, and remove it
576 * Returns 0 if the group already existed in the systemd hierarchy,
577 * 1 on success, negative otherwise.
580 /* First create the cgroup in our own hierarchy. */
581 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER
, path
);
586 /* If we are in the unified hierarchy, we are done now */
587 r
= cg_all_unified();
593 supported
&= CGROUP_MASK_V1
;
594 mask
= CGROUP_MASK_EXTEND_JOINED(mask
);
597 /* Otherwise, do the same in the other hierarchies */
598 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
599 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
602 if (!FLAGS_SET(supported
, bit
))
605 if (FLAGS_SET(done
, bit
))
608 n
= cgroup_controller_to_string(c
);
609 if (FLAGS_SET(mask
, bit
))
610 (void) cg_create(n
, path
);
612 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
618 int cg_attach_everywhere(CGroupMask supported
, const char *path
, pid_t pid
, cg_migrate_callback_t path_callback
, void *userdata
) {
621 r
= cg_attach(SYSTEMD_CGROUP_CONTROLLER
, path
, pid
);
625 r
= cg_all_unified();
631 supported
&= CGROUP_MASK_V1
;
634 for (CGroupController c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
635 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
636 const char *p
= NULL
;
638 if (!FLAGS_SET(supported
, bit
))
641 if (FLAGS_SET(done
, bit
))
645 p
= path_callback(bit
, userdata
);
649 (void) cg_attach_fallback(cgroup_controller_to_string(c
), p
, pid
);
650 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
656 int cg_migrate_v1_controllers(CGroupMask supported
, CGroupMask mask
, const char *from
, cg_migrate_callback_t to_callback
, void *userdata
) {
663 supported
&= CGROUP_MASK_V1
;
664 mask
= CGROUP_MASK_EXTEND_JOINED(mask
);
667 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
668 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
669 const char *to
= NULL
;
671 if (!FLAGS_SET(supported
, bit
))
674 if (FLAGS_SET(done
, bit
))
677 if (!FLAGS_SET(mask
, bit
))
680 to
= to_callback(bit
, userdata
);
682 /* Remember first error and try continuing */
683 q
= cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER
, from
, cgroup_controller_to_string(c
), to
, 0);
690 int cg_trim_everywhere(CGroupMask supported
, const char *path
, bool delete_root
) {
693 r
= cg_trim(SYSTEMD_CGROUP_CONTROLLER
, path
, delete_root
);
697 q
= cg_all_unified();
703 return cg_trim_v1_controllers(supported
, _CGROUP_MASK_ALL
, path
, delete_root
);
706 int cg_trim_v1_controllers(CGroupMask supported
, CGroupMask mask
, const char *path
, bool delete_root
) {
711 supported
&= CGROUP_MASK_V1
;
712 mask
= CGROUP_MASK_EXTEND_JOINED(mask
);
715 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
716 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
718 if (!FLAGS_SET(supported
, bit
))
721 if (FLAGS_SET(done
, bit
))
724 if (FLAGS_SET(mask
, bit
)) {
725 /* Remember first error and try continuing */
726 q
= cg_trim(cgroup_controller_to_string(c
), path
, delete_root
);
729 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
/* NOTE(review): cg_enable_everywhere() continues past the last visible line
 * of this chunk (the tail after original line 838 is not shown), so the code
 * below is left byte-identical. It writes "+ctrl"/"-ctrl" tokens into
 * cgroup.subtree_control of 'p' and reports the resulting enabled mask. */
735 int cg_enable_everywhere(
736 CGroupMask supported
,
739 CGroupMask
*ret_result_mask
) {
741 _cleanup_fclose_
FILE *f
= NULL
;
742 _cleanup_free_
char *fs
= NULL
;
/* Nothing supported at all — trivially done. */
749 if (supported
== 0) {
751 *ret_result_mask
= 0;
755 r
= cg_all_unified();
759 /* On the legacy hierarchy there's no concept of "enabling" controllers in cgroups defined. Let's claim
760 * complete success right away. (If you wonder why we return the full mask here, rather than zero: the
761 * caller tends to use the returned mask later on to compare if all controllers where properly joined,
762 * and if not requeues realization. This use is the primary purpose of the return value, hence let's
763 * minimize surprises here and reduce triggers for re-realization by always saying we fully
766 *ret_result_mask
= mask
& supported
& CGROUP_MASK_V2
; /* If you wonder why we mask this with
767 * CGROUP_MASK_V2: The 'supported' mask
768 * might contain pure-V1 or BPF
769 * controllers, and we never want to
770 * claim that we could enable those with
771 * cgroup.subtree_control */
775 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, p
, "cgroup.subtree_control", &fs
);
/* Toggle each supported v2 controller on or off per 'mask'. */
779 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
780 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
783 if (!FLAGS_SET(CGROUP_MASK_V2
, bit
))
786 if (!FLAGS_SET(supported
, bit
))
789 n
= cgroup_controller_to_string(c
);
/* Build the "+name" or "-name" token to write. */
791 char s
[1 + strlen(n
) + 1];
793 s
[0] = FLAGS_SET(mask
, bit
) ? '+' : '-';
799 return log_debug_errno(errno
, "Failed to open cgroup.subtree_control file of %s: %m", p
);
802 r
= write_string_stream(f
, s
, WRITE_STRING_FILE_DISABLE_BUFFER
);
804 log_debug_errno(r
, "Failed to %s controller %s for %s (%s): %m",
805 FLAGS_SET(mask
, bit
) ? "enable" : "disable", n
, p
, fs
);
808 /* If we can't turn off a controller, leave it on in the reported resulting mask. This
809 * happens for example when we attempt to turn off a controller up in the tree that is
810 * used down in the tree. */
811 if (!FLAGS_SET(mask
, bit
) && r
== -EBUSY
) /* You might wonder why we check for EBUSY
812 * only here, and not follow the same logic
813 * for other errors such as EINVAL or
814 * EOPNOTSUPP or anything else. That's
815 * because EBUSY indicates that the
816 * controllers is currently enabled and
817 * cannot be disabled because something down
818 * the hierarchy is still using it. Any other
819 * error most likely means something like "I
820 * never heard of this controller" or
821 * similar. In the former case it's hence
822 * safe to assume the controller is still on
823 * after the failed operation, while in the
824 * latter case it's safer to assume the
825 * controller is unknown and hence certainly
829 /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */
830 if (FLAGS_SET(mask
, bit
))
836 /* Let's return the precise set of controllers now enabled for the cgroup. */
838 *ret_result_mask
= ret
;