/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include "sd-messages.h"

#include "alloc-util.h"
#include "blockdev-util.h"
#include "bpf-devices.h"
#include "bpf-firewall.h"
#include "bpf-foreign.h"
#include "bpf-program.h"
#include "bpf-restrict-ifaces.h"
#include "bpf-socket-bind.h"
#include "btrfs-util.h"
#include "bus-error.h"
#include "bus-locator.h"
#include "cgroup-setup.h"
#include "cgroup-util.h"
#include "devnum-util.h"
#include "errno-util.h"
#include "extract-word.h"
#include "firewall-util.h"
#include "in-addr-prefix-util.h"
#include "inotify-util.h"
#include "ip-protocol-list.h"
#include "limits-util.h"
#include "nulstr-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "percent-util.h"
#include "process-util.h"
#include "procfs-util.h"
#include "serialize.h"
#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"
#include "bpf-dlopen.h"
#include "bpf-restrict-fs.h"
#include "bpf/restrict_fs/restrict-fs-skel.h"
#define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

/* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
 * problems we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which want to mask
 * out specific attributes from us. */
#define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(ABS(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING)
static void unit_remove_from_cgroup_empty_queue(Unit *u);

uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax *tasks_max) {
        if (tasks_max->scale == 0)
                return tasks_max->value;

        return system_tasks_max_scale(tasks_max->value, tasks_max->scale);
}
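/* For illustration (not part of the original source): a percentage setting such as TasksMax=40% is stored
 * as value=40 with scale=100, so the call above resolves it to 40% of the system-wide task maximum, while
 * an absolute TasksMax=4096 is stored with scale=0 and returned verbatim. */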
bool manager_owns_host_root_cgroup(Manager *m) {
        assert(m);

        /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
         * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there
         * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if
         * we run in any kind of container virtualization. */

        if (MANAGER_IS_USER(m))
                return false;

        if (detect_container() > 0)
                return false;

        return empty_or_root(m->cgroup_root);
}
bool unit_has_startup_cgroup_constraints(Unit *u) {
        assert(u);

        /* Returns true if this unit has any directives which apply during
         * startup/shutdown phases. */

        CGroupContext *c;

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
               c->startup_cpuset_cpus.set ||
               c->startup_cpuset_mems.set ||
               c->startup_memory_high_set ||
               c->startup_memory_max_set ||
               c->startup_memory_swap_max_set ||
               c->startup_memory_zswap_max_set ||
               c->startup_memory_low_set;
}
bool unit_has_host_root_cgroup(const Unit *u) {
        assert(u);

        /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
         * the manager manages the root cgroup. */

        if (!manager_owns_host_root_cgroup(u->manager))
                return false;

        return unit_has_name(u, SPECIAL_ROOT_SLICE);
}
static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) {
        int r;

        assert(u);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return -EOWNERDEAD;

        r = cg_set_attribute(controller, crt->cgroup_path, attribute, value);
        if (r < 0)
                log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m",
                                    strna(attribute), empty_to_root(crt->cgroup_path), (int) strcspn(value, NEWLINE), value);

        return r;
}
void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults. When initializing a bool member to 'true', make
         * sure to serialize in execute-serialize.c using serialize_bool() instead of
         * serialize_bool_elide(), as sd-executor will initialize here to 'true', but serialize_bool_elide()
         * skips serialization if the value is 'false' (as that's the common default), so if the value at
         * runtime is zero it would be lost after deserialization. Same when initializing uint64_t and other
         * values, update/add a conditional serialization check. This is to minimize the amount of
         * serialized data that is sent to the sd-executor, so that there is less work to do on the default
         * code path. */

        *c = (CGroupContext) {
                .cpu_weight = CGROUP_WEIGHT_INVALID,
                .startup_cpu_weight = CGROUP_WEIGHT_INVALID,
                .cpu_quota_per_sec_usec = USEC_INFINITY,
                .cpu_quota_period_usec = USEC_INFINITY,

                .memory_high = CGROUP_LIMIT_MAX,
                .startup_memory_high = CGROUP_LIMIT_MAX,
                .memory_max = CGROUP_LIMIT_MAX,
                .startup_memory_max = CGROUP_LIMIT_MAX,
                .memory_swap_max = CGROUP_LIMIT_MAX,
                .startup_memory_swap_max = CGROUP_LIMIT_MAX,
                .memory_zswap_max = CGROUP_LIMIT_MAX,
                .startup_memory_zswap_max = CGROUP_LIMIT_MAX,

                .memory_zswap_writeback = true,

                .io_weight = CGROUP_WEIGHT_INVALID,
                .startup_io_weight = CGROUP_WEIGHT_INVALID,

                .tasks_max = CGROUP_TASKS_MAX_UNSET,

                .moom_swap = MANAGED_OOM_AUTO,
                .moom_mem_pressure = MANAGED_OOM_AUTO,
                .moom_preference = MANAGED_OOM_PREFERENCE_NONE,
                /* The default duration value in oomd.conf will be used when
                 * moom_mem_pressure_duration_usec is set to infinity. */
                .moom_mem_pressure_duration_usec = USEC_INFINITY,

                .memory_pressure_watch = _CGROUP_PRESSURE_WATCH_INVALID,
                .memory_pressure_threshold_usec = USEC_INFINITY,
        };
}
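/* Illustrative sketch (not in the original): because .memory_zswap_writeback defaults to 'true' above, the
 * serialization side must use something like serialize_bool(f, "memory-zswap-writeback", c->memory_zswap_writeback)
 * rather than serialize_bool_elide(); the elide variant skips writing a 'false' value, and the deserializing
 * sd-executor would then re-initialize the field back to 'true', silently losing the runtime setting. The
 * serialization key name shown here is hypothetical. */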
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->io_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) {
        assert(c);
        assert(l);

        LIST_REMOVE(device_latencies, c->io_device_latencies, l);
        free(l->path);
        free(l);
}

void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
        assert(c);
        assert(l);

        LIST_REMOVE(device_limits, c->io_device_limits, l);
        free(l->path);
        free(l);
}

void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p) {
        assert(c);
        assert(p);

        LIST_REMOVE(programs, c->bpf_foreign_programs, p);
        free(p->bpffs_path);
        free(p);
}

void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head) {
        assert(head);

        LIST_CLEAR(socket_bind_items, *head, free);
}
void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->io_device_weights)
                cgroup_context_free_io_device_weight(c, c->io_device_weights);

        while (c->io_device_latencies)
                cgroup_context_free_io_device_latency(c, c->io_device_latencies);

        while (c->io_device_limits)
                cgroup_context_free_io_device_limit(c, c->io_device_limits);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);

        cgroup_context_remove_socket_bind(&c->socket_bind_allow);
        cgroup_context_remove_socket_bind(&c->socket_bind_deny);

        c->ip_address_allow = set_free(c->ip_address_allow);
        c->ip_address_deny = set_free(c->ip_address_deny);

        c->ip_filters_ingress = strv_free(c->ip_filters_ingress);
        c->ip_filters_egress = strv_free(c->ip_filters_egress);

        while (c->bpf_foreign_programs)
                cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs);

        c->restrict_network_interfaces = set_free(c->restrict_network_interfaces);

        cpu_set_done(&c->cpuset_cpus);
        cpu_set_done(&c->startup_cpuset_cpus);
        cpu_set_done(&c->cpuset_mems);
        cpu_set_done(&c->startup_cpuset_mems);

        c->delegate_subgroup = mfree(c->delegate_subgroup);

        nft_set_context_clear(&c->nft_set_context);
}
static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
        assert(u);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return -EOWNERDEAD;

        return cg_get_attribute_as_uint64("memory", crt->cgroup_path, file, ret);
}
static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) {
        uint64_t unit_value;
        CGroupContext *c;
        CGroupMask m;
        const char *file;
        int r;

        /* Compare kernel memcg configuration against our internal systemd state.
         *
         * Returns:
         *
         * <0: On error.
         *  0: If the kernel memory setting doesn't match our configuration.
         * >0: If the kernel memory setting matches our configuration.
         *
         * The following values are only guaranteed to be populated on return >=0:
         *
         * - ret_unit_value will contain our internal expected value for the unit, page-aligned.
         * - ret_kernel_value will contain the actual value presented by the kernel. */

        assert(u);

        /* The root slice doesn't have any controller files, so we can't compare anything. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return -ENODATA;

        /* It's possible to have MemoryFoo set without systemd wanting to have the memory controller enabled,
         * for example, in the case of DisableControllers= or cgroup_disable on the kernel command line. To
         * avoid specious errors in these scenarios, check that we even expect the memory controller to be
         * enabled at all. */
        m = unit_get_target_mask(u);
        if (!FLAGS_SET(m, CGROUP_MASK_MEMORY))
                return -ENODATA;

        assert_se(c = unit_get_cgroup_context(u));

        bool startup = u->manager && IN_SET(manager_state(u->manager), MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);

        if (streq(property_name, "MemoryLow")) {
                unit_value = unit_get_ancestor_memory_low(u);
                file = "memory.low";
        } else if (startup && streq(property_name, "StartupMemoryLow")) {
                unit_value = unit_get_ancestor_startup_memory_low(u);
                file = "memory.low";
        } else if (streq(property_name, "MemoryMin")) {
                unit_value = unit_get_ancestor_memory_min(u);
                file = "memory.min";
        } else if (streq(property_name, "MemoryHigh")) {
                unit_value = c->memory_high;
                file = "memory.high";
        } else if (startup && streq(property_name, "StartupMemoryHigh")) {
                unit_value = c->startup_memory_high;
                file = "memory.high";
        } else if (streq(property_name, "MemoryMax")) {
                unit_value = c->memory_max;
                file = "memory.max";
        } else if (startup && streq(property_name, "StartupMemoryMax")) {
                unit_value = c->startup_memory_max;
                file = "memory.max";
        } else if (streq(property_name, "MemorySwapMax")) {
                unit_value = c->memory_swap_max;
                file = "memory.swap.max";
        } else if (startup && streq(property_name, "StartupMemorySwapMax")) {
                unit_value = c->startup_memory_swap_max;
                file = "memory.swap.max";
        } else if (streq(property_name, "MemoryZSwapMax")) {
                unit_value = c->memory_zswap_max;
                file = "memory.zswap.max";
        } else if (startup && streq(property_name, "StartupMemoryZSwapMax")) {
                unit_value = c->startup_memory_zswap_max;
                file = "memory.zswap.max";
        } else
                return -EINVAL;

        r = unit_get_kernel_memory_limit(u, file, ret_kernel_value);
        if (r < 0)
                return log_unit_debug_errno(u, r, "Failed to parse %s: %m", file);

        /* It's intended (soon) in a future kernel to not expose cgroup memory limits rounded to page
         * boundaries, but instead separate the user-exposed limit, which is whatever userspace told us, from
         * our internal page-counting. To support those future kernels, just check the value itself first
         * without any page-alignment. */
        if (*ret_kernel_value == unit_value) {
                *ret_unit_value = unit_value;
                return 1;
        }

        /* The current kernel behaviour, by comparison, is that even if you write a particular number of
         * bytes into a cgroup memory file, it always returns that number page-aligned down (since the kernel
         * internally stores cgroup limits in pages). As such, so long as it aligns properly, everything is
         * fine. */
        if (unit_value != CGROUP_LIMIT_MAX)
                unit_value = PAGE_ALIGN_DOWN(unit_value);

        *ret_unit_value = unit_value;

        return *ret_kernel_value == *ret_unit_value;
}
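/* Worked example (illustrative, not in the original): with a 4KiB page size, MemoryHigh=1000001 is reported
 * back by the kernel as 999424 (= PAGE_ALIGN_DOWN(1000001)), so the comparison above first checks the raw
 * value and then falls back to comparing the page-aligned-down values. */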
#define FORMAT_CGROUP_DIFF_MAX 128

static char *format_cgroup_memory_limit_comparison(Unit *u, const char *property_name, char *buf, size_t l) {
        uint64_t kval, sval;
        int r;

        assert(u);
        assert(property_name);
        assert(buf);
        assert(l > 0);

        r = unit_compare_memory_limit(u, property_name, &sval, &kval);

        /* memory.swap.max is special in that it relies on CONFIG_MEMCG_SWAP (and the default swapaccount=1).
         * In the absence of reliably being able to detect whether memcg swap support is available or not,
         * only complain if the error is not ENOENT. This is similarly the case for memory.zswap.max relying
         * on CONFIG_ZSWAP. */
        if (r > 0 || IN_SET(r, -ENODATA, -EOWNERDEAD) ||
            (r == -ENOENT && STR_IN_SET(property_name,
                                        "MemorySwapMax",
                                        "StartupMemorySwapMax",
                                        "MemoryZSwapMax",
                                        "StartupMemoryZSwapMax")))
                buf[0] = 0;
        else if (r < 0) {
                errno = -r;
                (void) snprintf(buf, l, " (error getting kernel value: %m)");
        } else
                (void) snprintf(buf, l, " (different value in kernel: %" PRIu64 ")", kval);

        return buf;
}
const char* cgroup_device_permissions_to_string(CGroupDevicePermissions p) {
        static const char *table[_CGROUP_DEVICE_PERMISSIONS_MAX] = {
                /* Let's simply define a table with every possible combination. As long as those are just 8 we
                 * can get away with it. If this ever grows to more we need to revisit this logic though. */
                [0]                                                          = "",
                [CGROUP_DEVICE_READ]                                         = "r",
                [CGROUP_DEVICE_WRITE]                                        = "w",
                [CGROUP_DEVICE_MKNOD]                                        = "m",
                [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE]                     = "rw",
                [CGROUP_DEVICE_READ|CGROUP_DEVICE_MKNOD]                     = "rm",
                [CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD]                    = "wm",
                [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD] = "rwm",
        };

        if (p < 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX)
                return NULL;

        return table[p];
}

CGroupDevicePermissions cgroup_device_permissions_from_string(const char *s) {
        CGroupDevicePermissions p = 0;

        if (!s)
                return _CGROUP_DEVICE_PERMISSIONS_INVALID;

        for (const char *c = s; *c; c++) {
                if (*c == 'r')
                        p |= CGROUP_DEVICE_READ;
                else if (*c == 'w')
                        p |= CGROUP_DEVICE_WRITE;
                else if (*c == 'm')
                        p |= CGROUP_DEVICE_MKNOD;
                else
                        return _CGROUP_DEVICE_PERMISSIONS_INVALID;
        }

        return p;
}
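/* Example (illustrative): cgroup_device_permissions_from_string("rw") yields
 * CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE, and cgroup_device_permissions_to_string() maps that back to "rw"
 * via the table above; any character outside "rwm" makes parsing fail with
 * _CGROUP_DEVICE_PERMISSIONS_INVALID. */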
void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
        _cleanup_free_ char *disable_controllers_str = NULL, *delegate_controllers_str = NULL,
                *cpuset_cpus = NULL, *cpuset_mems = NULL, *startup_cpuset_cpus = NULL, *startup_cpuset_mems = NULL;
        CGroupContext *c;
        struct in_addr_prefix *iaai;
        char cda[FORMAT_CGROUP_DIFF_MAX], cdb[FORMAT_CGROUP_DIFF_MAX], cdc[FORMAT_CGROUP_DIFF_MAX], cdd[FORMAT_CGROUP_DIFF_MAX],
             cde[FORMAT_CGROUP_DIFF_MAX], cdf[FORMAT_CGROUP_DIFF_MAX], cdg[FORMAT_CGROUP_DIFF_MAX], cdh[FORMAT_CGROUP_DIFF_MAX],
             cdi[FORMAT_CGROUP_DIFF_MAX], cdj[FORMAT_CGROUP_DIFF_MAX], cdk[FORMAT_CGROUP_DIFF_MAX];

        assert(u);
        assert(f);

        assert_se(c = unit_get_cgroup_context(u));

        prefix = strempty(prefix);

        (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
        (void) cg_mask_to_string(c->delegate_controllers, &delegate_controllers_str);

        /* "Delegate=" means "yes, but no controllers". Show this as "(none)". */
        const char *delegate_str = delegate_controllers_str ?: c->delegate ? "(none)" : "no";

        cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus);
        startup_cpuset_cpus = cpu_set_to_range_string(&c->startup_cpuset_cpus);
        cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems);
        startup_cpuset_mems = cpu_set_to_range_string(&c->startup_cpuset_mems);

        fprintf(f,
                "%sIOAccounting: %s\n"
                "%sMemoryAccounting: %s\n"
                "%sTasksAccounting: %s\n"
                "%sIPAccounting: %s\n"
                "%sCPUWeight: %" PRIu64 "\n"
                "%sStartupCPUWeight: %" PRIu64 "\n"
                "%sCPUQuotaPerSecSec: %s\n"
                "%sCPUQuotaPeriodSec: %s\n"
                "%sAllowedCPUs: %s\n"
                "%sStartupAllowedCPUs: %s\n"
                "%sAllowedMemoryNodes: %s\n"
                "%sStartupAllowedMemoryNodes: %s\n"
                "%sIOWeight: %" PRIu64 "\n"
                "%sStartupIOWeight: %" PRIu64 "\n"
                "%sDefaultMemoryMin: %" PRIu64 "\n"
                "%sDefaultMemoryLow: %" PRIu64 "\n"
                "%sMemoryMin: %" PRIu64 "%s\n"
                "%sMemoryLow: %" PRIu64 "%s\n"
                "%sStartupMemoryLow: %" PRIu64 "%s\n"
                "%sMemoryHigh: %" PRIu64 "%s\n"
                "%sStartupMemoryHigh: %" PRIu64 "%s\n"
                "%sMemoryMax: %" PRIu64 "%s\n"
                "%sStartupMemoryMax: %" PRIu64 "%s\n"
                "%sMemorySwapMax: %" PRIu64 "%s\n"
                "%sStartupMemorySwapMax: %" PRIu64 "%s\n"
                "%sMemoryZSwapMax: %" PRIu64 "%s\n"
                "%sStartupMemoryZSwapMax: %" PRIu64 "%s\n"
                "%sMemoryZSwapWriteback: %s\n"
                "%sTasksMax: %" PRIu64 "\n"
                "%sDevicePolicy: %s\n"
                "%sDisableControllers: %s\n"
                "%sDelegate: %s\n"
                "%sManagedOOMSwap: %s\n"
                "%sManagedOOMMemoryPressure: %s\n"
                "%sManagedOOMMemoryPressureLimit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
                "%sManagedOOMPreference: %s\n"
                "%sMemoryPressureWatch: %s\n"
                "%sCoredumpReceive: %s\n",
                prefix, yes_no(c->io_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, yes_no(c->ip_accounting),
                prefix, c->cpu_weight,
                prefix, c->startup_cpu_weight,
                prefix, FORMAT_TIMESPAN(c->cpu_quota_per_sec_usec, 1),
                prefix, FORMAT_TIMESPAN(c->cpu_quota_period_usec, 1),
                prefix, strempty(cpuset_cpus),
                prefix, strempty(startup_cpuset_cpus),
                prefix, strempty(cpuset_mems),
                prefix, strempty(startup_cpuset_mems),
                prefix, c->io_weight,
                prefix, c->startup_io_weight,
                prefix, c->default_memory_min,
                prefix, c->default_memory_low,
                prefix, c->memory_min, format_cgroup_memory_limit_comparison(u, "MemoryMin", cda, sizeof(cda)),
                prefix, c->memory_low, format_cgroup_memory_limit_comparison(u, "MemoryLow", cdb, sizeof(cdb)),
                prefix, c->startup_memory_low, format_cgroup_memory_limit_comparison(u, "StartupMemoryLow", cdc, sizeof(cdc)),
                prefix, c->memory_high, format_cgroup_memory_limit_comparison(u, "MemoryHigh", cdd, sizeof(cdd)),
                prefix, c->startup_memory_high, format_cgroup_memory_limit_comparison(u, "StartupMemoryHigh", cde, sizeof(cde)),
                prefix, c->memory_max, format_cgroup_memory_limit_comparison(u, "MemoryMax", cdf, sizeof(cdf)),
                prefix, c->startup_memory_max, format_cgroup_memory_limit_comparison(u, "StartupMemoryMax", cdg, sizeof(cdg)),
                prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(u, "MemorySwapMax", cdh, sizeof(cdh)),
                prefix, c->startup_memory_swap_max, format_cgroup_memory_limit_comparison(u, "StartupMemorySwapMax", cdi, sizeof(cdi)),
                prefix, c->memory_zswap_max, format_cgroup_memory_limit_comparison(u, "MemoryZSwapMax", cdj, sizeof(cdj)),
                prefix, c->startup_memory_zswap_max, format_cgroup_memory_limit_comparison(u, "StartupMemoryZSwapMax", cdk, sizeof(cdk)),
                prefix, yes_no(c->memory_zswap_writeback),
                prefix, cgroup_tasks_max_resolve(&c->tasks_max),
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, strempty(disable_controllers_str),
                prefix, delegate_str,
                prefix, managed_oom_mode_to_string(c->moom_swap),
                prefix, managed_oom_mode_to_string(c->moom_mem_pressure),
                prefix, PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(c->moom_mem_pressure_limit)),
                prefix, managed_oom_preference_to_string(c->moom_preference),
                prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch),
                prefix, yes_no(c->coredump_receive));

        if (c->delegate_subgroup)
                fprintf(f, "%sDelegateSubgroup: %s\n",
                        prefix, c->delegate_subgroup);

        if (c->memory_pressure_threshold_usec != USEC_INFINITY)
                fprintf(f, "%sMemoryPressureThresholdSec: %s\n",
                        prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1));

        if (c->moom_mem_pressure_duration_usec != USEC_INFINITY)
                fprintf(f, "%sManagedOOMMemoryPressureDurationSec: %s\n",
                        prefix, FORMAT_TIMESPAN(c->moom_mem_pressure_duration_usec, 1));

        LIST_FOREACH(device_allow, a, c->device_allow)
                /* strna() below should be redundant, for avoiding -Werror=format-overflow= error. See #30223. */
                fprintf(f,
                        "%sDeviceAllow: %s %s\n",
                        prefix,
                        a->path,
                        strna(cgroup_device_permissions_to_string(a->permissions)));

        LIST_FOREACH(device_weights, iw, c->io_device_weights)
                fprintf(f,
                        "%sIODeviceWeight: %s %" PRIu64 "\n",
                        prefix,
                        iw->path,
                        iw->weight);

        LIST_FOREACH(device_latencies, l, c->io_device_latencies)
                fprintf(f,
                        "%sIODeviceLatencyTargetSec: %s %s\n",
                        prefix,
                        l->path,
                        FORMAT_TIMESPAN(l->target_usec, 1));

        LIST_FOREACH(device_limits, il, c->io_device_limits)
                for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                        if (il->limits[type] != cgroup_io_limit_defaults[type])
                                fprintf(f,
                                        "%s%s: %s %s\n",
                                        prefix,
                                        cgroup_io_limit_type_to_string(type),
                                        il->path,
                                        FORMAT_BYTES(il->limits[type]));

        SET_FOREACH(iaai, c->ip_address_allow)
                fprintf(f, "%sIPAddressAllow: %s\n", prefix,
                        IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
        SET_FOREACH(iaai, c->ip_address_deny)
                fprintf(f, "%sIPAddressDeny: %s\n", prefix,
                        IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));

        STRV_FOREACH(path, c->ip_filters_ingress)
                fprintf(f, "%sIPIngressFilterPath: %s\n", prefix, *path);
        STRV_FOREACH(path, c->ip_filters_egress)
                fprintf(f, "%sIPEgressFilterPath: %s\n", prefix, *path);

        LIST_FOREACH(programs, p, c->bpf_foreign_programs)
                fprintf(f, "%sBPFProgram: %s:%s",
                        prefix, bpf_cgroup_attach_type_to_string(p->attach_type), p->bpffs_path);

        if (c->socket_bind_allow) {
                fprintf(f, "%sSocketBindAllow: ", prefix);
                cgroup_context_dump_socket_bind_items(c->socket_bind_allow, f);
                fputc('\n', f);
        }

        if (c->socket_bind_deny) {
                fprintf(f, "%sSocketBindDeny: ", prefix);
                cgroup_context_dump_socket_bind_items(c->socket_bind_deny, f);
                fputc('\n', f);
        }

        if (c->restrict_network_interfaces) {
                char *iface;

                SET_FOREACH(iface, c->restrict_network_interfaces)
                        fprintf(f, "%sRestrictNetworkInterfaces: %s\n", prefix, iface);
        }

        FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets)
                fprintf(f, "%sNFTSet: %s:%s:%s:%s\n", prefix, nft_set_source_to_string(nft_set->source),
                        nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set);
}
void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f) {
        const char *family, *colon1, *protocol = "", *colon2 = "";

        family = strempty(af_to_ipv4_ipv6(item->address_family));
        colon1 = isempty(family) ? "" : ":";

        if (item->ip_protocol != 0) {
                protocol = ip_protocol_to_tcp_udp(item->ip_protocol);
                colon2 = ":";
        }

        if (item->nr_ports == 0)
                fprintf(f, "%s%s%s%sany", family, colon1, protocol, colon2);
        else if (item->nr_ports == 1)
                fprintf(f, "%s%s%s%s%" PRIu16, family, colon1, protocol, colon2, item->port_min);
        else {
                uint16_t port_max = item->port_min + item->nr_ports - 1;
                fprintf(f, "%s%s%s%s%" PRIu16 "-%" PRIu16, family, colon1, protocol, colon2,
                        item->port_min, port_max);
        }
}
*items
, FILE *f
) {
675 LIST_FOREACH(socket_bind_items
, bi
, items
) {
681 cgroup_context_dump_socket_bind_item(bi
, f
);
int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p) {
        _cleanup_free_ CGroupDeviceAllow *a = NULL;
        _cleanup_free_ char *d = NULL;

        assert(c);
        assert(dev);
        assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX);

        if (p == 0)
                p = _CGROUP_DEVICE_PERMISSIONS_ALL;

        a = new(CGroupDeviceAllow, 1);
        if (!a)
                return -ENOMEM;

        d = strdup(dev);
        if (!d)
                return -ENOMEM;

        *a = (CGroupDeviceAllow) {
                .path = TAKE_PTR(d),
                .permissions = p,
        };

        LIST_PREPEND(device_allow, c->device_allow, a);
        TAKE_PTR(a);

        return 0;
}
*c
, const char *dev
, CGroupDevicePermissions p
) {
718 assert(p
>= 0 && p
< _CGROUP_DEVICE_PERMISSIONS_MAX
);
721 p
= _CGROUP_DEVICE_PERMISSIONS_ALL
;
723 LIST_FOREACH(device_allow
, b
, c
->device_allow
)
724 if (path_equal(b
->path
, dev
)) {
729 return cgroup_context_add_device_allow(c
, dev
, p
);
int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *bpffs_path) {
        CGroupBPFForeignProgram *p;
        _cleanup_free_ char *d = NULL;

        assert(c);
        assert(bpffs_path);

        if (!path_is_normalized(bpffs_path) || !path_is_absolute(bpffs_path))
                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not normalized.");

        d = strdup(bpffs_path);
        if (!d)
                return log_oom();

        p = new(CGroupBPFForeignProgram, 1);
        if (!p)
                return log_oom();

        *p = (CGroupBPFForeignProgram) {
                .attach_type = attach_type,
                .bpffs_path = TAKE_PTR(d),
        };

        LIST_PREPEND(programs, c->bpf_foreign_programs, TAKE_PTR(p));

        return 0;
}
#define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry)                       \
        uint64_t unit_get_ancestor_##entry(Unit *u) {                   \
                CGroupContext *c;                                       \
                                                                        \
                /* 1. Is entry set in this unit? If so, use that.       \
                 * 2. Is the default for this entry set in any          \
                 *    ancestor? If so, use that.                        \
                 * 3. Otherwise, return CGROUP_LIMIT_MIN. */            \
                                                                        \
                assert(u);                                              \
                                                                        \
                c = unit_get_cgroup_context(u);                         \
                if (c && c->entry##_set)                                \
                        return c->entry;                                \
                                                                        \
                while ((u = UNIT_GET_SLICE(u))) {                       \
                        c = unit_get_cgroup_context(u);                 \
                        if (c && c->default_##entry##_set)              \
                                return c->default_##entry;              \
                }                                                       \
                                                                        \
                /* We've reached the root, but nobody had default for   \
                 * this entry set, so set it to the kernel default. */  \
                return CGROUP_LIMIT_MIN;                                \
        }

UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(startup_memory_low);
UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);
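/* Illustrative resolution order (not in the original source): for a service with no MemoryLow= of its own
 * running in a slice with DefaultMemoryLow=64M, unit_get_ancestor_memory_low() walks up via UNIT_GET_SLICE()
 * and returns the slice's default_memory_low; if no ancestor sets a default either, it returns
 * CGROUP_LIMIT_MIN, i.e. the kernel default of 0. */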
static void unit_set_xattr_graceful(Unit *u, const char *name, const void *data, size_t size) {
        int r;

        assert(u);
        assert(name);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return;

        r = cg_set_xattr(crt->cgroup_path, name, data, size, 0);
        if (r < 0)
                log_unit_debug_errno(u, r, "Failed to set '%s' xattr on control group %s, ignoring: %m", name, empty_to_root(crt->cgroup_path));
}
static void unit_remove_xattr_graceful(Unit *u, const char *name) {
        int r;

        assert(u);
        assert(name);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return;

        r = cg_remove_xattr(crt->cgroup_path, name);
        if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                log_unit_debug_errno(u, r, "Failed to remove '%s' xattr flag on control group %s, ignoring: %m", name, empty_to_root(crt->cgroup_path));
}
static void cgroup_oomd_xattr_apply(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return;

        if (c->moom_preference == MANAGED_OOM_PREFERENCE_OMIT)
                unit_set_xattr_graceful(u, "user.oomd_omit", "1", 1);

        if (c->moom_preference == MANAGED_OOM_PREFERENCE_AVOID)
                unit_set_xattr_graceful(u, "user.oomd_avoid", "1", 1);

        if (c->moom_preference != MANAGED_OOM_PREFERENCE_AVOID)
                unit_remove_xattr_graceful(u, "user.oomd_avoid");

        if (c->moom_preference != MANAGED_OOM_PREFERENCE_OMIT)
                unit_remove_xattr_graceful(u, "user.oomd_omit");
}
static int cgroup_log_xattr_apply(Unit *u) {
        ExecContext *c;
        size_t len, allowed_patterns_len, denied_patterns_len;
        _cleanup_free_ char *patterns = NULL, *allowed_patterns = NULL, *denied_patterns = NULL;
        char *last;
        int r;

        assert(u);

        c = unit_get_exec_context(u);
        if (!c)
                /* Some unit types have a cgroup context but no exec context, so we do not log
                 * any error here to avoid confusion. */
                return 0;

        if (set_isempty(c->log_filter_allowed_patterns) && set_isempty(c->log_filter_denied_patterns)) {
                unit_remove_xattr_graceful(u, "user.journald_log_filter_patterns");
                return 0;
        }

        r = set_make_nulstr(c->log_filter_allowed_patterns, &allowed_patterns, &allowed_patterns_len);
        if (r < 0)
                return log_debug_errno(r, "Failed to make nulstr from set: %m");

        r = set_make_nulstr(c->log_filter_denied_patterns, &denied_patterns, &denied_patterns_len);
        if (r < 0)
                return log_debug_errno(r, "Failed to make nulstr from set: %m");

        /* Use nul character separated strings without trailing nul */
        allowed_patterns_len = LESS_BY(allowed_patterns_len, 1u);
        denied_patterns_len = LESS_BY(denied_patterns_len, 1u);

        len = allowed_patterns_len + 1 + denied_patterns_len;
        patterns = new(char, len);
        if (!patterns)
                return log_oom_debug();

        last = mempcpy_safe(patterns, allowed_patterns, allowed_patterns_len);
        *(last++) = '\xff';
        memcpy_safe(last, denied_patterns, denied_patterns_len);

        unit_set_xattr_graceful(u, "user.journald_log_filter_patterns", patterns, len);

        return 0;
}
static void cgroup_invocation_id_xattr_apply(Unit *u) {
        bool b;

        assert(u);

        b = !sd_id128_is_null(u->invocation_id);
        FOREACH_STRING(xn, "trusted.invocation_id", "user.invocation_id") {
                if (b)
                        unit_set_xattr_graceful(u, xn, SD_ID128_TO_STRING(u->invocation_id), 32);
                else
                        unit_remove_xattr_graceful(u, xn);
        }
}
static void cgroup_coredump_xattr_apply(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return;

        if (unit_cgroup_delegate(u) && c->coredump_receive)
                unit_set_xattr_graceful(u, "user.coredump_receive", "1", 1);
        else
                unit_remove_xattr_graceful(u, "user.coredump_receive");
}
static void cgroup_delegate_xattr_apply(Unit *u) {
        bool b;

        assert(u);

        /* Indicate on the cgroup whether delegation is on, via an xattr. This is best-effort, as old kernels
         * didn't support xattrs on cgroups at all. Later they got support for setting 'trusted.*' xattrs,
         * and even later 'user.*' xattrs. We started setting this field when 'trusted.*' was added, and
         * given this is now pretty much API, let's continue to support that. But also set 'user.*' as well,
         * since it is readable by any user, not just CAP_SYS_ADMIN. This hence comes with slightly weaker
         * security (as users who got delegated cgroups could turn it off if they like), but this shouldn't
         * be a big problem given this communicates delegation state to clients, but the manager never reads
         * it back. */
        b = unit_cgroup_delegate(u);
        FOREACH_STRING(xn, "trusted.delegate", "user.delegate") {
                if (b)
                        unit_set_xattr_graceful(u, xn, "1", 1);
                else
                        unit_remove_xattr_graceful(u, xn);
        }
}
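/* For illustration (not in the original): a client can check delegation without privileges by reading the
 * 'user.delegate' xattr of the unit's cgroup directory, e.g.
 *     getxattr("/sys/fs/cgroup/system.slice/foo.service", "user.delegate", buf, sizeof(buf));
 * returns the single character "1" when delegation is enabled, and the xattr is absent otherwise. The
 * concrete cgroup path is hypothetical. */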
static void cgroup_survive_xattr_apply(Unit *u) {
        int r;

        assert(u);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt)
                return;

        if (u->survive_final_kill_signal) {
                r = cg_set_xattr(
                                crt->cgroup_path,
                                "user.survive_final_kill_signal",
                                "1",
                                1,
                                /* flags= */ 0);
                /* user xattr support was added in kernel v5.7 */
                if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
                        r = cg_set_xattr(
                                        crt->cgroup_path,
                                        "trusted.survive_final_kill_signal",
                                        "1",
                                        1,
                                        /* flags= */ 0);
                if (r < 0)
                        log_unit_debug_errno(u,
                                             r,
                                             "Failed to set 'survive_final_kill_signal' xattr on control "
                                             "group %s, ignoring: %m",
                                             empty_to_root(crt->cgroup_path));
        } else {
                unit_remove_xattr_graceful(u, "user.survive_final_kill_signal");
                unit_remove_xattr_graceful(u, "trusted.survive_final_kill_signal");
        }
}
static void cgroup_xattr_apply(Unit *u) {
        assert(u);

        /* The 'user.*' xattrs can be set from a user manager. */
        cgroup_oomd_xattr_apply(u);
        cgroup_log_xattr_apply(u);
        cgroup_coredump_xattr_apply(u);

        if (!MANAGER_IS_SYSTEM(u->manager))
                return;

        cgroup_invocation_id_xattr_apply(u);
        cgroup_delegate_xattr_apply(u);
        cgroup_survive_xattr_apply(u);
}
static int lookup_block_device(const char *p, dev_t *ret) {
        dev_t rdev, dev = 0;
        mode_t mode;
        int r;

        assert(p);
        assert(ret);

        r = device_path_parse_major_minor(p, &mode, &rdev);
        if (r == -ENODEV) { /* not a parsable device node, need to go to disk */
                struct stat st;

                if (stat(p, &st) < 0)
                        return log_warning_errno(errno, "Couldn't stat device '%s': %m", p);

                mode = st.st_mode;
                rdev = st.st_rdev;
                dev = st.st_dev;
        } else if (r < 0)
                return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p);

        if (S_ISCHR(mode))
                return log_warning_errno(SYNTHETIC_ERRNO(ENOTBLK),
                                         "Device node '%s' is a character device, but block device needed.", p);
        if (S_ISBLK(mode))
                *ret = rdev;
        else if (major(dev) != 0)
                *ret = dev; /* If this is not a device node then use the block device this file is stored on */
        else {
                /* If this is btrfs, getting the backing block device is a bit harder */
                r = btrfs_get_block_device(p, ret);
                if (r == -ENOTTY)
                        return log_warning_errno(SYNTHETIC_ERRNO(ENODEV),
                                                 "'%s' is not a block device node, and file system block device cannot be determined or is not local.", p);
                if (r < 0)
                        return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p);
        }

        /* If this is a LUKS/DM device, recursively try to get the originating block device */
        while (block_get_originating(*ret, ret) >= 0)
                ;

        /* If this is a partition, try to get the originating block device */
        (void) block_get_whole_disk(*ret, ret);

        return 0;
}
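/* Usage sketch (illustrative): for IODeviceWeight=/home 200 the path is not a device node, so the code above
 * stats /home, takes st_dev as the backing block device, resolves btrfs and LUKS/DM layers down to the
 * originating device, and finally widens a partition to the whole disk before the per-device weight is
 * written. */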
static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
        return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
               c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
}

static bool cgroup_context_has_allowed_cpus(CGroupContext *c) {
        return c->cpuset_cpus.set || c->startup_cpuset_cpus.set;
}

static bool cgroup_context_has_allowed_mems(CGroupContext *c) {
        return c->cpuset_mems.set || c->startup_cpuset_mems.set;
}
uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
            c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_cpu_weight;
        else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->cpu_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}
static CPUSet *cgroup_context_allowed_cpus(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
            c->startup_cpuset_cpus.set)
                return &c->startup_cpuset_cpus;

        return &c->cpuset_cpus;
}

static CPUSet *cgroup_context_allowed_mems(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
            c->startup_cpuset_mems.set)
                return &c->startup_cpuset_mems;

        return &c->cpuset_mems;
}
usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
        /* The kernel uses a minimum resolution of 1ms, so both period and (quota * period)
         * need to be higher than that boundary. quota is specified in USecPerSec.
         * Additionally, period must be at most max_period. */
        assert(quota > 0);

        return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period);
}
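/* Worked example (illustrative, not in the original): with quota=10ms/s (CPUQuota=1%), resolution=1ms and a
 * requested period of 10ms, the resulting runtime quota * period / USEC_PER_SEC would be only 100us, below
 * the kernel's 1ms floor; resolution * USEC_PER_SEC / quota = 100ms, so MAX3() raises the period to 100ms,
 * and MIN() with max_period=1s leaves it there. */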
static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
        usec_t new_period;

        assert(u);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt)
                return USEC_INFINITY;

        if (quota == USEC_INFINITY)
                /* Always use default period for infinity quota. */
                return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;

        if (period == USEC_INFINITY)
                /* Default period was requested. */
                period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;

        /* Clamp to interval [1ms, 1s] */
        new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);

        if (new_period != period) {
                log_unit_full(u, crt->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING,
                              "Clamping CPU interval for cpu.max: period is now %s",
                              FORMAT_TIMESPAN(new_period, 1));
                crt->warned_clamping_cpu_quota_period = true;
        }

        return new_period;
}
*u
, uint64_t weight
) {
1119 char buf
[DECIMAL_STR_MAX(uint64_t) + 2];
1121 if (weight
== CGROUP_WEIGHT_IDLE
)
1123 xsprintf(buf
, "%" PRIu64
"\n", weight
);
1124 (void) set_attribute_and_warn(u
, "cpu", "cpu.weight", buf
);
static void cgroup_apply_cpu_idle(Unit *u, uint64_t weight) {
        int r;
        bool is_idle;
        const char *idle_val;

        assert(u);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return;

        is_idle = weight == CGROUP_WEIGHT_IDLE;
        idle_val = one_zero(is_idle);
        r = cg_set_attribute("cpu", crt->cgroup_path, "cpu.idle", idle_val);
        if (r < 0 && (r != -ENOENT || is_idle))
                log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%s': %m",
                                    "cpu.idle", empty_to_root(crt->cgroup_path), idle_val);
}
static void cgroup_apply_cpu_quota(Unit *u, usec_t quota, usec_t period) {
        char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];

        assert(u);

        period = cgroup_cpu_adjust_period_and_log(u, period, quota);
        if (quota != USEC_INFINITY)
                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
                         MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period);
        else
                xsprintf(buf, "max " USEC_FMT "\n", period);
        (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf);
}
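/* For illustration (not in the original): CPUQuota=20% with the default 100ms period ends up as
 * "20000 100000\n" in cpu.max (20ms of runtime per 100ms period), while an infinite quota is written as
 * "max 100000\n". */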
static void cgroup_apply_cpuset(Unit *u, const CPUSet *cpus, const char *name) {
        _cleanup_free_ char *buf = NULL;

        buf = cpu_set_to_range_string(cpus);
        if (!buf) {
                log_oom();
                return;
        }

        (void) set_attribute_and_warn(u, "cpuset", name, buf);
}
static bool cgroup_context_has_io_config(CGroupContext *c) {
        return c->io_accounting ||
               c->io_weight != CGROUP_WEIGHT_INVALID ||
               c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
               c->io_device_weights ||
               c->io_device_latencies ||
               c->io_device_limits;
}
static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_io_weight;
        if (c->io_weight != CGROUP_WEIGHT_INVALID)
                return c->io_weight;
        return CGROUP_WEIGHT_DEFAULT;
}
static int set_bfq_weight(Unit *u, const char *controller, dev_t dev, uint64_t io_weight) {
        static bool warned = false;
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+STRLEN("\n")];
        const char *p;
        uint64_t bfq_weight;
        int r;

        assert(u);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return -EOWNERDEAD;

        /* FIXME: drop this function when distro kernels properly support BFQ through "io.weight"
         * See also: https://github.com/systemd/systemd/pull/13335 and
         * https://github.com/torvalds/linux/commit/65752aef0a407e1ef17ec78a7fc31ba4e0b360f9. */
        p = strjoina(controller, ".bfq.weight");
        /* Adjust to the kernel range, which is 1..1000 with a default of 100. */
        bfq_weight = BFQ_WEIGHT(io_weight);

        if (major(dev) > 0)
                xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), bfq_weight);
        else
                xsprintf(buf, "%" PRIu64 "\n", bfq_weight);

        r = cg_set_attribute(controller, crt->cgroup_path, p, buf);

        /* FIXME: drop this when kernels prior
         * 795fe54c2a82 ("bfq: Add per-device weight") v5.4
         * are not interesting anymore. Old kernels will fail with EINVAL, while new kernels won't return
         * EINVAL on properly formatted input by us. Treat EINVAL accordingly. */
        if (r == -EINVAL && major(dev) > 0) {
                if (!warned) {
                        log_unit_warning(u, "Kernel version does not accept per-device setting in %s.", p);
                        warned = true;
                }
                r = -EOPNOTSUPP; /* mask as unconfigured device */
        } else if (r >= 0 && io_weight != bfq_weight)
                log_unit_debug(u, "%s=%" PRIu64 " scaled to %s=%" PRIu64,
                               major(dev) > 0 ? "IODeviceWeight" : "IOWeight",
                               io_weight, p, bfq_weight);

        return r;
}
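/* Illustrative scaling (not in the original): io.weight uses the range 1..10000 with a default of 100, while
 * BFQ uses 1..1000 with the same default, so BFQ_WEIGHT() maps e.g. the maximum io.weight of 10000 to a
 * bfq.weight of 1000 and leaves the default of 100 unchanged; the debug log above only fires when the two
 * values actually differ. */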
static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r, r1, r2;

        assert(u);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return;

        if (lookup_block_device(dev_path, &dev) < 0)
                return;

        r1 = set_bfq_weight(u, "io", dev, io_weight);

        xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), io_weight);
        r2 = cg_set_attribute("io", crt->cgroup_path, "io.weight", buf);

        /* Look at the configured device, when both fail, prefer io.weight errno. */
        r = r2 == -EOPNOTSUPP ? r1 : r2;
        if (r < 0)
                log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r),
                                    r, "Failed to set 'io[.bfq].weight' attribute on '%s' to '%.*s': %m",
                                    empty_to_root(crt->cgroup_path), (int) strcspn(buf, NEWLINE), buf);
}
static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        if (target != USEC_INFINITY)
                xsprintf(buf, DEVNUM_FORMAT_STR " target=%" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), target);
        else
                xsprintf(buf, DEVNUM_FORMAT_STR " target=max\n", DEVNUM_FORMAT_VAL(dev));

        (void) set_attribute_and_warn(u, "io", "io.latency", buf);
}
static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)],
             buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        dev_t dev;

        if (lookup_block_device(dev_path, &dev) < 0)
                return;

        for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                if (limits[type] != cgroup_io_limit_defaults[type])
                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
                else
                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");

        xsprintf(buf, DEVNUM_FORMAT_STR " rbps=%s wbps=%s riops=%s wiops=%s\n", DEVNUM_FORMAT_VAL(dev),
                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
        (void) set_attribute_and_warn(u, "io", "io.max", buf);
}
static bool unit_has_memory_config(Unit *u) {
        CGroupContext *c;

        assert(u);

        assert_se(c = unit_get_cgroup_context(u));

        return unit_get_ancestor_memory_min(u) > 0 ||
               unit_get_ancestor_memory_low(u) > 0 || unit_get_ancestor_startup_memory_low(u) > 0 ||
               c->memory_high != CGROUP_LIMIT_MAX || c->startup_memory_high_set ||
               c->memory_max != CGROUP_LIMIT_MAX || c->startup_memory_max_set ||
               c->memory_swap_max != CGROUP_LIMIT_MAX || c->startup_memory_swap_max_set ||
               c->memory_zswap_max != CGROUP_LIMIT_MAX || c->startup_memory_zswap_max_set;
}
*u
, const char *file
, uint64_t v
) {
1315 char buf
[DECIMAL_STR_MAX(uint64_t) + 1] = "max\n";
1317 if (v
!= CGROUP_LIMIT_MAX
)
1318 xsprintf(buf
, "%" PRIu64
"\n", v
);
1320 (void) set_attribute_and_warn(u
, "memory", file
, buf
);
static void cgroup_apply_firewall(Unit *u) {
        assert(u);

        /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */

        if (bpf_firewall_compile(u) < 0)
                return;

        (void) bpf_firewall_load_custom(u);
        (void) bpf_firewall_install(u);
}
void unit_modify_nft_set(Unit *u, bool add) {
        int r;

        assert(u);

        if (!MANAGER_IS_SYSTEM(u->manager))
                return;

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || crt->cgroup_id == 0)
                return;

        if (!u->manager->fw_ctx) {
                r = fw_ctx_new_full(&u->manager->fw_ctx, /* init_tables= */ false);
                if (r < 0)
                        return;

                assert(u->manager->fw_ctx);
        }

        CGroupContext *c = ASSERT_PTR(unit_get_cgroup_context(u));

        FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) {
                if (nft_set->source != NFT_SET_SOURCE_CGROUP)
                        continue;

                uint64_t element = crt->cgroup_id;

                r = nft_set_element_modify_any(u->manager->fw_ctx, add, nft_set->nfproto, nft_set->table, nft_set->set, &element, sizeof(element));
                if (r < 0)
                        log_warning_errno(r, "Failed to %s NFT set: family %s, table %s, set %s, cgroup %" PRIu64 ", ignoring: %m",
                                          add ? "add" : "delete", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, crt->cgroup_id);
                else
                        log_debug("%s NFT set: family %s, table %s, set %s, cgroup %" PRIu64,
                                  add ? "Added" : "Deleted", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, crt->cgroup_id);
        }
}
static void cgroup_apply_socket_bind(Unit *u) {
        assert(u);

        (void) bpf_socket_bind_install(u);
}

static void cgroup_apply_restrict_network_interfaces(Unit *u) {
        assert(u);

        (void) bpf_restrict_ifaces_install(u);
}
static int cgroup_apply_devices(Unit *u) {
        _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
        CGroupContext *c;
        CGroupDevicePolicy policy;
        int r;

        assert_se(c = unit_get_cgroup_context(u));

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return -EOWNERDEAD;

        policy = c->device_policy;

        r = bpf_devices_cgroup_init(&prog, policy, c->device_allow);
        if (r < 0)
                return log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");

        bool allow_list_static = policy == CGROUP_DEVICE_POLICY_CLOSED ||
                (policy == CGROUP_DEVICE_POLICY_AUTO && c->device_allow);

        bool any = false;
        if (allow_list_static) {
                r = bpf_devices_allow_list_static(prog, crt->cgroup_path);
                if (r > 0)
                        any = true;
        }

        LIST_FOREACH(device_allow, a, c->device_allow) {
                const char *val;

                if (a->permissions == 0)
                        continue;

                if (path_startswith(a->path, "/dev/"))
                        r = bpf_devices_allow_list_device(prog, crt->cgroup_path, a->path, a->permissions);
                else if ((val = startswith(a->path, "block-")))
                        r = bpf_devices_allow_list_major(prog, crt->cgroup_path, val, 'b', a->permissions);
                else if ((val = startswith(a->path, "char-")))
                        r = bpf_devices_allow_list_major(prog, crt->cgroup_path, val, 'c', a->permissions);
                else {
                        log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path);
                        continue;
                }

                if (r > 0)
                        any = true;
        }

        if (allow_list_static && !any) {
                log_unit_warning(u, "No devices matched by device filter.");

                /* The kernel verifier would reject a program we would build with the normal intro and outro
                   but no allow-listing rules (outro would contain an unreachable instruction for successful
                   return). */
                policy = CGROUP_DEVICE_POLICY_STRICT;
        }

        r = bpf_devices_apply_policy(&prog, policy, any, crt->cgroup_path, &crt->bpf_device_control_installed);
        if (r < 0) {
                static bool warned = false;

                log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
                               "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
                               "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
                               "(This warning is only shown for the first loaded unit using device ACL.)", u->id);

                warned = true;
        }

        return r;
}
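/* Illustrative DeviceAllow= handling (not in the original source): "/dev/null rw" goes through
 * bpf_devices_allow_list_device(), "block-sd r" and "char-pts rw" go through bpf_devices_allow_list_major()
 * with 'b' and 'c' respectively, and anything else is skipped with a debug message; if the policy requires
 * an allow-list but nothing matched, the policy is tightened to "strict" so the generated BPF program stays
 * valid. */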
static void set_io_weight(Unit *u, uint64_t weight) {
        char buf[STRLEN("default \n")+DECIMAL_STR_MAX(uint64_t)];

        assert(u);

        (void) set_bfq_weight(u, "io", makedev(0, 0), weight);

        xsprintf(buf, "default %" PRIu64 "\n", weight);
        (void) set_attribute_and_warn(u, "io", "io.weight", buf);
}
static void cgroup_apply_bpf_foreign_program(Unit *u) {
        assert(u);

        (void) bpf_foreign_install(u);
}
static void cgroup_context_apply(
                Unit *u,
                CGroupMask apply_mask,
                ManagerState state) {

        bool is_host_root, is_local_root;
        CGroupContext *c;
        int r;

        assert(u);

        /* Nothing to do? Exit early! */
        if (apply_mask == 0)
                return;

        /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other
         * attributes should only be managed for cgroups further down the tree. */
        is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE);
        is_host_root = unit_has_host_root_cgroup(u);

        assert_se(c = unit_get_cgroup_context(u));

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return;

        /* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container
         * then), and missing cgroups, i.e. EROFS and ENOENT. */

        /* These attributes don't exist on the host cgroup root. */
        if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
                uint64_t weight;

                if (cgroup_context_has_cpu_weight(c))
                        weight = cgroup_context_cpu_weight(c, state);
                else
                        weight = CGROUP_WEIGHT_DEFAULT;

                cgroup_apply_cpu_idle(u, weight);
                cgroup_apply_cpu_weight(u, weight);
                cgroup_apply_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
        }

        if ((apply_mask & CGROUP_MASK_CPUSET) && !is_local_root) {
                cgroup_apply_cpuset(u, cgroup_context_allowed_cpus(c, state), "cpuset.cpus");
                cgroup_apply_cpuset(u, cgroup_context_allowed_mems(c, state), "cpuset.mems");
        }

        /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2
         * controller), and in case of containers we want to leave control of these attributes to the container manager
         * (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). */
        if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) {
                bool has_io;
                uint64_t weight;

                has_io = cgroup_context_has_io_config(c);

                if (has_io)
                        weight = cgroup_context_io_weight(c, state);
                else
                        weight = CGROUP_WEIGHT_DEFAULT;

                set_io_weight(u, weight);

                LIST_FOREACH(device_weights, w, c->io_device_weights)
                        cgroup_apply_io_device_weight(u, w->path, w->weight);

                LIST_FOREACH(device_limits, limit, c->io_device_limits)
                        cgroup_apply_io_device_limit(u, limit->path, limit->limits);

                LIST_FOREACH(device_latencies, latency, c->io_device_latencies)
                        cgroup_apply_io_device_latency(u, latency->path, latency->target_usec);
        }

        /* 'memory' attributes do not exist on the root cgroup. */
        if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
                uint64_t max = CGROUP_LIMIT_MAX, swap_max = CGROUP_LIMIT_MAX, zswap_max = CGROUP_LIMIT_MAX, high = CGROUP_LIMIT_MAX;

                if (unit_has_memory_config(u)) {
                        bool startup = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);

                        high = startup && c->startup_memory_high_set ? c->startup_memory_high : c->memory_high;
                        max = startup && c->startup_memory_max_set ? c->startup_memory_max : c->memory_max;
                        swap_max = startup && c->startup_memory_swap_max_set ? c->startup_memory_swap_max : c->memory_swap_max;
                        zswap_max = startup && c->startup_memory_zswap_max_set ? c->startup_memory_zswap_max : c->memory_zswap_max;
                }

                cgroup_apply_memory_limit(u, "memory.min", unit_get_ancestor_memory_min(u));
                cgroup_apply_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u));
                cgroup_apply_memory_limit(u, "memory.high", high);
                cgroup_apply_memory_limit(u, "memory.max", max);
                cgroup_apply_memory_limit(u, "memory.swap.max", swap_max);
                cgroup_apply_memory_limit(u, "memory.zswap.max", zswap_max);

                (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
                (void) set_attribute_and_warn(u, "memory", "memory.zswap.writeback", one_zero(c->memory_zswap_writeback));
        }

        if (apply_mask & CGROUP_MASK_PIDS) {

                if (is_host_root) {
                        /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
                         * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
                         * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
                         * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
                         * exclusive ownership of the sysctls, but we still want to honour things if the user sets
                         * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
                         * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
                         * it also counts. But if the user never set a limit through us (i.e. we are the default of
                         * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
                         * the first time we set a limit. Note that this boolean is flushed out on manager reload,
                         * which is desirable so that there's an official way to release control of the sysctl from
                         * systemd: set the limit to unbounded and reload. */

                        if (cgroup_tasks_max_isset(&c->tasks_max)) {
                                u->manager->sysctl_pid_max_changed = true;
                                r = procfs_tasks_set_limit(cgroup_tasks_max_resolve(&c->tasks_max));
                        } else if (u->manager->sysctl_pid_max_changed)
                                r = procfs_tasks_set_limit(TASKS_MAX);
                        else
                                r = 0;
                        if (r < 0)
                                log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r,
                                                    "Failed to write to tasks limit sysctls: %m");
                }

                /* The attribute itself is not available on the host root cgroup, and in the container case we want to
                 * leave it for the container manager. */
                if (!is_local_root) {
                        if (cgroup_tasks_max_isset(&c->tasks_max)) {
                                char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                                xsprintf(buf, "%" PRIu64 "\n", cgroup_tasks_max_resolve(&c->tasks_max));
                                (void) set_attribute_and_warn(u, "pids", "pids.max", buf);
                        } else
                                (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n");
                }
        }

        /* On cgroup v2 we can apply BPF everywhere. */
        if (apply_mask & CGROUP_MASK_BPF_DEVICES)
                (void) cgroup_apply_devices(u);

        if (apply_mask & CGROUP_MASK_BPF_FIREWALL)
                cgroup_apply_firewall(u);

        if (apply_mask & CGROUP_MASK_BPF_FOREIGN)
                cgroup_apply_bpf_foreign_program(u);

        if (apply_mask & CGROUP_MASK_BPF_SOCKET_BIND)
                cgroup_apply_socket_bind(u);

        if (apply_mask & CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES)
                cgroup_apply_restrict_network_interfaces(u);

        unit_modify_nft_set(u, /* add = */ true);
}
static bool unit_get_needs_bpf_firewall(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        if (c->ip_accounting ||
            !set_isempty(c->ip_address_allow) ||
            !set_isempty(c->ip_address_deny) ||
            c->ip_filters_ingress ||
            c->ip_filters_egress)
                return true;

        /* If any parent slice has an IP access list defined, it applies too */
        for (Unit *p = UNIT_GET_SLICE(u); p; p = UNIT_GET_SLICE(p)) {
                c = unit_get_cgroup_context(p);
                if (!c)
                        return false;

                if (!set_isempty(c->ip_address_allow) ||
                    !set_isempty(c->ip_address_deny))
                        return true;
        }

        return false;
}
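/* Illustrative consequence (not in the original): setting IPAddressDeny=any on a slice makes this function
 * return true for every unit placed below that slice, so each of them gets its own BPF firewall program even
 * if the unit itself has no IP settings. */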
static bool unit_get_needs_bpf_foreign_program(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return !!c->bpf_foreign_programs;
}

static bool unit_get_needs_socket_bind(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->socket_bind_allow || c->socket_bind_deny;
}

static bool unit_get_needs_restrict_network_interfaces(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return !set_isempty(c->restrict_network_interfaces);
}
static CGroupMask unit_get_cgroup_mask(Unit *u) {
        CGroupMask mask = 0;
        CGroupContext *c;

        assert(u);

        assert_se(c = unit_get_cgroup_context(u));

        /* Figure out which controllers we need, based on the cgroup context object */

        if (cgroup_context_has_cpu_weight(c) ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPU;

        if (cgroup_context_has_allowed_cpus(c) || cgroup_context_has_allowed_mems(c))
                mask |= CGROUP_MASK_CPUSET;

        if (cgroup_context_has_io_config(c))
                mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            unit_has_memory_config(u))
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_DEVICE_POLICY_AUTO)
                mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;

        if (c->tasks_accounting ||
            cgroup_tasks_max_isset(&c->tasks_max))
                mask |= CGROUP_MASK_PIDS;

        return mask;
}
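/* Illustrative example (not in the original): a unit with MemoryMax=1G and TasksMax=512 but no other
 * resource settings ends up with CGROUP_MASK_MEMORY|CGROUP_MASK_PIDS here; the BPF-based features are
 * handled separately by unit_get_bpf_mask() below. */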
static CGroupMask unit_get_bpf_mask(Unit *u) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need, based on the cgroup context, possibly taking into account
         * children too. */

        if (unit_get_needs_bpf_firewall(u))
                mask |= CGROUP_MASK_BPF_FIREWALL;

        if (unit_get_needs_bpf_foreign_program(u))
                mask |= CGROUP_MASK_BPF_FOREIGN;

        if (unit_get_needs_socket_bind(u))
                mask |= CGROUP_MASK_BPF_SOCKET_BIND;

        if (unit_get_needs_restrict_network_interfaces(u))
                mask |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES;

        return mask;
}
CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself. If a unit is not properly loaded, return an empty
         * mask, as we shouldn't reflect it in the cgroup hierarchy then. */

        if (u->load_state != UNIT_LOADED)
                return 0;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        return unit_get_cgroup_mask(u) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u);
}
CGroupMask unit_get_delegate_mask(Unit *u) {
        CGroupContext *c;

        /* If delegation is turned on, then turn on selected controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */

        if (!unit_cgroup_delegate(u))
                return 0;

        assert_se(c = unit_get_cgroup_context(u));
        return c->delegate_controllers;
}
static CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}
CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children require, merged */

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (crt && crt->cgroup_members_mask_valid)
                return crt->cgroup_members_mask; /* Use cached value if possible */

        CGroupMask m = 0;
        if (u->type == UNIT_SLICE) {
                Unit *member;

                UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
                        m |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
        }

        if (crt) {
                crt->cgroup_members_mask = m;
                crt->cgroup_members_mask_valid = true;
        }

        return m;
}
CGroupMask unit_get_siblings_mask(Unit *u) {
        Unit *slice;
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        slice = UNIT_GET_SLICE(u);
        if (slice)
                return unit_get_members_mask(slice);

        return unit_get_subtree_mask(u); /* we are the top-level slice */
}
static CGroupMask unit_get_disable_mask(Unit *u) {
        CGroupContext *c;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        return c->disable_controllers;
}
CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
        CGroupMask mask;
        Unit *slice;

        assert(u);

        mask = unit_get_disable_mask(u);

        /* Returns the mask of controllers which are marked as forcibly
         * disabled in any ancestor unit or the unit in question. */

        slice = UNIT_GET_SLICE(u);
        if (slice)
                mask |= unit_get_ancestor_disable_mask(slice);

        return mask;
}
1860 CGroupMask
unit_get_target_mask(Unit
*u
) {
1861 CGroupMask own_mask
, mask
;
1863 /* This returns the cgroup mask of all controllers to enable for a specific cgroup, i.e. everything
1864 * it needs itself, plus all that its children need, plus all that its siblings need. This is
1865 * primarily useful on the legacy cgroup hierarchy, where we need to duplicate each cgroup in each
1866 * hierarchy that shall be enabled for it. */
1868 own_mask
= unit_get_own_mask(u
);
1870 if (own_mask
& CGROUP_MASK_BPF_FIREWALL
& ~u
->manager
->cgroup_supported
)
1871 emit_bpf_firewall_warning(u
);
1873 mask
= own_mask
| unit_get_members_mask(u
) | unit_get_siblings_mask(u
);
1875 mask
&= u
->manager
->cgroup_supported
;
1876 mask
&= ~unit_get_ancestor_disable_mask(u
);
1881 CGroupMask
unit_get_enable_mask(Unit
*u
) {
1884 /* This returns the cgroup mask of all controllers to enable
1885 * for the children of a specific cgroup. This is primarily
1886 * useful for the unified cgroup hierarchy, where each cgroup
1887 * controls which controllers are enabled for its children. */
1889 mask
= unit_get_members_mask(u
);
1890 mask
&= u
->manager
->cgroup_supported
;
1891 mask
&= ~unit_get_ancestor_disable_mask(u
);
1896 void unit_invalidate_cgroup_members_masks(Unit
*u
) {
1901 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
1905 /* Recurse invalidate the member masks cache all the way up the tree */
1906 crt
->cgroup_members_mask_valid
= false;
1908 slice
= UNIT_GET_SLICE(u
);
1910 unit_invalidate_cgroup_members_masks(slice
);
1913 static int unit_default_cgroup_path(const Unit
*u
, char **ret
) {
1914 _cleanup_free_
char *p
= NULL
;
1920 if (unit_has_name(u
, SPECIAL_ROOT_SLICE
))
1921 p
= strdup(u
->manager
->cgroup_root
);
1923 _cleanup_free_
char *escaped
= NULL
, *slice_path
= NULL
;
1926 slice
= UNIT_GET_SLICE(u
);
1927 if (slice
&& !unit_has_name(slice
, SPECIAL_ROOT_SLICE
)) {
1928 r
= cg_slice_to_path(slice
->id
, &slice_path
);
1933 r
= cg_escape(u
->id
, &escaped
);
1937 p
= path_join(empty_to_root(u
->manager
->cgroup_root
), slice_path
, escaped
);
1946 static int unit_set_cgroup_path(Unit
*u
, const char *path
) {
1947 _cleanup_free_
char *p
= NULL
;
1953 crt
= unit_get_cgroup_runtime(u
);
1954 if (crt
&& streq_ptr(crt
->cgroup_path
, path
))
1957 unit_release_cgroup(u
, /* drop_cgroup_runtime = */ true);
1959 crt
= unit_setup_cgroup_runtime(u
);
1968 r
= hashmap_put(u
->manager
->cgroup_unit
, p
, u
);
1973 assert(!crt
->cgroup_path
);
1974 crt
->cgroup_path
= TAKE_PTR(p
);
1979 int unit_get_cgroup_path_with_fallback(const Unit
*u
, char **ret
) {
1983 const CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
1984 if (!crt
|| !crt
->cgroup_path
)
1985 return unit_default_cgroup_path(u
, ret
);
1987 return strdup_to_full(ret
, crt
->cgroup_path
); /* returns 1 -> cgroup_path is alive */
1990 static int unit_watch_cgroup(Unit
*u
) {
1991 _cleanup_free_
char *events
= NULL
;
1996 /* Watches the "cgroups.events" attribute of this unit's cgroup for "empty" events, but only if
1997 * cgroupv2 is available. */
1999 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
2000 if (!crt
|| !crt
->cgroup_path
)
2003 if (crt
->cgroup_control_inotify_wd
>= 0)
2006 /* No point in watch the top-level slice, it's never going to run empty. */
2007 if (unit_has_name(u
, SPECIAL_ROOT_SLICE
))
2010 r
= hashmap_ensure_allocated(&u
->manager
->cgroup_control_inotify_wd_unit
, &trivial_hash_ops
);
2014 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, crt
->cgroup_path
, "cgroup.events", &events
);
2018 crt
->cgroup_control_inotify_wd
= inotify_add_watch(u
->manager
->cgroup_inotify_fd
, events
, IN_MODIFY
);
2019 if (crt
->cgroup_control_inotify_wd
< 0) {
2021 if (errno
== ENOENT
) /* If the directory is already gone we don't need to track it, so this
2022 * is not an error */
2025 return log_unit_error_errno(u
, errno
, "Failed to add control inotify watch descriptor for control group %s: %m", empty_to_root(crt
->cgroup_path
));
2028 r
= hashmap_put(u
->manager
->cgroup_control_inotify_wd_unit
, INT_TO_PTR(crt
->cgroup_control_inotify_wd
), u
);
2030 return log_unit_error_errno(u
, r
, "Failed to add control inotify watch descriptor for control group %s to hash map: %m", empty_to_root(crt
->cgroup_path
));
2035 static int unit_watch_cgroup_memory(Unit
*u
) {
2036 _cleanup_free_
char *events
= NULL
;
2041 /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if
2042 * cgroupv2 is available. */
2044 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
2045 if (!crt
|| !crt
->cgroup_path
)
2048 CGroupContext
*c
= unit_get_cgroup_context(u
);
2052 /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie
2053 * this to memory accounting, in a way watching for OOM kills is a form of memory accounting after
2055 if (!c
->memory_accounting
)
2058 /* Don't watch inner nodes, as the kernel doesn't report oom_kill events recursively currently, and
2059 * we also don't want to generate a log message for each parent cgroup of a process. */
2060 if (u
->type
== UNIT_SLICE
)
2063 if (crt
->cgroup_memory_inotify_wd
>= 0)
2066 r
= hashmap_ensure_allocated(&u
->manager
->cgroup_memory_inotify_wd_unit
, &trivial_hash_ops
);
2070 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, crt
->cgroup_path
, "memory.events", &events
);
2074 crt
->cgroup_memory_inotify_wd
= inotify_add_watch(u
->manager
->cgroup_inotify_fd
, events
, IN_MODIFY
);
2075 if (crt
->cgroup_memory_inotify_wd
< 0) {
2077 if (errno
== ENOENT
) /* If the directory is already gone we don't need to track it, so this
2078 * is not an error */
2081 return log_unit_error_errno(u
, errno
, "Failed to add memory inotify watch descriptor for control group %s: %m", empty_to_root(crt
->cgroup_path
));
2084 r
= hashmap_put(u
->manager
->cgroup_memory_inotify_wd_unit
, INT_TO_PTR(crt
->cgroup_memory_inotify_wd
), u
);
2086 return log_unit_error_errno(u
, r
, "Failed to add memory inotify watch descriptor for control group %s to hash map: %m", empty_to_root(crt
->cgroup_path
));
2091 static int unit_update_cgroup(
2093 CGroupMask target_mask
,
2094 CGroupMask enable_mask
,
2095 ManagerState state
) {
2097 _cleanup_free_
char *cgroup
= NULL
, *cgroup_full_path
= NULL
;
2098 bool set_path
, created
;
2103 if (!UNIT_HAS_CGROUP_CONTEXT(u
))
2106 if (u
->freezer_state
!= FREEZER_RUNNING
)
2107 return log_unit_error_errno(u
, SYNTHETIC_ERRNO(EBUSY
), "Cannot realize cgroup for frozen unit.");
2109 r
= unit_get_cgroup_path_with_fallback(u
, &cgroup
);
2111 return log_unit_error_errno(u
, r
, "Failed to get cgroup path: %m");
2114 /* First, create our own group */
2115 r
= cg_create(cgroup
);
2117 return log_unit_error_errno(u
, r
, "Failed to create cgroup %s: %m", empty_to_root(cgroup
));
2121 r
= unit_set_cgroup_path(u
, cgroup
);
2123 return log_unit_error_errno(u
, r
, "Picked control group '%s' as default, but it's in use already.", empty_to_root(cgroup
));
2125 return log_unit_error_errno(u
, r
, "Failed to set unit's control group path to '%s': %m", empty_to_root(cgroup
));
2129 CGroupRuntime
*crt
= ASSERT_PTR(unit_get_cgroup_runtime(u
));
2131 uint64_t cgroup_id
= 0;
2132 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, crt
->cgroup_path
, NULL
, &cgroup_full_path
);
2134 r
= cg_path_get_cgroupid(cgroup_full_path
, &cgroup_id
);
2136 log_unit_full_errno(u
, ERRNO_IS_NOT_SUPPORTED(r
) ? LOG_DEBUG
: LOG_WARNING
, r
,
2137 "Failed to get cgroup ID of cgroup %s, ignoring: %m", cgroup_full_path
);
2139 log_unit_warning_errno(u
, r
, "Failed to get full cgroup path on cgroup %s, ignoring: %m", empty_to_root(crt
->cgroup_path
));
2141 crt
->cgroup_id
= cgroup_id
;
2143 /* Start watching it */
2144 (void) unit_watch_cgroup(u
);
2145 (void) unit_watch_cgroup_memory(u
);
2147 /* For v2 we preserve enabled controllers in delegated units, adjust others, */
2148 if (created
|| !unit_cgroup_delegate(u
)) {
2149 CGroupMask result_mask
= 0;
2151 /* Enable all controllers we need */
2152 r
= cg_enable(u
->manager
->cgroup_supported
, enable_mask
, crt
->cgroup_path
, &result_mask
);
2154 log_unit_warning_errno(u
, r
, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", empty_to_root(crt
->cgroup_path
));
2156 /* Remember what's actually enabled now */
2157 crt
->cgroup_enabled_mask
= result_mask
;
2160 /* Keep track that this is now realized */
2161 crt
->cgroup_realized_mask
= target_mask
;
2163 /* Set attributes */
2164 cgroup_context_apply(u
, target_mask
, state
);
2165 cgroup_xattr_apply(u
);
2167 /* For most units we expect that memory monitoring is set up before the unit is started and we won't
2168 * touch it after. For PID 1 this is different though, because we couldn't possibly do that given
2169 * that PID 1 runs before init.scope is even set up. Hence, whenever init.scope is realized, let's
2170 * try to open the memory pressure interface anew. */
2171 if (unit_has_name(u
, SPECIAL_INIT_SCOPE
))
2172 (void) manager_setup_memory_pressure_event_source(u
->manager
);
2177 static int unit_attach_pid_to_cgroup_via_bus(Unit
*u
, pid_t pid
, const char *suffix_path
) {
2178 _cleanup_(sd_bus_error_free
) sd_bus_error error
= SD_BUS_ERROR_NULL
;
2184 if (MANAGER_IS_SYSTEM(u
->manager
))
2187 if (!u
->manager
->system_bus
)
2190 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
2191 if (!crt
|| !crt
->cgroup_path
)
2194 /* Determine this unit's cgroup path relative to our cgroup root */
2195 pp
= path_startswith(crt
->cgroup_path
, u
->manager
->cgroup_root
);
2199 pp
= strjoina("/", pp
, suffix_path
);
2202 r
= bus_call_method(u
->manager
->system_bus
,
2204 "AttachProcessesToUnit",
2207 NULL
/* empty unit name means client's unit, i.e. us */, pp
, 1, (uint32_t) pid
);
2209 return log_unit_debug_errno(u
, r
, "Failed to attach unit process " PID_FMT
" via the bus: %s", pid
, bus_error_message(&error
, r
));
2214 int unit_attach_pids_to_cgroup(Unit
*u
, Set
*pids
, const char *suffix_path
) {
2215 _cleanup_free_
char *joined
= NULL
;
2221 if (!UNIT_HAS_CGROUP_CONTEXT(u
))
2224 if (set_isempty(pids
))
2227 /* Load any custom firewall BPF programs here once to test if they are existing and actually loadable.
2228 * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */
2229 r
= bpf_firewall_load_custom(u
);
2233 r
= unit_realize_cgroup(u
);
2237 CGroupRuntime
*crt
= ASSERT_PTR(unit_get_cgroup_runtime(u
));
2239 if (isempty(suffix_path
))
2240 p
= crt
->cgroup_path
;
2242 joined
= path_join(crt
->cgroup_path
, suffix_path
);
2250 SET_FOREACH(pid
, pids
) {
2252 /* Unfortunately we cannot add pids by pidfd to a cgroup. Hence we have to use PIDs instead,
2253 * which of course is racy. Let's shorten the race a bit though, and re-validate the PID
2254 * before we use it */
2255 r
= pidref_verify(pid
);
2257 log_unit_info_errno(u
, r
, "PID " PID_FMT
" vanished before we could move it to target cgroup '%s', skipping: %m", pid
->pid
, empty_to_root(p
));
2261 r
= cg_attach(p
, pid
->pid
);
2263 bool again
= MANAGER_IS_USER(u
->manager
) && ERRNO_IS_NEG_PRIVILEGE(r
);
2265 log_unit_full_errno(u
, again
? LOG_DEBUG
: LOG_INFO
, r
,
2266 "Couldn't move process "PID_FMT
" to%s requested cgroup '%s': %m",
2267 pid
->pid
, again
? " directly" : "", empty_to_root(p
));
2272 /* If we are in a user instance, and we can't move the process ourselves due
2273 * to permission problems, let's ask the system instance about it instead.
2274 * Since it's more privileged it might be able to move the process across the
2275 * leaves of a subtree whose top node is not owned by us. */
2277 z
= unit_attach_pid_to_cgroup_via_bus(u
, pid
->pid
, suffix_path
);
2281 log_unit_info_errno(u
, z
, "Couldn't move process "PID_FMT
" to requested cgroup '%s' (directly or via the system bus): %m", pid
->pid
, empty_to_root(p
));
2289 /* the cgroup is definitely not empty now. in case the unit was in the cgroup empty queue,
2290 * drop it from there */
2291 unit_remove_from_cgroup_empty_queue(u
);
2294 ret
++; /* Count successful additions */
2300 int unit_remove_subcgroup(Unit
*u
, const char *suffix_path
) {
2305 if (!UNIT_HAS_CGROUP_CONTEXT(u
))
2308 if (!unit_cgroup_delegate(u
))
2311 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
2312 if (!crt
|| !crt
->cgroup_path
)
2315 _cleanup_free_
char *j
= NULL
;
2318 if (empty_or_root(suffix_path
)) {
2319 d
= empty_to_root(crt
->cgroup_path
);
2320 delete_root
= false; /* Don't attempt to delete the main cgroup of this unit */
2322 j
= path_join(crt
->cgroup_path
, suffix_path
);
2330 log_unit_debug(u
, "Removing subcgroup '%s'...", d
);
2332 r
= cg_trim(d
, delete_root
);
2334 return log_unit_debug_errno(u
, r
, "Failed to fully %s cgroup '%s': %m", delete_root
? "remove" : "trim", d
);
2339 static bool unit_has_mask_realized(
2341 CGroupMask target_mask
,
2342 CGroupMask enable_mask
) {
2346 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
2350 /* Returns true if this unit is fully realized. We check four things:
2352 * 1. Whether the cgroup was created at all
2353 * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1)
2354 * 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2)
2355 * 4. Whether the invalidation mask is currently zero
2357 * If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note
2358 * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for
2359 * real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask
2360 * is only matters for cgroup v1 controllers, and cgroup_enabled_mask only used for cgroup v2, and if they
2361 * differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks with controllers are
2362 * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
2363 * simply don't matter. */
2365 return crt
->cgroup_path
&&
2366 ((crt
->cgroup_realized_mask
^ target_mask
) & CGROUP_MASK_V1
) == 0 &&
2367 ((crt
->cgroup_enabled_mask
^ enable_mask
) & CGROUP_MASK_V2
) == 0 &&
2368 crt
->cgroup_invalidated_mask
== 0;
2371 static bool unit_has_mask_disables_realized(
2373 CGroupMask target_mask
,
2374 CGroupMask enable_mask
) {
2378 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
2382 /* Returns true if all controllers which should be disabled are indeed disabled.
2384 * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
2385 * already removed. */
2387 return !crt
->cgroup_path
||
2388 (FLAGS_SET(crt
->cgroup_realized_mask
, target_mask
& CGROUP_MASK_V1
) &&
2389 FLAGS_SET(crt
->cgroup_enabled_mask
, enable_mask
& CGROUP_MASK_V2
));
2392 static bool unit_has_mask_enables_realized(
2394 CGroupMask target_mask
,
2395 CGroupMask enable_mask
) {
2399 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
2403 /* Returns true if all controllers which should be enabled are indeed enabled.
2405 * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
2406 * we want to add is already added. */
2408 return crt
->cgroup_path
&&
2409 ((crt
->cgroup_realized_mask
| target_mask
) & CGROUP_MASK_V1
) == (crt
->cgroup_realized_mask
& CGROUP_MASK_V1
) &&
2410 ((crt
->cgroup_enabled_mask
| enable_mask
) & CGROUP_MASK_V2
) == (crt
->cgroup_enabled_mask
& CGROUP_MASK_V2
);
2413 void unit_add_to_cgroup_realize_queue(Unit
*u
) {
2416 if (u
->in_cgroup_realize_queue
)
2419 LIST_APPEND(cgroup_realize_queue
, u
->manager
->cgroup_realize_queue
, u
);
2420 u
->in_cgroup_realize_queue
= true;
2423 static void unit_remove_from_cgroup_realize_queue(Unit
*u
) {
2426 if (!u
->in_cgroup_realize_queue
)
2429 LIST_REMOVE(cgroup_realize_queue
, u
->manager
->cgroup_realize_queue
, u
);
2430 u
->in_cgroup_realize_queue
= false;
2433 /* Controllers can only be enabled breadth-first, from the root of the
2434 * hierarchy downwards to the unit in question. */
2435 static int unit_realize_cgroup_now_enable(Unit
*u
, ManagerState state
) {
2436 CGroupMask target_mask
, enable_mask
, new_target_mask
, new_enable_mask
;
2442 /* First go deal with this unit's parent, or we won't be able to enable
2443 * any new controllers at this layer. */
2444 slice
= UNIT_GET_SLICE(u
);
2446 r
= unit_realize_cgroup_now_enable(slice
, state
);
2451 target_mask
= unit_get_target_mask(u
);
2452 enable_mask
= unit_get_enable_mask(u
);
2454 /* We can only enable in this direction, don't try to disable anything.
2456 if (unit_has_mask_enables_realized(u
, target_mask
, enable_mask
))
2459 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
2461 new_target_mask
= (crt
? crt
->cgroup_realized_mask
: 0) | target_mask
;
2462 new_enable_mask
= (crt
? crt
->cgroup_enabled_mask
: 0) | enable_mask
;
2464 return unit_update_cgroup(u
, new_target_mask
, new_enable_mask
, state
);
2467 /* Controllers can only be disabled depth-first, from the leaves of the
2468 * hierarchy upwards to the unit in question. */
2469 static int unit_realize_cgroup_now_disable(Unit
*u
, ManagerState state
) {
2474 if (u
->type
!= UNIT_SLICE
)
2477 UNIT_FOREACH_DEPENDENCY(m
, u
, UNIT_ATOM_SLICE_OF
) {
2478 CGroupMask target_mask
, enable_mask
, new_target_mask
, new_enable_mask
;
2481 CGroupRuntime
*rt
= unit_get_cgroup_runtime(m
);
2485 /* The cgroup for this unit might not actually be fully realised yet, in which case it isn't
2486 * holding any controllers open anyway. */
2487 if (!rt
->cgroup_path
)
2490 /* We must disable those below us first in order to release the controller. */
2491 if (m
->type
== UNIT_SLICE
)
2492 (void) unit_realize_cgroup_now_disable(m
, state
);
2494 target_mask
= unit_get_target_mask(m
);
2495 enable_mask
= unit_get_enable_mask(m
);
2497 /* We can only disable in this direction, don't try to enable anything. */
2498 if (unit_has_mask_disables_realized(m
, target_mask
, enable_mask
))
2501 new_target_mask
= rt
->cgroup_realized_mask
& target_mask
;
2502 new_enable_mask
= rt
->cgroup_enabled_mask
& enable_mask
;
2504 r
= unit_update_cgroup(m
, new_target_mask
, new_enable_mask
, state
);
2512 /* Check if necessary controllers and attributes for a unit are in place.
2514 * - If so, do nothing.
2515 * - If not, create paths, move processes over, and set attributes.
2517 * Controllers can only be *enabled* in a breadth-first way, and *disabled* in
2518 * a depth-first way. As such the process looks like this:
2520 * Suppose we have a cgroup hierarchy which looks like this:
2533 * 1. We want to realise cgroup "d" now.
2534 * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
2535 * 3. cgroup "k" just started requesting the memory controller.
2537 * To make this work we must do the following in order:
2539 * 1. Disable CPU controller in k, j
2540 * 2. Disable CPU controller in d
2541 * 3. Enable memory controller in root
2542 * 4. Enable memory controller in a
2543 * 5. Enable memory controller in d
2544 * 6. Enable memory controller in k
2546 * Notice that we need to touch j in one direction, but not the other. We also
2547 * don't go beyond d when disabling -- it's up to "a" to get realized if it
2548 * wants to disable further. The basic rules are therefore:
2550 * - If you're disabling something, you need to realise all of the cgroups from
2551 * your recursive descendants to the root. This starts from the leaves.
2552 * - If you're enabling something, you need to realise from the root cgroup
2553 * downwards, but you don't need to iterate your recursive descendants.
2555 * Returns 0 on success and < 0 on failure. */
2556 static int unit_realize_cgroup_now(Unit
*u
, ManagerState state
) {
2557 CGroupMask target_mask
, enable_mask
;
2563 unit_remove_from_cgroup_realize_queue(u
);
2565 target_mask
= unit_get_target_mask(u
);
2566 enable_mask
= unit_get_enable_mask(u
);
2568 if (unit_has_mask_realized(u
, target_mask
, enable_mask
))
2571 /* Disable controllers below us, if there are any */
2572 r
= unit_realize_cgroup_now_disable(u
, state
);
2576 /* Enable controllers above us, if there are any */
2577 slice
= UNIT_GET_SLICE(u
);
2579 r
= unit_realize_cgroup_now_enable(slice
, state
);
2584 /* Now actually deal with the cgroup we were trying to realise and set attributes */
2585 r
= unit_update_cgroup(u
, target_mask
, enable_mask
, state
);
2589 CGroupRuntime
*crt
= ASSERT_PTR(unit_get_cgroup_runtime(u
));
2591 /* Now, reset the invalidation mask */
2592 crt
->cgroup_invalidated_mask
= 0;
2596 unsigned manager_dispatch_cgroup_realize_queue(Manager
*m
) {
2604 state
= manager_state(m
);
2606 while ((i
= m
->cgroup_realize_queue
)) {
2607 assert(i
->in_cgroup_realize_queue
);
2609 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i
))) {
2610 /* Maybe things changed, and the unit is not actually active anymore? */
2611 unit_remove_from_cgroup_realize_queue(i
);
2615 r
= unit_realize_cgroup_now(i
, state
);
2617 log_warning_errno(r
, "Failed to realize cgroups for queued unit %s, ignoring: %m", i
->id
);
2625 void unit_add_family_to_cgroup_realize_queue(Unit
*u
) {
2627 assert(u
->type
== UNIT_SLICE
);
2629 /* Family of a unit for is defined as (immediate) children of the unit and immediate children of all
2632 * Ideally we would enqueue ancestor path only (bottom up). However, on cgroup-v1 scheduling becomes
2633 * very weird if two units that own processes reside in the same slice, but one is realized in the
2634 * "cpu" hierarchy and one is not (for example because one has CPUWeight= set and the other does
2635 * not), because that means individual processes need to be scheduled against whole cgroups. Let's
2636 * avoid this asymmetry by always ensuring that siblings of a unit are always realized in their v1
2637 * controller hierarchies too (if unit requires the controller to be realized).
2639 * The function must invalidate cgroup_members_mask of all ancestors in order to calculate up to date
2643 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
2645 /* Children of u likely changed when we're called */
2647 crt
->cgroup_members_mask_valid
= false;
2650 UNIT_FOREACH_DEPENDENCY(m
, u
, UNIT_ATOM_SLICE_OF
) {
2652 /* No point in doing cgroup application for units without active processes. */
2653 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m
)))
2656 /* We only enqueue siblings if they were realized once at least, in the main
2658 crt
= unit_get_cgroup_runtime(m
);
2659 if (!crt
|| !crt
->cgroup_path
)
2662 /* If the unit doesn't need any new controllers and has current ones
2663 * realized, it doesn't need any changes. */
2664 if (unit_has_mask_realized(m
,
2665 unit_get_target_mask(m
),
2666 unit_get_enable_mask(m
)))
2669 unit_add_to_cgroup_realize_queue(m
);
2672 /* Parent comes after children */
2673 unit_add_to_cgroup_realize_queue(u
);
2675 u
= UNIT_GET_SLICE(u
);
2679 int unit_realize_cgroup(Unit
*u
) {
2684 if (!UNIT_HAS_CGROUP_CONTEXT(u
))
2687 /* So, here's the deal: when realizing the cgroups for this unit, we need to first create all
2688 * parents, but there's more actually: for the weight-based controllers we also need to make sure
2689 * that all our siblings (i.e. units that are in the same slice as we are) have cgroups, too. On the
2690 * other hand, when a controller is removed from realized set, it may become unnecessary in siblings
2691 * and ancestors and they should be (de)realized too.
2693 * This call will defer work on the siblings and derealized ancestors to the next event loop
2694 * iteration and synchronously creates the parent cgroups (unit_realize_cgroup_now). */
2696 slice
= UNIT_GET_SLICE(u
);
2698 unit_add_family_to_cgroup_realize_queue(slice
);
2700 /* And realize this one now (and apply the values) */
2701 return unit_realize_cgroup_now(u
, manager_state(u
->manager
));
2704 void unit_release_cgroup(Unit
*u
, bool drop_cgroup_runtime
) {
2707 /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call
2708 * when we close down everything for reexecution, where we really want to leave the cgroup in place. */
2710 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
2714 if (crt
->cgroup_path
) {
2715 (void) hashmap_remove(u
->manager
->cgroup_unit
, crt
->cgroup_path
);
2716 crt
->cgroup_path
= mfree(crt
->cgroup_path
);
2719 if (crt
->cgroup_control_inotify_wd
>= 0) {
2720 if (inotify_rm_watch(u
->manager
->cgroup_inotify_fd
, crt
->cgroup_control_inotify_wd
) < 0)
2721 log_unit_debug_errno(u
, errno
, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", crt
->cgroup_control_inotify_wd
, u
->id
);
2723 (void) hashmap_remove(u
->manager
->cgroup_control_inotify_wd_unit
, INT_TO_PTR(crt
->cgroup_control_inotify_wd
));
2724 crt
->cgroup_control_inotify_wd
= -1;
2727 if (crt
->cgroup_memory_inotify_wd
>= 0) {
2728 if (inotify_rm_watch(u
->manager
->cgroup_inotify_fd
, crt
->cgroup_memory_inotify_wd
) < 0)
2729 log_unit_debug_errno(u
, errno
, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", crt
->cgroup_memory_inotify_wd
, u
->id
);
2731 (void) hashmap_remove(u
->manager
->cgroup_memory_inotify_wd_unit
, INT_TO_PTR(crt
->cgroup_memory_inotify_wd
));
2732 crt
->cgroup_memory_inotify_wd
= -1;
2735 if (drop_cgroup_runtime
)
2736 *(CGroupRuntime
**) ((uint8_t*) u
+ UNIT_VTABLE(u
)->cgroup_runtime_offset
) = cgroup_runtime_free(crt
);
2739 int unit_cgroup_is_empty(Unit
*u
) {
2744 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
2747 if (!crt
->cgroup_path
)
2750 r
= cg_is_empty(SYSTEMD_CGROUP_CONTROLLER
, crt
->cgroup_path
);
2752 log_unit_debug_errno(u
, r
, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(crt
->cgroup_path
));
2756 static bool unit_maybe_release_cgroup(Unit
*u
) {
2759 /* Releases the cgroup only if it is recursively empty.
2760 * Returns true if the cgroup was released, false otherwise. */
2764 /* Don't release the cgroup if there are still processes under it. If we get notified later when all
2765 * the processes exit (e.g. the processes were in D-state and exited after the unit was marked as
2766 * failed) we need the cgroup paths to continue to be tracked by the manager so they can be looked up
2767 * and cleaned up later. */
2768 r
= unit_cgroup_is_empty(u
);
2770 /* Do not free CGroupRuntime when called from unit_prune_cgroup. Various accounting data
2771 * we should keep, especially CPU usage and *_peak ones which would be shown even after
2772 * the unit stops. */
2773 unit_release_cgroup(u
, /* drop_cgroup_runtime = */ false);
2780 static int unit_prune_cgroup_via_bus(Unit
*u
) {
2781 _cleanup_(sd_bus_error_free
) sd_bus_error error
= SD_BUS_ERROR_NULL
;
2787 if (MANAGER_IS_SYSTEM(u
->manager
))
2790 if (!u
->manager
->system_bus
)
2793 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
2794 if (!crt
|| !crt
->cgroup_path
)
2797 /* Determine this unit's cgroup path relative to our cgroup root */
2798 const char *pp
= path_startswith_full(
2800 u
->manager
->cgroup_root
,
2801 PATH_STARTSWITH_RETURN_LEADING_SLASH
|PATH_STARTSWITH_REFUSE_DOT_DOT
);
2805 r
= bus_call_method(u
->manager
->system_bus
,
2807 "RemoveSubgroupFromUnit",
2810 NULL
/* empty unit name means client's unit, i.e. us */,
2814 return log_unit_debug_errno(u
, r
, "Failed to trim cgroup via the bus: %s", bus_error_message(&error
, r
));
2819 void unit_prune_cgroup(Unit
*u
) {
2825 /* Removes the cgroup, if empty and possible, and stops watching it. */
2826 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
2827 if (!crt
|| !crt
->cgroup_path
)
2830 /* Cache the last resource usage values before we destroy the cgroup */
2831 (void) unit_get_cpu_usage(u
, /* ret = */ NULL
);
2833 for (CGroupMemoryAccountingMetric metric
= 0; metric
<= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST
; metric
++)
2834 (void) unit_get_memory_accounting(u
, metric
, /* ret = */ NULL
);
2836 /* All IO metrics are read at once from the underlying cgroup, so issue just a single call */
2837 (void) unit_get_io_accounting(u
, _CGROUP_IO_ACCOUNTING_METRIC_INVALID
, /* ret = */ NULL
);
2839 /* We do not cache IP metrics here because the firewall objects are not freed with cgroups */
2842 (void) bpf_restrict_fs_cleanup(u
); /* Remove cgroup from the global LSM BPF map */
2845 unit_modify_nft_set(u
, /* add = */ false);
2847 is_root_slice
= unit_has_name(u
, SPECIAL_ROOT_SLICE
);
2849 r
= cg_trim(crt
->cgroup_path
, !is_root_slice
);
2851 int k
= unit_prune_cgroup_via_bus(u
);
2854 log_unit_debug_errno(u
, r
, "Failed to destroy cgroup %s on our own (%m), but worked when talking to PID 1.", empty_to_root(crt
->cgroup_path
));
2856 /* One reason we could have failed here is, that the cgroup still contains a process.
2857 * However, if the cgroup becomes removable at a later time, it might be removed when
2858 * the containing slice is stopped. So even if we failed now, this unit shouldn't
2859 * assume that the cgroup is still realized the next time it is started. Do not
2860 * return early on error, continue cleanup. */
2861 log_unit_full_errno(u
, r
== -EBUSY
? LOG_DEBUG
: LOG_WARNING
, r
,
2862 "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(crt
->cgroup_path
));
2869 if (!unit_maybe_release_cgroup(u
)) /* Returns true if the cgroup was released */
2872 assert(crt
== unit_get_cgroup_runtime(u
));
2873 assert(!crt
->cgroup_path
);
2875 crt
->cgroup_realized_mask
= 0;
2876 crt
->cgroup_enabled_mask
= 0;
2878 crt
->bpf_device_control_installed
= bpf_program_free(crt
->bpf_device_control_installed
);
2881 int unit_search_main_pid(Unit
*u
, PidRef
*ret
) {
2882 _cleanup_(pidref_done
) PidRef pidref
= PIDREF_NULL
;
2883 _cleanup_fclose_
FILE *f
= NULL
;
2889 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
2890 if (!crt
|| !crt
->cgroup_path
)
2893 r
= cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER
, crt
->cgroup_path
, &f
);
2898 _cleanup_(pidref_done
) PidRef npidref
= PIDREF_NULL
;
2900 /* cg_read_pidref() will return an error on unmapped PIDs.
2901 * We can't reasonably deal with units that contain those. */
2902 r
= cg_read_pidref(f
, &npidref
, CGROUP_DONT_SKIP_UNMAPPED
);
2908 if (pidref_equal(&pidref
, &npidref
)) /* seen already, cgroupfs reports duplicates! */
2911 if (pidref_is_my_child(&npidref
) <= 0) /* ignore processes further down the tree */
2914 if (pidref_is_set(&pidref
) != 0)
2915 /* Dang, there's more than one daemonized PID in this group, so we don't know what
2916 * process is the main process. */
2919 pidref
= TAKE_PIDREF(npidref
);
2922 if (!pidref_is_set(&pidref
))
2925 *ret
= TAKE_PIDREF(pidref
);
2929 static int on_cgroup_empty_event(sd_event_source
*s
, void *userdata
) {
2930 Manager
*m
= ASSERT_PTR(userdata
);
2936 u
= m
->cgroup_empty_queue
;
2940 assert(u
->in_cgroup_empty_queue
);
2941 u
->in_cgroup_empty_queue
= false;
2942 LIST_REMOVE(cgroup_empty_queue
, m
->cgroup_empty_queue
, u
);
2944 if (m
->cgroup_empty_queue
) {
2945 /* More stuff queued, let's make sure we remain enabled */
2946 r
= sd_event_source_set_enabled(s
, SD_EVENT_ONESHOT
);
2948 log_debug_errno(r
, "Failed to reenable cgroup empty event source, ignoring: %m");
2951 /* Update state based on OOM kills before we notify about cgroup empty event */
2952 (void) unit_check_oom(u
);
2953 (void) unit_check_oomd_kill(u
);
2955 unit_add_to_gc_queue(u
);
2957 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u
)))
2958 unit_prune_cgroup(u
);
2959 else if (UNIT_VTABLE(u
)->notify_cgroup_empty
)
2960 UNIT_VTABLE(u
)->notify_cgroup_empty(u
);
2965 static void unit_add_to_cgroup_empty_queue(Unit
*u
) {
2970 /* Note that cgroup empty events are dispatched in a separate queue with a lower priority than
2971 * the SIGCHLD handler, so that we always use SIGCHLD if we can get it first, and only use
2972 * the cgroup empty notifications if there's no SIGCHLD pending (which might happen if the cgroup
2973 * doesn't contain processes that are our own child, which is typically the case for scope units). */
2975 if (u
->in_cgroup_empty_queue
)
2978 LIST_PREPEND(cgroup_empty_queue
, u
->manager
->cgroup_empty_queue
, u
);
2979 u
->in_cgroup_empty_queue
= true;
2981 /* Trigger the defer event */
2982 r
= sd_event_source_set_enabled(u
->manager
->cgroup_empty_event_source
, SD_EVENT_ONESHOT
);
2984 log_debug_errno(r
, "Failed to enable cgroup empty event source: %m");
2987 static void unit_remove_from_cgroup_empty_queue(Unit
*u
) {
2990 if (!u
->in_cgroup_empty_queue
)
2993 LIST_REMOVE(cgroup_empty_queue
, u
->manager
->cgroup_empty_queue
, u
);
2994 u
->in_cgroup_empty_queue
= false;
2997 int unit_check_oomd_kill(Unit
*u
) {
2998 _cleanup_free_
char *value
= NULL
;
3005 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
3006 if (!crt
|| !crt
->cgroup_path
)
3009 r
= cg_get_xattr(crt
->cgroup_path
, "user.oomd_ooms", &value
, /* ret_size= */ NULL
);
3010 if (r
< 0 && !ERRNO_IS_XATTR_ABSENT(r
))
3013 if (!isempty(value
)) {
3014 r
= safe_atou64(value
, &n
);
3019 increased
= n
> crt
->managed_oom_kill_last
;
3020 crt
->managed_oom_kill_last
= n
;
3026 value
= mfree(value
);
3027 r
= cg_get_xattr(crt
->cgroup_path
, "user.oomd_kill", &value
, /* ret_size= */ NULL
);
3028 if (r
>= 0 && !isempty(value
))
3029 (void) safe_atou64(value
, &n
);
3032 log_unit_struct(u
, LOG_NOTICE
,
3033 LOG_MESSAGE_ID(SD_MESSAGE_UNIT_OOMD_KILL_STR
),
3034 LOG_UNIT_INVOCATION_ID(u
),
3035 LOG_UNIT_MESSAGE(u
, "systemd-oomd killed %"PRIu64
" process(es) in this unit.", n
),
3036 LOG_ITEM("N_PROCESSES=%" PRIu64
, n
));
3038 log_unit_struct(u
, LOG_NOTICE
,
3039 LOG_MESSAGE_ID(SD_MESSAGE_UNIT_OOMD_KILL_STR
),
3040 LOG_UNIT_INVOCATION_ID(u
),
3041 LOG_UNIT_MESSAGE(u
, "systemd-oomd killed some process(es) in this unit."));
3043 unit_notify_cgroup_oom(u
, /* managed_oom= */ true);
3048 int unit_check_oom(Unit
*u
) {
3049 _cleanup_free_
char *oom_kill
= NULL
;
3054 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
3055 if (!crt
|| !crt
->cgroup_path
)
3058 r
= cg_get_keyed_attribute(
3062 STRV_MAKE("oom_kill"),
3064 if (IN_SET(r
, -ENOENT
, -ENXIO
)) /* Handle gracefully if cgroup or oom_kill attribute don't exist */
3067 return log_unit_debug_errno(u
, r
, "Failed to read oom_kill field of memory.events cgroup attribute: %m");
3069 r
= safe_atou64(oom_kill
, &c
);
3071 return log_unit_debug_errno(u
, r
, "Failed to parse oom_kill field: %m");
3074 increased
= c
> crt
->oom_kill_last
;
3075 crt
->oom_kill_last
= c
;
3080 log_unit_struct(u
, LOG_NOTICE
,
3081 LOG_MESSAGE_ID(SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR
),
3082 LOG_UNIT_INVOCATION_ID(u
),
3083 LOG_UNIT_MESSAGE(u
, "A process of this unit has been killed by the OOM killer."));
3085 unit_notify_cgroup_oom(u
, /* managed_oom= */ false);
3090 static int on_cgroup_oom_event(sd_event_source
*s
, void *userdata
) {
3091 Manager
*m
= ASSERT_PTR(userdata
);
3097 u
= m
->cgroup_oom_queue
;
3101 assert(u
->in_cgroup_oom_queue
);
3102 u
->in_cgroup_oom_queue
= false;
3103 LIST_REMOVE(cgroup_oom_queue
, m
->cgroup_oom_queue
, u
);
3105 if (m
->cgroup_oom_queue
) {
3106 /* More stuff queued, let's make sure we remain enabled */
3107 r
= sd_event_source_set_enabled(s
, SD_EVENT_ONESHOT
);
3109 log_debug_errno(r
, "Failed to reenable cgroup oom event source, ignoring: %m");
3112 (void) unit_check_oom(u
);
3113 unit_add_to_gc_queue(u
);
3118 static void unit_add_to_cgroup_oom_queue(Unit
*u
) {
3123 if (u
->in_cgroup_oom_queue
)
3126 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
3127 if (!crt
|| !crt
->cgroup_path
)
3130 LIST_PREPEND(cgroup_oom_queue
, u
->manager
->cgroup_oom_queue
, u
);
3131 u
->in_cgroup_oom_queue
= true;
3133 /* Trigger the defer event */
3134 if (!u
->manager
->cgroup_oom_event_source
) {
3135 _cleanup_(sd_event_source_unrefp
) sd_event_source
*s
= NULL
;
3137 r
= sd_event_add_defer(u
->manager
->event
, &s
, on_cgroup_oom_event
, u
->manager
);
3139 log_error_errno(r
, "Failed to create cgroup oom event source: %m");
3143 r
= sd_event_source_set_priority(s
, EVENT_PRIORITY_CGROUP_OOM
);
3145 log_error_errno(r
, "Failed to set priority of cgroup oom event source: %m");
3149 (void) sd_event_source_set_description(s
, "cgroup-oom");
3150 u
->manager
->cgroup_oom_event_source
= TAKE_PTR(s
);
3153 r
= sd_event_source_set_enabled(u
->manager
->cgroup_oom_event_source
, SD_EVENT_ONESHOT
);
3155 log_error_errno(r
, "Failed to enable cgroup oom event source: %m");
3158 static int unit_check_cgroup_events(Unit
*u
) {
3159 char *values
[2] = {};
3164 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
3165 if (!crt
|| !crt
->cgroup_path
)
3168 r
= cg_get_keyed_attribute(
3169 SYSTEMD_CGROUP_CONTROLLER
,
3172 STRV_MAKE("populated", "frozen"),
3177 /* The cgroup.events notifications can be merged together so act as we saw the given state for the
3178 * first time. The functions we call to handle given state are idempotent, which makes them
3179 * effectively remember the previous state. */
3180 if (streq(values
[0], "1"))
3181 unit_remove_from_cgroup_empty_queue(u
);
3183 unit_add_to_cgroup_empty_queue(u
);
3185 /* Disregard freezer state changes due to operations not initiated by us.
3186 * See: https://github.com/systemd/systemd/pull/13512/files#r416469963 and
3187 * https://github.com/systemd/systemd/pull/13512#issuecomment-573007207 */
3188 if (IN_SET(u
->freezer_state
, FREEZER_FREEZING
, FREEZER_FREEZING_BY_PARENT
, FREEZER_THAWING
))
3189 unit_freezer_complete(u
, streq(values
[1], "0") ? FREEZER_RUNNING
: FREEZER_FROZEN
);
3191 free_many_charp(values
, ELEMENTSOF(values
));
3195 static int on_cgroup_inotify_event(sd_event_source
*s
, int fd
, uint32_t revents
, void *userdata
) {
3196 Manager
*m
= ASSERT_PTR(userdata
);
3202 union inotify_event_buffer buffer
;
3205 l
= read(fd
, &buffer
, sizeof(buffer
));
3207 if (ERRNO_IS_TRANSIENT(errno
))
3210 return log_error_errno(errno
, "Failed to read control group inotify events: %m");
3213 FOREACH_INOTIFY_EVENT_WARN(e
, buffer
, l
) {
3217 /* Queue overflow has no watch descriptor */
3220 if (e
->mask
& IN_IGNORED
)
3221 /* The watch was just removed */
3224 /* Note that inotify might deliver events for a watch even after it was removed,
3225 * because it was queued before the removal. Let's ignore this here safely. */
3227 u
= hashmap_get(m
->cgroup_control_inotify_wd_unit
, INT_TO_PTR(e
->wd
));
3229 unit_check_cgroup_events(u
);
3231 u
= hashmap_get(m
->cgroup_memory_inotify_wd_unit
, INT_TO_PTR(e
->wd
));
3233 unit_add_to_cgroup_oom_queue(u
);
3238 static int cg_bpf_mask_supported(CGroupMask
*ret
) {
3239 CGroupMask mask
= 0;
3242 /* BPF-based firewall, device access control, and pinned foreign prog */
3243 if (bpf_program_supported() > 0)
3244 mask
|= CGROUP_MASK_BPF_FIREWALL
|
3245 CGROUP_MASK_BPF_DEVICES
|
3246 CGROUP_MASK_BPF_FOREIGN
;
3248 /* BPF-based bind{4|6} hooks */
3249 r
= bpf_socket_bind_supported();
3253 mask
|= CGROUP_MASK_BPF_SOCKET_BIND
;
3255 /* BPF-based cgroup_skb/{egress|ingress} hooks */
3256 r
= bpf_restrict_ifaces_supported();
3260 mask
|= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES
;
3266 int manager_setup_cgroup(Manager
*m
) {
3271 /* 1. Determine hierarchy */
3272 m
->cgroup_root
= mfree(m
->cgroup_root
);
3273 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 0, &m
->cgroup_root
);
3275 return log_error_errno(r
, "Cannot determine cgroup we are running in: %m");
3277 /* Chop off the init scope, if we are already located in it */
3278 char *e
= endswith(m
->cgroup_root
, "/" SPECIAL_INIT_SCOPE
);
3282 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
3283 * easily prepend it everywhere. */
3284 delete_trailing_chars(m
->cgroup_root
, "/");
3286 /* 2. Pin the cgroupfs mount, so that it cannot be unmounted */
3287 safe_close(m
->pin_cgroupfs_fd
);
3288 m
->pin_cgroupfs_fd
= open("/sys/fs/cgroup", O_PATH
|O_CLOEXEC
|O_DIRECTORY
);
3289 if (m
->pin_cgroupfs_fd
< 0)
3290 return log_error_errno(errno
, "Failed to pin cgroup hierarchy: %m");
3292 /* 3. Allocate cgroup empty defer event source */
3293 m
->cgroup_empty_event_source
= sd_event_source_disable_unref(m
->cgroup_empty_event_source
);
3294 r
= sd_event_add_defer(m
->event
, &m
->cgroup_empty_event_source
, on_cgroup_empty_event
, m
);
3296 return log_error_errno(r
, "Failed to create cgroup empty event source: %m");
3298 /* Schedule cgroup empty checks early, but after having processed service notification messages or
3299 * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
3300 * notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */
3301 r
= sd_event_source_set_priority(m
->cgroup_empty_event_source
, EVENT_PRIORITY_CGROUP_EMPTY
);
3303 return log_error_errno(r
, "Failed to set priority of cgroup empty event source: %m");
3305 r
= sd_event_source_set_enabled(m
->cgroup_empty_event_source
, SD_EVENT_OFF
);
3307 return log_error_errno(r
, "Failed to disable cgroup empty event source: %m");
3309 (void) sd_event_source_set_description(m
->cgroup_empty_event_source
, "cgroup-empty");
3311 /* 4. Install cgroup empty event notifier inotify object */
3312 m
->cgroup_inotify_event_source
= sd_event_source_disable_unref(m
->cgroup_inotify_event_source
);
3313 safe_close(m
->cgroup_inotify_fd
);
3315 m
->cgroup_inotify_fd
= inotify_init1(IN_NONBLOCK
|IN_CLOEXEC
);
3316 if (m
->cgroup_inotify_fd
< 0)
3317 return log_error_errno(errno
, "Failed to create control group inotify object: %m");
3319 r
= sd_event_add_io(m
->event
, &m
->cgroup_inotify_event_source
, m
->cgroup_inotify_fd
, EPOLLIN
, on_cgroup_inotify_event
, m
);
3321 return log_error_errno(r
, "Failed to watch control group inotify object: %m");
3323 /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
3324 * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
3325 * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
3326 r
= sd_event_source_set_priority(m
->cgroup_inotify_event_source
, EVENT_PRIORITY_CGROUP_INOTIFY
);
3328 return log_error_errno(r
, "Failed to set priority of inotify event source: %m");
3330 (void) sd_event_source_set_description(m
->cgroup_inotify_event_source
, "cgroup-inotify");
3332 /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
3333 const char *scope_path
= strjoina(m
->cgroup_root
, "/" SPECIAL_INIT_SCOPE
);
3334 r
= cg_create_and_attach(scope_path
, /* pid = */ 0);
3336 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
3337 r
= cg_migrate(m
->cgroup_root
, scope_path
, 0);
3339 log_warning_errno(r
, "Couldn't move remaining userspace processes, ignoring: %m");
3341 } else if (!MANAGER_IS_TEST_RUN(m
))
3342 return log_error_errno(r
, "Failed to create %s control group: %m", scope_path
);
3344 /* 6. Figure out which controllers are supported */
3345 r
= cg_mask_supported_subtree(m
->cgroup_root
, &m
->cgroup_supported
);
3347 return log_error_errno(r
, "Failed to determine supported controllers: %m");
3349 /* 7. Figure out which bpf-based pseudo-controllers are supported */
3351 r
= cg_bpf_mask_supported(&mask
);
3353 return log_error_errno(r
, "Failed to determine supported bpf-based pseudo-controllers: %m");
3354 m
->cgroup_supported
|= mask
;
3356 /* 8. Log which controllers are supported */
3357 for (CGroupController c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++)
3358 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c
),
3359 yes_no(m
->cgroup_supported
& CGROUP_CONTROLLER_TO_MASK(c
)));
3364 void manager_shutdown_cgroup(Manager
*m
, bool delete) {
3367 /* We can't really delete the group, since we are in it. But
3369 if (delete && m
->cgroup_root
&& !FLAGS_SET(m
->test_run_flags
, MANAGER_TEST_RUN_MINIMAL
))
3370 (void) cg_trim(m
->cgroup_root
, false);
3372 m
->cgroup_empty_event_source
= sd_event_source_disable_unref(m
->cgroup_empty_event_source
);
3374 m
->cgroup_control_inotify_wd_unit
= hashmap_free(m
->cgroup_control_inotify_wd_unit
);
3375 m
->cgroup_memory_inotify_wd_unit
= hashmap_free(m
->cgroup_memory_inotify_wd_unit
);
3377 m
->cgroup_inotify_event_source
= sd_event_source_disable_unref(m
->cgroup_inotify_event_source
);
3378 m
->cgroup_inotify_fd
= safe_close(m
->cgroup_inotify_fd
);
3380 m
->pin_cgroupfs_fd
= safe_close(m
->pin_cgroupfs_fd
);
3382 m
->cgroup_root
= mfree(m
->cgroup_root
);
3385 Unit
* manager_get_unit_by_cgroup(Manager
*m
, const char *cgroup
) {
3392 u
= hashmap_get(m
->cgroup_unit
, cgroup
);
3396 p
= strdupa_safe(cgroup
);
3400 e
= strrchr(p
, '/');
3402 return NULL
; /* reached cgroup root? return NULL and possibly fall back to manager_get_unit_by_pidref_watching() */
3406 u
= hashmap_get(m
->cgroup_unit
, p
);
3412 Unit
* manager_get_unit_by_pidref_cgroup(Manager
*m
, const PidRef
*pid
) {
3413 _cleanup_free_
char *cgroup
= NULL
;
3417 if (cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &cgroup
) < 0)
3420 return manager_get_unit_by_cgroup(m
, cgroup
);
3423 Unit
* manager_get_unit_by_pidref_watching(Manager
*m
, const PidRef
*pid
) {
3428 if (!pidref_is_set(pid
))
3431 u
= hashmap_get(m
->watch_pids
, pid
);
3435 array
= hashmap_get(m
->watch_pids_more
, pid
);
3442 Unit
* manager_get_unit_by_pidref(Manager
*m
, PidRef
*pid
) {
3447 /* Note that a process might be owned by multiple units, we return only one here, which is good
3448 * enough for most cases, though not strictly correct. We prefer the one reported by cgroup
3449 * membership, as that's the most relevant one as children of the process will be assigned to that
3450 * one, too, before all else. */
3452 if (!pidref_is_set(pid
))
3455 if (pidref_is_self(pid
))
3456 return hashmap_get(m
->units
, SPECIAL_INIT_SCOPE
);
3460 u
= manager_get_unit_by_pidref_cgroup(m
, pid
);
3464 u
= manager_get_unit_by_pidref_watching(m
, pid
);
3471 int unit_get_memory_available(Unit
*u
, uint64_t *ret
) {
3472 uint64_t available
= UINT64_MAX
, current
= 0;
3477 /* If data from cgroups can be accessed, try to find out how much more memory a unit can
3478 * claim before hitting the configured cgroup limits (if any). Consider both MemoryHigh
3479 * and MemoryMax, and also any slice the unit might be nested below. */
3482 uint64_t unit_available
, unit_limit
= UINT64_MAX
;
3483 CGroupContext
*unit_context
;
3485 /* No point in continuing if we can't go any lower */
3489 unit_context
= unit_get_cgroup_context(u
);
3493 (void) unit_get_memory_accounting(u
, CGROUP_MEMORY_CURRENT
, ¤t
);
3494 /* in case of error, previous current propagates as lower bound */
3496 if (unit_has_name(u
, SPECIAL_ROOT_SLICE
))
3497 unit_limit
= physical_memory();
3498 else if (unit_context
->memory_max
== UINT64_MAX
&& unit_context
->memory_high
== UINT64_MAX
)
3500 unit_limit
= MIN3(unit_limit
, unit_context
->memory_max
, unit_context
->memory_high
);
3502 unit_available
= LESS_BY(unit_limit
, current
);
3503 available
= MIN(unit_available
, available
);
3504 } while ((u
= UNIT_GET_SLICE(u
)));
3511 int unit_get_memory_accounting(Unit
*u
, CGroupMemoryAccountingMetric metric
, uint64_t *ret
) {
3513 static const char* const attributes_table
[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX
] = {
3514 [CGROUP_MEMORY_CURRENT
] = "memory.current",
3515 [CGROUP_MEMORY_PEAK
] = "memory.peak",
3516 [CGROUP_MEMORY_SWAP_CURRENT
] = "memory.swap.current",
3517 [CGROUP_MEMORY_SWAP_PEAK
] = "memory.swap.peak",
3518 [CGROUP_MEMORY_ZSWAP_CURRENT
] = "memory.zswap.current",
3522 bool updated
= false;
3526 assert(metric
>= 0);
3527 assert(metric
< _CGROUP_MEMORY_ACCOUNTING_METRIC_MAX
);
3529 if (!UNIT_CGROUP_BOOL(u
, memory_accounting
))
3532 /* The root cgroup doesn't expose this information. */
3533 if (unit_has_host_root_cgroup(u
)) {
3534 /* System-wide memory usage can be acquired from /proc/ */
3535 if (metric
== CGROUP_MEMORY_CURRENT
)
3536 return procfs_memory_get_used(ret
);
3541 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
3544 if (!crt
->cgroup_path
)
3545 /* If the cgroup is already gone, we try to find the last cached value. */
3548 if (!FLAGS_SET(crt
->cgroup_realized_mask
, CGROUP_MASK_MEMORY
))
3551 r
= cg_get_attribute_as_uint64("memory", crt
->cgroup_path
, attributes_table
[metric
], &bytes
);
3552 if (r
< 0 && r
!= -ENODATA
)
3557 if (metric
<= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST
) {
3558 uint64_t *last
= &crt
->memory_accounting_last
[metric
];
3562 else if (*last
!= UINT64_MAX
)
3567 } else if (!updated
)
3576 int unit_get_tasks_current(Unit
*u
, uint64_t *ret
) {
3580 if (!UNIT_CGROUP_BOOL(u
, tasks_accounting
))
3583 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
3584 if (!crt
|| !crt
->cgroup_path
)
3587 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
3588 if (unit_has_host_root_cgroup(u
))
3589 return procfs_tasks_get_current(ret
);
3591 if ((crt
->cgroup_realized_mask
& CGROUP_MASK_PIDS
) == 0)
3594 return cg_get_attribute_as_uint64("pids", crt
->cgroup_path
, "pids.current", ret
);
3597 static int unit_get_cpu_usage_raw(const Unit
*u
, const CGroupRuntime
*crt
, nsec_t
*ret
) {
3604 if (!crt
->cgroup_path
)
3607 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
3608 if (unit_has_host_root_cgroup(u
))
3609 return procfs_cpu_get_usage(ret
);
3611 _cleanup_free_
char *val
= NULL
;
3614 r
= cg_get_keyed_attribute("cpu", crt
->cgroup_path
, "cpu.stat", STRV_MAKE("usage_usec"), &val
);
3618 r
= safe_atou64(val
, &us
);
3622 *ret
= us
* NSEC_PER_USEC
;
3627 int unit_get_cpu_usage(Unit
*u
, nsec_t
*ret
) {
3633 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
3634 * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
3635 * call this function with a NULL return value. */
3637 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
3641 r
= unit_get_cpu_usage_raw(u
, crt
, &ns
);
3642 if (r
== -ENODATA
&& crt
->cpu_usage_last
!= NSEC_INFINITY
) {
3643 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
3647 *ret
= crt
->cpu_usage_last
;
3653 if (ns
> crt
->cpu_usage_base
)
3654 ns
-= crt
->cpu_usage_base
;
3658 crt
->cpu_usage_last
= ns
;
3665 int unit_get_ip_accounting(
3667 CGroupIPAccountingMetric metric
,
3674 assert(metric
>= 0);
3675 assert(metric
< _CGROUP_IP_ACCOUNTING_METRIC_MAX
);
3678 if (!UNIT_CGROUP_BOOL(u
, ip_accounting
))
3681 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
3685 fd
= IN_SET(metric
, CGROUP_IP_INGRESS_BYTES
, CGROUP_IP_INGRESS_PACKETS
) ?
3686 crt
->ip_accounting_ingress_map_fd
:
3687 crt
->ip_accounting_egress_map_fd
;
3691 if (IN_SET(metric
, CGROUP_IP_INGRESS_BYTES
, CGROUP_IP_EGRESS_BYTES
))
3692 r
= bpf_firewall_read_accounting(fd
, &value
, NULL
);
3694 r
= bpf_firewall_read_accounting(fd
, NULL
, &value
);
3698 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
3699 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
3700 * ip_accounting_extra[] field, and add them in here transparently. */
3702 *ret
= value
+ crt
->ip_accounting_extra
[metric
];
3707 static uint64_t unit_get_effective_limit_one(Unit
*u
, CGroupLimitType type
) {
3711 assert(UNIT_HAS_CGROUP_CONTEXT(u
));
3713 if (unit_has_name(u
, SPECIAL_ROOT_SLICE
))
3715 case CGROUP_LIMIT_MEMORY_MAX
:
3716 case CGROUP_LIMIT_MEMORY_HIGH
:
3717 return physical_memory();
3718 case CGROUP_LIMIT_TASKS_MAX
:
3719 return system_tasks_max();
3721 assert_not_reached();
3724 cc
= ASSERT_PTR(unit_get_cgroup_context(u
));
3726 case CGROUP_LIMIT_MEMORY_MAX
:
3727 return cc
->memory_max
;
3728 case CGROUP_LIMIT_MEMORY_HIGH
:
3729 return cc
->memory_high
;
3730 case CGROUP_LIMIT_TASKS_MAX
:
3731 return cgroup_tasks_max_resolve(&cc
->tasks_max
);
3733 assert_not_reached();
3737 int unit_get_effective_limit(Unit
*u
, CGroupLimitType type
, uint64_t *ret
) {
3743 assert(type
< _CGROUP_LIMIT_TYPE_MAX
);
3745 if (!UNIT_HAS_CGROUP_CONTEXT(u
))
3748 infimum
= unit_get_effective_limit_one(u
, type
);
3749 for (Unit
*slice
= UNIT_GET_SLICE(u
); slice
; slice
= UNIT_GET_SLICE(slice
))
3750 infimum
= MIN(infimum
, unit_get_effective_limit_one(slice
, type
));
3756 static int unit_get_io_accounting_raw(
3758 const CGroupRuntime
*crt
,
3759 uint64_t ret
[static _CGROUP_IO_ACCOUNTING_METRIC_MAX
]) {
3761 static const char* const field_names
[_CGROUP_IO_ACCOUNTING_METRIC_MAX
] = {
3762 [CGROUP_IO_READ_BYTES
] = "rbytes=",
3763 [CGROUP_IO_WRITE_BYTES
] = "wbytes=",
3764 [CGROUP_IO_READ_OPERATIONS
] = "rios=",
3765 [CGROUP_IO_WRITE_OPERATIONS
] = "wios=",
3768 uint64_t acc
[_CGROUP_IO_ACCOUNTING_METRIC_MAX
] = {};
3769 _cleanup_free_
char *path
= NULL
;
3770 _cleanup_fclose_
FILE *f
= NULL
;
3776 if (!crt
->cgroup_path
)
3779 if (unit_has_host_root_cgroup(u
))
3780 return -ENODATA
; /* TODO: return useful data for the top-level cgroup */
3782 if (!FLAGS_SET(crt
->cgroup_realized_mask
, CGROUP_MASK_IO
))
3785 r
= cg_get_path("io", crt
->cgroup_path
, "io.stat", &path
);
3789 f
= fopen(path
, "re");
3794 _cleanup_free_
char *line
= NULL
;
3797 r
= read_line(f
, LONG_LINE_MAX
, &line
);
3804 p
+= strcspn(p
, WHITESPACE
); /* Skip over device major/minor */
3805 p
+= strspn(p
, WHITESPACE
); /* Skip over following whitespace */
3808 _cleanup_free_
char *word
= NULL
;
3810 r
= extract_first_word(&p
, &word
, NULL
, EXTRACT_RETAIN_ESCAPE
);
3816 for (CGroupIOAccountingMetric i
= 0; i
< _CGROUP_IO_ACCOUNTING_METRIC_MAX
; i
++) {
3819 x
= startswith(word
, field_names
[i
]);
3823 r
= safe_atou64(x
, &w
);
3827 /* Sum up the stats of all devices */
3835 memcpy(ret
, acc
, sizeof(acc
));
3839 int unit_get_io_accounting(
3841 CGroupIOAccountingMetric metric
,
3844 uint64_t raw
[_CGROUP_IO_ACCOUNTING_METRIC_MAX
];
3848 * Retrieve an IO counter, subtracting the value of the counter value at the time the unit was started.
3849 * If ret == NULL and metric == _<...>_INVALID, no return value is expected (refresh the caches only).
3853 assert(metric
>= 0 || (!ret
&& metric
== _CGROUP_IO_ACCOUNTING_METRIC_INVALID
));
3854 assert(metric
< _CGROUP_IO_ACCOUNTING_METRIC_MAX
);
3856 if (!UNIT_CGROUP_BOOL(u
, io_accounting
))
3859 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
3863 r
= unit_get_io_accounting_raw(u
, crt
, raw
);
3864 if (r
== -ENODATA
&& metric
>= 0 && crt
->io_accounting_last
[metric
] != UINT64_MAX
)
3869 for (CGroupIOAccountingMetric i
= 0; i
< _CGROUP_IO_ACCOUNTING_METRIC_MAX
; i
++) {
3870 /* Saturated subtraction */
3871 if (raw
[i
] > crt
->io_accounting_base
[i
])
3872 crt
->io_accounting_last
[i
] = raw
[i
] - crt
->io_accounting_base
[i
];
3874 crt
->io_accounting_last
[i
] = 0;
3879 *ret
= crt
->io_accounting_last
[metric
];
3884 static int unit_reset_cpu_accounting(Unit
*unit
, CGroupRuntime
*crt
) {
3889 crt
->cpu_usage_base
= 0;
3890 crt
->cpu_usage_last
= NSEC_INFINITY
;
3893 r
= unit_get_cpu_usage_raw(unit
, crt
, &crt
->cpu_usage_base
);
3894 if (r
< 0 && r
!= -ENODATA
)
3901 static int unit_reset_io_accounting(Unit
*unit
, CGroupRuntime
*crt
) {
3906 zero(crt
->io_accounting_base
);
3907 FOREACH_ELEMENT(i
, crt
->io_accounting_last
)
3911 r
= unit_get_io_accounting_raw(unit
, crt
, crt
->io_accounting_base
);
3912 if (r
< 0 && r
!= -ENODATA
)
3919 static void cgroup_runtime_reset_memory_accounting_last(CGroupRuntime
*crt
) {
3922 FOREACH_ELEMENT(i
, crt
->memory_accounting_last
)
3926 static int cgroup_runtime_reset_ip_accounting(CGroupRuntime
*crt
) {
3931 if (crt
->ip_accounting_ingress_map_fd
>= 0)
3932 RET_GATHER(r
, bpf_firewall_reset_accounting(crt
->ip_accounting_ingress_map_fd
));
3934 if (crt
->ip_accounting_egress_map_fd
>= 0)
3935 RET_GATHER(r
, bpf_firewall_reset_accounting(crt
->ip_accounting_egress_map_fd
));
3937 zero(crt
->ip_accounting_extra
);
3942 int unit_reset_accounting(Unit
*u
) {
3947 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
3951 cgroup_runtime_reset_memory_accounting_last(crt
);
3952 RET_GATHER(r
, unit_reset_cpu_accounting(u
, crt
));
3953 RET_GATHER(r
, unit_reset_io_accounting(u
, crt
));
3954 RET_GATHER(r
, cgroup_runtime_reset_ip_accounting(crt
));
3959 void unit_invalidate_cgroup(Unit
*u
, CGroupMask m
) {
3962 if (!UNIT_HAS_CGROUP_CONTEXT(u
))
3965 CGroupRuntime
*crt
= unit_get_cgroup_runtime(u
);
3969 if (FLAGS_SET(crt
->cgroup_invalidated_mask
, m
)) /* NOP? */
3972 crt
->cgroup_invalidated_mask
|= m
;
3973 unit_add_to_cgroup_realize_queue(u
);
void unit_invalidate_cgroup_bpf(Unit *u) {

        if (!UNIT_HAS_CGROUP_CONTEXT(u))

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);

        if (crt->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */

        crt->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
        unit_add_to_cgroup_realize_queue(u);

        /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the
         * IP access list of our children includes our own. */
        if (u->type == UNIT_SLICE) {
                UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
                        unit_invalidate_cgroup_bpf(member);
void unit_cgroup_catchup(Unit *u) {

        if (!UNIT_HAS_CGROUP_CONTEXT(u))

        /* We dropped the inotify watch during reexec/reload, so we need to check these as they may have
         * changed. Note that (currently) the kernel doesn't actually update cgroup file modification
         * times, so we can't just serialize and then check the mtime for file(s) we are interested in. */
        (void) unit_check_cgroup_events(u);
        unit_add_to_cgroup_oom_queue(u);
bool unit_cgroup_delegate(Unit *u) {

        if (!UNIT_VTABLE(u)->can_delegate)

        c = unit_get_cgroup_context(u);
void manager_invalidate_startup_units(Manager *m) {

        SET_FOREACH(u, m->startup_units)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO|CGROUP_MASK_CPUSET);
static int unit_cgroup_freezer_kernel_state(Unit *u, FreezerState *ret) {
        _cleanup_free_ char *val = NULL;

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)

        r = cg_get_keyed_attribute(
                        SYSTEMD_CGROUP_CONTROLLER,
                        STRV_MAKE("frozen"),

        if (streq(val, "0"))
                s = FREEZER_RUNNING;
        else if (streq(val, "1"))

                log_unit_debug(u, "Unexpected cgroup frozen state: %s", val);
                s = _FREEZER_STATE_INVALID;
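
/* For orientation (assumption based on the cgroup v2 interface, not shown in this excerpt): the kernel
 * exposes the freezer state as a keyed "frozen" entry, e.g. an events file containing
 *
 *     populated 1
 *     frozen 0
 *
 * The helper above maps "0" to FREEZER_RUNNING, "1" presumably to FREEZER_FROZEN, and anything else to
 * _FREEZER_STATE_INVALID. */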
int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
        _cleanup_free_ char *path = NULL;
        FreezerState current, next, objective;
        bool action_in_progress = false;

        assert(action >= 0);
        assert(action < _FREEZER_ACTION_MAX);

        unit_next_freezer_state(u, action, &next, &objective);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                /* No realized cgroup = nothing to freeze */

        r = unit_cgroup_freezer_kernel_state(u, &current);

        if (current == objective) {
                if (objective == FREEZER_FROZEN)

                /* Skip thaw only if no freeze operation was in flight */
                if (IN_SET(u->freezer_state, FREEZER_RUNNING, FREEZER_THAWING))

        action_in_progress = true;

        if (next == freezer_state_finish(next)) {
                /* We're directly transitioning into a finished state, which in theory means that the
                 * cgroup's current state already matches the objective and thus we'd return 0. But reality
                 * shows otherwise (such a case would have been handled by the current == objective branch
                 * above). This indicates that our freezer_state tracking has diverged from the real state
                 * of the cgroup, which can happen if someone meddles with the cgroup from underneath us.
                 * This really shouldn't happen during normal operation, though. So, let's warn about it and
                 * fix up the state to be valid. */

                log_unit_warning(u, "Unit wants to transition to %s freezer state but cgroup is unexpectedly %s, fixing up.",
                                 freezer_state_to_string(next), freezer_state_to_string(current) ?: "(invalid)");

                if (next == FREEZER_FROZEN)
                        next = FREEZER_FREEZING;
                else if (next == FREEZER_FROZEN_BY_PARENT)
                        next = FREEZER_FREEZING_BY_PARENT;
                else if (next == FREEZER_RUNNING)
                        next = FREEZER_THAWING;
                else
                        assert_not_reached();
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "cgroup.freeze", &path);

        r = write_string_file(path, one_zero(objective == FREEZER_FROZEN), WRITE_STRING_FILE_DISABLE_BUFFER);

        if (action_in_progress)
                unit_set_freezer_state(u, next);
        else
                unit_set_freezer_state(u, freezer_state_finish(next));

        return action_in_progress;
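
/* Editorial note: per the cgroup v2 interface, writing "1" to cgroup.freeze requests a freeze and "0"
 * requests a thaw, which is why the write above uses one_zero(objective == FREEZER_FROZEN). The kernel
 * applies the change asynchronously; completion is observed later via the "frozen" key read by
 * unit_cgroup_freezer_kernel_state(), hence the intermediate FREEZING/THAWING states. */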
int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
        _cleanup_free_ char *v = NULL;

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)

        if ((crt->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0)

        r = cg_get_attribute("cpuset", crt->cgroup_path, name, &v);

        return parse_cpu_set(v, cpus);
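
/* Editorial example (attribute name and value are illustrative): callers typically pass a cpuset
 * attribute such as "cpuset.cpus.effective", whose content is a range list like "0-3,8-11" that
 * parse_cpu_set() converts into the CPUSet bitmap. */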
CGroupRuntime* cgroup_runtime_new(void) {
        _cleanup_(cgroup_runtime_freep) CGroupRuntime *crt = NULL;

        crt = new(CGroupRuntime, 1);

        *crt = (CGroupRuntime) {
                .cgroup_control_inotify_wd = -1,
                .cgroup_memory_inotify_wd = -1,

                .ip_accounting_ingress_map_fd = -EBADF,
                .ip_accounting_egress_map_fd = -EBADF,

                .ipv4_allow_map_fd = -EBADF,
                .ipv6_allow_map_fd = -EBADF,
                .ipv4_deny_map_fd = -EBADF,
                .ipv6_deny_map_fd = -EBADF,

                .cgroup_invalidated_mask = _CGROUP_MASK_ALL,

                .deserialized_cgroup_realized = -1,
        };

        unit_reset_cpu_accounting(/* unit = */ NULL, crt);
        unit_reset_io_accounting(/* unit = */ NULL, crt);
        cgroup_runtime_reset_memory_accounting_last(crt);
        assert_se(cgroup_runtime_reset_ip_accounting(crt) >= 0);

        return TAKE_PTR(crt);
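
/* Editorial note: the fd-typed fields start out as -EBADF and the inotify watch descriptors as -1, i.e.
 * explicitly invalid, so cgroup_runtime_free() and the close/detach helpers can run safely on a partially
 * set up object. The invalidated mask starts as _CGROUP_MASK_ALL so the first realization (re)applies all
 * attributes. */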
CGroupRuntime* cgroup_runtime_free(CGroupRuntime *crt) {

        fdset_free(crt->initial_socket_bind_link_fds);

        bpf_link_free(crt->ipv4_socket_bind_link);
        bpf_link_free(crt->ipv6_socket_bind_link);

        hashmap_free(crt->bpf_foreign_by_key);

        bpf_program_free(crt->bpf_device_control_installed);

        bpf_link_free(crt->restrict_ifaces_ingress_bpf_link);
        bpf_link_free(crt->restrict_ifaces_egress_bpf_link);

        fdset_free(crt->initial_restrict_ifaces_link_fds);

        bpf_firewall_close(crt);

        free(crt->cgroup_path);
static const char* const ip_accounting_metric_field_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
        [CGROUP_IP_INGRESS_BYTES]   = "ip-accounting-ingress-bytes",
        [CGROUP_IP_INGRESS_PACKETS] = "ip-accounting-ingress-packets",
        [CGROUP_IP_EGRESS_BYTES]    = "ip-accounting-egress-bytes",
        [CGROUP_IP_EGRESS_PACKETS]  = "ip-accounting-egress-packets",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP(ip_accounting_metric_field, CGroupIPAccountingMetric);

static const char* const io_accounting_metric_field_base_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
        [CGROUP_IO_READ_BYTES]       = "io-accounting-read-bytes-base",
        [CGROUP_IO_WRITE_BYTES]      = "io-accounting-write-bytes-base",
        [CGROUP_IO_READ_OPERATIONS]  = "io-accounting-read-operations-base",
        [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-base",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_base, CGroupIOAccountingMetric);

static const char* const io_accounting_metric_field_last_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
        [CGROUP_IO_READ_BYTES]       = "io-accounting-read-bytes-last",
        [CGROUP_IO_WRITE_BYTES]      = "io-accounting-write-bytes-last",
        [CGROUP_IO_READ_OPERATIONS]  = "io-accounting-read-operations-last",
        [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-last",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_last, CGroupIOAccountingMetric);

static const char* const memory_accounting_metric_field_last_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1] = {
        [CGROUP_MEMORY_PEAK]      = "memory-accounting-peak",
        [CGROUP_MEMORY_SWAP_PEAK] = "memory-accounting-swap-peak",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP(memory_accounting_metric_field_last, CGroupMemoryAccountingMetric);
static int serialize_cgroup_mask(FILE *f, const char *key, CGroupMask mask) {
        _cleanup_free_ char *s = NULL;

        r = cg_mask_to_string(mask, &s);
        if (r < 0)
                return log_error_errno(r, "Failed to format cgroup mask: %m");

        return serialize_item(f, key, s);
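
/* Editorial example (assumed rendering): cg_mask_to_string() formats the mask as a space-separated list
 * of controller names, so a serialized entry produced here might look like
 *
 *     cgroup-realized-mask=cpu io memory pids
 */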
int cgroup_runtime_serialize(Unit *u, FILE *f, FDSet *fds) {

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);

        (void) serialize_item_format(f, "cpu-usage-base", "%" PRIu64, crt->cpu_usage_base);
        if (crt->cpu_usage_last != NSEC_INFINITY)
                (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, crt->cpu_usage_last);

        if (crt->managed_oom_kill_last > 0)
                (void) serialize_item_format(f, "managed-oom-kill-last", "%" PRIu64, crt->managed_oom_kill_last);

        if (crt->oom_kill_last > 0)
                (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, crt->oom_kill_last);

        for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++) {

                r = unit_get_memory_accounting(u, metric, &v);
                        (void) serialize_item_format(f, memory_accounting_metric_field_last_to_string(metric), "%" PRIu64, v);

        for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) {

                r = unit_get_ip_accounting(u, m, &v);
                        (void) serialize_item_format(f, ip_accounting_metric_field_to_string(m), "%" PRIu64, v);

        for (CGroupIOAccountingMetric im = 0; im < _CGROUP_IO_ACCOUNTING_METRIC_MAX; im++) {
                (void) serialize_item_format(f, io_accounting_metric_field_base_to_string(im), "%" PRIu64, crt->io_accounting_base[im]);

                if (crt->io_accounting_last[im] != UINT64_MAX)
                        (void) serialize_item_format(f, io_accounting_metric_field_last_to_string(im), "%" PRIu64, crt->io_accounting_last[im]);

        if (crt->cgroup_path)
                (void) serialize_item(f, "cgroup", crt->cgroup_path);
        if (crt->cgroup_id != 0)
                (void) serialize_item_format(f, "cgroup-id", "%" PRIu64, crt->cgroup_id);

        (void) serialize_cgroup_mask(f, "cgroup-realized-mask", crt->cgroup_realized_mask);
        (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", crt->cgroup_enabled_mask);
        (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", crt->cgroup_invalidated_mask);

        (void) bpf_socket_bind_serialize(u, f, fds);

        (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-ingress-installed", crt->ip_bpf_ingress_installed);
        (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-egress-installed", crt->ip_bpf_egress_installed);
        (void) bpf_program_serialize_attachment(f, fds, "bpf-device-control-installed", crt->bpf_device_control_installed);
        (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-ingress-installed", crt->ip_bpf_custom_ingress_installed);
        (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-egress-installed", crt->ip_bpf_custom_egress_installed);

        (void) bpf_restrict_ifaces_serialize(u, f, fds);
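
/* Editorial example (values invented for illustration): the calls above append plain "key=value" lines to
 * the manager's serialization stream, e.g.
 *
 *     cpu-usage-base=183220011
 *     io-accounting-read-bytes-base=4096
 *     cgroup=/system.slice/example.service
 *     cgroup-id=28739
 *
 * cgroup_runtime_deserialize_one() below consumes these entries one key at a time after daemon-reexec or
 * daemon-reload. */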
#define MATCH_DESERIALIZE(u, key, l, v, parse_func, target)                             \
        ({                                                                              \
                bool _deserialize_matched = streq(l, key);                              \
                if (_deserialize_matched) {                                             \
                        CGroupRuntime *crt = unit_setup_cgroup_runtime(u);              \
                        int _deserialize_r = parse_func(v);                             \
                        if (_deserialize_r < 0)                                         \
                                log_unit_debug_errno(u, _deserialize_r,                 \
                                                     "Failed to parse \"%s=%s\", ignoring.", l, v); \
                        else                                                            \
                                crt->target = _deserialize_r;                           \
                }                                                                       \
                _deserialize_matched;                                                   \
        })

#define MATCH_DESERIALIZE_IMMEDIATE(u, key, l, v, parse_func, target)                   \
        ({                                                                              \
                bool _deserialize_matched = streq(l, key);                              \
                if (_deserialize_matched) {                                             \
                        CGroupRuntime *crt = unit_setup_cgroup_runtime(u);              \
                        int _deserialize_r = parse_func(v, &crt->target);               \
                        if (_deserialize_r < 0)                                         \
                                log_unit_debug_errno(u, _deserialize_r,                 \
                                                     "Failed to parse \"%s=%s\", ignoring.", l, v); \
                }                                                                       \
                _deserialize_matched;                                                   \
        })

#define MATCH_DESERIALIZE_METRIC(u, key, l, v, parse_func, target)                      \
        ({                                                                              \
                bool _deserialize_matched = streq(l, key);                              \
                if (_deserialize_matched) {                                             \
                        CGroupRuntime *crt = unit_setup_cgroup_runtime(u);              \
                        int _deserialize_r = parse_func(v);                             \
                        if (_deserialize_r < 0)                                         \
                                log_unit_debug_errno(u, _deserialize_r,                 \
                                                     "Failed to parse \"%s=%s\", ignoring.", l, v); \
                        else                                                            \
                                crt->target = _deserialize_r;                           \
                }                                                                       \
                _deserialize_matched;                                                   \
        })
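
/* Editorial usage sketch (mirrors the calls below): each macro evaluates to true when "l" equals "key",
 * lazily allocating the CGroupRuntime and storing the parsed value as a side effect, e.g.
 *
 *     if (MATCH_DESERIALIZE_IMMEDIATE(u, "cpu-usage-last", key, value, safe_atou64, cpu_usage_last))
 *             return 1;
 *
 * The _IMMEDIATE variant parses straight into &crt->target, while the other two assign the parse
 * function's non-negative result to crt->target. */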
int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value, FDSet *fds) {

        if (!UNIT_HAS_CGROUP_CONTEXT(u))

        if (MATCH_DESERIALIZE_IMMEDIATE(u, "cpu-usage-base", key, value, safe_atou64, cpu_usage_base) ||
            MATCH_DESERIALIZE_IMMEDIATE(u, "cpuacct-usage-base", key, value, safe_atou64, cpu_usage_base))

        if (MATCH_DESERIALIZE_IMMEDIATE(u, "cpu-usage-last", key, value, safe_atou64, cpu_usage_last))

        if (MATCH_DESERIALIZE_IMMEDIATE(u, "managed-oom-kill-last", key, value, safe_atou64, managed_oom_kill_last))

        if (MATCH_DESERIALIZE_IMMEDIATE(u, "oom-kill-last", key, value, safe_atou64, oom_kill_last))

        if (streq(key, "cgroup")) {
                r = unit_set_cgroup_path(u, value);
                        log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", value);

        if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-id", key, value, safe_atou64, cgroup_id))

        if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-realized", key, value, parse_tristate, deserialized_cgroup_realized))

        if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-realized-mask", key, value, cg_mask_from_string, cgroup_realized_mask))

        if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-enabled-mask", key, value, cg_mask_from_string, cgroup_enabled_mask))

        if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-invalidated-mask", key, value, cg_mask_from_string, cgroup_invalidated_mask))

        if (STR_IN_SET(key, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) {

                fd = deserialize_fd(fds, value);

                (void) bpf_socket_bind_add_initial_link_fd(u, fd);

        if (STR_IN_SET(key,
                       "ip-bpf-ingress-installed", "ip-bpf-egress-installed",
                       "bpf-device-control-installed",
                       "ip-bpf-custom-ingress-installed", "ip-bpf-custom-egress-installed")) {

                CGroupRuntime *crt = unit_setup_cgroup_runtime(u);

                if (streq(key, "ip-bpf-ingress-installed"))
                        (void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_ingress_installed);

                if (streq(key, "ip-bpf-egress-installed"))
                        (void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_egress_installed);

                if (streq(key, "bpf-device-control-installed"))
                        (void) bpf_program_deserialize_attachment(value, fds, &crt->bpf_device_control_installed);

                if (streq(key, "ip-bpf-custom-ingress-installed"))
                        (void) bpf_program_deserialize_attachment_set(value, fds, &crt->ip_bpf_custom_ingress_installed);

                if (streq(key, "ip-bpf-custom-egress-installed"))
                        (void) bpf_program_deserialize_attachment_set(value, fds, &crt->ip_bpf_custom_egress_installed);

        if (streq(key, "restrict-ifaces-bpf-fd")) {

                fd = deserialize_fd(fds, value);

                (void) bpf_restrict_ifaces_add_initial_link_fd(u, fd);

        CGroupMemoryAccountingMetric mm = memory_accounting_metric_field_last_from_string(key);

                r = safe_atou64(value, &c);
                        log_unit_debug(u, "Failed to parse memory accounting last value %s, ignoring.", value);

                CGroupRuntime *crt = unit_setup_cgroup_runtime(u);

                crt->memory_accounting_last[mm] = c;

        CGroupIPAccountingMetric ipm = ip_accounting_metric_field_from_string(key);

                r = safe_atou64(value, &c);
                        log_unit_debug(u, "Failed to parse IP accounting value %s, ignoring.", value);

                CGroupRuntime *crt = unit_setup_cgroup_runtime(u);

                crt->ip_accounting_extra[ipm] = c;

        CGroupIOAccountingMetric iom = io_accounting_metric_field_base_from_string(key);

                r = safe_atou64(value, &c);
                        log_unit_debug(u, "Failed to parse IO accounting base value %s, ignoring.", value);

                CGroupRuntime *crt = unit_setup_cgroup_runtime(u);

                crt->io_accounting_base[iom] = c;

        iom = io_accounting_metric_field_last_from_string(key);

                r = safe_atou64(value, &c);
                        log_unit_debug(u, "Failed to parse IO accounting last value %s, ignoring.", value);

                CGroupRuntime *crt = unit_setup_cgroup_runtime(u);

                crt->io_accounting_last[iom] = c;
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_DEVICE_POLICY_AUTO]   = "auto",
        [CGROUP_DEVICE_POLICY_CLOSED] = "closed",
        [CGROUP_DEVICE_POLICY_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);

static const char* const cgroup_pressure_watch_table[_CGROUP_PRESSURE_WATCH_MAX] = {
        [CGROUP_PRESSURE_WATCH_NO]   = "no",
        [CGROUP_PRESSURE_WATCH_YES]  = "yes",
        [CGROUP_PRESSURE_WATCH_AUTO] = "auto",
        [CGROUP_PRESSURE_WATCH_SKIP] = "skip",
};

DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(cgroup_pressure_watch, CGroupPressureWatch, CGROUP_PRESSURE_WATCH_YES);

static const char* const cgroup_ip_accounting_metric_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
        [CGROUP_IP_INGRESS_BYTES]   = "IPIngressBytes",
        [CGROUP_IP_EGRESS_BYTES]    = "IPEgressBytes",
        [CGROUP_IP_INGRESS_PACKETS] = "IPIngressPackets",
        [CGROUP_IP_EGRESS_PACKETS]  = "IPEgressPackets",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_ip_accounting_metric, CGroupIPAccountingMetric);

static const char* const cgroup_io_accounting_metric_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
        [CGROUP_IO_READ_BYTES]       = "IOReadBytes",
        [CGROUP_IO_WRITE_BYTES]      = "IOWriteBytes",
        [CGROUP_IO_READ_OPERATIONS]  = "IOReadOperations",
        [CGROUP_IO_WRITE_OPERATIONS] = "IOWriteOperations",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_io_accounting_metric, CGroupIOAccountingMetric);

static const char* const cgroup_memory_accounting_metric_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX] = {
        [CGROUP_MEMORY_CURRENT]       = "MemoryCurrent",
        [CGROUP_MEMORY_PEAK]          = "MemoryPeak",
        [CGROUP_MEMORY_SWAP_CURRENT]  = "MemorySwapCurrent",
        [CGROUP_MEMORY_SWAP_PEAK]     = "MemorySwapPeak",
        [CGROUP_MEMORY_ZSWAP_CURRENT] = "MemoryZSwapCurrent",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_memory_accounting_metric, CGroupMemoryAccountingMetric);

static const char* const cgroup_effective_limit_type_table[_CGROUP_LIMIT_TYPE_MAX] = {
        [CGROUP_LIMIT_MEMORY_MAX]  = "EffectiveMemoryMax",
        [CGROUP_LIMIT_MEMORY_HIGH] = "EffectiveMemoryHigh",
        [CGROUP_LIMIT_TASKS_MAX]   = "EffectiveTasksMax",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_effective_limit_type, CGroupLimitType);