1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
7 #include "cgroup-util.h"
8 #include "cpu-set-util.h"
9 #include "firewall-util.h"
12 #include "time-util.h"
14 typedef struct CGroupTasksMax
{
15 /* If scale == 0, just use value; otherwise, value / scale.
16 * See tasks_max_resolve(). */
/* Compound literal for a CGroupTasksMax with .value = UINT64_MAX and .scale = 0. This is exactly the
 * combination cgroup_tasks_max_isset() reports as "not set". */
#define CGROUP_TASKS_MAX_UNSET ((CGroupTasksMax) { .value = UINT64_MAX, .scale = 0 })
23 static inline bool cgroup_tasks_max_isset(const CGroupTasksMax
*tasks_max
) {
24 return tasks_max
->value
!= UINT64_MAX
|| tasks_max
->scale
!= 0;
27 uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax
*tasks_max
);
/* Forward declarations: introduce the typedef names up front so that the structure definitions
 * further down (and any prototypes) can refer to these types through pointers before the bodies
 * are seen. */
typedef struct CGroupContext CGroupContext;
typedef struct CGroupDeviceAllow CGroupDeviceAllow;
typedef struct CGroupIODeviceWeight CGroupIODeviceWeight;
typedef struct CGroupIODeviceLimit CGroupIODeviceLimit;
typedef struct CGroupIODeviceLatency CGroupIODeviceLatency;
typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight;
typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth;
typedef struct CGroupBPFForeignProgram CGroupBPFForeignProgram;
typedef struct CGroupSocketBindItem CGroupSocketBindItem;
/* Device access policy of a cgroup. The three real policies are numbered from 0 upwards,
 * _CGROUP_DEVICE_POLICY_MAX is the count of real policies, and the _INVALID member is the negative
 * errno value -EINVAL. */
typedef enum CGroupDevicePolicy {
        /* When devices listed, will allow those, plus built-in ones, if none are listed will allow
         * everything.
         * NOTE(review): the tail of this sentence was cut off in the source as received (which left
         * the comment unterminated); "everything" is a reconstruction -- confirm. */
        CGROUP_DEVICE_POLICY_AUTO,

        /* Everything forbidden, except built-in ones and listed ones. */
        CGROUP_DEVICE_POLICY_CLOSED,

        /* Everything forbidden, except for the listed devices */
        CGROUP_DEVICE_POLICY_STRICT,

        _CGROUP_DEVICE_POLICY_MAX,
        _CGROUP_DEVICE_POLICY_INVALID = -EINVAL,
} CGroupDevicePolicy;
54 typedef enum FreezerAction
{
59 _FREEZER_ACTION_INVALID
= -EINVAL
,
/* Bit flags describing the kinds of access in a device rule. Each real permission occupies one bit;
 * the flags can therefore be OR-ed together. */
typedef enum CGroupDevicePermissions {
        /* We reuse the same bit meanings the kernel's BPF_DEVCG_ACC_xyz definitions use */
        CGROUP_DEVICE_MKNOD                = 1 << 0,
        CGROUP_DEVICE_READ                 = 1 << 1,
        CGROUP_DEVICE_WRITE                = 1 << 2,
        /* First bit past the valid flags; subtracting 1 yields the mask of all valid flags. */
        _CGROUP_DEVICE_PERMISSIONS_MAX     = 1 << 3,
        _CGROUP_DEVICE_PERMISSIONS_ALL     = _CGROUP_DEVICE_PERMISSIONS_MAX - 1,
        _CGROUP_DEVICE_PERMISSIONS_INVALID = -EINVAL,
} CGroupDevicePermissions;
72 struct CGroupDeviceAllow
{
73 LIST_FIELDS(CGroupDeviceAllow
, device_allow
);
75 CGroupDevicePermissions permissions
;
78 struct CGroupIODeviceWeight
{
79 LIST_FIELDS(CGroupIODeviceWeight
, device_weights
);
84 struct CGroupIODeviceLimit
{
85 LIST_FIELDS(CGroupIODeviceLimit
, device_limits
);
87 uint64_t limits
[_CGROUP_IO_LIMIT_TYPE_MAX
];
90 struct CGroupIODeviceLatency
{
91 LIST_FIELDS(CGroupIODeviceLatency
, device_latencies
);
96 struct CGroupBlockIODeviceWeight
{
97 LIST_FIELDS(CGroupBlockIODeviceWeight
, device_weights
);
102 struct CGroupBlockIODeviceBandwidth
{
103 LIST_FIELDS(CGroupBlockIODeviceBandwidth
, device_bandwidths
);
109 struct CGroupBPFForeignProgram
{
110 LIST_FIELDS(CGroupBPFForeignProgram
, programs
);
111 uint32_t attach_type
;
115 struct CGroupSocketBindItem
{
116 LIST_FIELDS(CGroupSocketBindItem
, socket_bind_items
);
/* Modes for the memory pressure watch setting. Real modes are numbered from 0,
 * _CGROUP_PRESSURE_WATCH_MAX is their count, and the _INVALID member is -EINVAL. */
typedef enum CGroupPressureWatch {
        CGROUP_PRESSURE_WATCH_OFF,      /* → tells the service payload explicitly not to watch for memory pressure */
        CGROUP_PRESSURE_WATCH_AUTO,     /* → on if memory account is on anyway for the unit, otherwise off */
        CGROUP_PRESSURE_WATCH_ON,
        CGROUP_PRESSURE_WATCH_SKIP,     /* → doesn't set up memory pressure watch, but also doesn't explicitly tell payload to avoid it */
        _CGROUP_PRESSURE_WATCH_MAX,
        _CGROUP_PRESSURE_WATCH_INVALID = -EINVAL,
} CGroupPressureWatch;
132 /* When adding members make sure to update cgroup_context_copy() accordingly */
133 struct CGroupContext
{
136 bool blockio_accounting
;
137 bool memory_accounting
;
138 bool tasks_accounting
;
141 /* Configures the memory.oom.group attribute (on unified) */
142 bool memory_oom_group
;
145 CGroupMask delegate_controllers
;
146 CGroupMask disable_controllers
;
147 char *delegate_subgroup
;
149 /* For unified hierarchy */
151 uint64_t startup_cpu_weight
;
152 usec_t cpu_quota_per_sec_usec
;
153 usec_t cpu_quota_period_usec
;
156 CPUSet startup_cpuset_cpus
;
158 CPUSet startup_cpuset_mems
;
161 uint64_t startup_io_weight
;
162 LIST_HEAD(CGroupIODeviceWeight
, io_device_weights
);
163 LIST_HEAD(CGroupIODeviceLimit
, io_device_limits
);
164 LIST_HEAD(CGroupIODeviceLatency
, io_device_latencies
);
166 uint64_t default_memory_min
;
167 uint64_t default_memory_low
;
168 uint64_t default_startup_memory_low
;
171 uint64_t startup_memory_low
;
172 uint64_t memory_high
;
173 uint64_t startup_memory_high
;
175 uint64_t startup_memory_max
;
176 uint64_t memory_swap_max
;
177 uint64_t startup_memory_swap_max
;
178 uint64_t memory_zswap_max
;
179 uint64_t startup_memory_zswap_max
;
181 bool default_memory_min_set
:1;
182 bool default_memory_low_set
:1;
183 bool default_startup_memory_low_set
:1;
184 bool memory_min_set
:1;
185 bool memory_low_set
:1;
186 bool startup_memory_low_set
:1;
187 bool startup_memory_high_set
:1;
188 bool startup_memory_max_set
:1;
189 bool startup_memory_swap_max_set
:1;
190 bool startup_memory_zswap_max_set
:1;
192 Set
*ip_address_allow
;
193 Set
*ip_address_deny
;
194 /* These two flags indicate that redundant entries have been removed from
195 * ip_address_allow/ip_address_deny, i.e. in_addr_prefixes_reduce() has already been called. */
196 bool ip_address_allow_reduced
;
197 bool ip_address_deny_reduced
;
199 char **ip_filters_ingress
;
200 char **ip_filters_egress
;
201 LIST_HEAD(CGroupBPFForeignProgram
, bpf_foreign_programs
);
203 Set
*restrict_network_interfaces
;
204 bool restrict_network_interfaces_is_allow_list
;
206 /* For legacy hierarchies */
208 uint64_t startup_cpu_shares
;
210 uint64_t blockio_weight
;
211 uint64_t startup_blockio_weight
;
212 LIST_HEAD(CGroupBlockIODeviceWeight
, blockio_device_weights
);
213 LIST_HEAD(CGroupBlockIODeviceBandwidth
, blockio_device_bandwidths
);
215 uint64_t memory_limit
;
217 CGroupDevicePolicy device_policy
;
218 LIST_HEAD(CGroupDeviceAllow
, device_allow
);
220 LIST_HEAD(CGroupSocketBindItem
, socket_bind_allow
);
221 LIST_HEAD(CGroupSocketBindItem
, socket_bind_deny
);
224 CGroupTasksMax tasks_max
;
226 /* Settings for systemd-oomd */
227 ManagedOOMMode moom_swap
;
228 ManagedOOMMode moom_mem_pressure
;
229 uint32_t moom_mem_pressure_limit
; /* Normalized to 2^32-1 == 100% */
230 ManagedOOMPreference moom_preference
;
232 /* Memory pressure logic */
233 CGroupPressureWatch memory_pressure_watch
;
234 usec_t memory_pressure_threshold_usec
;
235 /* NB: For now we don't make the period configurable, not the type, nor do we allow multiple
236 * triggers, nor triggers for non-memory pressure. We might add that later. */
238 NFTSetContext nft_set_context
;
240 /* Forward coredumps for processes that crash within this cgroup.
241 * Requires 'delegate' to also be true. */
242 bool coredump_receive
;
/* Used when querying IP accounting data. Selects one of four counters (bytes or packets, ingress
 * or egress); _CGROUP_IP_ACCOUNTING_METRIC_MAX is the number of counters. */
typedef enum CGroupIPAccountingMetric {
        CGROUP_IP_INGRESS_BYTES,
        CGROUP_IP_INGRESS_PACKETS,
        CGROUP_IP_EGRESS_BYTES,
        CGROUP_IP_EGRESS_PACKETS,
        _CGROUP_IP_ACCOUNTING_METRIC_MAX,
        _CGROUP_IP_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupIPAccountingMetric;
/* Used when querying IO accounting data. Selects one of four counters (bytes or operations, read
 * or write); _CGROUP_IO_ACCOUNTING_METRIC_MAX is the number of counters. */
typedef enum CGroupIOAccountingMetric {
        CGROUP_IO_READ_BYTES,
        CGROUP_IO_WRITE_BYTES,
        CGROUP_IO_READ_OPERATIONS,
        CGROUP_IO_WRITE_OPERATIONS,
        _CGROUP_IO_ACCOUNTING_METRIC_MAX,
        _CGROUP_IO_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupIOAccountingMetric;
/* Selects a memory accounting value. The members up to and including
 * _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST are the cached ones; the members after it are not.
 *
 * NOTE(review): the source as received skips one line right after the opening brace, and the
 * comment below speaks of "attributes" in the plural while only one cached member is visible. An
 * enumerator may have been lost ahead of CGROUP_MEMORY_SWAP_PEAK -- confirm before relying on the
 * numeric values. */
typedef enum CGroupMemoryAccountingMetric {
        CGROUP_MEMORY_SWAP_PEAK,
        /* We cache the above attributes, so that they can be fetched even after the cgroup is gone, e.g.
         * when systemd-run exits. */
        _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST = CGROUP_MEMORY_SWAP_PEAK,

        /* These attributes are transient, so no need for caching. */
        CGROUP_MEMORY_SWAP_CURRENT,
        CGROUP_MEMORY_ZSWAP_CURRENT,

        _CGROUP_MEMORY_ACCOUNTING_METRIC_MAX,
        _CGROUP_MEMORY_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupMemoryAccountingMetric;
/* Used for limits whose value sets have infimum. _CGROUP_LIMIT_TYPE_MAX is the number of limit
 * types; _CGROUP_LIMIT_INVALID is -EINVAL. */
typedef enum CGroupLimitType {
        CGROUP_LIMIT_MEMORY_MAX,
        CGROUP_LIMIT_MEMORY_HIGH,
        CGROUP_LIMIT_TASKS_MAX,
        _CGROUP_LIMIT_TYPE_MAX,
        _CGROUP_LIMIT_INVALID = -EINVAL,
} CGroupLimitType;
/* Forward declarations of types defined elsewhere, so the prototypes below can name them.
 * Note that forward-declaring an enum (ManagerState) relies on a compiler extension. */
typedef struct Unit Unit;
typedef struct Manager Manager;
typedef enum ManagerState ManagerState;
293 uint64_t cgroup_context_cpu_weight(CGroupContext
*c
, ManagerState state
);
295 usec_t
cgroup_cpu_adjust_period(usec_t period
, usec_t quota
, usec_t resolution
, usec_t max_period
);
297 void cgroup_context_init(CGroupContext
*c
);
298 int cgroup_context_copy(CGroupContext
*dst
, const CGroupContext
*src
);
299 void cgroup_context_done(CGroupContext
*c
);
300 void cgroup_context_dump(Unit
*u
, FILE* f
, const char *prefix
);
301 void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem
*item
, FILE *f
);
302 void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem
*items
, FILE *f
);
304 void cgroup_context_free_device_allow(CGroupContext
*c
, CGroupDeviceAllow
*a
);
305 void cgroup_context_free_io_device_weight(CGroupContext
*c
, CGroupIODeviceWeight
*w
);
306 void cgroup_context_free_io_device_limit(CGroupContext
*c
, CGroupIODeviceLimit
*l
);
307 void cgroup_context_free_io_device_latency(CGroupContext
*c
, CGroupIODeviceLatency
*l
);
308 void cgroup_context_free_blockio_device_weight(CGroupContext
*c
, CGroupBlockIODeviceWeight
*w
);
309 void cgroup_context_free_blockio_device_bandwidth(CGroupContext
*c
, CGroupBlockIODeviceBandwidth
*b
);
310 void cgroup_context_remove_bpf_foreign_program(CGroupContext
*c
, CGroupBPFForeignProgram
*p
);
311 void cgroup_context_remove_socket_bind(CGroupSocketBindItem
**head
);
313 static inline bool cgroup_context_want_memory_pressure(const CGroupContext
*c
) {
316 return c
->memory_pressure_watch
== CGROUP_PRESSURE_WATCH_ON
||
317 (c
->memory_pressure_watch
== CGROUP_PRESSURE_WATCH_AUTO
&& c
->memory_accounting
);
320 int cgroup_context_add_device_allow(CGroupContext
*c
, const char *dev
, CGroupDevicePermissions p
);
321 int cgroup_context_add_or_update_device_allow(CGroupContext
*c
, const char *dev
, CGroupDevicePermissions p
);
322 int cgroup_context_add_bpf_foreign_program(CGroupContext
*c
, uint32_t attach_type
, const char *path
);
323 int cgroup_context_add_io_device_limit_dup(CGroupContext
*c
, CGroupIODeviceLimit
*l
);
324 int cgroup_context_add_io_device_weight_dup(CGroupContext
*c
, CGroupIODeviceWeight
*w
);
325 int cgroup_context_add_io_device_latency_dup(CGroupContext
*c
, CGroupIODeviceLatency
*l
);
326 int cgroup_context_add_block_io_device_weight_dup(CGroupContext
*c
, CGroupBlockIODeviceWeight
*w
);
327 int cgroup_context_add_block_io_device_bandwidth_dup(CGroupContext
*c
, CGroupBlockIODeviceBandwidth
*b
);
328 int cgroup_context_add_device_allow_dup(CGroupContext
*c
, CGroupDeviceAllow
*a
);
329 int cgroup_context_add_socket_bind_item_allow_dup(CGroupContext
*c
, CGroupSocketBindItem
*i
);
330 int cgroup_context_add_socket_bind_item_deny_dup(CGroupContext
*c
, CGroupSocketBindItem
*i
);
332 static inline int cgroup_context_add_bpf_foreign_program_dup(CGroupContext
*c
, CGroupBPFForeignProgram
*p
) {
333 return cgroup_context_add_bpf_foreign_program(c
, p
->attach_type
, p
->bpffs_path
);
336 void unit_modify_nft_set(Unit
*u
, bool add
);
338 CGroupMask
unit_get_own_mask(Unit
*u
);
339 CGroupMask
unit_get_delegate_mask(Unit
*u
);
340 CGroupMask
unit_get_members_mask(Unit
*u
);
341 CGroupMask
unit_get_siblings_mask(Unit
*u
);
342 CGroupMask
unit_get_ancestor_disable_mask(Unit
*u
);
344 CGroupMask
unit_get_target_mask(Unit
*u
);
345 CGroupMask
unit_get_enable_mask(Unit
*u
);
347 void unit_invalidate_cgroup_members_masks(Unit
*u
);
349 void unit_add_family_to_cgroup_realize_queue(Unit
*u
);
351 const char *unit_get_realized_cgroup_path(Unit
*u
, CGroupMask mask
);
352 int unit_default_cgroup_path(const Unit
*u
, char **ret
);
353 int unit_set_cgroup_path(Unit
*u
, const char *path
);
354 int unit_pick_cgroup_path(Unit
*u
);
356 int unit_realize_cgroup(Unit
*u
);
357 void unit_prune_cgroup(Unit
*u
);
358 int unit_watch_cgroup(Unit
*u
);
359 int unit_watch_cgroup_memory(Unit
*u
);
360 void unit_add_to_cgroup_realize_queue(Unit
*u
);
362 void unit_release_cgroup(Unit
*u
);
363 /* Releases the cgroup only if it is recursively empty.
364 * Returns true if the cgroup was released, false otherwise. */
365 bool unit_maybe_release_cgroup(Unit
*u
);
367 void unit_add_to_cgroup_empty_queue(Unit
*u
);
368 int unit_check_oomd_kill(Unit
*u
);
369 int unit_check_oom(Unit
*u
);
371 int unit_attach_pids_to_cgroup(Unit
*u
, Set
*pids
, const char *suffix_path
);
373 int manager_setup_cgroup(Manager
*m
);
374 void manager_shutdown_cgroup(Manager
*m
, bool delete);
376 unsigned manager_dispatch_cgroup_realize_queue(Manager
*m
);
378 Unit
*manager_get_unit_by_cgroup(Manager
*m
, const char *cgroup
);
379 Unit
*manager_get_unit_by_pidref_cgroup(Manager
*m
, PidRef
*pid
);
380 Unit
*manager_get_unit_by_pidref_watching(Manager
*m
, PidRef
*pid
);
381 Unit
* manager_get_unit_by_pidref(Manager
*m
, PidRef
*pid
);
382 Unit
* manager_get_unit_by_pid(Manager
*m
, pid_t pid
);
384 uint64_t unit_get_ancestor_memory_min(Unit
*u
);
385 uint64_t unit_get_ancestor_memory_low(Unit
*u
);
386 uint64_t unit_get_ancestor_startup_memory_low(Unit
*u
);
388 int unit_search_main_pid(Unit
*u
, PidRef
*ret
);
389 int unit_watch_all_pids(Unit
*u
);
391 int unit_synthesize_cgroup_empty_event(Unit
*u
);
393 int unit_get_memory_available(Unit
*u
, uint64_t *ret
);
394 int unit_get_memory_current(Unit
*u
, uint64_t *ret
);
395 int unit_get_memory_accounting(Unit
*u
, CGroupMemoryAccountingMetric metric
, uint64_t *ret
);
396 int unit_get_tasks_current(Unit
*u
, uint64_t *ret
);
397 int unit_get_cpu_usage(Unit
*u
, nsec_t
*ret
);
398 int unit_get_io_accounting(Unit
*u
, CGroupIOAccountingMetric metric
, bool allow_cache
, uint64_t *ret
);
399 int unit_get_ip_accounting(Unit
*u
, CGroupIPAccountingMetric metric
, uint64_t *ret
);
400 int unit_get_effective_limit(Unit
*u
, CGroupLimitType type
, uint64_t *ret
);
402 int unit_reset_cpu_accounting(Unit
*u
);
403 void unit_reset_memory_accounting_last(Unit
*u
);
404 int unit_reset_ip_accounting(Unit
*u
);
405 void unit_reset_io_accounting_last(Unit
*u
);
406 int unit_reset_io_accounting(Unit
*u
);
407 int unit_reset_accounting(Unit
*u
);
/* Evaluates to member `name` of the CGroupContext returned by unit_get_cgroup_context(u), or to
 * false when that call returns NULL. `u` is evaluated exactly once.
 *
 * Implemented as a GNU statement expression so the temporary `cc` stays local to the expansion.
 * NOTE(review): the "({" / "})" wrapper lines were missing from the source as received and the last
 * line ended in a continuation backslash, which spliced the following declaration into the macro;
 * the wrapper here is a reconstruction -- confirm. */
#define UNIT_CGROUP_BOOL(u, name)                                       \
        ({                                                              \
                CGroupContext *cc = unit_get_cgroup_context(u);         \
                cc ? cc->name : false;                                  \
        })
415 bool manager_owns_host_root_cgroup(Manager
*m
);
416 bool unit_has_host_root_cgroup(Unit
*u
);
418 bool unit_has_startup_cgroup_constraints(Unit
*u
);
420 int manager_notify_cgroup_empty(Manager
*m
, const char *group
);
422 void unit_invalidate_cgroup(Unit
*u
, CGroupMask m
);
423 void unit_invalidate_cgroup_bpf(Unit
*u
);
425 void manager_invalidate_startup_units(Manager
*m
);
427 const char* cgroup_device_policy_to_string(CGroupDevicePolicy i
) _const_
;
428 CGroupDevicePolicy
cgroup_device_policy_from_string(const char *s
) _pure_
;
430 void unit_cgroup_catchup(Unit
*u
);
432 bool unit_cgroup_delegate(Unit
*u
);
434 int unit_get_cpuset(Unit
*u
, CPUSet
*cpus
, const char *name
);
435 int unit_cgroup_freezer_action(Unit
*u
, FreezerAction action
);
437 const char* freezer_action_to_string(FreezerAction a
) _const_
;
438 FreezerAction
freezer_action_from_string(const char *s
) _pure_
;
440 const char* cgroup_pressure_watch_to_string(CGroupPressureWatch a
) _const_
;
441 CGroupPressureWatch
cgroup_pressure_watch_from_string(const char *s
) _pure_
;
443 const char *cgroup_device_permissions_to_string(CGroupDevicePermissions p
) _const_
;
444 CGroupDevicePermissions
cgroup_device_permissions_from_string(const char *s
) _pure_
;
446 const char* cgroup_ip_accounting_metric_to_string(CGroupIPAccountingMetric m
) _const_
;
447 CGroupIPAccountingMetric
cgroup_ip_accounting_metric_from_string(const char *s
) _pure_
;
449 const char* cgroup_io_accounting_metric_to_string(CGroupIOAccountingMetric m
) _const_
;
450 CGroupIOAccountingMetric
cgroup_io_accounting_metric_from_string(const char *s
) _pure_
;
452 const char* cgroup_limit_type_to_string(CGroupLimitType m
) _const_
;
453 CGroupLimitType
cgroup_limit_type_from_string(const char *s
) _pure_
;
455 const char* cgroup_memory_accounting_metric_to_string(CGroupMemoryAccountingMetric m
) _const_
;
456 CGroupMemoryAccountingMetric
cgroup_memory_accounting_metric_from_string(const char *s
) _pure_
;