1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
7 #include "cgroup-util.h"
8 #include "cpu-set-util.h"
9 #include "firewall-util.h"
12 #include "time-util.h"
/* A tasks-max setting: either an absolute value, or a fraction value/scale.
 * (NOTE(review): the field lines were dropped by extraction; they are reconstructed
 * here from the CGROUP_TASKS_MAX_UNSET initializer below, which sets .value and
 * .scale as 64-bit values — confirm against the upstream header.) */
typedef struct CGroupTasksMax {
        /* If scale == 0, just use value; otherwise, value / scale.
         * See tasks_max_resolve(). */
        uint64_t value;
        uint64_t scale;
} CGroupTasksMax;

/* Sentinel meaning "no TasksMax= configured": value is UINT64_MAX and scale is 0. */
#define CGROUP_TASKS_MAX_UNSET ((CGroupTasksMax) { .value = UINT64_MAX, .scale = 0 })
23 static inline bool cgroup_tasks_max_isset(const CGroupTasksMax
*tasks_max
) {
24 return tasks_max
->value
!= UINT64_MAX
|| tasks_max
->scale
!= 0;
27 uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax
*tasks_max
);
/* Forward declarations for the cgroup context types defined further below. */
typedef struct CGroupContext CGroupContext;
typedef struct CGroupDeviceAllow CGroupDeviceAllow;
typedef struct CGroupIODeviceWeight CGroupIODeviceWeight;
typedef struct CGroupIODeviceLimit CGroupIODeviceLimit;
typedef struct CGroupIODeviceLatency CGroupIODeviceLatency;
typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight;
typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth;
typedef struct CGroupBPFForeignProgram CGroupBPFForeignProgram;
typedef struct CGroupSocketBindItem CGroupSocketBindItem;
typedef enum CGroupDevicePolicy {
        /* When devices listed, will allow those, plus built-in ones, if none are listed will allow
         * everything (NOTE(review): the comment's second line was dropped by extraction —
         * confirm wording against the upstream header). */
        CGROUP_DEVICE_POLICY_AUTO,

        /* Everything forbidden, except built-in ones and listed ones. */
        CGROUP_DEVICE_POLICY_CLOSED,

        /* Everything forbidden, except for the listed devices */
        CGROUP_DEVICE_POLICY_STRICT,

        _CGROUP_DEVICE_POLICY_MAX,
        _CGROUP_DEVICE_POLICY_INVALID = -EINVAL,
} CGroupDevicePolicy;
/* NOTE(review): the extraction of this header lost the enum members between the
 * opening brace and _FREEZER_ACTION_INVALID (original lines 55-58), as well as the
 * closing "} FreezerAction;" line. Only the pieces that survived are kept below —
 * recover the missing members from the upstream header before relying on this text. */
54 typedef enum FreezerAction
{
59 _FREEZER_ACTION_INVALID
= -EINVAL
,
typedef enum CGroupDevicePermissions {
        /* We reuse the same bit meanings the kernel's BPF_DEVCG_ACC_xyz definitions use */
        CGROUP_DEVICE_MKNOD               = 1 << 0,
        CGROUP_DEVICE_READ                = 1 << 1,
        CGROUP_DEVICE_WRITE               = 1 << 2,
        _CGROUP_DEVICE_PERMISSIONS_MAX    = 1 << 3,
        /* All permission bits set: MAX - 1 == 0b111 */
        _CGROUP_DEVICE_PERMISSIONS_ALL    = _CGROUP_DEVICE_PERMISSIONS_MAX - 1,
        _CGROUP_DEVICE_PERMISSIONS_INVALID = -EINVAL,
} CGroupDevicePermissions;
/* NOTE(review): the eight struct definitions below are incomplete in this extraction:
 * several field lines and every closing "};" were dropped (visible as gaps in the
 * embedded original line numbers, e.g. 74, 76-77, 80-83). Code tokens are kept
 * byte-identical below; recover the missing fields from the upstream header. */
/* Entry of a unit's device-allow list, with a per-device permission mask.
 * (A field at orig. line 74 — presumably the device path — is missing here.) */
72 struct CGroupDeviceAllow
{
73 LIST_FIELDS(CGroupDeviceAllow
, device_allow
);
75 CGroupDevicePermissions permissions
;
/* Per-device IO weight entry; fields at orig. lines 80-83 are missing here. */
78 struct CGroupIODeviceWeight
{
79 LIST_FIELDS(CGroupIODeviceWeight
, device_weights
);
/* Per-device IO limits, one value per limit type; a field at orig. line 86 is missing. */
84 struct CGroupIODeviceLimit
{
85 LIST_FIELDS(CGroupIODeviceLimit
, device_limits
);
87 uint64_t limits
[_CGROUP_IO_LIMIT_TYPE_MAX
];
/* Per-device IO latency entry; fields at orig. lines 92-95 are missing here. */
90 struct CGroupIODeviceLatency
{
91 LIST_FIELDS(CGroupIODeviceLatency
, device_latencies
);
/* Legacy (cgroup v1) per-device blkio weight; remaining fields missing here. */
96 struct CGroupBlockIODeviceWeight
{
97 LIST_FIELDS(CGroupBlockIODeviceWeight
, device_weights
);
/* Legacy (cgroup v1) per-device blkio bandwidth limit; remaining fields missing here. */
102 struct CGroupBlockIODeviceBandwidth
{
103 LIST_FIELDS(CGroupBlockIODeviceBandwidth
, device_bandwidths
);
/* Foreign BPF program to attach; fields after attach_type (orig. lines 112-114) missing. */
109 struct CGroupBPFForeignProgram
{
110 LIST_FIELDS(CGroupBPFForeignProgram
, programs
);
111 uint32_t attach_type
;
/* Socket bind allow/deny item; fields after the list header (orig. 117-122) missing. */
115 struct CGroupSocketBindItem
{
116 LIST_FIELDS(CGroupSocketBindItem
, socket_bind_items
);
typedef enum CGroupPressureWatch {
        CGROUP_PRESSURE_WATCH_OFF,   /* → tells the service payload explicitly not to watch for memory pressure */
        CGROUP_PRESSURE_WATCH_AUTO,  /* → on if memory account is on anyway for the unit, otherwise off */
        CGROUP_PRESSURE_WATCH_ON,
        CGROUP_PRESSURE_WATCH_SKIP,  /* → doesn't set up memory pressure watch, but also doesn't explicitly tell payload to avoid it */
        _CGROUP_PRESSURE_WATCH_MAX,
        _CGROUP_PRESSURE_WATCH_INVALID = -EINVAL,
} CGroupPressureWatch;
/* NOTE(review): this struct definition is incomplete in this extraction — many member
 * lines were dropped (gaps in the embedded original line numbers, e.g. 133-134, 138-139,
 * 142-143, 147, 149, 153-154, 156, 158-159, 164, 168-169, 173, 206, 208, 213, 215, 218,
 * 221-222, 224, 230, 236, 238), as was the closing "};" (orig. 242). Code tokens are
 * kept byte-identical below; recover the missing members from the upstream header
 * before relying on this text. */
/* Per-unit cgroup configuration: accounting switches, controller masks, resource
 * limits for both the unified (v2) and legacy (v1) hierarchies, IP/BPF filtering,
 * oomd and memory-pressure settings. */
132 struct CGroupContext
{
135 bool blockio_accounting
;
136 bool memory_accounting
;
137 bool tasks_accounting
;
140 /* Configures the memory.oom.group attribute (on unified) */
141 bool memory_oom_group
;
144 CGroupMask delegate_controllers
;
145 CGroupMask disable_controllers
;
146 char *delegate_subgroup
;
148 /* For unified hierarchy */
150 uint64_t startup_cpu_weight
;
151 usec_t cpu_quota_per_sec_usec
;
152 usec_t cpu_quota_period_usec
;
155 CPUSet startup_cpuset_cpus
;
157 CPUSet startup_cpuset_mems
;
160 uint64_t startup_io_weight
;
161 LIST_HEAD(CGroupIODeviceWeight
, io_device_weights
);
162 LIST_HEAD(CGroupIODeviceLimit
, io_device_limits
);
163 LIST_HEAD(CGroupIODeviceLatency
, io_device_latencies
);
165 uint64_t default_memory_min
;
166 uint64_t default_memory_low
;
167 uint64_t default_startup_memory_low
;
170 uint64_t startup_memory_low
;
171 uint64_t memory_high
;
172 uint64_t startup_memory_high
;
174 uint64_t startup_memory_max
;
175 uint64_t memory_swap_max
;
176 uint64_t startup_memory_swap_max
;
177 uint64_t memory_zswap_max
;
178 uint64_t startup_memory_zswap_max
;
/* Bit flags recording whether each corresponding memory setting was explicitly
 * configured (as opposed to left at its default). */
180 bool default_memory_min_set
:1;
181 bool default_memory_low_set
:1;
182 bool default_startup_memory_low_set
:1;
183 bool memory_min_set
:1;
184 bool memory_low_set
:1;
185 bool startup_memory_low_set
:1;
186 bool startup_memory_high_set
:1;
187 bool startup_memory_max_set
:1;
188 bool startup_memory_swap_max_set
:1;
189 bool startup_memory_zswap_max_set
:1;
191 Set
*ip_address_allow
;
192 Set
*ip_address_deny
;
193 /* These two flags indicate that redundant entries have been removed from
194 * ip_address_allow/ip_address_deny, i.e. in_addr_prefixes_reduce() has already been called. */
195 bool ip_address_allow_reduced
;
196 bool ip_address_deny_reduced
;
198 char **ip_filters_ingress
;
199 char **ip_filters_egress
;
200 LIST_HEAD(CGroupBPFForeignProgram
, bpf_foreign_programs
);
202 Set
*restrict_network_interfaces
;
203 bool restrict_network_interfaces_is_allow_list
;
205 /* For legacy hierarchies */
207 uint64_t startup_cpu_shares
;
209 uint64_t blockio_weight
;
210 uint64_t startup_blockio_weight
;
211 LIST_HEAD(CGroupBlockIODeviceWeight
, blockio_device_weights
);
212 LIST_HEAD(CGroupBlockIODeviceBandwidth
, blockio_device_bandwidths
);
214 uint64_t memory_limit
;
216 CGroupDevicePolicy device_policy
;
217 LIST_HEAD(CGroupDeviceAllow
, device_allow
);
219 LIST_HEAD(CGroupSocketBindItem
, socket_bind_allow
);
220 LIST_HEAD(CGroupSocketBindItem
, socket_bind_deny
);
223 CGroupTasksMax tasks_max
;
225 /* Settings for systemd-oomd */
226 ManagedOOMMode moom_swap
;
227 ManagedOOMMode moom_mem_pressure
;
228 uint32_t moom_mem_pressure_limit
; /* Normalized to 2^32-1 == 100% */
229 ManagedOOMPreference moom_preference
;
231 /* Memory pressure logic */
232 CGroupPressureWatch memory_pressure_watch
;
233 usec_t memory_pressure_threshold_usec
;
234 /* NB: For now we don't make the period configurable, not the type, nor do we allow multiple
235 * triggers, nor triggers for non-memory pressure. We might add that later. */
237 NFTSetContext nft_set_context
;
239 /* Forward coredumps for processes that crash within this cgroup.
240 * Requires 'delegate' to also be true. */
241 bool coredump_receive
;
/* Used when querying IP accounting data */
typedef enum CGroupIPAccountingMetric {
        CGROUP_IP_INGRESS_BYTES,
        CGROUP_IP_INGRESS_PACKETS,
        CGROUP_IP_EGRESS_BYTES,
        CGROUP_IP_EGRESS_PACKETS,
        _CGROUP_IP_ACCOUNTING_METRIC_MAX,
        _CGROUP_IP_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupIPAccountingMetric;
/* Used when querying IO accounting data */
typedef enum CGroupIOAccountingMetric {
        CGROUP_IO_READ_BYTES,
        CGROUP_IO_WRITE_BYTES,
        CGROUP_IO_READ_OPERATIONS,
        CGROUP_IO_WRITE_OPERATIONS,
        _CGROUP_IO_ACCOUNTING_METRIC_MAX,
        _CGROUP_IO_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupIOAccountingMetric;
/* Forward declarations for core-manager types defined in other headers. */
typedef struct Unit Unit;
typedef struct Manager Manager;
typedef enum ManagerState ManagerState;
268 uint64_t cgroup_context_cpu_weight(CGroupContext
*c
, ManagerState state
);
270 usec_t
cgroup_cpu_adjust_period(usec_t period
, usec_t quota
, usec_t resolution
, usec_t max_period
);
272 void cgroup_context_init(CGroupContext
*c
);
273 void cgroup_context_done(CGroupContext
*c
);
274 void cgroup_context_dump(Unit
*u
, FILE* f
, const char *prefix
);
275 void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem
*item
, FILE *f
);
276 void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem
*items
, FILE *f
);
278 void cgroup_context_free_device_allow(CGroupContext
*c
, CGroupDeviceAllow
*a
);
279 void cgroup_context_free_io_device_weight(CGroupContext
*c
, CGroupIODeviceWeight
*w
);
280 void cgroup_context_free_io_device_limit(CGroupContext
*c
, CGroupIODeviceLimit
*l
);
281 void cgroup_context_free_io_device_latency(CGroupContext
*c
, CGroupIODeviceLatency
*l
);
282 void cgroup_context_free_blockio_device_weight(CGroupContext
*c
, CGroupBlockIODeviceWeight
*w
);
283 void cgroup_context_free_blockio_device_bandwidth(CGroupContext
*c
, CGroupBlockIODeviceBandwidth
*b
);
284 void cgroup_context_remove_bpf_foreign_program(CGroupContext
*c
, CGroupBPFForeignProgram
*p
);
285 void cgroup_context_remove_socket_bind(CGroupSocketBindItem
**head
);
287 static inline bool cgroup_context_want_memory_pressure(const CGroupContext
*c
) {
290 return c
->memory_pressure_watch
== CGROUP_PRESSURE_WATCH_ON
||
291 (c
->memory_pressure_watch
== CGROUP_PRESSURE_WATCH_AUTO
&& c
->memory_accounting
);
294 int cgroup_context_add_device_allow(CGroupContext
*c
, const char *dev
, CGroupDevicePermissions p
);
295 int cgroup_context_add_or_update_device_allow(CGroupContext
*c
, const char *dev
, CGroupDevicePermissions p
);
296 int cgroup_context_add_bpf_foreign_program(CGroupContext
*c
, uint32_t attach_type
, const char *path
);
298 void unit_modify_nft_set(Unit
*u
, bool add
);
300 CGroupMask
unit_get_own_mask(Unit
*u
);
301 CGroupMask
unit_get_delegate_mask(Unit
*u
);
302 CGroupMask
unit_get_members_mask(Unit
*u
);
303 CGroupMask
unit_get_siblings_mask(Unit
*u
);
304 CGroupMask
unit_get_ancestor_disable_mask(Unit
*u
);
306 CGroupMask
unit_get_target_mask(Unit
*u
);
307 CGroupMask
unit_get_enable_mask(Unit
*u
);
309 void unit_invalidate_cgroup_members_masks(Unit
*u
);
311 void unit_add_family_to_cgroup_realize_queue(Unit
*u
);
313 const char *unit_get_realized_cgroup_path(Unit
*u
, CGroupMask mask
);
314 int unit_default_cgroup_path(const Unit
*u
, char **ret
);
315 int unit_set_cgroup_path(Unit
*u
, const char *path
);
316 int unit_pick_cgroup_path(Unit
*u
);
318 int unit_realize_cgroup(Unit
*u
);
319 void unit_prune_cgroup(Unit
*u
);
320 int unit_watch_cgroup(Unit
*u
);
321 int unit_watch_cgroup_memory(Unit
*u
);
322 void unit_add_to_cgroup_realize_queue(Unit
*u
);
324 void unit_release_cgroup(Unit
*u
);
325 /* Releases the cgroup only if it is recursively empty.
326 * Returns true if the cgroup was released, false otherwise. */
327 bool unit_maybe_release_cgroup(Unit
*u
);
329 void unit_add_to_cgroup_empty_queue(Unit
*u
);
330 int unit_check_oomd_kill(Unit
*u
);
331 int unit_check_oom(Unit
*u
);
333 int unit_attach_pids_to_cgroup(Unit
*u
, Set
*pids
, const char *suffix_path
);
335 int manager_setup_cgroup(Manager
*m
);
336 void manager_shutdown_cgroup(Manager
*m
, bool delete);
338 unsigned manager_dispatch_cgroup_realize_queue(Manager
*m
);
340 Unit
*manager_get_unit_by_cgroup(Manager
*m
, const char *cgroup
);
341 Unit
*manager_get_unit_by_pidref_cgroup(Manager
*m
, PidRef
*pid
);
342 Unit
*manager_get_unit_by_pidref_watching(Manager
*m
, PidRef
*pid
);
343 Unit
* manager_get_unit_by_pidref(Manager
*m
, PidRef
*pid
);
344 Unit
* manager_get_unit_by_pid(Manager
*m
, pid_t pid
);
346 uint64_t unit_get_ancestor_memory_min(Unit
*u
);
347 uint64_t unit_get_ancestor_memory_low(Unit
*u
);
348 uint64_t unit_get_ancestor_startup_memory_low(Unit
*u
);
350 int unit_search_main_pid(Unit
*u
, PidRef
*ret
);
351 int unit_watch_all_pids(Unit
*u
);
353 int unit_synthesize_cgroup_empty_event(Unit
*u
);
355 int unit_get_memory_current(Unit
*u
, uint64_t *ret
);
356 int unit_get_memory_peak(Unit
*u
, uint64_t *ret
);
357 int unit_get_memory_swap_current(Unit
*u
, uint64_t *ret
);
358 int unit_get_memory_swap_peak(Unit
*u
, uint64_t *ret
);
359 int unit_get_memory_available(Unit
*u
, uint64_t *ret
);
360 int unit_get_tasks_current(Unit
*u
, uint64_t *ret
);
361 int unit_get_cpu_usage(Unit
*u
, nsec_t
*ret
);
362 int unit_get_io_accounting(Unit
*u
, CGroupIOAccountingMetric metric
, bool allow_cache
, uint64_t *ret
);
363 int unit_get_ip_accounting(Unit
*u
, CGroupIPAccountingMetric metric
, uint64_t *ret
);
365 int unit_reset_cpu_accounting(Unit
*u
);
366 int unit_reset_ip_accounting(Unit
*u
);
367 int unit_reset_io_accounting(Unit
*u
);
368 int unit_reset_accounting(Unit
*u
);
/* Evaluates to the named boolean field of the unit's cgroup context, or false if the
 * unit has no cgroup context. (NOTE(review): extraction dropped the wrapper lines of
 * this macro; the GNU statement-expression form is reconstructed here from the
 * trailing-backslash continuations — confirm against the upstream header.) */
#define UNIT_CGROUP_BOOL(u, name)                                       \
        ({                                                              \
        CGroupContext *cc = unit_get_cgroup_context(u);                 \
        cc ? cc->name : false;                                          \
        })
376 bool manager_owns_host_root_cgroup(Manager
*m
);
377 bool unit_has_host_root_cgroup(Unit
*u
);
379 bool unit_has_startup_cgroup_constraints(Unit
*u
);
381 int manager_notify_cgroup_empty(Manager
*m
, const char *group
);
383 void unit_invalidate_cgroup(Unit
*u
, CGroupMask m
);
384 void unit_invalidate_cgroup_bpf(Unit
*u
);
386 void manager_invalidate_startup_units(Manager
*m
);
388 const char* cgroup_device_policy_to_string(CGroupDevicePolicy i
) _const_
;
389 CGroupDevicePolicy
cgroup_device_policy_from_string(const char *s
) _pure_
;
391 void unit_cgroup_catchup(Unit
*u
);
393 bool unit_cgroup_delegate(Unit
*u
);
395 int unit_get_cpuset(Unit
*u
, CPUSet
*cpus
, const char *name
);
396 int unit_cgroup_freezer_action(Unit
*u
, FreezerAction action
);
398 const char* freezer_action_to_string(FreezerAction a
) _const_
;
399 FreezerAction
freezer_action_from_string(const char *s
) _pure_
;
401 const char* cgroup_pressure_watch_to_string(CGroupPressureWatch a
) _const_
;
402 CGroupPressureWatch
cgroup_pressure_watch_from_string(const char *s
) _pure_
;
404 const char *cgroup_device_permissions_to_string(CGroupDevicePermissions p
) _const_
;
405 CGroupDevicePermissions
cgroup_device_permissions_from_string(const char *s
) _pure_
;
407 const char* cgroup_ip_accounting_metric_to_string(CGroupIPAccountingMetric m
) _const_
;
408 CGroupIPAccountingMetric
cgroup_ip_accounting_metric_from_string(const char *s
) _pure_
;
410 const char* cgroup_io_accounting_metric_to_string(CGroupIOAccountingMetric m
) _const_
;
411 CGroupIOAccountingMetric
cgroup_io_accounting_metric_from_string(const char *s
) _pure_
;