]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/cgroup.h
Merge pull request #28797 from Werkov/eff_limits
[thirdparty/systemd.git] / src / core / cgroup.h
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 #pragma once
3
4 #include <stdbool.h>
5
6 #include "bpf-lsm.h"
7 #include "cgroup-util.h"
8 #include "cpu-set-util.h"
9 #include "firewall-util.h"
10 #include "list.h"
11 #include "pidref.h"
12 #include "time-util.h"
13
/* A tasks limit, stored either as an absolute number (scale == 0) or as the fraction value / scale. */
typedef struct CGroupTasksMax {
        /* If scale == 0, just use value; otherwise, value / scale.
         * See cgroup_tasks_max_resolve(). */
        uint64_t value;
        uint64_t scale;
} CGroupTasksMax;

/* The "not configured" representation: value == UINT64_MAX combined with scale == 0. */
#define CGROUP_TASKS_MAX_UNSET ((CGroupTasksMax) { .value = UINT64_MAX, .scale = 0 })

/* Returns true if *tasks_max differs from CGROUP_TASKS_MAX_UNSET in either field, i.e. if some limit
 * (absolute or fractional) has been stored. The pointer is dereferenced unconditionally and must not
 * be NULL. */
static inline bool cgroup_tasks_max_isset(const CGroupTasksMax *tasks_max) {
        return tasks_max->value != UINT64_MAX || tasks_max->scale != 0;
}

/* Turns the (value, scale) pair into a single number; implemented outside this header.
 * NOTE(review): what base the fraction is applied to is not visible here — check the definition. */
uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax *tasks_max);
28
/* Forward declarations, so that the structures below can refer to each other (and to themselves in
 * their list fields) before they are fully defined. */
typedef struct CGroupContext CGroupContext;
typedef struct CGroupDeviceAllow CGroupDeviceAllow;
typedef struct CGroupIODeviceWeight CGroupIODeviceWeight;
typedef struct CGroupIODeviceLimit CGroupIODeviceLimit;
typedef struct CGroupIODeviceLatency CGroupIODeviceLatency;
typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight;
typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth;
typedef struct CGroupBPFForeignProgram CGroupBPFForeignProgram;
typedef struct CGroupSocketBindItem CGroupSocketBindItem;
38
/* How device access is restricted for a cgroup. _MAX counts the valid values, _INVALID is the
 * negative errno-style marker for "no such policy". */
typedef enum CGroupDevicePolicy {
        /* When devices listed, will allow those, plus built-in ones, if none are listed will allow
         * everything. */
        CGROUP_DEVICE_POLICY_AUTO,

        /* Everything forbidden, except built-in ones and listed ones. */
        CGROUP_DEVICE_POLICY_CLOSED,

        /* Everything forbidden, except for the listed devices */
        CGROUP_DEVICE_POLICY_STRICT,

        _CGROUP_DEVICE_POLICY_MAX,
        _CGROUP_DEVICE_POLICY_INVALID = -EINVAL,
} CGroupDevicePolicy;
53
/* The operations accepted by unit_cgroup_freezer_action(). _MAX counts the valid values, _INVALID
 * is the negative errno-style marker. */
typedef enum FreezerAction {
        FREEZER_FREEZE,
        FREEZER_THAW,

        _FREEZER_ACTION_MAX,
        _FREEZER_ACTION_INVALID = -EINVAL,
} FreezerAction;
61
/* Bit mask of device access types. _MAX is the first unused bit, hence _ALL (= _MAX - 1) has every
 * defined permission bit set. */
typedef enum CGroupDevicePermissions {
        /* We reuse the same bit meanings the kernel's BPF_DEVCG_ACC_xyz definitions use */
        CGROUP_DEVICE_MKNOD                = 1 << 0,
        CGROUP_DEVICE_READ                 = 1 << 1,
        CGROUP_DEVICE_WRITE                = 1 << 2,
        _CGROUP_DEVICE_PERMISSIONS_MAX     = 1 << 3,
        _CGROUP_DEVICE_PERMISSIONS_ALL     = _CGROUP_DEVICE_PERMISSIONS_MAX - 1,
        _CGROUP_DEVICE_PERMISSIONS_INVALID = -EINVAL,
} CGroupDevicePermissions;
71
72 struct CGroupDeviceAllow {
73 LIST_FIELDS(CGroupDeviceAllow, device_allow);
74 char *path;
75 CGroupDevicePermissions permissions;
76 };
77
78 struct CGroupIODeviceWeight {
79 LIST_FIELDS(CGroupIODeviceWeight, device_weights);
80 char *path;
81 uint64_t weight;
82 };
83
84 struct CGroupIODeviceLimit {
85 LIST_FIELDS(CGroupIODeviceLimit, device_limits);
86 char *path;
87 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
88 };
89
90 struct CGroupIODeviceLatency {
91 LIST_FIELDS(CGroupIODeviceLatency, device_latencies);
92 char *path;
93 usec_t target_usec;
94 };
95
96 struct CGroupBlockIODeviceWeight {
97 LIST_FIELDS(CGroupBlockIODeviceWeight, device_weights);
98 char *path;
99 uint64_t weight;
100 };
101
102 struct CGroupBlockIODeviceBandwidth {
103 LIST_FIELDS(CGroupBlockIODeviceBandwidth, device_bandwidths);
104 char *path;
105 uint64_t rbps;
106 uint64_t wbps;
107 };
108
109 struct CGroupBPFForeignProgram {
110 LIST_FIELDS(CGroupBPFForeignProgram, programs);
111 uint32_t attach_type;
112 char *bpffs_path;
113 };
114
115 struct CGroupSocketBindItem {
116 LIST_FIELDS(CGroupSocketBindItem, socket_bind_items);
117 int address_family;
118 int ip_protocol;
119 uint16_t nr_ports;
120 uint16_t port_min;
121 };
122
/* Selects whether a memory pressure watch is set up for a unit; evaluated by
 * cgroup_context_want_memory_pressure(). */
typedef enum CGroupPressureWatch {
        CGROUP_PRESSURE_WATCH_OFF,      /* → tells the service payload explicitly not to watch for memory pressure */
        CGROUP_PRESSURE_WATCH_AUTO,     /* → on if memory accounting is on anyway for the unit, otherwise off */
        CGROUP_PRESSURE_WATCH_ON,
        CGROUP_PRESSURE_WATCH_SKIP,     /* → doesn't set up memory pressure watch, but also doesn't explicitly tell payload to avoid it */
        _CGROUP_PRESSURE_WATCH_MAX,
        _CGROUP_PRESSURE_WATCH_INVALID = -EINVAL,
} CGroupPressureWatch;
131
132 /* When adding members make sure to update cgroup_context_copy() accordingly */
133 struct CGroupContext {
134 bool cpu_accounting;
135 bool io_accounting;
136 bool blockio_accounting;
137 bool memory_accounting;
138 bool tasks_accounting;
139 bool ip_accounting;
140
141 /* Configures the memory.oom.group attribute (on unified) */
142 bool memory_oom_group;
143
144 bool delegate;
145 CGroupMask delegate_controllers;
146 CGroupMask disable_controllers;
147 char *delegate_subgroup;
148
149 /* For unified hierarchy */
150 uint64_t cpu_weight;
151 uint64_t startup_cpu_weight;
152 usec_t cpu_quota_per_sec_usec;
153 usec_t cpu_quota_period_usec;
154
155 CPUSet cpuset_cpus;
156 CPUSet startup_cpuset_cpus;
157 CPUSet cpuset_mems;
158 CPUSet startup_cpuset_mems;
159
160 uint64_t io_weight;
161 uint64_t startup_io_weight;
162 LIST_HEAD(CGroupIODeviceWeight, io_device_weights);
163 LIST_HEAD(CGroupIODeviceLimit, io_device_limits);
164 LIST_HEAD(CGroupIODeviceLatency, io_device_latencies);
165
166 uint64_t default_memory_min;
167 uint64_t default_memory_low;
168 uint64_t default_startup_memory_low;
169 uint64_t memory_min;
170 uint64_t memory_low;
171 uint64_t startup_memory_low;
172 uint64_t memory_high;
173 uint64_t startup_memory_high;
174 uint64_t memory_max;
175 uint64_t startup_memory_max;
176 uint64_t memory_swap_max;
177 uint64_t startup_memory_swap_max;
178 uint64_t memory_zswap_max;
179 uint64_t startup_memory_zswap_max;
180
181 bool default_memory_min_set:1;
182 bool default_memory_low_set:1;
183 bool default_startup_memory_low_set:1;
184 bool memory_min_set:1;
185 bool memory_low_set:1;
186 bool startup_memory_low_set:1;
187 bool startup_memory_high_set:1;
188 bool startup_memory_max_set:1;
189 bool startup_memory_swap_max_set:1;
190 bool startup_memory_zswap_max_set:1;
191
192 Set *ip_address_allow;
193 Set *ip_address_deny;
194 /* These two flags indicate that redundant entries have been removed from
195 * ip_address_allow/ip_address_deny, i.e. in_addr_prefixes_reduce() has already been called. */
196 bool ip_address_allow_reduced;
197 bool ip_address_deny_reduced;
198
199 char **ip_filters_ingress;
200 char **ip_filters_egress;
201 LIST_HEAD(CGroupBPFForeignProgram, bpf_foreign_programs);
202
203 Set *restrict_network_interfaces;
204 bool restrict_network_interfaces_is_allow_list;
205
206 /* For legacy hierarchies */
207 uint64_t cpu_shares;
208 uint64_t startup_cpu_shares;
209
210 uint64_t blockio_weight;
211 uint64_t startup_blockio_weight;
212 LIST_HEAD(CGroupBlockIODeviceWeight, blockio_device_weights);
213 LIST_HEAD(CGroupBlockIODeviceBandwidth, blockio_device_bandwidths);
214
215 uint64_t memory_limit;
216
217 CGroupDevicePolicy device_policy;
218 LIST_HEAD(CGroupDeviceAllow, device_allow);
219
220 LIST_HEAD(CGroupSocketBindItem, socket_bind_allow);
221 LIST_HEAD(CGroupSocketBindItem, socket_bind_deny);
222
223 /* Common */
224 CGroupTasksMax tasks_max;
225
226 /* Settings for systemd-oomd */
227 ManagedOOMMode moom_swap;
228 ManagedOOMMode moom_mem_pressure;
229 uint32_t moom_mem_pressure_limit; /* Normalized to 2^32-1 == 100% */
230 ManagedOOMPreference moom_preference;
231
232 /* Memory pressure logic */
233 CGroupPressureWatch memory_pressure_watch;
234 usec_t memory_pressure_threshold_usec;
235 /* NB: For now we don't make the period configurable, not the type, nor do we allow multiple
236 * triggers, nor triggers for non-memory pressure. We might add that later. */
237
238 NFTSetContext nft_set_context;
239
240 /* Forward coredumps for processes that crash within this cgroup.
241 * Requires 'delegate' to also be true. */
242 bool coredump_receive;
243 };
244
/* Used when querying IP accounting data, see unit_get_ip_accounting() */
typedef enum CGroupIPAccountingMetric {
        CGROUP_IP_INGRESS_BYTES,
        CGROUP_IP_INGRESS_PACKETS,
        CGROUP_IP_EGRESS_BYTES,
        CGROUP_IP_EGRESS_PACKETS,
        _CGROUP_IP_ACCOUNTING_METRIC_MAX,
        _CGROUP_IP_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupIPAccountingMetric;
254
/* Used when querying IO accounting data, see unit_get_io_accounting() */
typedef enum CGroupIOAccountingMetric {
        CGROUP_IO_READ_BYTES,
        CGROUP_IO_WRITE_BYTES,
        CGROUP_IO_READ_OPERATIONS,
        CGROUP_IO_WRITE_OPERATIONS,
        _CGROUP_IO_ACCOUNTING_METRIC_MAX,
        _CGROUP_IO_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupIOAccountingMetric;
264
/* Used when querying memory accounting data, see unit_get_memory_accounting(). The cached metrics
 * come first, so that _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST marks the end of that range. */
typedef enum CGroupMemoryAccountingMetric {
        CGROUP_MEMORY_PEAK,
        CGROUP_MEMORY_SWAP_PEAK,
        /* We cache the above attributes, so that they can be fetched even after the cgroup is gone, e.g.
         * when systemd-run exits. */
        _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST = CGROUP_MEMORY_SWAP_PEAK,

        /* These attributes are transient, so no need for caching. */
        CGROUP_MEMORY_SWAP_CURRENT,
        CGROUP_MEMORY_ZSWAP_CURRENT,

        _CGROUP_MEMORY_ACCOUNTING_METRIC_MAX,
        _CGROUP_MEMORY_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupMemoryAccountingMetric;
279
/* Used for limits whose value sets have infimum, see unit_get_effective_limit() */
typedef enum CGroupLimitType {
        CGROUP_LIMIT_MEMORY_MAX,
        CGROUP_LIMIT_MEMORY_HIGH,
        CGROUP_LIMIT_TASKS_MAX,
        _CGROUP_LIMIT_TYPE_MAX,
        _CGROUP_LIMIT_INVALID = -EINVAL,
} CGroupLimitType;
288
/* Forward declarations of types defined outside this header; only their names are needed for the
 * prototypes that follow. */
typedef struct Unit Unit;
typedef struct Manager Manager;
typedef enum ManagerState ManagerState;
292
293 uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state);
294
295 usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period);
296
297 void cgroup_context_init(CGroupContext *c);
298 int cgroup_context_copy(CGroupContext *dst, const CGroupContext *src);
299 void cgroup_context_done(CGroupContext *c);
300 void cgroup_context_dump(Unit *u, FILE* f, const char *prefix);
301 void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f);
302 void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f);
303
304 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a);
305 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w);
306 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l);
307 void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l);
308 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w);
309 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b);
310 void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p);
311 void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head);
312
313 static inline bool cgroup_context_want_memory_pressure(const CGroupContext *c) {
314 assert(c);
315
316 return c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_ON ||
317 (c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_AUTO && c->memory_accounting);
318 }
319
320 int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
321 int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
322 int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path);
323 int cgroup_context_add_io_device_limit_dup(CGroupContext *c, CGroupIODeviceLimit *l);
324 int cgroup_context_add_io_device_weight_dup(CGroupContext *c, CGroupIODeviceWeight *w);
325 int cgroup_context_add_io_device_latency_dup(CGroupContext *c, CGroupIODeviceLatency *l);
326 int cgroup_context_add_block_io_device_weight_dup(CGroupContext *c, CGroupBlockIODeviceWeight *w);
327 int cgroup_context_add_block_io_device_bandwidth_dup(CGroupContext *c, CGroupBlockIODeviceBandwidth *b);
328 int cgroup_context_add_device_allow_dup(CGroupContext *c, CGroupDeviceAllow *a);
329 int cgroup_context_add_socket_bind_item_allow_dup(CGroupContext *c, CGroupSocketBindItem *i);
330 int cgroup_context_add_socket_bind_item_deny_dup(CGroupContext *c, CGroupSocketBindItem *i);
331
332 static inline int cgroup_context_add_bpf_foreign_program_dup(CGroupContext *c, CGroupBPFForeignProgram *p) {
333 return cgroup_context_add_bpf_foreign_program(c, p->attach_type, p->bpffs_path);
334 }
335
336 void unit_modify_nft_set(Unit *u, bool add);
337
338 CGroupMask unit_get_own_mask(Unit *u);
339 CGroupMask unit_get_delegate_mask(Unit *u);
340 CGroupMask unit_get_members_mask(Unit *u);
341 CGroupMask unit_get_siblings_mask(Unit *u);
342 CGroupMask unit_get_ancestor_disable_mask(Unit *u);
343
344 CGroupMask unit_get_target_mask(Unit *u);
345 CGroupMask unit_get_enable_mask(Unit *u);
346
347 void unit_invalidate_cgroup_members_masks(Unit *u);
348
349 void unit_add_family_to_cgroup_realize_queue(Unit *u);
350
351 const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask);
352 int unit_default_cgroup_path(const Unit *u, char **ret);
353 int unit_set_cgroup_path(Unit *u, const char *path);
354 int unit_pick_cgroup_path(Unit *u);
355
356 int unit_realize_cgroup(Unit *u);
357 void unit_prune_cgroup(Unit *u);
358 int unit_watch_cgroup(Unit *u);
359 int unit_watch_cgroup_memory(Unit *u);
360 void unit_add_to_cgroup_realize_queue(Unit *u);
361
362 void unit_release_cgroup(Unit *u);
363 /* Releases the cgroup only if it is recursively empty.
364 * Returns true if the cgroup was released, false otherwise. */
365 bool unit_maybe_release_cgroup(Unit *u);
366
367 void unit_add_to_cgroup_empty_queue(Unit *u);
368 int unit_check_oomd_kill(Unit *u);
369 int unit_check_oom(Unit *u);
370
371 int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path);
372
373 int manager_setup_cgroup(Manager *m);
374 void manager_shutdown_cgroup(Manager *m, bool delete);
375
376 unsigned manager_dispatch_cgroup_realize_queue(Manager *m);
377
378 Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup);
379 Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid);
380 Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid);
381 Unit* manager_get_unit_by_pidref(Manager *m, PidRef *pid);
382 Unit* manager_get_unit_by_pid(Manager *m, pid_t pid);
383
384 uint64_t unit_get_ancestor_memory_min(Unit *u);
385 uint64_t unit_get_ancestor_memory_low(Unit *u);
386 uint64_t unit_get_ancestor_startup_memory_low(Unit *u);
387
388 int unit_search_main_pid(Unit *u, PidRef *ret);
389 int unit_watch_all_pids(Unit *u);
390
391 int unit_synthesize_cgroup_empty_event(Unit *u);
392
393 int unit_get_memory_available(Unit *u, uint64_t *ret);
394 int unit_get_memory_current(Unit *u, uint64_t *ret);
395 int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uint64_t *ret);
396 int unit_get_tasks_current(Unit *u, uint64_t *ret);
397 int unit_get_cpu_usage(Unit *u, nsec_t *ret);
398 int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret);
399 int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret);
400 int unit_get_effective_limit(Unit *u, CGroupLimitType type, uint64_t *ret);
401
402 int unit_reset_cpu_accounting(Unit *u);
403 void unit_reset_memory_accounting_last(Unit *u);
404 int unit_reset_ip_accounting(Unit *u);
405 void unit_reset_io_accounting_last(Unit *u);
406 int unit_reset_io_accounting(Unit *u);
407 int unit_reset_accounting(Unit *u);
408
/* Evaluates to the member 'name' of the CGroupContext that unit_get_cgroup_context(u) returns, or
 * to false if that call returns NULL. Written as a statement expression so that u is evaluated
 * only once; note that the temporary is called 'cc', so 'u' must not be an expression that itself
 * refers to a variable of that name. */
#define UNIT_CGROUP_BOOL(u, name)                                       \
        ({                                                              \
                CGroupContext *cc = unit_get_cgroup_context(u);         \
                cc ? cc->name : false;                                  \
        })
414
415 bool manager_owns_host_root_cgroup(Manager *m);
416 bool unit_has_host_root_cgroup(Unit *u);
417
418 bool unit_has_startup_cgroup_constraints(Unit *u);
419
420 int manager_notify_cgroup_empty(Manager *m, const char *group);
421
422 void unit_invalidate_cgroup(Unit *u, CGroupMask m);
423 void unit_invalidate_cgroup_bpf(Unit *u);
424
425 void manager_invalidate_startup_units(Manager *m);
426
427 const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_;
428 CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_;
429
430 void unit_cgroup_catchup(Unit *u);
431
432 bool unit_cgroup_delegate(Unit *u);
433
434 int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name);
435 int unit_cgroup_freezer_action(Unit *u, FreezerAction action);
436
437 const char* freezer_action_to_string(FreezerAction a) _const_;
438 FreezerAction freezer_action_from_string(const char *s) _pure_;
439
440 const char* cgroup_pressure_watch_to_string(CGroupPressureWatch a) _const_;
441 CGroupPressureWatch cgroup_pressure_watch_from_string(const char *s) _pure_;
442
443 const char *cgroup_device_permissions_to_string(CGroupDevicePermissions p) _const_;
444 CGroupDevicePermissions cgroup_device_permissions_from_string(const char *s) _pure_;
445
446 const char* cgroup_ip_accounting_metric_to_string(CGroupIPAccountingMetric m) _const_;
447 CGroupIPAccountingMetric cgroup_ip_accounting_metric_from_string(const char *s) _pure_;
448
449 const char* cgroup_io_accounting_metric_to_string(CGroupIOAccountingMetric m) _const_;
450 CGroupIOAccountingMetric cgroup_io_accounting_metric_from_string(const char *s) _pure_;
451
452 const char* cgroup_limit_type_to_string(CGroupLimitType m) _const_;
453 CGroupLimitType cgroup_limit_type_from_string(const char *s) _pure_;
454
455 const char* cgroup_memory_accounting_metric_to_string(CGroupMemoryAccountingMetric m) _const_;
456 CGroupMemoryAccountingMetric cgroup_memory_accounting_metric_from_string(const char *s) _pure_;