/* git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/cgroup.h
 * cgroup: add support for memory.swap.current
 * [thirdparty/systemd.git] / src / core / cgroup.h */
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 #pragma once
3
4 #include <stdbool.h>
5
6 #include "bpf-lsm.h"
7 #include "cgroup-util.h"
8 #include "cpu-set-util.h"
9 #include "firewall-util.h"
10 #include "list.h"
11 #include "pidref.h"
12 #include "time-util.h"
13
/* A tasks limit expressed as a pair of numbers, to be turned into a single number by
 * cgroup_tasks_max_resolve(). */
typedef struct CGroupTasksMax {
        /* If scale == 0, just use value; otherwise, value / scale.
         * See cgroup_tasks_max_resolve(). */
        uint64_t value;
        uint64_t scale;
} CGroupTasksMax;

/* The "nothing configured" marker: value is UINT64_MAX and scale is 0. */
#define CGROUP_TASKS_MAX_UNSET ((CGroupTasksMax) { .value = UINT64_MAX, .scale = 0 })

/* Returns false only if *tasks_max matches CGROUP_TASKS_MAX_UNSET in both fields, true otherwise.
 * The pointer is dereferenced unconditionally, hence must not be NULL. */
static inline bool cgroup_tasks_max_isset(const CGroupTasksMax *tasks_max) {
        return tasks_max->value != UINT64_MAX || tasks_max->scale != 0;
}

uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax *tasks_max);
28
/* Forward declarations, so that the structures below can refer to each other by their short names
 * before their bodies are defined. */
typedef struct CGroupContext CGroupContext;
typedef struct CGroupDeviceAllow CGroupDeviceAllow;
typedef struct CGroupIODeviceWeight CGroupIODeviceWeight;
typedef struct CGroupIODeviceLimit CGroupIODeviceLimit;
typedef struct CGroupIODeviceLatency CGroupIODeviceLatency;
typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight;
typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth;
typedef struct CGroupBPFForeignProgram CGroupBPFForeignProgram;
typedef struct CGroupSocketBindItem CGroupSocketBindItem;
38
/* Selects how the list of allowed devices is interpreted. */
typedef enum CGroupDevicePolicy {
        /* When devices listed, will allow those, plus built-in ones, if none are listed will allow
         * everything. */
        CGROUP_DEVICE_POLICY_AUTO,

        /* Everything forbidden, except built-in ones and listed ones. */
        CGROUP_DEVICE_POLICY_CLOSED,

        /* Everything forbidden, except for the listed devices */
        CGROUP_DEVICE_POLICY_STRICT,

        /* Number of valid policies, followed by the negative-errno "invalid" marker */
        _CGROUP_DEVICE_POLICY_MAX,
        _CGROUP_DEVICE_POLICY_INVALID = -EINVAL,
} CGroupDevicePolicy;
53
/* The two operations accepted by unit_cgroup_freezer_action(). */
typedef enum FreezerAction {
        FREEZER_FREEZE,
        FREEZER_THAW,

        /* Number of valid actions, followed by the negative-errno "invalid" marker */
        _FREEZER_ACTION_MAX,
        _FREEZER_ACTION_INVALID = -EINVAL,
} FreezerAction;
61
/* Bit mask of access types that may be combined for a device. */
typedef enum CGroupDevicePermissions {
        /* We reuse the same bit meanings the kernel's BPF_DEVCG_ACC_xyz definitions use */
        CGROUP_DEVICE_MKNOD                = 1 << 0,
        CGROUP_DEVICE_READ                 = 1 << 1,
        CGROUP_DEVICE_WRITE                = 1 << 2,
        /* First unused bit; subtracting one yields the mask with all valid bits set */
        _CGROUP_DEVICE_PERMISSIONS_MAX     = 1 << 3,
        _CGROUP_DEVICE_PERMISSIONS_ALL     = _CGROUP_DEVICE_PERMISSIONS_MAX - 1,
        _CGROUP_DEVICE_PERMISSIONS_INVALID = -EINVAL,
} CGroupDevicePermissions;
71
72 struct CGroupDeviceAllow {
73 LIST_FIELDS(CGroupDeviceAllow, device_allow);
74 char *path;
75 CGroupDevicePermissions permissions;
76 };
77
78 struct CGroupIODeviceWeight {
79 LIST_FIELDS(CGroupIODeviceWeight, device_weights);
80 char *path;
81 uint64_t weight;
82 };
83
84 struct CGroupIODeviceLimit {
85 LIST_FIELDS(CGroupIODeviceLimit, device_limits);
86 char *path;
87 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
88 };
89
90 struct CGroupIODeviceLatency {
91 LIST_FIELDS(CGroupIODeviceLatency, device_latencies);
92 char *path;
93 usec_t target_usec;
94 };
95
96 struct CGroupBlockIODeviceWeight {
97 LIST_FIELDS(CGroupBlockIODeviceWeight, device_weights);
98 char *path;
99 uint64_t weight;
100 };
101
102 struct CGroupBlockIODeviceBandwidth {
103 LIST_FIELDS(CGroupBlockIODeviceBandwidth, device_bandwidths);
104 char *path;
105 uint64_t rbps;
106 uint64_t wbps;
107 };
108
109 struct CGroupBPFForeignProgram {
110 LIST_FIELDS(CGroupBPFForeignProgram, programs);
111 uint32_t attach_type;
112 char *bpffs_path;
113 };
114
115 struct CGroupSocketBindItem {
116 LIST_FIELDS(CGroupSocketBindItem, socket_bind_items);
117 int address_family;
118 int ip_protocol;
119 uint16_t nr_ports;
120 uint16_t port_min;
121 };
122
/* Mode of the memory pressure watch logic, see cgroup_context_want_memory_pressure(). */
typedef enum CGroupPressureWatch {
        CGROUP_PRESSURE_WATCH_OFF,      /* → tells the service payload explicitly not to watch for memory pressure */
        CGROUP_PRESSURE_WATCH_AUTO,     /* → on if memory accounting is on anyway for the unit, otherwise off */
        CGROUP_PRESSURE_WATCH_ON,
        CGROUP_PRESSURE_WATCH_SKIP,     /* → doesn't set up memory pressure watch, but also doesn't explicitly tell payload to avoid it */
        _CGROUP_PRESSURE_WATCH_MAX,
        _CGROUP_PRESSURE_WATCH_INVALID = -EINVAL,
} CGroupPressureWatch;
131
132 struct CGroupContext {
133 bool cpu_accounting;
134 bool io_accounting;
135 bool blockio_accounting;
136 bool memory_accounting;
137 bool tasks_accounting;
138 bool ip_accounting;
139
140 /* Configures the memory.oom.group attribute (on unified) */
141 bool memory_oom_group;
142
143 bool delegate;
144 CGroupMask delegate_controllers;
145 CGroupMask disable_controllers;
146 char *delegate_subgroup;
147
148 /* For unified hierarchy */
149 uint64_t cpu_weight;
150 uint64_t startup_cpu_weight;
151 usec_t cpu_quota_per_sec_usec;
152 usec_t cpu_quota_period_usec;
153
154 CPUSet cpuset_cpus;
155 CPUSet startup_cpuset_cpus;
156 CPUSet cpuset_mems;
157 CPUSet startup_cpuset_mems;
158
159 uint64_t io_weight;
160 uint64_t startup_io_weight;
161 LIST_HEAD(CGroupIODeviceWeight, io_device_weights);
162 LIST_HEAD(CGroupIODeviceLimit, io_device_limits);
163 LIST_HEAD(CGroupIODeviceLatency, io_device_latencies);
164
165 uint64_t default_memory_min;
166 uint64_t default_memory_low;
167 uint64_t default_startup_memory_low;
168 uint64_t memory_min;
169 uint64_t memory_low;
170 uint64_t startup_memory_low;
171 uint64_t memory_high;
172 uint64_t startup_memory_high;
173 uint64_t memory_max;
174 uint64_t startup_memory_max;
175 uint64_t memory_swap_max;
176 uint64_t startup_memory_swap_max;
177 uint64_t memory_zswap_max;
178 uint64_t startup_memory_zswap_max;
179
180 bool default_memory_min_set:1;
181 bool default_memory_low_set:1;
182 bool default_startup_memory_low_set:1;
183 bool memory_min_set:1;
184 bool memory_low_set:1;
185 bool startup_memory_low_set:1;
186 bool startup_memory_high_set:1;
187 bool startup_memory_max_set:1;
188 bool startup_memory_swap_max_set:1;
189 bool startup_memory_zswap_max_set:1;
190
191 Set *ip_address_allow;
192 Set *ip_address_deny;
193 /* These two flags indicate that redundant entries have been removed from
194 * ip_address_allow/ip_address_deny, i.e. in_addr_prefixes_reduce() has already been called. */
195 bool ip_address_allow_reduced;
196 bool ip_address_deny_reduced;
197
198 char **ip_filters_ingress;
199 char **ip_filters_egress;
200 LIST_HEAD(CGroupBPFForeignProgram, bpf_foreign_programs);
201
202 Set *restrict_network_interfaces;
203 bool restrict_network_interfaces_is_allow_list;
204
205 /* For legacy hierarchies */
206 uint64_t cpu_shares;
207 uint64_t startup_cpu_shares;
208
209 uint64_t blockio_weight;
210 uint64_t startup_blockio_weight;
211 LIST_HEAD(CGroupBlockIODeviceWeight, blockio_device_weights);
212 LIST_HEAD(CGroupBlockIODeviceBandwidth, blockio_device_bandwidths);
213
214 uint64_t memory_limit;
215
216 CGroupDevicePolicy device_policy;
217 LIST_HEAD(CGroupDeviceAllow, device_allow);
218
219 LIST_HEAD(CGroupSocketBindItem, socket_bind_allow);
220 LIST_HEAD(CGroupSocketBindItem, socket_bind_deny);
221
222 /* Common */
223 CGroupTasksMax tasks_max;
224
225 /* Settings for systemd-oomd */
226 ManagedOOMMode moom_swap;
227 ManagedOOMMode moom_mem_pressure;
228 uint32_t moom_mem_pressure_limit; /* Normalized to 2^32-1 == 100% */
229 ManagedOOMPreference moom_preference;
230
231 /* Memory pressure logic */
232 CGroupPressureWatch memory_pressure_watch;
233 usec_t memory_pressure_threshold_usec;
234 /* NB: For now we don't make the period configurable, not the type, nor do we allow multiple
235 * triggers, nor triggers for non-memory pressure. We might add that later. */
236
237 NFTSetContext nft_set_context;
238
239 /* Forward coredumps for processes that crash within this cgroup.
240 * Requires 'delegate' to also be true. */
241 bool coredump_receive;
242 };
243
/* Used when querying IP accounting data, i.e. the 'metric' argument of unit_get_ip_accounting() */
typedef enum CGroupIPAccountingMetric {
        CGROUP_IP_INGRESS_BYTES,
        CGROUP_IP_INGRESS_PACKETS,
        CGROUP_IP_EGRESS_BYTES,
        CGROUP_IP_EGRESS_PACKETS,
        _CGROUP_IP_ACCOUNTING_METRIC_MAX,
        _CGROUP_IP_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupIPAccountingMetric;
253
/* Used when querying IO accounting data, i.e. the 'metric' argument of unit_get_io_accounting() */
typedef enum CGroupIOAccountingMetric {
        CGROUP_IO_READ_BYTES,
        CGROUP_IO_WRITE_BYTES,
        CGROUP_IO_READ_OPERATIONS,
        CGROUP_IO_WRITE_OPERATIONS,
        _CGROUP_IO_ACCOUNTING_METRIC_MAX,
        _CGROUP_IO_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupIOAccountingMetric;
263
/* Forward declarations only; the definitions of these types are not part of this header.
 * Note that forward-declaring an enum relies on a compiler extension. */
typedef struct Unit Unit;
typedef struct Manager Manager;
typedef enum ManagerState ManagerState;
267
268 uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state);
269
270 usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period);
271
272 void cgroup_context_init(CGroupContext *c);
273 void cgroup_context_done(CGroupContext *c);
274 void cgroup_context_dump(Unit *u, FILE* f, const char *prefix);
275 void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f);
276 void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f);
277
278 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a);
279 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w);
280 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l);
281 void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l);
282 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w);
283 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b);
284 void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p);
285 void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head);
286
287 static inline bool cgroup_context_want_memory_pressure(const CGroupContext *c) {
288 assert(c);
289
290 return c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_ON ||
291 (c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_AUTO && c->memory_accounting);
292 }
293
294 int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
295 int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
296 int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path);
297
298 void unit_modify_nft_set(Unit *u, bool add);
299
300 CGroupMask unit_get_own_mask(Unit *u);
301 CGroupMask unit_get_delegate_mask(Unit *u);
302 CGroupMask unit_get_members_mask(Unit *u);
303 CGroupMask unit_get_siblings_mask(Unit *u);
304 CGroupMask unit_get_ancestor_disable_mask(Unit *u);
305
306 CGroupMask unit_get_target_mask(Unit *u);
307 CGroupMask unit_get_enable_mask(Unit *u);
308
309 void unit_invalidate_cgroup_members_masks(Unit *u);
310
311 void unit_add_family_to_cgroup_realize_queue(Unit *u);
312
313 const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask);
314 int unit_default_cgroup_path(const Unit *u, char **ret);
315 int unit_set_cgroup_path(Unit *u, const char *path);
316 int unit_pick_cgroup_path(Unit *u);
317
318 int unit_realize_cgroup(Unit *u);
319 void unit_prune_cgroup(Unit *u);
320 int unit_watch_cgroup(Unit *u);
321 int unit_watch_cgroup_memory(Unit *u);
322 void unit_add_to_cgroup_realize_queue(Unit *u);
323
324 void unit_release_cgroup(Unit *u);
325 /* Releases the cgroup only if it is recursively empty.
326 * Returns true if the cgroup was released, false otherwise. */
327 bool unit_maybe_release_cgroup(Unit *u);
328
329 void unit_add_to_cgroup_empty_queue(Unit *u);
330 int unit_check_oomd_kill(Unit *u);
331 int unit_check_oom(Unit *u);
332
333 int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path);
334
335 int manager_setup_cgroup(Manager *m);
336 void manager_shutdown_cgroup(Manager *m, bool delete);
337
338 unsigned manager_dispatch_cgroup_realize_queue(Manager *m);
339
340 Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup);
341 Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid);
342 Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid);
343 Unit* manager_get_unit_by_pidref(Manager *m, PidRef *pid);
344 Unit* manager_get_unit_by_pid(Manager *m, pid_t pid);
345
346 uint64_t unit_get_ancestor_memory_min(Unit *u);
347 uint64_t unit_get_ancestor_memory_low(Unit *u);
348 uint64_t unit_get_ancestor_startup_memory_low(Unit *u);
349
350 int unit_search_main_pid(Unit *u, PidRef *ret);
351 int unit_watch_all_pids(Unit *u);
352
353 int unit_synthesize_cgroup_empty_event(Unit *u);
354
355 int unit_get_memory_current(Unit *u, uint64_t *ret);
356 int unit_get_memory_peak(Unit *u, uint64_t *ret);
357 int unit_get_memory_swap_current(Unit *u, uint64_t *ret);
358 int unit_get_memory_swap_peak(Unit *u, uint64_t *ret);
359 int unit_get_memory_available(Unit *u, uint64_t *ret);
360 int unit_get_tasks_current(Unit *u, uint64_t *ret);
361 int unit_get_cpu_usage(Unit *u, nsec_t *ret);
362 int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret);
363 int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret);
364
365 int unit_reset_cpu_accounting(Unit *u);
366 int unit_reset_ip_accounting(Unit *u);
367 int unit_reset_io_accounting(Unit *u);
368 int unit_reset_accounting(Unit *u);
369
/* Evaluates to the boolean CGroupContext member 'name' of unit 'u', or to false if
 * unit_get_cgroup_context(u) returns NULL. 'u' is evaluated exactly once. The temporary carries
 * a prefixed name, so that an argument expression which itself mentions a variable called 'cc'
 * is not captured by the macro's own local (which would initialize the pointer from itself). */
#define UNIT_CGROUP_BOOL(u, name)                                               \
        ({                                                                      \
                CGroupContext *_ucb_cc = unit_get_cgroup_context(u);            \
                _ucb_cc ? _ucb_cc->name : false;                                \
        })
375
376 bool manager_owns_host_root_cgroup(Manager *m);
377 bool unit_has_host_root_cgroup(Unit *u);
378
379 bool unit_has_startup_cgroup_constraints(Unit *u);
380
381 int manager_notify_cgroup_empty(Manager *m, const char *group);
382
383 void unit_invalidate_cgroup(Unit *u, CGroupMask m);
384 void unit_invalidate_cgroup_bpf(Unit *u);
385
386 void manager_invalidate_startup_units(Manager *m);
387
388 const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_;
389 CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_;
390
391 void unit_cgroup_catchup(Unit *u);
392
393 bool unit_cgroup_delegate(Unit *u);
394
395 int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name);
396 int unit_cgroup_freezer_action(Unit *u, FreezerAction action);
397
398 const char* freezer_action_to_string(FreezerAction a) _const_;
399 FreezerAction freezer_action_from_string(const char *s) _pure_;
400
401 const char* cgroup_pressure_watch_to_string(CGroupPressureWatch a) _const_;
402 CGroupPressureWatch cgroup_pressure_watch_from_string(const char *s) _pure_;
403
404 const char *cgroup_device_permissions_to_string(CGroupDevicePermissions p) _const_;
405 CGroupDevicePermissions cgroup_device_permissions_from_string(const char *s) _pure_;
406
407 const char* cgroup_ip_accounting_metric_to_string(CGroupIPAccountingMetric m) _const_;
408 CGroupIPAccountingMetric cgroup_ip_accounting_metric_from_string(const char *s) _pure_;
409
410 const char* cgroup_io_accounting_metric_to_string(CGroupIOAccountingMetric m) _const_;
411 CGroupIOAccountingMetric cgroup_io_accounting_metric_from_string(const char *s) _pure_;