/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once

#include <stdbool.h>

#include "bpf-lsm.h"
#include "cgroup-util.h"
#include "cpu-set-util.h"
#include "firewall-util.h"
#include "list.h"
#include "pidref.h"
#include "time-util.h"

typedef struct TasksMax {
        /* If scale == 0, just use value; otherwise, value / scale.
         * See tasks_max_resolve(). */
        uint64_t value;
        uint64_t scale;
} TasksMax;

#define TASKS_MAX_UNSET ((TasksMax) { .value = UINT64_MAX, .scale = 0 })

static inline bool tasks_max_isset(const TasksMax *tasks_max) {
        return tasks_max->value != UINT64_MAX || tasks_max->scale != 0;
}

uint64_t tasks_max_resolve(const TasksMax *tasks_max);

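/* Illustrative sketch, not part of the API: going by the comment in TasksMax above, a fractional
 * limit is presumably stored as value/scale (e.g. "15%" as value = 15, scale = 100) and turned
 * into an absolute task count by tasks_max_resolve(), while scale == 0 means an absolute value:
 *
 *         TasksMax t = { .value = 15, .scale = 100 };
 *         uint64_t limit = tasks_max_isset(&t) ? tasks_max_resolve(&t) : UINT64_MAX;
 *
 * The percentage interpretation is an assumption; only the value/scale division is documented
 * here. */
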
typedef struct CGroupContext CGroupContext;
typedef struct CGroupDeviceAllow CGroupDeviceAllow;
typedef struct CGroupIODeviceWeight CGroupIODeviceWeight;
typedef struct CGroupIODeviceLimit CGroupIODeviceLimit;
typedef struct CGroupIODeviceLatency CGroupIODeviceLatency;
typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight;
typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth;
typedef struct CGroupBPFForeignProgram CGroupBPFForeignProgram;
typedef struct CGroupSocketBindItem CGroupSocketBindItem;

typedef enum CGroupDevicePolicy {
        /* When devices are listed, allow those plus the built-in ones; if none are listed, allow
         * everything. */
        CGROUP_DEVICE_POLICY_AUTO,

        /* Everything forbidden, except built-in ones and listed ones. */
        CGROUP_DEVICE_POLICY_CLOSED,

        /* Everything forbidden, except for the listed devices. */
        CGROUP_DEVICE_POLICY_STRICT,

        _CGROUP_DEVICE_POLICY_MAX,
        _CGROUP_DEVICE_POLICY_INVALID = -EINVAL,
} CGroupDevicePolicy;

typedef enum FreezerAction {
        FREEZER_FREEZE,
        FREEZER_THAW,

        _FREEZER_ACTION_MAX,
        _FREEZER_ACTION_INVALID = -EINVAL,
} FreezerAction;

struct CGroupDeviceAllow {
        LIST_FIELDS(CGroupDeviceAllow, device_allow);
        char *path;
        bool r:1; /* read */
        bool w:1; /* write */
        bool m:1; /* mknod */
};

struct CGroupIODeviceWeight {
        LIST_FIELDS(CGroupIODeviceWeight, device_weights);
        char *path;
        uint64_t weight;
};

struct CGroupIODeviceLimit {
        LIST_FIELDS(CGroupIODeviceLimit, device_limits);
        char *path;
        uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
};

struct CGroupIODeviceLatency {
        LIST_FIELDS(CGroupIODeviceLatency, device_latencies);
        char *path;
        usec_t target_usec;
};

struct CGroupBlockIODeviceWeight {
        LIST_FIELDS(CGroupBlockIODeviceWeight, device_weights);
        char *path;
        uint64_t weight;
};

struct CGroupBlockIODeviceBandwidth {
        LIST_FIELDS(CGroupBlockIODeviceBandwidth, device_bandwidths);
        char *path;
        uint64_t rbps;
        uint64_t wbps;
};

struct CGroupBPFForeignProgram {
        LIST_FIELDS(CGroupBPFForeignProgram, programs);
        uint32_t attach_type;
        char *bpffs_path;
};

struct CGroupSocketBindItem {
        LIST_FIELDS(CGroupSocketBindItem, socket_bind_items);
        int address_family;
        int ip_protocol;
        uint16_t nr_ports;
        uint16_t port_min;
};

typedef enum CGroupPressureWatch {
        CGROUP_PRESSURE_WATCH_OFF,      /* → tells the service payload explicitly not to watch for memory pressure */
        CGROUP_PRESSURE_WATCH_AUTO,     /* → on if memory accounting is on anyway for the unit, otherwise off */
        CGROUP_PRESSURE_WATCH_ON,
        CGROUP_PRESSURE_WATCH_SKIP,     /* → doesn't set up a memory pressure watch, but also doesn't explicitly tell the payload to avoid it */
        _CGROUP_PRESSURE_WATCH_MAX,
        _CGROUP_PRESSURE_WATCH_INVALID = -EINVAL,
} CGroupPressureWatch;

struct CGroupContext {
        bool cpu_accounting;
        bool io_accounting;
        bool blockio_accounting;
        bool memory_accounting;
        bool tasks_accounting;
        bool ip_accounting;

        /* Configures the memory.oom.group attribute (on unified) */
        bool memory_oom_group;

        bool delegate;
        CGroupMask delegate_controllers;
        CGroupMask disable_controllers;
        char *delegate_subgroup;

        /* For unified hierarchy */
        uint64_t cpu_weight;
        uint64_t startup_cpu_weight;
        usec_t cpu_quota_per_sec_usec;
        usec_t cpu_quota_period_usec;

        CPUSet cpuset_cpus;
        CPUSet startup_cpuset_cpus;
        CPUSet cpuset_mems;
        CPUSet startup_cpuset_mems;

        uint64_t io_weight;
        uint64_t startup_io_weight;
        LIST_HEAD(CGroupIODeviceWeight, io_device_weights);
        LIST_HEAD(CGroupIODeviceLimit, io_device_limits);
        LIST_HEAD(CGroupIODeviceLatency, io_device_latencies);

        uint64_t default_memory_min;
        uint64_t default_memory_low;
        uint64_t default_startup_memory_low;
        uint64_t memory_min;
        uint64_t memory_low;
        uint64_t startup_memory_low;
        uint64_t memory_high;
        uint64_t startup_memory_high;
        uint64_t memory_max;
        uint64_t startup_memory_max;
        uint64_t memory_swap_max;
        uint64_t startup_memory_swap_max;
        uint64_t memory_zswap_max;
        uint64_t startup_memory_zswap_max;

        bool default_memory_min_set:1;
        bool default_memory_low_set:1;
        bool default_startup_memory_low_set:1;
        bool memory_min_set:1;
        bool memory_low_set:1;
        bool startup_memory_low_set:1;
        bool startup_memory_high_set:1;
        bool startup_memory_max_set:1;
        bool startup_memory_swap_max_set:1;
        bool startup_memory_zswap_max_set:1;

        Set *ip_address_allow;
        Set *ip_address_deny;
        /* These two flags indicate that redundant entries have been removed from
         * ip_address_allow/ip_address_deny, i.e. in_addr_prefixes_reduce() has already been called. */
        bool ip_address_allow_reduced;
        bool ip_address_deny_reduced;

        char **ip_filters_ingress;
        char **ip_filters_egress;
        LIST_HEAD(CGroupBPFForeignProgram, bpf_foreign_programs);

        Set *restrict_network_interfaces;
        bool restrict_network_interfaces_is_allow_list;

        /* For legacy hierarchies */
        uint64_t cpu_shares;
        uint64_t startup_cpu_shares;

        uint64_t blockio_weight;
        uint64_t startup_blockio_weight;
        LIST_HEAD(CGroupBlockIODeviceWeight, blockio_device_weights);
        LIST_HEAD(CGroupBlockIODeviceBandwidth, blockio_device_bandwidths);

        uint64_t memory_limit;

        CGroupDevicePolicy device_policy;
        LIST_HEAD(CGroupDeviceAllow, device_allow);

        LIST_HEAD(CGroupSocketBindItem, socket_bind_allow);
        LIST_HEAD(CGroupSocketBindItem, socket_bind_deny);

        /* Common */
        TasksMax tasks_max;

        /* Settings for systemd-oomd */
        ManagedOOMMode moom_swap;
        ManagedOOMMode moom_mem_pressure;
        uint32_t moom_mem_pressure_limit; /* Normalized to 2^32-1 == 100% */
        ManagedOOMPreference moom_preference;

        /* Memory pressure logic */
        CGroupPressureWatch memory_pressure_watch;
        usec_t memory_pressure_threshold_usec;
        /* NB: For now we don't make the period configurable, nor the type, nor do we allow multiple
         * triggers, nor triggers for non-memory pressure. We might add that later. */

        NFTSetContext nft_set_context;
};

/* Used when querying IP accounting data */
typedef enum CGroupIPAccountingMetric {
        CGROUP_IP_INGRESS_BYTES,
        CGROUP_IP_INGRESS_PACKETS,
        CGROUP_IP_EGRESS_BYTES,
        CGROUP_IP_EGRESS_PACKETS,
        _CGROUP_IP_ACCOUNTING_METRIC_MAX,
        _CGROUP_IP_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupIPAccountingMetric;

/* Used when querying IO accounting data */
typedef enum CGroupIOAccountingMetric {
        CGROUP_IO_READ_BYTES,
        CGROUP_IO_WRITE_BYTES,
        CGROUP_IO_READ_OPERATIONS,
        CGROUP_IO_WRITE_OPERATIONS,
        _CGROUP_IO_ACCOUNTING_METRIC_MAX,
        _CGROUP_IO_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupIOAccountingMetric;

typedef struct Unit Unit;
typedef struct Manager Manager;

usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period);

void cgroup_context_init(CGroupContext *c);
void cgroup_context_done(CGroupContext *c);
void cgroup_context_dump(Unit *u, FILE* f, const char *prefix);
void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f);

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a);
void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w);
void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l);
void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l);
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w);
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b);
void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p);
void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head);

static inline bool cgroup_context_want_memory_pressure(const CGroupContext *c) {
        assert(c);

        return c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_ON ||
                (c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_AUTO && c->memory_accounting);
}
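
/* Illustrative sketch, not part of the API: per the helper above, CGROUP_PRESSURE_WATCH_AUTO
 * follows memory accounting, so
 *
 *         c->memory_accounting = true;
 *         c->memory_pressure_watch = CGROUP_PRESSURE_WATCH_AUTO;
 *         assert(cgroup_context_want_memory_pressure(c));
 *
 * holds, while CGROUP_PRESSURE_WATCH_OFF and CGROUP_PRESSURE_WATCH_SKIP never request a watch
 * via this helper. */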

int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode);
int cgroup_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path);

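/* Illustrative sketch, not part of the API: under CGROUP_DEVICE_POLICY_STRICT only explicitly
 * listed devices stay accessible, so a context that still permits reading and writing /dev/null
 * might be set up roughly like this (the "rw" mode string is an assumption, mirroring the r/w/m
 * convention of DeviceAllow=):
 *
 *         c->device_policy = CGROUP_DEVICE_POLICY_STRICT;
 *         r = cgroup_add_device_allow(c, "/dev/null", "rw");
 *         if (r < 0)
 *                 return r;
 */
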
void cgroup_oomd_xattr_apply(Unit *u, const char *cgroup_path);
int cgroup_log_xattr_apply(Unit *u, const char *cgroup_path);

void cgroup_modify_nft_set(Unit *u, bool add);

CGroupMask unit_get_own_mask(Unit *u);
CGroupMask unit_get_delegate_mask(Unit *u);
CGroupMask unit_get_members_mask(Unit *u);
CGroupMask unit_get_siblings_mask(Unit *u);
CGroupMask unit_get_ancestor_disable_mask(Unit *u);

CGroupMask unit_get_target_mask(Unit *u);
CGroupMask unit_get_enable_mask(Unit *u);

void unit_invalidate_cgroup_members_masks(Unit *u);

void unit_add_family_to_cgroup_realize_queue(Unit *u);

const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask);
int unit_default_cgroup_path(const Unit *u, char **ret);
int unit_set_cgroup_path(Unit *u, const char *path);
int unit_pick_cgroup_path(Unit *u);

int unit_realize_cgroup(Unit *u);
void unit_prune_cgroup(Unit *u);
int unit_watch_cgroup(Unit *u);
int unit_watch_cgroup_memory(Unit *u);
void unit_add_to_cgroup_realize_queue(Unit *u);

void unit_release_cgroup(Unit *u);
/* Releases the cgroup only if it is recursively empty.
 * Returns true if the cgroup was released, false otherwise. */
bool unit_maybe_release_cgroup(Unit *u);

void unit_add_to_cgroup_empty_queue(Unit *u);
int unit_check_oomd_kill(Unit *u);
int unit_check_oom(Unit *u);

int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path);

int manager_setup_cgroup(Manager *m);
void manager_shutdown_cgroup(Manager *m, bool delete);

unsigned manager_dispatch_cgroup_realize_queue(Manager *m);

Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup);
Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid);
Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid);
Unit* manager_get_unit_by_pidref(Manager *m, PidRef *pid);
Unit* manager_get_unit_by_pid(Manager *m, pid_t pid);

uint64_t unit_get_ancestor_memory_min(Unit *u);
uint64_t unit_get_ancestor_memory_low(Unit *u);
uint64_t unit_get_ancestor_startup_memory_low(Unit *u);

int unit_search_main_pid(Unit *u, PidRef *ret);
int unit_watch_all_pids(Unit *u);

int unit_synthesize_cgroup_empty_event(Unit *u);

int unit_get_memory_current(Unit *u, uint64_t *ret);
int unit_get_memory_available(Unit *u, uint64_t *ret);
int unit_get_tasks_current(Unit *u, uint64_t *ret);
int unit_get_cpu_usage(Unit *u, nsec_t *ret);
int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret);
int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret);

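/* Illustrative sketch, not part of the API: the getters above fill in *ret and (presumably, in
 * keeping with the rest of the manager) return a negative errno-style value on failure, so all
 * IP counters of a unit could be read with something like:
 *
 *         for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) {
 *                 uint64_t v;
 *                 if (unit_get_ip_accounting(u, m, &v) >= 0)
 *                         log_debug("ip metric %i: %" PRIu64, (int) m, v);
 *         }
 */
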
int unit_reset_cpu_accounting(Unit *u);
int unit_reset_ip_accounting(Unit *u);
int unit_reset_io_accounting(Unit *u);
int unit_reset_accounting(Unit *u);

#define UNIT_CGROUP_BOOL(u, name)                                       \
        ({                                                              \
                CGroupContext *cc = unit_get_cgroup_context(u);         \
                cc ? cc->name : false;                                  \
        })
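
/* Illustrative usage, not part of the API: the macro above evaluates to false when the unit has
 * no cgroup context, so callers can test a setting without a NULL check:
 *
 *         if (UNIT_CGROUP_BOOL(u, memory_accounting))
 *                 ... account memory for this unit ...
 */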

bool manager_owns_host_root_cgroup(Manager *m);
bool unit_has_host_root_cgroup(Unit *u);

bool unit_has_startup_cgroup_constraints(Unit *u);

int manager_notify_cgroup_empty(Manager *m, const char *group);

void unit_invalidate_cgroup(Unit *u, CGroupMask m);
void unit_invalidate_cgroup_bpf(Unit *u);

void manager_invalidate_startup_units(Manager *m);

const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_;
CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_;

void unit_cgroup_catchup(Unit *u);

bool unit_cgroup_delegate(Unit *u);

int compare_job_priority(const void *a, const void *b);

int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name);
int unit_cgroup_freezer_action(Unit *u, FreezerAction action);

const char* freezer_action_to_string(FreezerAction a) _const_;
FreezerAction freezer_action_from_string(const char *s) _pure_;

const char* cgroup_pressure_watch_to_string(CGroupPressureWatch a) _const_;
CGroupPressureWatch cgroup_pressure_watch_from_string(const char *s) _pure_;