/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

#include "sd-bus.h"
#include "sd-messages.h"

#include "af-list.h"
#include "alloc-util.h"
#include "blockdev-util.h"
#include "bpf-devices.h"
#include "bpf-firewall.h"
#include "bpf-foreign.h"
#include "bpf-program.h"
#include "bpf-restrict-ifaces.h"
#include "bpf-socket-bind.h"
#include "btrfs-util.h"
#include "bus-error.h"
#include "bus-locator.h"
#include "cgroup.h"
#include "cgroup-setup.h"
#include "cgroup-util.h"
#include "devnum-util.h"
#include "errno-util.h"
#include "extract-word.h"
#include "fd-util.h"
#include "fdset.h"
#include "fileio.h"
#include "firewall-util.h"
#include "in-addr-prefix-util.h"
#include "inotify-util.h"
#include "ip-protocol-list.h"
#include "limits-util.h"
#include "manager.h"
#include "nulstr-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "percent-util.h"
#include "pidref.h"
#include "process-util.h"
#include "procfs-util.h"
#include "serialize.h"
#include "set.h"
#include "special.h"
#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strv.h"
#include "virt.h"

#if BPF_FRAMEWORK
#include "bpf-dlopen.h"
#include "bpf-link.h"
#include "bpf-restrict-fs.h"
#include "bpf/restrict_fs/restrict-fs-skel.h"
#endif

#define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

/* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
 * problems we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which want to mask
 * out specific attributes from us. */
#define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(ABS(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING)

static void unit_remove_from_cgroup_empty_queue(Unit *u);

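/* Resolves a TasksMax= setting to an absolute number of tasks: a scale of zero means 'value' is an
 * absolute count, otherwise value/scale is applied as a fraction of the kernel's system-wide task
 * limit. */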
uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax *tasks_max) {
        if (tasks_max->scale == 0)
                return tasks_max->value;

        return system_tasks_max_scale(tasks_max->value, tasks_max->scale);
}

bool manager_owns_host_root_cgroup(Manager *m) {
        assert(m);

        /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
         * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there
         * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace, we instead just check if
         * we run in any kind of container virtualization. */

        if (MANAGER_IS_USER(m))
                return false;

        if (detect_container() > 0)
                return false;

        return empty_or_root(m->cgroup_root);
}

bool unit_has_startup_cgroup_constraints(Unit *u) {
        assert(u);

        /* Returns true if this unit has any directives which apply during
         * startup/shutdown phases. */

        CGroupContext *c;

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
               c->startup_cpuset_cpus.set ||
               c->startup_cpuset_mems.set ||
               c->startup_memory_high_set ||
               c->startup_memory_max_set ||
               c->startup_memory_swap_max_set ||
               c->startup_memory_zswap_max_set ||
               c->startup_memory_low_set;
}

bool unit_has_host_root_cgroup(const Unit *u) {
        assert(u);
        assert(u->manager);

        /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
         * the manager manages the root cgroup. */

        if (!manager_owns_host_root_cgroup(u->manager))
                return false;

        return unit_has_name(u, SPECIAL_ROOT_SLICE);
}

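/* Writes one attribute file in the unit's realized cgroup, logging failures at a level appropriate
 * for the error (see LOG_LEVEL_CGROUP_WRITE above). Returns -EOWNERDEAD if the unit has no realized
 * cgroup yet. */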
static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) {
        int r;

        assert(u);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return -EOWNERDEAD;

        r = cg_set_attribute(controller, crt->cgroup_path, attribute, value);
        if (r < 0)
                log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m",
                                    strna(attribute), empty_to_root(crt->cgroup_path), (int) strcspn(value, NEWLINE), value);

        return r;
}

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults. When initializing a bool member to 'true', make
         * sure to serialize in execute-serialize.c using serialize_bool() instead of
         * serialize_bool_elide(), as sd-executor will initialize here to 'true', but serialize_bool_elide()
         * skips serialization if the value is 'false' (as that's the common default), so if the value at
         * runtime is zero it would be lost after deserialization. Similarly, when initializing uint64_t and
         * other values, update/add a conditional serialization check. This is to minimize the amount of
         * serialized data that is sent to the sd-executor, so that there is less work to do on the default
         * cases. */

        *c = (CGroupContext) {
                .cpu_weight = CGROUP_WEIGHT_INVALID,
                .startup_cpu_weight = CGROUP_WEIGHT_INVALID,
                .cpu_quota_per_sec_usec = USEC_INFINITY,
                .cpu_quota_period_usec = USEC_INFINITY,

                .memory_high = CGROUP_LIMIT_MAX,
                .startup_memory_high = CGROUP_LIMIT_MAX,
                .memory_max = CGROUP_LIMIT_MAX,
                .startup_memory_max = CGROUP_LIMIT_MAX,
                .memory_swap_max = CGROUP_LIMIT_MAX,
                .startup_memory_swap_max = CGROUP_LIMIT_MAX,
                .memory_zswap_max = CGROUP_LIMIT_MAX,
                .startup_memory_zswap_max = CGROUP_LIMIT_MAX,

                .memory_zswap_writeback = true,

                .io_weight = CGROUP_WEIGHT_INVALID,
                .startup_io_weight = CGROUP_WEIGHT_INVALID,

                .tasks_max = CGROUP_TASKS_MAX_UNSET,

                .moom_swap = MANAGED_OOM_AUTO,
                .moom_mem_pressure = MANAGED_OOM_AUTO,
                .moom_preference = MANAGED_OOM_PREFERENCE_NONE,
                /* The default duration value in oomd.conf will be used when
                 * moom_mem_pressure_duration_usec is set to infinity. */
                .moom_mem_pressure_duration_usec = USEC_INFINITY,

                .memory_pressure_watch = _CGROUP_PRESSURE_WATCH_INVALID,
                .memory_pressure_threshold_usec = USEC_INFINITY,
        };
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->io_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) {
        assert(c);
        assert(l);

        LIST_REMOVE(device_latencies, c->io_device_latencies, l);
        free(l->path);
        free(l);
}

void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
        assert(c);
        assert(l);

        LIST_REMOVE(device_limits, c->io_device_limits, l);
        free(l->path);
        free(l);
}

void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p) {
        assert(c);
        assert(p);

        LIST_REMOVE(programs, c->bpf_foreign_programs, p);
        free(p->bpffs_path);
        free(p);
}

void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head) {
        assert(head);

        LIST_CLEAR(socket_bind_items, *head, free);
}

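/* Releases all dynamically allocated members of a CGroupContext, but not the context structure
 * itself. */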
void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->io_device_weights)
                cgroup_context_free_io_device_weight(c, c->io_device_weights);

        while (c->io_device_latencies)
                cgroup_context_free_io_device_latency(c, c->io_device_latencies);

        while (c->io_device_limits)
                cgroup_context_free_io_device_limit(c, c->io_device_limits);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);

        cgroup_context_remove_socket_bind(&c->socket_bind_allow);
        cgroup_context_remove_socket_bind(&c->socket_bind_deny);

        c->ip_address_allow = set_free(c->ip_address_allow);
        c->ip_address_deny = set_free(c->ip_address_deny);

        c->ip_filters_ingress = strv_free(c->ip_filters_ingress);
        c->ip_filters_egress = strv_free(c->ip_filters_egress);

        while (c->bpf_foreign_programs)
                cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs);

        c->restrict_network_interfaces = set_free(c->restrict_network_interfaces);

        cpu_set_done(&c->cpuset_cpus);
        cpu_set_done(&c->startup_cpuset_cpus);
        cpu_set_done(&c->cpuset_mems);
        cpu_set_done(&c->startup_cpuset_mems);

        c->delegate_subgroup = mfree(c->delegate_subgroup);

        nft_set_context_clear(&c->nft_set_context);
}

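/* Reads a single memory attribute (e.g. "memory.max") from the unit's realized cgroup as a uint64_t;
 * returns -EOWNERDEAD if the cgroup isn't realized. */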
static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
        assert(u);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return -EOWNERDEAD;

        return cg_get_attribute_as_uint64("memory", crt->cgroup_path, file, ret);
}

static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) {
        CGroupContext *c;
        CGroupMask m;
        const char *file;
        uint64_t unit_value;
        int r;

        /* Compare kernel memcg configuration against our internal systemd state.
         *
         * Returns:
         *
         * <0: On error.
         *  0: If the kernel memory setting doesn't match our configuration.
         * >0: If the kernel memory setting matches our configuration.
         *
         * The following values are only guaranteed to be populated on return >=0:
         *
         * - ret_unit_value will contain our internal expected value for the unit, page-aligned.
         * - ret_kernel_value will contain the actual value presented by the kernel. */

        assert(u);

        /* The root slice doesn't have any controller files, so we can't compare anything. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return -ENODATA;

        /* It's possible to have MemoryFoo set without systemd wanting to have the memory controller enabled,
         * for example, in the case of DisableControllers= or cgroup_disable on the kernel command line. To
         * avoid specious errors in these scenarios, check that we even expect the memory controller to be
         * enabled at all. */
        m = unit_get_target_mask(u);
        if (!FLAGS_SET(m, CGROUP_MASK_MEMORY))
                return -ENODATA;

        assert_se(c = unit_get_cgroup_context(u));

        bool startup = u->manager && IN_SET(manager_state(u->manager), MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);

        if (streq(property_name, "MemoryLow")) {
                unit_value = unit_get_ancestor_memory_low(u);
                file = "memory.low";
        } else if (startup && streq(property_name, "StartupMemoryLow")) {
                unit_value = unit_get_ancestor_startup_memory_low(u);
                file = "memory.low";
        } else if (streq(property_name, "MemoryMin")) {
                unit_value = unit_get_ancestor_memory_min(u);
                file = "memory.min";
        } else if (streq(property_name, "MemoryHigh")) {
                unit_value = c->memory_high;
                file = "memory.high";
        } else if (startup && streq(property_name, "StartupMemoryHigh")) {
                unit_value = c->startup_memory_high;
                file = "memory.high";
        } else if (streq(property_name, "MemoryMax")) {
                unit_value = c->memory_max;
                file = "memory.max";
        } else if (startup && streq(property_name, "StartupMemoryMax")) {
                unit_value = c->startup_memory_max;
                file = "memory.max";
        } else if (streq(property_name, "MemorySwapMax")) {
                unit_value = c->memory_swap_max;
                file = "memory.swap.max";
        } else if (startup && streq(property_name, "StartupMemorySwapMax")) {
                unit_value = c->startup_memory_swap_max;
                file = "memory.swap.max";
        } else if (streq(property_name, "MemoryZSwapMax")) {
                unit_value = c->memory_zswap_max;
                file = "memory.zswap.max";
        } else if (startup && streq(property_name, "StartupMemoryZSwapMax")) {
                unit_value = c->startup_memory_zswap_max;
                file = "memory.zswap.max";
        } else
                return -EINVAL;

        r = unit_get_kernel_memory_limit(u, file, ret_kernel_value);
        if (r < 0)
                return log_unit_debug_errno(u, r, "Failed to parse %s: %m", file);

        /* It's intended (soon) in a future kernel to not expose cgroup memory limits rounded to page
         * boundaries, but instead separate the user-exposed limit, which is whatever userspace told us, from
         * our internal page-counting. To support those future kernels, just check the value itself first
         * without any page-alignment. */
        if (*ret_kernel_value == unit_value) {
                *ret_unit_value = unit_value;
                return 1;
        }

        /* The current kernel behaviour, by comparison, is that even if you write a particular number of
         * bytes into a cgroup memory file, it always returns that number page-aligned down (since the kernel
         * internally stores cgroup limits in pages). As such, so long as it aligns properly, everything is
         * cricket. */
        if (unit_value != CGROUP_LIMIT_MAX)
                unit_value = PAGE_ALIGN_DOWN(unit_value);

        *ret_unit_value = unit_value;

        return *ret_kernel_value == *ret_unit_value;
}

#define FORMAT_CGROUP_DIFF_MAX 128

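/* Formats the suffix shown next to memory limits in cgroup_context_dump() below: empty if the kernel
 * value matches our configuration (or cannot be meaningfully compared), otherwise something like
 * " (different value in kernel: 1073741824)". */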
static char *format_cgroup_memory_limit_comparison(Unit *u, const char *property_name, char *buf, size_t l) {
        uint64_t kval, sval;
        int r;

        assert(u);
        assert(property_name);
        assert(buf);
        assert(l > 0);

        r = unit_compare_memory_limit(u, property_name, &sval, &kval);

        /* memory.swap.max is special in that it relies on CONFIG_MEMCG_SWAP (and the default swapaccount=1).
         * In the absence of reliably being able to detect whether memcg swap support is available or not,
         * only complain if the error is not ENOENT. This is similarly the case for memory.zswap.max relying
         * on CONFIG_ZSWAP. */
        if (r > 0 || IN_SET(r, -ENODATA, -EOWNERDEAD) ||
            (r == -ENOENT && STR_IN_SET(property_name,
                                        "MemorySwapMax",
                                        "StartupMemorySwapMax",
                                        "MemoryZSwapMax",
                                        "StartupMemoryZSwapMax")))
                buf[0] = 0;
        else if (r < 0) {
                errno = -r;
                (void) snprintf(buf, l, " (error getting kernel value: %m)");
        } else
                (void) snprintf(buf, l, " (different value in kernel: %" PRIu64 ")", kval);

        return buf;
}

const char* cgroup_device_permissions_to_string(CGroupDevicePermissions p) {
        static const char *table[_CGROUP_DEVICE_PERMISSIONS_MAX] = {
                /* Let's simply define a table with every possible combination. As long as those are just 8 we
                 * can get away with it. If this ever grows to more we need to revisit this logic though. */
                [0]                                                          = "",
                [CGROUP_DEVICE_READ]                                         = "r",
                [CGROUP_DEVICE_WRITE]                                        = "w",
                [CGROUP_DEVICE_MKNOD]                                        = "m",
                [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE]                     = "rw",
                [CGROUP_DEVICE_READ|CGROUP_DEVICE_MKNOD]                     = "rm",
                [CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD]                    = "wm",
                [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD] = "rwm",
        };

        if (p < 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX)
                return NULL;

        return table[p];
}

CGroupDevicePermissions cgroup_device_permissions_from_string(const char *s) {
        CGroupDevicePermissions p = 0;

        if (!s)
                return _CGROUP_DEVICE_PERMISSIONS_INVALID;

        for (const char *c = s; *c; c++) {
                if (*c == 'r')
                        p |= CGROUP_DEVICE_READ;
                else if (*c == 'w')
                        p |= CGROUP_DEVICE_WRITE;
                else if (*c == 'm')
                        p |= CGROUP_DEVICE_MKNOD;
                else
                        return _CGROUP_DEVICE_PERMISSIONS_INVALID;
        }

        return p;
}

void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
        _cleanup_free_ char *disable_controllers_str = NULL, *delegate_controllers_str = NULL, *cpuset_cpus = NULL, *cpuset_mems = NULL, *startup_cpuset_cpus = NULL, *startup_cpuset_mems = NULL;
        CGroupContext *c;
        struct in_addr_prefix *iaai;
        char cda[FORMAT_CGROUP_DIFF_MAX], cdb[FORMAT_CGROUP_DIFF_MAX], cdc[FORMAT_CGROUP_DIFF_MAX], cdd[FORMAT_CGROUP_DIFF_MAX],
             cde[FORMAT_CGROUP_DIFF_MAX], cdf[FORMAT_CGROUP_DIFF_MAX], cdg[FORMAT_CGROUP_DIFF_MAX], cdh[FORMAT_CGROUP_DIFF_MAX],
             cdi[FORMAT_CGROUP_DIFF_MAX], cdj[FORMAT_CGROUP_DIFF_MAX], cdk[FORMAT_CGROUP_DIFF_MAX];

        assert(u);
        assert(f);

        assert_se(c = unit_get_cgroup_context(u));

        prefix = strempty(prefix);

        (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
        (void) cg_mask_to_string(c->delegate_controllers, &delegate_controllers_str);

        /* "Delegate=" means "yes, but no controllers". Show this as "(none)". */
        const char *delegate_str = delegate_controllers_str ?: c->delegate ? "(none)" : "no";

        cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus);
        startup_cpuset_cpus = cpu_set_to_range_string(&c->startup_cpuset_cpus);
        cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems);
        startup_cpuset_mems = cpu_set_to_range_string(&c->startup_cpuset_mems);

        fprintf(f,
                "%sIOAccounting: %s\n"
                "%sMemoryAccounting: %s\n"
                "%sTasksAccounting: %s\n"
                "%sIPAccounting: %s\n"
                "%sCPUWeight: %" PRIu64 "\n"
                "%sStartupCPUWeight: %" PRIu64 "\n"
                "%sCPUQuotaPerSecSec: %s\n"
                "%sCPUQuotaPeriodSec: %s\n"
                "%sAllowedCPUs: %s\n"
                "%sStartupAllowedCPUs: %s\n"
                "%sAllowedMemoryNodes: %s\n"
                "%sStartupAllowedMemoryNodes: %s\n"
                "%sIOWeight: %" PRIu64 "\n"
                "%sStartupIOWeight: %" PRIu64 "\n"
                "%sDefaultMemoryMin: %" PRIu64 "\n"
                "%sDefaultMemoryLow: %" PRIu64 "\n"
                "%sMemoryMin: %" PRIu64 "%s\n"
                "%sMemoryLow: %" PRIu64 "%s\n"
                "%sStartupMemoryLow: %" PRIu64 "%s\n"
                "%sMemoryHigh: %" PRIu64 "%s\n"
                "%sStartupMemoryHigh: %" PRIu64 "%s\n"
                "%sMemoryMax: %" PRIu64 "%s\n"
                "%sStartupMemoryMax: %" PRIu64 "%s\n"
                "%sMemorySwapMax: %" PRIu64 "%s\n"
                "%sStartupMemorySwapMax: %" PRIu64 "%s\n"
                "%sMemoryZSwapMax: %" PRIu64 "%s\n"
                "%sStartupMemoryZSwapMax: %" PRIu64 "%s\n"
                "%sMemoryZSwapWriteback: %s\n"
                "%sTasksMax: %" PRIu64 "\n"
                "%sDevicePolicy: %s\n"
                "%sDisableControllers: %s\n"
                "%sDelegate: %s\n"
                "%sManagedOOMSwap: %s\n"
                "%sManagedOOMMemoryPressure: %s\n"
                "%sManagedOOMMemoryPressureLimit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
                "%sManagedOOMPreference: %s\n"
                "%sMemoryPressureWatch: %s\n"
                "%sCoredumpReceive: %s\n",
                prefix, yes_no(c->io_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, yes_no(c->ip_accounting),
                prefix, c->cpu_weight,
                prefix, c->startup_cpu_weight,
                prefix, FORMAT_TIMESPAN(c->cpu_quota_per_sec_usec, 1),
                prefix, FORMAT_TIMESPAN(c->cpu_quota_period_usec, 1),
                prefix, strempty(cpuset_cpus),
                prefix, strempty(startup_cpuset_cpus),
                prefix, strempty(cpuset_mems),
                prefix, strempty(startup_cpuset_mems),
                prefix, c->io_weight,
                prefix, c->startup_io_weight,
                prefix, c->default_memory_min,
                prefix, c->default_memory_low,
                prefix, c->memory_min, format_cgroup_memory_limit_comparison(u, "MemoryMin", cda, sizeof(cda)),
                prefix, c->memory_low, format_cgroup_memory_limit_comparison(u, "MemoryLow", cdb, sizeof(cdb)),
                prefix, c->startup_memory_low, format_cgroup_memory_limit_comparison(u, "StartupMemoryLow", cdc, sizeof(cdc)),
                prefix, c->memory_high, format_cgroup_memory_limit_comparison(u, "MemoryHigh", cdd, sizeof(cdd)),
                prefix, c->startup_memory_high, format_cgroup_memory_limit_comparison(u, "StartupMemoryHigh", cde, sizeof(cde)),
                prefix, c->memory_max, format_cgroup_memory_limit_comparison(u, "MemoryMax", cdf, sizeof(cdf)),
                prefix, c->startup_memory_max, format_cgroup_memory_limit_comparison(u, "StartupMemoryMax", cdg, sizeof(cdg)),
                prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(u, "MemorySwapMax", cdh, sizeof(cdh)),
                prefix, c->startup_memory_swap_max, format_cgroup_memory_limit_comparison(u, "StartupMemorySwapMax", cdi, sizeof(cdi)),
                prefix, c->memory_zswap_max, format_cgroup_memory_limit_comparison(u, "MemoryZSwapMax", cdj, sizeof(cdj)),
                prefix, c->startup_memory_zswap_max, format_cgroup_memory_limit_comparison(u, "StartupMemoryZSwapMax", cdk, sizeof(cdk)),
                prefix, yes_no(c->memory_zswap_writeback),
                prefix, cgroup_tasks_max_resolve(&c->tasks_max),
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, strempty(disable_controllers_str),
                prefix, delegate_str,
                prefix, managed_oom_mode_to_string(c->moom_swap),
                prefix, managed_oom_mode_to_string(c->moom_mem_pressure),
                prefix, PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(c->moom_mem_pressure_limit)),
                prefix, managed_oom_preference_to_string(c->moom_preference),
                prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch),
                prefix, yes_no(c->coredump_receive));

        if (c->delegate_subgroup)
                fprintf(f, "%sDelegateSubgroup: %s\n",
                        prefix, c->delegate_subgroup);

        if (c->memory_pressure_threshold_usec != USEC_INFINITY)
                fprintf(f, "%sMemoryPressureThresholdSec: %s\n",
                        prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1));

        if (c->moom_mem_pressure_duration_usec != USEC_INFINITY)
                fprintf(f, "%sManagedOOMMemoryPressureDurationSec: %s\n",
                        prefix, FORMAT_TIMESPAN(c->moom_mem_pressure_duration_usec, 1));

        LIST_FOREACH(device_allow, a, c->device_allow)
                /* The strna() below should be redundant; it only serves to avoid a -Werror=format-overflow= error. See #30223. */
                fprintf(f,
                        "%sDeviceAllow: %s %s\n",
                        prefix,
                        a->path,
                        strna(cgroup_device_permissions_to_string(a->permissions)));

        LIST_FOREACH(device_weights, iw, c->io_device_weights)
                fprintf(f,
                        "%sIODeviceWeight: %s %" PRIu64 "\n",
                        prefix,
                        iw->path,
                        iw->weight);

        LIST_FOREACH(device_latencies, l, c->io_device_latencies)
                fprintf(f,
                        "%sIODeviceLatencyTargetSec: %s %s\n",
                        prefix,
                        l->path,
                        FORMAT_TIMESPAN(l->target_usec, 1));

        LIST_FOREACH(device_limits, il, c->io_device_limits)
                for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                        if (il->limits[type] != cgroup_io_limit_defaults[type])
                                fprintf(f,
                                        "%s%s: %s %s\n",
                                        prefix,
                                        cgroup_io_limit_type_to_string(type),
                                        il->path,
                                        FORMAT_BYTES(il->limits[type]));

        SET_FOREACH(iaai, c->ip_address_allow)
                fprintf(f, "%sIPAddressAllow: %s\n", prefix,
                        IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
        SET_FOREACH(iaai, c->ip_address_deny)
                fprintf(f, "%sIPAddressDeny: %s\n", prefix,
                        IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));

        STRV_FOREACH(path, c->ip_filters_ingress)
                fprintf(f, "%sIPIngressFilterPath: %s\n", prefix, *path);
        STRV_FOREACH(path, c->ip_filters_egress)
                fprintf(f, "%sIPEgressFilterPath: %s\n", prefix, *path);

        LIST_FOREACH(programs, p, c->bpf_foreign_programs)
                fprintf(f, "%sBPFProgram: %s:%s\n",
                        prefix, bpf_cgroup_attach_type_to_string(p->attach_type), p->bpffs_path);

        if (c->socket_bind_allow) {
                fprintf(f, "%sSocketBindAllow: ", prefix);
                cgroup_context_dump_socket_bind_items(c->socket_bind_allow, f);
                fputc('\n', f);
        }

        if (c->socket_bind_deny) {
                fprintf(f, "%sSocketBindDeny: ", prefix);
                cgroup_context_dump_socket_bind_items(c->socket_bind_deny, f);
                fputc('\n', f);
        }

        if (c->restrict_network_interfaces) {
                char *iface;
                SET_FOREACH(iface, c->restrict_network_interfaces)
                        fprintf(f, "%sRestrictNetworkInterfaces: %s\n", prefix, iface);
        }

        FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets)
                fprintf(f, "%sNFTSet: %s:%s:%s:%s\n", prefix, nft_set_source_to_string(nft_set->source),
                        nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set);
}

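/* Formats one SocketBindAllow=/SocketBindDeny= item, e.g. "ipv4:tcp:80-88", "ipv6:udp:53", or plain
 * "any" when neither address family, protocol nor port range is restricted. */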
void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f) {
        const char *family, *colon1, *protocol = "", *colon2 = "";

        family = strempty(af_to_ipv4_ipv6(item->address_family));
        colon1 = isempty(family) ? "" : ":";

        if (item->ip_protocol != 0) {
                protocol = ip_protocol_to_tcp_udp(item->ip_protocol);
                colon2 = ":";
        }

        if (item->nr_ports == 0)
                fprintf(f, "%s%s%s%sany", family, colon1, protocol, colon2);
        else if (item->nr_ports == 1)
                fprintf(f, "%s%s%s%s%" PRIu16, family, colon1, protocol, colon2, item->port_min);
        else {
                uint16_t port_max = item->port_min + item->nr_ports - 1;
                fprintf(f, "%s%s%s%s%" PRIu16 "-%" PRIu16, family, colon1, protocol, colon2,
                        item->port_min, port_max);
        }
}

void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f) {
        bool first = true;

        LIST_FOREACH(socket_bind_items, bi, items) {
                if (first)
                        first = false;
                else
                        fputc(' ', f);

                cgroup_context_dump_socket_bind_item(bi, f);
        }
}

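/* Appends a DeviceAllow= entry to the context. A permission mask of 0 is treated as shorthand for
 * all of read, write and mknod (_CGROUP_DEVICE_PERMISSIONS_ALL). */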
int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p) {
        _cleanup_free_ CGroupDeviceAllow *a = NULL;
        _cleanup_free_ char *d = NULL;

        assert(c);
        assert(dev);
        assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX);

        if (p == 0)
                p = _CGROUP_DEVICE_PERMISSIONS_ALL;

        a = new(CGroupDeviceAllow, 1);
        if (!a)
                return -ENOMEM;

        d = strdup(dev);
        if (!d)
                return -ENOMEM;

        *a = (CGroupDeviceAllow) {
                .path = TAKE_PTR(d),
                .permissions = p,
        };

        LIST_PREPEND(device_allow, c->device_allow, a);
        TAKE_PTR(a);

        return 0;
}

int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p) {
        assert(c);
        assert(dev);
        assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX);

        if (p == 0)
                p = _CGROUP_DEVICE_PERMISSIONS_ALL;

        LIST_FOREACH(device_allow, b, c->device_allow)
                if (path_equal(b->path, dev)) {
                        b->permissions = p;
                        return 0;
                }

        return cgroup_context_add_device_allow(c, dev, p);
}

int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *bpffs_path) {
        CGroupBPFForeignProgram *p;
        _cleanup_free_ char *d = NULL;

        assert(c);
        assert(bpffs_path);

        if (!path_is_normalized(bpffs_path) || !path_is_absolute(bpffs_path))
                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not normalized.");

        d = strdup(bpffs_path);
        if (!d)
                return log_oom();

        p = new(CGroupBPFForeignProgram, 1);
        if (!p)
                return log_oom();

        *p = (CGroupBPFForeignProgram) {
                .attach_type = attach_type,
                .bpffs_path = TAKE_PTR(d),
        };

        LIST_PREPEND(programs, c->bpf_foreign_programs, TAKE_PTR(p));

        return 0;
}

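/* Generates the unit_get_ancestor_<entry>() helpers instantiated below: they return the entry if it
 * is set on the unit itself, otherwise walk up the slice hierarchy looking for a default for the
 * entry, and finally fall back to the kernel default (CGROUP_LIMIT_MIN). */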
#define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry)                       \
        uint64_t unit_get_ancestor_##entry(Unit *u) {                   \
                CGroupContext *c;                                       \
                                                                        \
                /* 1. Is entry set in this unit? If so, use that.       \
                 * 2. Is the default for this entry set in any          \
                 *    ancestor? If so, use that.                        \
                 * 3. Otherwise, return CGROUP_LIMIT_MIN. */            \
                                                                        \
                assert(u);                                              \
                                                                        \
                c = unit_get_cgroup_context(u);                         \
                if (c && c->entry##_set)                                \
                        return c->entry;                                \
                                                                        \
                while ((u = UNIT_GET_SLICE(u))) {                       \
                        c = unit_get_cgroup_context(u);                 \
                        if (c && c->default_##entry##_set)              \
                                return c->default_##entry;              \
                }                                                       \
                                                                        \
                /* We've reached the root, but nobody had default for   \
                 * this entry set, so set it to the kernel default. */  \
                return CGROUP_LIMIT_MIN;                                \
}

UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(startup_memory_low);
UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);

static void unit_set_xattr_graceful(Unit *u, const char *name, const void *data, size_t size) {
        int r;

        assert(u);
        assert(name);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return;

        r = cg_set_xattr(crt->cgroup_path, name, data, size, 0);
        if (r < 0)
                log_unit_debug_errno(u, r, "Failed to set '%s' xattr on control group %s, ignoring: %m", name, empty_to_root(crt->cgroup_path));
}

static void unit_remove_xattr_graceful(Unit *u, const char *name) {
        int r;

        assert(u);
        assert(name);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return;

        r = cg_remove_xattr(crt->cgroup_path, name);
        if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                log_unit_debug_errno(u, r, "Failed to remove '%s' xattr flag on control group %s, ignoring: %m", name, empty_to_root(crt->cgroup_path));
}

static void cgroup_oomd_xattr_apply(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return;

        if (c->moom_preference == MANAGED_OOM_PREFERENCE_OMIT)
                unit_set_xattr_graceful(u, "user.oomd_omit", "1", 1);

        if (c->moom_preference == MANAGED_OOM_PREFERENCE_AVOID)
                unit_set_xattr_graceful(u, "user.oomd_avoid", "1", 1);

        if (c->moom_preference != MANAGED_OOM_PREFERENCE_AVOID)
                unit_remove_xattr_graceful(u, "user.oomd_avoid");

        if (c->moom_preference != MANAGED_OOM_PREFERENCE_OMIT)
                unit_remove_xattr_graceful(u, "user.oomd_omit");
}

static int cgroup_log_xattr_apply(Unit *u) {
        ExecContext *c;
        size_t len, allowed_patterns_len, denied_patterns_len;
        _cleanup_free_ char *patterns = NULL, *allowed_patterns = NULL, *denied_patterns = NULL;
        char *last;
        int r;

        assert(u);

        c = unit_get_exec_context(u);
        if (!c)
                /* Some unit types have a cgroup context but no exec context, so we do not log
                 * any error here to avoid confusion. */
                return 0;

        if (set_isempty(c->log_filter_allowed_patterns) && set_isempty(c->log_filter_denied_patterns)) {
                unit_remove_xattr_graceful(u, "user.journald_log_filter_patterns");
                return 0;
        }

        r = set_make_nulstr(c->log_filter_allowed_patterns, &allowed_patterns, &allowed_patterns_len);
        if (r < 0)
                return log_debug_errno(r, "Failed to make nulstr from set: %m");

        r = set_make_nulstr(c->log_filter_denied_patterns, &denied_patterns, &denied_patterns_len);
        if (r < 0)
                return log_debug_errno(r, "Failed to make nulstr from set: %m");

        /* Use nul character separated strings without trailing nul */
        allowed_patterns_len = LESS_BY(allowed_patterns_len, 1u);
        denied_patterns_len = LESS_BY(denied_patterns_len, 1u);

        len = allowed_patterns_len + 1 + denied_patterns_len;
        patterns = new(char, len);
        if (!patterns)
                return log_oom_debug();

        last = mempcpy_safe(patterns, allowed_patterns, allowed_patterns_len);
        *(last++) = '\xff';
        memcpy_safe(last, denied_patterns, denied_patterns_len);

        unit_set_xattr_graceful(u, "user.journald_log_filter_patterns", patterns, len);

        return 0;
}

static void cgroup_invocation_id_xattr_apply(Unit *u) {
        bool b;

        assert(u);

        b = !sd_id128_is_null(u->invocation_id);
        FOREACH_STRING(xn, "trusted.invocation_id", "user.invocation_id") {
                if (b)
                        unit_set_xattr_graceful(u, xn, SD_ID128_TO_STRING(u->invocation_id), 32);
                else
                        unit_remove_xattr_graceful(u, xn);
        }
}

static void cgroup_coredump_xattr_apply(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return;

        if (unit_cgroup_delegate(u) && c->coredump_receive)
                unit_set_xattr_graceful(u, "user.coredump_receive", "1", 1);
        else
                unit_remove_xattr_graceful(u, "user.coredump_receive");
}

static void cgroup_delegate_xattr_apply(Unit *u) {
        bool b;

        assert(u);

        /* Indicate on the cgroup whether delegation is on, via an xattr. This is best-effort, as old kernels
         * didn't support xattrs on cgroups at all. Later they got support for setting 'trusted.*' xattrs,
         * and even later 'user.*' xattrs. We started setting this field when 'trusted.*' was added, and
         * given this is now pretty much API, let's continue to support that. But also set 'user.*' as well,
         * since it is readable by any user, not just CAP_SYS_ADMIN. This hence comes with slightly weaker
         * security (as users who got delegated cgroups could turn it off if they like), but this shouldn't
         * be a big problem given this communicates delegation state to clients, but the manager never reads
         * it. */
        b = unit_cgroup_delegate(u);
        FOREACH_STRING(xn, "trusted.delegate", "user.delegate") {
                if (b)
                        unit_set_xattr_graceful(u, xn, "1", 1);
                else
                        unit_remove_xattr_graceful(u, xn);
        }
}

static void cgroup_survive_xattr_apply(Unit *u) {
        int r;

        assert(u);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt)
                return;

        if (u->survive_final_kill_signal) {
                r = cg_set_xattr(
                                crt->cgroup_path,
                                "user.survive_final_kill_signal",
                                "1",
                                1,
                                /* flags= */ 0);
                /* user xattr support was added in kernel v5.7 */
                if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
                        r = cg_set_xattr(
                                        crt->cgroup_path,
                                        "trusted.survive_final_kill_signal",
                                        "1",
                                        1,
                                        /* flags= */ 0);
                if (r < 0)
                        log_unit_debug_errno(u,
                                             r,
                                             "Failed to set 'survive_final_kill_signal' xattr on control "
                                             "group %s, ignoring: %m",
                                             empty_to_root(crt->cgroup_path));
        } else {
                unit_remove_xattr_graceful(u, "user.survive_final_kill_signal");
                unit_remove_xattr_graceful(u, "trusted.survive_final_kill_signal");
        }
}

static void cgroup_xattr_apply(Unit *u) {
        assert(u);

        /* The 'user.*' xattrs can be set from a user manager. */
        cgroup_oomd_xattr_apply(u);
        cgroup_log_xattr_apply(u);
        cgroup_coredump_xattr_apply(u);

        if (!MANAGER_IS_SYSTEM(u->manager))
                return;

        cgroup_invocation_id_xattr_apply(u);
        cgroup_delegate_xattr_apply(u);
        cgroup_survive_xattr_apply(u);
}

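/* Maps a path from IODeviceWeight= and friends to the backing block device: the path is parsed as a
 * device node if possible, otherwise stat()ed to find the block device its file system lives on (with
 * a btrfs-specific fallback). DM/LUKS devices and partitions are then resolved to the originating
 * whole disk. */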
static int lookup_block_device(const char *p, dev_t *ret) {
        dev_t rdev, dev = 0;
        mode_t mode;
        int r;

        assert(p);
        assert(ret);

        r = device_path_parse_major_minor(p, &mode, &rdev);
        if (r == -ENODEV) { /* not a parsable device node, need to go to disk */
                struct stat st;

                if (stat(p, &st) < 0)
                        return log_warning_errno(errno, "Couldn't stat device '%s': %m", p);

                mode = st.st_mode;
                rdev = st.st_rdev;
                dev = st.st_dev;
        } else if (r < 0)
                return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p);

        if (S_ISCHR(mode))
                return log_warning_errno(SYNTHETIC_ERRNO(ENOTBLK),
                                         "Device node '%s' is a character device, but block device needed.", p);
        if (S_ISBLK(mode))
                *ret = rdev;
        else if (major(dev) != 0)
                *ret = dev; /* If this is not a device node then use the block device this file is stored on */
        else {
                /* If this is btrfs, getting the backing block device is a bit harder */
                r = btrfs_get_block_device(p, ret);
                if (r == -ENOTTY)
                        return log_warning_errno(SYNTHETIC_ERRNO(ENODEV),
                                                 "'%s' is not a block device node, and file system block device cannot be determined or is not local.", p);
                if (r < 0)
                        return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p);
        }

        /* If this is a LUKS/DM device, recursively try to get the originating block device */
        while (block_get_originating(*ret, ret) >= 0)
                ;

        /* If this is a partition, try to get the originating block device */
        (void) block_get_whole_disk(*ret, ret);
        return 0;
}

static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
        return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
               c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
}

static bool cgroup_context_has_allowed_cpus(CGroupContext *c) {
        return c->cpuset_cpus.set || c->startup_cpuset_cpus.set;
}

static bool cgroup_context_has_allowed_mems(CGroupContext *c) {
        return c->cpuset_mems.set || c->startup_cpuset_mems.set;
}

uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
        assert(c);

        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
            c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_cpu_weight;
        else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->cpu_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static CPUSet *cgroup_context_allowed_cpus(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
            c->startup_cpuset_cpus.set)
                return &c->startup_cpuset_cpus;
        else
                return &c->cpuset_cpus;
}

static CPUSet *cgroup_context_allowed_mems(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
            c->startup_cpuset_mems.set)
                return &c->startup_cpuset_mems;
        else
                return &c->cpuset_mems;
}

usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
        /* kernel uses a minimum resolution of 1ms, so both period and (quota * period)
         * need to be higher than that boundary. quota is specified in USecPerSec.
         * Additionally, period must be at most max_period. */
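        /* Example: with the 1ms resolution the caller below passes, CPUQuota=5% (i.e. quota = 50ms of
         * CPU time per second) pushes a requested 10ms period up to 1ms * 1s / 50ms = 20ms, the
         * smallest period for which quota * period / 1s still reaches the kernel's 1ms floor. */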
        assert(quota > 0);

        return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period);
}

static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
        usec_t new_period;

        assert(u);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt)
                return USEC_INFINITY;

        if (quota == USEC_INFINITY)
                /* Always use default period for infinity quota. */
                return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;

        if (period == USEC_INFINITY)
                /* Default period was requested. */
                period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;

        /* Clamp to interval [1ms, 1s] */
        new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);

        if (new_period != period) {
                log_unit_full(u, crt->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING,
                              "Clamping CPU interval for cpu.max: period is now %s",
                              FORMAT_TIMESPAN(new_period, 1));
                crt->warned_clamping_cpu_quota_period = true;
        }

        return new_period;
}

static void cgroup_apply_cpu_weight(Unit *u, uint64_t weight) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

        if (weight == CGROUP_WEIGHT_IDLE)
                return;
        xsprintf(buf, "%" PRIu64 "\n", weight);
        (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf);
}

static void cgroup_apply_cpu_idle(Unit *u, uint64_t weight) {
        int r;
        bool is_idle;
        const char *idle_val;

        assert(u);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return;

        is_idle = weight == CGROUP_WEIGHT_IDLE;
        idle_val = one_zero(is_idle);
        r = cg_set_attribute("cpu", crt->cgroup_path, "cpu.idle", idle_val);
        if (r < 0 && (r != -ENOENT || is_idle))
                log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%s': %m",
                                    "cpu.idle", empty_to_root(crt->cgroup_path), idle_val);
}

static void cgroup_apply_cpu_quota(Unit *u, usec_t quota, usec_t period) {
        char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];

        assert(u);

        period = cgroup_cpu_adjust_period_and_log(u, period, quota);
        if (quota != USEC_INFINITY)
                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
                         MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period);
        else
                xsprintf(buf, "max " USEC_FMT "\n", period);
        (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf);
}

static void cgroup_apply_cpuset(Unit *u, const CPUSet *cpus, const char *name) {
        _cleanup_free_ char *buf = NULL;

        buf = cpu_set_to_range_string(cpus);
        if (!buf) {
                log_oom();
                return;
        }

        (void) set_attribute_and_warn(u, "cpuset", name, buf);
}

static bool cgroup_context_has_io_config(CGroupContext *c) {
        return c->io_accounting ||
               c->io_weight != CGROUP_WEIGHT_INVALID ||
               c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
               c->io_device_weights ||
               c->io_device_latencies ||
               c->io_device_limits;
}

static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_io_weight;
        if (c->io_weight != CGROUP_WEIGHT_INVALID)
                return c->io_weight;
        return CGROUP_WEIGHT_DEFAULT;
}

static int set_bfq_weight(Unit *u, const char *controller, dev_t dev, uint64_t io_weight) {
        static bool warned = false;
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+STRLEN("\n")];
        const char *p;
        uint64_t bfq_weight;
        int r;

        assert(u);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return -EOWNERDEAD;

        /* FIXME: drop this function when distro kernels properly support BFQ through "io.weight"
         * See also: https://github.com/systemd/systemd/pull/13335 and
         * https://github.com/torvalds/linux/commit/65752aef0a407e1ef17ec78a7fc31ba4e0b360f9. */
        p = strjoina(controller, ".bfq.weight");
        /* Adjust to the kernel range, which is 1..1000 with a default of 100. */
        bfq_weight = BFQ_WEIGHT(io_weight);

        if (major(dev) > 0)
                xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), bfq_weight);
        else
                xsprintf(buf, "%" PRIu64 "\n", bfq_weight);

        r = cg_set_attribute(controller, crt->cgroup_path, p, buf);

        /* FIXME: drop this when kernels prior to
         * 795fe54c2a82 ("bfq: Add per-device weight") v5.4
         * are not interesting anymore. Old kernels will fail with EINVAL, while new kernels won't return
         * EINVAL on properly formatted input by us. Treat EINVAL accordingly. */
        if (r == -EINVAL && major(dev) > 0) {
                if (!warned) {
                        log_unit_warning(u, "Kernel version does not accept per-device setting in %s.", p);
                        warned = true;
                }
                r = -EOPNOTSUPP; /* mask as unconfigured device */
        } else if (r >= 0 && io_weight != bfq_weight)
                log_unit_debug(u, "%s=%" PRIu64 " scaled to %s=%" PRIu64,
                               major(dev) > 0 ? "IODeviceWeight" : "IOWeight",
                               io_weight, p, bfq_weight);
        return r;
}

static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r, r1, r2;

        assert(u);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return;

        if (lookup_block_device(dev_path, &dev) < 0)
                return;

        r1 = set_bfq_weight(u, "io", dev, io_weight);

        xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), io_weight);
        r2 = cg_set_attribute("io", crt->cgroup_path, "io.weight", buf);

        /* Look at the configured device; when both writes fail, prefer the io.weight errno. */
        r = r2 == -EOPNOTSUPP ? r1 : r2;

        if (r < 0)
                log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r),
                                    r, "Failed to set 'io[.bfq].weight' attribute on '%s' to '%.*s': %m",
                                    empty_to_root(crt->cgroup_path), (int) strcspn(buf, NEWLINE), buf);
}

static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        if (target != USEC_INFINITY)
                xsprintf(buf, DEVNUM_FORMAT_STR " target=%" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), target);
        else
                xsprintf(buf, DEVNUM_FORMAT_STR " target=max\n", DEVNUM_FORMAT_VAL(dev));

        (void) set_attribute_and_warn(u, "io", "io.latency", buf);
}

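/* Formats an io.max line for one device, e.g. "8:0 rbps=1000000 wbps=max riops=max wiops=max",
 * leaving unconfigured limit types at their defaults. */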
static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)],
             buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        dev_t dev;

        if (lookup_block_device(dev_path, &dev) < 0)
                return;

        for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                if (limits[type] != cgroup_io_limit_defaults[type])
                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
                else
                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");

        xsprintf(buf, DEVNUM_FORMAT_STR " rbps=%s wbps=%s riops=%s wiops=%s\n", DEVNUM_FORMAT_VAL(dev),
                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
        (void) set_attribute_and_warn(u, "io", "io.max", buf);
}

static bool unit_has_memory_config(Unit *u) {
        CGroupContext *c;

        assert(u);

        assert_se(c = unit_get_cgroup_context(u));

        return unit_get_ancestor_memory_min(u) > 0 ||
               unit_get_ancestor_memory_low(u) > 0 || unit_get_ancestor_startup_memory_low(u) > 0 ||
               c->memory_high != CGROUP_LIMIT_MAX || c->startup_memory_high_set ||
               c->memory_max != CGROUP_LIMIT_MAX || c->startup_memory_max_set ||
               c->memory_swap_max != CGROUP_LIMIT_MAX || c->startup_memory_swap_max_set ||
               c->memory_zswap_max != CGROUP_LIMIT_MAX || c->startup_memory_zswap_max_set;
}

static void cgroup_apply_memory_limit(Unit *u, const char *file, uint64_t v) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max\n";

        if (v != CGROUP_LIMIT_MAX)
                xsprintf(buf, "%" PRIu64 "\n", v);

        (void) set_attribute_and_warn(u, "memory", file, buf);
}

static void cgroup_apply_firewall(Unit *u) {
        assert(u);

        /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */

        if (bpf_firewall_compile(u) < 0)
                return;

        (void) bpf_firewall_load_custom(u);
        (void) bpf_firewall_install(u);
}

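/* Adds (or removes) the unit's cgroup ID to/from any NFTSet=cgroup entries configured for the unit.
 * Only done by the system manager, and strictly best-effort: failures are logged and otherwise
 * ignored. */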
void unit_modify_nft_set(Unit *u, bool add) {
        int r;

        assert(u);

        if (!MANAGER_IS_SYSTEM(u->manager))
                return;

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || crt->cgroup_id == 0)
                return;

        if (!u->manager->fw_ctx) {
                r = fw_ctx_new_full(&u->manager->fw_ctx, /* init_tables= */ false);
                if (r < 0)
                        return;

                assert(u->manager->fw_ctx);
        }

        CGroupContext *c = ASSERT_PTR(unit_get_cgroup_context(u));

        FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) {
                if (nft_set->source != NFT_SET_SOURCE_CGROUP)
                        continue;

                uint64_t element = crt->cgroup_id;

                r = nft_set_element_modify_any(u->manager->fw_ctx, add, nft_set->nfproto, nft_set->table, nft_set->set, &element, sizeof(element));
                if (r < 0)
                        log_warning_errno(r, "Failed to %s NFT set: family %s, table %s, set %s, cgroup %" PRIu64 ", ignoring: %m",
                                          add ? "add" : "delete", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, crt->cgroup_id);
                else
                        log_debug("%s NFT set: family %s, table %s, set %s, cgroup %" PRIu64,
                                  add ? "Added" : "Deleted", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, crt->cgroup_id);
        }
}

static void cgroup_apply_socket_bind(Unit *u) {
        assert(u);

        (void) bpf_socket_bind_install(u);
}

static void cgroup_apply_restrict_network_interfaces(Unit *u) {
        assert(u);

        (void) bpf_restrict_ifaces_install(u);
}

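/* Compiles and installs the BPF device-filter program implementing DevicePolicy= and DeviceAllow=:
 * entries may be literal /dev/ paths, or "block-"/"char-" prefixed major groups. If the filter would
 * match nothing, the policy is tightened to 'strict', since the verifier rejects an allow-list
 * program with no rules. */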
8b139557 1388static int cgroup_apply_devices(Unit *u) {
76dc1725 1389 _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
8b139557 1390 CGroupContext *c;
45669ae2 1391 CGroupDevicePolicy policy;
8b139557
ZJS
1392 int r;
1393
1394 assert_se(c = unit_get_cgroup_context(u));
9cc54544
LP
1395
1396 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1397 if (!crt || !crt->cgroup_path)
1398 return -EOWNERDEAD;
8b139557 1399
45669ae2
ZJS
1400 policy = c->device_policy;
1401
ff7f99db
YW
1402 r = bpf_devices_cgroup_init(&prog, policy, c->device_allow);
1403 if (r < 0)
1404 return log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");
8b139557 1405
6b000af4 1406 bool allow_list_static = policy == CGROUP_DEVICE_POLICY_CLOSED ||
45669ae2 1407 (policy == CGROUP_DEVICE_POLICY_AUTO && c->device_allow);
8b139557 1408
958b73be
LP
1409 bool any = false;
1410 if (allow_list_static) {
9cc54544 1411 r = bpf_devices_allow_list_static(prog, crt->cgroup_path);
958b73be
LP
1412 if (r > 0)
1413 any = true;
1414 }
1415
8b139557 1416 LIST_FOREACH(device_allow, a, c->device_allow) {
a1044811
LP
1417 const char *val;
1418
1419 if (a->permissions == 0)
8b139557 1420 continue;
8b139557
ZJS
1421
1422 if (path_startswith(a->path, "/dev/"))
9cc54544 1423 r = bpf_devices_allow_list_device(prog, crt->cgroup_path, a->path, a->permissions);
8b139557 1424 else if ((val = startswith(a->path, "block-")))
9cc54544 1425 r = bpf_devices_allow_list_major(prog, crt->cgroup_path, val, 'b', a->permissions);
8b139557 1426 else if ((val = startswith(a->path, "char-")))
9cc54544 1427 r = bpf_devices_allow_list_major(prog, crt->cgroup_path, val, 'c', a->permissions);
45669ae2 1428 else {
8b139557 1429 log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path);
45669ae2
ZJS
1430 continue;
1431 }
1432
958b73be 1433 if (r > 0)
45669ae2
ZJS
1434 any = true;
1435 }
1436
1437 if (prog && !any) {
4e494e6a 1438 log_unit_warning(u, "No devices matched by device filter.");
45669ae2
ZJS
1439
1440 /* The kernel verifier would reject a program we would build with the normal intro and outro
6b000af4 1441 but no allow-listing rules (outro would contain an unreachable instruction for successful
45669ae2
ZJS
1442 return). */
1443 policy = CGROUP_DEVICE_POLICY_STRICT;
8b139557
ZJS
1444 }
1445
9cc54544 1446 r = bpf_devices_apply_policy(&prog, policy, any, crt->cgroup_path, &crt->bpf_device_control_installed);
8b139557
ZJS
1447 if (r < 0) {
1448 static bool warned = false;
1449
1450 log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
1451 "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
1452 "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
1453 "(This warning is only shown for the first loaded unit using device ACL.)", u->id);
1454
1455 warned = true;
1456 }
1457 return r;
1458}
1459
17283ce7
YW
1460static void set_io_weight(Unit *u, uint64_t weight) {
1461 char buf[STRLEN("default \n")+DECIMAL_STR_MAX(uint64_t)];
17283ce7
YW
1462
1463 assert(u);
29eb0eef 1464
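        /* BFQ keeps its own weight attribute (io.bfq.weight, set via set_bfq_weight() with device 0:0
         * meaning "default"), so we write both it and the generic "io.weight" default; that way the
         * configured weight takes effect regardless of the active IO scheduler. */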
3e6eafdd 1465 (void) set_bfq_weight(u, "io", makedev(0, 0), weight);
29eb0eef 1466
29eb0eef 1467 xsprintf(buf, "default %" PRIu64 "\n", weight);
17283ce7
YW
1468 (void) set_attribute_and_warn(u, "io", "io.weight", buf);
1469}
1470
506ea51b
JK
1471static void cgroup_apply_bpf_foreign_program(Unit *u) {
1472 assert(u);
1473
1474 (void) bpf_foreign_install(u);
1475}
1476
906c06f6
DM
1477static void cgroup_context_apply(
1478 Unit *u,
1479 CGroupMask apply_mask,
906c06f6
DM
1480 ManagerState state) {
1481
9cc54544 1482 bool is_host_root, is_local_root;
f29ff115 1483 CGroupContext *c;
4ad49000
LP
1484 int r;
1485
f29ff115
TH
1486 assert(u);
1487
906c06f6 1488 /* Nothing to do? Exit early! */
17f14955 1489 if (apply_mask == 0)
4ad49000 1490 return;
8e274523 1491
52fecf20
LP
1492 /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other
1493 * attributes should only be managed for cgroups further down the tree. */
1494 is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE);
1495 is_host_root = unit_has_host_root_cgroup(u);
f3725e64
LP
1496
1497 assert_se(c = unit_get_cgroup_context(u));
9cc54544
LP
1498
1499 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1500 if (!crt || !crt->cgroup_path)
1501 return;
1502
be2c0327
LP
1503 /* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container
1504 * then), and missing cgroups, i.e. EROFS and ENOENT. */
714e2e1d 1505
f1c5534e 1506 /* These attributes don't exist on the host cgroup root. */
be2c0327 1507 if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
f1c5534e 1508 uint64_t weight;
8e274523 1509
f1c5534e
YW
1510 if (cgroup_context_has_cpu_weight(c))
1511 weight = cgroup_context_cpu_weight(c, state);
1512 else
1513 weight = CGROUP_WEIGHT_DEFAULT;
66ebf6c0 1514
08183002
YW
1515 cgroup_apply_cpu_idle(u, weight);
1516 cgroup_apply_cpu_weight(u, weight);
1517 cgroup_apply_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
4ad49000
LP
1518 }
1519
047f5d63 1520 if ((apply_mask & CGROUP_MASK_CPUSET) && !is_local_root) {
08183002
YW
1521 cgroup_apply_cpuset(u, cgroup_context_allowed_cpus(c, state), "cpuset.cpus");
1522 cgroup_apply_cpuset(u, cgroup_context_allowed_mems(c, state), "cpuset.mems");
047f5d63
PH
1523 }
1524
4e1dfa45 1525 /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2
52fecf20
LP
1526 * controller), and in case of containers we want to leave control of these attributes to the container manager
1527 * (and we couldn't access that stuff anyway, even if we tried, when proper delegation is used). */
1528 if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) {
a7b06f6c 1529 bool has_io;
52fecf20 1530 uint64_t weight;
13c31542 1531
52fecf20 1532 has_io = cgroup_context_has_io_config(c);
13c31542 1533
52fecf20
LP
1534 if (has_io)
1535 weight = cgroup_context_io_weight(c, state);
a7b06f6c 1536 else
52fecf20 1537 weight = CGROUP_WEIGHT_DEFAULT;
13c31542 1538
17283ce7 1539 set_io_weight(u, weight);
2dbc45ae 1540
52fecf20 1541 if (has_io) {
52fecf20
LP
1542 LIST_FOREACH(device_weights, w, c->io_device_weights)
1543 cgroup_apply_io_device_weight(u, w->path, w->weight);
128fadc9 1544
52fecf20
LP
1545 LIST_FOREACH(device_limits, limit, c->io_device_limits)
1546 cgroup_apply_io_device_limit(u, limit->path, limit->limits);
6ae4283c 1547
52fecf20
LP
1548 LIST_FOREACH(device_latencies, latency, c->io_device_latencies)
1549 cgroup_apply_io_device_latency(u, latency->path, latency->target_usec);
13c31542
TH
1550 }
1551 }
1552
f1c5534e 1553 /* 'memory' attributes do not exist on the root cgroup. */
be2c0327 1554 if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
f1c5534e 1555 uint64_t max = CGROUP_LIMIT_MAX, swap_max = CGROUP_LIMIT_MAX, zswap_max = CGROUP_LIMIT_MAX, high = CGROUP_LIMIT_MAX;
efdb0237 1556
08183002 1557 if (unit_has_memory_config(u)) {
f1c5534e 1558 bool startup = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
128fadc9 1559
f1c5534e
YW
1560 high = startup && c->startup_memory_high_set ? c->startup_memory_high : c->memory_high;
1561 max = startup && c->startup_memory_max_set ? c->startup_memory_max : c->memory_max;
1562 swap_max = startup && c->startup_memory_swap_max_set ? c->startup_memory_swap_max : c->memory_swap_max;
1563 zswap_max = startup && c->startup_memory_zswap_max_set ? c->startup_memory_zswap_max : c->memory_zswap_max;
1564 }
78a4ee59 1565
08183002
YW
1566 cgroup_apply_memory_limit(u, "memory.min", unit_get_ancestor_memory_min(u));
1567 cgroup_apply_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u));
1568 cgroup_apply_memory_limit(u, "memory.high", high);
1569 cgroup_apply_memory_limit(u, "memory.max", max);
1570 cgroup_apply_memory_limit(u, "memory.swap.max", swap_max);
1571 cgroup_apply_memory_limit(u, "memory.zswap.max", zswap_max);
be2c0327 1572
f1c5534e
YW
1573 (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
1574 (void) set_attribute_and_warn(u, "memory", "memory.zswap.writeback", one_zero(c->memory_zswap_writeback));
4ad49000 1575 }
8e274523 1576
00b5974f
LP
1577 if (apply_mask & CGROUP_MASK_PIDS) {
1578
52fecf20 1579 if (is_host_root) {
00b5974f
LP
1580 /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
1581 * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
1582 * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
1583 * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
1584 * exclusive ownership of the sysctls, but we still want to honour things if the user sets
1585 * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
1586 * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
1587 * it also counts. But if the user never set a limit through us (i.e. we are the default of
1588 * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
1589 * the first time we set a limit. Note that this boolean is flushed out on manager reload,
5238e957 1590 * which is desirable so that there's an official way to release control of the sysctl from
00b5974f
LP
1591 * systemd: set the limit to unbounded and reload. */
1592
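                        /* Illustration (hypothetical values): setting TasksMax=4096 on the root slice
                         * makes us write 4096 to the sysctl; later resetting it to "infinity" writes
                         * TASKS_MAX back; if no limit was ever set through us, the sysctl is left
                         * untouched. */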
94f0b13b 1593 if (cgroup_tasks_max_isset(&c->tasks_max)) {
00b5974f 1594 u->manager->sysctl_pid_max_changed = true;
94f0b13b 1595 r = procfs_tasks_set_limit(cgroup_tasks_max_resolve(&c->tasks_max));
00b5974f
LP
1596 } else if (u->manager->sysctl_pid_max_changed)
1597 r = procfs_tasks_set_limit(TASKS_MAX);
1598 else
1599 r = 0;
00b5974f 1600 if (r < 0)
8ed6f81b
YW
1601 log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r,
1602 "Failed to write to tasks limit sysctls: %m");
52fecf20 1603 }
03a7b521 1604
52fecf20
LP
1605 /* The attribute itself is not available on the host root cgroup, and in the container case we want to
1606 * leave it for the container manager. */
1607 if (!is_local_root) {
94f0b13b 1608 if (cgroup_tasks_max_isset(&c->tasks_max)) {
3a0f06c4 1609 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
03a7b521 1610
94f0b13b 1611 xsprintf(buf, "%" PRIu64 "\n", cgroup_tasks_max_resolve(&c->tasks_max));
293d32df 1612 (void) set_attribute_and_warn(u, "pids", "pids.max", buf);
00b5974f 1613 } else
589a5f7a 1614 (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n");
00b5974f 1615 }
03a7b521 1616 }
906c06f6 1617
239afa59
MY
1618 /* On cgroup v2 we can apply BPF everywhere. */
1619 if (apply_mask & CGROUP_MASK_BPF_DEVICES)
1620 (void) cgroup_apply_devices(u);
1621
17f14955 1622 if (apply_mask & CGROUP_MASK_BPF_FIREWALL)
0f2d84d2 1623 cgroup_apply_firewall(u);
506ea51b
JK
1624
1625 if (apply_mask & CGROUP_MASK_BPF_FOREIGN)
1626 cgroup_apply_bpf_foreign_program(u);
a8e5eb17
JK
1627
1628 if (apply_mask & CGROUP_MASK_BPF_SOCKET_BIND)
1629 cgroup_apply_socket_bind(u);
6f50d4f7
MV
1630
1631 if (apply_mask & CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES)
1632 cgroup_apply_restrict_network_interfaces(u);
dc7d69b3 1633
49b6babb 1634 unit_modify_nft_set(u, /* add = */ true);
fb385181
LP
1635}
1636
16492445
LP
1637static bool unit_get_needs_bpf_firewall(Unit *u) {
1638 CGroupContext *c;
16492445
LP
1639 assert(u);
1640
1641 c = unit_get_cgroup_context(u);
1642 if (!c)
1643 return false;
1644
1645 if (c->ip_accounting ||
84ebe6f0
YW
1646 !set_isempty(c->ip_address_allow) ||
1647 !set_isempty(c->ip_address_deny) ||
fab34748
KL
1648 c->ip_filters_ingress ||
1649 c->ip_filters_egress)
16492445
LP
1650 return true;
1651
1652 /* If any parent slice has an IP access list defined, it applies too */
e8616626 1653 for (Unit *p = UNIT_GET_SLICE(u); p; p = UNIT_GET_SLICE(p)) {
16492445
LP
1654 c = unit_get_cgroup_context(p);
1655 if (!c)
1656 return false;
1657
84ebe6f0
YW
1658 if (!set_isempty(c->ip_address_allow) ||
1659 !set_isempty(c->ip_address_deny))
16492445
LP
1660 return true;
1661 }
1662
1663 return false;
1664}
1665
506ea51b
JK
1666static bool unit_get_needs_bpf_foreign_program(Unit *u) {
1667 CGroupContext *c;
1668 assert(u);
1669
1670 c = unit_get_cgroup_context(u);
1671 if (!c)
1672 return false;
1673
64903d18 1674 return !!c->bpf_foreign_programs;
506ea51b
JK
1675}
1676
a8e5eb17
JK
1677static bool unit_get_needs_socket_bind(Unit *u) {
1678 CGroupContext *c;
1679 assert(u);
1680
1681 c = unit_get_cgroup_context(u);
1682 if (!c)
1683 return false;
1684
11ab01e4 1685 return c->socket_bind_allow || c->socket_bind_deny;
a8e5eb17
JK
1686}
1687
6f50d4f7
MV
1688static bool unit_get_needs_restrict_network_interfaces(Unit *u) {
1689 CGroupContext *c;
1690 assert(u);
1691
1692 c = unit_get_cgroup_context(u);
1693 if (!c)
1694 return false;
1695
1696 return !set_isempty(c->restrict_network_interfaces);
1697}
1698
c52db42b 1699static CGroupMask unit_get_cgroup_mask(Unit *u) {
efdb0237 1700 CGroupMask mask = 0;
c52db42b
CD
1701 CGroupContext *c;
1702
1703 assert(u);
1704
806a9362 1705 assert_se(c = unit_get_cgroup_context(u));
c710d3b4 1706
fae9bc29 1707 /* Figure out which controllers we need, based on the cgroup context object */
8e274523 1708
fae9bc29 1709 if (cgroup_context_has_cpu_weight(c) ||
3a43da28 1710 c->cpu_quota_per_sec_usec != USEC_INFINITY)
fae9bc29 1711 mask |= CGROUP_MASK_CPU;
ecedd90f 1712
31d3a520 1713 if (cgroup_context_has_allowed_cpus(c) || cgroup_context_has_allowed_mems(c))
047f5d63
PH
1714 mask |= CGROUP_MASK_CPUSET;
1715
a7b06f6c 1716 if (cgroup_context_has_io_config(c))
538b4852 1717 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
ecedd90f 1718
4ad49000 1719 if (c->memory_accounting ||
08183002 1720 unit_has_memory_config(u))
efdb0237 1721 mask |= CGROUP_MASK_MEMORY;
8e274523 1722
a931ad47 1723 if (c->device_allow ||
084870f9 1724 c->device_policy != CGROUP_DEVICE_POLICY_AUTO)
084c7007 1725 mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;
4ad49000 1726
03a7b521 1727 if (c->tasks_accounting ||
94f0b13b 1728 cgroup_tasks_max_isset(&c->tasks_max))
03a7b521
LP
1729 mask |= CGROUP_MASK_PIDS;
1730
cd3435fc 1731 return mask;
8e274523
LP
1732}
1733
53aea74a 1734static CGroupMask unit_get_bpf_mask(Unit *u) {
17f14955
RG
1735 CGroupMask mask = 0;
1736
fae9bc29
LP
1737 /* Figure out which controllers we need, based on the cgroup context, possibly taking into account children
1738 * too. */
1739
17f14955
RG
1740 if (unit_get_needs_bpf_firewall(u))
1741 mask |= CGROUP_MASK_BPF_FIREWALL;
1742
506ea51b
JK
1743 if (unit_get_needs_bpf_foreign_program(u))
1744 mask |= CGROUP_MASK_BPF_FOREIGN;
1745
a8e5eb17
JK
1746 if (unit_get_needs_socket_bind(u))
1747 mask |= CGROUP_MASK_BPF_SOCKET_BIND;
1748
6f50d4f7
MV
1749 if (unit_get_needs_restrict_network_interfaces(u))
1750 mask |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES;
1751
17f14955
RG
1752 return mask;
1753}
1754
efdb0237 1755CGroupMask unit_get_own_mask(Unit *u) {
4ad49000 1756 CGroupContext *c;
8e274523 1757
442ce775
LP
1758 /* Returns the mask of controllers the unit needs for itself. If a unit is not properly loaded, return an empty
1759 * mask, as we shouldn't reflect it in the cgroup hierarchy then. */
1760
1761 if (u->load_state != UNIT_LOADED)
1762 return 0;
efdb0237 1763
4ad49000
LP
1764 c = unit_get_cgroup_context(u);
1765 if (!c)
1766 return 0;
8e274523 1767
12b975e0 1768 return unit_get_cgroup_mask(u) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u);
02638280
LP
1769}
1770
1771CGroupMask unit_get_delegate_mask(Unit *u) {
1772 CGroupContext *c;
1773
35f88201 1774 /* If delegation is turned on, then turn on selected controllers.
19af675e 1775 *
02638280 1776 * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
a931ad47 1777
1d9cc876 1778 if (!unit_cgroup_delegate(u))
02638280
LP
1779 return 0;
1780
1d9cc876 1781 assert_se(c = unit_get_cgroup_context(u));
cd3435fc 1782 return c->delegate_controllers;
8e274523
LP
1783}
1784
d9ef5944
MK
1785static CGroupMask unit_get_subtree_mask(Unit *u) {
1786
1787 /* Returns the mask of this subtree, i.e. of the group
1788 * itself plus all of its children. */
1789
1790 return unit_get_own_mask(u) | unit_get_members_mask(u);
1791}
1792
efdb0237 1793CGroupMask unit_get_members_mask(Unit *u) {
4ad49000 1794 assert(u);
bc432dc7 1795
02638280 1796 /* Returns the mask of controllers all of the unit's children require, merged */
efdb0237 1797
9cc54544
LP
1798 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1799 if (crt && crt->cgroup_members_mask_valid)
1800 return crt->cgroup_members_mask; /* Use cached value if possible */
bc432dc7 1801
9cc54544 1802 CGroupMask m = 0;
bc432dc7
LP
1803 if (u->type == UNIT_SLICE) {
1804 Unit *member;
bc432dc7 1805
d219a2b0 1806 UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
9cc54544 1807 m |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
bc432dc7
LP
1808 }
1809
9cc54544
LP
1810 if (crt) {
1811 crt->cgroup_members_mask = m;
1812 crt->cgroup_members_mask_valid = true;
1813 }
1814
1815 return m;
246aa6dd
LP
1816}
1817
efdb0237 1818CGroupMask unit_get_siblings_mask(Unit *u) {
12f64221 1819 Unit *slice;
4ad49000 1820 assert(u);
246aa6dd 1821
efdb0237
LP
1822 /* Returns the mask of controllers all of the unit's siblings
1823 * require, i.e. the members mask of the unit's parent slice
1824 * if there is one. */
1825
12f64221
LP
1826 slice = UNIT_GET_SLICE(u);
1827 if (slice)
1828 return unit_get_members_mask(slice);
4ad49000 1829
64e844e5 1830 return unit_get_subtree_mask(u); /* we are the top-level slice */
246aa6dd
LP
1831}
1832
d9ef5944 1833static CGroupMask unit_get_disable_mask(Unit *u) {
4f6f62e4
CD
1834 CGroupContext *c;
1835
1836 c = unit_get_cgroup_context(u);
1837 if (!c)
1838 return 0;
1839
1840 return c->disable_controllers;
1841}
1842
1843CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
1844 CGroupMask mask;
12f64221 1845 Unit *slice;
4f6f62e4
CD
1846
1847 assert(u);
1848 mask = unit_get_disable_mask(u);
1849
1850 /* Returns the mask of controllers which are marked as forcibly
1851 * disabled in any ancestor unit or the unit in question. */
1852
12f64221
LP
1853 slice = UNIT_GET_SLICE(u);
1854 if (slice)
1855 mask |= unit_get_ancestor_disable_mask(slice);
4f6f62e4
CD
1856
1857 return mask;
1858}
1859
efdb0237 1860CGroupMask unit_get_target_mask(Unit *u) {
a437c5e4 1861 CGroupMask own_mask, mask;
efdb0237 1862
a437c5e4
LP
1863 /* This returns the cgroup mask of all controllers to enable for a specific cgroup, i.e. everything
1864 * it needs itself, plus all that its children need, plus all that its siblings need. This is
1865 * primarily useful on the legacy cgroup hierarchy, where we need to duplicate each cgroup in each
efdb0237 1866 * hierarchy that shall be enabled for it. */
6414b7c9 1867
a437c5e4 1868 own_mask = unit_get_own_mask(u);
84d2744b 1869
a437c5e4 1870 if (own_mask & CGROUP_MASK_BPF_FIREWALL & ~u->manager->cgroup_supported)
84d2744b
ZJS
1871 emit_bpf_firewall_warning(u);
1872
a437c5e4
LP
1873 mask = own_mask | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1874
efdb0237 1875 mask &= u->manager->cgroup_supported;
c72703e2 1876 mask &= ~unit_get_ancestor_disable_mask(u);
efdb0237
LP
1877
1878 return mask;
1879}
1880
1881CGroupMask unit_get_enable_mask(Unit *u) {
1882 CGroupMask mask;
1883
1884 /* This returns the cgroup mask of all controllers to enable
1885 * for the children of a specific cgroup. This is primarily
1886 * useful for the unified cgroup hierarchy, where each cgroup
1887 * controls which controllers are enabled for its children. */
1888
1889 mask = unit_get_members_mask(u);
6414b7c9 1890 mask &= u->manager->cgroup_supported;
c72703e2 1891 mask &= ~unit_get_ancestor_disable_mask(u);
6414b7c9
DS
1892
1893 return mask;
1894}
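/* Illustrative example of the two masks: a service with CPUWeight= set has CPU in its target mask;
 * if a sibling in the same slice sets TasksMax=, the slice's enable mask becomes CPU|PIDS, i.e. both
 * controllers get turned on for the slice's children via cgroup.subtree_control. */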
1895
5af88058 1896void unit_invalidate_cgroup_members_masks(Unit *u) {
12f64221
LP
1897 Unit *slice;
1898
bc432dc7
LP
1899 assert(u);
1900
9cc54544
LP
1901 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1902 if (!crt)
1903 return;
1904
5af88058 1905 /* Recursively invalidate the member masks cache all the way up the tree */
9cc54544 1906 crt->cgroup_members_mask_valid = false;
bc432dc7 1907
12f64221
LP
1908 slice = UNIT_GET_SLICE(u);
1909 if (slice)
1910 unit_invalidate_cgroup_members_masks(slice);
6414b7c9
DS
1911}
1912
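/* Computes the unit's default cgroup path from its slice chain. For example (illustrative names),
 * "foo.service" in "system.slice" maps to "<cgroup-root>/system.slice/foo.service", and a nested
 * slice "a-b.slice" expands to "a.slice/a-b.slice" via cg_slice_to_path(). */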
ea763af4 1913static int unit_default_cgroup_path(const Unit *u, char **ret) {
1a56b0c0 1914 _cleanup_free_ char *p = NULL;
efdb0237
LP
1915 int r;
1916
1917 assert(u);
1a56b0c0 1918 assert(ret);
efdb0237
LP
1919
1920 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1a56b0c0
LP
1921 p = strdup(u->manager->cgroup_root);
1922 else {
1923 _cleanup_free_ char *escaped = NULL, *slice_path = NULL;
1924 Unit *slice;
efdb0237 1925
1a56b0c0
LP
1926 slice = UNIT_GET_SLICE(u);
1927 if (slice && !unit_has_name(slice, SPECIAL_ROOT_SLICE)) {
1928 r = cg_slice_to_path(slice->id, &slice_path);
1929 if (r < 0)
1930 return r;
1931 }
1932
1933 r = cg_escape(u->id, &escaped);
efdb0237 1934 if (r < 0)
1a56b0c0 1935 return r;
efdb0237 1936
1a56b0c0
LP
1937 p = path_join(empty_to_root(u->manager->cgroup_root), slice_path, escaped);
1938 }
1939 if (!p)
1940 return -ENOMEM;
efdb0237 1941
1a56b0c0
LP
1942 *ret = TAKE_PTR(p);
1943 return 0;
efdb0237
LP
1944}
1945
ea763af4 1946static int unit_set_cgroup_path(Unit *u, const char *path) {
efdb0237 1947 _cleanup_free_ char *p = NULL;
9cc54544 1948 CGroupRuntime *crt;
efdb0237
LP
1949 int r;
1950
1951 assert(u);
1952
9cc54544 1953 crt = unit_get_cgroup_runtime(u);
9cc54544 1954 if (crt && streq_ptr(crt->cgroup_path, path))
5210387e
LP
1955 return 0;
1956
4918f14a 1957 unit_release_cgroup(u, /* drop_cgroup_runtime = */ true);
9cc54544
LP
1958
1959 crt = unit_setup_cgroup_runtime(u);
1960 if (!crt)
1961 return -ENOMEM;
1962
efdb0237
LP
1963 if (path) {
1964 p = strdup(path);
1965 if (!p)
1966 return -ENOMEM;
efdb0237 1967
efdb0237
LP
1968 r = hashmap_put(u->manager->cgroup_unit, p, u);
1969 if (r < 0)
1970 return r;
1971 }
1972
9cc54544
LP
1973 assert(!crt->cgroup_path);
1974 crt->cgroup_path = TAKE_PTR(p);
efdb0237
LP
1975
1976 return 1;
1977}
1978
23ac0811
MY
1979int unit_get_cgroup_path_with_fallback(const Unit *u, char **ret) {
1980 assert(u);
1981 assert(ret);
1982
1983 const CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1984 if (!crt || !crt->cgroup_path)
1985 return unit_default_cgroup_path(u, ret);
1986
1987 return strdup_to_full(ret, crt->cgroup_path); /* returns 1 -> cgroup_path is alive */
1988}
1989
ea763af4 1990static int unit_watch_cgroup(Unit *u) {
ab2c3861 1991 _cleanup_free_ char *events = NULL;
efdb0237
LP
1992 int r;
1993
1994 assert(u);
1995
0bb814c2
LP
1996 /* Watches the "cgroup.events" attribute of this unit's cgroup for "empty" events, but only if
1997 * cgroupv2 is available. */
1998
9cc54544
LP
1999 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2000 if (!crt || !crt->cgroup_path)
efdb0237
LP
2001 return 0;
2002
9cc54544 2003 if (crt->cgroup_control_inotify_wd >= 0)
efdb0237
LP
2004 return 0;
2005
0bb814c2 2006 /* No point in watching the top-level slice, it's never going to run empty. */
efdb0237
LP
2007 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
2008 return 0;
2009
0bb814c2 2010 r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops);
efdb0237
LP
2011 if (r < 0)
2012 return log_oom();
2013
9cc54544 2014 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "cgroup.events", &events);
efdb0237
LP
2015 if (r < 0)
2016 return log_oom();
2017
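        /* cgroup.events contains flat key/value lines such as "populated 1" and "frozen 0"; the kernel
         * rewrites the file on state changes, so an IN_MODIFY watch on it serves as our notification
         * that the cgroup may have run empty. */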
9cc54544
LP
2018 crt->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
2019 if (crt->cgroup_control_inotify_wd < 0) {
efdb0237 2020
0bb814c2
LP
2021 if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
2022 * is not an error */
efdb0237
LP
2023 return 0;
2024
9cc54544 2025 return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", empty_to_root(crt->cgroup_path));
efdb0237
LP
2026 }
2027
9cc54544 2028 r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(crt->cgroup_control_inotify_wd), u);
efdb0237 2029 if (r < 0)
9cc54544 2030 return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor for control group %s to hash map: %m", empty_to_root(crt->cgroup_path));
efdb0237
LP
2031
2032 return 0;
2033}
2034
ea763af4 2035static int unit_watch_cgroup_memory(Unit *u) {
afcfaa69 2036 _cleanup_free_ char *events = NULL;
afcfaa69
LP
2037 int r;
2038
2039 assert(u);
2040
2041 /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if
2042 * cgroupv2 is available. */
2043
9cc54544
LP
2044 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2045 if (!crt || !crt->cgroup_path)
afcfaa69
LP
2046 return 0;
2047
9cc54544 2048 CGroupContext *c = unit_get_cgroup_context(u);
afcfaa69
LP
2049 if (!c)
2050 return 0;
2051
2052 /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie
2053 * this to memory accounting, in a way watching for OOM kills is a form of memory accounting after
2054 * all. */
2055 if (!c->memory_accounting)
2056 return 0;
2057
2058 /* Don't watch inner nodes, as the kernel currently doesn't report oom_kill events recursively, and
2059 * we also don't want to generate a log message for each parent cgroup of a process. */
2060 if (u->type == UNIT_SLICE)
2061 return 0;
2062
9cc54544 2063 if (crt->cgroup_memory_inotify_wd >= 0)
afcfaa69
LP
2064 return 0;
2065
afcfaa69
LP
2066 r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops);
2067 if (r < 0)
2068 return log_oom();
2069
9cc54544 2070 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "memory.events", &events);
afcfaa69
LP
2071 if (r < 0)
2072 return log_oom();
2073
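        /* memory.events carries counters such as "low", "high", "max", "oom" and "oom_kill"; we watch
         * for modifications and later parse the "oom_kill" counter to detect kernel OOM kills. */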
9cc54544
LP
2074 crt->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
2075 if (crt->cgroup_memory_inotify_wd < 0) {
afcfaa69
LP
2076
2077 if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
2078 * is not an error */
2079 return 0;
2080
9cc54544 2081 return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", empty_to_root(crt->cgroup_path));
afcfaa69
LP
2082 }
2083
9cc54544 2084 r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(crt->cgroup_memory_inotify_wd), u);
afcfaa69 2085 if (r < 0)
9cc54544 2086 return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor for control group %s to hash map: %m", empty_to_root(crt->cgroup_path));
afcfaa69
LP
2087
2088 return 0;
2089}
2090
7b639614 2091static int unit_update_cgroup(
efdb0237
LP
2092 Unit *u,
2093 CGroupMask target_mask,
0d2d6fbf
CD
2094 CGroupMask enable_mask,
2095 ManagerState state) {
efdb0237 2096
23ac0811
MY
2097 _cleanup_free_ char *cgroup = NULL, *cgroup_full_path = NULL;
2098 bool set_path, created;
27adcc97 2099 int r;
64747e2d 2100
4ad49000 2101 assert(u);
64747e2d 2102
27c4ed79 2103 if (!UNIT_HAS_CGROUP_CONTEXT(u))
0cd385d3
LP
2104 return 0;
2105
7923e949
AV
2106 if (u->freezer_state != FREEZER_RUNNING)
2107 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EBUSY), "Cannot realize cgroup for frozen unit.");
2108
23ac0811 2109 r = unit_get_cgroup_path_with_fallback(u, &cgroup);
a4634b21 2110 if (r < 0)
23ac0811
MY
2111 return log_unit_error_errno(u, r, "Failed to get cgroup path: %m");
2112 set_path = r == 0;
9cc54544 2113
03b90d4b 2114 /* First, create our own group */
23ac0811 2115 r = cg_create(cgroup);
23bbb0de 2116 if (r < 0)
23ac0811 2117 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(cgroup));
490c5a37 2118 created = r;
efdb0237 2119
23ac0811
MY
2120 if (set_path) {
2121 r = unit_set_cgroup_path(u, cgroup);
2122 if (r == -EEXIST)
2123 return log_unit_error_errno(u, r, "Picked control group '%s' as default, but it's in use already.", empty_to_root(cgroup));
2124 if (r < 0)
2125 return log_unit_error_errno(u, r, "Failed to set unit's control group path to '%s': %m", empty_to_root(cgroup));
2126 assert(r > 0);
2127 }
2128
2129 CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
2130
4ee64e43
YW
2131 uint64_t cgroup_id = 0;
2132 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &cgroup_full_path);
2133 if (r == 0) {
2134 r = cg_path_get_cgroupid(cgroup_full_path, &cgroup_id);
2135 if (r < 0)
2136 log_unit_full_errno(u, ERRNO_IS_NOT_SUPPORTED(r) ? LOG_DEBUG : LOG_WARNING, r,
2137 "Failed to get cgroup ID of cgroup %s, ignoring: %m", cgroup_full_path);
2138 } else
2139 log_unit_warning_errno(u, r, "Failed to get full cgroup path on cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path));
184b4f78 2140
4ee64e43 2141 crt->cgroup_id = cgroup_id;
184b4f78 2142
efdb0237
LP
2143 /* Start watching it */
2144 (void) unit_watch_cgroup(u);
afcfaa69 2145 (void) unit_watch_cgroup_memory(u);
efdb0237 2146
4ee64e43 2147 /* For v2 we preserve enabled controllers in delegated units, and adjust the others. */
23ac0811 2148 if (created || !unit_cgroup_delegate(u)) {
27adcc97 2149 CGroupMask result_mask = 0;
65be7e06
ZJS
2150
2151 /* Enable all controllers we need */
188286ee 2152 r = cg_enable(u->manager->cgroup_supported, enable_mask, crt->cgroup_path, &result_mask);
65be7e06 2153 if (r < 0)
9cc54544 2154 log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path));
27adcc97 2155
27adcc97 2156 /* Remember what's actually enabled now */
9cc54544 2157 crt->cgroup_enabled_mask = result_mask;
65be7e06 2158 }
03b90d4b
LP
2159
2160 /* Keep track that this is now realized */
9cc54544 2161 crt->cgroup_realized_mask = target_mask;
4ad49000 2162
0d2d6fbf
CD
2163 /* Set attributes */
2164 cgroup_context_apply(u, target_mask, state);
2165 cgroup_xattr_apply(u);
2166
29e6b0c1
LP
2167 /* For most units we expect that memory monitoring is set up before the unit is started and we won't
2168 * touch it after. For PID 1 this is different though, because we couldn't possibly do that given
2169 * that PID 1 runs before init.scope is even set up. Hence, whenever init.scope is realized, let's
2170 * try to open the memory pressure interface anew. */
2171 if (unit_has_name(u, SPECIAL_INIT_SCOPE))
2172 (void) manager_setup_memory_pressure_event_source(u->manager);
2173
64747e2d
LP
2174 return 0;
2175}
2176
6592b975
LP
2177static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
2178 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2179 char *pp;
7b3fd631 2180 int r;
6592b975 2181
7b3fd631
LP
2182 assert(u);
2183
6592b975
LP
2184 if (MANAGER_IS_SYSTEM(u->manager))
2185 return -EINVAL;
2186
2187 if (!u->manager->system_bus)
2188 return -EIO;
2189
9cc54544
LP
2190 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2191 if (!crt || !crt->cgroup_path)
2192 return -EOWNERDEAD;
6592b975
LP
2193
2194 /* Determine this unit's cgroup path relative to our cgroup root */
9cc54544 2195 pp = path_startswith(crt->cgroup_path, u->manager->cgroup_root);
6592b975
LP
2196 if (!pp)
2197 return -EINVAL;
2198
2199 pp = strjoina("/", pp, suffix_path);
4ff361cc 2200 path_simplify(pp);
6592b975 2201
78fa2f91 2202 r = bus_call_method(u->manager->system_bus,
2203 bus_systemd_mgr,
2204 "AttachProcessesToUnit",
2205 &error, NULL,
2206 "ssau",
2207 NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
7b3fd631 2208 if (r < 0)
6592b975
LP
2209 return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
2210
2211 return 0;
2212}
2213
2214int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
8e7e4a73 2215 _cleanup_free_ char *joined = NULL;
6592b975 2216 const char *p;
c9eff0bc 2217 int ret = 0, r;
6592b975
LP
2218
2219 assert(u);
2220
2221 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2222 return -EINVAL;
2223
2224 if (set_isempty(pids))
2225 return 0;
7b3fd631 2226
fab34748
KL
2227 /* Load any custom firewall BPF programs here once to test if they are existing and actually loadable.
2228 * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */
2229 r = bpf_firewall_load_custom(u);
2230 if (r < 0)
2231 return r;
2232
6592b975 2233 r = unit_realize_cgroup(u);
7b3fd631
LP
2234 if (r < 0)
2235 return r;
2236
9cc54544
LP
2237 CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
2238
6592b975 2239 if (isempty(suffix_path))
9cc54544 2240 p = crt->cgroup_path;
8e7e4a73 2241 else {
9cc54544 2242 joined = path_join(crt->cgroup_path, suffix_path);
8e7e4a73
LP
2243 if (!joined)
2244 return -ENOMEM;
2245
2246 p = joined;
2247 }
6592b975 2248
c9eff0bc 2249 PidRef *pid;
495e75ed
LP
2250 SET_FOREACH(pid, pids) {
2251
2252 /* Unfortunately we cannot add pids by pidfd to a cgroup. Hence we have to use PIDs instead,
2253 * which of course is racy. Let's shorten the race a bit though, and re-validate the PID
2254 * before we use it */
2255 r = pidref_verify(pid);
2256 if (r < 0) {
2257 log_unit_info_errno(u, r, "PID " PID_FMT " vanished before we could move it to target cgroup '%s', skipping: %m", pid->pid, empty_to_root(p));
2258 continue;
2259 }
6592b975 2260
188286ee 2261 r = cg_attach(p, pid->pid);
db4229d1 2262 if (r < 0) {
c9eff0bc 2263 bool again = MANAGER_IS_USER(u->manager) && ERRNO_IS_NEG_PRIVILEGE(r);
6592b975 2264
db4229d1 2265 log_unit_full_errno(u, again ? LOG_DEBUG : LOG_INFO, r,
7a2ba407 2266 "Couldn't move process "PID_FMT" to%s requested cgroup '%s': %m",
495e75ed 2267 pid->pid, again ? " directly" : "", empty_to_root(p));
7a2ba407
ZJS
2268
2269 if (again) {
6592b975
LP
2270 int z;
2271
7a2ba407
ZJS
2272 /* If we are in a user instance, and we can't move the process ourselves due
2273 * to permission problems, let's ask the system instance about it instead.
2274 * Since it's more privileged it might be able to move the process across the
2275 * leaves of a subtree whose top node is not owned by us. */
6592b975 2276
495e75ed 2277 z = unit_attach_pid_to_cgroup_via_bus(u, pid->pid, suffix_path);
c9eff0bc
MY
2278 if (z >= 0)
2279 goto success;
6592b975 2280
c9eff0bc
MY
2281 log_unit_info_errno(u, z, "Couldn't move process "PID_FMT" to requested cgroup '%s' (directly or via the system bus): %m", pid->pid, empty_to_root(p));
2282 }
6592b975 2283
c9eff0bc 2284 RET_GATHER(ret, r);
6592b975 2285 continue;
bb160976 2286 }
6592b975 2287
c9eff0bc
MY
2288 success:
2289 /* The cgroup is definitely not empty now. In case the unit was in the cgroup empty queue,
2290 * drop it from there. */
2291 unit_remove_from_cgroup_empty_queue(u);
6592b975 2292
c9eff0bc
MY
2293 if (ret >= 0)
2294 ret++; /* Count successful additions */
6592b975
LP
2295 }
2296
db4229d1 2297 return ret;
7b3fd631
LP
2298}
2299
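/* Removes a subcgroup below a delegated unit's cgroup. For example (illustrative name), a suffix_path
 * of "supervisor/child" trims "<unit-cgroup>/supervisor/child" including its root, while an empty
 * suffix merely trims inside the unit's own cgroup without deleting it. */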
94634b4b
LP
2300int unit_remove_subcgroup(Unit *u, const char *suffix_path) {
2301 int r;
2302
2303 assert(u);
2304
2305 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2306 return -EINVAL;
2307
2308 if (!unit_cgroup_delegate(u))
2309 return -ENOMEDIUM;
2310
94634b4b
LP
2311 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2312 if (!crt || !crt->cgroup_path)
2313 return -EOWNERDEAD;
2314
2315 _cleanup_free_ char *j = NULL;
2316 bool delete_root;
2317 const char *d;
2318 if (empty_or_root(suffix_path)) {
2319 d = empty_to_root(crt->cgroup_path);
2320 delete_root = false; /* Don't attempt to delete the main cgroup of this unit */
2321 } else {
2322 j = path_join(crt->cgroup_path, suffix_path);
2323 if (!j)
2324 return -ENOMEM;
2325
2326 d = j;
2327 delete_root = true;
2328 }
2329
2330 log_unit_debug(u, "Removing subcgroup '%s'...", d);
2331
188286ee 2332 r = cg_trim(d, delete_root);
94634b4b
LP
2333 if (r < 0)
2334 return log_unit_debug_errno(u, r, "Failed to fully %s cgroup '%s': %m", delete_root ? "remove" : "trim", d);
2335
2336 return 0;
2337}
2338
906c06f6
DM
2339static bool unit_has_mask_realized(
2340 Unit *u,
2341 CGroupMask target_mask,
17f14955 2342 CGroupMask enable_mask) {
906c06f6 2343
bc432dc7
LP
2344 assert(u);
2345
9cc54544
LP
2346 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2347 if (!crt)
2348 return false;
2349
d5095dcd
LP
2350 /* Returns true if this unit is fully realized. We check four things:
2351 *
2352 * 1. Whether the cgroup was created at all
4e1dfa45
CD
2353 * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1)
2354 * 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2)
d5095dcd
LP
2355 * 4. Whether the invalidation mask is currently zero
2356 *
2357 * If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note
4e1dfa45
CD
2358 * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for
2359 * real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask
2360 * only matters for cgroup v1 controllers, and cgroup_enabled_mask is only used for cgroup v2, and if they
d5095dcd
LP
2361 * differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks which controllers are
2362 * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
2363 * simply don't matter.) */
2364
23ac0811 2365 return crt->cgroup_path &&
9cc54544
LP
2366 ((crt->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 &&
2367 ((crt->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 &&
2368 crt->cgroup_invalidated_mask == 0;
6414b7c9
DS
2369}
2370
4f6f62e4
CD
2371static bool unit_has_mask_disables_realized(
2372 Unit *u,
2373 CGroupMask target_mask,
2374 CGroupMask enable_mask) {
2375
2376 assert(u);
2377
9cc54544
LP
2378 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2379 if (!crt)
2380 return true;
2381
4f6f62e4
CD
2382 /* Returns true if all controllers which should be disabled are indeed disabled.
2383 *
2384 * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
2385 * already removed. */
2386
23ac0811 2387 return !crt->cgroup_path ||
9cc54544
LP
2388 (FLAGS_SET(crt->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
2389 FLAGS_SET(crt->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
4f6f62e4
CD
2390}
2391
a57669d2
CD
2392static bool unit_has_mask_enables_realized(
2393 Unit *u,
2394 CGroupMask target_mask,
2395 CGroupMask enable_mask) {
2396
2397 assert(u);
2398
9cc54544
LP
2399 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2400 if (!crt)
2401 return false;
2402
a57669d2
CD
2403 /* Returns true if all controllers which should be enabled are indeed enabled.
2404 *
2405 * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
2406 * we want to add is already added. */
2407
23ac0811 2408 return crt->cgroup_path &&
9cc54544
LP
2409 ((crt->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (crt->cgroup_realized_mask & CGROUP_MASK_V1) &&
2410 ((crt->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (crt->cgroup_enabled_mask & CGROUP_MASK_V2);
a57669d2
CD
2411}
2412
020b2e41 2413void unit_add_to_cgroup_realize_queue(Unit *u) {
2aa57a65
LP
2414 assert(u);
2415
2416 if (u->in_cgroup_realize_queue)
2417 return;
2418
a479c21e 2419 LIST_APPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
2aa57a65
LP
2420 u->in_cgroup_realize_queue = true;
2421}
2422
2423static void unit_remove_from_cgroup_realize_queue(Unit *u) {
2424 assert(u);
2425
2426 if (!u->in_cgroup_realize_queue)
2427 return;
2428
2429 LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
2430 u->in_cgroup_realize_queue = false;
2431}
2432
a57669d2
CD
2433/* Controllers can only be enabled breadth-first, from the root of the
2434 * hierarchy downwards to the unit in question. */
2435static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
2436 CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
12f64221 2437 Unit *slice;
a57669d2
CD
2438 int r;
2439
2440 assert(u);
2441
2442 /* First go deal with this unit's parent, or we won't be able to enable
2443 * any new controllers at this layer. */
12f64221
LP
2444 slice = UNIT_GET_SLICE(u);
2445 if (slice) {
2446 r = unit_realize_cgroup_now_enable(slice, state);
a57669d2
CD
2447 if (r < 0)
2448 return r;
2449 }
2450
2451 target_mask = unit_get_target_mask(u);
2452 enable_mask = unit_get_enable_mask(u);
2453
2454 /* We can only enable in this direction, don't try to disable anything.
2455 */
2456 if (unit_has_mask_enables_realized(u, target_mask, enable_mask))
2457 return 0;
2458
9cc54544
LP
2459 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2460
2461 new_target_mask = (crt ? crt->cgroup_realized_mask : 0) | target_mask;
2462 new_enable_mask = (crt ? crt->cgroup_enabled_mask : 0) | enable_mask;
a57669d2 2463
7b639614 2464 return unit_update_cgroup(u, new_target_mask, new_enable_mask, state);
a57669d2
CD
2465}
2466
4f6f62e4
CD
2467/* Controllers can only be disabled depth-first, from the leaves of the
2468 * hierarchy upwards to the unit in question. */
2469static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
4f6f62e4 2470 Unit *m;
4f6f62e4
CD
2471
2472 assert(u);
2473
2474 if (u->type != UNIT_SLICE)
2475 return 0;
2476
d219a2b0 2477 UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
4f6f62e4
CD
2478 CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
2479 int r;
2480
9cc54544
LP
2481 CGroupRuntime *rt = unit_get_cgroup_runtime(m);
2482 if (!rt)
2483 continue;
2484
defe63b0
LP
2485 /* The cgroup for this unit might not actually be fully realised yet, in which case it isn't
2486 * holding any controllers open anyway. */
23ac0811 2487 if (!rt->cgroup_path)
4f6f62e4
CD
2488 continue;
2489
defe63b0 2490 /* We must disable those below us first in order to release the controller. */
4f6f62e4
CD
2491 if (m->type == UNIT_SLICE)
2492 (void) unit_realize_cgroup_now_disable(m, state);
2493
2494 target_mask = unit_get_target_mask(m);
2495 enable_mask = unit_get_enable_mask(m);
2496
defe63b0 2497 /* We can only disable in this direction, don't try to enable anything. */
4f6f62e4
CD
2498 if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
2499 continue;
2500
9cc54544
LP
2501 new_target_mask = rt->cgroup_realized_mask & target_mask;
2502 new_enable_mask = rt->cgroup_enabled_mask & enable_mask;
4f6f62e4 2503
7b639614 2504 r = unit_update_cgroup(m, new_target_mask, new_enable_mask, state);
4f6f62e4
CD
2505 if (r < 0)
2506 return r;
2507 }
2508
2509 return 0;
2510}
a57669d2 2511
6414b7c9
DS
2512/* Check if necessary controllers and attributes for a unit are in place.
2513 *
a57669d2
CD
2514 * - If so, do nothing.
2515 * - If not, create paths, move processes over, and set attributes.
2516 *
2517 * Controllers can only be *enabled* in a breadth-first way, and *disabled* in
2518 * a depth-first way. As such the process looks like this:
2519 *
2520 * Suppose we have a cgroup hierarchy which looks like this:
2521 *
2522 * root
2523 * / \
2524 * / \
2525 * / \
2526 * a b
2527 * / \ / \
2528 * / \ / \
2529 * c d e f
2530 * / \ / \ / \ / \
2531 * h i j k l m n o
2532 *
2533 * 1. We want to realise cgroup "d" now.
c72703e2 2534 * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
a57669d2
CD
2535 * 3. cgroup "k" just started requesting the memory controller.
2536 *
2537 * To make this work we must do the following in order:
2538 *
2539 * 1. Disable CPU controller in k, j
2540 * 2. Disable CPU controller in d
2541 * 3. Enable memory controller in root
2542 * 4. Enable memory controller in a
2543 * 5. Enable memory controller in d
2544 * 6. Enable memory controller in k
2545 *
2546 * Notice that we need to touch j in one direction, but not the other. We also
2547 * don't go beyond d when disabling -- it's up to "a" to get realized if it
2548 * wants to disable further. The basic rules are therefore:
2549 *
2550 * - If you're disabling something, you need to realise all of the cgroups from
2551 * your recursive descendants to the root. This starts from the leaves.
2552 * - If you're enabling something, you need to realise from the root cgroup
2553 * downwards, but you don't need to iterate your recursive descendants.
6414b7c9
DS
2554 *
2555 * Returns 0 on success and < 0 on failure. */
db785129 2556static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
efdb0237 2557 CGroupMask target_mask, enable_mask;
12f64221 2558 Unit *slice;
6414b7c9 2559 int r;
64747e2d 2560
4ad49000 2561 assert(u);
64747e2d 2562
2aa57a65 2563 unit_remove_from_cgroup_realize_queue(u);
64747e2d 2564
efdb0237 2565 target_mask = unit_get_target_mask(u);
ccf78df1
TH
2566 enable_mask = unit_get_enable_mask(u);
2567
17f14955 2568 if (unit_has_mask_realized(u, target_mask, enable_mask))
0a1eb06d 2569 return 0;
64747e2d 2570
4f6f62e4
CD
2571 /* Disable controllers below us, if there are any */
2572 r = unit_realize_cgroup_now_disable(u, state);
2573 if (r < 0)
2574 return r;
2575
2576 /* Enable controllers above us, if there are any */
12f64221
LP
2577 slice = UNIT_GET_SLICE(u);
2578 if (slice) {
2579 r = unit_realize_cgroup_now_enable(slice, state);
6414b7c9
DS
2580 if (r < 0)
2581 return r;
2582 }
4ad49000 2583
0d2d6fbf 2584 /* Now actually deal with the cgroup we were trying to realise and set attributes */
7b639614 2585 r = unit_update_cgroup(u, target_mask, enable_mask, state);
6414b7c9
DS
2586 if (r < 0)
2587 return r;
2588
9cc54544
LP
2589 CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
2590
c2baf11c 2591 /* Now, reset the invalidation mask */
9cc54544 2592 crt->cgroup_invalidated_mask = 0;
6414b7c9 2593 return 0;
64747e2d
LP
2594}
2595
91a6073e 2596unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
db785129 2597 ManagerState state;
4ad49000 2598 unsigned n = 0;
db785129 2599 Unit *i;
6414b7c9 2600 int r;
ecedd90f 2601
91a6073e
LP
2602 assert(m);
2603
db785129
LP
2604 state = manager_state(m);
2605
91a6073e
LP
2606 while ((i = m->cgroup_realize_queue)) {
2607 assert(i->in_cgroup_realize_queue);
ecedd90f 2608
2aa57a65
LP
2609 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
2610 /* Maybe things changed, and the unit is not actually active anymore? */
2611 unit_remove_from_cgroup_realize_queue(i);
2612 continue;
2613 }
2614
db785129 2615 r = unit_realize_cgroup_now(i, state);
6414b7c9 2616 if (r < 0)
efdb0237 2617 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
0a1eb06d 2618
4ad49000
LP
2619 n++;
2620 }
ecedd90f 2621
4ad49000 2622 return n;
8e274523
LP
2623}
2624
4c591f39
MK
2625void unit_add_family_to_cgroup_realize_queue(Unit *u) {
2626 assert(u);
2627 assert(u->type == UNIT_SLICE);
ca949c9d 2628
4c591f39
MK
2629 /* The family of a unit is defined as the (immediate) children of the unit and the immediate children of all
2630 * its ancestors.
2631 *
2632 * Ideally we would enqueue ancestor path only (bottom up). However, on cgroup-v1 scheduling becomes
2633 * very weird if two units that own processes reside in the same slice, but one is realized in the
2634 * "cpu" hierarchy and one is not (for example because one has CPUWeight= set and the other does
2635 * not), because that means individual processes need to be scheduled against whole cgroups. Let's
2636 * avoid this asymmetry by always ensuring that siblings of a unit are always realized in their v1
2637 * controller hierarchies too (if the unit requires the controller to be realized).
e1e98911 2638 *
4c591f39
MK
2639 * The function must invalidate cgroup_members_mask of all ancestors in order to calculate up-to-date
2640 * masks. */
2641
2642 do {
9cc54544 2643 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
8f53a7b8 2644
4c591f39 2645 /* Children of u likely changed when we're called */
9cc54544
LP
2646 if (crt)
2647 crt->cgroup_members_mask_valid = false;
f23ba94d 2648
9cc54544 2649 Unit *m;
d219a2b0 2650 UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
8e274523 2651
65f6b6bd 2652 /* No point in doing cgroup application for units without active processes. */
6414b7c9
DS
2653 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
2654 continue;
2655
e1e98911
LP
2656 /* We only enqueue siblings if they were realized once at least, in the main
2657 * hierarchy. */
9cc54544 2658 crt = unit_get_cgroup_runtime(m);
23ac0811 2659 if (!crt || !crt->cgroup_path)
e1e98911
LP
2660 continue;
2661
defe63b0
LP
2662 /* If the unit doesn't need any new controllers and has current ones
2663 * realized, it doesn't need any changes. */
906c06f6
DM
2664 if (unit_has_mask_realized(m,
2665 unit_get_target_mask(m),
17f14955 2666 unit_get_enable_mask(m)))
6414b7c9
DS
2667 continue;
2668
91a6073e 2669 unit_add_to_cgroup_realize_queue(m);
50159e6a
LP
2670 }
2671
4c591f39
MK
2672 /* Parent comes after children */
2673 unit_add_to_cgroup_realize_queue(u);
12f64221
LP
2674
2675 u = UNIT_GET_SLICE(u);
2676 } while (u);
4ad49000
LP
2677}
2678
0a1eb06d 2679int unit_realize_cgroup(Unit *u) {
12f64221
LP
2680 Unit *slice;
2681
4ad49000
LP
2682 assert(u);
2683
35b7ff80 2684 if (!UNIT_HAS_CGROUP_CONTEXT(u))
0a1eb06d 2685 return 0;
8e274523 2686
4c591f39
MK
2687 /* So, here's the deal: when realizing the cgroups for this unit, we need to first create all
2688 * parents, but there's more actually: for the weight-based controllers we also need to make sure
2689 * that all our siblings (i.e. units that are in the same slice as we are) have cgroups, too. On the
2690 * other hand, when a controller is removed from realized set, it may become unnecessary in siblings
2691 * and ancestors and they should be (de)realized too.
2692 *
2693 * This call will defer work on the siblings and derealized ancestors to the next event loop
2694 * iteration and synchronously creates the parent cgroups (unit_realize_cgroup_now). */
ca949c9d 2695
12f64221
LP
2696 slice = UNIT_GET_SLICE(u);
2697 if (slice)
2698 unit_add_family_to_cgroup_realize_queue(slice);
4ad49000 2699
6414b7c9 2700 /* And realize this one now (and apply the values) */
db785129 2701 return unit_realize_cgroup_now(u, manager_state(u->manager));
8e274523
LP
2702}
2703
4918f14a 2704void unit_release_cgroup(Unit *u, bool drop_cgroup_runtime) {
efdb0237
LP
2705 assert(u);
2706
8a0d5388
LP
2707 /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call
2708 * when we close down everything for reexecution, where we really want to leave the cgroup in place. */
efdb0237 2709
9cc54544
LP
2710 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2711 if (!crt)
2712 return;
2713
2714 if (crt->cgroup_path) {
2715 (void) hashmap_remove(u->manager->cgroup_unit, crt->cgroup_path);
2716 crt->cgroup_path = mfree(crt->cgroup_path);
efdb0237
LP
2717 }
2718
9cc54544
LP
2719 if (crt->cgroup_control_inotify_wd >= 0) {
2720 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, crt->cgroup_control_inotify_wd) < 0)
2721 log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", crt->cgroup_control_inotify_wd, u->id);
efdb0237 2722
9cc54544
LP
2723 (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(crt->cgroup_control_inotify_wd));
2724 crt->cgroup_control_inotify_wd = -1;
efdb0237 2725 }
afcfaa69 2726
9cc54544
LP
2727 if (crt->cgroup_memory_inotify_wd >= 0) {
2728 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, crt->cgroup_memory_inotify_wd) < 0)
2729 log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", crt->cgroup_memory_inotify_wd, u->id);
afcfaa69 2730
9cc54544
LP
2731 (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(crt->cgroup_memory_inotify_wd));
2732 crt->cgroup_memory_inotify_wd = -1;
afcfaa69 2733 }
9cc54544 2734
4918f14a
MY
2735 if (drop_cgroup_runtime)
2736 *(CGroupRuntime**) ((uint8_t*) u + UNIT_VTABLE(u)->cgroup_runtime_offset) = cgroup_runtime_free(crt);
9cc54544
LP
2737}
2738
2739int unit_cgroup_is_empty(Unit *u) {
2740 int r;
2741
2742 assert(u);
2743
2744 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2745 if (!crt)
2746 return -ENXIO;
2747 if (!crt->cgroup_path)
2748 return -EOWNERDEAD;
2749
c3f90077 2750 r = cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path);
9cc54544 2751 if (r < 0)
4c1fc52d 2752 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(crt->cgroup_path));
9cc54544 2753 return r;
efdb0237
LP
2754}
2755
4918f14a 2756static bool unit_maybe_release_cgroup(Unit *u) {
e08dabfe
AZ
2757 int r;
2758
4918f14a
MY
2759 /* Releases the cgroup only if it is recursively empty.
2760 * Returns true if the cgroup was released, false otherwise. */
e08dabfe 2761
4918f14a 2762 assert(u);
e08dabfe 2763
9cc54544
LP
2764 /* Don't release the cgroup if there are still processes under it. If we get notified later when all
2765 * the processes exit (e.g. the processes were in D-state and exited after the unit was marked as
2766 * failed) we need the cgroup paths to continue to be tracked by the manager so they can be looked up
2767 * and cleaned up later. */
2768 r = unit_cgroup_is_empty(u);
8153be97 2769 if (r > 0) {
4918f14a
MY
2770 /* Do not free CGroupRuntime when called from unit_prune_cgroup. Various accounting data
2771 * we should keep, especially CPU usage and *_peak ones which would be shown even after
2772 * the unit stops. */
2773 unit_release_cgroup(u, /* drop_cgroup_runtime = */ false);
e08dabfe
AZ
2774 return true;
2775 }
2776
2777 return false;
2778}
2779
51a70c88
LP
2780static int unit_prune_cgroup_via_bus(Unit *u) {
2781 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2782 int r;
2783
2784 assert(u);
2785 assert(u->manager);
2786
2787 if (MANAGER_IS_SYSTEM(u->manager))
2788 return -EINVAL;
2789
2790 if (!u->manager->system_bus)
2791 return -EIO;
2792
2793 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2794 if (!crt || !crt->cgroup_path)
2795 return -EOWNERDEAD;
2796
2797 /* Determine this unit's cgroup path relative to our cgroup root */
48210772
LP
2798 const char *pp = path_startswith_full(
2799 crt->cgroup_path,
2800 u->manager->cgroup_root,
ceed11e4 2801 PATH_STARTSWITH_RETURN_LEADING_SLASH|PATH_STARTSWITH_REFUSE_DOT_DOT);
51a70c88
LP
2802 if (!pp)
2803 return -EINVAL;
2804
51a70c88
LP
2805 r = bus_call_method(u->manager->system_bus,
2806 bus_systemd_mgr,
2807 "RemoveSubgroupFromUnit",
2808 &error, NULL,
2809 "sst",
2810 NULL /* empty unit name means client's unit, i.e. us */,
2811 pp,
2812 (uint64_t) 0);
2813 if (r < 0)
2814 return log_unit_debug_errno(u, r, "Failed to trim cgroup via the bus: %s", bus_error_message(&error, r));
2815
2816 return 0;
2817}
2818
efdb0237 2819void unit_prune_cgroup(Unit *u) {
efdb0237 2820 bool is_root_slice;
4918f14a 2821 int r;
8e274523 2822
4ad49000 2823 assert(u);
8e274523 2824
efdb0237 2825 /* Removes the cgroup, if empty and possible, and stops watching it. */
9cc54544
LP
2826 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2827 if (!crt || !crt->cgroup_path)
4ad49000 2828 return;
8e274523 2829
17bbdefd 2830 /* Cache the last resource usage values before we destroy the cgroup */
ad009380
MY
2831 (void) unit_get_cpu_usage(u, /* ret = */ NULL);
2832
2833 for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++)
2834 (void) unit_get_memory_accounting(u, metric, /* ret = */ NULL);
fe700f46 2835
17bbdefd
IS
2836 /* All IO metrics are read at once from the underlying cgroup, so issue just a single call */
2837 (void) unit_get_io_accounting(u, _CGROUP_IO_ACCOUNTING_METRIC_INVALID, /* ret = */ NULL);
2838
2839 /* We do not cache IP metrics here because the firewall objects are not freed with cgroups */
2840
b1994387 2841#if BPF_FRAMEWORK
352ec23c 2842 (void) bpf_restrict_fs_cleanup(u); /* Remove cgroup from the global LSM BPF map */
b1994387
ILG
2843#endif
2844
49b6babb 2845 unit_modify_nft_set(u, /* add = */ false);
dc7d69b3 2846
efdb0237
LP
2847 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
2848
188286ee 2849 r = cg_trim(crt->cgroup_path, !is_root_slice);
51a70c88
LP
2850 if (r < 0) {
2851 int k = unit_prune_cgroup_via_bus(u);
2852
2853 if (k >= 0)
2854 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s on our own (%m), but worked when talking to PID 1.", empty_to_root(crt->cgroup_path));
2855 else {
2856 /* One reason we could have failed here is that the cgroup still contains a process.
2857 * However, if the cgroup becomes removable at a later time, it might be removed when
2858 * the containing slice is stopped. So even if we failed now, this unit shouldn't
2859 * assume that the cgroup is still realized the next time it is started. Do not
2860 * return early on error, continue cleanup. */
2861 log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r,
2862 "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path));
2863 }
2864 }
8e274523 2865
efdb0237
LP
2866 if (is_root_slice)
2867 return;
2868
e08dabfe
AZ
2869 if (!unit_maybe_release_cgroup(u)) /* Returns true if the cgroup was released */
2870 return;
0a1eb06d 2871
4918f14a
MY
2872 assert(crt == unit_get_cgroup_runtime(u));
2873 assert(!crt->cgroup_path);
9cc54544 2874
9cc54544
LP
2875 crt->cgroup_realized_mask = 0;
2876 crt->cgroup_enabled_mask = 0;
084c7007 2877
9cc54544 2878 crt->bpf_device_control_installed = bpf_program_free(crt->bpf_device_control_installed);
8e274523
LP
2879}
2880
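/* Scans the unit's cgroup for a single process that is our immediate child, and hence a plausible
 * main PID; if more than one daemonized candidate is found we cannot decide and return -ENODATA. */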
495e75ed
LP
2881int unit_search_main_pid(Unit *u, PidRef *ret) {
2882 _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
4ad49000 2883 _cleanup_fclose_ FILE *f = NULL;
efdb0237 2884 int r;
4ad49000
LP
2885
2886 assert(u);
efdb0237 2887 assert(ret);
4ad49000 2888
9cc54544
LP
2889 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2890 if (!crt || !crt->cgroup_path)
efdb0237 2891 return -ENXIO;
4ad49000 2892
9cc54544 2893 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, &f);
efdb0237
LP
2894 if (r < 0)
2895 return r;
4ad49000 2896
495e75ed
LP
2897 for (;;) {
2898 _cleanup_(pidref_done) PidRef npidref = PIDREF_NULL;
4ad49000 2899
00f17143
TR
2900 /* cg_read_pidref() will return an error on unmapped PIDs.
2901 * We can't reasonably deal with units that contain those. */
2902 r = cg_read_pidref(f, &npidref, CGROUP_DONT_SKIP_UNMAPPED);
495e75ed
LP
2903 if (r < 0)
2904 return r;
2905 if (r == 0)
2906 break;
8e274523 2907
495e75ed 2908 if (pidref_equal(&pidref, &npidref)) /* seen already, cgroupfs reports duplicates! */
4ad49000 2909 continue;
8e274523 2910
6774be42 2911 if (pidref_is_my_child(&npidref) <= 0) /* ignore processes further down the tree */
495e75ed 2912 continue;
efdb0237 2913
495e75ed
LP
2914 if (pidref_is_set(&pidref) != 0)
2915 /* Dang, there's more than one daemonized PID in this group, so we don't know what
2916 * process is the main process. */
efdb0237 2917 return -ENODATA;
8e274523 2918
495e75ed 2919 pidref = TAKE_PIDREF(npidref);
8e274523
LP
2920 }
2921
495e75ed
LP
2922 if (!pidref_is_set(&pidref))
2923 return -ENODATA;
2924
2925 *ret = TAKE_PIDREF(pidref);
efdb0237
LP
2926 return 0;
2927}
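
The heuristic only succeeds if exactly one process in the cgroup is a direct child of the manager. The same idea as a standalone sketch, assuming cgroup v2 and that the caller is the parent of interest (helper names are hypothetical):

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/* Return the parent PID of 'pid', or -1 on failure, via /proc/<pid>/status. */
static pid_t ppid_of(pid_t pid) {
        char path[64], line[256];
        int tmp;
        pid_t p = -1;

        snprintf(path, sizeof path, "/proc/%d/status", (int) pid);
        FILE *f = fopen(path, "re");
        if (!f)
                return -1;
        while (fgets(line, sizeof line, f))
                if (sscanf(line, "PPid: %d", &tmp) == 1) {
                        p = tmp;
                        break;
                }
        fclose(f);
        return p;
}

/* Scan a cgroup.procs file; return the single direct child, 0 if none, -1 if ambiguous. */
static pid_t guess_main_pid(const char *procs_path) {
        pid_t main_pid = 0;
        int pid;

        FILE *f = fopen(procs_path, "re");
        if (!f)
                return -1;
        while (fscanf(f, "%d", &pid) == 1) {
                if (ppid_of(pid) != getpid())
                        continue;                  /* ignore processes further down the tree */
                if (main_pid > 0 && main_pid != pid) {
                        fclose(f);
                        return -1;                 /* more than one daemonized child: ambiguous */
                }
                main_pid = pid;
        }
        fclose(f);
        return main_pid;
}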
2928
09e24654 2929static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
99534007 2930 Manager *m = ASSERT_PTR(userdata);
09e24654 2931 Unit *u;
efdb0237
LP
2932 int r;
2933
09e24654 2934 assert(s);
efdb0237 2935
09e24654
LP
2936 u = m->cgroup_empty_queue;
2937 if (!u)
efdb0237
LP
2938 return 0;
2939
09e24654
LP
2940 assert(u->in_cgroup_empty_queue);
2941 u->in_cgroup_empty_queue = false;
2942 LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
2943
2944 if (m->cgroup_empty_queue) {
2945 /* More stuff queued, let's make sure we remain enabled */
2946 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
2947 if (r < 0)
19a691a9 2948 log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m");
09e24654 2949 }
efdb0237 2950
f7829525
NK
2951 /* Update state based on OOM kills before we notify about cgroup empty event */
2952 (void) unit_check_oom(u);
2953 (void) unit_check_oomd_kill(u);
2954
efdb0237
LP
2955 unit_add_to_gc_queue(u);
2956
067c91fc 2957 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
380dd177
RP
2958 unit_prune_cgroup(u);
2959 else if (UNIT_VTABLE(u)->notify_cgroup_empty)
efdb0237
LP
2960 UNIT_VTABLE(u)->notify_cgroup_empty(u);
2961
2962 return 0;
2963}
2964
55e4df21 2965static void unit_add_to_cgroup_empty_queue(Unit *u) {
09e24654
LP
2966 int r;
2967
2968 assert(u);
2969
55e4df21
MY
2970 /* Note that cgroup empty events are dispatched in a separate queue with a lower priority than
2971 * the SIGCHLD handler, so that we always use SIGCHLD if we can get it first, and only use
2972 * the cgroup empty notifications if there's no SIGCHLD pending (which might happen if the cgroup
2973 * doesn't contain processes that are our own child, which is typically the case for scope units). */
09e24654
LP
2974
2975 if (u->in_cgroup_empty_queue)
2976 return;
2977
09e24654
LP
2978 LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
2979 u->in_cgroup_empty_queue = true;
2980
2981 /* Trigger the defer event */
2982 r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
2983 if (r < 0)
2984 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
2985}
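
The priority trick the comment describes reduces to a small sd-event sketch: among sources pending in the same event-loop iteration, the numerically lower priority dispatches first (source names here are illustrative):

#include <systemd/sd-event.h>

static int handler(sd_event_source *s, void *userdata) {
        return 0;
}

/* Of two sources pending in the same iteration, the one with the
 * numerically lower priority runs first, so SIGCHLD-style handling
 * reliably wins over cgroup-empty handling. */
static int setup(sd_event *e) {
        sd_event_source *sigchld_like = NULL, *cgroup_empty_like = NULL;
        int r;

        r = sd_event_add_defer(e, &sigchld_like, handler, NULL);
        if (r < 0)
                return r;
        r = sd_event_add_defer(e, &cgroup_empty_like, handler, NULL);
        if (r < 0)
                return r;

        (void) sd_event_source_set_priority(sigchld_like, SD_EVENT_PRIORITY_NORMAL - 10);
        (void) sd_event_source_set_priority(cgroup_empty_like, SD_EVENT_PRIORITY_NORMAL + 10);
        return 0;
}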
2986
d9e45bc3
MS
2987static void unit_remove_from_cgroup_empty_queue(Unit *u) {
2988 assert(u);
2989
2990 if (!u->in_cgroup_empty_queue)
2991 return;
2992
2993 LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
2994 u->in_cgroup_empty_queue = false;
2995}
2996
fe8d22fb
AZ
2997int unit_check_oomd_kill(Unit *u) {
2998 _cleanup_free_ char *value = NULL;
2999 bool increased;
3000 uint64_t n = 0;
3001 int r;
3002
9cc54544
LP
3003 assert(u);
3004
3005 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3006 if (!crt || !crt->cgroup_path)
fe8d22fb
AZ
3007 return 0;
3008
7b4022bd 3009 r = cg_get_xattr(crt->cgroup_path, "user.oomd_ooms", &value, /* ret_size= */ NULL);
00675c36 3010 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
fe8d22fb
AZ
3011 return r;
3012
3013 if (!isempty(value)) {
3014 r = safe_atou64(value, &n);
3015 if (r < 0)
3016 return r;
3017 }
3018
9cc54544
LP
3019 increased = n > crt->managed_oom_kill_last;
3020 crt->managed_oom_kill_last = n;
fe8d22fb
AZ
3021
3022 if (!increased)
3023 return 0;
3024
38c41427
NK
3025 n = 0;
3026 value = mfree(value);
7b4022bd 3027 r = cg_get_xattr(crt->cgroup_path, "user.oomd_kill", &value, /* ret_size= */ NULL);
38c41427
NK
3028 if (r >= 0 && !isempty(value))
3029 (void) safe_atou64(value, &n);
3030
fe8d22fb 3031 if (n > 0)
c2503e35 3032 log_unit_struct(u, LOG_NOTICE,
3cf6a3a3 3033 LOG_MESSAGE_ID(SD_MESSAGE_UNIT_OOMD_KILL_STR),
c2503e35 3034 LOG_UNIT_INVOCATION_ID(u),
38c41427 3035 LOG_UNIT_MESSAGE(u, "systemd-oomd killed %"PRIu64" process(es) in this unit.", n),
3cf6a3a3 3036 LOG_ITEM("N_PROCESSES=%" PRIu64, n));
38c41427
NK
3037 else
3038 log_unit_struct(u, LOG_NOTICE,
3cf6a3a3 3039 LOG_MESSAGE_ID(SD_MESSAGE_UNIT_OOMD_KILL_STR),
38c41427
NK
3040 LOG_UNIT_INVOCATION_ID(u),
3041 LOG_UNIT_MESSAGE(u, "systemd-oomd killed some process(es) in this unit."));
3042
396b3a1e 3043 unit_notify_cgroup_oom(u, /* managed_oom= */ true);
fe8d22fb
AZ
3044
3045 return 1;
3046}
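
The user.oomd_ooms / user.oomd_kill counters are plain user xattrs that systemd-oomd leaves on the cgroup directory, so they can also be read directly with getxattr(2); a minimal sketch with a hypothetical unit path:

#include <stdio.h>
#include <sys/xattr.h>

int main(void) {
        char buf[64];
        ssize_t n;

        /* user.oomd_kill holds the number of processes systemd-oomd killed
         * in this cgroup; ENODATA simply means it never killed any. */
        n = getxattr("/sys/fs/cgroup/system.slice/example.service",
                     "user.oomd_kill", buf, sizeof buf - 1);
        if (n < 0) {
                perror("getxattr");
                return 1;
        }
        buf[n] = 0;
        printf("oomd kills: %s\n", buf);
        return 0;
}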
3047
2ba6ae6b 3048int unit_check_oom(Unit *u) {
afcfaa69
LP
3049 _cleanup_free_ char *oom_kill = NULL;
3050 bool increased;
3051 uint64_t c;
3052 int r;
3053
9cc54544
LP
3054 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3055 if (!crt || !crt->cgroup_path)
afcfaa69
LP
3056 return 0;
3057
9cc54544
LP
3058 r = cg_get_keyed_attribute(
3059 "memory",
3060 crt->cgroup_path,
3061 "memory.events",
3062 STRV_MAKE("oom_kill"),
3063 &oom_kill);
fc594dee
LP
3064 if (IN_SET(r, -ENOENT, -ENXIO)) /* Handle gracefully if cgroup or oom_kill attribute don't exist */
3065 c = 0;
3066 else if (r < 0)
afcfaa69 3067 return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m");
fc594dee
LP
3068 else {
3069 r = safe_atou64(oom_kill, &c);
3070 if (r < 0)
3071 return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m");
3072 }
afcfaa69 3073
9cc54544
LP
3074 increased = c > crt->oom_kill_last;
3075 crt->oom_kill_last = c;
afcfaa69
LP
3076
3077 if (!increased)
3078 return 0;
3079
c2503e35 3080 log_unit_struct(u, LOG_NOTICE,
3cf6a3a3 3081 LOG_MESSAGE_ID(SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR),
c2503e35
RH
3082 LOG_UNIT_INVOCATION_ID(u),
3083 LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer."));
afcfaa69 3084
396b3a1e 3085 unit_notify_cgroup_oom(u, /* managed_oom= */ false);
afcfaa69
LP
3086
3087 return 1;
3088}
3089
3090static int on_cgroup_oom_event(sd_event_source *s, void *userdata) {
99534007 3091 Manager *m = ASSERT_PTR(userdata);
afcfaa69
LP
3092 Unit *u;
3093 int r;
3094
3095 assert(s);
afcfaa69
LP
3096
3097 u = m->cgroup_oom_queue;
3098 if (!u)
3099 return 0;
3100
3101 assert(u->in_cgroup_oom_queue);
3102 u->in_cgroup_oom_queue = false;
3103 LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, u);
3104
3105 if (m->cgroup_oom_queue) {
3106 /* More stuff queued, let's make sure we remain enabled */
3107 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
3108 if (r < 0)
3109 log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m");
3110 }
3111
3112 (void) unit_check_oom(u);
935f8042
LP
3113 unit_add_to_gc_queue(u);
3114
afcfaa69
LP
3115 return 0;
3116}
3117
3118static void unit_add_to_cgroup_oom_queue(Unit *u) {
3119 int r;
3120
3121 assert(u);
3122
3123 if (u->in_cgroup_oom_queue)
3124 return;
9cc54544
LP
3125
3126 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3127 if (!crt || !crt->cgroup_path)
afcfaa69
LP
3128 return;
3129
3130 LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
3131 u->in_cgroup_oom_queue = true;
3132
3133 /* Trigger the defer event */
3134 if (!u->manager->cgroup_oom_event_source) {
3135 _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
3136
3137 r = sd_event_add_defer(u->manager->event, &s, on_cgroup_oom_event, u->manager);
3138 if (r < 0) {
3139 log_error_errno(r, "Failed to create cgroup oom event source: %m");
3140 return;
3141 }
3142
d42b61d2 3143 r = sd_event_source_set_priority(s, EVENT_PRIORITY_CGROUP_OOM);
afcfaa69
LP
3144 if (r < 0) {
3145 log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
3146 return;
3147 }
3148
3149 (void) sd_event_source_set_description(s, "cgroup-oom");
3150 u->manager->cgroup_oom_event_source = TAKE_PTR(s);
3151 }
3152
3153 r = sd_event_source_set_enabled(u->manager->cgroup_oom_event_source, SD_EVENT_ONESHOT);
3154 if (r < 0)
3155 log_error_errno(r, "Failed to enable cgroup oom event source: %m");
3156}
3157
d9e45bc3
MS
3158static int unit_check_cgroup_events(Unit *u) {
3159 char *values[2] = {};
3160 int r;
3161
3162 assert(u);
3163
9cc54544
LP
3164 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3165 if (!crt || !crt->cgroup_path)
869f52f2
DS
3166 return 0;
3167
88f4bf8a 3168 r = cg_get_keyed_attribute(
9cc54544
LP
3169 SYSTEMD_CGROUP_CONTROLLER,
3170 crt->cgroup_path,
3171 "cgroup.events",
3172 STRV_MAKE("populated", "frozen"),
3173 values);
d9e45bc3
MS
3174 if (r < 0)
3175 return r;
3176
3177 /* The cgroup.events notifications can be merged together, so act as if we saw the given state for the
3178 * first time. The functions we call to handle a given state are idempotent, which makes them
3179 * effectively remember the previous state. */
88f4bf8a
MY
3180 if (streq(values[0], "1"))
3181 unit_remove_from_cgroup_empty_queue(u);
3182 else
3183 unit_add_to_cgroup_empty_queue(u);
d9e45bc3 3184
16b6af6a
AV
3185 /* Disregard freezer state changes due to operations not initiated by us.
3186 * See: https://github.com/systemd/systemd/pull/13512/files#r416469963 and
3187 * https://github.com/systemd/systemd/pull/13512#issuecomment-573007207 */
88f4bf8a 3188 if (IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_FREEZING_BY_PARENT, FREEZER_THAWING))
e1ac5259 3189 unit_freezer_complete(u, streq(values[1], "0") ? FREEZER_RUNNING : FREEZER_FROZEN);
d9e45bc3 3190
88f4bf8a 3191 free_many_charp(values, ELEMENTSOF(values));
d9e45bc3
MS
3192 return 0;
3193}
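
cgroup.events is a small keyed file, typically:

        populated 1
        frozen 0

A self-contained sketch of reading both keys, which is what the generic cg_get_keyed_attribute() call does above (the path is supplied by the caller):

#include <stdio.h>

static int read_cgroup_events(const char *path, int *populated, int *frozen) {
        char line[64];
        FILE *f = fopen(path, "re");
        if (!f)
                return -1;
        *populated = *frozen = -1;
        while (fgets(line, sizeof line, f)) {
                int v;
                if (sscanf(line, "populated %d", &v) == 1)
                        *populated = v;
                else if (sscanf(line, "frozen %d", &v) == 1)
                        *frozen = v;
        }
        fclose(f);
        return 0;
}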
3194
efdb0237 3195static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
99534007 3196 Manager *m = ASSERT_PTR(userdata);
efdb0237
LP
3197
3198 assert(s);
3199 assert(fd >= 0);
efdb0237
LP
3200
3201 for (;;) {
3202 union inotify_event_buffer buffer;
efdb0237
LP
3203 ssize_t l;
3204
3205 l = read(fd, &buffer, sizeof(buffer));
3206 if (l < 0) {
8add30a0 3207 if (ERRNO_IS_TRANSIENT(errno))
efdb0237
LP
3208 return 0;
3209
3210 return log_error_errno(errno, "Failed to read control group inotify events: %m");
3211 }
3212
00adc340 3213 FOREACH_INOTIFY_EVENT_WARN(e, buffer, l) {
efdb0237
LP
3214 Unit *u;
3215
3216 if (e->wd < 0)
3217 /* Queue overflow has no watch descriptor */
3218 continue;
3219
3220 if (e->mask & IN_IGNORED)
3221 /* The watch was just removed */
3222 continue;
3223
afcfaa69
LP
3224 /* Note that inotify might deliver events for a watch even after it was removed,
3225 * because it was queued before the removal. Let's ignore this here safely. */
3226
0bb814c2 3227 u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
afcfaa69 3228 if (u)
d9e45bc3 3229 unit_check_cgroup_events(u);
efdb0237 3230
afcfaa69
LP
3231 u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd));
3232 if (u)
3233 unit_add_to_cgroup_oom_queue(u);
efdb0237
LP
3234 }
3235 }
8e274523
LP
3236}
3237
17f14955
RG
3238static int cg_bpf_mask_supported(CGroupMask *ret) {
3239 CGroupMask mask = 0;
3240 int r;
3241
ad446c8c 3242 /* BPF-based firewall, device access control, and pinned foreign prog */
ec3c5cfa
YW
3243 if (bpf_program_supported() > 0)
3244 mask |= CGROUP_MASK_BPF_FIREWALL |
ad446c8c 3245 CGROUP_MASK_BPF_DEVICES |
ec3c5cfa 3246 CGROUP_MASK_BPF_FOREIGN;
17f14955 3247
a8e5eb17 3248 /* BPF-based bind{4|6} hooks */
cd09a5f3 3249 r = bpf_socket_bind_supported();
ad13559e
YW
3250 if (r < 0)
3251 return r;
a8e5eb17
JK
3252 if (r > 0)
3253 mask |= CGROUP_MASK_BPF_SOCKET_BIND;
3254
6f50d4f7 3255 /* BPF-based cgroup_skb/{egress|ingress} hooks */
62e22490 3256 r = bpf_restrict_ifaces_supported();
ad13559e
YW
3257 if (r < 0)
3258 return r;
6f50d4f7
MV
3259 if (r > 0)
3260 mask |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES;
3261
17f14955
RG
3262 *ret = mask;
3263 return 0;
3264}
3265
8e274523 3266int manager_setup_cgroup(Manager *m) {
77a0d2e3 3267 int r;
8e274523
LP
3268
3269 assert(m);
3270
35d2e7ec 3271 /* 1. Determine hierarchy */
efdb0237 3272 m->cgroup_root = mfree(m->cgroup_root);
9444b1f2 3273 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
23bbb0de
MS
3274 if (r < 0)
3275 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
8e274523 3276
efdb0237 3277 /* Chop off the init scope, if we are already located in it */
77a0d2e3 3278 char *e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
efdb0237
LP
3279 if (e)
3280 *e = 0;
7ccfb64a 3281
7546145e
LP
3282 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
3283 * easily prepend it everywhere. */
3284 delete_trailing_chars(m->cgroup_root, "/");
8e274523 3285
77a0d2e3
MY
3286 /* 2. Pin the cgroupfs mount, so that it cannot be unmounted */
3287 safe_close(m->pin_cgroupfs_fd);
3288 m->pin_cgroupfs_fd = open("/sys/fs/cgroup", O_PATH|O_CLOEXEC|O_DIRECTORY);
3289 if (m->pin_cgroupfs_fd < 0)
3290 return log_error_errno(errno, "Failed to pin cgroup hierarchy: %m");
efdb0237 3291
09e24654 3292 /* 3. Allocate cgroup empty defer event source */
5dcadb4c 3293 m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
09e24654
LP
3294 r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
3295 if (r < 0)
3296 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
3297
cbe83389
LP
3298 /* Schedule cgroup empty checks early, but after having processed service notification messages or
3299 * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
3300 * notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */
d42b61d2 3301 r = sd_event_source_set_priority(m->cgroup_empty_event_source, EVENT_PRIORITY_CGROUP_EMPTY);
09e24654
LP
3302 if (r < 0)
3303 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
3304
3305 r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
3306 if (r < 0)
3307 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
3308
3309 (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
3310
be1d96db
MY
3311 /* 4. Install cgroup empty event notifier inotify object */
3312 m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
3313 safe_close(m->cgroup_inotify_fd);
efdb0237 3314
be1d96db
MY
3315 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
3316 if (m->cgroup_inotify_fd < 0)
3317 return log_error_errno(errno, "Failed to create control group inotify object: %m");
efdb0237 3318
be1d96db
MY
3319 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
3320 if (r < 0)
3321 return log_error_errno(r, "Failed to watch control group inotify object: %m");
efdb0237 3322
be1d96db
MY
3323 /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
3324 * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
3325 * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
3326 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, EVENT_PRIORITY_CGROUP_INOTIFY);
3327 if (r < 0)
3328 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
8e274523 3329
be1d96db 3330 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
efdb0237 3331
09e24654 3332 /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
77a0d2e3 3333 const char *scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
188286ee 3334 r = cg_create_and_attach(scope_path, /* pid = */ 0);
aa77e234
MS
3335 if (r >= 0) {
3336 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
188286ee 3337 r = cg_migrate(m->cgroup_root, scope_path, 0);
aa77e234
MS
3338 if (r < 0)
3339 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
c6c18be3 3340
638cece4 3341 } else if (!MANAGER_IS_TEST_RUN(m))
aa77e234 3342 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
10bd3e2e 3343
77a0d2e3 3344 /* 6. Figure out which controllers are supported */
0fa7b500 3345 r = cg_mask_supported_subtree(m->cgroup_root, &m->cgroup_supported);
efdb0237
LP
3346 if (r < 0)
3347 return log_error_errno(r, "Failed to determine supported controllers: %m");
17f14955 3348
77a0d2e3
MY
3349 /* 7. Figure out which bpf-based pseudo-controllers are supported */
3350 CGroupMask mask;
17f14955
RG
3351 r = cg_bpf_mask_supported(&mask);
3352 if (r < 0)
3353 return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m");
3354 m->cgroup_supported |= mask;
3355
77a0d2e3 3356 /* 8. Log which controllers are supported */
e8616626
ZJS
3357 for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
3358 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c),
3359 yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
9156e799 3360
a32360f1 3361 return 0;
8e274523
LP
3362}
3363
c6c18be3 3364void manager_shutdown_cgroup(Manager *m, bool delete) {
8e274523
LP
3365 assert(m);
3366
9444b1f2
LP
3367 /* We can't really delete the group, since we are in it. But
3368 * let's trim it. */
5dd2f5ff 3369 if (delete && m->cgroup_root && !FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL))
188286ee 3370 (void) cg_trim(m->cgroup_root, false);
efdb0237 3371
5dcadb4c 3372 m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
09e24654 3373
0bb814c2 3374 m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit);
afcfaa69 3375 m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit);
efdb0237 3376
5dcadb4c 3377 m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
efdb0237 3378 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
8e274523 3379
03e334a1 3380 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
c6c18be3 3381
efdb0237 3382 m->cgroup_root = mfree(m->cgroup_root);
8e274523
LP
3383}
3384
4ad49000 3385Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
acb14d31 3386 char *p;
4ad49000 3387 Unit *u;
acb14d31
LP
3388
3389 assert(m);
3390 assert(cgroup);
acb14d31 3391
4ad49000
LP
3392 u = hashmap_get(m->cgroup_unit, cgroup);
3393 if (u)
3394 return u;
acb14d31 3395
2f82562b 3396 p = strdupa_safe(cgroup);
acb14d31
LP
3397 for (;;) {
3398 char *e;
3399
3400 e = strrchr(p, '/');
efdb0237 3401 if (!e || e == p)
50f2ee45 3402 return NULL; /* reached cgroup root? return NULL and possibly fall back to manager_get_unit_by_pidref_watching() */
acb14d31
LP
3403
3404 *e = 0;
3405
4ad49000
LP
3406 u = hashmap_get(m->cgroup_unit, p);
3407 if (u)
3408 return u;
acb14d31
LP
3409 }
3410}
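
The walk above resolves a process deep inside a unit's delegated subtree back to the owning unit by repeatedly chopping the last path component. Isolated into a sketch (the lookup callback stands in for the m->cgroup_unit hashmap):

#include <stdbool.h>
#include <string.h>

typedef bool (*lookup_cb)(const char *path, void *userdata);

/* Truncates a mutable copy of the cgroup path component by component
 * until 'lookup' matches; returns false once the root is reached. */
static bool resolve_by_prefix(char *p, lookup_cb lookup, void *userdata) {
        for (;;) {
                if (lookup(p, userdata))
                        return true;

                char *e = strrchr(p, '/');
                if (!e || e == p)
                        return false; /* reached the cgroup root */
                *e = 0;
        }
}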
3411
50f2ee45 3412Unit* manager_get_unit_by_pidref_cgroup(Manager *m, const PidRef *pid) {
4ad49000 3413 _cleanup_free_ char *cgroup = NULL;
8e274523 3414
8c47c732
LP
3415 assert(m);
3416
a9062242 3417 if (cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
b3ac818b
LP
3418 return NULL;
3419
3420 return manager_get_unit_by_cgroup(m, cgroup);
3421}
3422
eb355466 3423Unit* manager_get_unit_by_pidref_watching(Manager *m, const PidRef *pid) {
62a76913 3424 Unit *u, **array;
b3ac818b
LP
3425
3426 assert(m);
3427
495e75ed
LP
3428 if (!pidref_is_set(pid))
3429 return NULL;
62a76913 3430
495e75ed
LP
3431 u = hashmap_get(m->watch_pids, pid);
3432 if (u)
3433 return u;
3434
3435 array = hashmap_get(m->watch_pids_more, pid);
3436 if (array)
3437 return array[0];
3438
3439 return NULL;
3440}
3441
4ace93da 3442Unit* manager_get_unit_by_pidref(Manager *m, PidRef *pid) {
495e75ed
LP
3443 Unit *u;
3444
3445 assert(m);
3446
3447 /* Note that a process might be owned by multiple units, we return only one here, which is good
3448 * enough for most cases, though not strictly correct. We prefer the one reported by cgroup
3449 * membership, as that's the most relevant one as children of the process will be assigned to that
3450 * one, too, before all else. */
3451
3452 if (!pidref_is_set(pid))
8c47c732
LP
3453 return NULL;
3454
a7a87769 3455 if (pidref_is_self(pid))
efdb0237 3456 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
495e75ed
LP
3457 if (pid->pid == 1)
3458 return NULL;
efdb0237 3459
495e75ed 3460 u = manager_get_unit_by_pidref_cgroup(m, pid);
5fe8876b
LP
3461 if (u)
3462 return u;
3463
495e75ed 3464 u = manager_get_unit_by_pidref_watching(m, pid);
5fe8876b
LP
3465 if (u)
3466 return u;
3467
62a76913 3468 return NULL;
6dde1f33 3469}
4fbf50b3 3470
93ff34e4 3471int unit_get_memory_available(Unit *u, uint64_t *ret) {
8db929a1 3472 uint64_t available = UINT64_MAX, current = 0;
93ff34e4
LB
3473
3474 assert(u);
3475 assert(ret);
3476
3477 /* If data from cgroups can be accessed, try to find out how much more memory a unit can
3478 * claim before hitting the configured cgroup limits (if any). Consider both MemoryHigh
3479 * and MemoryMax, and also any slice the unit might be nested below. */
3480
727cea76 3481 do {
8db929a1 3482 uint64_t unit_available, unit_limit = UINT64_MAX;
727cea76 3483 CGroupContext *unit_context;
93ff34e4
LB
3484
3485 /* No point in continuing if we can't go any lower */
3486 if (available == 0)
3487 break;
3488
727cea76
MK
3489 unit_context = unit_get_cgroup_context(u);
3490 if (!unit_context)
3491 return -ENODATA;
93ff34e4 3492
702aa339 3493 (void) unit_get_memory_accounting(u, CGROUP_MEMORY_CURRENT, &current);
8db929a1
MK
3494 /* On error, 'current' keeps its previous value (initially 0), which then acts as a lower bound. */
3495
727cea76
MK
3496 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
3497 unit_limit = physical_memory();
3498 else if (unit_context->memory_max == UINT64_MAX && unit_context->memory_high == UINT64_MAX)
93ff34e4 3499 continue;
727cea76 3500 unit_limit = MIN3(unit_limit, unit_context->memory_max, unit_context->memory_high);
93ff34e4 3501
8db929a1 3502 unit_available = LESS_BY(unit_limit, current);
727cea76
MK
3503 available = MIN(unit_available, available);
3504 } while ((u = UNIT_GET_SLICE(u)));
93ff34e4
LB
3505
3506 *ret = available;
3507
3508 return 0;
3509}
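
As a worked example: a service with MemoryMax=1G currently using 600M, inside a slice with MemoryMax=2G whose members use 1.5G in total, yields min(1G - 600M, 2G - 1.5G) = min(400M, 500M) = 400M; the unit's own limit is the tighter bound here.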
3510
9824ab1f
MY
3511int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uint64_t *ret) {
3512
3513 static const char* const attributes_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX] = {
702aa339 3514 [CGROUP_MEMORY_CURRENT] = "memory.current",
9824ab1f
MY
3515 [CGROUP_MEMORY_PEAK] = "memory.peak",
3516 [CGROUP_MEMORY_SWAP_CURRENT] = "memory.swap.current",
3517 [CGROUP_MEMORY_SWAP_PEAK] = "memory.swap.peak",
3518 [CGROUP_MEMORY_ZSWAP_CURRENT] = "memory.zswap.current",
3519 };
3520
3521 uint64_t bytes;
f17b07f4 3522 bool updated = false;
6c71db76
FS
3523 int r;
3524
3525 assert(u);
9824ab1f
MY
3526 assert(metric >= 0);
3527 assert(metric < _CGROUP_MEMORY_ACCOUNTING_METRIC_MAX);
6c71db76 3528
37533c94
FS
3529 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
3530 return -ENODATA;
3531
d3d03539 3532 /* The root cgroup doesn't expose this information. */
702aa339
MY
3533 if (unit_has_host_root_cgroup(u)) {
3534 /* System-wide memory usage can be acquired from /proc/ */
3535 if (metric == CGROUP_MEMORY_CURRENT)
3536 return procfs_memory_get_used(ret);
3537
d3d03539 3538 return -ENODATA;
702aa339 3539 }
d3d03539 3540
9cc54544
LP
3541 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3542 if (!crt)
3543 return -ENODATA;
3544 if (!crt->cgroup_path)
f17b07f4 3545 /* If the cgroup is already gone, we try to find the last cached value. */
a8aed6a9 3546 goto finish;
6c71db76 3547
9cc54544 3548 if (!FLAGS_SET(crt->cgroup_realized_mask, CGROUP_MASK_MEMORY))
6c71db76
FS
3549 return -ENODATA;
3550
9cc54544 3551 r = cg_get_attribute_as_uint64("memory", crt->cgroup_path, attributes_table[metric], &bytes);
f17b07f4 3552 if (r < 0 && r != -ENODATA)
9824ab1f 3553 return r;
f17b07f4 3554 updated = r >= 0;
6c71db76 3555
a8aed6a9
MY
3556finish:
3557 if (metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST) {
9cc54544 3558 uint64_t *last = &crt->memory_accounting_last[metric];
6c71db76 3559
a8aed6a9
MY
3560 if (updated)
3561 *last = bytes;
3562 else if (*last != UINT64_MAX)
3563 bytes = *last;
3564 else
3565 return -ENODATA;
f17b07f4 3566
a8aed6a9 3567 } else if (!updated)
f17b07f4 3568 return -ENODATA;
6c71db76 3569
6c71db76
FS
3570 if (ret)
3571 *ret = bytes;
3572
3573 return 0;
3574}
3575
03a7b521 3576int unit_get_tasks_current(Unit *u, uint64_t *ret) {
03a7b521
LP
3577 assert(u);
3578 assert(ret);
3579
2e4025c0 3580 if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
cf3b4be1
LP
3581 return -ENODATA;
3582
9cc54544
LP
3583 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3584 if (!crt || !crt->cgroup_path)
03a7b521
LP
3585 return -ENODATA;
3586
c36a69f4 3587 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
611c4f8a 3588 if (unit_has_host_root_cgroup(u))
c36a69f4
LP
3589 return procfs_tasks_get_current(ret);
3590
9cc54544 3591 if ((crt->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
1f73aa00
LP
3592 return -ENODATA;
3593
9cc54544 3594 return cg_get_attribute_as_uint64("pids", crt->cgroup_path, "pids.current", ret);
03a7b521
LP
3595}
3596
bc347edf 3597static int unit_get_cpu_usage_raw(const Unit *u, const CGroupRuntime *crt, nsec_t *ret) {
5ad096b3
LP
3598 int r;
3599
3600 assert(u);
bc347edf 3601 assert(crt);
5ad096b3
LP
3602 assert(ret);
3603
bc347edf 3604 if (!crt->cgroup_path)
5ad096b3
LP
3605 return -ENODATA;
3606
1f73aa00 3607 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
611c4f8a 3608 if (unit_has_host_root_cgroup(u))
1f73aa00
LP
3609 return procfs_cpu_get_usage(ret);
3610
bc347edf
MY
3611 _cleanup_free_ char *val = NULL;
3612 uint64_t us;
66ebf6c0 3613
bc347edf 3614 r = cg_get_keyed_attribute("cpu", crt->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
bc347edf
MY
3615 if (r < 0)
3616 return r;
66ebf6c0 3617
bc347edf
MY
3618 r = safe_atou64(val, &us);
3619 if (r < 0)
3620 return r;
3621
3622 *ret = us * NSEC_PER_USEC;
5ad096b3 3623
5ad096b3
LP
3624 return 0;
3625}
3626
3627int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
3628 nsec_t ns;
3629 int r;
3630
fe700f46
LP
3631 assert(u);
3632
3633 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
3634 * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
3635 * call this function with a NULL return value. */
3636
3849d1f5
MY
3637 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3638 if (!crt)
cf3b4be1
LP
3639 return -ENODATA;
3640
bc347edf 3641 r = unit_get_cpu_usage_raw(u, crt, &ns);
9cc54544 3642 if (r == -ENODATA && crt->cpu_usage_last != NSEC_INFINITY) {
fe700f46
LP
3643 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
3644 * cached value. */
3645
3646 if (ret)
9cc54544 3647 *ret = crt->cpu_usage_last;
fe700f46
LP
3648 return 0;
3649 }
5ad096b3
LP
3650 if (r < 0)
3651 return r;
3652
9cc54544
LP
3653 if (ns > crt->cpu_usage_base)
3654 ns -= crt->cpu_usage_base;
5ad096b3
LP
3655 else
3656 ns = 0;
3657
9cc54544 3658 crt->cpu_usage_last = ns;
fe700f46
LP
3659 if (ret)
3660 *ret = ns;
3661
5ad096b3
LP
3662 return 0;
3663}
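
cpu_usage_base is the raw counter sampled when accounting was last reset (typically at unit start). So if cpu.stat later reports usage_usec=7500000 (7.5s) and the base was 2500000 (2.5s), the function returns 5s scaled to nanoseconds, and caches it in cpu_usage_last so the answer survives removal of the cgroup.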
3664
906c06f6
DM
3665int unit_get_ip_accounting(
3666 Unit *u,
3667 CGroupIPAccountingMetric metric,
3668 uint64_t *ret) {
3669
6b659ed8 3670 uint64_t value;
906c06f6
DM
3671 int fd, r;
3672
3673 assert(u);
3674 assert(metric >= 0);
3675 assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
3676 assert(ret);
3677
2e4025c0 3678 if (!UNIT_CGROUP_BOOL(u, ip_accounting))
cf3b4be1
LP
3679 return -ENODATA;
3680
9cc54544 3681 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3849d1f5 3682 if (!crt)
9cc54544
LP
3683 return -ENODATA;
3684
906c06f6 3685 fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
9cc54544
LP
3686 crt->ip_accounting_ingress_map_fd :
3687 crt->ip_accounting_egress_map_fd;
906c06f6
DM
3688 if (fd < 0)
3689 return -ENODATA;
3690
3691 if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
6b659ed8 3692 r = bpf_firewall_read_accounting(fd, &value, NULL);
906c06f6 3693 else
6b659ed8
LP
3694 r = bpf_firewall_read_accounting(fd, NULL, &value);
3695 if (r < 0)
3696 return r;
3697
3698 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
3699 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
3700 * ip_accounting_extra[] field, and add them in here transparently. */
3701
9cc54544 3702 *ret = value + crt->ip_accounting_extra[metric];
906c06f6
DM
3703
3704 return r;
3705}
3706
4fb0d2dc
MK
3707static uint64_t unit_get_effective_limit_one(Unit *u, CGroupLimitType type) {
3708 CGroupContext *cc;
3709
3710 assert(u);
3711 assert(UNIT_HAS_CGROUP_CONTEXT(u));
3712
93f8e88d
MK
3713 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
3714 switch (type) {
3715 case CGROUP_LIMIT_MEMORY_MAX:
3716 case CGROUP_LIMIT_MEMORY_HIGH:
3717 return physical_memory();
3718 case CGROUP_LIMIT_TASKS_MAX:
3719 return system_tasks_max();
3720 default:
3721 assert_not_reached();
3722 }
3723
c658ad79 3724 cc = ASSERT_PTR(unit_get_cgroup_context(u));
4fb0d2dc 3725 switch (type) {
4fb0d2dc
MK
3726 case CGROUP_LIMIT_MEMORY_MAX:
3727 return cc->memory_max;
3728 case CGROUP_LIMIT_MEMORY_HIGH:
3729 return cc->memory_high;
3730 case CGROUP_LIMIT_TASKS_MAX:
3731 return cgroup_tasks_max_resolve(&cc->tasks_max);
3732 default:
3733 assert_not_reached();
3734 }
3735}
3736
3737int unit_get_effective_limit(Unit *u, CGroupLimitType type, uint64_t *ret) {
3738 uint64_t infimum;
3739
3740 assert(u);
3741 assert(ret);
3742 assert(type >= 0);
3743 assert(type < _CGROUP_LIMIT_TYPE_MAX);
3744
3745 if (!UNIT_HAS_CGROUP_CONTEXT(u))
3746 return -EINVAL;
3747
3748 infimum = unit_get_effective_limit_one(u, type);
3749 for (Unit *slice = UNIT_GET_SLICE(u); slice; slice = UNIT_GET_SLICE(slice))
3750 infimum = MIN(infimum, unit_get_effective_limit_one(slice, type));
3751
3752 *ret = infimum;
3753 return 0;
3754}
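
For instance, a service with TasksMax=512 inside a slice with TasksMax=256 reports an effective limit of MIN(512, 256) = 256: every ancestor's limit applies, so the tightest one wins.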
3755
bc347edf
MY
3756static int unit_get_io_accounting_raw(
3757 const Unit *u,
3758 const CGroupRuntime *crt,
3759 uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
3760
3761 static const char* const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
fbe14fc9
LP
3762 [CGROUP_IO_READ_BYTES] = "rbytes=",
3763 [CGROUP_IO_WRITE_BYTES] = "wbytes=",
3764 [CGROUP_IO_READ_OPERATIONS] = "rios=",
3765 [CGROUP_IO_WRITE_OPERATIONS] = "wios=",
3766 };
bc347edf 3767
fbe14fc9
LP
3768 uint64_t acc[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {};
3769 _cleanup_free_ char *path = NULL;
3770 _cleanup_fclose_ FILE *f = NULL;
3771 int r;
3772
3773 assert(u);
bc347edf 3774 assert(crt);
fbe14fc9 3775
bc347edf 3776 if (!crt->cgroup_path)
fbe14fc9
LP
3777 return -ENODATA;
3778
3779 if (unit_has_host_root_cgroup(u))
3780 return -ENODATA; /* TODO: return useful data for the top-level cgroup */
3781
9cc54544 3782 if (!FLAGS_SET(crt->cgroup_realized_mask, CGROUP_MASK_IO))
fbe14fc9
LP
3783 return -ENODATA;
3784
9cc54544 3785 r = cg_get_path("io", crt->cgroup_path, "io.stat", &path);
fbe14fc9
LP
3786 if (r < 0)
3787 return r;
3788
3789 f = fopen(path, "re");
3790 if (!f)
3791 return -errno;
3792
3793 for (;;) {
3794 _cleanup_free_ char *line = NULL;
3795 const char *p;
3796
3797 r = read_line(f, LONG_LINE_MAX, &line);
3798 if (r < 0)
3799 return r;
3800 if (r == 0)
3801 break;
3802
3803 p = line;
3804 p += strcspn(p, WHITESPACE); /* Skip over device major/minor */
3805 p += strspn(p, WHITESPACE); /* Skip over following whitespace */
3806
3807 for (;;) {
3808 _cleanup_free_ char *word = NULL;
3809
3810 r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE);
3811 if (r < 0)
3812 return r;
3813 if (r == 0)
3814 break;
3815
3816 for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
3817 const char *x;
3818
3819 x = startswith(word, field_names[i]);
3820 if (x) {
3821 uint64_t w;
3822
3823 r = safe_atou64(x, &w);
3824 if (r < 0)
3825 return r;
3826
3827 /* Sum up the stats of all devices */
3828 acc[i] += w;
3829 break;
3830 }
3831 }
3832 }
3833 }
3834
3835 memcpy(ret, acc, sizeof(acc));
3836 return 0;
3837}
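
For reference, io.stat contains one line per device, each starting with the major:minor pair followed by keyed counters, e.g. (illustrative values):

        8:0 rbytes=90402304 wbytes=299008000 rios=8950 wios=1252 dbytes=0 dios=0
        253:0 rbytes=2048 wbytes=0 rios=2 wios=0 dbytes=0 dios=0

The loop above skips the leading major:minor field and sums the rbytes=, wbytes=, rios= and wios= counters over all devices; other keys such as dbytes= match no field name and are ignored.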
3838
3839int unit_get_io_accounting(
3840 Unit *u,
3841 CGroupIOAccountingMetric metric,
fbe14fc9
LP
3842 uint64_t *ret) {
3843
3844 uint64_t raw[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
3845 int r;
3846
17bbdefd
IS
3847 /*
3848 * Retrieve an IO counter, subtracting the value of the counter value at the time the unit was started.
3849 * If ret == NULL and metric == _<...>_INVALID, no return value is expected (refresh the caches only).
3850 */
3851
3852 assert(u);
3853 assert(metric >= 0 || (!ret && metric == _CGROUP_IO_ACCOUNTING_METRIC_INVALID));
3854 assert(metric < _CGROUP_IO_ACCOUNTING_METRIC_MAX);
fbe14fc9
LP
3855
3856 if (!UNIT_CGROUP_BOOL(u, io_accounting))
3857 return -ENODATA;
3858
9cc54544 3859 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3849d1f5 3860 if (!crt)
9cc54544
LP
3861 return -ENODATA;
3862
bc347edf 3863 r = unit_get_io_accounting_raw(u, crt, raw);
17bbdefd 3864 if (r == -ENODATA && metric >= 0 && crt->io_accounting_last[metric] != UINT64_MAX)
fbe14fc9
LP
3865 goto done;
3866 if (r < 0)
3867 return r;
3868
3869 for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
3870 /* Saturated subtraction */
9cc54544
LP
3871 if (raw[i] > crt->io_accounting_base[i])
3872 crt->io_accounting_last[i] = raw[i] - crt->io_accounting_base[i];
fbe14fc9 3873 else
9cc54544 3874 crt->io_accounting_last[i] = 0;
fbe14fc9
LP
3875 }
3876
3877done:
3878 if (ret)
9cc54544 3879 *ret = crt->io_accounting_last[metric];
fbe14fc9
LP
3880
3881 return 0;
3882}
3883
bc347edf 3884static int unit_reset_cpu_accounting(Unit *unit, CGroupRuntime *crt) {
5ad096b3
LP
3885 int r;
3886
bc347edf 3887 assert(crt);
9cc54544 3888
bc347edf 3889 crt->cpu_usage_base = 0;
9cc54544 3890 crt->cpu_usage_last = NSEC_INFINITY;
fe700f46 3891
bc347edf
MY
3892 if (unit) {
3893 r = unit_get_cpu_usage_raw(unit, crt, &crt->cpu_usage_base);
3894 if (r < 0 && r != -ENODATA)
3895 return r;
b56c28c3 3896 }
2633eb83 3897
4ad49000 3898 return 0;
4fbf50b3
LP
3899}
3900
bc347edf
MY
3901static int unit_reset_io_accounting(Unit *unit, CGroupRuntime *crt) {
3902 int r;
d4bdc202 3903
bc347edf
MY
3904 assert(crt);
3905
3906 zero(crt->io_accounting_base);
3907 FOREACH_ELEMENT(i, crt->io_accounting_last)
3908 *i = UINT64_MAX;
3909
3910 if (unit) {
3911 r = unit_get_io_accounting_raw(unit, crt, crt->io_accounting_base);
3912 if (r < 0 && r != -ENODATA)
3913 return r;
3914 }
3915
3916 return 0;
3917}
3918
3919static void cgroup_runtime_reset_memory_accounting_last(CGroupRuntime *crt) {
3920 assert(crt);
9cc54544 3921
85471164 3922 FOREACH_ELEMENT(i, crt->memory_accounting_last)
d4bdc202
MY
3923 *i = UINT64_MAX;
3924}
3925
bc347edf 3926static int cgroup_runtime_reset_ip_accounting(CGroupRuntime *crt) {
cbd2abbb 3927 int r = 0;
906c06f6 3928
bc347edf 3929 assert(crt);
9cc54544
LP
3930
3931 if (crt->ip_accounting_ingress_map_fd >= 0)
3932 RET_GATHER(r, bpf_firewall_reset_accounting(crt->ip_accounting_ingress_map_fd));
906c06f6 3933
9cc54544
LP
3934 if (crt->ip_accounting_egress_map_fd >= 0)
3935 RET_GATHER(r, bpf_firewall_reset_accounting(crt->ip_accounting_egress_map_fd));
906c06f6 3936
9cc54544 3937 zero(crt->ip_accounting_extra);
6b659ed8 3938
cbd2abbb 3939 return r;
906c06f6
DM
3940}
3941
bc347edf
MY
3942int unit_reset_accounting(Unit *u) {
3943 int r = 0;
fbe14fc9
LP
3944
3945 assert(u);
3946
9cc54544 3947 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
bc347edf 3948 if (!crt)
9cc54544
LP
3949 return 0;
3950
bc347edf
MY
3951 cgroup_runtime_reset_memory_accounting_last(crt);
3952 RET_GATHER(r, unit_reset_cpu_accounting(u, crt));
3953 RET_GATHER(r, unit_reset_io_accounting(u, crt));
3954 RET_GATHER(r, cgroup_runtime_reset_ip_accounting(crt));
9b2559a1 3955
cbd2abbb 3956 return r;
9b2559a1
LP
3957}
3958
e7ab4d1a
LP
3959void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
3960 assert(u);
3961
3962 if (!UNIT_HAS_CGROUP_CONTEXT(u))
3963 return;
3964
9cc54544
LP
3965 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3966 if (!crt)
3967 return;
3968
9cc54544 3969 if (FLAGS_SET(crt->cgroup_invalidated_mask, m)) /* NOP? */
e7ab4d1a
LP
3970 return;
3971
9cc54544 3972 crt->cgroup_invalidated_mask |= m;
91a6073e 3973 unit_add_to_cgroup_realize_queue(u);
e7ab4d1a
LP
3974}
3975
906c06f6
DM
3976void unit_invalidate_cgroup_bpf(Unit *u) {
3977 assert(u);
3978
3979 if (!UNIT_HAS_CGROUP_CONTEXT(u))
3980 return;
3981
9cc54544
LP
3982 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3983 if (!crt)
906c06f6
DM
3984 return;
3985
9cc54544
LP
3986 if (crt->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */
3987 return;
3988
3989 crt->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
91a6073e 3990 unit_add_to_cgroup_realize_queue(u);
906c06f6
DM
3991
3992 /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
3993 * list of our children includes our own. */
3994 if (u->type == UNIT_SLICE) {
3995 Unit *member;
906c06f6 3996
d219a2b0 3997 UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
15ed3c3a 3998 unit_invalidate_cgroup_bpf(member);
906c06f6
DM
3999 }
4000}
4001
869f52f2
DS
4002void unit_cgroup_catchup(Unit *u) {
4003 assert(u);
4004
4005 if (!UNIT_HAS_CGROUP_CONTEXT(u))
4006 return;
4007
4008 /* We dropped the inotify watch during reexec/reload, so we need to
4009 * check these as they may have changed.
4010 * Note that (currently) the kernel doesn't actually update cgroup
4011 * file modification times, so we can't just serialize and then check
4012 * the mtime for file(s) we are interested in. */
4013 (void) unit_check_cgroup_events(u);
4014 unit_add_to_cgroup_oom_queue(u);
4015}
4016
1d9cc876
LP
4017bool unit_cgroup_delegate(Unit *u) {
4018 CGroupContext *c;
4019
4020 assert(u);
4021
4022 if (!UNIT_VTABLE(u)->can_delegate)
4023 return false;
4024
4025 c = unit_get_cgroup_context(u);
4026 if (!c)
4027 return false;
4028
4029 return c->delegate;
4030}
4031
e7ab4d1a 4032void manager_invalidate_startup_units(Manager *m) {
e7ab4d1a
LP
4033 Unit *u;
4034
4035 assert(m);
4036
90e74a66 4037 SET_FOREACH(u, m->startup_units)
9dfb6a3a 4038 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO|CGROUP_MASK_CPUSET);
e7ab4d1a
LP
4039}
4040
16b6af6a
AV
4041static int unit_cgroup_freezer_kernel_state(Unit *u, FreezerState *ret) {
4042 _cleanup_free_ char *val = NULL;
4043 FreezerState s;
4044 int r;
d9e45bc3
MS
4045
4046 assert(u);
16b6af6a 4047 assert(ret);
d9e45bc3 4048
9cc54544
LP
4049 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4050 if (!crt || !crt->cgroup_path)
4051 return -EOWNERDEAD;
4052
4053 r = cg_get_keyed_attribute(
4054 SYSTEMD_CGROUP_CONTROLLER,
4055 crt->cgroup_path,
4056 "cgroup.events",
4057 STRV_MAKE("frozen"),
4058 &val);
16b6af6a
AV
4059 if (r < 0)
4060 return r;
9a1e90ae 4061
16b6af6a
AV
4062 if (streq(val, "0"))
4063 s = FREEZER_RUNNING;
4064 else if (streq(val, "1"))
4065 s = FREEZER_FROZEN;
4066 else {
4e494e6a 4067 log_unit_debug(u, "Unexpected cgroup frozen state: %s", val);
16b6af6a
AV
4068 s = _FREEZER_STATE_INVALID;
4069 }
a14137d9 4070
16b6af6a
AV
4071 *ret = s;
4072 return 0;
4073}
d9e45bc3 4074
16b6af6a
AV
4075int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
4076 _cleanup_free_ char *path = NULL;
f27f461b 4077 FreezerState current, next, objective;
85d00912 4078 bool action_in_progress = false;
16b6af6a 4079 int r;
a14137d9 4080
16b6af6a 4081 assert(u);
21fed6ea
MY
4082 assert(action >= 0);
4083 assert(action < _FREEZER_ACTION_MAX);
16b6af6a 4084
f27f461b 4085 unit_next_freezer_state(u, action, &next, &objective);
d9e45bc3 4086
7923e949 4087 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
a9dc1961 4088 if (!crt || !crt->cgroup_path)
7923e949 4089 /* No realized cgroup = nothing to freeze */
85d00912 4090 goto finish;
7923e949 4091
16b6af6a 4092 r = unit_cgroup_freezer_kernel_state(u, &current);
d9e45bc3 4093 if (r < 0)
16b6af6a 4094 return r;
d9e45bc3 4095
85d00912 4096 if (current == objective) {
4097 if (objective == FREEZER_FROZEN)
4098 goto finish;
4099
4100 /* Skip thaw only if no freeze operation was in flight */
4101 if (IN_SET(u->freezer_state, FREEZER_RUNNING, FREEZER_THAWING))
4102 goto finish;
4103 } else
4104 action_in_progress = true;
a9dc1961
MY
4105
4106 if (next == freezer_state_finish(next)) {
27344f9a
MY
4107 /* We're directly transitioning into a finished state, which in theory means that
4108 * the cgroup's current state already matches the objective and thus we'd return 0.
4109 * But reality shows otherwise (such a case would have been handled by the current == objective
4110 * branch above). This indicates that our freezer_state tracking has diverged
16b6af6a
AV
4111 * from the real state of the cgroup, which can happen if someone meddles with the
4112 * cgroup from underneath us. This really shouldn't happen during normal operation,
4113 * though. So, let's warn about it and fix up the state to be valid. */
4114
4115 log_unit_warning(u, "Unit wants to transition to %s freezer state but cgroup is unexpectedly %s, fixing up.",
4116 freezer_state_to_string(next), freezer_state_to_string(current) ?: "(invalid)");
4117
4118 if (next == FREEZER_FROZEN)
4119 next = FREEZER_FREEZING;
4120 else if (next == FREEZER_FROZEN_BY_PARENT)
4121 next = FREEZER_FREEZING_BY_PARENT;
4122 else if (next == FREEZER_RUNNING)
4123 next = FREEZER_THAWING;
a9dc1961
MY
4124 else
4125 assert_not_reached();
16b6af6a 4126 }
d9e45bc3 4127
9cc54544 4128 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "cgroup.freeze", &path);
d9e45bc3
MS
4129 if (r < 0)
4130 return r;
4131
f27f461b 4132 r = write_string_file(path, one_zero(objective == FREEZER_FROZEN), WRITE_STRING_FILE_DISABLE_BUFFER);
d9e45bc3
MS
4133 if (r < 0)
4134 return r;
4135
85d00912 4136finish:
4137 if (action_in_progress)
4138 unit_set_freezer_state(u, next);
4139 else
4140 unit_set_freezer_state(u, freezer_state_finish(next));
a9dc1961 4141
85d00912 4142 return action_in_progress;
d9e45bc3
MS
4143}
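
On the kernel side the whole action reduces to writing "1" or "0" into cgroup.freeze and waiting for the frozen key in cgroup.events to flip; systemd waits asynchronously via the inotify machinery above, but a blocking standalone sketch looks like this (paths hypothetical):

#include <stdio.h>
#include <unistd.h>

static int freeze_cgroup(const char *dir, int freeze) {
        char path[256], line[64];
        FILE *f;

        snprintf(path, sizeof path, "%s/cgroup.freeze", dir);
        f = fopen(path, "we");
        if (!f)
                return -1;
        fprintf(f, "%d\n", freeze);
        fclose(f);

        /* Poll cgroup.events until the kernel reports the target state;
         * systemd instead waits for an inotify event on this file. */
        snprintf(path, sizeof path, "%s/cgroup.events", dir);
        for (;;) {
                int frozen = -1;
                f = fopen(path, "re");
                if (!f)
                        return -1;
                while (fgets(line, sizeof line, f))
                        (void) sscanf(line, "frozen %d", &frozen);
                fclose(f);
                if (frozen == freeze)
                        return 0;
                usleep(10 * 1000);
        }
}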
4144
047f5d63
PH
4145int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
4146 _cleanup_free_ char *v = NULL;
4147 int r;
4148
4149 assert(u);
4150 assert(cpus);
4151
9cc54544
LP
4152 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4153 if (!crt || !crt->cgroup_path)
047f5d63
PH
4154 return -ENODATA;
4155
9cc54544 4156 if ((crt->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0)
047f5d63
PH
4157 return -ENODATA;
4158
9cc54544 4159 r = cg_get_attribute("cpuset", crt->cgroup_path, name, &v);
047f5d63
PH
4160 if (r == -ENOENT)
4161 return -ENODATA;
4162 if (r < 0)
4163 return r;
4164
168351ee 4165 return parse_cpu_set(v, cpus);
047f5d63
PH
4166}
4167
bc347edf 4168CGroupRuntime* cgroup_runtime_new(void) {
9cc54544
LP
4169 _cleanup_(cgroup_runtime_freep) CGroupRuntime *crt = NULL;
4170
4171 crt = new(CGroupRuntime, 1);
4172 if (!crt)
4173 return NULL;
4174
4175 *crt = (CGroupRuntime) {
9cc54544
LP
4176 .cgroup_control_inotify_wd = -1,
4177 .cgroup_memory_inotify_wd = -1,
4178
4179 .ip_accounting_ingress_map_fd = -EBADF,
4180 .ip_accounting_egress_map_fd = -EBADF,
4181
4182 .ipv4_allow_map_fd = -EBADF,
4183 .ipv6_allow_map_fd = -EBADF,
4184 .ipv4_deny_map_fd = -EBADF,
4185 .ipv6_deny_map_fd = -EBADF,
4186
4187 .cgroup_invalidated_mask = _CGROUP_MASK_ALL,
23ac0811
MY
4188
4189 .deserialized_cgroup_realized = -1,
9cc54544
LP
4190 };
4191
bc347edf
MY
4192 unit_reset_cpu_accounting(/* unit = */ NULL, crt);
4193 unit_reset_io_accounting(/* unit = */ NULL, crt);
4194 cgroup_runtime_reset_memory_accounting_last(crt);
4195 assert_se(cgroup_runtime_reset_ip_accounting(crt) >= 0);
9cc54544
LP
4196
4197 return TAKE_PTR(crt);
4198}
4199
bc347edf 4200CGroupRuntime* cgroup_runtime_free(CGroupRuntime *crt) {
9cc54544
LP
4201 if (!crt)
4202 return NULL;
4203
4204 fdset_free(crt->initial_socket_bind_link_fds);
4205#if BPF_FRAMEWORK
4206 bpf_link_free(crt->ipv4_socket_bind_link);
4207 bpf_link_free(crt->ipv6_socket_bind_link);
4208#endif
4209 hashmap_free(crt->bpf_foreign_by_key);
4210
4211 bpf_program_free(crt->bpf_device_control_installed);
4212
4213#if BPF_FRAMEWORK
4214 bpf_link_free(crt->restrict_ifaces_ingress_bpf_link);
4215 bpf_link_free(crt->restrict_ifaces_egress_bpf_link);
4216#endif
33b93371 4217 fdset_free(crt->initial_restrict_ifaces_link_fds);
9cc54544 4218
5a8c2c95 4219 bpf_firewall_close(crt);
9cc54544 4220
9cc54544
LP
4221 free(crt->cgroup_path);
4222
4223 return mfree(crt);
4224}
4225
4226static const char* const ip_accounting_metric_field_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
4227 [CGROUP_IP_INGRESS_BYTES] = "ip-accounting-ingress-bytes",
4228 [CGROUP_IP_INGRESS_PACKETS] = "ip-accounting-ingress-packets",
4229 [CGROUP_IP_EGRESS_BYTES] = "ip-accounting-egress-bytes",
4230 [CGROUP_IP_EGRESS_PACKETS] = "ip-accounting-egress-packets",
4231};
4232
4233DEFINE_PRIVATE_STRING_TABLE_LOOKUP(ip_accounting_metric_field, CGroupIPAccountingMetric);
4234
4235static const char* const io_accounting_metric_field_base_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
4236 [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-base",
4237 [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-base",
4238 [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-base",
4239 [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-base",
4240};
4241
4242DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_base, CGroupIOAccountingMetric);
4243
4244static const char* const io_accounting_metric_field_last_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
4245 [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-last",
4246 [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-last",
4247 [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-last",
4248 [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-last",
4249};
4250
4251DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_last, CGroupIOAccountingMetric);
4252
4253static const char* const memory_accounting_metric_field_last_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1] = {
4254 [CGROUP_MEMORY_PEAK] = "memory-accounting-peak",
4255 [CGROUP_MEMORY_SWAP_PEAK] = "memory-accounting-swap-peak",
4256};
4257
4258DEFINE_PRIVATE_STRING_TABLE_LOOKUP(memory_accounting_metric_field_last, CGroupMemoryAccountingMetric);
4259
4260static int serialize_cgroup_mask(FILE *f, const char *key, CGroupMask mask) {
4261 _cleanup_free_ char *s = NULL;
4262 int r;
4263
4264 assert(f);
4265 assert(key);
4266
4267 if (mask == 0)
4268 return 0;
4269
4270 r = cg_mask_to_string(mask, &s);
4271 if (r < 0)
4272 return log_error_errno(r, "Failed to format cgroup mask: %m");
4273
4274 return serialize_item(f, key, s);
4275}
4276
4277int cgroup_runtime_serialize(Unit *u, FILE *f, FDSet *fds) {
4278 int r;
4279
4280 assert(u);
4281 assert(f);
4282 assert(fds);
4283
4284 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4285 if (!crt)
4286 return 0;
4287
4288 (void) serialize_item_format(f, "cpu-usage-base", "%" PRIu64, crt->cpu_usage_base);
4289 if (crt->cpu_usage_last != NSEC_INFINITY)
4290 (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, crt->cpu_usage_last);
4291
4292 if (crt->managed_oom_kill_last > 0)
4293 (void) serialize_item_format(f, "managed-oom-kill-last", "%" PRIu64, crt->managed_oom_kill_last);
4294
4295 if (crt->oom_kill_last > 0)
4296 (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, crt->oom_kill_last);
4297
4298 for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++) {
4299 uint64_t v;
4300
4301 r = unit_get_memory_accounting(u, metric, &v);
4302 if (r >= 0)
4303 (void) serialize_item_format(f, memory_accounting_metric_field_last_to_string(metric), "%" PRIu64, v);
4304 }
4305
4306 for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) {
4307 uint64_t v;
4308
4309 r = unit_get_ip_accounting(u, m, &v);
4310 if (r >= 0)
4311 (void) serialize_item_format(f, ip_accounting_metric_field_to_string(m), "%" PRIu64, v);
4312 }
4313
4314 for (CGroupIOAccountingMetric im = 0; im < _CGROUP_IO_ACCOUNTING_METRIC_MAX; im++) {
4315 (void) serialize_item_format(f, io_accounting_metric_field_base_to_string(im), "%" PRIu64, crt->io_accounting_base[im]);
4316
4317 if (crt->io_accounting_last[im] != UINT64_MAX)
4318 (void) serialize_item_format(f, io_accounting_metric_field_last_to_string(im), "%" PRIu64, crt->io_accounting_last[im]);
4319 }
4320
4321 if (crt->cgroup_path)
4322 (void) serialize_item(f, "cgroup", crt->cgroup_path);
4323 if (crt->cgroup_id != 0)
4324 (void) serialize_item_format(f, "cgroup-id", "%" PRIu64, crt->cgroup_id);
4325
9cc54544
LP
4326 (void) serialize_cgroup_mask(f, "cgroup-realized-mask", crt->cgroup_realized_mask);
4327 (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", crt->cgroup_enabled_mask);
4328 (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", crt->cgroup_invalidated_mask);
4329
4330 (void) bpf_socket_bind_serialize(u, f, fds);
4331
4332 (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-ingress-installed", crt->ip_bpf_ingress_installed);
4333 (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-egress-installed", crt->ip_bpf_egress_installed);
4334 (void) bpf_program_serialize_attachment(f, fds, "bpf-device-control-installed", crt->bpf_device_control_installed);
4335 (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-ingress-installed", crt->ip_bpf_custom_ingress_installed);
4336 (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-egress-installed", crt->ip_bpf_custom_egress_installed);
4337
4338 (void) bpf_restrict_ifaces_serialize(u, f, fds);
4339
4340 return 0;
4341}
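
Everything above lands in the unit serialization stream as flat key=value lines, for example (values illustrative):

        cpu-usage-base=123456789
        memory-accounting-peak=104857600
        cgroup=/system.slice/example.service
        cgroup-realized-mask=cpu memory pids

cgroup_runtime_deserialize_one() below parses exactly these keys, so the two sides have to stay in sync.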
4342
4343#define MATCH_DESERIALIZE(u, key, l, v, parse_func, target) \
4344 ({ \
4345 bool _deserialize_matched = streq(l, key); \
4346 if (_deserialize_matched) { \
4347 CGroupRuntime *crt = unit_setup_cgroup_runtime(u); \
4348 if (!crt) \
4349 log_oom_debug(); \
4350 else { \
4351 int _deserialize_r = parse_func(v); \
4352 if (_deserialize_r < 0) \
4353 log_unit_debug_errno(u, _deserialize_r, \
4354 "Failed to parse \"%s=%s\", ignoring.", l, v); \
4355 else \
4356 crt->target = _deserialize_r; \
4357 } \
4358 } \
4359 _deserialize_matched; \
4360 })
4361
4362#define MATCH_DESERIALIZE_IMMEDIATE(u, key, l, v, parse_func, target) \
4363 ({ \
4364 bool _deserialize_matched = streq(l, key); \
4365 if (_deserialize_matched) { \
4366 CGroupRuntime *crt = unit_setup_cgroup_runtime(u); \
4367 if (!crt) \
4368 log_oom_debug(); \
4369 else { \
4370 int _deserialize_r = parse_func(v, &crt->target); \
4371 if (_deserialize_r < 0) \
4372 log_unit_debug_errno(u, _deserialize_r, \
4373 "Failed to parse \"%s=%s\", ignoring", l, v); \
4374 } \
4375 } \
4376 _deserialize_matched; \
4377 })
4378
4379#define MATCH_DESERIALIZE_METRIC(u, key, l, v, parse_func, target) \
4380 ({ \
4381 bool _deserialize_matched = streq(l, key); \
4382 if (_deserialize_matched) { \
4383 CGroupRuntime *crt = unit_setup_cgroup_runtime(u); \
4384 if (!crt) \
4385 log_oom_debug(); \
4386 else { \
4387 int _deserialize_r = parse_func(v); \
4388 if (_deserialize_r < 0) \
4389 log_unit_debug_errno(u, _deserialize_r, \
4390 "Failed to parse \"%s=%s\", ignoring.", l, v); \
4391 else \
4392 crt->target = _deserialize_r; \
4393 } \
4394 } \
4395 _deserialize_matched; \
4396 })
4397
4398int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value, FDSet *fds) {
4399 int r;
4400
4401 assert(u);
4402 assert(value);
4403
4404 if (!UNIT_HAS_CGROUP_CONTEXT(u))
4405 return 0;
4406
4407 if (MATCH_DESERIALIZE_IMMEDIATE(u, "cpu-usage-base", key, value, safe_atou64, cpu_usage_base) ||
4408 MATCH_DESERIALIZE_IMMEDIATE(u, "cpuacct-usage-base", key, value, safe_atou64, cpu_usage_base))
4409 return 1;
4410
4411 if (MATCH_DESERIALIZE_IMMEDIATE(u, "cpu-usage-last", key, value, safe_atou64, cpu_usage_last))
4412 return 1;
4413
4414 if (MATCH_DESERIALIZE_IMMEDIATE(u, "managed-oom-kill-last", key, value, safe_atou64, managed_oom_kill_last))
4415 return 1;
4416
4417 if (MATCH_DESERIALIZE_IMMEDIATE(u, "oom-kill-last", key, value, safe_atou64, oom_kill_last))
4418 return 1;
4419
4420 if (streq(key, "cgroup")) {
4421 r = unit_set_cgroup_path(u, value);
4422 if (r < 0)
4423 log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", value);
4424
9cc54544
LP
4425 return 1;
4426 }
4427
4428 if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-id", key, value, safe_atou64, cgroup_id))
4429 return 1;
4430
23ac0811 4431 if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-realized", key, value, parse_tristate, deserialized_cgroup_realized))
9cc54544
LP
4432 return 1;
4433
4434 if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-realized-mask", key, value, cg_mask_from_string, cgroup_realized_mask))
4435 return 1;
4436
4437 if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-enabled-mask", key, value, cg_mask_from_string, cgroup_enabled_mask))
4438 return 1;
4439
4440 if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-invalidated-mask", key, value, cg_mask_from_string, cgroup_invalidated_mask))
4441 return 1;
4442
4443 if (STR_IN_SET(key, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) {
4444 int fd;
4445
4446 fd = deserialize_fd(fds, value);
4447 if (fd >= 0)
4448 (void) bpf_socket_bind_add_initial_link_fd(u, fd);
4449
4450 return 1;
4451 }
4452
4453 if (STR_IN_SET(key,
4454 "ip-bpf-ingress-installed", "ip-bpf-egress-installed",
4455 "bpf-device-control-installed",
4456 "ip-bpf-custom-ingress-installed", "ip-bpf-custom-egress-installed")) {
4457
4458 CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
4459 if (!crt)
4460 log_oom_debug();
4461 else {
4462 if (streq(key, "ip-bpf-ingress-installed"))
4463 (void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_ingress_installed);
4464
4465 if (streq(key, "ip-bpf-egress-installed"))
4466 (void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_egress_installed);
4467
4468 if (streq(key, "bpf-device-control-installed"))
4469 (void) bpf_program_deserialize_attachment(value, fds, &crt->bpf_device_control_installed);
4470
4471 if (streq(key, "ip-bpf-custom-ingress-installed"))
4472 (void) bpf_program_deserialize_attachment_set(value, fds, &crt->ip_bpf_custom_ingress_installed);
4473
4474 if (streq(key, "ip-bpf-custom-egress-installed"))
4475 (void) bpf_program_deserialize_attachment_set(value, fds, &crt->ip_bpf_custom_egress_installed);
4476 }
4477
4478 return 1;
4479 }
4480
4481 if (streq(key, "restrict-ifaces-bpf-fd")) {
4482 int fd;
4483
4484 fd = deserialize_fd(fds, value);
4485 if (fd >= 0)
4486 (void) bpf_restrict_ifaces_add_initial_link_fd(u, fd);
4487 return 1;
4488 }
4489
4490 CGroupMemoryAccountingMetric mm = memory_accounting_metric_field_last_from_string(key);
4491 if (mm >= 0) {
4492 uint64_t c;
4493
4494 r = safe_atou64(value, &c);
4495 if (r < 0)
4496 log_unit_debug(u, "Failed to parse memory accounting last value %s, ignoring.", value);
4497 else {
4498 CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
4499 if (!crt)
4500 log_oom_debug();
4501 else
4502 crt->memory_accounting_last[mm] = c;
4503 }
4504
4505 return 1;
4506 }
4507
4508 CGroupIPAccountingMetric ipm = ip_accounting_metric_field_from_string(key);
4509 if (ipm >= 0) {
4510 uint64_t c;
4511
4512 r = safe_atou64(value, &c);
4513 if (r < 0)
4514 log_unit_debug(u, "Failed to parse IP accounting value %s, ignoring.", value);
4515 else {
4516 CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
4517 if (!crt)
4518 log_oom_debug();
4519 else
4520 crt->ip_accounting_extra[ipm] = c;
4521 }
4522
4523 return 1;
4524 }
4525
4526 CGroupIOAccountingMetric iom = io_accounting_metric_field_base_from_string(key);
4527 if (iom >= 0) {
4528 uint64_t c;
4529
4530 r = safe_atou64(value, &c);
4531 if (r < 0)
4532 log_unit_debug(u, "Failed to parse IO accounting base value %s, ignoring.", value);
4533 else {
4534 CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
4535 if (!crt)
4536 log_oom_debug();
4537 else
4538 crt->io_accounting_base[iom] = c;
4539 }
4540
4541 return 1;
4542 }
4543
4544 iom = io_accounting_metric_field_last_from_string(key);
4545 if (iom >= 0) {
4546 uint64_t c;
4547
4548 r = safe_atou64(value, &c);
4549 if (r < 0)
4550 log_unit_debug(u, "Failed to parse IO accounting last value %s, ignoring.", value);
4551 else {
4552 CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
4553 if (!crt)
4554 log_oom_debug();
4555 else
4556 crt->io_accounting_last[iom] = c;
4557 }
4558 return 1;
4559 }
4560
4561 return 0;
4562}
4563
4e806bfa
AZ
4564static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
4565 [CGROUP_DEVICE_POLICY_AUTO] = "auto",
4566 [CGROUP_DEVICE_POLICY_CLOSED] = "closed",
4567 [CGROUP_DEVICE_POLICY_STRICT] = "strict",
4568};
4569
4ad49000 4570DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
d9e45bc3 4571
6bb00842 4572static const char* const cgroup_pressure_watch_table[_CGROUP_PRESSURE_WATCH_MAX] = {
7354936e
YW
4573 [CGROUP_PRESSURE_WATCH_NO] = "no",
4574 [CGROUP_PRESSURE_WATCH_YES] = "yes",
6bb00842 4575 [CGROUP_PRESSURE_WATCH_AUTO] = "auto",
6bb00842
LP
4576 [CGROUP_PRESSURE_WATCH_SKIP] = "skip",
4577};
4578
7354936e 4579DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(cgroup_pressure_watch, CGroupPressureWatch, CGROUP_PRESSURE_WATCH_YES);
435996e6
DDM
4580
4581static const char* const cgroup_ip_accounting_metric_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
4582 [CGROUP_IP_INGRESS_BYTES] = "IPIngressBytes",
4583 [CGROUP_IP_EGRESS_BYTES] = "IPEgressBytes",
4584 [CGROUP_IP_INGRESS_PACKETS] = "IPIngressPackets",
4585 [CGROUP_IP_EGRESS_PACKETS] = "IPEgressPackets",
4586};
4587
4588DEFINE_STRING_TABLE_LOOKUP(cgroup_ip_accounting_metric, CGroupIPAccountingMetric);
4589
4590static const char* const cgroup_io_accounting_metric_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
4591 [CGROUP_IO_READ_BYTES] = "IOReadBytes",
4592 [CGROUP_IO_WRITE_BYTES] = "IOWriteBytes",
4593 [CGROUP_IO_READ_OPERATIONS] = "IOReadOperations",
4594 [CGROUP_IO_WRITE_OPERATIONS] = "IOWriteOperations",
4595};
4596
4597DEFINE_STRING_TABLE_LOOKUP(cgroup_io_accounting_metric, CGroupIOAccountingMetric);
9824ab1f
MY
4598
4599static const char* const cgroup_memory_accounting_metric_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX] = {
702aa339 4600 [CGROUP_MEMORY_CURRENT] = "MemoryCurrent",
9824ab1f
MY
4601 [CGROUP_MEMORY_PEAK] = "MemoryPeak",
4602 [CGROUP_MEMORY_SWAP_CURRENT] = "MemorySwapCurrent",
4603 [CGROUP_MEMORY_SWAP_PEAK] = "MemorySwapPeak",
4604 [CGROUP_MEMORY_ZSWAP_CURRENT] = "MemoryZSwapCurrent",
4605};
4606
4607DEFINE_STRING_TABLE_LOOKUP(cgroup_memory_accounting_metric, CGroupMemoryAccountingMetric);
4fb0d2dc 4608
8ad61489 4609static const char *const cgroup_effective_limit_type_table[_CGROUP_LIMIT_TYPE_MAX] = {
4fb0d2dc
MK
4610 [CGROUP_LIMIT_MEMORY_MAX] = "EffectiveMemoryMax",
4611 [CGROUP_LIMIT_MEMORY_HIGH] = "EffectiveMemoryHigh",
4612 [CGROUP_LIMIT_TASKS_MAX] = "EffectiveTasksMax",
4613};
4614
8ad61489 4615DEFINE_STRING_TABLE_LOOKUP(cgroup_effective_limit_type, CGroupLimitType);