src/core/cgroup.c (thirdparty/systemd.git, git.ipfire.org)
Blob at commit "cgroup: Add EffectiveMemoryMax=, EffectiveMemoryHigh= and EffectiveTasksMax= properties"
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <fcntl.h>
4
5 #include "sd-messages.h"
6
7 #include "af-list.h"
8 #include "alloc-util.h"
9 #include "blockdev-util.h"
10 #include "bpf-devices.h"
11 #include "bpf-firewall.h"
12 #include "bpf-foreign.h"
13 #include "bpf-socket-bind.h"
14 #include "btrfs-util.h"
15 #include "bus-error.h"
16 #include "bus-locator.h"
17 #include "cgroup-setup.h"
18 #include "cgroup-util.h"
19 #include "cgroup.h"
20 #include "devnum-util.h"
21 #include "fd-util.h"
22 #include "fileio.h"
23 #include "firewall-util.h"
24 #include "in-addr-prefix-util.h"
25 #include "inotify-util.h"
26 #include "io-util.h"
27 #include "ip-protocol-list.h"
28 #include "limits-util.h"
29 #include "nulstr-util.h"
30 #include "parse-util.h"
31 #include "path-util.h"
32 #include "percent-util.h"
33 #include "process-util.h"
34 #include "procfs-util.h"
35 #include "restrict-ifaces.h"
36 #include "special.h"
37 #include "stdio-util.h"
38 #include "string-table.h"
39 #include "string-util.h"
40 #include "virt.h"
41
42 #if BPF_FRAMEWORK
43 #include "bpf-dlopen.h"
44 #include "bpf-link.h"
45 #include "bpf/restrict_fs/restrict-fs-skel.h"
46 #endif
47
48 #define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
49
50 /* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
51 * problems we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which want to mask
52 * out specific attributes from us. */
53 #define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING)
54
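/* Resolve a TasksMax= setting to an absolute number: if a scale is set, the value is interpreted as a
 * fraction of the system-wide maximum number of tasks; otherwise the value is used verbatim. */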
55 uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax *tasks_max) {
56 if (tasks_max->scale == 0)
57 return tasks_max->value;
58
59 return system_tasks_max_scale(tasks_max->value, tasks_max->scale);
60 }
61
62 bool manager_owns_host_root_cgroup(Manager *m) {
63 assert(m);
64
65 /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
66 * cgroup root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there
67 * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if
68 * we run in any kind of container virtualization. */
69
70 if (MANAGER_IS_USER(m))
71 return false;
72
73 if (detect_container() > 0)
74 return false;
75
76 return empty_or_root(m->cgroup_root);
77 }
78
79 bool unit_has_startup_cgroup_constraints(Unit *u) {
80 assert(u);
81
82 /* Returns true if this unit has any directives which apply during
83 * startup/shutdown phases. */
84
85 CGroupContext *c;
86
87 c = unit_get_cgroup_context(u);
88 if (!c)
89 return false;
90
91 return c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
92 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
93 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
94 c->startup_cpuset_cpus.set ||
95 c->startup_cpuset_mems.set ||
96 c->startup_memory_high_set ||
97 c->startup_memory_max_set ||
98 c->startup_memory_swap_max_set ||
99 c->startup_memory_zswap_max_set ||
100 c->startup_memory_low_set;
101 }
102
103 bool unit_has_host_root_cgroup(Unit *u) {
104 assert(u);
105
106 /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
107 * the manager manages the root cgroup. */
108
109 if (!manager_owns_host_root_cgroup(u->manager))
110 return false;
111
112 return unit_has_name(u, SPECIAL_ROOT_SLICE);
113 }
114
115 static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) {
116 int r;
117
118 r = cg_set_attribute(controller, u->cgroup_path, attribute, value);
119 if (r < 0)
120 log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m",
121 strna(attribute), empty_to_root(u->cgroup_path), (int) strcspn(value, NEWLINE), value);
122
123 return r;
124 }
125
126 static void cgroup_compat_warn(void) {
127 static bool cgroup_compat_warned = false;
128
129 if (cgroup_compat_warned)
130 return;
131
132 log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
133 "See cgroup-compat debug messages for details.");
134
135 cgroup_compat_warned = true;
136 }
137
138 #define log_cgroup_compat(unit, fmt, ...) do { \
139 cgroup_compat_warn(); \
140 log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \
141 } while (false)
142
143 void cgroup_context_init(CGroupContext *c) {
144 assert(c);
145
146 /* Initialize everything to the kernel defaults. When initializing a bool member to 'true', make
147 * sure to serialize in execute-serialize.c using serialize_bool() instead of
148 * serialize_bool_elide(), as sd-executor will initialize here to 'true', but serialize_bool_elide()
149 * skips serialization if the value is 'false' (as that's the common default), so if the value at
150 * runtime is zero it would be lost after deserialization. The same applies when initializing uint64_t and
151 * other values: update or add a conditional serialization check. This is to minimize the amount of
152 * serialized data that is sent to the sd-executor, so that there is less work to do on the default
153 * cases. */
154
155 *c = (CGroupContext) {
156 .cpu_weight = CGROUP_WEIGHT_INVALID,
157 .startup_cpu_weight = CGROUP_WEIGHT_INVALID,
158 .cpu_quota_per_sec_usec = USEC_INFINITY,
159 .cpu_quota_period_usec = USEC_INFINITY,
160
161 .cpu_shares = CGROUP_CPU_SHARES_INVALID,
162 .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID,
163
164 .memory_high = CGROUP_LIMIT_MAX,
165 .startup_memory_high = CGROUP_LIMIT_MAX,
166 .memory_max = CGROUP_LIMIT_MAX,
167 .startup_memory_max = CGROUP_LIMIT_MAX,
168 .memory_swap_max = CGROUP_LIMIT_MAX,
169 .startup_memory_swap_max = CGROUP_LIMIT_MAX,
170 .memory_zswap_max = CGROUP_LIMIT_MAX,
171 .startup_memory_zswap_max = CGROUP_LIMIT_MAX,
172
173 .memory_limit = CGROUP_LIMIT_MAX,
174
175 .io_weight = CGROUP_WEIGHT_INVALID,
176 .startup_io_weight = CGROUP_WEIGHT_INVALID,
177
178 .blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
179 .startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
180
181 .tasks_max = CGROUP_TASKS_MAX_UNSET,
182
183 .moom_swap = MANAGED_OOM_AUTO,
184 .moom_mem_pressure = MANAGED_OOM_AUTO,
185 .moom_preference = MANAGED_OOM_PREFERENCE_NONE,
186
187 .memory_pressure_watch = _CGROUP_PRESSURE_WATCH_INVALID,
188 .memory_pressure_threshold_usec = USEC_INFINITY,
189 };
190 }
191
192 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
193 assert(c);
194 assert(a);
195
196 LIST_REMOVE(device_allow, c->device_allow, a);
197 free(a->path);
198 free(a);
199 }
200
201 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
202 assert(c);
203 assert(w);
204
205 LIST_REMOVE(device_weights, c->io_device_weights, w);
206 free(w->path);
207 free(w);
208 }
209
210 void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) {
211 assert(c);
212 assert(l);
213
214 LIST_REMOVE(device_latencies, c->io_device_latencies, l);
215 free(l->path);
216 free(l);
217 }
218
219 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
220 assert(c);
221 assert(l);
222
223 LIST_REMOVE(device_limits, c->io_device_limits, l);
224 free(l->path);
225 free(l);
226 }
227
228 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
229 assert(c);
230 assert(w);
231
232 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
233 free(w->path);
234 free(w);
235 }
236
237 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
238 assert(c);
239 assert(b);
240
241 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
242 free(b->path);
243 free(b);
244 }
245
246 void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p) {
247 assert(c);
248 assert(p);
249
250 LIST_REMOVE(programs, c->bpf_foreign_programs, p);
251 free(p->bpffs_path);
252 free(p);
253 }
254
255 void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head) {
256 assert(head);
257
258 LIST_CLEAR(socket_bind_items, *head, free);
259 }
260
261 void cgroup_context_done(CGroupContext *c) {
262 assert(c);
263
264 while (c->io_device_weights)
265 cgroup_context_free_io_device_weight(c, c->io_device_weights);
266
267 while (c->io_device_latencies)
268 cgroup_context_free_io_device_latency(c, c->io_device_latencies);
269
270 while (c->io_device_limits)
271 cgroup_context_free_io_device_limit(c, c->io_device_limits);
272
273 while (c->blockio_device_weights)
274 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
275
276 while (c->blockio_device_bandwidths)
277 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
278
279 while (c->device_allow)
280 cgroup_context_free_device_allow(c, c->device_allow);
281
282 cgroup_context_remove_socket_bind(&c->socket_bind_allow);
283 cgroup_context_remove_socket_bind(&c->socket_bind_deny);
284
285 c->ip_address_allow = set_free(c->ip_address_allow);
286 c->ip_address_deny = set_free(c->ip_address_deny);
287
288 c->ip_filters_ingress = strv_free(c->ip_filters_ingress);
289 c->ip_filters_egress = strv_free(c->ip_filters_egress);
290
291 while (c->bpf_foreign_programs)
292 cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs);
293
294 c->restrict_network_interfaces = set_free_free(c->restrict_network_interfaces);
295
296 cpu_set_reset(&c->cpuset_cpus);
297 cpu_set_reset(&c->startup_cpuset_cpus);
298 cpu_set_reset(&c->cpuset_mems);
299 cpu_set_reset(&c->startup_cpuset_mems);
300
301 c->delegate_subgroup = mfree(c->delegate_subgroup);
302
303 nft_set_context_clear(&c->nft_set_context);
304 }
305
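/* Query the kernel's current value of the given memory controller attribute (e.g. "memory.max") for this
 * unit's cgroup. Returns -EOWNERDEAD if the cgroup has not been realized yet. */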
306 static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
307 assert(u);
308
309 if (!u->cgroup_realized)
310 return -EOWNERDEAD;
311
312 return cg_get_attribute_as_uint64("memory", u->cgroup_path, file, ret);
313 }
314
315 static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) {
316 CGroupContext *c;
317 CGroupMask m;
318 const char *file;
319 uint64_t unit_value;
320 int r;
321
322 /* Compare kernel memcg configuration against our internal systemd state. Unsupported (and will
323 * return -ENODATA) on cgroup v1.
324 *
325 * Returns:
326 *
327 * <0: On error.
328 * 0: If the kernel memory setting doesn't match our configuration.
329 * >0: If the kernel memory setting matches our configuration.
330 *
331 * The following values are only guaranteed to be populated on return >=0:
332 *
333 * - ret_unit_value will contain our internal expected value for the unit, page-aligned.
334 * - ret_kernel_value will contain the actual value presented by the kernel. */
335
336 assert(u);
337
338 r = cg_all_unified();
339 if (r < 0)
340 return log_debug_errno(r, "Failed to determine cgroup hierarchy version: %m");
341
342 /* Unsupported on v1.
343 *
344 * We don't return ENOENT, since that could actually mask a genuine problem where somebody else has
345 * silently masked the controller. */
346 if (r == 0)
347 return -ENODATA;
348
349 /* The root slice doesn't have any controller files, so we can't compare anything. */
350 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
351 return -ENODATA;
352
353 /* It's possible to have MemoryFoo set without systemd wanting to have the memory controller enabled,
354 * for example, in the case of DisableControllers= or cgroup_disable on the kernel command line. To
355 * avoid specious errors in these scenarios, check that we even expect the memory controller to be
356 * enabled at all. */
357 m = unit_get_target_mask(u);
358 if (!FLAGS_SET(m, CGROUP_MASK_MEMORY))
359 return -ENODATA;
360
361 assert_se(c = unit_get_cgroup_context(u));
362
363 bool startup = u->manager && IN_SET(manager_state(u->manager), MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
364
365 if (streq(property_name, "MemoryLow")) {
366 unit_value = unit_get_ancestor_memory_low(u);
367 file = "memory.low";
368 } else if (startup && streq(property_name, "StartupMemoryLow")) {
369 unit_value = unit_get_ancestor_startup_memory_low(u);
370 file = "memory.low";
371 } else if (streq(property_name, "MemoryMin")) {
372 unit_value = unit_get_ancestor_memory_min(u);
373 file = "memory.min";
374 } else if (streq(property_name, "MemoryHigh")) {
375 unit_value = c->memory_high;
376 file = "memory.high";
377 } else if (startup && streq(property_name, "StartupMemoryHigh")) {
378 unit_value = c->startup_memory_high;
379 file = "memory.high";
380 } else if (streq(property_name, "MemoryMax")) {
381 unit_value = c->memory_max;
382 file = "memory.max";
383 } else if (startup && streq(property_name, "StartupMemoryMax")) {
384 unit_value = c->startup_memory_max;
385 file = "memory.max";
386 } else if (streq(property_name, "MemorySwapMax")) {
387 unit_value = c->memory_swap_max;
388 file = "memory.swap.max";
389 } else if (startup && streq(property_name, "StartupMemorySwapMax")) {
390 unit_value = c->startup_memory_swap_max;
391 file = "memory.swap.max";
392 } else if (streq(property_name, "MemoryZSwapMax")) {
393 unit_value = c->memory_zswap_max;
394 file = "memory.zswap.max";
395 } else if (startup && streq(property_name, "StartupMemoryZSwapMax")) {
396 unit_value = c->startup_memory_zswap_max;
397 file = "memory.zswap.max";
398 } else
399 return -EINVAL;
400
401 r = unit_get_kernel_memory_limit(u, file, ret_kernel_value);
402 if (r < 0)
403 return log_unit_debug_errno(u, r, "Failed to parse %s: %m", file);
404
405 /* It's intended (soon) in a future kernel to not expose cgroup memory limits rounded to page
406 * boundaries, but instead separate the user-exposed limit, which is whatever userspace told us, from
407 * our internal page-counting. To support those future kernels, just check the value itself first
408 * without any page-alignment. */
409 if (*ret_kernel_value == unit_value) {
410 *ret_unit_value = unit_value;
411 return 1;
412 }
413
414 /* The current kernel behaviour, by comparison, is that even if you write a particular number of
415 * bytes into a cgroup memory file, it always returns that number page-aligned down (since the kernel
416 * internally stores cgroup limits in pages). As such, so long as it aligns properly, everything is
417 * cricket. */
418 if (unit_value != CGROUP_LIMIT_MAX)
419 unit_value = PAGE_ALIGN_DOWN(unit_value);
420
421 *ret_unit_value = unit_value;
422
423 return *ret_kernel_value == *ret_unit_value;
424 }
425
426 #define FORMAT_CGROUP_DIFF_MAX 128
427
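/* Format a short suffix for cgroup_context_dump() describing how the kernel's view of a memory limit
 * compares to our configuration: empty if they match (or the attribute is unavailable), otherwise a note
 * carrying the kernel's value or the error we hit while reading it. */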
428 static char *format_cgroup_memory_limit_comparison(char *buf, size_t l, Unit *u, const char *property_name) {
429 uint64_t kval, sval;
430 int r;
431
432 assert(u);
433 assert(buf);
434 assert(l > 0);
435
436 r = unit_compare_memory_limit(u, property_name, &sval, &kval);
437
438 /* memory.swap.max is special in that it relies on CONFIG_MEMCG_SWAP (and the default swapaccount=1).
439 * Since we cannot reliably detect whether memcg swap support is available,
440 * only complain if the error is not ENOENT. This is similarly the case for memory.zswap.max relying
441 * on CONFIG_ZSWAP. */
442 if (r > 0 || IN_SET(r, -ENODATA, -EOWNERDEAD) ||
443 (r == -ENOENT && STR_IN_SET(property_name,
444 "MemorySwapMax",
445 "StartupMemorySwapMax",
446 "MemoryZSwapMax",
447 "StartupMemoryZSwapMax")))
448 buf[0] = 0;
449 else if (r < 0) {
450 errno = -r;
451 (void) snprintf(buf, l, " (error getting kernel value: %m)");
452 } else
453 (void) snprintf(buf, l, " (different value in kernel: %" PRIu64 ")", kval);
454
455 return buf;
456 }
457
458 const char *cgroup_device_permissions_to_string(CGroupDevicePermissions p) {
459 static const char *table[_CGROUP_DEVICE_PERMISSIONS_MAX] = {
460 /* Let's simply define a table with every possible combination. As long as those are just 8 we
461 * can get away with it. If this ever grows beyond that we need to revisit this logic though. */
462 [0] = "",
463 [CGROUP_DEVICE_READ] = "r",
464 [CGROUP_DEVICE_WRITE] = "w",
465 [CGROUP_DEVICE_MKNOD] = "m",
466 [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE] = "rw",
467 [CGROUP_DEVICE_READ|CGROUP_DEVICE_MKNOD] = "rm",
468 [CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD] = "wm",
469 [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD] = "rwm",
470 };
471
472 if (p < 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX)
473 return NULL;
474
475 return table[p];
476 }
477
478 CGroupDevicePermissions cgroup_device_permissions_from_string(const char *s) {
479 CGroupDevicePermissions p = 0;
480
481 if (!s)
482 return _CGROUP_DEVICE_PERMISSIONS_INVALID;
483
484 for (const char *c = s; *c; c++) {
485 if (*c == 'r')
486 p |= CGROUP_DEVICE_READ;
487 else if (*c == 'w')
488 p |= CGROUP_DEVICE_WRITE;
489 else if (*c == 'm')
490 p |= CGROUP_DEVICE_MKNOD;
491 else
492 return _CGROUP_DEVICE_PERMISSIONS_INVALID;
493 }
494
495 return p;
496 }
497
498 void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
499 _cleanup_free_ char *disable_controllers_str = NULL, *delegate_controllers_str = NULL, *cpuset_cpus = NULL, *cpuset_mems = NULL, *startup_cpuset_cpus = NULL, *startup_cpuset_mems = NULL;
500 CGroupContext *c;
501 struct in_addr_prefix *iaai;
502
503 char cda[FORMAT_CGROUP_DIFF_MAX];
504 char cdb[FORMAT_CGROUP_DIFF_MAX];
505 char cdc[FORMAT_CGROUP_DIFF_MAX];
506 char cdd[FORMAT_CGROUP_DIFF_MAX];
507 char cde[FORMAT_CGROUP_DIFF_MAX];
508 char cdf[FORMAT_CGROUP_DIFF_MAX];
509 char cdg[FORMAT_CGROUP_DIFF_MAX];
510 char cdh[FORMAT_CGROUP_DIFF_MAX];
511 char cdi[FORMAT_CGROUP_DIFF_MAX];
512 char cdj[FORMAT_CGROUP_DIFF_MAX];
513 char cdk[FORMAT_CGROUP_DIFF_MAX];
514
515 assert(u);
516 assert(f);
517
518 assert_se(c = unit_get_cgroup_context(u));
519
520 prefix = strempty(prefix);
521
522 (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
523 (void) cg_mask_to_string(c->delegate_controllers, &delegate_controllers_str);
524
525 /* "Delegate=" means "yes, but no controllers". Show this as "(none)". */
526 const char *delegate_str = delegate_controllers_str ?: c->delegate ? "(none)" : "no";
527
528 cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus);
529 startup_cpuset_cpus = cpu_set_to_range_string(&c->startup_cpuset_cpus);
530 cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems);
531 startup_cpuset_mems = cpu_set_to_range_string(&c->startup_cpuset_mems);
532
533 fprintf(f,
534 "%sCPUAccounting: %s\n"
535 "%sIOAccounting: %s\n"
536 "%sBlockIOAccounting: %s\n"
537 "%sMemoryAccounting: %s\n"
538 "%sTasksAccounting: %s\n"
539 "%sIPAccounting: %s\n"
540 "%sCPUWeight: %" PRIu64 "\n"
541 "%sStartupCPUWeight: %" PRIu64 "\n"
542 "%sCPUShares: %" PRIu64 "\n"
543 "%sStartupCPUShares: %" PRIu64 "\n"
544 "%sCPUQuotaPerSecSec: %s\n"
545 "%sCPUQuotaPeriodSec: %s\n"
546 "%sAllowedCPUs: %s\n"
547 "%sStartupAllowedCPUs: %s\n"
548 "%sAllowedMemoryNodes: %s\n"
549 "%sStartupAllowedMemoryNodes: %s\n"
550 "%sIOWeight: %" PRIu64 "\n"
551 "%sStartupIOWeight: %" PRIu64 "\n"
552 "%sBlockIOWeight: %" PRIu64 "\n"
553 "%sStartupBlockIOWeight: %" PRIu64 "\n"
554 "%sDefaultMemoryMin: %" PRIu64 "\n"
555 "%sDefaultMemoryLow: %" PRIu64 "\n"
556 "%sMemoryMin: %" PRIu64 "%s\n"
557 "%sMemoryLow: %" PRIu64 "%s\n"
558 "%sStartupMemoryLow: %" PRIu64 "%s\n"
559 "%sMemoryHigh: %" PRIu64 "%s\n"
560 "%sStartupMemoryHigh: %" PRIu64 "%s\n"
561 "%sMemoryMax: %" PRIu64 "%s\n"
562 "%sStartupMemoryMax: %" PRIu64 "%s\n"
563 "%sMemorySwapMax: %" PRIu64 "%s\n"
564 "%sStartupMemorySwapMax: %" PRIu64 "%s\n"
565 "%sMemoryZSwapMax: %" PRIu64 "%s\n"
566 "%sStartupMemoryZSwapMax: %" PRIu64 "%s\n"
567 "%sMemoryLimit: %" PRIu64 "\n"
568 "%sTasksMax: %" PRIu64 "\n"
569 "%sDevicePolicy: %s\n"
570 "%sDisableControllers: %s\n"
571 "%sDelegate: %s\n"
572 "%sManagedOOMSwap: %s\n"
573 "%sManagedOOMMemoryPressure: %s\n"
574 "%sManagedOOMMemoryPressureLimit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
575 "%sManagedOOMPreference: %s\n"
576 "%sMemoryPressureWatch: %s\n"
577 "%sCoredumpReceive: %s\n",
578 prefix, yes_no(c->cpu_accounting),
579 prefix, yes_no(c->io_accounting),
580 prefix, yes_no(c->blockio_accounting),
581 prefix, yes_no(c->memory_accounting),
582 prefix, yes_no(c->tasks_accounting),
583 prefix, yes_no(c->ip_accounting),
584 prefix, c->cpu_weight,
585 prefix, c->startup_cpu_weight,
586 prefix, c->cpu_shares,
587 prefix, c->startup_cpu_shares,
588 prefix, FORMAT_TIMESPAN(c->cpu_quota_per_sec_usec, 1),
589 prefix, FORMAT_TIMESPAN(c->cpu_quota_period_usec, 1),
590 prefix, strempty(cpuset_cpus),
591 prefix, strempty(startup_cpuset_cpus),
592 prefix, strempty(cpuset_mems),
593 prefix, strempty(startup_cpuset_mems),
594 prefix, c->io_weight,
595 prefix, c->startup_io_weight,
596 prefix, c->blockio_weight,
597 prefix, c->startup_blockio_weight,
598 prefix, c->default_memory_min,
599 prefix, c->default_memory_low,
600 prefix, c->memory_min, format_cgroup_memory_limit_comparison(cda, sizeof(cda), u, "MemoryMin"),
601 prefix, c->memory_low, format_cgroup_memory_limit_comparison(cdb, sizeof(cdb), u, "MemoryLow"),
602 prefix, c->startup_memory_low, format_cgroup_memory_limit_comparison(cdc, sizeof(cdc), u, "StartupMemoryLow"),
603 prefix, c->memory_high, format_cgroup_memory_limit_comparison(cdd, sizeof(cdd), u, "MemoryHigh"),
604 prefix, c->startup_memory_high, format_cgroup_memory_limit_comparison(cde, sizeof(cde), u, "StartupMemoryHigh"),
605 prefix, c->memory_max, format_cgroup_memory_limit_comparison(cdf, sizeof(cdf), u, "MemoryMax"),
606 prefix, c->startup_memory_max, format_cgroup_memory_limit_comparison(cdg, sizeof(cdg), u, "StartupMemoryMax"),
607 prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(cdh, sizeof(cdh), u, "MemorySwapMax"),
608 prefix, c->startup_memory_swap_max, format_cgroup_memory_limit_comparison(cdi, sizeof(cdi), u, "StartupMemorySwapMax"),
609 prefix, c->memory_zswap_max, format_cgroup_memory_limit_comparison(cdj, sizeof(cdj), u, "MemoryZSwapMax"),
610 prefix, c->startup_memory_zswap_max, format_cgroup_memory_limit_comparison(cdk, sizeof(cdk), u, "StartupMemoryZSwapMax"),
611 prefix, c->memory_limit,
612 prefix, cgroup_tasks_max_resolve(&c->tasks_max),
613 prefix, cgroup_device_policy_to_string(c->device_policy),
614 prefix, strempty(disable_controllers_str),
615 prefix, delegate_str,
616 prefix, managed_oom_mode_to_string(c->moom_swap),
617 prefix, managed_oom_mode_to_string(c->moom_mem_pressure),
618 prefix, PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(c->moom_mem_pressure_limit)),
619 prefix, managed_oom_preference_to_string(c->moom_preference),
620 prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch),
621 prefix, yes_no(c->coredump_receive));
622
623 if (c->delegate_subgroup)
624 fprintf(f, "%sDelegateSubgroup: %s\n",
625 prefix, c->delegate_subgroup);
626
627 if (c->memory_pressure_threshold_usec != USEC_INFINITY)
628 fprintf(f, "%sMemoryPressureThresholdSec: %s\n",
629 prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1));
630
631 LIST_FOREACH(device_allow, a, c->device_allow)
632 /* strna() below should be redundant, but is kept to avoid a -Werror=format-overflow= false positive. See #30223. */
633 fprintf(f,
634 "%sDeviceAllow: %s %s\n",
635 prefix,
636 a->path,
637 strna(cgroup_device_permissions_to_string(a->permissions)));
638
639 LIST_FOREACH(device_weights, iw, c->io_device_weights)
640 fprintf(f,
641 "%sIODeviceWeight: %s %" PRIu64 "\n",
642 prefix,
643 iw->path,
644 iw->weight);
645
646 LIST_FOREACH(device_latencies, l, c->io_device_latencies)
647 fprintf(f,
648 "%sIODeviceLatencyTargetSec: %s %s\n",
649 prefix,
650 l->path,
651 FORMAT_TIMESPAN(l->target_usec, 1));
652
653 LIST_FOREACH(device_limits, il, c->io_device_limits)
654 for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
655 if (il->limits[type] != cgroup_io_limit_defaults[type])
656 fprintf(f,
657 "%s%s: %s %s\n",
658 prefix,
659 cgroup_io_limit_type_to_string(type),
660 il->path,
661 FORMAT_BYTES(il->limits[type]));
662
663 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
664 fprintf(f,
665 "%sBlockIODeviceWeight: %s %" PRIu64,
666 prefix,
667 w->path,
668 w->weight);
669
670 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
671 if (b->rbps != CGROUP_LIMIT_MAX)
672 fprintf(f,
673 "%sBlockIOReadBandwidth: %s %s\n",
674 prefix,
675 b->path,
676 FORMAT_BYTES(b->rbps));
677 if (b->wbps != CGROUP_LIMIT_MAX)
678 fprintf(f,
679 "%sBlockIOWriteBandwidth: %s %s\n",
680 prefix,
681 b->path,
682 FORMAT_BYTES(b->wbps));
683 }
684
685 SET_FOREACH(iaai, c->ip_address_allow)
686 fprintf(f, "%sIPAddressAllow: %s\n", prefix,
687 IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
688 SET_FOREACH(iaai, c->ip_address_deny)
689 fprintf(f, "%sIPAddressDeny: %s\n", prefix,
690 IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
691
692 STRV_FOREACH(path, c->ip_filters_ingress)
693 fprintf(f, "%sIPIngressFilterPath: %s\n", prefix, *path);
694 STRV_FOREACH(path, c->ip_filters_egress)
695 fprintf(f, "%sIPEgressFilterPath: %s\n", prefix, *path);
696
697 LIST_FOREACH(programs, p, c->bpf_foreign_programs)
698 fprintf(f, "%sBPFProgram: %s:%s",
699 prefix, bpf_cgroup_attach_type_to_string(p->attach_type), p->bpffs_path);
700
701 if (c->socket_bind_allow) {
702 fprintf(f, "%sSocketBindAllow: ", prefix);
703 cgroup_context_dump_socket_bind_items(c->socket_bind_allow, f);
704 fputc('\n', f);
705 }
706
707 if (c->socket_bind_deny) {
708 fprintf(f, "%sSocketBindDeny: ", prefix);
709 cgroup_context_dump_socket_bind_items(c->socket_bind_deny, f);
710 fputc('\n', f);
711 }
712
713 if (c->restrict_network_interfaces) {
714 char *iface;
715 SET_FOREACH(iface, c->restrict_network_interfaces)
716 fprintf(f, "%sRestrictNetworkInterfaces: %s\n", prefix, iface);
717 }
718
719 FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets)
720 fprintf(f, "%sNFTSet: %s:%s:%s:%s\n", prefix, nft_set_source_to_string(nft_set->source),
721 nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set);
722 }
723
724 void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f) {
725 const char *family, *colon1, *protocol = "", *colon2 = "";
726
727 family = strempty(af_to_ipv4_ipv6(item->address_family));
728 colon1 = isempty(family) ? "" : ":";
729
730 if (item->ip_protocol != 0) {
731 protocol = ip_protocol_to_tcp_udp(item->ip_protocol);
732 colon2 = ":";
733 }
734
735 if (item->nr_ports == 0)
736 fprintf(f, "%s%s%s%sany", family, colon1, protocol, colon2);
737 else if (item->nr_ports == 1)
738 fprintf(f, "%s%s%s%s%" PRIu16, family, colon1, protocol, colon2, item->port_min);
739 else {
740 uint16_t port_max = item->port_min + item->nr_ports - 1;
741 fprintf(f, "%s%s%s%s%" PRIu16 "-%" PRIu16, family, colon1, protocol, colon2,
742 item->port_min, port_max);
743 }
744 }
745
746 void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f) {
747 bool first = true;
748
749 LIST_FOREACH(socket_bind_items, bi, items) {
750 if (first)
751 first = false;
752 else
753 fputc(' ', f);
754
755 cgroup_context_dump_socket_bind_item(bi, f);
756 }
757 }
758
759 int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p) {
760 _cleanup_free_ CGroupDeviceAllow *a = NULL;
761 _cleanup_free_ char *d = NULL;
762
763 assert(c);
764 assert(dev);
765 assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX);
766
767 if (p == 0)
768 p = _CGROUP_DEVICE_PERMISSIONS_ALL;
769
770 a = new(CGroupDeviceAllow, 1);
771 if (!a)
772 return -ENOMEM;
773
774 d = strdup(dev);
775 if (!d)
776 return -ENOMEM;
777
778 *a = (CGroupDeviceAllow) {
779 .path = TAKE_PTR(d),
780 .permissions = p,
781 };
782
783 LIST_PREPEND(device_allow, c->device_allow, a);
784 TAKE_PTR(a);
785
786 return 0;
787 }
788
789 int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p) {
790 assert(c);
791 assert(dev);
792 assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX);
793
794 if (p == 0)
795 p = _CGROUP_DEVICE_PERMISSIONS_ALL;
796
797 LIST_FOREACH(device_allow, b, c->device_allow)
798 if (path_equal(b->path, dev)) {
799 b->permissions = p;
800 return 0;
801 }
802
803 return cgroup_context_add_device_allow(c, dev, p);
804 }
805
806 int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *bpffs_path) {
807 CGroupBPFForeignProgram *p;
808 _cleanup_free_ char *d = NULL;
809
810 assert(c);
811 assert(bpffs_path);
812
813 if (!path_is_normalized(bpffs_path) || !path_is_absolute(bpffs_path))
814 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path '%s' is not absolute or not normalized.", bpffs_path);
815
816 d = strdup(bpffs_path);
817 if (!d)
818 return log_oom();
819
820 p = new(CGroupBPFForeignProgram, 1);
821 if (!p)
822 return log_oom();
823
824 *p = (CGroupBPFForeignProgram) {
825 .attach_type = attach_type,
826 .bpffs_path = TAKE_PTR(d),
827 };
828
829 LIST_PREPEND(programs, c->bpf_foreign_programs, TAKE_PTR(p));
830
831 return 0;
832 }
833
834 #define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry) \
835 uint64_t unit_get_ancestor_##entry(Unit *u) { \
836 CGroupContext *c; \
837 \
838 /* 1. Is entry set in this unit? If so, use that. \
839 * 2. Is the default for this entry set in any \
840 * ancestor? If so, use that. \
841 * 3. Otherwise, return CGROUP_LIMIT_MIN. */ \
842 \
843 assert(u); \
844 \
845 c = unit_get_cgroup_context(u); \
846 if (c && c->entry##_set) \
847 return c->entry; \
848 \
849 while ((u = UNIT_GET_SLICE(u))) { \
850 c = unit_get_cgroup_context(u); \
851 if (c && c->default_##entry##_set) \
852 return c->default_##entry; \
853 } \
854 \
855 /* We've reached the root, but nobody had the default \
856 * for this entry set, so return the kernel default. */ \
857 return CGROUP_LIMIT_MIN; \
858 }
859
860 UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
861 UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(startup_memory_low);
862 UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);
863
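/* Best-effort setting of an xattr on the unit's cgroup: failures are only logged at debug level, since
 * older kernels don't support xattrs on cgroups at all. */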
864 static void unit_set_xattr_graceful(Unit *u, const char *name, const void *data, size_t size) {
865 int r;
866
867 assert(u);
868 assert(name);
869
870 if (!u->cgroup_path)
871 return;
872
873 r = cg_set_xattr(u->cgroup_path, name, data, size, 0);
874 if (r < 0)
875 log_unit_debug_errno(u, r, "Failed to set '%s' xattr on control group %s, ignoring: %m", name, empty_to_root(u->cgroup_path));
876 }
877
878 static void unit_remove_xattr_graceful(Unit *u, const char *name) {
879 int r;
880
881 assert(u);
882 assert(name);
883
884 if (!u->cgroup_path)
885 return;
886
887 r = cg_remove_xattr(u->cgroup_path, name);
888 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
889 log_unit_debug_errno(u, r, "Failed to remove '%s' xattr flag on control group %s, ignoring: %m", name, empty_to_root(u->cgroup_path));
890 }
891
892 static void cgroup_oomd_xattr_apply(Unit *u) {
893 CGroupContext *c;
894
895 assert(u);
896
897 c = unit_get_cgroup_context(u);
898 if (!c)
899 return;
900
901 if (c->moom_preference == MANAGED_OOM_PREFERENCE_OMIT)
902 unit_set_xattr_graceful(u, "user.oomd_omit", "1", 1);
903
904 if (c->moom_preference == MANAGED_OOM_PREFERENCE_AVOID)
905 unit_set_xattr_graceful(u, "user.oomd_avoid", "1", 1);
906
907 if (c->moom_preference != MANAGED_OOM_PREFERENCE_AVOID)
908 unit_remove_xattr_graceful(u, "user.oomd_avoid");
909
910 if (c->moom_preference != MANAGED_OOM_PREFERENCE_OMIT)
911 unit_remove_xattr_graceful(u, "user.oomd_omit");
912 }
913
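/* Publish the unit's log filter patterns as the "user.journald_log_filter_patterns" cgroup xattr: the
 * allowed and denied pattern sets are encoded as two NUL-separated lists joined by a 0xff byte. */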
914 static int cgroup_log_xattr_apply(Unit *u) {
915 ExecContext *c;
916 size_t len, allowed_patterns_len, denied_patterns_len;
917 _cleanup_free_ char *patterns = NULL, *allowed_patterns = NULL, *denied_patterns = NULL;
918 char *last;
919 int r;
920
921 assert(u);
922
923 c = unit_get_exec_context(u);
924 if (!c)
925 /* Some unit types have a cgroup context but no exec context, so we do not log
926 * any error here to avoid confusion. */
927 return 0;
928
929 if (set_isempty(c->log_filter_allowed_patterns) && set_isempty(c->log_filter_denied_patterns)) {
930 unit_remove_xattr_graceful(u, "user.journald_log_filter_patterns");
931 return 0;
932 }
933
934 r = set_make_nulstr(c->log_filter_allowed_patterns, &allowed_patterns, &allowed_patterns_len);
935 if (r < 0)
936 return log_debug_errno(r, "Failed to make nulstr from set: %m");
937
938 r = set_make_nulstr(c->log_filter_denied_patterns, &denied_patterns, &denied_patterns_len);
939 if (r < 0)
940 return log_debug_errno(r, "Failed to make nulstr from set: %m");
941
942 /* Use nul character separated strings without trailing nul */
943 allowed_patterns_len = LESS_BY(allowed_patterns_len, 1u);
944 denied_patterns_len = LESS_BY(denied_patterns_len, 1u);
945
946 len = allowed_patterns_len + 1 + denied_patterns_len;
947 patterns = new(char, len);
948 if (!patterns)
949 return log_oom_debug();
950
951 last = mempcpy_safe(patterns, allowed_patterns, allowed_patterns_len);
952 *(last++) = '\xff';
953 memcpy_safe(last, denied_patterns, denied_patterns_len);
954
955 unit_set_xattr_graceful(u, "user.journald_log_filter_patterns", patterns, len);
956
957 return 0;
958 }
959
960 static void cgroup_invocation_id_xattr_apply(Unit *u) {
961 bool b;
962
963 assert(u);
964
965 b = !sd_id128_is_null(u->invocation_id);
966 FOREACH_STRING(xn, "trusted.invocation_id", "user.invocation_id") {
967 if (b)
968 unit_set_xattr_graceful(u, xn, SD_ID128_TO_STRING(u->invocation_id), 32);
969 else
970 unit_remove_xattr_graceful(u, xn);
971 }
972 }
973
974 static void cgroup_coredump_xattr_apply(Unit *u) {
975 CGroupContext *c;
976
977 assert(u);
978
979 c = unit_get_cgroup_context(u);
980 if (!c)
981 return;
982
983 if (unit_cgroup_delegate(u) && c->coredump_receive)
984 unit_set_xattr_graceful(u, "user.coredump_receive", "1", 1);
985 else
986 unit_remove_xattr_graceful(u, "user.coredump_receive");
987 }
988
989 static void cgroup_delegate_xattr_apply(Unit *u) {
990 bool b;
991
992 assert(u);
993
994 /* Indicate on the cgroup whether delegation is on, via an xattr. This is best-effort, as old kernels
995 * didn't support xattrs on cgroups at all. Later they got support for setting 'trusted.*' xattrs,
996 * and even later 'user.*' xattrs. We started setting this field when 'trusted.*' was added, and
997 * given this is now pretty much API, let's continue to support that. But also set 'user.*' as well,
998 * since it is readable by any user, not just CAP_SYS_ADMIN. This hence comes with slightly weaker
999 * security (as users who got delegated cgroups could turn it off if they like), but this shouldn't
1000 * be a big problem given this only communicates delegation state to clients, and the manager never reads
1001 * it. */
1002 b = unit_cgroup_delegate(u);
1003 FOREACH_STRING(xn, "trusted.delegate", "user.delegate") {
1004 if (b)
1005 unit_set_xattr_graceful(u, xn, "1", 1);
1006 else
1007 unit_remove_xattr_graceful(u, xn);
1008 }
1009 }
1010
1011 static void cgroup_survive_xattr_apply(Unit *u) {
1012 int r;
1013
1014 assert(u);
1015
1016 if (u->survive_final_kill_signal) {
1017 r = cg_set_xattr(
1018 u->cgroup_path,
1019 "user.survive_final_kill_signal",
1020 "1",
1021 1,
1022 /* flags= */ 0);
1023 /* user xattr support was added in kernel v5.7 */
1024 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
1025 r = cg_set_xattr(
1026 u->cgroup_path,
1027 "trusted.survive_final_kill_signal",
1028 "1",
1029 1,
1030 /* flags= */ 0);
1031 if (r < 0)
1032 log_unit_debug_errno(u,
1033 r,
1034 "Failed to set 'survive_final_kill_signal' xattr on control "
1035 "group %s, ignoring: %m",
1036 empty_to_root(u->cgroup_path));
1037 } else {
1038 unit_remove_xattr_graceful(u, "user.survive_final_kill_signal");
1039 unit_remove_xattr_graceful(u, "trusted.survive_final_kill_signal");
1040 }
1041 }
1042
1043 static void cgroup_xattr_apply(Unit *u) {
1044 assert(u);
1045
1046 /* The 'user.*' xattrs can be set from a user manager. */
1047 cgroup_oomd_xattr_apply(u);
1048 cgroup_log_xattr_apply(u);
1049 cgroup_coredump_xattr_apply(u);
1050
1051 if (!MANAGER_IS_SYSTEM(u->manager))
1052 return;
1053
1054 cgroup_invocation_id_xattr_apply(u);
1055 cgroup_delegate_xattr_apply(u);
1056 cgroup_survive_xattr_apply(u);
1057 }
1058
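/* Resolve a path to the block device relevant for I/O control: either the device node itself, or the
 * whole-disk device backing the file system the path lives on, following btrfs, LUKS/DM and partition
 * indirections. */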
1059 static int lookup_block_device(const char *p, dev_t *ret) {
1060 dev_t rdev, dev = 0;
1061 mode_t mode;
1062 int r;
1063
1064 assert(p);
1065 assert(ret);
1066
1067 r = device_path_parse_major_minor(p, &mode, &rdev);
1068 if (r == -ENODEV) { /* not a parsable device node, need to go to disk */
1069 struct stat st;
1070
1071 if (stat(p, &st) < 0)
1072 return log_warning_errno(errno, "Couldn't stat device '%s': %m", p);
1073
1074 mode = st.st_mode;
1075 rdev = st.st_rdev;
1076 dev = st.st_dev;
1077 } else if (r < 0)
1078 return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p);
1079
1080 if (S_ISCHR(mode))
1081 return log_warning_errno(SYNTHETIC_ERRNO(ENOTBLK),
1082 "Device node '%s' is a character device, but block device needed.", p);
1083 if (S_ISBLK(mode))
1084 *ret = rdev;
1085 else if (major(dev) != 0)
1086 *ret = dev; /* If this is not a device node then use the block device this file is stored on */
1087 else {
1088 /* If this is btrfs, getting the backing block device is a bit harder */
1089 r = btrfs_get_block_device(p, ret);
1090 if (r == -ENOTTY)
1091 return log_warning_errno(SYNTHETIC_ERRNO(ENODEV),
1092 "'%s' is not a block device node, and file system block device cannot be determined or is not local.", p);
1093 if (r < 0)
1094 return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p);
1095 }
1096
1097 /* If this is a LUKS/DM device, recursively try to get the originating block device */
1098 while (block_get_originating(*ret, ret) > 0);
1099
1100 /* If this is a partition, try to get the originating block device */
1101 (void) block_get_whole_disk(*ret, ret);
1102 return 0;
1103 }
1104
1105 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
1106 return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
1107 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
1108 }
1109
1110 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
1111 return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
1112 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
1113 }
1114
1115 static bool cgroup_context_has_allowed_cpus(CGroupContext *c) {
1116 return c->cpuset_cpus.set || c->startup_cpuset_cpus.set;
1117 }
1118
1119 static bool cgroup_context_has_allowed_mems(CGroupContext *c) {
1120 return c->cpuset_mems.set || c->startup_cpuset_mems.set;
1121 }
1122
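/* The helpers below pick the value to apply for the current manager state: the Startup* variant during
 * startup/shutdown (if set), otherwise the regular setting, falling back to the default. */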
1123 uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
1124 assert(c);
1125
1126 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1127 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
1128 return c->startup_cpu_weight;
1129 else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
1130 return c->cpu_weight;
1131 else
1132 return CGROUP_WEIGHT_DEFAULT;
1133 }
1134
1135 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
1136 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1137 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
1138 return c->startup_cpu_shares;
1139 else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
1140 return c->cpu_shares;
1141 else
1142 return CGROUP_CPU_SHARES_DEFAULT;
1143 }
1144
1145 static CPUSet *cgroup_context_allowed_cpus(CGroupContext *c, ManagerState state) {
1146 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1147 c->startup_cpuset_cpus.set)
1148 return &c->startup_cpuset_cpus;
1149 else
1150 return &c->cpuset_cpus;
1151 }
1152
1153 static CPUSet *cgroup_context_allowed_mems(CGroupContext *c, ManagerState state) {
1154 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1155 c->startup_cpuset_mems.set)
1156 return &c->startup_cpuset_mems;
1157 else
1158 return &c->cpuset_mems;
1159 }
1160
1161 usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
1162 /* The kernel uses a minimum resolution of 1ms, so both the period and the quota per period
1163 * (quota * period / USEC_PER_SEC) need to be at least that large. quota is specified in USecPerSec.
1164 * Additionally, period must be at most max_period. */
1165 assert(quota > 0);
1166
1167 return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period);
1168 }
1169
1170 static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
1171 usec_t new_period;
1172
1173 if (quota == USEC_INFINITY)
1174 /* Always use default period for infinity quota. */
1175 return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
1176
1177 if (period == USEC_INFINITY)
1178 /* Default period was requested. */
1179 period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
1180
1181 /* Clamp to interval [1ms, 1s] */
1182 new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);
1183
1184 if (new_period != period) {
1185 log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING,
1186 "Clamping CPU interval for cpu.max: period is now %s",
1187 FORMAT_TIMESPAN(new_period, 1));
1188 u->warned_clamping_cpu_quota_period = true;
1189 }
1190
1191 return new_period;
1192 }
1193
1194 static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) {
1195 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1196
1197 if (weight == CGROUP_WEIGHT_IDLE)
1198 return;
1199 xsprintf(buf, "%" PRIu64 "\n", weight);
1200 (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf);
1201 }
1202
1203 static void cgroup_apply_unified_cpu_idle(Unit *u, uint64_t weight) {
1204 int r;
1205 bool is_idle;
1206 const char *idle_val;
1207
1208 is_idle = weight == CGROUP_WEIGHT_IDLE;
1209 idle_val = one_zero(is_idle);
1210 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.idle", idle_val);
1211 if (r < 0 && (r != -ENOENT || is_idle))
1212 log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%s': %m",
1213 "cpu.idle", empty_to_root(u->cgroup_path), idle_val);
1214 }
1215
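/* Write "cpu.max" in its "$QUOTA $PERIOD" format: the per-second quota is scaled to the (possibly
 * clamped) period, with a 1ms floor, and "max" is used when no quota is configured. */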
1216 static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) {
1217 char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];
1218
1219 period = cgroup_cpu_adjust_period_and_log(u, period, quota);
1220 if (quota != USEC_INFINITY)
1221 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
1222 MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period);
1223 else
1224 xsprintf(buf, "max " USEC_FMT "\n", period);
1225 (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf);
1226 }
1227
1228 static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) {
1229 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1230
1231 xsprintf(buf, "%" PRIu64 "\n", shares);
1232 (void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf);
1233 }
1234
1235 static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota, usec_t period) {
1236 char buf[DECIMAL_STR_MAX(usec_t) + 2];
1237
1238 period = cgroup_cpu_adjust_period_and_log(u, period, quota);
1239
1240 xsprintf(buf, USEC_FMT "\n", period);
1241 (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf);
1242
1243 if (quota != USEC_INFINITY) {
1244 xsprintf(buf, USEC_FMT "\n", MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC));
1245 (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf);
1246 } else
1247 (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n");
1248 }
1249
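/* Convert between the legacy cpu.shares scale (default 1024) and the unified cpu.weight scale (default
 * 100), mapping linearly around the defaults and clamping to the valid range. */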
1250 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
1251 return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
1252 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
1253 }
1254
1255 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
1256 /* we don't support idle in cgroupv1 */
1257 if (weight == CGROUP_WEIGHT_IDLE)
1258 return CGROUP_CPU_SHARES_MIN;
1259
1260 return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
1261 CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
1262 }
1263
1264 static void cgroup_apply_unified_cpuset(Unit *u, const CPUSet *cpus, const char *name) {
1265 _cleanup_free_ char *buf = NULL;
1266
1267 buf = cpu_set_to_range_string(cpus);
1268 if (!buf) {
1269 log_oom();
1270 return;
1271 }
1272
1273 (void) set_attribute_and_warn(u, "cpuset", name, buf);
1274 }
1275
1276 static bool cgroup_context_has_io_config(CGroupContext *c) {
1277 return c->io_accounting ||
1278 c->io_weight != CGROUP_WEIGHT_INVALID ||
1279 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
1280 c->io_device_weights ||
1281 c->io_device_latencies ||
1282 c->io_device_limits;
1283 }
1284
1285 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
1286 return c->blockio_accounting ||
1287 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
1288 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
1289 c->blockio_device_weights ||
1290 c->blockio_device_bandwidths;
1291 }
1292
1293 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
1294 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1295 c->startup_io_weight != CGROUP_WEIGHT_INVALID)
1296 return c->startup_io_weight;
1297 if (c->io_weight != CGROUP_WEIGHT_INVALID)
1298 return c->io_weight;
1299 return CGROUP_WEIGHT_DEFAULT;
1300 }
1301
1302 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
1303 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1304 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
1305 return c->startup_blockio_weight;
1306 if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
1307 return c->blockio_weight;
1308 return CGROUP_BLKIO_WEIGHT_DEFAULT;
1309 }
1310
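/* Likewise, convert linearly between the legacy blkio weight scale and the unified io weight scale,
 * clamping to each controller's valid range. */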
1311 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
1312 return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
1313 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
1314 }
1315
1316 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
1317 return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
1318 CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
1319 }
1320
1321 static int set_bfq_weight(Unit *u, const char *controller, dev_t dev, uint64_t io_weight) {
1322 static const char * const prop_names[] = {
1323 "IOWeight",
1324 "BlockIOWeight",
1325 "IODeviceWeight",
1326 "BlockIODeviceWeight",
1327 };
1328 static bool warned = false;
1329 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+STRLEN("\n")];
1330 const char *p;
1331 uint64_t bfq_weight;
1332 int r;
1333
1334 /* FIXME: drop this function when distro kernels properly support BFQ through "io.weight"
1335 * See also: https://github.com/systemd/systemd/pull/13335 and
1336 * https://github.com/torvalds/linux/commit/65752aef0a407e1ef17ec78a7fc31ba4e0b360f9. */
1337 p = strjoina(controller, ".bfq.weight");
1338 /* Adjust to the kernel's accepted range of 1..1000; the default is 100. */
1339 bfq_weight = BFQ_WEIGHT(io_weight);
1340
1341 if (major(dev) > 0)
1342 xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), bfq_weight);
1343 else
1344 xsprintf(buf, "%" PRIu64 "\n", bfq_weight);
1345
1346 r = cg_set_attribute(controller, u->cgroup_path, p, buf);
1347
1348 /* FIXME: drop this when kernels prior to
1349 * 795fe54c2a82 ("bfq: Add per-device weight") v5.4
1350 * are not interesting anymore. Old kernels will fail with EINVAL, while new kernels won't return
1351 * EINVAL on properly formatted input by us. Treat EINVAL accordingly. */
1352 if (r == -EINVAL && major(dev) > 0) {
1353 if (!warned) {
1354 log_unit_warning(u, "Kernel version does not accept per-device setting in %s.", p);
1355 warned = true;
1356 }
1357 r = -EOPNOTSUPP; /* mask as unconfigured device */
1358 } else if (r >= 0 && io_weight != bfq_weight)
1359 log_unit_debug(u, "%s=%" PRIu64 " scaled to %s=%" PRIu64,
1360 prop_names[2*(major(dev) > 0) + streq(controller, "blkio")],
1361 io_weight, p, bfq_weight);
1362 return r;
1363 }
1364
1365 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
1366 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
1367 dev_t dev;
1368 int r, r1, r2;
1369
1370 if (lookup_block_device(dev_path, &dev) < 0)
1371 return;
1372
1373 r1 = set_bfq_weight(u, "io", dev, io_weight);
1374
1375 xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), io_weight);
1376 r2 = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
1377
1378 /* Look at the configured device; when both writes fail, prefer the io.weight errno. */
1379 r = r2 == -EOPNOTSUPP ? r1 : r2;
1380
1381 if (r < 0)
1382 log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r),
1383 r, "Failed to set 'io[.bfq].weight' attribute on '%s' to '%.*s': %m",
1384 empty_to_root(u->cgroup_path), (int) strcspn(buf, NEWLINE), buf);
1385 }
1386
1387 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
1388 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
1389 dev_t dev;
1390 int r;
1391
1392 r = lookup_block_device(dev_path, &dev);
1393 if (r < 0)
1394 return;
1395
1396 xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), blkio_weight);
1397 (void) set_attribute_and_warn(u, "blkio", "blkio.weight_device", buf);
1398 }
1399
1400 static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) {
1401 char buf[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1];
1402 dev_t dev;
1403 int r;
1404
1405 r = lookup_block_device(dev_path, &dev);
1406 if (r < 0)
1407 return;
1408
1409 if (target != USEC_INFINITY)
1410 xsprintf(buf, DEVNUM_FORMAT_STR " target=%" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), target);
1411 else
1412 xsprintf(buf, DEVNUM_FORMAT_STR " target=max\n", DEVNUM_FORMAT_VAL(dev));
1413
1414 (void) set_attribute_and_warn(u, "io", "io.latency", buf);
1415 }
1416
1417 static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
1418 char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)],
1419 buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
1420 dev_t dev;
1421
1422 if (lookup_block_device(dev_path, &dev) < 0)
1423 return;
1424
1425 for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
1426 if (limits[type] != cgroup_io_limit_defaults[type])
1427 xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
1428 else
1429 xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
1430
1431 xsprintf(buf, DEVNUM_FORMAT_STR " rbps=%s wbps=%s riops=%s wiops=%s\n", DEVNUM_FORMAT_VAL(dev),
1432 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
1433 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
1434 (void) set_attribute_and_warn(u, "io", "io.max", buf);
1435 }
1436
1437 static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
1438 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
1439 dev_t dev;
1440
1441 if (lookup_block_device(dev_path, &dev) < 0)
1442 return;
1443
1444 sprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), rbps);
1445 (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.read_bps_device", buf);
1446
1447 sprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), wbps);
1448 (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.write_bps_device", buf);
1449 }
1450
1451 static bool unit_has_unified_memory_config(Unit *u) {
1452 CGroupContext *c;
1453
1454 assert(u);
1455
1456 assert_se(c = unit_get_cgroup_context(u));
1457
1458 return unit_get_ancestor_memory_min(u) > 0 ||
1459 unit_get_ancestor_memory_low(u) > 0 || unit_get_ancestor_startup_memory_low(u) > 0 ||
1460 c->memory_high != CGROUP_LIMIT_MAX || c->startup_memory_high_set ||
1461 c->memory_max != CGROUP_LIMIT_MAX || c->startup_memory_max_set ||
1462 c->memory_swap_max != CGROUP_LIMIT_MAX || c->startup_memory_swap_max_set ||
1463 c->memory_zswap_max != CGROUP_LIMIT_MAX || c->startup_memory_zswap_max_set;
1464 }
1465
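/* Write a single unified-hierarchy memory attribute, using "max" to express "no limit". */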
1466 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
1467 char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max\n";
1468
1469 if (v != CGROUP_LIMIT_MAX)
1470 xsprintf(buf, "%" PRIu64 "\n", v);
1471
1472 (void) set_attribute_and_warn(u, "memory", file, buf);
1473 }
1474
1475 static void cgroup_apply_firewall(Unit *u) {
1476 assert(u);
1477
1478 /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */
1479
1480 if (bpf_firewall_compile(u) < 0)
1481 return;
1482
1483 (void) bpf_firewall_load_custom(u);
1484 (void) bpf_firewall_install(u);
1485 }
1486
1487 void unit_modify_nft_set(Unit *u, bool add) {
1488 int r;
1489
1490 assert(u);
1491
1492 if (!MANAGER_IS_SYSTEM(u->manager))
1493 return;
1494
1495 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1496 return;
1497
1498 if (cg_all_unified() <= 0)
1499 return;
1500
1501 if (u->cgroup_id == 0)
1502 return;
1503
1504 if (!u->manager->fw_ctx) {
1505 r = fw_ctx_new_full(&u->manager->fw_ctx, /* init_tables= */ false);
1506 if (r < 0)
1507 return;
1508
1509 assert(u->manager->fw_ctx);
1510 }
1511
1512 CGroupContext *c = ASSERT_PTR(unit_get_cgroup_context(u));
1513
1514 FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) {
1515 if (nft_set->source != NFT_SET_SOURCE_CGROUP)
1516 continue;
1517
1518 uint64_t element = u->cgroup_id;
1519
1520 r = nft_set_element_modify_any(u->manager->fw_ctx, add, nft_set->nfproto, nft_set->table, nft_set->set, &element, sizeof(element));
1521 if (r < 0)
1522 log_warning_errno(r, "Failed to %s NFT set: family %s, table %s, set %s, cgroup %" PRIu64 ", ignoring: %m",
1523 add? "add" : "delete", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, u->cgroup_id);
1524 else
1525 log_debug("%s NFT set: family %s, table %s, set %s, cgroup %" PRIu64,
1526 add? "Added" : "Deleted", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, u->cgroup_id);
1527 }
1528 }
1529
1530 static void cgroup_apply_socket_bind(Unit *u) {
1531 assert(u);
1532
1533 (void) bpf_socket_bind_install(u);
1534 }
1535
1536 static void cgroup_apply_restrict_network_interfaces(Unit *u) {
1537 assert(u);
1538
1539 (void) restrict_network_interfaces_install(u);
1540 }
1541
1542 static int cgroup_apply_devices(Unit *u) {
1543 _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
1544 const char *path;
1545 CGroupContext *c;
1546 CGroupDevicePolicy policy;
1547 int r;
1548
1549 assert_se(c = unit_get_cgroup_context(u));
1550 assert_se(path = u->cgroup_path);
1551
1552 policy = c->device_policy;
1553
1554 if (cg_all_unified() > 0) {
1555 r = bpf_devices_cgroup_init(&prog, policy, c->device_allow);
1556 if (r < 0)
1557 return log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");
1558
1559 } else {
1560 /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore
1561 * EINVAL here. */
1562
1563 if (c->device_allow || policy != CGROUP_DEVICE_POLICY_AUTO)
1564 r = cg_set_attribute("devices", path, "devices.deny", "a");
1565 else
1566 r = cg_set_attribute("devices", path, "devices.allow", "a");
1567 if (r < 0)
1568 log_unit_full_errno(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1569 "Failed to reset devices.allow/devices.deny: %m");
1570 }
1571
1572 bool allow_list_static = policy == CGROUP_DEVICE_POLICY_CLOSED ||
1573 (policy == CGROUP_DEVICE_POLICY_AUTO && c->device_allow);
1574 if (allow_list_static)
1575 (void) bpf_devices_allow_list_static(prog, path);
1576
1577 bool any = allow_list_static;
1578 LIST_FOREACH(device_allow, a, c->device_allow) {
1579 const char *val;
1580
1581 if (a->permissions == 0)
1582 continue;
1583
1584 if (path_startswith(a->path, "/dev/"))
1585 r = bpf_devices_allow_list_device(prog, path, a->path, a->permissions);
1586 else if ((val = startswith(a->path, "block-")))
1587 r = bpf_devices_allow_list_major(prog, path, val, 'b', a->permissions);
1588 else if ((val = startswith(a->path, "char-")))
1589 r = bpf_devices_allow_list_major(prog, path, val, 'c', a->permissions);
1590 else {
1591 log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path);
1592 continue;
1593 }
1594
1595 if (r >= 0)
1596 any = true;
1597 }
1598
1599 if (prog && !any) {
1600 log_unit_warning_errno(u, SYNTHETIC_ERRNO(ENODEV), "No devices matched by device filter.");
1601
1602 /* The kernel verifier would reject a program we would build with the normal intro and outro
1603 but no allow-listing rules (outro would contain an unreachable instruction for successful
1604 return). */
1605 policy = CGROUP_DEVICE_POLICY_STRICT;
1606 }
1607
1608 r = bpf_devices_apply_policy(&prog, policy, any, path, &u->bpf_device_control_installed);
1609 if (r < 0) {
1610 static bool warned = false;
1611
1612 log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
1613 "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
1614 "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
1615 "(This warning is only shown for the first loaded unit using device ACL.)", u->id);
1616
1617 warned = true;
1618 }
1619 return r;
1620 }
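/* Illustrative mapping (a sketch, example values assumed) of how DeviceAllow= entries reach the helpers above:
 *
 *   DeviceAllow=/dev/null rw    -> handled by bpf_devices_allow_list_device() (device node path)
 *   DeviceAllow=block-loop rwm  -> handled by bpf_devices_allow_list_major() with type 'b'
 *   DeviceAllow=char-pts rw     -> handled by bpf_devices_allow_list_major() with type 'c'
 *
 * Entries matching none of these forms are logged and skipped. If no entry could be added at all, the
 * policy is tightened to CGROUP_DEVICE_POLICY_STRICT so the generated BPF program remains acceptable
 * to the kernel verifier. */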
1621
1622 static void set_io_weight(Unit *u, uint64_t weight) {
1623 char buf[STRLEN("default \n")+DECIMAL_STR_MAX(uint64_t)];
1624
1625 assert(u);
1626
1627 (void) set_bfq_weight(u, "io", makedev(0, 0), weight);
1628
1629 xsprintf(buf, "default %" PRIu64 "\n", weight);
1630 (void) set_attribute_and_warn(u, "io", "io.weight", buf);
1631 }
1632
1633 static void set_blkio_weight(Unit *u, uint64_t weight) {
1634 char buf[STRLEN("\n")+DECIMAL_STR_MAX(uint64_t)];
1635
1636 assert(u);
1637
1638 (void) set_bfq_weight(u, "blkio", makedev(0, 0), weight);
1639
1640 xsprintf(buf, "%" PRIu64 "\n", weight);
1641 (void) set_attribute_and_warn(u, "blkio", "blkio.weight", buf);
1642 }
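/* Rough sketch of the strings the two helpers above end up writing (the weight value is assumed for
 * illustration): with a resolved weight of 500, set_io_weight() writes "default 500\n" to io.weight on
 * the unified hierarchy, while set_blkio_weight() writes "500\n" to blkio.weight on the legacy
 * hierarchy. Both also mirror the weight into the BFQ scheduler attribute via set_bfq_weight(). */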
1643
1644 static void cgroup_apply_bpf_foreign_program(Unit *u) {
1645 assert(u);
1646
1647 (void) bpf_foreign_install(u);
1648 }
1649
1650 static void cgroup_context_apply(
1651 Unit *u,
1652 CGroupMask apply_mask,
1653 ManagerState state) {
1654
1655 const char *path;
1656 CGroupContext *c;
1657 bool is_host_root, is_local_root;
1658 int r;
1659
1660 assert(u);
1661
1662 /* Nothing to do? Exit early! */
1663 if (apply_mask == 0)
1664 return;
1665
1666 /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other
1667 * attributes should only be managed for cgroups further down the tree. */
1668 is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE);
1669 is_host_root = unit_has_host_root_cgroup(u);
1670
1671 assert_se(c = unit_get_cgroup_context(u));
1672 assert_se(path = u->cgroup_path);
1673
1674 if (is_local_root) /* Make sure we don't try to display messages with an empty path. */
1675 path = "/";
1676
1677 /* We generally ignore errors caused by read-only mounted cgroup trees (assuming in that case that we are running
1678 * in a container), and missing cgroups, i.e. EROFS and ENOENT. */
1679
1680 /* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but
1681 * setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this
1682 * level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of
1683 * containers we want to leave control of these to the container manager (and if cgroup v2 delegation is used
1684 * we couldn't even write to them if we wanted to). */
1685 if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
1686
1687 if (cg_all_unified() > 0) {
1688 uint64_t weight;
1689
1690 if (cgroup_context_has_cpu_weight(c))
1691 weight = cgroup_context_cpu_weight(c, state);
1692 else if (cgroup_context_has_cpu_shares(c)) {
1693 uint64_t shares;
1694
1695 shares = cgroup_context_cpu_shares(c, state);
1696 weight = cgroup_cpu_shares_to_weight(shares);
1697
1698 log_cgroup_compat(u, "Applying [Startup]CPUShares=%" PRIu64 " as [Startup]CPUWeight=%" PRIu64 " on %s",
1699 shares, weight, path);
1700 } else
1701 weight = CGROUP_WEIGHT_DEFAULT;
1702
1703 cgroup_apply_unified_cpu_idle(u, weight);
1704 cgroup_apply_unified_cpu_weight(u, weight);
1705 cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
1706
1707 } else {
1708 uint64_t shares;
1709
1710 if (cgroup_context_has_cpu_weight(c)) {
1711 uint64_t weight;
1712
1713 weight = cgroup_context_cpu_weight(c, state);
1714 shares = cgroup_cpu_weight_to_shares(weight);
1715
1716 log_cgroup_compat(u, "Applying [Startup]CPUWeight=%" PRIu64 " as [Startup]CPUShares=%" PRIu64 " on %s",
1717 weight, shares, path);
1718 } else if (cgroup_context_has_cpu_shares(c))
1719 shares = cgroup_context_cpu_shares(c, state);
1720 else
1721 shares = CGROUP_CPU_SHARES_DEFAULT;
1722
1723 cgroup_apply_legacy_cpu_shares(u, shares);
1724 cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
1725 }
1726 }
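/* Worked example of the compat conversion above (numbers are illustrative, derived from the defaults
 * CGROUP_CPU_SHARES_DEFAULT=1024 and CGROUP_WEIGHT_DEFAULT=100): a legacy CPUShares=2048 setting
 * applied on the unified hierarchy maps to roughly CPUWeight=200, and conversely CPUWeight=200 applied
 * on the legacy hierarchy maps back to roughly CPUShares=2048. The conversion helpers clamp the result
 * to the valid range of the target attribute. */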
1727
1728 if ((apply_mask & CGROUP_MASK_CPUSET) && !is_local_root) {
1729 cgroup_apply_unified_cpuset(u, cgroup_context_allowed_cpus(c, state), "cpuset.cpus");
1730 cgroup_apply_unified_cpuset(u, cgroup_context_allowed_mems(c, state), "cpuset.mems");
1731 }
1732
1733 /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2
1734 * controller), and in case of containers we want to leave control of these attributes to the container manager
1735 * (and we couldn't access these attributes anyway if proper delegation is used, even if we tried). */
1736 if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) {
1737 bool has_io, has_blockio;
1738 uint64_t weight;
1739
1740 has_io = cgroup_context_has_io_config(c);
1741 has_blockio = cgroup_context_has_blockio_config(c);
1742
1743 if (has_io)
1744 weight = cgroup_context_io_weight(c, state);
1745 else if (has_blockio) {
1746 uint64_t blkio_weight;
1747
1748 blkio_weight = cgroup_context_blkio_weight(c, state);
1749 weight = cgroup_weight_blkio_to_io(blkio_weight);
1750
1751 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64,
1752 blkio_weight, weight);
1753 } else
1754 weight = CGROUP_WEIGHT_DEFAULT;
1755
1756 set_io_weight(u, weight);
1757
1758 if (has_io) {
1759 LIST_FOREACH(device_weights, w, c->io_device_weights)
1760 cgroup_apply_io_device_weight(u, w->path, w->weight);
1761
1762 LIST_FOREACH(device_limits, limit, c->io_device_limits)
1763 cgroup_apply_io_device_limit(u, limit->path, limit->limits);
1764
1765 LIST_FOREACH(device_latencies, latency, c->io_device_latencies)
1766 cgroup_apply_io_device_latency(u, latency->path, latency->target_usec);
1767
1768 } else if (has_blockio) {
1769 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
1770 weight = cgroup_weight_blkio_to_io(w->weight);
1771
1772 log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s",
1773 w->weight, weight, w->path);
1774
1775 cgroup_apply_io_device_weight(u, w->path, weight);
1776 }
1777
1778 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
1779 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
1780
1781 for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
1782 limits[type] = cgroup_io_limit_defaults[type];
1783
1784 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
1785 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
1786
1787 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s",
1788 b->rbps, b->wbps, b->path);
1789
1790 cgroup_apply_io_device_limit(u, b->path, limits);
1791 }
1792 }
1793 }
1794
1795 if (apply_mask & CGROUP_MASK_BLKIO) {
1796 bool has_io, has_blockio;
1797
1798 has_io = cgroup_context_has_io_config(c);
1799 has_blockio = cgroup_context_has_blockio_config(c);
1800
1801 /* Applying a 'weight' never makes sense for the host root cgroup, and for containers this should be
1802 * left to our container manager, too. */
1803 if (!is_local_root) {
1804 uint64_t weight;
1805
1806 if (has_io) {
1807 uint64_t io_weight;
1808
1809 io_weight = cgroup_context_io_weight(c, state);
1810 weight = cgroup_weight_io_to_blkio(io_weight);
1811
1812 log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64,
1813 io_weight, weight);
1814 } else if (has_blockio)
1815 weight = cgroup_context_blkio_weight(c, state);
1816 else
1817 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
1818
1819 set_blkio_weight(u, weight);
1820
1821 if (has_io)
1822 LIST_FOREACH(device_weights, w, c->io_device_weights) {
1823 weight = cgroup_weight_io_to_blkio(w->weight);
1824
1825 log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s",
1826 w->weight, weight, w->path);
1827
1828 cgroup_apply_blkio_device_weight(u, w->path, weight);
1829 }
1830 else if (has_blockio)
1831 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
1832 cgroup_apply_blkio_device_weight(u, w->path, w->weight);
1833 }
1834
1835 /* The bandwidth limits make sense to apply to the host's root cgroup, but not to container
1836 * roots, where we want the container manager to handle them. */
1837 if (is_host_root || !is_local_root) {
1838 if (has_io)
1839 LIST_FOREACH(device_limits, l, c->io_device_limits) {
1840 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s",
1841 l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
1842
1843 cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]);
1844 }
1845 else if (has_blockio)
1846 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
1847 cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps);
1848 }
1849 }
1850
1851 /* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes'
1852 * exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we
1853 * want to leave control to the container manager (and if proper cgroup v2 delegation is used we couldn't even
1854 * write to this if we wanted to.) */
1855 if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
1856
1857 if (cg_all_unified() > 0) {
1858 uint64_t max, swap_max = CGROUP_LIMIT_MAX, zswap_max = CGROUP_LIMIT_MAX, high = CGROUP_LIMIT_MAX;
1859
1860 if (unit_has_unified_memory_config(u)) {
1861 bool startup = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
1862
1863 high = startup && c->startup_memory_high_set ? c->startup_memory_high : c->memory_high;
1864 max = startup && c->startup_memory_max_set ? c->startup_memory_max : c->memory_max;
1865 swap_max = startup && c->startup_memory_swap_max_set ? c->startup_memory_swap_max : c->memory_swap_max;
1866 zswap_max = startup && c->startup_memory_zswap_max_set ? c->startup_memory_zswap_max : c->memory_zswap_max;
1867 } else {
1868 max = c->memory_limit;
1869
1870 if (max != CGROUP_LIMIT_MAX)
1871 log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max);
1872 }
1873
1874 cgroup_apply_unified_memory_limit(u, "memory.min", unit_get_ancestor_memory_min(u));
1875 cgroup_apply_unified_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u));
1876 cgroup_apply_unified_memory_limit(u, "memory.high", high);
1877 cgroup_apply_unified_memory_limit(u, "memory.max", max);
1878 cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
1879 cgroup_apply_unified_memory_limit(u, "memory.zswap.max", zswap_max);
1880
1881 (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
1882
1883 } else {
1884 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
1885 uint64_t val;
1886
1887 if (unit_has_unified_memory_config(u)) {
1888 val = c->memory_max;
1889 if (val != CGROUP_LIMIT_MAX)
1890 log_cgroup_compat(u, "Applying MemoryMax=%" PRIu64 " as MemoryLimit=", val);
1891 } else
1892 val = c->memory_limit;
1893
1894 if (val == CGROUP_LIMIT_MAX)
1895 strncpy(buf, "-1\n", sizeof(buf));
1896 else
1897 xsprintf(buf, "%" PRIu64 "\n", val);
1898
1899 (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf);
1900 }
1901 }
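/* Sketch of how the startup variants above are selected (unit options assumed for illustration): with
 * MemoryHigh=2G and StartupMemoryHigh=8G, the startup value (8G) is applied to "memory.high" while the
 * manager is in the MANAGER_STARTING, MANAGER_INITIALIZING or MANAGER_STOPPING states, and the regular
 * value (2G) otherwise. Limits that are left unset stay at CGROUP_LIMIT_MAX and are written out as
 * "max". */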
1902
1903 /* On cgroup v2 we can apply BPF everywhere. On cgroup v1 we apply it everywhere except for the root of
1904 * containers, where we leave this to the container manager. */
1905 if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) &&
1906 (is_host_root || cg_all_unified() > 0 || !is_local_root))
1907 (void) cgroup_apply_devices(u);
1908
1909 if (apply_mask & CGROUP_MASK_PIDS) {
1910
1911 if (is_host_root) {
1912 /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
1913 * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
1914 * the knobs of the root cgroup are modified we propagate this to the relevant sysctls. There's a
1915 * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
1916 * exclusive ownership of the sysctls, but we still want to honour things if the user sets
1917 * limits. Hence we employ a sort of one-way strategy: when the user sets a bounded limit
1918 * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
1919 * it also counts. But if the user never set a limit through us (i.e. we are the default of
1920 * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
1921 * the first time we set a limit. Note that this boolean is flushed out on manager reload,
1922 * which is desirable so that there's an official way to release control of the sysctl from
1923 * systemd: set the limit to unbounded and reload. */
1924
1925 if (cgroup_tasks_max_isset(&c->tasks_max)) {
1926 u->manager->sysctl_pid_max_changed = true;
1927 r = procfs_tasks_set_limit(cgroup_tasks_max_resolve(&c->tasks_max));
1928 } else if (u->manager->sysctl_pid_max_changed)
1929 r = procfs_tasks_set_limit(TASKS_MAX);
1930 else
1931 r = 0;
1932 if (r < 0)
1933 log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r,
1934 "Failed to write to tasks limit sysctls: %m");
1935 }
1936
1937 /* The attribute itself is not available on the host root cgroup, and in the container case we want to
1938 * leave it for the container manager. */
1939 if (!is_local_root) {
1940 if (cgroup_tasks_max_isset(&c->tasks_max)) {
1941 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
1942
1943 xsprintf(buf, "%" PRIu64 "\n", cgroup_tasks_max_resolve(&c->tasks_max));
1944 (void) set_attribute_and_warn(u, "pids", "pids.max", buf);
1945 } else
1946 (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n");
1947 }
1948 }
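/* Illustrative outcome of the pids.max logic above (TasksMax= values assumed): TasksMax=4096 writes
 * "4096\n" to pids.max, TasksMax=50% is first resolved against the system maximum via
 * cgroup_tasks_max_resolve(), and an unset TasksMax= writes the literal "max\n". On the host root
 * cgroup the same configuration is translated into the tasks-limit sysctls instead, since the pids
 * controller exposes no attribute there. */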
1949
1950 if (apply_mask & CGROUP_MASK_BPF_FIREWALL)
1951 cgroup_apply_firewall(u);
1952
1953 if (apply_mask & CGROUP_MASK_BPF_FOREIGN)
1954 cgroup_apply_bpf_foreign_program(u);
1955
1956 if (apply_mask & CGROUP_MASK_BPF_SOCKET_BIND)
1957 cgroup_apply_socket_bind(u);
1958
1959 if (apply_mask & CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES)
1960 cgroup_apply_restrict_network_interfaces(u);
1961
1962 unit_modify_nft_set(u, /* add = */ true);
1963 }
1964
1965 static bool unit_get_needs_bpf_firewall(Unit *u) {
1966 CGroupContext *c;
1967 assert(u);
1968
1969 c = unit_get_cgroup_context(u);
1970 if (!c)
1971 return false;
1972
1973 if (c->ip_accounting ||
1974 !set_isempty(c->ip_address_allow) ||
1975 !set_isempty(c->ip_address_deny) ||
1976 c->ip_filters_ingress ||
1977 c->ip_filters_egress)
1978 return true;
1979
1980 /* If any parent slice has an IP access list defined, it applies too */
1981 for (Unit *p = UNIT_GET_SLICE(u); p; p = UNIT_GET_SLICE(p)) {
1982 c = unit_get_cgroup_context(p);
1983 if (!c)
1984 return false;
1985
1986 if (!set_isempty(c->ip_address_allow) ||
1987 !set_isempty(c->ip_address_deny))
1988 return true;
1989 }
1990
1991 return false;
1992 }
1993
1994 static bool unit_get_needs_bpf_foreign_program(Unit *u) {
1995 CGroupContext *c;
1996 assert(u);
1997
1998 c = unit_get_cgroup_context(u);
1999 if (!c)
2000 return false;
2001
2002 return !!c->bpf_foreign_programs;
2003 }
2004
2005 static bool unit_get_needs_socket_bind(Unit *u) {
2006 CGroupContext *c;
2007 assert(u);
2008
2009 c = unit_get_cgroup_context(u);
2010 if (!c)
2011 return false;
2012
2013 return c->socket_bind_allow || c->socket_bind_deny;
2014 }
2015
2016 static bool unit_get_needs_restrict_network_interfaces(Unit *u) {
2017 CGroupContext *c;
2018 assert(u);
2019
2020 c = unit_get_cgroup_context(u);
2021 if (!c)
2022 return false;
2023
2024 return !set_isempty(c->restrict_network_interfaces);
2025 }
2026
2027 static CGroupMask unit_get_cgroup_mask(Unit *u) {
2028 CGroupMask mask = 0;
2029 CGroupContext *c;
2030
2031 assert(u);
2032
2033 assert_se(c = unit_get_cgroup_context(u));
2034
2035 /* Figure out which controllers we need, based on the cgroup context object */
2036
2037 if (c->cpu_accounting)
2038 mask |= get_cpu_accounting_mask();
2039
2040 if (cgroup_context_has_cpu_weight(c) ||
2041 cgroup_context_has_cpu_shares(c) ||
2042 c->cpu_quota_per_sec_usec != USEC_INFINITY)
2043 mask |= CGROUP_MASK_CPU;
2044
2045 if (cgroup_context_has_allowed_cpus(c) || cgroup_context_has_allowed_mems(c))
2046 mask |= CGROUP_MASK_CPUSET;
2047
2048 if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
2049 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2050
2051 if (c->memory_accounting ||
2052 c->memory_limit != CGROUP_LIMIT_MAX ||
2053 unit_has_unified_memory_config(u))
2054 mask |= CGROUP_MASK_MEMORY;
2055
2056 if (c->device_allow ||
2057 c->device_policy != CGROUP_DEVICE_POLICY_AUTO)
2058 mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;
2059
2060 if (c->tasks_accounting ||
2061 cgroup_tasks_max_isset(&c->tasks_max))
2062 mask |= CGROUP_MASK_PIDS;
2063
2064 return CGROUP_MASK_EXTEND_JOINED(mask);
2065 }
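/* Illustrative example (unit configuration assumed): a service with MemoryMax=1G and TasksMax=512
 * yields CGROUP_MASK_MEMORY | CGROUP_MASK_PIDS here, and CGROUP_MASK_EXTEND_JOINED() then folds in
 * controllers that are mounted jointly on the legacy hierarchy (e.g. "cpu" and "cpuacct"), so that
 * requesting one of them implies the other. */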
2066
2067 static CGroupMask unit_get_bpf_mask(Unit *u) {
2068 CGroupMask mask = 0;
2069
2070 /* Figure out which controllers we need, based on the cgroup context, possibly taking into account children
2071 * too. */
2072
2073 if (unit_get_needs_bpf_firewall(u))
2074 mask |= CGROUP_MASK_BPF_FIREWALL;
2075
2076 if (unit_get_needs_bpf_foreign_program(u))
2077 mask |= CGROUP_MASK_BPF_FOREIGN;
2078
2079 if (unit_get_needs_socket_bind(u))
2080 mask |= CGROUP_MASK_BPF_SOCKET_BIND;
2081
2082 if (unit_get_needs_restrict_network_interfaces(u))
2083 mask |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES;
2084
2085 return mask;
2086 }
2087
2088 CGroupMask unit_get_own_mask(Unit *u) {
2089 CGroupContext *c;
2090
2091 /* Returns the mask of controllers the unit needs for itself. If a unit is not properly loaded, return an empty
2092 * mask, as we shouldn't reflect it in the cgroup hierarchy then. */
2093
2094 if (u->load_state != UNIT_LOADED)
2095 return 0;
2096
2097 c = unit_get_cgroup_context(u);
2098 if (!c)
2099 return 0;
2100
2101 return unit_get_cgroup_mask(u) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u);
2102 }
2103
2104 CGroupMask unit_get_delegate_mask(Unit *u) {
2105 CGroupContext *c;
2106
2107 /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
2108 * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
2109 *
2110 * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
2111
2112 if (!unit_cgroup_delegate(u))
2113 return 0;
2114
2115 if (cg_all_unified() <= 0) {
2116 ExecContext *e;
2117
2118 e = unit_get_exec_context(u);
2119 if (e && !exec_context_maintains_privileges(e))
2120 return 0;
2121 }
2122
2123 assert_se(c = unit_get_cgroup_context(u));
2124 return CGROUP_MASK_EXTEND_JOINED(c->delegate_controllers);
2125 }
2126
2127 static CGroupMask unit_get_subtree_mask(Unit *u) {
2128
2129 /* Returns the mask of this subtree, meaning of the group
2130 * itself and its children. */
2131
2132 return unit_get_own_mask(u) | unit_get_members_mask(u);
2133 }
2134
2135 CGroupMask unit_get_members_mask(Unit *u) {
2136 assert(u);
2137
2138 /* Returns the mask of controllers all of the unit's children require, merged */
2139
2140 if (u->cgroup_members_mask_valid)
2141 return u->cgroup_members_mask; /* Use cached value if possible */
2142
2143 u->cgroup_members_mask = 0;
2144
2145 if (u->type == UNIT_SLICE) {
2146 Unit *member;
2147
2148 UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
2149 u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
2150 }
2151
2152 u->cgroup_members_mask_valid = true;
2153 return u->cgroup_members_mask;
2154 }
2155
2156 CGroupMask unit_get_siblings_mask(Unit *u) {
2157 Unit *slice;
2158 assert(u);
2159
2160 /* Returns the mask of controllers all of the unit's siblings
2161 * require, i.e. the members mask of the unit's parent slice
2162 * if there is one. */
2163
2164 slice = UNIT_GET_SLICE(u);
2165 if (slice)
2166 return unit_get_members_mask(slice);
2167
2168 return unit_get_subtree_mask(u); /* we are the top-level slice */
2169 }
2170
2171 static CGroupMask unit_get_disable_mask(Unit *u) {
2172 CGroupContext *c;
2173
2174 c = unit_get_cgroup_context(u);
2175 if (!c)
2176 return 0;
2177
2178 return c->disable_controllers;
2179 }
2180
2181 CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
2182 CGroupMask mask;
2183 Unit *slice;
2184
2185 assert(u);
2186 mask = unit_get_disable_mask(u);
2187
2188 /* Returns the mask of controllers which are marked as forcibly
2189 * disabled in any ancestor unit or the unit in question. */
2190
2191 slice = UNIT_GET_SLICE(u);
2192 if (slice)
2193 mask |= unit_get_ancestor_disable_mask(slice);
2194
2195 return mask;
2196 }
2197
2198 CGroupMask unit_get_target_mask(Unit *u) {
2199 CGroupMask own_mask, mask;
2200
2201 /* This returns the cgroup mask of all controllers to enable for a specific cgroup, i.e. everything
2202 * it needs itself, plus all that its children need, plus all that its siblings need. This is
2203 * primarily useful on the legacy cgroup hierarchy, where we need to duplicate each cgroup in each
2204 * hierarchy that shall be enabled for it. */
2205
2206 own_mask = unit_get_own_mask(u);
2207
2208 if (own_mask & CGROUP_MASK_BPF_FIREWALL & ~u->manager->cgroup_supported)
2209 emit_bpf_firewall_warning(u);
2210
2211 mask = own_mask | unit_get_members_mask(u) | unit_get_siblings_mask(u);
2212
2213 mask &= u->manager->cgroup_supported;
2214 mask &= ~unit_get_ancestor_disable_mask(u);
2215
2216 return mask;
2217 }
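/* Rough worked example of the combination above (masks assumed for illustration): if the unit itself
 * needs { memory }, its children need { pids } and its siblings need { cpu }, the target mask starts
 * as { cpu, memory, pids }, is then intersected with what the kernel actually supports, and finally
 * loses anything an ancestor disabled via DisableControllers=. */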
2218
2219 CGroupMask unit_get_enable_mask(Unit *u) {
2220 CGroupMask mask;
2221
2222 /* This returns the cgroup mask of all controllers to enable
2223 * for the children of a specific cgroup. This is primarily
2224 * useful for the unified cgroup hierarchy, where each cgroup
2225 * controls which controllers are enabled for its children. */
2226
2227 mask = unit_get_members_mask(u);
2228 mask &= u->manager->cgroup_supported;
2229 mask &= ~unit_get_ancestor_disable_mask(u);
2230
2231 return mask;
2232 }
2233
2234 void unit_invalidate_cgroup_members_masks(Unit *u) {
2235 Unit *slice;
2236
2237 assert(u);
2238
2239 /* Recursively invalidate the member masks cache all the way up the tree */
2240 u->cgroup_members_mask_valid = false;
2241
2242 slice = UNIT_GET_SLICE(u);
2243 if (slice)
2244 unit_invalidate_cgroup_members_masks(slice);
2245 }
2246
2247 const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
2248
2249 /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */
2250
2251 while (u) {
2252
2253 if (u->cgroup_path &&
2254 u->cgroup_realized &&
2255 FLAGS_SET(u->cgroup_realized_mask, mask))
2256 return u->cgroup_path;
2257
2258 u = UNIT_GET_SLICE(u);
2259 }
2260
2261 return NULL;
2262 }
2263
2264 static const char *migrate_callback(CGroupMask mask, void *userdata) {
2265 /* If not realized at all, migrate to root ("").
2266 * This may happen if we're upgrading from an older version that didn't clean up.
2267 */
2268 return strempty(unit_get_realized_cgroup_path(userdata, mask));
2269 }
2270
2271 int unit_default_cgroup_path(const Unit *u, char **ret) {
2272 _cleanup_free_ char *p = NULL;
2273 int r;
2274
2275 assert(u);
2276 assert(ret);
2277
2278 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
2279 p = strdup(u->manager->cgroup_root);
2280 else {
2281 _cleanup_free_ char *escaped = NULL, *slice_path = NULL;
2282 Unit *slice;
2283
2284 slice = UNIT_GET_SLICE(u);
2285 if (slice && !unit_has_name(slice, SPECIAL_ROOT_SLICE)) {
2286 r = cg_slice_to_path(slice->id, &slice_path);
2287 if (r < 0)
2288 return r;
2289 }
2290
2291 r = cg_escape(u->id, &escaped);
2292 if (r < 0)
2293 return r;
2294
2295 p = path_join(empty_to_root(u->manager->cgroup_root), slice_path, escaped);
2296 }
2297 if (!p)
2298 return -ENOMEM;
2299
2300 *ret = TAKE_PTR(p);
2301 return 0;
2302 }
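/* Example of the resulting default path (names assumed): for foo.service in bar-baz.slice with an
 * empty manager cgroup root, cg_slice_to_path() expands the slice to "bar.slice/bar-baz.slice" and the
 * final path becomes "/bar.slice/bar-baz.slice/foo.service". The root slice itself simply maps to the
 * manager's cgroup root. */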
2303
2304 int unit_set_cgroup_path(Unit *u, const char *path) {
2305 _cleanup_free_ char *p = NULL;
2306 int r;
2307
2308 assert(u);
2309
2310 if (streq_ptr(u->cgroup_path, path))
2311 return 0;
2312
2313 if (path) {
2314 p = strdup(path);
2315 if (!p)
2316 return -ENOMEM;
2317 }
2318
2319 if (p) {
2320 r = hashmap_put(u->manager->cgroup_unit, p, u);
2321 if (r < 0)
2322 return r;
2323 }
2324
2325 unit_release_cgroup(u);
2326 u->cgroup_path = TAKE_PTR(p);
2327
2328 return 1;
2329 }
2330
2331 int unit_watch_cgroup(Unit *u) {
2332 _cleanup_free_ char *events = NULL;
2333 int r;
2334
2335 assert(u);
2336
2337 /* Watches the "cgroup.events" attribute of this unit's cgroup for "empty" events, but only if
2338 * cgroupv2 is available. */
2339
2340 if (!u->cgroup_path)
2341 return 0;
2342
2343 if (u->cgroup_control_inotify_wd >= 0)
2344 return 0;
2345
2346 /* Only applies to the unified hierarchy */
2347 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2348 if (r < 0)
2349 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
2350 if (r == 0)
2351 return 0;
2352
2353 /* No point in watching the top-level slice, it's never going to run empty. */
2354 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
2355 return 0;
2356
2357 r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops);
2358 if (r < 0)
2359 return log_oom();
2360
2361 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
2362 if (r < 0)
2363 return log_oom();
2364
2365 u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
2366 if (u->cgroup_control_inotify_wd < 0) {
2367
2368 if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
2369 * is not an error */
2370 return 0;
2371
2372 return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path));
2373 }
2374
2375 r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u);
2376 if (r < 0)
2377 return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path));
2378
2379 return 0;
2380 }
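/* Sketch of what gets watched here (paths assumed): for a unit realized at /foo.slice/foo.service the
 * inotify watch ends up on .../foo.slice/foo.service/cgroup.events below the cgroupfs mount, and an
 * IN_MODIFY event fires whenever the kernel rewrites that file, e.g. when its "populated" field flips
 * to 0 because the last process in the subtree exited. */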
2381
2382 int unit_watch_cgroup_memory(Unit *u) {
2383 _cleanup_free_ char *events = NULL;
2384 CGroupContext *c;
2385 int r;
2386
2387 assert(u);
2388
2389 /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if
2390 * cgroupv2 is available. */
2391
2392 if (!u->cgroup_path)
2393 return 0;
2394
2395 c = unit_get_cgroup_context(u);
2396 if (!c)
2397 return 0;
2398
2399 /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie
2400 * this to memory accounting, in a way watching for OOM kills is a form of memory accounting after
2401 * all. */
2402 if (!c->memory_accounting)
2403 return 0;
2404
2405 /* Don't watch inner nodes, as the kernel doesn't report oom_kill events recursively currently, and
2406 * we also don't want to generate a log message for each parent cgroup of a process. */
2407 if (u->type == UNIT_SLICE)
2408 return 0;
2409
2410 if (u->cgroup_memory_inotify_wd >= 0)
2411 return 0;
2412
2413 /* Only applies to the unified hierarchy */
2414 r = cg_all_unified();
2415 if (r < 0)
2416 return log_error_errno(r, "Failed to determine whether the memory controller is unified: %m");
2417 if (r == 0)
2418 return 0;
2419
2420 r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops);
2421 if (r < 0)
2422 return log_oom();
2423
2424 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events);
2425 if (r < 0)
2426 return log_oom();
2427
2428 u->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
2429 if (u->cgroup_memory_inotify_wd < 0) {
2430
2431 if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
2432 * is not an error */
2433 return 0;
2434
2435 return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path));
2436 }
2437
2438 r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u);
2439 if (r < 0)
2440 return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path));
2441
2442 return 0;
2443 }
2444
2445 int unit_pick_cgroup_path(Unit *u) {
2446 _cleanup_free_ char *path = NULL;
2447 int r;
2448
2449 assert(u);
2450
2451 if (u->cgroup_path)
2452 return 0;
2453
2454 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2455 return -EINVAL;
2456
2457 r = unit_default_cgroup_path(u, &path);
2458 if (r < 0)
2459 return log_unit_error_errno(u, r, "Failed to generate default cgroup path: %m");
2460
2461 r = unit_set_cgroup_path(u, path);
2462 if (r == -EEXIST)
2463 return log_unit_error_errno(u, r, "Control group %s exists already.", empty_to_root(path));
2464 if (r < 0)
2465 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", empty_to_root(path));
2466
2467 return 0;
2468 }
2469
2470 static int unit_update_cgroup(
2471 Unit *u,
2472 CGroupMask target_mask,
2473 CGroupMask enable_mask,
2474 ManagerState state) {
2475
2476 bool created, is_root_slice;
2477 CGroupMask migrate_mask = 0;
2478 _cleanup_free_ char *cgroup_full_path = NULL;
2479 int r;
2480
2481 assert(u);
2482
2483 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2484 return 0;
2485
2486 /* Figure out our cgroup path */
2487 r = unit_pick_cgroup_path(u);
2488 if (r < 0)
2489 return r;
2490
2491 /* First, create our own group */
2492 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
2493 if (r < 0)
2494 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(u->cgroup_path));
2495 created = r;
2496
2497 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2498 uint64_t cgroup_id = 0;
2499
2500 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_full_path);
2501 if (r == 0) {
2502 r = cg_path_get_cgroupid(cgroup_full_path, &cgroup_id);
2503 if (r < 0)
2504 log_unit_full_errno(u, ERRNO_IS_NOT_SUPPORTED(r) ? LOG_DEBUG : LOG_WARNING, r,
2505 "Failed to get cgroup ID of cgroup %s, ignoring: %m", cgroup_full_path);
2506 } else
2507 log_unit_warning_errno(u, r, "Failed to get full cgroup path on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
2508
2509 u->cgroup_id = cgroup_id;
2510 }
2511
2512 /* Start watching it */
2513 (void) unit_watch_cgroup(u);
2514 (void) unit_watch_cgroup_memory(u);
2515
2516 /* For v2 we preserve enabled controllers in delegated units, adjust others,
2517 * for v1 we figure out which controller hierarchies need migration. */
2518 if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) {
2519 CGroupMask result_mask = 0;
2520
2521 /* Enable all controllers we need */
2522 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path, &result_mask);
2523 if (r < 0)
2524 log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
2525
2526 /* Remember what's actually enabled now */
2527 u->cgroup_enabled_mask = result_mask;
2528
2529 migrate_mask = u->cgroup_realized_mask ^ target_mask;
2530 }
2531
2532 /* Keep track that this is now realized */
2533 u->cgroup_realized = true;
2534 u->cgroup_realized_mask = target_mask;
2535
2536 /* Migrate processes in controller hierarchies both downwards (enabling) and upwards (disabling).
2537 *
2538 * Unnecessary controller cgroups are trimmed (after emptied by upward migration).
2539 * We also perform migration with whole slices for cases when users don't care about leaf
2540 * granularity. Since delegated_mask is a subset of the target mask, we won't trim a slice subtree containing
2541 * delegated units.
2542 */
2543 if (cg_all_unified() == 0) {
2544 r = cg_migrate_v1_controllers(u->manager->cgroup_supported, migrate_mask, u->cgroup_path, migrate_callback, u);
2545 if (r < 0)
2546 log_unit_warning_errno(u, r, "Failed to migrate controller cgroups from %s, ignoring: %m", empty_to_root(u->cgroup_path));
2547
2548 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
2549 r = cg_trim_v1_controllers(u->manager->cgroup_supported, ~target_mask, u->cgroup_path, !is_root_slice);
2550 if (r < 0)
2551 log_unit_warning_errno(u, r, "Failed to delete controller cgroups %s, ignoring: %m", empty_to_root(u->cgroup_path));
2552 }
2553
2554 /* Set attributes */
2555 cgroup_context_apply(u, target_mask, state);
2556 cgroup_xattr_apply(u);
2557
2558 /* For most units we expect that memory monitoring is set up before the unit is started and we won't
2559 * touch it after. For PID 1 this is different though, because we couldn't possibly do that given
2560 * that PID 1 runs before init.scope is even set up. Hence, whenever init.scope is realized, let's
2561 * try to open the memory pressure interface anew. */
2562 if (unit_has_name(u, SPECIAL_INIT_SCOPE))
2563 (void) manager_setup_memory_pressure_event_source(u->manager);
2564
2565 return 0;
2566 }
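/* Illustrative example of the migration mask computed above (masks assumed): if a unit was previously
 * realized with { cpu, memory } and the new target mask is { memory, pids }, then
 * migrate_mask = realized ^ target = { cpu, pids }, i.e. exactly the v1 hierarchies where processes
 * have to be moved, either back up towards the root (cpu, being dropped) or further down into the
 * unit's cgroup (pids, being added). */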
2567
2568 static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
2569 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2570 char *pp;
2571 int r;
2572
2573 assert(u);
2574
2575 if (MANAGER_IS_SYSTEM(u->manager))
2576 return -EINVAL;
2577
2578 if (!u->manager->system_bus)
2579 return -EIO;
2580
2581 if (!u->cgroup_path)
2582 return -EINVAL;
2583
2584 /* Determine this unit's cgroup path relative to our cgroup root */
2585 pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
2586 if (!pp)
2587 return -EINVAL;
2588
2589 pp = strjoina("/", pp, suffix_path);
2590 path_simplify(pp);
2591
2592 r = bus_call_method(u->manager->system_bus,
2593 bus_systemd_mgr,
2594 "AttachProcessesToUnit",
2595 &error, NULL,
2596 "ssau",
2597 NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
2598 if (r < 0)
2599 return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
2600
2601 return 0;
2602 }
2603
2604 int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
2605 _cleanup_free_ char *joined = NULL;
2606 CGroupMask delegated_mask;
2607 const char *p;
2608 PidRef *pid;
2609 int ret, r;
2610
2611 assert(u);
2612
2613 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2614 return -EINVAL;
2615
2616 if (set_isempty(pids))
2617 return 0;
2618
2619 /* Load any custom firewall BPF programs here once, to test whether they exist and are actually loadable.
2620 * Fail early here, since later errors in the call chain from unit_realize_cgroup() to cgroup_context_apply() are ignored. */
2621 r = bpf_firewall_load_custom(u);
2622 if (r < 0)
2623 return r;
2624
2625 r = unit_realize_cgroup(u);
2626 if (r < 0)
2627 return r;
2628
2629 if (isempty(suffix_path))
2630 p = u->cgroup_path;
2631 else {
2632 joined = path_join(u->cgroup_path, suffix_path);
2633 if (!joined)
2634 return -ENOMEM;
2635
2636 p = joined;
2637 }
2638
2639 delegated_mask = unit_get_delegate_mask(u);
2640
2641 ret = 0;
2642 SET_FOREACH(pid, pids) {
2643
2644 /* Unfortunately we cannot add pids by pidfd to a cgroup. Hence we have to use PIDs instead,
2645 * which of course is racy. Let's shorten the race a bit though, and re-validate the PID
2646 * before we use it */
2647 r = pidref_verify(pid);
2648 if (r < 0) {
2649 log_unit_info_errno(u, r, "PID " PID_FMT " vanished before we could move it to target cgroup '%s', skipping: %m", pid->pid, empty_to_root(p));
2650 continue;
2651 }
2652
2653 /* First, attach the PID to the main cgroup hierarchy */
2654 r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid->pid);
2655 if (r < 0) {
2656 bool again = MANAGER_IS_USER(u->manager) && ERRNO_IS_PRIVILEGE(r);
2657
2658 log_unit_full_errno(u, again ? LOG_DEBUG : LOG_INFO, r,
2659 "Couldn't move process "PID_FMT" to%s requested cgroup '%s': %m",
2660 pid->pid, again ? " directly" : "", empty_to_root(p));
2661
2662 if (again) {
2663 int z;
2664
2665 /* If we are in a user instance, and we can't move the process ourselves due
2666 * to permission problems, let's ask the system instance about it instead.
2667 * Since it's more privileged it might be able to move the process across the
2668 * leaves of a subtree whose top node is not owned by us. */
2669
2670 z = unit_attach_pid_to_cgroup_via_bus(u, pid->pid, suffix_path);
2671 if (z < 0)
2672 log_unit_info_errno(u, z, "Couldn't move process "PID_FMT" to requested cgroup '%s' (directly or via the system bus): %m", pid->pid, empty_to_root(p));
2673 else {
2674 if (ret >= 0)
2675 ret++; /* Count successful additions */
2676 continue; /* When the bus thing worked via the bus we are fully done for this PID. */
2677 }
2678 }
2679
2680 if (ret >= 0)
2681 ret = r; /* Remember first error */
2682
2683 continue;
2684 } else if (ret >= 0)
2685 ret++; /* Count successful additions */
2686
2687 r = cg_all_unified();
2688 if (r < 0)
2689 return r;
2690 if (r > 0)
2691 continue;
2692
2693 /* In the legacy hierarchy, attach the process to the requested cgroup if possible, and if not to the
2694 * innermost realized one */
2695
2696 for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2697 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2698 const char *realized;
2699
2700 if (!(u->manager->cgroup_supported & bit))
2701 continue;
2702
2703 /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
2704 if (delegated_mask & u->cgroup_realized_mask & bit) {
2705 r = cg_attach(cgroup_controller_to_string(c), p, pid->pid);
2706 if (r >= 0)
2707 continue; /* Success! */
2708
2709 log_unit_debug_errno(u, r, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
2710 pid->pid, empty_to_root(p), cgroup_controller_to_string(c));
2711 }
2712
2713 /* So this controller is either not delegated or not realized, or something else weird happened. In
2714 * that case let's attach the PID at least to the closest cgroup up the tree that is
2715 * realized. */
2716 realized = unit_get_realized_cgroup_path(u, bit);
2717 if (!realized)
2718 continue; /* Not even realized in the root slice? Then let's not bother */
2719
2720 r = cg_attach(cgroup_controller_to_string(c), realized, pid->pid);
2721 if (r < 0)
2722 log_unit_debug_errno(u, r, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
2723 pid->pid, realized, cgroup_controller_to_string(c));
2724 }
2725 }
2726
2727 return ret;
2728 }
2729
2730 static bool unit_has_mask_realized(
2731 Unit *u,
2732 CGroupMask target_mask,
2733 CGroupMask enable_mask) {
2734
2735 assert(u);
2736
2737 /* Returns true if this unit is fully realized. We check four things:
2738 *
2739 * 1. Whether the cgroup was created at all
2740 * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1)
2741 * 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2)
2742 * 4. Whether the invalidation mask is currently zero
2743 *
2744 * If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note
2745 * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for
2746 * real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask
2747 * only matters for cgroup v1 controllers, and cgroup_enabled_mask is only used for cgroup v2, and if they
2748 * differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks which controllers are
2749 * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
2750 * simply don't matter.) */
2751
2752 return u->cgroup_realized &&
2753 ((u->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 &&
2754 ((u->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 &&
2755 u->cgroup_invalidated_mask == 0;
2756 }
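/* Worked example of the bit checks above (masks assumed): with target_mask = { cpu, memory } and
 * cgroup_realized_mask = { cpu, memory, pids }, the XOR restricted to CGROUP_MASK_V1 is non-zero
 * (pids differs), so the unit is not considered fully realized and will be re-realized; differences in
 * BPF pseudo-controller bits alone would be masked out and ignored. */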
2757
2758 static bool unit_has_mask_disables_realized(
2759 Unit *u,
2760 CGroupMask target_mask,
2761 CGroupMask enable_mask) {
2762
2763 assert(u);
2764
2765 /* Returns true if all controllers which should be disabled are indeed disabled.
2766 *
2767 * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
2768 * already removed. */
2769
2770 return !u->cgroup_realized ||
2771 (FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
2772 FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
2773 }
2774
2775 static bool unit_has_mask_enables_realized(
2776 Unit *u,
2777 CGroupMask target_mask,
2778 CGroupMask enable_mask) {
2779
2780 assert(u);
2781
2782 /* Returns true if all controllers which should be enabled are indeed enabled.
2783 *
2784 * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
2785 * we want to add is already added. */
2786
2787 return u->cgroup_realized &&
2788 ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) &&
2789 ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2);
2790 }
2791
2792 void unit_add_to_cgroup_realize_queue(Unit *u) {
2793 assert(u);
2794
2795 if (u->in_cgroup_realize_queue)
2796 return;
2797
2798 LIST_APPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
2799 u->in_cgroup_realize_queue = true;
2800 }
2801
2802 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
2803 assert(u);
2804
2805 if (!u->in_cgroup_realize_queue)
2806 return;
2807
2808 LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
2809 u->in_cgroup_realize_queue = false;
2810 }
2811
2812 /* Controllers can only be enabled breadth-first, from the root of the
2813 * hierarchy downwards to the unit in question. */
2814 static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
2815 CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
2816 Unit *slice;
2817 int r;
2818
2819 assert(u);
2820
2821 /* First go deal with this unit's parent, or we won't be able to enable
2822 * any new controllers at this layer. */
2823 slice = UNIT_GET_SLICE(u);
2824 if (slice) {
2825 r = unit_realize_cgroup_now_enable(slice, state);
2826 if (r < 0)
2827 return r;
2828 }
2829
2830 target_mask = unit_get_target_mask(u);
2831 enable_mask = unit_get_enable_mask(u);
2832
2833 /* We can only enable in this direction, don't try to disable anything.
2834 */
2835 if (unit_has_mask_enables_realized(u, target_mask, enable_mask))
2836 return 0;
2837
2838 new_target_mask = u->cgroup_realized_mask | target_mask;
2839 new_enable_mask = u->cgroup_enabled_mask | enable_mask;
2840
2841 return unit_update_cgroup(u, new_target_mask, new_enable_mask, state);
2842 }
2843
2844 /* Controllers can only be disabled depth-first, from the leaves of the
2845 * hierarchy upwards to the unit in question. */
2846 static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
2847 Unit *m;
2848
2849 assert(u);
2850
2851 if (u->type != UNIT_SLICE)
2852 return 0;
2853
2854 UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
2855 CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
2856 int r;
2857
2858 /* The cgroup for this unit might not actually be fully realised yet, in which case it isn't
2859 * holding any controllers open anyway. */
2860 if (!m->cgroup_realized)
2861 continue;
2862
2863 /* We must disable those below us first in order to release the controller. */
2864 if (m->type == UNIT_SLICE)
2865 (void) unit_realize_cgroup_now_disable(m, state);
2866
2867 target_mask = unit_get_target_mask(m);
2868 enable_mask = unit_get_enable_mask(m);
2869
2870 /* We can only disable in this direction, don't try to enable anything. */
2871 if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
2872 continue;
2873
2874 new_target_mask = m->cgroup_realized_mask & target_mask;
2875 new_enable_mask = m->cgroup_enabled_mask & enable_mask;
2876
2877 r = unit_update_cgroup(m, new_target_mask, new_enable_mask, state);
2878 if (r < 0)
2879 return r;
2880 }
2881
2882 return 0;
2883 }
2884
2885 /* Check if necessary controllers and attributes for a unit are in place.
2886 *
2887 * - If so, do nothing.
2888 * - If not, create paths, move processes over, and set attributes.
2889 *
2890 * Controllers can only be *enabled* in a breadth-first way, and *disabled* in
2891 * a depth-first way. As such the process looks like this:
2892 *
2893 * Suppose we have a cgroup hierarchy which looks like this:
2894 *
2895 * root
2896 * / \
2897 * / \
2898 * / \
2899 * a b
2900 * / \ / \
2901 * / \ / \
2902 * c d e f
2903 * / \ / \ / \ / \
2904 * h i j k l m n o
2905 *
2906 * 1. We want to realise cgroup "d" now.
2907 * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
2908 * 3. cgroup "k" just started requesting the memory controller.
2909 *
2910 * To make this work we must do the following in order:
2911 *
2912 * 1. Disable CPU controller in k, j
2913 * 2. Disable CPU controller in d
2914 * 3. Enable memory controller in root
2915 * 4. Enable memory controller in a
2916 * 5. Enable memory controller in d
2917 * 6. Enable memory controller in k
2918 *
2919 * Notice that we need to touch j in one direction, but not the other. We also
2920 * don't go beyond d when disabling -- it's up to "a" to get realized if it
2921 * wants to disable further. The basic rules are therefore:
2922 *
2923 * - If you're disabling something, you need to realise all of the cgroups from
2924 * your recursive descendants to the root. This starts from the leaves.
2925 * - If you're enabling something, you need to realise from the root cgroup
2926 * downwards, but you don't need to iterate your recursive descendants.
2927 *
2928 * Returns 0 on success and < 0 on failure. */
2929 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
2930 CGroupMask target_mask, enable_mask;
2931 Unit *slice;
2932 int r;
2933
2934 assert(u);
2935
2936 unit_remove_from_cgroup_realize_queue(u);
2937
2938 target_mask = unit_get_target_mask(u);
2939 enable_mask = unit_get_enable_mask(u);
2940
2941 if (unit_has_mask_realized(u, target_mask, enable_mask))
2942 return 0;
2943
2944 /* Disable controllers below us, if there are any */
2945 r = unit_realize_cgroup_now_disable(u, state);
2946 if (r < 0)
2947 return r;
2948
2949 /* Enable controllers above us, if there are any */
2950 slice = UNIT_GET_SLICE(u);
2951 if (slice) {
2952 r = unit_realize_cgroup_now_enable(slice, state);
2953 if (r < 0)
2954 return r;
2955 }
2956
2957 /* Now actually deal with the cgroup we were trying to realise and set attributes */
2958 r = unit_update_cgroup(u, target_mask, enable_mask, state);
2959 if (r < 0)
2960 return r;
2961
2962 /* Now, reset the invalidation mask */
2963 u->cgroup_invalidated_mask = 0;
2964 return 0;
2965 }
2966
2967 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
2968 ManagerState state;
2969 unsigned n = 0;
2970 Unit *i;
2971 int r;
2972
2973 assert(m);
2974
2975 state = manager_state(m);
2976
2977 while ((i = m->cgroup_realize_queue)) {
2978 assert(i->in_cgroup_realize_queue);
2979
2980 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
2981 /* Maybe things changed, and the unit is not actually active anymore? */
2982 unit_remove_from_cgroup_realize_queue(i);
2983 continue;
2984 }
2985
2986 r = unit_realize_cgroup_now(i, state);
2987 if (r < 0)
2988 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
2989
2990 n++;
2991 }
2992
2993 return n;
2994 }
2995
2996 void unit_add_family_to_cgroup_realize_queue(Unit *u) {
2997 assert(u);
2998 assert(u->type == UNIT_SLICE);
2999
3000 /* The family of a unit is defined as the (immediate) children of the unit and the immediate children of all
3001 * its ancestors.
3002 *
3003 * Ideally we would enqueue ancestor path only (bottom up). However, on cgroup-v1 scheduling becomes
3004 * very weird if two units that own processes reside in the same slice, but one is realized in the
3005 * "cpu" hierarchy and one is not (for example because one has CPUWeight= set and the other does
3006 * not), because that means individual processes need to be scheduled against whole cgroups. Let's
3007 * avoid this asymmetry by ensuring that siblings of a unit are always realized in their v1
3008 * controller hierarchies too (if the unit requires the controller to be realized).
3009 *
3010 * The function must invalidate cgroup_members_mask of all ancestors in order to calculate up to date
3011 * masks. */
3012
3013 do {
3014 Unit *m;
3015
3016 /* Children of u likely changed when we're called */
3017 u->cgroup_members_mask_valid = false;
3018
3019 UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
3020
3021 /* No point in doing cgroup application for units without active processes. */
3022 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
3023 continue;
3024
3025 /* We only enqueue siblings if they were realized at least once, in the main
3026 * hierarchy. */
3027 if (!m->cgroup_realized)
3028 continue;
3029
3030 /* If the unit doesn't need any new controllers and has current ones
3031 * realized, it doesn't need any changes. */
3032 if (unit_has_mask_realized(m,
3033 unit_get_target_mask(m),
3034 unit_get_enable_mask(m)))
3035 continue;
3036
3037 unit_add_to_cgroup_realize_queue(m);
3038 }
3039
3040 /* Parent comes after children */
3041 unit_add_to_cgroup_realize_queue(u);
3042
3043 u = UNIT_GET_SLICE(u);
3044 } while (u);
3045 }
3046
3047 int unit_realize_cgroup(Unit *u) {
3048 Unit *slice;
3049
3050 assert(u);
3051
3052 if (!UNIT_HAS_CGROUP_CONTEXT(u))
3053 return 0;
3054
3055 /* So, here's the deal: when realizing the cgroups for this unit, we need to first create all
3056 * parents, but there's actually more: for the weight-based controllers we also need to make sure
3057 * that all our siblings (i.e. units that are in the same slice as we are) have cgroups, too. On the
3058 * other hand, when a controller is removed from the realized set, it may become unnecessary in siblings
3059 * and ancestors and they should be (de)realized too.
3060 *
3061 * This call will defer work on the siblings and derealized ancestors to the next event loop
3062 * iteration and synchronously creates the parent cgroups (unit_realize_cgroup_now). */
3063
3064 slice = UNIT_GET_SLICE(u);
3065 if (slice)
3066 unit_add_family_to_cgroup_realize_queue(slice);
3067
3068 /* And realize this one now (and apply the values) */
3069 return unit_realize_cgroup_now(u, manager_state(u->manager));
3070 }
3071
3072 void unit_release_cgroup(Unit *u) {
3073 assert(u);
3074
3075 /* Forgets all cgroup details for this unit, but does *not* destroy the cgroup. This is hence OK to call
3076 * when we close down everything for reexecution, where we really want to leave the cgroup in place. */
3077
3078 if (u->cgroup_path) {
3079 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
3080 u->cgroup_path = mfree(u->cgroup_path);
3081 }
3082
3083 if (u->cgroup_control_inotify_wd >= 0) {
3084 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_control_inotify_wd) < 0)
3085 log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", u->cgroup_control_inotify_wd, u->id);
3086
3087 (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd));
3088 u->cgroup_control_inotify_wd = -1;
3089 }
3090
3091 if (u->cgroup_memory_inotify_wd >= 0) {
3092 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0)
3093 log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id);
3094
3095 (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd));
3096 u->cgroup_memory_inotify_wd = -1;
3097 }
3098 }
3099
3100 bool unit_maybe_release_cgroup(Unit *u) {
3101 int r;
3102
3103 assert(u);
3104
3105 if (!u->cgroup_path)
3106 return true;
3107
3108 /* Don't release the cgroup if there are still processes under it. If we get notified later when all the
3109 * processes exit (e.g. the processes were in D-state and exited after the unit was marked as failed)
3110 * we need the cgroup paths to continue to be tracked by the manager so they can be looked up and cleaned
3111 * up later. */
3112 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
3113 if (r < 0)
3114 log_unit_debug_errno(u, r, "Error checking if the cgroup is recursively empty, ignoring: %m");
3115 else if (r == 1) {
3116 unit_release_cgroup(u);
3117 return true;
3118 }
3119
3120 return false;
3121 }
3122
3123 void unit_prune_cgroup(Unit *u) {
3124 int r;
3125 bool is_root_slice;
3126
3127 assert(u);
3128
3129 /* Removes the cgroup, if empty and possible, and stops watching it. */
3130
3131 if (!u->cgroup_path)
3132 return;
3133
3134 /* Cache the last CPU and memory usage values before we destroy the cgroup */
3135 (void) unit_get_cpu_usage(u, /* ret = */ NULL);
3136
3137 for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++)
3138 (void) unit_get_memory_accounting(u, metric, /* ret = */ NULL);
3139
3140 #if BPF_FRAMEWORK
3141 (void) lsm_bpf_cleanup(u); /* Remove cgroup from the global LSM BPF map */
3142 #endif
3143
3144 unit_modify_nft_set(u, /* add = */ false);
3145
3146 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
3147
3148 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
3149 if (r < 0)
3150 /* One reason we could have failed here is that the cgroup still contains a process.
3151 * However, if the cgroup becomes removable at a later time, it might be removed when
3152 * the containing slice is stopped. So even if we failed now, this unit shouldn't assume
3153 * that the cgroup is still realized the next time it is started. Do not return early
3154 * on error, continue cleanup. */
3155 log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
3156
3157 if (is_root_slice)
3158 return;
3159
3160 if (!unit_maybe_release_cgroup(u)) /* Returns true if the cgroup was released */
3161 return;
3162
3163 u->cgroup_realized = false;
3164 u->cgroup_realized_mask = 0;
3165 u->cgroup_enabled_mask = 0;
3166
3167 u->bpf_device_control_installed = bpf_program_free(u->bpf_device_control_installed);
3168 }
3169
3170 int unit_search_main_pid(Unit *u, PidRef *ret) {
3171 _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
3172 _cleanup_fclose_ FILE *f = NULL;
3173 int r;
3174
3175 assert(u);
3176 assert(ret);
3177
3178 if (!u->cgroup_path)
3179 return -ENXIO;
3180
3181 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
3182 if (r < 0)
3183 return r;
3184
3185 for (;;) {
3186 _cleanup_(pidref_done) PidRef npidref = PIDREF_NULL;
3187
3188 r = cg_read_pidref(f, &npidref);
3189 if (r < 0)
3190 return r;
3191 if (r == 0)
3192 break;
3193
3194 if (pidref_equal(&pidref, &npidref)) /* seen already, cgroupfs reports duplicates! */
3195 continue;
3196
3197 if (pidref_is_my_child(&npidref) <= 0) /* ignore processes further down the tree */
3198 continue;
3199
3200 if (pidref_is_set(&pidref) != 0)
3201 /* Dang, there's more than one daemonized PID in this group, so we don't know what
3202 * process is the main process. */
3203 return -ENODATA;
3204
3205 pidref = TAKE_PIDREF(npidref);
3206 }
3207
3208 if (!pidref_is_set(&pidref))
3209 return -ENODATA;
3210
3211 *ret = TAKE_PIDREF(pidref);
3212 return 0;
3213 }
3214
3215 static int unit_watch_pids_in_path(Unit *u, const char *path) {
3216 _cleanup_closedir_ DIR *d = NULL;
3217 _cleanup_fclose_ FILE *f = NULL;
3218 int ret = 0, r;
3219
3220 assert(u);
3221 assert(path);
3222
3223 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
3224 if (r < 0)
3225 RET_GATHER(ret, r);
3226 else {
3227 for (;;) {
3228 _cleanup_(pidref_done) PidRef pid = PIDREF_NULL;
3229
3230 r = cg_read_pidref(f, &pid);
3231 if (r == 0)
3232 break;
3233 if (r < 0) {
3234 RET_GATHER(ret, r);
3235 break;
3236 }
3237
3238 RET_GATHER(ret, unit_watch_pidref(u, &pid, /* exclusive= */ false));
3239 }
3240 }
3241
3242 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
3243 if (r < 0)
3244 RET_GATHER(ret, r);
3245 else {
3246 for (;;) {
3247 _cleanup_free_ char *fn = NULL, *p = NULL;
3248
3249 r = cg_read_subgroup(d, &fn);
3250 if (r == 0)
3251 break;
3252 if (r < 0) {
3253 RET_GATHER(ret, r);
3254 break;
3255 }
3256
3257 p = path_join(empty_to_root(path), fn);
3258 if (!p)
3259 return -ENOMEM;
3260
3261 RET_GATHER(ret, unit_watch_pids_in_path(u, p));
3262 }
3263 }
3264
3265 return ret;
3266 }
3267
3268 int unit_synthesize_cgroup_empty_event(Unit *u) {
3269 int r;
3270
3271 assert(u);
3272
3273 /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
3274 * support for non-unified systems where notifications aren't reliable, and hence we need to take whatever we
3275 * can get as a notification source as soon as we stop having any useful PIDs to watch for. */
3276
3277 if (!u->cgroup_path)
3278 return -ENOENT;
3279
3280 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
3281 if (r < 0)
3282 return r;
3283 if (r > 0) /* On unified we have reliable notifications, and don't need this */
3284 return 0;
3285
3286 if (!set_isempty(u->pids))
3287 return 0;
3288
3289 unit_add_to_cgroup_empty_queue(u);
3290 return 0;
3291 }
3292
3293 int unit_watch_all_pids(Unit *u) {
3294 int r;
3295
3296 assert(u);
3297
3298 /* Adds all PIDs from our cgroup to the set of PIDs we
3299 * watch. This is fallback logic for cases where we do not
3300 * get reliable cgroup empty notifications: we try to use
3301 * SIGCHLD as a replacement. */
3302
3303 if (!u->cgroup_path)
3304 return -ENOENT;
3305
3306 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
3307 if (r < 0)
3308 return r;
3309 if (r > 0) /* On unified we can use proper notifications */
3310 return 0;
3311
3312 return unit_watch_pids_in_path(u, u->cgroup_path);
3313 }
3314
3315 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
3316 Manager *m = ASSERT_PTR(userdata);
3317 Unit *u;
3318 int r;
3319
3320 assert(s);
3321
3322 u = m->cgroup_empty_queue;
3323 if (!u)
3324 return 0;
3325
3326 assert(u->in_cgroup_empty_queue);
3327 u->in_cgroup_empty_queue = false;
3328 LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
3329
3330 if (m->cgroup_empty_queue) {
3331 /* More stuff queued, let's make sure we remain enabled */
3332 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
3333 if (r < 0)
3334 log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m");
3335 }
3336
3337 /* Update state based on OOM kills before we notify about cgroup empty event */
3338 (void) unit_check_oom(u);
3339 (void) unit_check_oomd_kill(u);
3340
3341 unit_add_to_gc_queue(u);
3342
3343 if (IN_SET(unit_active_state(u), UNIT_INACTIVE, UNIT_FAILED))
3344 unit_prune_cgroup(u);
3345 else if (UNIT_VTABLE(u)->notify_cgroup_empty)
3346 UNIT_VTABLE(u)->notify_cgroup_empty(u);
3347
3348 return 0;
3349 }
3350
3351 void unit_add_to_cgroup_empty_queue(Unit *u) {
3352 int r;
3353
3354 assert(u);
3355
3356 /* Note that there are four different ways cgroup empty events can reach us:
3357 *
3358 * 1. On the unified hierarchy we get an inotify event on the cgroup
3359 *
3360 * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
3361 *
3362 * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
3363 *
3364 * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
3365 * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
3366 *
3367 * Regardless of which way we got the notification, we'll verify it here, and then add it to a separate
3368 * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
3369 * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
3370 * (which might happen if the cgroup doesn't contain processes that are our own children, which is typically the
3371 * case for scope units). */
3372
3373 if (u->in_cgroup_empty_queue)
3374 return;
3375
3376 /* Let's verify that the cgroup is really empty */
3377 if (!u->cgroup_path)
3378 return;
3379
3380 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
3381 if (r < 0) {
3382 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(u->cgroup_path));
3383 return;
3384 }
3385 if (r == 0)
3386 return;
3387
3388 LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
3389 u->in_cgroup_empty_queue = true;
3390
3391 /* Trigger the defer event */
3392 r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
3393 if (r < 0)
3394 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
3395 }
3396
3397 static void unit_remove_from_cgroup_empty_queue(Unit *u) {
3398 assert(u);
3399
3400 if (!u->in_cgroup_empty_queue)
3401 return;
3402
3403 LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
3404 u->in_cgroup_empty_queue = false;
3405 }
3406
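/* Background, for reference: the "user.oomd_ooms" and "user.oomd_kill" extended attributes read below are
 * maintained by systemd-oomd on the unit's cgroup. This function only checks whether the former counter grew
 * since the last check and, if so, reports the number of killed processes taken from the latter. */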
3407 int unit_check_oomd_kill(Unit *u) {
3408 _cleanup_free_ char *value = NULL;
3409 bool increased;
3410 uint64_t n = 0;
3411 int r;
3412
3413 if (!u->cgroup_path)
3414 return 0;
3415
3416 r = cg_all_unified();
3417 if (r < 0)
3418 return log_unit_debug_errno(u, r, "Couldn't determine whether we are in all unified mode: %m");
3419 else if (r == 0)
3420 return 0;
3421
3422 r = cg_get_xattr_malloc(u->cgroup_path, "user.oomd_ooms", &value);
3423 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
3424 return r;
3425
3426 if (!isempty(value)) {
3427 r = safe_atou64(value, &n);
3428 if (r < 0)
3429 return r;
3430 }
3431
3432 increased = n > u->managed_oom_kill_last;
3433 u->managed_oom_kill_last = n;
3434
3435 if (!increased)
3436 return 0;
3437
3438 n = 0;
3439 value = mfree(value);
3440 r = cg_get_xattr_malloc(u->cgroup_path, "user.oomd_kill", &value);
3441 if (r >= 0 && !isempty(value))
3442 (void) safe_atou64(value, &n);
3443
3444 if (n > 0)
3445 log_unit_struct(u, LOG_NOTICE,
3446 "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR,
3447 LOG_UNIT_INVOCATION_ID(u),
3448 LOG_UNIT_MESSAGE(u, "systemd-oomd killed %"PRIu64" process(es) in this unit.", n),
3449 "N_PROCESSES=%" PRIu64, n);
3450 else
3451 log_unit_struct(u, LOG_NOTICE,
3452 "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR,
3453 LOG_UNIT_INVOCATION_ID(u),
3454 LOG_UNIT_MESSAGE(u, "systemd-oomd killed some process(es) in this unit."));
3455
3456 unit_notify_cgroup_oom(u, /* ManagedOOM= */ true);
3457
3458 return 1;
3459 }
3460
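/* For reference, the cgroup v2 "memory.events" file read below is a flat keyed file along the lines of
 * (values illustrative):
 *
 *     low 0
 *     high 0
 *     max 0
 *     oom 0
 *     oom_kill 2
 *
 * Only the "oom_kill" counter is extracted and compared against the value cached from the previous check. */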
3461 int unit_check_oom(Unit *u) {
3462 _cleanup_free_ char *oom_kill = NULL;
3463 bool increased;
3464 uint64_t c;
3465 int r;
3466
3467 if (!u->cgroup_path)
3468 return 0;
3469
3470 r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &oom_kill);
3471 if (IN_SET(r, -ENOENT, -ENXIO)) /* Handle gracefully if cgroup or oom_kill attribute don't exist */
3472 c = 0;
3473 else if (r < 0)
3474 return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m");
3475 else {
3476 r = safe_atou64(oom_kill, &c);
3477 if (r < 0)
3478 return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m");
3479 }
3480
3481 increased = c > u->oom_kill_last;
3482 u->oom_kill_last = c;
3483
3484 if (!increased)
3485 return 0;
3486
3487 log_unit_struct(u, LOG_NOTICE,
3488 "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR,
3489 LOG_UNIT_INVOCATION_ID(u),
3490 LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer."));
3491
3492 unit_notify_cgroup_oom(u, /* ManagedOOM= */ false);
3493
3494 return 1;
3495 }
3496
3497 static int on_cgroup_oom_event(sd_event_source *s, void *userdata) {
3498 Manager *m = ASSERT_PTR(userdata);
3499 Unit *u;
3500 int r;
3501
3502 assert(s);
3503
3504 u = m->cgroup_oom_queue;
3505 if (!u)
3506 return 0;
3507
3508 assert(u->in_cgroup_oom_queue);
3509 u->in_cgroup_oom_queue = false;
3510 LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, u);
3511
3512 if (m->cgroup_oom_queue) {
3513 /* More stuff queued, let's make sure we remain enabled */
3514 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
3515 if (r < 0)
3516 log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m");
3517 }
3518
3519 (void) unit_check_oom(u);
3520 unit_add_to_gc_queue(u);
3521
3522 return 0;
3523 }
3524
3525 static void unit_add_to_cgroup_oom_queue(Unit *u) {
3526 int r;
3527
3528 assert(u);
3529
3530 if (u->in_cgroup_oom_queue)
3531 return;
3532 if (!u->cgroup_path)
3533 return;
3534
3535 LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
3536 u->in_cgroup_oom_queue = true;
3537
3538 /* Trigger the defer event */
3539 if (!u->manager->cgroup_oom_event_source) {
3540 _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
3541
3542 r = sd_event_add_defer(u->manager->event, &s, on_cgroup_oom_event, u->manager);
3543 if (r < 0) {
3544 log_error_errno(r, "Failed to create cgroup oom event source: %m");
3545 return;
3546 }
3547
3548 r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8);
3549 if (r < 0) {
3550 log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
3551 return;
3552 }
3553
3554 (void) sd_event_source_set_description(s, "cgroup-oom");
3555 u->manager->cgroup_oom_event_source = TAKE_PTR(s);
3556 }
3557
3558 r = sd_event_source_set_enabled(u->manager->cgroup_oom_event_source, SD_EVENT_ONESHOT);
3559 if (r < 0)
3560 log_error_errno(r, "Failed to enable cgroup oom event source: %m");
3561 }
3562
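/* For reference, the cgroup v2 "cgroup.events" file read below is a flat keyed file along the lines of
 * (values illustrative):
 *
 *     populated 1
 *     frozen 0
 *
 * so values[0]/values[1] end up as the strings "1" or "0" (or NULL if a key is missing). */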
3563 static int unit_check_cgroup_events(Unit *u) {
3564 char *values[2] = {};
3565 int r;
3566
3567 assert(u);
3568
3569 if (!u->cgroup_path)
3570 return 0;
3571
3572 r = cg_get_keyed_attribute_graceful(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events",
3573 STRV_MAKE("populated", "frozen"), values);
3574 if (r < 0)
3575 return r;
3576
3577 /* The cgroup.events notifications can be merged together, so act as if we saw the given state for the
3578 * first time. The functions we call to handle a given state are idempotent, which makes them
3579 * effectively remember the previous state. */
3580 if (values[0]) {
3581 if (streq(values[0], "1"))
3582 unit_remove_from_cgroup_empty_queue(u);
3583 else
3584 unit_add_to_cgroup_empty_queue(u);
3585 }
3586
3587 /* Disregard freezer state changes due to operations not initiated by us */
3588 if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING)) {
3589 if (streq(values[1], "0"))
3590 unit_thawed(u);
3591 else
3592 unit_frozen(u);
3593 }
3594
3595 free(values[0]);
3596 free(values[1]);
3597
3598 return 0;
3599 }
3600
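/* For reference: on the unified hierarchy the manager places inotify watches on each realized unit's
 * "cgroup.events" and "memory.events" files; the watch descriptors are mapped back to units via the two
 * hashmaps consulted below, so this one handler serves both the "cgroup became empty/frozen" path and the
 * OOM notification path. */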
3601 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
3602 Manager *m = ASSERT_PTR(userdata);
3603
3604 assert(s);
3605 assert(fd >= 0);
3606
3607 for (;;) {
3608 union inotify_event_buffer buffer;
3609 ssize_t l;
3610
3611 l = read(fd, &buffer, sizeof(buffer));
3612 if (l < 0) {
3613 if (ERRNO_IS_TRANSIENT(errno))
3614 return 0;
3615
3616 return log_error_errno(errno, "Failed to read control group inotify events: %m");
3617 }
3618
3619 FOREACH_INOTIFY_EVENT_WARN(e, buffer, l) {
3620 Unit *u;
3621
3622 if (e->wd < 0)
3623 /* Queue overflow has no watch descriptor */
3624 continue;
3625
3626 if (e->mask & IN_IGNORED)
3627 /* The watch was just removed */
3628 continue;
3629
3630 /* Note that inotify might deliver events for a watch even after it was removed,
3631 * because they were queued before the removal. Let's safely ignore such events here. */
3632
3633 u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
3634 if (u)
3635 unit_check_cgroup_events(u);
3636
3637 u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd));
3638 if (u)
3639 unit_add_to_cgroup_oom_queue(u);
3640 }
3641 }
3642 }
3643
3644 static int cg_bpf_mask_supported(CGroupMask *ret) {
3645 CGroupMask mask = 0;
3646 int r;
3647
3648 /* BPF-based firewall */
3649 r = bpf_firewall_supported();
3650 if (r < 0)
3651 return r;
3652 if (r > 0)
3653 mask |= CGROUP_MASK_BPF_FIREWALL;
3654
3655 /* BPF-based device access control */
3656 r = bpf_devices_supported();
3657 if (r < 0)
3658 return r;
3659 if (r > 0)
3660 mask |= CGROUP_MASK_BPF_DEVICES;
3661
3662 /* BPF pinned prog */
3663 r = bpf_foreign_supported();
3664 if (r < 0)
3665 return r;
3666 if (r > 0)
3667 mask |= CGROUP_MASK_BPF_FOREIGN;
3668
3669 /* BPF-based bind{4|6} hooks */
3670 r = bpf_socket_bind_supported();
3671 if (r < 0)
3672 return r;
3673 if (r > 0)
3674 mask |= CGROUP_MASK_BPF_SOCKET_BIND;
3675
3676 /* BPF-based cgroup_skb/{egress|ingress} hooks */
3677 r = restrict_network_interfaces_supported();
3678 if (r < 0)
3679 return r;
3680 if (r > 0)
3681 mask |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES;
3682
3683 *ret = mask;
3684 return 0;
3685 }
3686
3687 int manager_setup_cgroup(Manager *m) {
3688 _cleanup_free_ char *path = NULL;
3689 const char *scope_path;
3690 int r, all_unified;
3691 CGroupMask mask;
3692 char *e;
3693
3694 assert(m);
3695
3696 /* 1. Determine hierarchy */
3697 m->cgroup_root = mfree(m->cgroup_root);
3698 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
3699 if (r < 0)
3700 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
3701
3702 /* Chop off the init scope, if we are already located in it */
3703 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
3704
3705 /* LEGACY: Also chop off the system slice if we are in
3706 * it. This is to support live upgrades from older systemd
3707 * versions where PID 1 was moved there. Also see
3708 * cg_get_root_path(). */
3709 if (!e && MANAGER_IS_SYSTEM(m)) {
3710 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
3711 if (!e)
3712 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
3713 }
3714 if (e)
3715 *e = 0;
3716
3717 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
3718 * easily prepend it everywhere. */
3719 delete_trailing_chars(m->cgroup_root, "/");
3720
3721 /* 2. Show data */
3722 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
3723 if (r < 0)
3724 return log_error_errno(r, "Cannot find cgroup mount point: %m");
3725
3726 r = cg_unified();
3727 if (r < 0)
3728 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
3729
3730 all_unified = cg_all_unified();
3731 if (all_unified < 0)
3732 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
3733 if (all_unified > 0)
3734 log_debug("Unified cgroup hierarchy is located at %s.", path);
3735 else {
3736 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
3737 if (r < 0)
3738 return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
3739 if (r > 0)
3740 log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
3741 else
3742 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
3743 }
3744
3745 /* 3. Allocate cgroup empty defer event source */
3746 m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
3747 r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
3748 if (r < 0)
3749 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
3750
3751 /* Schedule cgroup empty checks early, but after having processed service notification messages or
3752 * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
3753 * notification, and we have collected the metadata that the notification and SIGCHLD handling offer first. */
3754 r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
3755 if (r < 0)
3756 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
3757
3758 r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
3759 if (r < 0)
3760 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
3761
3762 (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
3763
3764 /* 4. Install notifier inotify object, or agent */
3765 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
3766
3767 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
3768
3769 m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
3770 safe_close(m->cgroup_inotify_fd);
3771
3772 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
3773 if (m->cgroup_inotify_fd < 0)
3774 return log_error_errno(errno, "Failed to create control group inotify object: %m");
3775
3776 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
3777 if (r < 0)
3778 return log_error_errno(r, "Failed to watch control group inotify object: %m");
3779
3780 /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
3781 * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
3782 * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
3783 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9);
3784 if (r < 0)
3785 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
3786
3787 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
3788
3789 } else if (MANAGER_IS_SYSTEM(m) && manager_owns_host_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) {
3790
3791 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
3792 * since it does not generate events when control groups with children run empty.) */
3793
3794 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUPS_AGENT_PATH);
3795 if (r < 0)
3796 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
3797 else if (r > 0)
3798 log_debug("Installed release agent.");
3799 else if (r == 0)
3800 log_debug("Release agent already installed.");
3801 }
3802
3803 /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
3804 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
3805 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
3806 if (r >= 0) {
3807 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
3808 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
3809 if (r < 0)
3810 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
3811
3812 /* 6. And pin it, so that it cannot be unmounted */
3813 safe_close(m->pin_cgroupfs_fd);
3814 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
3815 if (m->pin_cgroupfs_fd < 0)
3816 return log_error_errno(errno, "Failed to open pin file: %m");
3817
3818 } else if (!MANAGER_IS_TEST_RUN(m))
3819 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
3820
3821 /* 7. Always enable hierarchical support if it exists... */
3822 if (!all_unified && !MANAGER_IS_TEST_RUN(m))
3823 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
3824
3825 /* 8. Figure out which controllers are supported */
3826 r = cg_mask_supported_subtree(m->cgroup_root, &m->cgroup_supported);
3827 if (r < 0)
3828 return log_error_errno(r, "Failed to determine supported controllers: %m");
3829
3830 /* 9. Figure out which bpf-based pseudo-controllers are supported */
3831 r = cg_bpf_mask_supported(&mask);
3832 if (r < 0)
3833 return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m");
3834 m->cgroup_supported |= mask;
3835
3836 /* 10. Log which controllers are supported */
3837 for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
3838 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c),
3839 yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
3840
3841 return 0;
3842 }
3843
3844 void manager_shutdown_cgroup(Manager *m, bool delete) {
3845 assert(m);
3846
3847 /* We can't really delete the group, since we are in it. But
3848 * let's trim it. */
3849 if (delete && m->cgroup_root && !FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL))
3850 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
3851
3852 m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
3853
3854 m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit);
3855 m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit);
3856
3857 m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
3858 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
3859
3860 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
3861
3862 m->cgroup_root = mfree(m->cgroup_root);
3863 }
3864
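/* Illustrative example (hypothetical path): given "/system.slice/foo.service/sub", the lookup below first
 * tries the full path, then "/system.slice/foo.service", then "/system.slice", and finally falls back to the
 * unit of the root slice. */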
3865 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
3866 char *p;
3867 Unit *u;
3868
3869 assert(m);
3870 assert(cgroup);
3871
3872 u = hashmap_get(m->cgroup_unit, cgroup);
3873 if (u)
3874 return u;
3875
3876 p = strdupa_safe(cgroup);
3877 for (;;) {
3878 char *e;
3879
3880 e = strrchr(p, '/');
3881 if (!e || e == p)
3882 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
3883
3884 *e = 0;
3885
3886 u = hashmap_get(m->cgroup_unit, p);
3887 if (u)
3888 return u;
3889 }
3890 }
3891
3892 Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid) {
3893 _cleanup_free_ char *cgroup = NULL;
3894
3895 assert(m);
3896
3897 if (cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
3898 return NULL;
3899
3900 return manager_get_unit_by_cgroup(m, cgroup);
3901 }
3902
3903 Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid) {
3904 Unit *u, **array;
3905
3906 assert(m);
3907
3908 if (!pidref_is_set(pid))
3909 return NULL;
3910
3911 u = hashmap_get(m->watch_pids, pid);
3912 if (u)
3913 return u;
3914
3915 array = hashmap_get(m->watch_pids_more, pid);
3916 if (array)
3917 return array[0];
3918
3919 return NULL;
3920 }
3921
3922 Unit *manager_get_unit_by_pidref(Manager *m, PidRef *pid) {
3923 Unit *u;
3924
3925 assert(m);
3926
3927 /* Note that a process might be owned by multiple units; we return only one here, which is good
3928 * enough for most cases, though not strictly correct. We prefer the one reported by cgroup
3929 * membership, as that's the most relevant one as children of the process will be assigned to that
3930 * one, too, before all else. */
3931
3932 if (!pidref_is_set(pid))
3933 return NULL;
3934
3935 if (pidref_is_self(pid))
3936 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
3937 if (pid->pid == 1)
3938 return NULL;
3939
3940 u = manager_get_unit_by_pidref_cgroup(m, pid);
3941 if (u)
3942 return u;
3943
3944 u = manager_get_unit_by_pidref_watching(m, pid);
3945 if (u)
3946 return u;
3947
3948 return NULL;
3949 }
3950
3951 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
3952 assert(m);
3953
3954 if (!pid_is_valid(pid))
3955 return NULL;
3956
3957 return manager_get_unit_by_pidref(m, &PIDREF_MAKE_FROM_PID(pid));
3958 }
3959
3960 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
3961 Unit *u;
3962
3963 assert(m);
3964 assert(cgroup);
3965
3966 /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
3967 * or from the --system instance */
3968
3969 log_debug("Got cgroup empty notification for: %s", cgroup);
3970
3971 u = manager_get_unit_by_cgroup(m, cgroup);
3972 if (!u)
3973 return 0;
3974
3975 unit_add_to_cgroup_empty_queue(u);
3976 return 1;
3977 }
3978
3979 int unit_get_memory_available(Unit *u, uint64_t *ret) {
3980 uint64_t available = UINT64_MAX, current = 0;
3981
3982 assert(u);
3983 assert(ret);
3984
3985 /* If data from cgroups can be accessed, try to find out how much more memory a unit can
3986 * claim before hitting the configured cgroup limits (if any). Consider both MemoryHigh
3987 * and MemoryMax, and also any slice the unit might be nested below. */
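/* Illustrative example (hypothetical units and numbers): if foo.service has MemoryMax=1G and currently
 * uses 200M, while its parent system.slice has MemoryHigh=512M and currently uses 400M, the loop below
 * computes MIN(1G - 200M, 512M - 400M) = 112M as the amount of memory the unit may still claim. */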
3988
3989 do {
3990 uint64_t unit_available, unit_limit = UINT64_MAX;
3991 CGroupContext *unit_context;
3992
3993 /* No point in continuing if we can't go any lower */
3994 if (available == 0)
3995 break;
3996
3997 unit_context = unit_get_cgroup_context(u);
3998 if (!unit_context)
3999 return -ENODATA;
4000
4001 if (!u->cgroup_path)
4002 continue;
4003
4004 (void) unit_get_memory_current(u, &current);
4005 /* in case of error, the previous value of 'current' propagates as a lower bound */
4006
4007 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
4008 unit_limit = physical_memory();
4009 else if (unit_context->memory_max == UINT64_MAX && unit_context->memory_high == UINT64_MAX)
4010 continue;
4011 unit_limit = MIN3(unit_limit, unit_context->memory_max, unit_context->memory_high);
4012
4013 unit_available = LESS_BY(unit_limit, current);
4014 available = MIN(unit_available, available);
4015 } while ((u = UNIT_GET_SLICE(u)));
4016
4017 *ret = available;
4018
4019 return 0;
4020 }
4021
4022 int unit_get_memory_current(Unit *u, uint64_t *ret) {
4023 int r;
4024
4025 // FIXME: Merge this into unit_get_memory_accounting after support for cgroup v1 is dropped
4026
4027 assert(u);
4028 assert(ret);
4029
4030 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
4031 return -ENODATA;
4032
4033 if (!u->cgroup_path)
4034 return -ENODATA;
4035
4036 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
4037 if (unit_has_host_root_cgroup(u))
4038 return procfs_memory_get_used(ret);
4039
4040 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
4041 return -ENODATA;
4042
4043 r = cg_all_unified();
4044 if (r < 0)
4045 return r;
4046
4047 return cg_get_attribute_as_uint64("memory", u->cgroup_path, r > 0 ? "memory.current" : "memory.usage_in_bytes", ret);
4048 }
4049
4050 int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uint64_t *ret) {
4051
4052 static const char* const attributes_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX] = {
4053 [CGROUP_MEMORY_PEAK] = "memory.peak",
4054 [CGROUP_MEMORY_SWAP_CURRENT] = "memory.swap.current",
4055 [CGROUP_MEMORY_SWAP_PEAK] = "memory.swap.peak",
4056 [CGROUP_MEMORY_ZSWAP_CURRENT] = "memory.zswap.current",
4057 };
4058
4059 uint64_t bytes;
4060 bool updated = false;
4061 int r;
4062
4063 assert(u);
4064 assert(metric >= 0);
4065 assert(metric < _CGROUP_MEMORY_ACCOUNTING_METRIC_MAX);
4066
4067 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
4068 return -ENODATA;
4069
4070 if (!u->cgroup_path)
4071 /* If the cgroup is already gone, we try to find the last cached value. */
4072 goto finish;
4073
4074 /* The root cgroup doesn't expose this information. */
4075 if (unit_has_host_root_cgroup(u))
4076 return -ENODATA;
4077
4078 if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_MEMORY))
4079 return -ENODATA;
4080
4081 r = cg_all_unified();
4082 if (r < 0)
4083 return r;
4084 if (r == 0)
4085 return -ENODATA;
4086
4087 r = cg_get_attribute_as_uint64("memory", u->cgroup_path, attributes_table[metric], &bytes);
4088 if (r < 0 && r != -ENODATA)
4089 return r;
4090 updated = r >= 0;
4091
4092 finish:
4093 if (metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST) {
4094 uint64_t *last = &u->memory_accounting_last[metric];
4095
4096 if (updated)
4097 *last = bytes;
4098 else if (*last != UINT64_MAX)
4099 bytes = *last;
4100 else
4101 return -ENODATA;
4102
4103 } else if (!updated)
4104 return -ENODATA;
4105
4106 if (ret)
4107 *ret = bytes;
4108
4109 return 0;
4110 }
4111
4112 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
4113 assert(u);
4114 assert(ret);
4115
4116 if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
4117 return -ENODATA;
4118
4119 if (!u->cgroup_path)
4120 return -ENODATA;
4121
4122 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
4123 if (unit_has_host_root_cgroup(u))
4124 return procfs_tasks_get_current(ret);
4125
4126 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
4127 return -ENODATA;
4128
4129 return cg_get_attribute_as_uint64("pids", u->cgroup_path, "pids.current", ret);
4130 }
4131
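/* For reference, the cgroup v2 "cpu.stat" file read below starts with keyed lines along the lines of
 * (values illustrative):
 *
 *     usage_usec 1234567
 *     user_usec 1000000
 *     system_usec 234567
 *
 * Only "usage_usec" is extracted and converted from microseconds to nanoseconds. */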
4132 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
4133 uint64_t ns;
4134 int r;
4135
4136 assert(u);
4137 assert(ret);
4138
4139 if (!u->cgroup_path)
4140 return -ENODATA;
4141
4142 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
4143 if (unit_has_host_root_cgroup(u))
4144 return procfs_cpu_get_usage(ret);
4145
4146 /* Requisite controllers for CPU accounting are not enabled */
4147 if ((get_cpu_accounting_mask() & ~u->cgroup_realized_mask) != 0)
4148 return -ENODATA;
4149
4150 r = cg_all_unified();
4151 if (r < 0)
4152 return r;
4153 if (r > 0) {
4154 _cleanup_free_ char *val = NULL;
4155 uint64_t us;
4156
4157 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
4158 if (IN_SET(r, -ENOENT, -ENXIO))
4159 return -ENODATA;
4160 if (r < 0)
4161 return r;
4162
4163 r = safe_atou64(val, &us);
4164 if (r < 0)
4165 return r;
4166
4167 ns = us * NSEC_PER_USEC;
4168 } else
4169 return cg_get_attribute_as_uint64("cpuacct", u->cgroup_path, "cpuacct.usage", ret);
4170
4171 *ret = ns;
4172 return 0;
4173 }
4174
4175 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
4176 nsec_t ns;
4177 int r;
4178
4179 assert(u);
4180
4181 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
4182 * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
4183 * call this function with a NULL return value. */
4184
4185 if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
4186 return -ENODATA;
4187
4188 r = unit_get_cpu_usage_raw(u, &ns);
4189 if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
4190 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
4191 * cached value. */
4192
4193 if (ret)
4194 *ret = u->cpu_usage_last;
4195 return 0;
4196 }
4197 if (r < 0)
4198 return r;
4199
4200 if (ns > u->cpu_usage_base)
4201 ns -= u->cpu_usage_base;
4202 else
4203 ns = 0;
4204
4205 u->cpu_usage_last = ns;
4206 if (ret)
4207 *ret = ns;
4208
4209 return 0;
4210 }
4211
4212 int unit_get_ip_accounting(
4213 Unit *u,
4214 CGroupIPAccountingMetric metric,
4215 uint64_t *ret) {
4216
4217 uint64_t value;
4218 int fd, r;
4219
4220 assert(u);
4221 assert(metric >= 0);
4222 assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
4223 assert(ret);
4224
4225 if (!UNIT_CGROUP_BOOL(u, ip_accounting))
4226 return -ENODATA;
4227
4228 fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
4229 u->ip_accounting_ingress_map_fd :
4230 u->ip_accounting_egress_map_fd;
4231 if (fd < 0)
4232 return -ENODATA;
4233
4234 if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
4235 r = bpf_firewall_read_accounting(fd, &value, NULL);
4236 else
4237 r = bpf_firewall_read_accounting(fd, NULL, &value);
4238 if (r < 0)
4239 return r;
4240
4241 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
4242 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
4243 * ip_accounting_extra[] field, and add them in here transparently. */
4244
4245 *ret = value + u->ip_accounting_extra[metric];
4246
4247 return r;
4248 }
4249
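/* Illustrative example (hypothetical units): if foo.service sets TasksMax=4096 but is nested in
 * system.slice with TasksMax=1000 (and -.slice imposes no limit), then
 * unit_get_effective_limit(u, CGROUP_LIMIT_TASKS_MAX, ...) below returns 1000, i.e. the smallest limit
 * configured on the unit or on any of the slices it is nested in. */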
4250 static uint64_t unit_get_effective_limit_one(Unit *u, CGroupLimitType type) {
4251 CGroupContext *cc;
4252
4253 assert(u);
4254 assert(UNIT_HAS_CGROUP_CONTEXT(u));
4255
4256 cc = unit_get_cgroup_context(u);
4257 switch (type) {
4258 /* Note: on legacy/hybrid hierarchies memory_max stays CGROUP_LIMIT_MAX unless configured
4259 * explicitly. An effective value for MemoryLimit= (cgroup v1) is not implemented. */
4260 case CGROUP_LIMIT_MEMORY_MAX:
4261 return cc->memory_max;
4262 case CGROUP_LIMIT_MEMORY_HIGH:
4263 return cc->memory_high;
4264 case CGROUP_LIMIT_TASKS_MAX:
4265 return cgroup_tasks_max_resolve(&cc->tasks_max);
4266 default:
4267 assert_not_reached();
4268 }
4269 }
4270
4271 int unit_get_effective_limit(Unit *u, CGroupLimitType type, uint64_t *ret) {
4272 uint64_t infimum;
4273
4274 assert(u);
4275 assert(ret);
4276 assert(type >= 0);
4277 assert(type < _CGROUP_LIMIT_TYPE_MAX);
4278
4279 if (!UNIT_HAS_CGROUP_CONTEXT(u))
4280 return -EINVAL;
4281
4282 infimum = unit_get_effective_limit_one(u, type);
4283 for (Unit *slice = UNIT_GET_SLICE(u); slice; slice = UNIT_GET_SLICE(slice))
4284 infimum = MIN(infimum, unit_get_effective_limit_one(slice, type));
4285
4286 *ret = infimum;
4287 return 0;
4288 }
4289
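/* For reference, the cgroup v2 "io.stat" file parsed below contains one line per device, e.g.
 * (values illustrative):
 *
 *     8:0 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
 *
 * The rbytes=/wbytes=/rios=/wios= fields are summed up across all devices; dbytes=/dios= are ignored. */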
4290 static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
4291 static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
4292 [CGROUP_IO_READ_BYTES] = "rbytes=",
4293 [CGROUP_IO_WRITE_BYTES] = "wbytes=",
4294 [CGROUP_IO_READ_OPERATIONS] = "rios=",
4295 [CGROUP_IO_WRITE_OPERATIONS] = "wios=",
4296 };
4297 uint64_t acc[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {};
4298 _cleanup_free_ char *path = NULL;
4299 _cleanup_fclose_ FILE *f = NULL;
4300 int r;
4301
4302 assert(u);
4303
4304 if (!u->cgroup_path)
4305 return -ENODATA;
4306
4307 if (unit_has_host_root_cgroup(u))
4308 return -ENODATA; /* TODO: return useful data for the top-level cgroup */
4309
4310 r = cg_all_unified();
4311 if (r < 0)
4312 return r;
4313 if (r == 0) /* TODO: support cgroupv1 */
4314 return -ENODATA;
4315
4316 if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_IO))
4317 return -ENODATA;
4318
4319 r = cg_get_path("io", u->cgroup_path, "io.stat", &path);
4320 if (r < 0)
4321 return r;
4322
4323 f = fopen(path, "re");
4324 if (!f)
4325 return -errno;
4326
4327 for (;;) {
4328 _cleanup_free_ char *line = NULL;
4329 const char *p;
4330
4331 r = read_line(f, LONG_LINE_MAX, &line);
4332 if (r < 0)
4333 return r;
4334 if (r == 0)
4335 break;
4336
4337 p = line;
4338 p += strcspn(p, WHITESPACE); /* Skip over device major/minor */
4339 p += strspn(p, WHITESPACE); /* Skip over following whitespace */
4340
4341 for (;;) {
4342 _cleanup_free_ char *word = NULL;
4343
4344 r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE);
4345 if (r < 0)
4346 return r;
4347 if (r == 0)
4348 break;
4349
4350 for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
4351 const char *x;
4352
4353 x = startswith(word, field_names[i]);
4354 if (x) {
4355 uint64_t w;
4356
4357 r = safe_atou64(x, &w);
4358 if (r < 0)
4359 return r;
4360
4361 /* Sum up the stats of all devices */
4362 acc[i] += w;
4363 break;
4364 }
4365 }
4366 }
4367 }
4368
4369 memcpy(ret, acc, sizeof(acc));
4370 return 0;
4371 }
4372
4373 int unit_get_io_accounting(
4374 Unit *u,
4375 CGroupIOAccountingMetric metric,
4376 bool allow_cache,
4377 uint64_t *ret) {
4378
4379 uint64_t raw[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
4380 int r;
4381
4382 /* Retrieve an IO accounting parameter. This will subtract the counter taken when the unit was started. */
4383
4384 if (!UNIT_CGROUP_BOOL(u, io_accounting))
4385 return -ENODATA;
4386
4387 if (allow_cache && u->io_accounting_last[metric] != UINT64_MAX)
4388 goto done;
4389
4390 r = unit_get_io_accounting_raw(u, raw);
4391 if (r == -ENODATA && u->io_accounting_last[metric] != UINT64_MAX)
4392 goto done;
4393 if (r < 0)
4394 return r;
4395
4396 for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
4397 /* Saturated subtraction */
4398 if (raw[i] > u->io_accounting_base[i])
4399 u->io_accounting_last[i] = raw[i] - u->io_accounting_base[i];
4400 else
4401 u->io_accounting_last[i] = 0;
4402 }
4403
4404 done:
4405 if (ret)
4406 *ret = u->io_accounting_last[metric];
4407
4408 return 0;
4409 }
4410
4411 int unit_reset_cpu_accounting(Unit *u) {
4412 int r;
4413
4414 assert(u);
4415
4416 u->cpu_usage_last = NSEC_INFINITY;
4417
4418 r = unit_get_cpu_usage_raw(u, &u->cpu_usage_base);
4419 if (r < 0) {
4420 u->cpu_usage_base = 0;
4421 return r;
4422 }
4423
4424 return 0;
4425 }
4426
4427 void unit_reset_memory_accounting_last(Unit *u) {
4428 assert(u);
4429
4430 FOREACH_ARRAY(i, u->memory_accounting_last, ELEMENTSOF(u->memory_accounting_last))
4431 *i = UINT64_MAX;
4432 }
4433
4434 int unit_reset_ip_accounting(Unit *u) {
4435 int r = 0;
4436
4437 assert(u);
4438
4439 if (u->ip_accounting_ingress_map_fd >= 0)
4440 RET_GATHER(r, bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd));
4441
4442 if (u->ip_accounting_egress_map_fd >= 0)
4443 RET_GATHER(r, bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd));
4444
4445 zero(u->ip_accounting_extra);
4446
4447 return r;
4448 }
4449
4450 void unit_reset_io_accounting_last(Unit *u) {
4451 assert(u);
4452
4453 FOREACH_ARRAY(i, u->io_accounting_last, _CGROUP_IO_ACCOUNTING_METRIC_MAX)
4454 *i = UINT64_MAX;
4455 }
4456
4457 int unit_reset_io_accounting(Unit *u) {
4458 int r;
4459
4460 assert(u);
4461
4462 unit_reset_io_accounting_last(u);
4463
4464 r = unit_get_io_accounting_raw(u, u->io_accounting_base);
4465 if (r < 0) {
4466 zero(u->io_accounting_base);
4467 return r;
4468 }
4469
4470 return 0;
4471 }
4472
4473 int unit_reset_accounting(Unit *u) {
4474 int r = 0;
4475
4476 assert(u);
4477
4478 RET_GATHER(r, unit_reset_cpu_accounting(u));
4479 RET_GATHER(r, unit_reset_io_accounting(u));
4480 RET_GATHER(r, unit_reset_ip_accounting(u));
4481 unit_reset_memory_accounting_last(u);
4482
4483 return r;
4484 }
4485
4486 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
4487 assert(u);
4488
4489 if (!UNIT_HAS_CGROUP_CONTEXT(u))
4490 return;
4491
4492 if (m == 0)
4493 return;
4494
4495 /* always invalidate compat pairs together */
4496 if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
4497 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
4498
4499 if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
4500 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
4501
4502 if (FLAGS_SET(u->cgroup_invalidated_mask, m)) /* NOP? */
4503 return;
4504
4505 u->cgroup_invalidated_mask |= m;
4506 unit_add_to_cgroup_realize_queue(u);
4507 }
4508
4509 void unit_invalidate_cgroup_bpf(Unit *u) {
4510 assert(u);
4511
4512 if (!UNIT_HAS_CGROUP_CONTEXT(u))
4513 return;
4514
4515 if (u->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */
4516 return;
4517
4518 u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
4519 unit_add_to_cgroup_realize_queue(u);
4520
4521 /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
4522 * list of our children includes our own. */
4523 if (u->type == UNIT_SLICE) {
4524 Unit *member;
4525
4526 UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
4527 unit_invalidate_cgroup_bpf(member);
4528 }
4529 }
4530
4531 void unit_cgroup_catchup(Unit *u) {
4532 assert(u);
4533
4534 if (!UNIT_HAS_CGROUP_CONTEXT(u))
4535 return;
4536
4537 /* We dropped the inotify watch during reexec/reload, so we need to
4538 * check these as they may have changed.
4539 * Note that (currently) the kernel doesn't actually update cgroup
4540 * file modification times, so we can't just serialize and then check
4541 * the mtime for file(s) we are interested in. */
4542 (void) unit_check_cgroup_events(u);
4543 unit_add_to_cgroup_oom_queue(u);
4544 }
4545
4546 bool unit_cgroup_delegate(Unit *u) {
4547 CGroupContext *c;
4548
4549 assert(u);
4550
4551 if (!UNIT_VTABLE(u)->can_delegate)
4552 return false;
4553
4554 c = unit_get_cgroup_context(u);
4555 if (!c)
4556 return false;
4557
4558 return c->delegate;
4559 }
4560
4561 void manager_invalidate_startup_units(Manager *m) {
4562 Unit *u;
4563
4564 assert(m);
4565
4566 SET_FOREACH(u, m->startup_units)
4567 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO|CGROUP_MASK_CPUSET);
4568 }
4569
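/* Background, for reference: on cgroup v2 freezing is requested by writing "1" to the unit's
 * "cgroup.freeze" file and thawing by writing "0"; the state change is asynchronous, and completion is
 * reported through the "frozen" key of "cgroup.events", which unit_check_cgroup_events() above picks up
 * and translates into unit_frozen()/unit_thawed(). */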
4570 int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
4571 _cleanup_free_ char *path = NULL;
4572 FreezerState target, kernel = _FREEZER_STATE_INVALID;
4573 int r, ret;
4574
4575 assert(u);
4576 assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
4577
4578 if (!cg_freezer_supported())
4579 return 0;
4580
4581 /* Ignore all requests to thaw init.scope or -.slice and reject all requests to freeze them */
4582 if (unit_has_name(u, SPECIAL_ROOT_SLICE) || unit_has_name(u, SPECIAL_INIT_SCOPE))
4583 return action == FREEZER_FREEZE ? -EPERM : 0;
4584
4585 if (!u->cgroup_realized)
4586 return -EBUSY;
4587
4588 if (action == FREEZER_THAW) {
4589 Unit *slice = UNIT_GET_SLICE(u);
4590
4591 if (slice) {
4592 r = unit_cgroup_freezer_action(slice, FREEZER_THAW);
4593 if (r < 0)
4594 return log_unit_error_errno(u, r, "Failed to thaw slice %s of unit: %m", slice->id);
4595 }
4596 }
4597
4598 target = action == FREEZER_FREEZE ? FREEZER_FROZEN : FREEZER_RUNNING;
4599
4600 r = unit_freezer_state_kernel(u, &kernel);
4601 if (r < 0)
4602 log_unit_debug_errno(u, r, "Failed to obtain cgroup freezer state: %m");
4603
4604 if (target == kernel) {
4605 u->freezer_state = target;
4606 if (action == FREEZER_FREEZE)
4607 return 0;
4608 ret = 0;
4609 } else
4610 ret = 1;
4611
4612 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.freeze", &path);
4613 if (r < 0)
4614 return r;
4615
4616 log_unit_debug(u, "%s unit.", action == FREEZER_FREEZE ? "Freezing" : "Thawing");
4617
4618 if (target != kernel) {
4619 if (action == FREEZER_FREEZE)
4620 u->freezer_state = FREEZER_FREEZING;
4621 else
4622 u->freezer_state = FREEZER_THAWING;
4623 }
4624
4625 r = write_string_file(path, one_zero(action == FREEZER_FREEZE), WRITE_STRING_FILE_DISABLE_BUFFER);
4626 if (r < 0)
4627 return r;
4628
4629 return ret;
4630 }
4631
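/* For reference: cpuset attributes such as "cpuset.cpus.effective" use the kernel's CPU list format,
 * e.g. "0-3,8-11" (illustrative), which parse_cpu_set_full() below parses into a CPUSet. */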
4632 int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
4633 _cleanup_free_ char *v = NULL;
4634 int r;
4635
4636 assert(u);
4637 assert(cpus);
4638
4639 if (!u->cgroup_path)
4640 return -ENODATA;
4641
4642 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0)
4643 return -ENODATA;
4644
4645 r = cg_all_unified();
4646 if (r < 0)
4647 return r;
4648 if (r == 0)
4649 return -ENODATA;
4650
4651 r = cg_get_attribute("cpuset", u->cgroup_path, name, &v);
4652 if (r == -ENOENT)
4653 return -ENODATA;
4654 if (r < 0)
4655 return r;
4656
4657 return parse_cpu_set_full(v, cpus, false, NULL, NULL, 0, NULL);
4658 }
4659
4660 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
4661 [CGROUP_DEVICE_POLICY_AUTO] = "auto",
4662 [CGROUP_DEVICE_POLICY_CLOSED] = "closed",
4663 [CGROUP_DEVICE_POLICY_STRICT] = "strict",
4664 };
4665
4666 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
4667
4668 static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = {
4669 [FREEZER_FREEZE] = "freeze",
4670 [FREEZER_THAW] = "thaw",
4671 };
4672
4673 DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction);
4674
4675 static const char* const cgroup_pressure_watch_table[_CGROUP_PRESSURE_WATCH_MAX] = {
4676 [CGROUP_PRESSURE_WATCH_OFF] = "off",
4677 [CGROUP_PRESSURE_WATCH_AUTO] = "auto",
4678 [CGROUP_PRESSURE_WATCH_ON] = "on",
4679 [CGROUP_PRESSURE_WATCH_SKIP] = "skip",
4680 };
4681
4682 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(cgroup_pressure_watch, CGroupPressureWatch, CGROUP_PRESSURE_WATCH_ON);
4683
4684 static const char* const cgroup_ip_accounting_metric_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
4685 [CGROUP_IP_INGRESS_BYTES] = "IPIngressBytes",
4686 [CGROUP_IP_EGRESS_BYTES] = "IPEgressBytes",
4687 [CGROUP_IP_INGRESS_PACKETS] = "IPIngressPackets",
4688 [CGROUP_IP_EGRESS_PACKETS] = "IPEgressPackets",
4689 };
4690
4691 DEFINE_STRING_TABLE_LOOKUP(cgroup_ip_accounting_metric, CGroupIPAccountingMetric);
4692
4693 static const char* const cgroup_io_accounting_metric_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
4694 [CGROUP_IO_READ_BYTES] = "IOReadBytes",
4695 [CGROUP_IO_WRITE_BYTES] = "IOWriteBytes",
4696 [CGROUP_IO_READ_OPERATIONS] = "IOReadOperations",
4697 [CGROUP_IO_WRITE_OPERATIONS] = "IOWriteOperations",
4698 };
4699
4700 DEFINE_STRING_TABLE_LOOKUP(cgroup_io_accounting_metric, CGroupIOAccountingMetric);
4701
4702 static const char* const cgroup_memory_accounting_metric_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX] = {
4703 [CGROUP_MEMORY_PEAK] = "MemoryPeak",
4704 [CGROUP_MEMORY_SWAP_CURRENT] = "MemorySwapCurrent",
4705 [CGROUP_MEMORY_SWAP_PEAK] = "MemorySwapPeak",
4706 [CGROUP_MEMORY_ZSWAP_CURRENT] = "MemoryZSwapCurrent",
4707 };
4708
4709 DEFINE_STRING_TABLE_LOOKUP(cgroup_memory_accounting_metric, CGroupMemoryAccountingMetric);
4710
4711 static const char *const cgroup_limit_type_table[_CGROUP_LIMIT_TYPE_MAX] = {
4712 [CGROUP_LIMIT_MEMORY_MAX] = "EffectiveMemoryMax",
4713 [CGROUP_LIMIT_MEMORY_HIGH] = "EffectiveMemoryHigh",
4714 [CGROUP_LIMIT_TASKS_MAX] = "EffectiveTasksMax",
4715 };
4716
4717 DEFINE_STRING_TABLE_LOOKUP(cgroup_limit_type, CGroupLimitType);