/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <fcntl.h>

#include "sd-messages.h"

#include "af-list.h"
#include "alloc-util.h"
#include "blockdev-util.h"
#include "bpf-devices.h"
#include "bpf-firewall.h"
#include "bpf-foreign.h"
#include "bpf-socket-bind.h"
#include "btrfs-util.h"
#include "bus-error.h"
#include "cgroup-setup.h"
#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "io-util.h"
#include "ip-protocol-list.h"
#include "limits-util.h"
#include "nulstr-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "percent-util.h"
#include "process-util.h"
#include "procfs-util.h"
#include "restrict-ifaces.h"
#include "special.h"
#include "stat-util.h"
#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"
#include "virt.h"

#define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

/* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
 * problems we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which want to mask
 * out specific attributes from us. */
#define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING)

uint64_t tasks_max_resolve(const TasksMax *tasks_max) {
        if (tasks_max->scale == 0)
                return tasks_max->value;

        return system_tasks_max_scale(tasks_max->value, tasks_max->scale);
}

bool manager_owns_host_root_cgroup(Manager *m) {
        assert(m);

        /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
         * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there
         * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if
         * we run in any kind of container virtualization. */

        if (MANAGER_IS_USER(m))
                return false;

        if (detect_container() > 0)
                return false;

        return empty_or_root(m->cgroup_root);
}

bool unit_has_host_root_cgroup(Unit *u) {
        assert(u);

        /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
         * the manager manages the root cgroup. */

        if (!manager_owns_host_root_cgroup(u->manager))
                return false;

        return unit_has_name(u, SPECIAL_ROOT_SLICE);
}

static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) {
        int r;

        r = cg_set_attribute(controller, u->cgroup_path, attribute, value);
        if (r < 0)
                log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m",
                                    strna(attribute), empty_to_root(u->cgroup_path), (int) strcspn(value, NEWLINE), value);

        return r;
}

static void cgroup_compat_warn(void) {
        static bool cgroup_compat_warned = false;

        if (cgroup_compat_warned)
                return;

        log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
                    "See cgroup-compat debug messages for details.");

        cgroup_compat_warned = true;
}

#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults. */

        *c = (CGroupContext) {
                .cpu_weight = CGROUP_WEIGHT_INVALID,
                .startup_cpu_weight = CGROUP_WEIGHT_INVALID,
                .cpu_quota_per_sec_usec = USEC_INFINITY,
                .cpu_quota_period_usec = USEC_INFINITY,

                .cpu_shares = CGROUP_CPU_SHARES_INVALID,
                .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID,

                .memory_high = CGROUP_LIMIT_MAX,
                .memory_max = CGROUP_LIMIT_MAX,
                .memory_swap_max = CGROUP_LIMIT_MAX,

                .memory_limit = CGROUP_LIMIT_MAX,

                .io_weight = CGROUP_WEIGHT_INVALID,
                .startup_io_weight = CGROUP_WEIGHT_INVALID,

                .blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
                .startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,

                .tasks_max = TASKS_MAX_UNSET,

                .moom_swap = MANAGED_OOM_AUTO,
                .moom_mem_pressure = MANAGED_OOM_AUTO,
                .moom_preference = MANAGED_OOM_PREFERENCE_NONE,
        };
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->io_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) {
        assert(c);
        assert(l);

        LIST_REMOVE(device_latencies, c->io_device_latencies, l);
        free(l->path);
        free(l);
}

void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
        assert(c);
        assert(l);

        LIST_REMOVE(device_limits, c->io_device_limits, l);
        free(l->path);
        free(l);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p) {
        assert(c);
        assert(p);

        LIST_REMOVE(programs, c->bpf_foreign_programs, p);
        free(p->bpffs_path);
        free(p);
}

void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head) {
        assert(head);

        while (*head) {
                CGroupSocketBindItem *h = *head;
                LIST_REMOVE(socket_bind_items, *head, h);
                free(h);
        }
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->io_device_weights)
                cgroup_context_free_io_device_weight(c, c->io_device_weights);

        while (c->io_device_latencies)
                cgroup_context_free_io_device_latency(c, c->io_device_latencies);

        while (c->io_device_limits)
                cgroup_context_free_io_device_limit(c, c->io_device_limits);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);

        cgroup_context_remove_socket_bind(&c->socket_bind_allow);
        cgroup_context_remove_socket_bind(&c->socket_bind_deny);

        c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
        c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);

        c->ip_filters_ingress = strv_free(c->ip_filters_ingress);
        c->ip_filters_egress = strv_free(c->ip_filters_egress);

        while (c->bpf_foreign_programs)
                cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs);

        c->restrict_network_interfaces = set_free(c->restrict_network_interfaces);

        cpu_set_reset(&c->cpuset_cpus);
        cpu_set_reset(&c->cpuset_mems);
}

static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
        assert(u);

        if (!u->cgroup_realized)
                return -EOWNERDEAD;

        return cg_get_attribute_as_uint64("memory", u->cgroup_path, file, ret);
}

static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) {
        CGroupContext *c;
        CGroupMask m;
        const char *file;
        uint64_t unit_value;
        int r;

        /* Compare kernel memcg configuration against our internal systemd state. Unsupported (and will
         * return -ENODATA) on cgroup v1.
         *
         * Returns:
         *
         * <0: On error.
         *  0: If the kernel memory setting doesn't match our configuration.
         * >0: If the kernel memory setting matches our configuration.
         *
         * The following values are only guaranteed to be populated on return >=0:
         *
         * - ret_unit_value will contain our internal expected value for the unit, page-aligned.
         * - ret_kernel_value will contain the actual value presented by the kernel. */

        assert(u);

        r = cg_all_unified();
        if (r < 0)
                return log_debug_errno(r, "Failed to determine cgroup hierarchy version: %m");

        /* Unsupported on v1.
         *
         * We don't return ENOENT, since that could actually mask a genuine problem where somebody else has
         * silently masked the controller. */
        if (r == 0)
                return -ENODATA;

        /* The root slice doesn't have any controller files, so we can't compare anything. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return -ENODATA;

        /* It's possible to have MemoryFoo set without systemd wanting to have the memory controller enabled,
         * for example, in the case of DisableControllers= or cgroup_disable on the kernel command line. To
         * avoid specious errors in these scenarios, check that we even expect the memory controller to be
         * enabled at all. */
        m = unit_get_target_mask(u);
        if (!FLAGS_SET(m, CGROUP_MASK_MEMORY))
                return -ENODATA;

        assert_se(c = unit_get_cgroup_context(u));

        if (streq(property_name, "MemoryLow")) {
                unit_value = unit_get_ancestor_memory_low(u);
                file = "memory.low";
        } else if (streq(property_name, "MemoryMin")) {
                unit_value = unit_get_ancestor_memory_min(u);
                file = "memory.min";
        } else if (streq(property_name, "MemoryHigh")) {
                unit_value = c->memory_high;
                file = "memory.high";
        } else if (streq(property_name, "MemoryMax")) {
                unit_value = c->memory_max;
                file = "memory.max";
        } else if (streq(property_name, "MemorySwapMax")) {
                unit_value = c->memory_swap_max;
                file = "memory.swap.max";
        } else
                return -EINVAL;

        r = unit_get_kernel_memory_limit(u, file, ret_kernel_value);
        if (r < 0)
                return log_unit_debug_errno(u, r, "Failed to parse %s: %m", file);

        /* It's intended (soon) in a future kernel to not expose cgroup memory limits rounded to page
         * boundaries, but instead separate the user-exposed limit, which is whatever userspace told us, from
         * our internal page-counting. To support those future kernels, just check the value itself first
         * without any page-alignment. */
        if (*ret_kernel_value == unit_value) {
                *ret_unit_value = unit_value;
                return 1;
        }

        /* The current kernel behaviour, by comparison, is that even if you write a particular number of
         * bytes into a cgroup memory file, it always returns that number page-aligned down (since the kernel
         * internally stores cgroup limits in pages). As such, so long as it aligns properly, everything is
         * cricket. */
        if (unit_value != CGROUP_LIMIT_MAX)
                unit_value = PAGE_ALIGN_DOWN(unit_value);

        *ret_unit_value = unit_value;

        return *ret_kernel_value == *ret_unit_value;
}
355
bc0623df
CD
356#define FORMAT_CGROUP_DIFF_MAX 128
357
358static char *format_cgroup_memory_limit_comparison(char *buf, size_t l, Unit *u, const char *property_name) {
359 uint64_t kval, sval;
360 int r;
361
362 assert(u);
363 assert(buf);
364 assert(l > 0);
365
366 r = unit_compare_memory_limit(u, property_name, &sval, &kval);
367
368 /* memory.swap.max is special in that it relies on CONFIG_MEMCG_SWAP (and the default swapaccount=1).
369 * In the absence of reliably being able to detect whether memcg swap support is available or not,
370 * only complain if the error is not ENOENT. */
371 if (r > 0 || IN_SET(r, -ENODATA, -EOWNERDEAD) ||
372 (r == -ENOENT && streq(property_name, "MemorySwapMax"))) {
373 buf[0] = 0;
374 return buf;
375 }
376
377 if (r < 0) {
121ed16c 378 (void) snprintf(buf, l, " (error getting kernel value: %s)", strerror_safe(r));
bc0623df
CD
379 return buf;
380 }
381
121ed16c 382 (void) snprintf(buf, l, " (different value in kernel: %" PRIu64 ")", kval);
bc0623df
CD
383
384 return buf;
385}

void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
        _cleanup_free_ char *disable_controllers_str = NULL, *cpuset_cpus = NULL, *cpuset_mems = NULL;
        CGroupIODeviceLimit *il;
        CGroupIODeviceWeight *iw;
        CGroupIODeviceLatency *l;
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupBPFForeignProgram *p;
        CGroupDeviceAllow *a;
        CGroupContext *c;
        CGroupSocketBindItem *bi;
        IPAddressAccessItem *iaai;
        char **path;

        char cda[FORMAT_CGROUP_DIFF_MAX];
        char cdb[FORMAT_CGROUP_DIFF_MAX];
        char cdc[FORMAT_CGROUP_DIFF_MAX];
        char cdd[FORMAT_CGROUP_DIFF_MAX];
        char cde[FORMAT_CGROUP_DIFF_MAX];

        assert(u);
        assert(f);

        assert_se(c = unit_get_cgroup_context(u));

        prefix = strempty(prefix);

        (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);

        cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus);
        cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems);

        fprintf(f,
                "%sCPUAccounting: %s\n"
                "%sIOAccounting: %s\n"
                "%sBlockIOAccounting: %s\n"
                "%sMemoryAccounting: %s\n"
                "%sTasksAccounting: %s\n"
                "%sIPAccounting: %s\n"
                "%sCPUWeight: %" PRIu64 "\n"
                "%sStartupCPUWeight: %" PRIu64 "\n"
                "%sCPUShares: %" PRIu64 "\n"
                "%sStartupCPUShares: %" PRIu64 "\n"
                "%sCPUQuotaPerSecSec: %s\n"
                "%sCPUQuotaPeriodSec: %s\n"
                "%sAllowedCPUs: %s\n"
                "%sAllowedMemoryNodes: %s\n"
                "%sIOWeight: %" PRIu64 "\n"
                "%sStartupIOWeight: %" PRIu64 "\n"
                "%sBlockIOWeight: %" PRIu64 "\n"
                "%sStartupBlockIOWeight: %" PRIu64 "\n"
                "%sDefaultMemoryMin: %" PRIu64 "\n"
                "%sDefaultMemoryLow: %" PRIu64 "\n"
                "%sMemoryMin: %" PRIu64 "%s\n"
                "%sMemoryLow: %" PRIu64 "%s\n"
                "%sMemoryHigh: %" PRIu64 "%s\n"
                "%sMemoryMax: %" PRIu64 "%s\n"
                "%sMemorySwapMax: %" PRIu64 "%s\n"
                "%sMemoryLimit: %" PRIu64 "\n"
                "%sTasksMax: %" PRIu64 "\n"
                "%sDevicePolicy: %s\n"
                "%sDisableControllers: %s\n"
                "%sDelegate: %s\n"
                "%sManagedOOMSwap: %s\n"
                "%sManagedOOMMemoryPressure: %s\n"
                "%sManagedOOMMemoryPressureLimit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
                "%sManagedOOMPreference: %s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->io_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, yes_no(c->ip_accounting),
                prefix, c->cpu_weight,
                prefix, c->startup_cpu_weight,
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, FORMAT_TIMESPAN(c->cpu_quota_per_sec_usec, 1),
                prefix, FORMAT_TIMESPAN(c->cpu_quota_period_usec, 1),
                prefix, strempty(cpuset_cpus),
                prefix, strempty(cpuset_mems),
                prefix, c->io_weight,
                prefix, c->startup_io_weight,
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->default_memory_min,
                prefix, c->default_memory_low,
                prefix, c->memory_min, format_cgroup_memory_limit_comparison(cda, sizeof(cda), u, "MemoryMin"),
                prefix, c->memory_low, format_cgroup_memory_limit_comparison(cdb, sizeof(cdb), u, "MemoryLow"),
                prefix, c->memory_high, format_cgroup_memory_limit_comparison(cdc, sizeof(cdc), u, "MemoryHigh"),
                prefix, c->memory_max, format_cgroup_memory_limit_comparison(cdd, sizeof(cdd), u, "MemoryMax"),
                prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(cde, sizeof(cde), u, "MemorySwapMax"),
                prefix, c->memory_limit,
                prefix, tasks_max_resolve(&c->tasks_max),
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, strempty(disable_controllers_str),
                prefix, yes_no(c->delegate),
                prefix, managed_oom_mode_to_string(c->moom_swap),
                prefix, managed_oom_mode_to_string(c->moom_mem_pressure),
                prefix, PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(c->moom_mem_pressure_limit)),
                prefix, managed_oom_preference_to_string(c->moom_preference));

        if (c->delegate) {
                _cleanup_free_ char *t = NULL;

                (void) cg_mask_to_string(c->delegate_controllers, &t);

                fprintf(f, "%sDelegateControllers: %s\n",
                        prefix,
                        strempty(t));
        }

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow: %s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, iw, c->io_device_weights)
                fprintf(f,
                        "%sIODeviceWeight: %s %" PRIu64 "\n",
                        prefix,
                        iw->path,
                        iw->weight);

        LIST_FOREACH(device_latencies, l, c->io_device_latencies)
                fprintf(f,
                        "%sIODeviceLatencyTargetSec: %s %s\n",
                        prefix,
                        l->path,
                        FORMAT_TIMESPAN(l->target_usec, 1));

        LIST_FOREACH(device_limits, il, c->io_device_limits)
                for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                        if (il->limits[type] != cgroup_io_limit_defaults[type])
                                fprintf(f,
                                        "%s%s: %s %s\n",
                                        prefix,
                                        cgroup_io_limit_type_to_string(type),
                                        il->path,
                                        FORMAT_BYTES(il->limits[type]));

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight: %s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                if (b->rbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOReadBandwidth: %s %s\n",
                                prefix,
                                b->path,
                                FORMAT_BYTES(b->rbps));
                if (b->wbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOWriteBandwidth: %s %s\n",
                                prefix,
                                b->path,
                                FORMAT_BYTES(b->wbps));
        }

        LIST_FOREACH(items, iaai, c->ip_address_allow) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressAllow: %s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }

        LIST_FOREACH(items, iaai, c->ip_address_deny) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressDeny: %s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }

        STRV_FOREACH(path, c->ip_filters_ingress)
                fprintf(f, "%sIPIngressFilterPath: %s\n", prefix, *path);

        STRV_FOREACH(path, c->ip_filters_egress)
                fprintf(f, "%sIPEgressFilterPath: %s\n", prefix, *path);

        LIST_FOREACH(programs, p, c->bpf_foreign_programs)
                fprintf(f, "%sBPFProgram: %s:%s\n",
                        prefix, bpf_cgroup_attach_type_to_string(p->attach_type), p->bpffs_path);

        if (c->socket_bind_allow) {
                fprintf(f, "%sSocketBindAllow:", prefix);
                LIST_FOREACH(socket_bind_items, bi, c->socket_bind_allow)
                        cgroup_context_dump_socket_bind_item(bi, f);
                fputc('\n', f);
        }

        if (c->socket_bind_deny) {
                fprintf(f, "%sSocketBindDeny:", prefix);
                LIST_FOREACH(socket_bind_items, bi, c->socket_bind_deny)
                        cgroup_context_dump_socket_bind_item(bi, f);
                fputc('\n', f);
        }

        if (c->restrict_network_interfaces) {
                char *iface;
                SET_FOREACH(iface, c->restrict_network_interfaces)
                        fprintf(f, "%sRestrictNetworkInterfaces: %s\n", prefix, iface);
        }
}

void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f) {
        const char *family, *colon1, *protocol = "", *colon2 = "";

        family = strempty(af_to_ipv4_ipv6(item->address_family));
        colon1 = isempty(family) ? "" : ":";

        if (item->ip_protocol != 0) {
                protocol = ip_protocol_to_tcp_udp(item->ip_protocol);
                colon2 = ":";
        }

        if (item->nr_ports == 0)
                fprintf(f, " %s%s%s%sany", family, colon1, protocol, colon2);
        else if (item->nr_ports == 1)
                fprintf(f, " %s%s%s%s%" PRIu16, family, colon1, protocol, colon2, item->port_min);
        else {
                uint16_t port_max = item->port_min + item->nr_ports - 1;
                fprintf(f, " %s%s%s%s%" PRIu16 "-%" PRIu16, family, colon1, protocol, colon2,
                        item->port_min, port_max);
        }
}

int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode) {
        _cleanup_free_ CGroupDeviceAllow *a = NULL;
        _cleanup_free_ char *d = NULL;

        assert(c);
        assert(dev);
        assert(isempty(mode) || in_charset(mode, "rwm"));

        a = new(CGroupDeviceAllow, 1);
        if (!a)
                return -ENOMEM;

        d = strdup(dev);
        if (!d)
                return -ENOMEM;

        *a = (CGroupDeviceAllow) {
                .path = TAKE_PTR(d),
                .r = isempty(mode) || strchr(mode, 'r'),
                .w = isempty(mode) || strchr(mode, 'w'),
                .m = isempty(mode) || strchr(mode, 'm'),
        };

        LIST_PREPEND(device_allow, c->device_allow, a);
        TAKE_PTR(a);

        return 0;
}

int cgroup_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *bpffs_path) {
        CGroupBPFForeignProgram *p;
        _cleanup_free_ char *d = NULL;

        assert(c);
        assert(bpffs_path);

        if (!path_is_normalized(bpffs_path) || !path_is_absolute(bpffs_path))
                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not normalized: %m");

        d = strdup(bpffs_path);
        if (!d)
                return log_oom();

        p = new(CGroupBPFForeignProgram, 1);
        if (!p)
                return log_oom();

        *p = (CGroupBPFForeignProgram) {
                .attach_type = attach_type,
                .bpffs_path = TAKE_PTR(d),
        };

        LIST_PREPEND(programs, c->bpf_foreign_programs, TAKE_PTR(p));

        return 0;
}

#define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry)                       \
        uint64_t unit_get_ancestor_##entry(Unit *u) {                   \
                CGroupContext *c;                                       \
                                                                        \
                /* 1. Is entry set in this unit? If so, use that.       \
                 * 2. Is the default for this entry set in any          \
                 *    ancestor? If so, use that.                        \
                 * 3. Otherwise, return CGROUP_LIMIT_MIN. */            \
                                                                        \
                assert(u);                                              \
                                                                        \
                c = unit_get_cgroup_context(u);                         \
                if (c && c->entry##_set)                                \
                        return c->entry;                                \
                                                                        \
                while ((u = UNIT_GET_SLICE(u))) {                       \
                        c = unit_get_cgroup_context(u);                 \
                        if (c && c->default_##entry##_set)              \
                                return c->default_##entry;              \
                }                                                       \
                                                                        \
                /* We've reached the root, but nobody had default for   \
                 * this entry set, so set it to the kernel default. */  \
                return CGROUP_LIMIT_MIN;                                \
}

UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);

void cgroup_oomd_xattr_apply(Unit *u, const char *cgroup_path) {
        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return;

        if (c->moom_preference == MANAGED_OOM_PREFERENCE_OMIT) {
                r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_omit", "1", 1, 0);
                if (r < 0)
                        log_unit_debug_errno(u, r, "Failed to set oomd_omit flag on control group %s, ignoring: %m", empty_to_root(cgroup_path));
        }

        if (c->moom_preference == MANAGED_OOM_PREFERENCE_AVOID) {
                r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_avoid", "1", 1, 0);
                if (r < 0)
                        log_unit_debug_errno(u, r, "Failed to set oomd_avoid flag on control group %s, ignoring: %m", empty_to_root(cgroup_path));
        }

        if (c->moom_preference != MANAGED_OOM_PREFERENCE_AVOID) {
                r = cg_remove_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_avoid");
                if (r < 0 && r != -ENODATA)
                        log_unit_debug_errno(u, r, "Failed to remove oomd_avoid flag on control group %s, ignoring: %m", empty_to_root(cgroup_path));
        }

        if (c->moom_preference != MANAGED_OOM_PREFERENCE_OMIT) {
                r = cg_remove_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_omit");
                if (r < 0 && r != -ENODATA)
                        log_unit_debug_errno(u, r, "Failed to remove oomd_omit flag on control group %s, ignoring: %m", empty_to_root(cgroup_path));
        }
}

static void cgroup_xattr_apply(Unit *u) {
        int r;

        assert(u);

        if (!MANAGER_IS_SYSTEM(u->manager))
                return;

        if (!sd_id128_is_null(u->invocation_id)) {
                r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
                                 "trusted.invocation_id",
                                 SD_ID128_TO_STRING(u->invocation_id), 32,
                                 0);
                if (r < 0)
                        log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", empty_to_root(u->cgroup_path));
        }

        if (unit_cgroup_delegate(u)) {
                r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
                                 "trusted.delegate",
                                 "1", 1,
                                 0);
                if (r < 0)
                        log_unit_debug_errno(u, r, "Failed to set delegate flag on control group %s, ignoring: %m", empty_to_root(u->cgroup_path));
        } else {
                r = cg_remove_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "trusted.delegate");
                if (r < 0 && r != -ENODATA)
                        log_unit_debug_errno(u, r, "Failed to remove delegate flag on control group %s, ignoring: %m", empty_to_root(u->cgroup_path));
        }

        cgroup_oomd_xattr_apply(u, u->cgroup_path);
}

static int lookup_block_device(const char *p, dev_t *ret) {
        dev_t rdev, dev = 0;
        mode_t mode;
        int r;

        assert(p);
        assert(ret);

        r = device_path_parse_major_minor(p, &mode, &rdev);
        if (r == -ENODEV) { /* not a parsable device node, need to go to disk */
                struct stat st;

                if (stat(p, &st) < 0)
                        return log_warning_errno(errno, "Couldn't stat device '%s': %m", p);

                mode = st.st_mode;
                rdev = st.st_rdev;
                dev = st.st_dev;
        } else if (r < 0)
                return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p);

        if (S_ISCHR(mode))
                return log_warning_errno(SYNTHETIC_ERRNO(ENOTBLK),
                                         "Device node '%s' is a character device, but block device needed.", p);
        if (S_ISBLK(mode))
                *ret = rdev;
        else if (major(dev) != 0)
                *ret = dev; /* If this is not a device node then use the block device this file is stored on */
        else {
                /* If this is btrfs, getting the backing block device is a bit harder */
                r = btrfs_get_block_device(p, ret);
                if (r == -ENOTTY)
                        return log_warning_errno(SYNTHETIC_ERRNO(ENODEV),
                                                 "'%s' is not a block device node, and file system block device cannot be determined or is not local.", p);
                if (r < 0)
                        return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p);
        }

        /* If this is a LUKS/DM device, recursively try to get the originating block device */
        while (block_get_originating(*ret, ret) > 0);

        /* If this is a partition, try to get the originating block device */
        (void) block_get_whole_disk(*ret, ret);
        return 0;
}

static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
        return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
}

static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
        return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
                c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
}

static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_cpu_weight;
        else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->cpu_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->startup_cpu_shares;
        else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->cpu_shares;
        else
                return CGROUP_CPU_SHARES_DEFAULT;
}

usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
        /* kernel uses a minimum resolution of 1ms, so both period and (quota * period)
         * need to be higher than that boundary. quota is specified in USecPerSec.
         * Additionally, period must be at most max_period. */
        assert(quota > 0);

        return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period);
}

static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
        usec_t new_period;

        if (quota == USEC_INFINITY)
                /* Always use default period for infinity quota. */
                return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;

        if (period == USEC_INFINITY)
                /* Default period was requested. */
                period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;

        /* Clamp to interval [1ms, 1s] */
        new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);

        if (new_period != period) {
                log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING,
                              "Clamping CPU interval for cpu.max: period is now %s",
                              FORMAT_TIMESPAN(new_period, 1));
                u->warned_clamping_cpu_quota_period = true;
        }

        return new_period;
}

static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

        xsprintf(buf, "%" PRIu64 "\n", weight);
        (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf);
}

static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) {
        char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];

        period = cgroup_cpu_adjust_period_and_log(u, period, quota);
        if (quota != USEC_INFINITY)
                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
                         MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period);
        else
                xsprintf(buf, "max " USEC_FMT "\n", period);
        (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf);
}

static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

        xsprintf(buf, "%" PRIu64 "\n", shares);
        (void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf);
}

static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota, usec_t period) {
        char buf[DECIMAL_STR_MAX(usec_t) + 2];

        period = cgroup_cpu_adjust_period_and_log(u, period, quota);

        xsprintf(buf, USEC_FMT "\n", period);
        (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf);

        if (quota != USEC_INFINITY) {
                xsprintf(buf, USEC_FMT "\n", MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC));
                (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf);
        } else
                (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n");
}

static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
        return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
        return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
}

static void cgroup_apply_unified_cpuset(Unit *u, const CPUSet *cpus, const char *name) {
        _cleanup_free_ char *buf = NULL;

        buf = cpu_set_to_range_string(cpus);
        if (!buf) {
                log_oom();
                return;
        }

        (void) set_attribute_and_warn(u, "cpuset", name, buf);
}

static bool cgroup_context_has_io_config(CGroupContext *c) {
        return c->io_accounting ||
                c->io_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
                c->io_device_weights ||
                c->io_device_latencies ||
                c->io_device_limits;
}

static bool cgroup_context_has_blockio_config(CGroupContext *c) {
        return c->blockio_accounting ||
                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->blockio_device_weights ||
                c->blockio_device_bandwidths;
}

static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_io_weight;
        else if (c->io_weight != CGROUP_WEIGHT_INVALID)
                return c->io_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->startup_blockio_weight;
        else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->blockio_weight;
        else
                return CGROUP_BLKIO_WEIGHT_DEFAULT;
}

static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
        return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
        return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
}

static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
        (void) set_attribute_and_warn(u, "io", "io.weight", buf);
}

static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
        (void) set_attribute_and_warn(u, "blkio", "blkio.weight_device", buf);
}

static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        if (target != USEC_INFINITY)
                xsprintf(buf, "%u:%u target=%" PRIu64 "\n", major(dev), minor(dev), target);
        else
                xsprintf(buf, "%u:%u target=max\n", major(dev), minor(dev));

        (void) set_attribute_and_warn(u, "io", "io.latency", buf);
}

static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)],
             buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        dev_t dev;

        if (lookup_block_device(dev_path, &dev) < 0)
                return;

        for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                if (limits[type] != cgroup_io_limit_defaults[type])
                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
                else
                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");

        xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
        (void) set_attribute_and_warn(u, "io", "io.max", buf);
}

static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;

        if (lookup_block_device(dev_path, &dev) < 0)
                return;

        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
        (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.read_bps_device", buf);

        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
        (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.write_bps_device", buf);
}

static bool unit_has_unified_memory_config(Unit *u) {
        CGroupContext *c;

        assert(u);

        assert_se(c = unit_get_cgroup_context(u));

        return unit_get_ancestor_memory_min(u) > 0 || unit_get_ancestor_memory_low(u) > 0 ||
               c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX ||
               c->memory_swap_max != CGROUP_LIMIT_MAX;
}

static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max\n";

        if (v != CGROUP_LIMIT_MAX)
                xsprintf(buf, "%" PRIu64 "\n", v);

        (void) set_attribute_and_warn(u, "memory", file, buf);
}

static void cgroup_apply_firewall(Unit *u) {
        assert(u);

        /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */

        if (bpf_firewall_compile(u) < 0)
                return;

        (void) bpf_firewall_load_custom(u);
        (void) bpf_firewall_install(u);
}

static void cgroup_apply_socket_bind(Unit *u) {
        assert(u);

        (void) bpf_socket_bind_install(u);
}

static void cgroup_apply_restrict_network_interfaces(Unit *u) {
        assert(u);

        (void) restrict_network_interfaces_install(u);
}

static int cgroup_apply_devices(Unit *u) {
        _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
        const char *path;
        CGroupContext *c;
        CGroupDeviceAllow *a;
        CGroupDevicePolicy policy;
        int r;

        assert_se(c = unit_get_cgroup_context(u));
        assert_se(path = u->cgroup_path);

        policy = c->device_policy;

        if (cg_all_unified() > 0) {
                r = bpf_devices_cgroup_init(&prog, policy, c->device_allow);
                if (r < 0)
                        return log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");

        } else {
                /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore
                 * EINVAL here. */

                if (c->device_allow || policy != CGROUP_DEVICE_POLICY_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_unit_full_errno(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
                                            "Failed to reset devices.allow/devices.deny: %m");
        }

        bool allow_list_static = policy == CGROUP_DEVICE_POLICY_CLOSED ||
                (policy == CGROUP_DEVICE_POLICY_AUTO && c->device_allow);
        if (allow_list_static)
                (void) bpf_devices_allow_list_static(prog, path);

        bool any = allow_list_static;
        LIST_FOREACH(device_allow, a, c->device_allow) {
                char acc[4], *val;
                unsigned k = 0;

                if (a->r)
                        acc[k++] = 'r';
                if (a->w)
                        acc[k++] = 'w';
                if (a->m)
                        acc[k++] = 'm';
                if (k == 0)
                        continue;
                acc[k++] = 0;

                if (path_startswith(a->path, "/dev/"))
                        r = bpf_devices_allow_list_device(prog, path, a->path, acc);
                else if ((val = startswith(a->path, "block-")))
                        r = bpf_devices_allow_list_major(prog, path, val, 'b', acc);
                else if ((val = startswith(a->path, "char-")))
                        r = bpf_devices_allow_list_major(prog, path, val, 'c', acc);
                else {
                        log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path);
                        continue;
                }

                if (r >= 0)
                        any = true;
        }

        if (prog && !any) {
                log_unit_warning_errno(u, SYNTHETIC_ERRNO(ENODEV), "No devices matched by device filter.");

                /* The kernel verifier would reject a program we would build with the normal intro and outro
                   but no allow-listing rules (outro would contain an unreachable instruction for successful
                   return). */
                policy = CGROUP_DEVICE_POLICY_STRICT;
        }

        r = bpf_devices_apply_policy(prog, policy, any, path, &u->bpf_device_control_installed);
        if (r < 0) {
                static bool warned = false;

                log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
                               "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
                               "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
                               "(This warning is only shown for the first loaded unit using device ACL.)", u->id);

                warned = true;
        }
        return r;
}

static void set_io_weight(Unit *u, const char *controller, uint64_t weight) {
        char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
        const char *p;

        p = strjoina(controller, ".weight");
        xsprintf(buf, "default %" PRIu64 "\n", weight);
        (void) set_attribute_and_warn(u, controller, p, buf);

        /* FIXME: drop this when distro kernels properly support BFQ through "io.weight"
         * See also: https://github.com/systemd/systemd/pull/13335 and
         * https://github.com/torvalds/linux/commit/65752aef0a407e1ef17ec78a7fc31ba4e0b360f9.
         * The range is 1..1000 apparently. */
        p = strjoina(controller, ".bfq.weight");
        xsprintf(buf, "%" PRIu64 "\n", (weight + 9) / 10);
        (void) set_attribute_and_warn(u, controller, p, buf);
}

static void cgroup_apply_bpf_foreign_program(Unit *u) {
        assert(u);

        (void) bpf_foreign_install(u);
}
1225
906c06f6
DM
1226static void cgroup_context_apply(
1227 Unit *u,
1228 CGroupMask apply_mask,
906c06f6
DM
1229 ManagerState state) {
1230
f29ff115
TH
1231 const char *path;
1232 CGroupContext *c;
52fecf20 1233 bool is_host_root, is_local_root;
4ad49000
LP
1234 int r;
1235
f29ff115
TH
1236 assert(u);
1237
906c06f6 1238 /* Nothing to do? Exit early! */
17f14955 1239 if (apply_mask == 0)
4ad49000 1240 return;
8e274523 1241
52fecf20
LP
1242 /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other
1243 * attributes should only be managed for cgroups further down the tree. */
1244 is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE);
1245 is_host_root = unit_has_host_root_cgroup(u);
f3725e64
LP
1246
1247 assert_se(c = unit_get_cgroup_context(u));
1248 assert_se(path = u->cgroup_path);
1249
52fecf20 1250 if (is_local_root) /* Make sure we don't try to display messages with an empty path. */
6da13913 1251 path = "/";
01efdf13 1252
be2c0327
LP
1253 /* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container
1254 * then), and missing cgroups, i.e. EROFS and ENOENT. */
714e2e1d 1255
be2c0327
LP
1256 /* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but
1257 * setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this
1258 * level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of
4e1dfa45 1259 * containers we want to leave control of these to the container manager (and if cgroup v2 delegation is used
be2c0327
LP
1260 * we couldn't even write to them if we wanted to). */
1261 if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
8e274523 1262
b4cccbc1 1263 if (cg_all_unified() > 0) {
be2c0327 1264 uint64_t weight;
b2f8b02e 1265
be2c0327
LP
1266 if (cgroup_context_has_cpu_weight(c))
1267 weight = cgroup_context_cpu_weight(c, state);
1268 else if (cgroup_context_has_cpu_shares(c)) {
1269 uint64_t shares;
66ebf6c0 1270
be2c0327
LP
1271 shares = cgroup_context_cpu_shares(c, state);
1272 weight = cgroup_cpu_shares_to_weight(shares);
66ebf6c0 1273
be2c0327
LP
1274 log_cgroup_compat(u, "Applying [Startup]CPUShares=%" PRIu64 " as [Startup]CPUWeight=%" PRIu64 " on %s",
1275 shares, weight, path);
1276 } else
1277 weight = CGROUP_WEIGHT_DEFAULT;
66ebf6c0 1278
be2c0327 1279 cgroup_apply_unified_cpu_weight(u, weight);
10f28641 1280 cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
66ebf6c0 1281
52fecf20 1282 } else {
be2c0327 1283 uint64_t shares;
52fecf20 1284
be2c0327
LP
1285 if (cgroup_context_has_cpu_weight(c)) {
1286 uint64_t weight;
52fecf20 1287
be2c0327
LP
1288 weight = cgroup_context_cpu_weight(c, state);
1289 shares = cgroup_cpu_weight_to_shares(weight);
52fecf20 1290
be2c0327
LP
1291 log_cgroup_compat(u, "Applying [Startup]CPUWeight=%" PRIu64 " as [Startup]CPUShares=%" PRIu64 " on %s",
1292 weight, shares, path);
1293 } else if (cgroup_context_has_cpu_shares(c))
1294 shares = cgroup_context_cpu_shares(c, state);
1295 else
1296 shares = CGROUP_CPU_SHARES_DEFAULT;
66ebf6c0 1297
be2c0327 1298 cgroup_apply_legacy_cpu_shares(u, shares);
10f28641 1299 cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
66ebf6c0 1300 }
4ad49000
LP
1301 }
1302
047f5d63 1303 if ((apply_mask & CGROUP_MASK_CPUSET) && !is_local_root) {
2cea199e
ZJS
1304 cgroup_apply_unified_cpuset(u, &c->cpuset_cpus, "cpuset.cpus");
1305 cgroup_apply_unified_cpuset(u, &c->cpuset_mems, "cpuset.mems");
047f5d63
PH
1306 }
1307
4e1dfa45 1308 /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2
52fecf20
LP
1309 * controller), and in case of containers we want to leave control of these attributes to the container manager
1310 * (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). */
1311 if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) {
52fecf20
LP
1312 bool has_io, has_blockio;
1313 uint64_t weight;
13c31542 1314
52fecf20
LP
1315 has_io = cgroup_context_has_io_config(c);
1316 has_blockio = cgroup_context_has_blockio_config(c);
13c31542 1317
52fecf20
LP
1318 if (has_io)
1319 weight = cgroup_context_io_weight(c, state);
1320 else if (has_blockio) {
1321 uint64_t blkio_weight;
128fadc9 1322
52fecf20
LP
1323 blkio_weight = cgroup_context_blkio_weight(c, state);
1324 weight = cgroup_weight_blkio_to_io(blkio_weight);
128fadc9 1325
67e2ea15 1326 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64,
52fecf20
LP
1327 blkio_weight, weight);
1328 } else
1329 weight = CGROUP_WEIGHT_DEFAULT;
13c31542 1330
29eb0eef 1331 set_io_weight(u, "io", weight);
2dbc45ae 1332
52fecf20
LP
1333 if (has_io) {
1334 CGroupIODeviceLatency *latency;
1335 CGroupIODeviceLimit *limit;
1336 CGroupIODeviceWeight *w;
128fadc9 1337
52fecf20
LP
1338 LIST_FOREACH(device_weights, w, c->io_device_weights)
1339 cgroup_apply_io_device_weight(u, w->path, w->weight);
128fadc9 1340
52fecf20
LP
1341 LIST_FOREACH(device_limits, limit, c->io_device_limits)
1342 cgroup_apply_io_device_limit(u, limit->path, limit->limits);
6ae4283c 1343
52fecf20
LP
1344 LIST_FOREACH(device_latencies, latency, c->io_device_latencies)
1345 cgroup_apply_io_device_latency(u, latency->path, latency->target_usec);
6ae4283c 1346
52fecf20
LP
1347 } else if (has_blockio) {
1348 CGroupBlockIODeviceWeight *w;
1349 CGroupBlockIODeviceBandwidth *b;
13c31542 1350
52fecf20
LP
1351 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
1352 weight = cgroup_weight_blkio_to_io(w->weight);
17ae2780 1353
67e2ea15 1354 log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s",
52fecf20 1355 w->weight, weight, w->path);
538b4852 1356
52fecf20
LP
1357 cgroup_apply_io_device_weight(u, w->path, weight);
1358 }
538b4852 1359
17ae2780 1360 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
538b4852 1361 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
538b4852 1362
e8616626 1363 for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
538b4852
TH
1364 limits[type] = cgroup_io_limit_defaults[type];
1365
1366 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
1367 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
1368
67e2ea15 1369 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s",
128fadc9
TH
1370 b->rbps, b->wbps, b->path);
1371
17ae2780 1372 cgroup_apply_io_device_limit(u, b->path, limits);
538b4852 1373 }
13c31542
TH
1374 }
1375 }
1376
906c06f6 1377 if (apply_mask & CGROUP_MASK_BLKIO) {
52fecf20 1378 bool has_io, has_blockio;
4ad49000 1379
52fecf20
LP
1380 has_io = cgroup_context_has_io_config(c);
1381 has_blockio = cgroup_context_has_blockio_config(c);
1382
1383 /* Applying a 'weight' never makes sense for the host root cgroup, and for containers this should be
1384 * left to our container manager, too. */
1385 if (!is_local_root) {
64faf04c 1386 uint64_t weight;
64faf04c 1387
7d862ab8 1388 if (has_io) {
52fecf20 1389 uint64_t io_weight;
128fadc9 1390
52fecf20 1391 io_weight = cgroup_context_io_weight(c, state);
538b4852 1392 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
128fadc9 1393
67e2ea15 1394 log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64,
128fadc9 1395 io_weight, weight);
7d862ab8
TH
1396 } else if (has_blockio)
1397 weight = cgroup_context_blkio_weight(c, state);
1398 else
538b4852 1399 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
64faf04c 1400
29eb0eef 1401 set_io_weight(u, "blkio", weight);
35e7a62c 1402
7d862ab8 1403 if (has_io) {
538b4852
TH
1404 CGroupIODeviceWeight *w;
1405
128fadc9
TH
1406 LIST_FOREACH(device_weights, w, c->io_device_weights) {
1407 weight = cgroup_weight_io_to_blkio(w->weight);
1408
67e2ea15 1409 log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s",
128fadc9
TH
1410 w->weight, weight, w->path);
1411
1412 cgroup_apply_blkio_device_weight(u, w->path, weight);
1413 }
7d862ab8
TH
1414 } else if (has_blockio) {
1415 CGroupBlockIODeviceWeight *w;
1416
7d862ab8
TH
1417 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
1418 cgroup_apply_blkio_device_weight(u, w->path, w->weight);
538b4852 1419 }
4ad49000
LP
1420 }
1421
5238e957 1422 /* The bandwidth limits are something that make sense to be applied to the host's root but not container
52fecf20
LP
1423 * roots, as there we want the container manager to handle it */
1424 if (is_host_root || !is_local_root) {
1425 if (has_io) {
1426 CGroupIODeviceLimit *l;
538b4852 1427
52fecf20 1428 LIST_FOREACH(device_limits, l, c->io_device_limits) {
67e2ea15 1429 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s",
52fecf20 1430 l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
128fadc9 1431
52fecf20
LP
1432 cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]);
1433 }
1434 } else if (has_blockio) {
1435 CGroupBlockIODeviceBandwidth *b;
7d862ab8 1436
52fecf20
LP
1437 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
1438 cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps);
1439 }
d686d8a9 1440 }
8e274523
LP
1441 }
1442
be2c0327
LP
1443 /* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes'
1444 * exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we
4e1dfa45 1445 * want to leave control to the container manager (and if proper cgroup v2 delegation is used we couldn't even
be2c0327
LP
1446 * write to this if we wanted to.) */
1447 if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
efdb0237 1448
52fecf20 1449 if (cg_all_unified() > 0) {
be2c0327
LP
1450 uint64_t max, swap_max = CGROUP_LIMIT_MAX;
1451
c52db42b 1452 if (unit_has_unified_memory_config(u)) {
be2c0327
LP
1453 max = c->memory_max;
1454 swap_max = c->memory_swap_max;
1455 } else {
1456 max = c->memory_limit;
efdb0237 1457
be2c0327
LP
1458 if (max != CGROUP_LIMIT_MAX)
1459 log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max);
128fadc9 1460 }
da4d897e 1461
64fe532e 1462 cgroup_apply_unified_memory_limit(u, "memory.min", unit_get_ancestor_memory_min(u));
c52db42b 1463 cgroup_apply_unified_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u));
be2c0327
LP
1464 cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
1465 cgroup_apply_unified_memory_limit(u, "memory.max", max);
1466 cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
128fadc9 1467
afcfaa69
LP
1468 (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
1469
be2c0327
LP
1470 } else {
1471 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
1472 uint64_t val;
52fecf20 1473
c52db42b 1474 if (unit_has_unified_memory_config(u)) {
be2c0327
LP
1475 val = c->memory_max;
 1476                                 log_cgroup_compat(u, "Applying MemoryMax=%" PRIu64 " as MemoryLimit=", val);
1477 } else
1478 val = c->memory_limit;
78a4ee59 1479
be2c0327
LP
1480 if (val == CGROUP_LIMIT_MAX)
1481 strncpy(buf, "-1\n", sizeof(buf));
1482 else
1483 xsprintf(buf, "%" PRIu64 "\n", val);
1484
1485 (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf);
da4d897e 1486 }
4ad49000 1487 }
8e274523 1488
4e1dfa45 1489 /* On cgroup v2 we can apply BPF everywhere. On cgroup v1 we apply it everywhere except for the root of
52fecf20
LP
 1490          * containers, where we leave this to the container manager */
1491 if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) &&
8b139557
ZJS
1492 (is_host_root || cg_all_unified() > 0 || !is_local_root))
1493 (void) cgroup_apply_devices(u);
03a7b521 1494
00b5974f
LP
1495 if (apply_mask & CGROUP_MASK_PIDS) {
1496
52fecf20 1497 if (is_host_root) {
00b5974f
LP
1498 /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
1499 * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
 1500                          * the knobs of the root cgroup are modified we propagate this to the relevant sysctls. There's a
1501 * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
1502 * exclusive ownership of the sysctls, but we still want to honour things if the user sets
1503 * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
1504 * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
1505 * it also counts. But if the user never set a limit through us (i.e. we are the default of
1506 * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
1507 * the first time we set a limit. Note that this boolean is flushed out on manager reload,
5238e957 1508 * which is desirable so that there's an official way to release control of the sysctl from
00b5974f
LP
1509 * systemd: set the limit to unbounded and reload. */
1510
3a0f06c4 1511 if (tasks_max_isset(&c->tasks_max)) {
00b5974f 1512 u->manager->sysctl_pid_max_changed = true;
3a0f06c4 1513 r = procfs_tasks_set_limit(tasks_max_resolve(&c->tasks_max));
00b5974f
LP
1514 } else if (u->manager->sysctl_pid_max_changed)
1515 r = procfs_tasks_set_limit(TASKS_MAX);
1516 else
1517 r = 0;
00b5974f 1518 if (r < 0)
8ed6f81b
YW
1519 log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r,
1520 "Failed to write to tasks limit sysctls: %m");
52fecf20 1521 }
03a7b521 1522
52fecf20
LP
1523 /* The attribute itself is not available on the host root cgroup, and in the container case we want to
1524 * leave it for the container manager. */
1525 if (!is_local_root) {
3a0f06c4
ZJS
1526 if (tasks_max_isset(&c->tasks_max)) {
1527 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
03a7b521 1528
3a0f06c4 1529 xsprintf(buf, "%" PRIu64 "\n", tasks_max_resolve(&c->tasks_max));
293d32df 1530 (void) set_attribute_and_warn(u, "pids", "pids.max", buf);
00b5974f 1531 } else
589a5f7a 1532 (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n");
00b5974f 1533 }
03a7b521 1534 }
906c06f6 1535
17f14955 1536 if (apply_mask & CGROUP_MASK_BPF_FIREWALL)
0f2d84d2 1537 cgroup_apply_firewall(u);
506ea51b
JK
1538
1539 if (apply_mask & CGROUP_MASK_BPF_FOREIGN)
1540 cgroup_apply_bpf_foreign_program(u);
a8e5eb17
JK
1541
1542 if (apply_mask & CGROUP_MASK_BPF_SOCKET_BIND)
1543 cgroup_apply_socket_bind(u);
6f50d4f7
MV
1544
1545 if (apply_mask & CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES)
1546 cgroup_apply_restrict_network_interfaces(u);
fb385181
LP
1547}
1548
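/* Illustrative sketch, not part of systemd: on the unified hierarchy the memory branch of
 * cgroup_context_apply() above ultimately writes either a decimal byte count or the literal
 * string "max" into attribute files such as "memory.max" below the unit's cgroup directory.
 * The helper below is hypothetical (its name, buffer sizes and error handling are assumptions)
 * and only demonstrates that documented kernel interface with plain stdio. */
#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

static int example_write_memory_max(const char *cgroup_dir, uint64_t limit, bool unlimited) {
        char path[PATH_MAX];
        FILE *f;

        snprintf(path, sizeof(path), "%s/memory.max", cgroup_dir);

        f = fopen(path, "we"); /* "e" = O_CLOEXEC (glibc extension) */
        if (!f)
                return -errno;

        if (unlimited)
                fputs("max\n", f);                      /* "no limit" is spelled "max" on cgroup v2 */
        else
                fprintf(f, "%" PRIu64 "\n", limit);     /* otherwise a plain byte count */

        return fclose(f) == 0 ? 0 : -errno;             /* the kernel rejects bad values on write */
}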
16492445
LP
1549static bool unit_get_needs_bpf_firewall(Unit *u) {
1550 CGroupContext *c;
16492445
LP
1551 assert(u);
1552
1553 c = unit_get_cgroup_context(u);
1554 if (!c)
1555 return false;
1556
1557 if (c->ip_accounting ||
1558 c->ip_address_allow ||
fab34748
KL
1559 c->ip_address_deny ||
1560 c->ip_filters_ingress ||
1561 c->ip_filters_egress)
16492445
LP
1562 return true;
1563
1564 /* If any parent slice has an IP access list defined, it applies too */
e8616626 1565 for (Unit *p = UNIT_GET_SLICE(u); p; p = UNIT_GET_SLICE(p)) {
16492445
LP
1566 c = unit_get_cgroup_context(p);
1567 if (!c)
1568 return false;
1569
1570 if (c->ip_address_allow ||
1571 c->ip_address_deny)
1572 return true;
1573 }
1574
1575 return false;
1576}
1577
506ea51b
JK
1578static bool unit_get_needs_bpf_foreign_program(Unit *u) {
1579 CGroupContext *c;
1580 assert(u);
1581
1582 c = unit_get_cgroup_context(u);
1583 if (!c)
1584 return false;
1585
1586 return !LIST_IS_EMPTY(c->bpf_foreign_programs);
1587}
1588
a8e5eb17
JK
1589static bool unit_get_needs_socket_bind(Unit *u) {
1590 CGroupContext *c;
1591 assert(u);
1592
1593 c = unit_get_cgroup_context(u);
1594 if (!c)
1595 return false;
1596
11ab01e4 1597 return c->socket_bind_allow || c->socket_bind_deny;
a8e5eb17
JK
1598}
1599
6f50d4f7
MV
1600static bool unit_get_needs_restrict_network_interfaces(Unit *u) {
1601 CGroupContext *c;
1602 assert(u);
1603
1604 c = unit_get_cgroup_context(u);
1605 if (!c)
1606 return false;
1607
1608 return !set_isempty(c->restrict_network_interfaces);
1609}
1610
c52db42b 1611static CGroupMask unit_get_cgroup_mask(Unit *u) {
efdb0237 1612 CGroupMask mask = 0;
c52db42b
CD
1613 CGroupContext *c;
1614
1615 assert(u);
1616
806a9362 1617 assert_se(c = unit_get_cgroup_context(u));
c710d3b4 1618
fae9bc29 1619 /* Figure out which controllers we need, based on the cgroup context object */
8e274523 1620
fae9bc29 1621 if (c->cpu_accounting)
f98c2585 1622 mask |= get_cpu_accounting_mask();
fae9bc29
LP
1623
1624 if (cgroup_context_has_cpu_weight(c) ||
66ebf6c0 1625 cgroup_context_has_cpu_shares(c) ||
3a43da28 1626 c->cpu_quota_per_sec_usec != USEC_INFINITY)
fae9bc29 1627 mask |= CGROUP_MASK_CPU;
ecedd90f 1628
047f5d63
PH
1629 if (c->cpuset_cpus.set || c->cpuset_mems.set)
1630 mask |= CGROUP_MASK_CPUSET;
1631
538b4852
TH
1632 if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1633 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
ecedd90f 1634
4ad49000 1635 if (c->memory_accounting ||
da4d897e 1636 c->memory_limit != CGROUP_LIMIT_MAX ||
c52db42b 1637 unit_has_unified_memory_config(u))
efdb0237 1638 mask |= CGROUP_MASK_MEMORY;
8e274523 1639
a931ad47 1640 if (c->device_allow ||
084870f9 1641 c->device_policy != CGROUP_DEVICE_POLICY_AUTO)
084c7007 1642 mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;
4ad49000 1643
03a7b521 1644 if (c->tasks_accounting ||
3a0f06c4 1645 tasks_max_isset(&c->tasks_max))
03a7b521
LP
1646 mask |= CGROUP_MASK_PIDS;
1647
fae9bc29 1648 return CGROUP_MASK_EXTEND_JOINED(mask);
8e274523
LP
1649}
1650
53aea74a 1651static CGroupMask unit_get_bpf_mask(Unit *u) {
17f14955
RG
1652 CGroupMask mask = 0;
1653
fae9bc29
LP
1654 /* Figure out which controllers we need, based on the cgroup context, possibly taking into account children
1655 * too. */
1656
17f14955
RG
1657 if (unit_get_needs_bpf_firewall(u))
1658 mask |= CGROUP_MASK_BPF_FIREWALL;
1659
506ea51b
JK
1660 if (unit_get_needs_bpf_foreign_program(u))
1661 mask |= CGROUP_MASK_BPF_FOREIGN;
1662
a8e5eb17
JK
1663 if (unit_get_needs_socket_bind(u))
1664 mask |= CGROUP_MASK_BPF_SOCKET_BIND;
1665
6f50d4f7
MV
1666 if (unit_get_needs_restrict_network_interfaces(u))
1667 mask |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES;
1668
17f14955
RG
1669 return mask;
1670}
1671
efdb0237 1672CGroupMask unit_get_own_mask(Unit *u) {
4ad49000 1673 CGroupContext *c;
8e274523 1674
442ce775
LP
1675 /* Returns the mask of controllers the unit needs for itself. If a unit is not properly loaded, return an empty
1676 * mask, as we shouldn't reflect it in the cgroup hierarchy then. */
1677
1678 if (u->load_state != UNIT_LOADED)
1679 return 0;
efdb0237 1680
4ad49000
LP
1681 c = unit_get_cgroup_context(u);
1682 if (!c)
1683 return 0;
8e274523 1684
12b975e0 1685 return unit_get_cgroup_mask(u) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u);
02638280
LP
1686}
1687
1688CGroupMask unit_get_delegate_mask(Unit *u) {
1689 CGroupContext *c;
1690
1691 /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1692 * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
19af675e 1693 *
02638280 1694 * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
a931ad47 1695
1d9cc876 1696 if (!unit_cgroup_delegate(u))
02638280
LP
1697 return 0;
1698
1699 if (cg_all_unified() <= 0) {
a931ad47
LP
1700 ExecContext *e;
1701
1702 e = unit_get_exec_context(u);
02638280
LP
1703 if (e && !exec_context_maintains_privileges(e))
1704 return 0;
a931ad47
LP
1705 }
1706
1d9cc876 1707 assert_se(c = unit_get_cgroup_context(u));
fae9bc29 1708 return CGROUP_MASK_EXTEND_JOINED(c->delegate_controllers);
8e274523
LP
1709}
1710
d9ef5944
MK
1711static CGroupMask unit_get_subtree_mask(Unit *u) {
1712
 1713         /* Returns the mask of this subtree, i.e. of the cgroup
1714 * itself and its children. */
1715
1716 return unit_get_own_mask(u) | unit_get_members_mask(u);
1717}
1718
efdb0237 1719CGroupMask unit_get_members_mask(Unit *u) {
4ad49000 1720 assert(u);
bc432dc7 1721
02638280 1722 /* Returns the mask of controllers all of the unit's children require, merged */
efdb0237 1723
bc432dc7 1724 if (u->cgroup_members_mask_valid)
26a17ca2 1725 return u->cgroup_members_mask; /* Use cached value if possible */
bc432dc7 1726
64e844e5 1727 u->cgroup_members_mask = 0;
bc432dc7
LP
1728
1729 if (u->type == UNIT_SLICE) {
1730 Unit *member;
bc432dc7 1731
d219a2b0 1732 UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
15ed3c3a 1733 u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
bc432dc7
LP
1734 }
1735
1736 u->cgroup_members_mask_valid = true;
6414b7c9 1737 return u->cgroup_members_mask;
246aa6dd
LP
1738}
1739
efdb0237 1740CGroupMask unit_get_siblings_mask(Unit *u) {
12f64221 1741 Unit *slice;
4ad49000 1742 assert(u);
246aa6dd 1743
efdb0237
LP
1744 /* Returns the mask of controllers all of the unit's siblings
1745 * require, i.e. the members mask of the unit's parent slice
1746 * if there is one. */
1747
12f64221
LP
1748 slice = UNIT_GET_SLICE(u);
1749 if (slice)
1750 return unit_get_members_mask(slice);
4ad49000 1751
64e844e5 1752 return unit_get_subtree_mask(u); /* we are the top-level slice */
246aa6dd
LP
1753}
1754
d9ef5944 1755static CGroupMask unit_get_disable_mask(Unit *u) {
4f6f62e4
CD
1756 CGroupContext *c;
1757
1758 c = unit_get_cgroup_context(u);
1759 if (!c)
1760 return 0;
1761
1762 return c->disable_controllers;
1763}
1764
1765CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
1766 CGroupMask mask;
12f64221 1767 Unit *slice;
4f6f62e4
CD
1768
1769 assert(u);
1770 mask = unit_get_disable_mask(u);
1771
1772 /* Returns the mask of controllers which are marked as forcibly
1773 * disabled in any ancestor unit or the unit in question. */
1774
12f64221
LP
1775 slice = UNIT_GET_SLICE(u);
1776 if (slice)
1777 mask |= unit_get_ancestor_disable_mask(slice);
4f6f62e4
CD
1778
1779 return mask;
1780}
1781
efdb0237 1782CGroupMask unit_get_target_mask(Unit *u) {
a437c5e4 1783 CGroupMask own_mask, mask;
efdb0237 1784
a437c5e4
LP
1785 /* This returns the cgroup mask of all controllers to enable for a specific cgroup, i.e. everything
1786 * it needs itself, plus all that its children need, plus all that its siblings need. This is
1787 * primarily useful on the legacy cgroup hierarchy, where we need to duplicate each cgroup in each
efdb0237 1788 * hierarchy that shall be enabled for it. */
6414b7c9 1789
a437c5e4 1790 own_mask = unit_get_own_mask(u);
84d2744b 1791
a437c5e4 1792 if (own_mask & CGROUP_MASK_BPF_FIREWALL & ~u->manager->cgroup_supported)
84d2744b
ZJS
1793 emit_bpf_firewall_warning(u);
1794
a437c5e4
LP
1795 mask = own_mask | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1796
efdb0237 1797 mask &= u->manager->cgroup_supported;
c72703e2 1798 mask &= ~unit_get_ancestor_disable_mask(u);
efdb0237
LP
1799
1800 return mask;
1801}
1802
1803CGroupMask unit_get_enable_mask(Unit *u) {
1804 CGroupMask mask;
1805
1806 /* This returns the cgroup mask of all controllers to enable
1807 * for the children of a specific cgroup. This is primarily
1808 * useful for the unified cgroup hierarchy, where each cgroup
1809 * controls which controllers are enabled for its children. */
1810
1811 mask = unit_get_members_mask(u);
6414b7c9 1812 mask &= u->manager->cgroup_supported;
c72703e2 1813 mask &= ~unit_get_ancestor_disable_mask(u);
6414b7c9
DS
1814
1815 return mask;
1816}
1817
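/* Worked example (illustrative only, all values assumed): for a leaf service unit that needs
 * CGROUP_MASK_MEMORY for itself, whose siblings collectively need CGROUP_MASK_CPU, on a manager
 * that supports both, below an ancestor slice with DisableControllers=cpu, the two functions
 * above compute roughly the following. The #if 0 block is a sketch, not compiled code. */
#if 0
CGroupMask own      = CGROUP_MASK_MEMORY;                 /* unit_get_own_mask() */
CGroupMask members  = 0;                                  /* leaf unit, no children */
CGroupMask siblings = CGROUP_MASK_CPU;                    /* members mask of the parent slice */
CGroupMask disabled = CGROUP_MASK_CPU;                    /* ancestor's DisableControllers= */

CGroupMask target = (own | members | siblings) & supported & ~disabled;
/* -> CGROUP_MASK_MEMORY: the cpu controller is requested by a sibling but masked out again
 *    by the ancestor's disable mask, exactly as unit_get_target_mask() does above. */
#endif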
5af88058 1818void unit_invalidate_cgroup_members_masks(Unit *u) {
12f64221
LP
1819 Unit *slice;
1820
bc432dc7
LP
1821 assert(u);
1822
5af88058
LP
 1823         /* Recursively invalidate the member masks cache all the way up the tree */
1824 u->cgroup_members_mask_valid = false;
bc432dc7 1825
12f64221
LP
1826 slice = UNIT_GET_SLICE(u);
1827 if (slice)
1828 unit_invalidate_cgroup_members_masks(slice);
6414b7c9
DS
1829}
1830
6592b975 1831const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
03b90d4b 1832
6592b975 1833 /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */
03b90d4b
LP
1834
1835 while (u) {
6592b975 1836
03b90d4b
LP
1837 if (u->cgroup_path &&
1838 u->cgroup_realized &&
d94a24ca 1839 FLAGS_SET(u->cgroup_realized_mask, mask))
03b90d4b
LP
1840 return u->cgroup_path;
1841
12f64221 1842 u = UNIT_GET_SLICE(u);
03b90d4b
LP
1843 }
1844
1845 return NULL;
1846}
1847
6592b975 1848static const char *migrate_callback(CGroupMask mask, void *userdata) {
7b639614
MK
1849 /* If not realized at all, migrate to root ("").
 1850          * It may happen if we're upgrading from an older version that didn't clean up.
1851 */
1852 return strempty(unit_get_realized_cgroup_path(userdata, mask));
6592b975
LP
1853}
1854
303ee601 1855char *unit_default_cgroup_path(const Unit *u) {
12f64221
LP
1856 _cleanup_free_ char *escaped = NULL, *slice_path = NULL;
1857 Unit *slice;
efdb0237
LP
1858 int r;
1859
1860 assert(u);
1861
1862 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1863 return strdup(u->manager->cgroup_root);
1864
12f64221
LP
1865 slice = UNIT_GET_SLICE(u);
1866 if (slice && !unit_has_name(slice, SPECIAL_ROOT_SLICE)) {
1867 r = cg_slice_to_path(slice->id, &slice_path);
efdb0237
LP
1868 if (r < 0)
1869 return NULL;
1870 }
1871
1872 escaped = cg_escape(u->id);
1873 if (!escaped)
1874 return NULL;
1875
12f64221 1876 return path_join(empty_to_root(u->manager->cgroup_root), slice_path, escaped);
efdb0237
LP
1877}
1878
1879int unit_set_cgroup_path(Unit *u, const char *path) {
1880 _cleanup_free_ char *p = NULL;
1881 int r;
1882
1883 assert(u);
1884
5210387e
LP
1885 if (streq_ptr(u->cgroup_path, path))
1886 return 0;
1887
efdb0237
LP
1888 if (path) {
1889 p = strdup(path);
1890 if (!p)
1891 return -ENOMEM;
5210387e 1892 }
efdb0237
LP
1893
1894 if (p) {
1895 r = hashmap_put(u->manager->cgroup_unit, p, u);
1896 if (r < 0)
1897 return r;
1898 }
1899
1900 unit_release_cgroup(u);
ae2a15bc 1901 u->cgroup_path = TAKE_PTR(p);
efdb0237
LP
1902
1903 return 1;
1904}
1905
1906int unit_watch_cgroup(Unit *u) {
ab2c3861 1907 _cleanup_free_ char *events = NULL;
efdb0237
LP
1908 int r;
1909
1910 assert(u);
1911
0bb814c2
LP
1912 /* Watches the "cgroups.events" attribute of this unit's cgroup for "empty" events, but only if
1913 * cgroupv2 is available. */
1914
efdb0237
LP
1915 if (!u->cgroup_path)
1916 return 0;
1917
0bb814c2 1918 if (u->cgroup_control_inotify_wd >= 0)
efdb0237
LP
1919 return 0;
1920
1921 /* Only applies to the unified hierarchy */
c22800e4 1922 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
b4cccbc1
LP
1923 if (r < 0)
1924 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1925 if (r == 0)
efdb0237
LP
1926 return 0;
1927
0bb814c2 1928         /* No point in watching the top-level slice, it's never going to run empty. */
efdb0237
LP
1929 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1930 return 0;
1931
0bb814c2 1932 r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops);
efdb0237
LP
1933 if (r < 0)
1934 return log_oom();
1935
ab2c3861 1936 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
efdb0237
LP
1937 if (r < 0)
1938 return log_oom();
1939
0bb814c2
LP
1940 u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1941 if (u->cgroup_control_inotify_wd < 0) {
efdb0237 1942
0bb814c2
LP
1943 if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
1944 * is not an error */
efdb0237
LP
1945 return 0;
1946
6178e2f8 1947 return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path));
efdb0237
LP
1948 }
1949
0bb814c2 1950 r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u);
efdb0237 1951 if (r < 0)
6178e2f8 1952 return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path));
efdb0237
LP
1953
1954 return 0;
1955}
1956
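/* Illustrative sketch, not part of systemd: the notification source used by unit_watch_cgroup()
 * above is an IN_MODIFY inotify watch on the v2 "cgroup.events" file; whenever the kernel flips
 * the "populated" (or "frozen") key it modifies that file and the watch fires. The hypothetical
 * helper below shows just that kernel interface; its name and error handling are assumptions. */
#include <limits.h>
#include <stdio.h>
#include <sys/inotify.h>

static int example_watch_cgroup_events(int inotify_fd, const char *cgroup_dir) {
        char path[PATH_MAX];

        snprintf(path, sizeof(path), "%s/cgroup.events", cgroup_dir);

        /* Returns a watch descriptor >= 0, or -1 with errno set (e.g. ENOENT if the
         * cgroup is already gone, which the code above treats as "nothing to track"). */
        return inotify_add_watch(inotify_fd, path, IN_MODIFY);
}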
afcfaa69
LP
1957int unit_watch_cgroup_memory(Unit *u) {
1958 _cleanup_free_ char *events = NULL;
1959 CGroupContext *c;
1960 int r;
1961
1962 assert(u);
1963
1964 /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if
1965 * cgroupv2 is available. */
1966
1967 if (!u->cgroup_path)
1968 return 0;
1969
1970 c = unit_get_cgroup_context(u);
1971 if (!c)
1972 return 0;
1973
1974 /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie
1975 * this to memory accounting, in a way watching for OOM kills is a form of memory accounting after
1976 * all. */
1977 if (!c->memory_accounting)
1978 return 0;
1979
1980 /* Don't watch inner nodes, as the kernel doesn't report oom_kill events recursively currently, and
1981 * we also don't want to generate a log message for each parent cgroup of a process. */
1982 if (u->type == UNIT_SLICE)
1983 return 0;
1984
1985 if (u->cgroup_memory_inotify_wd >= 0)
1986 return 0;
1987
1988 /* Only applies to the unified hierarchy */
1989 r = cg_all_unified();
1990 if (r < 0)
1991 return log_error_errno(r, "Failed to determine whether the memory controller is unified: %m");
1992 if (r == 0)
1993 return 0;
1994
1995 r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops);
1996 if (r < 0)
1997 return log_oom();
1998
1999 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events);
2000 if (r < 0)
2001 return log_oom();
2002
2003 u->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
2004 if (u->cgroup_memory_inotify_wd < 0) {
2005
2006 if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
2007 * is not an error */
2008 return 0;
2009
6178e2f8 2010 return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path));
afcfaa69
LP
2011 }
2012
2013 r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u);
2014 if (r < 0)
6178e2f8 2015 return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path));
afcfaa69
LP
2016
2017 return 0;
2018}
2019
a4634b21
LP
2020int unit_pick_cgroup_path(Unit *u) {
2021 _cleanup_free_ char *path = NULL;
2022 int r;
2023
2024 assert(u);
2025
2026 if (u->cgroup_path)
2027 return 0;
2028
2029 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2030 return -EINVAL;
2031
2032 path = unit_default_cgroup_path(u);
2033 if (!path)
2034 return log_oom();
2035
2036 r = unit_set_cgroup_path(u, path);
2037 if (r == -EEXIST)
6178e2f8 2038 return log_unit_error_errno(u, r, "Control group %s exists already.", empty_to_root(path));
a4634b21 2039 if (r < 0)
6178e2f8 2040 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", empty_to_root(path));
a4634b21
LP
2041
2042 return 0;
2043}
2044
7b639614 2045static int unit_update_cgroup(
efdb0237
LP
2046 Unit *u,
2047 CGroupMask target_mask,
0d2d6fbf
CD
2048 CGroupMask enable_mask,
2049 ManagerState state) {
efdb0237 2050
7b639614
MK
2051 bool created, is_root_slice;
2052 CGroupMask migrate_mask = 0;
27adcc97 2053 int r;
64747e2d 2054
4ad49000 2055 assert(u);
64747e2d 2056
27c4ed79 2057 if (!UNIT_HAS_CGROUP_CONTEXT(u))
0cd385d3
LP
2058 return 0;
2059
a4634b21
LP
2060 /* Figure out our cgroup path */
2061 r = unit_pick_cgroup_path(u);
2062 if (r < 0)
2063 return r;
b58b8e11 2064
03b90d4b 2065 /* First, create our own group */
efdb0237 2066 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
23bbb0de 2067 if (r < 0)
6178e2f8 2068 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(u->cgroup_path));
490c5a37 2069 created = r;
efdb0237
LP
2070
2071 /* Start watching it */
2072 (void) unit_watch_cgroup(u);
afcfaa69 2073 (void) unit_watch_cgroup_memory(u);
efdb0237 2074
7b639614
MK
2075
2076 /* For v2 we preserve enabled controllers in delegated units, adjust others,
2077 * for v1 we figure out which controller hierarchies need migration. */
1fd3a10c 2078 if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) {
27adcc97 2079 CGroupMask result_mask = 0;
65be7e06
ZJS
2080
2081 /* Enable all controllers we need */
27adcc97 2082 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path, &result_mask);
65be7e06 2083 if (r < 0)
6178e2f8 2084 log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
27adcc97 2085
27adcc97
LP
2086 /* Remember what's actually enabled now */
2087 u->cgroup_enabled_mask = result_mask;
7b639614
MK
2088
2089 migrate_mask = u->cgroup_realized_mask ^ target_mask;
65be7e06 2090 }
03b90d4b
LP
2091
2092 /* Keep track that this is now realized */
4ad49000 2093 u->cgroup_realized = true;
efdb0237 2094 u->cgroup_realized_mask = target_mask;
4ad49000 2095
7b639614
MK
2096 /* Migrate processes in controller hierarchies both downwards (enabling) and upwards (disabling).
2097 *
2098 * Unnecessary controller cgroups are trimmed (after emptied by upward migration).
2099 * We perform migration also with whole slices for cases when users don't care about leave
2100 * granularity. Since delegated_mask is subset of target mask, we won't trim slice subtree containing
2101 * delegated units.
2102 */
2103 if (cg_all_unified() == 0) {
2104 r = cg_migrate_v1_controllers(u->manager->cgroup_supported, migrate_mask, u->cgroup_path, migrate_callback, u);
2105 if (r < 0)
6178e2f8 2106 log_unit_warning_errno(u, r, "Failed to migrate controller cgroups from %s, ignoring: %m", empty_to_root(u->cgroup_path));
0cd385d3 2107
7b639614
MK
2108 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
2109 r = cg_trim_v1_controllers(u->manager->cgroup_supported, ~target_mask, u->cgroup_path, !is_root_slice);
0cd385d3 2110 if (r < 0)
6178e2f8 2111 log_unit_warning_errno(u, r, "Failed to delete controller cgroups %s, ignoring: %m", empty_to_root(u->cgroup_path));
0cd385d3 2112 }
03b90d4b 2113
0d2d6fbf
CD
2114 /* Set attributes */
2115 cgroup_context_apply(u, target_mask, state);
2116 cgroup_xattr_apply(u);
2117
64747e2d
LP
2118 return 0;
2119}
2120
6592b975
LP
2121static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
2122 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2123 char *pp;
7b3fd631 2124 int r;
6592b975 2125
7b3fd631
LP
2126 assert(u);
2127
6592b975
LP
2128 if (MANAGER_IS_SYSTEM(u->manager))
2129 return -EINVAL;
2130
2131 if (!u->manager->system_bus)
2132 return -EIO;
2133
2134 if (!u->cgroup_path)
2135 return -EINVAL;
2136
2137 /* Determine this unit's cgroup path relative to our cgroup root */
2138 pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
2139 if (!pp)
2140 return -EINVAL;
2141
2142 pp = strjoina("/", pp, suffix_path);
4ff361cc 2143 path_simplify(pp);
6592b975
LP
2144
2145 r = sd_bus_call_method(u->manager->system_bus,
2146 "org.freedesktop.systemd1",
2147 "/org/freedesktop/systemd1",
2148 "org.freedesktop.systemd1.Manager",
2149 "AttachProcessesToUnit",
2150 &error, NULL,
2151 "ssau",
2152 NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
7b3fd631 2153 if (r < 0)
6592b975
LP
2154 return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
2155
2156 return 0;
2157}
2158
2159int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
2160 CGroupMask delegated_mask;
2161 const char *p;
6592b975
LP
2162 void *pidp;
2163 int r, q;
2164
2165 assert(u);
2166
2167 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2168 return -EINVAL;
2169
2170 if (set_isempty(pids))
2171 return 0;
7b3fd631 2172
fab34748
KL
2173 /* Load any custom firewall BPF programs here once to test if they are existing and actually loadable.
2174 * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */
2175 r = bpf_firewall_load_custom(u);
2176 if (r < 0)
2177 return r;
2178
6592b975 2179 r = unit_realize_cgroup(u);
7b3fd631
LP
2180 if (r < 0)
2181 return r;
2182
6592b975
LP
2183 if (isempty(suffix_path))
2184 p = u->cgroup_path;
2185 else
270384b2 2186 p = prefix_roota(u->cgroup_path, suffix_path);
6592b975
LP
2187
2188 delegated_mask = unit_get_delegate_mask(u);
2189
2190 r = 0;
90e74a66 2191 SET_FOREACH(pidp, pids) {
6592b975 2192 pid_t pid = PTR_TO_PID(pidp);
6592b975
LP
2193
2194 /* First, attach the PID to the main cgroup hierarchy */
2195 q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
2196 if (q < 0) {
7a2ba407 2197 bool again = MANAGER_IS_USER(u->manager) && ERRNO_IS_PRIVILEGE(q);
6592b975 2198
7a2ba407
ZJS
2199 log_unit_full_errno(u, again ? LOG_DEBUG : LOG_INFO, q,
2200 "Couldn't move process "PID_FMT" to%s requested cgroup '%s': %m",
6178e2f8 2201 pid, again ? " directly" : "", empty_to_root(p));
7a2ba407
ZJS
2202
2203 if (again) {
6592b975
LP
2204 int z;
2205
7a2ba407
ZJS
2206 /* If we are in a user instance, and we can't move the process ourselves due
2207 * to permission problems, let's ask the system instance about it instead.
2208 * Since it's more privileged it might be able to move the process across the
2209 * leaves of a subtree whose top node is not owned by us. */
6592b975
LP
2210
2211 z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
2212 if (z < 0)
6178e2f8 2213 log_unit_info_errno(u, z, "Couldn't move process "PID_FMT" to requested cgroup '%s' (directly or via the system bus): %m", pid, empty_to_root(p));
6592b975
LP
2214 else
2215 continue; /* When the bus thing worked via the bus we are fully done for this PID. */
2216 }
2217
2218 if (r >= 0)
2219 r = q; /* Remember first error */
2220
2221 continue;
2222 }
2223
2224 q = cg_all_unified();
2225 if (q < 0)
2226 return q;
2227 if (q > 0)
2228 continue;
2229
 2230                 /* In the legacy hierarchy, attach the process to the requested cgroup if possible, and if not, to the
2231 * innermost realized one */
2232
e8616626 2233 for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
6592b975
LP
2234 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2235 const char *realized;
2236
2237 if (!(u->manager->cgroup_supported & bit))
2238 continue;
2239
2240 /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
2241 if (delegated_mask & u->cgroup_realized_mask & bit) {
2242 q = cg_attach(cgroup_controller_to_string(c), p, pid);
2243 if (q >= 0)
2244 continue; /* Success! */
2245
2246 log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
6178e2f8 2247 pid, empty_to_root(p), cgroup_controller_to_string(c));
6592b975
LP
2248 }
2249
 2250                         /* So this controller is either not delegated or not realized, or something else weird happened. In
2251 * that case let's attach the PID at least to the closest cgroup up the tree that is
2252 * realized. */
2253 realized = unit_get_realized_cgroup_path(u, bit);
2254 if (!realized)
2255 continue; /* Not even realized in the root slice? Then let's not bother */
2256
2257 q = cg_attach(cgroup_controller_to_string(c), realized, pid);
2258 if (q < 0)
2259 log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
2260 pid, realized, cgroup_controller_to_string(c));
2261 }
2262 }
2263
2264 return r;
7b3fd631
LP
2265}
2266
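/* Illustrative sketch, not systemd code: on the unified hierarchy the cg_attach() calls made by
 * unit_attach_pids_to_cgroup() above boil down to writing the decimal PID into the target
 * group's "cgroup.procs" file; the kernel then moves the whole thread group. The helper below
 * is hypothetical and only demonstrates that documented interface. */
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <sys/types.h>

static int example_attach_pid(const char *cgroup_dir, pid_t pid) {
        char path[PATH_MAX];
        FILE *f;

        snprintf(path, sizeof(path), "%s/cgroup.procs", cgroup_dir);

        f = fopen(path, "we");
        if (!f)
                return -errno;

        fprintf(f, "%ld\n", (long) pid);

        /* The buffered write is flushed here; a permission error at this point is what
         * triggers the "ask the system instance via the bus" fallback in the code above. */
        return fclose(f) == 0 ? 0 : -errno;
}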
906c06f6
DM
2267static bool unit_has_mask_realized(
2268 Unit *u,
2269 CGroupMask target_mask,
17f14955 2270 CGroupMask enable_mask) {
906c06f6 2271
bc432dc7
LP
2272 assert(u);
2273
d5095dcd
LP
2274 /* Returns true if this unit is fully realized. We check four things:
2275 *
2276 * 1. Whether the cgroup was created at all
4e1dfa45
CD
2277 * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1)
2278 * 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2)
d5095dcd
LP
2279 * 4. Whether the invalidation mask is currently zero
2280 *
2281 * If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note
4e1dfa45
CD
2282 * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for
2283 * real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask
 2284          * only matters for cgroup v1 controllers, and cgroup_enabled_mask is only used for cgroup v2, and if they
d5095dcd
LP
 2285          * differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks which controllers are
2286 * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
 2287          * simply don't matter.) */
2288
906c06f6 2289 return u->cgroup_realized &&
d5095dcd
LP
2290 ((u->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 &&
2291 ((u->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 &&
17f14955 2292 u->cgroup_invalidated_mask == 0;
6414b7c9
DS
2293}
2294
4f6f62e4
CD
2295static bool unit_has_mask_disables_realized(
2296 Unit *u,
2297 CGroupMask target_mask,
2298 CGroupMask enable_mask) {
2299
2300 assert(u);
2301
2302 /* Returns true if all controllers which should be disabled are indeed disabled.
2303 *
2304 * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
2305 * already removed. */
2306
2307 return !u->cgroup_realized ||
2308 (FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
2309 FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
2310}
2311
a57669d2
CD
2312static bool unit_has_mask_enables_realized(
2313 Unit *u,
2314 CGroupMask target_mask,
2315 CGroupMask enable_mask) {
2316
2317 assert(u);
2318
2319 /* Returns true if all controllers which should be enabled are indeed enabled.
2320 *
2321 * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
2322 * we want to add is already added. */
2323
2324 return u->cgroup_realized &&
c72703e2
CD
2325 ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) &&
2326 ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2);
a57669d2
CD
2327}
2328
fb46fca7 2329static void unit_add_to_cgroup_realize_queue(Unit *u) {
2aa57a65
LP
2330 assert(u);
2331
2332 if (u->in_cgroup_realize_queue)
2333 return;
2334
a479c21e 2335 LIST_APPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
2aa57a65
LP
2336 u->in_cgroup_realize_queue = true;
2337}
2338
2339static void unit_remove_from_cgroup_realize_queue(Unit *u) {
2340 assert(u);
2341
2342 if (!u->in_cgroup_realize_queue)
2343 return;
2344
2345 LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
2346 u->in_cgroup_realize_queue = false;
2347}
2348
a57669d2
CD
2349/* Controllers can only be enabled breadth-first, from the root of the
2350 * hierarchy downwards to the unit in question. */
2351static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
2352 CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
12f64221 2353 Unit *slice;
a57669d2
CD
2354 int r;
2355
2356 assert(u);
2357
2358 /* First go deal with this unit's parent, or we won't be able to enable
2359 * any new controllers at this layer. */
12f64221
LP
2360 slice = UNIT_GET_SLICE(u);
2361 if (slice) {
2362 r = unit_realize_cgroup_now_enable(slice, state);
a57669d2
CD
2363 if (r < 0)
2364 return r;
2365 }
2366
2367 target_mask = unit_get_target_mask(u);
2368 enable_mask = unit_get_enable_mask(u);
2369
2370 /* We can only enable in this direction, don't try to disable anything.
2371 */
2372 if (unit_has_mask_enables_realized(u, target_mask, enable_mask))
2373 return 0;
2374
2375 new_target_mask = u->cgroup_realized_mask | target_mask;
2376 new_enable_mask = u->cgroup_enabled_mask | enable_mask;
2377
7b639614 2378 return unit_update_cgroup(u, new_target_mask, new_enable_mask, state);
a57669d2
CD
2379}
2380
4f6f62e4
CD
2381/* Controllers can only be disabled depth-first, from the leaves of the
2382 * hierarchy upwards to the unit in question. */
2383static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
4f6f62e4 2384 Unit *m;
4f6f62e4
CD
2385
2386 assert(u);
2387
2388 if (u->type != UNIT_SLICE)
2389 return 0;
2390
d219a2b0 2391 UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
4f6f62e4
CD
2392 CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
2393 int r;
2394
defe63b0
LP
2395 /* The cgroup for this unit might not actually be fully realised yet, in which case it isn't
2396 * holding any controllers open anyway. */
d9ef5944 2397 if (!m->cgroup_realized)
4f6f62e4
CD
2398 continue;
2399
defe63b0 2400 /* We must disable those below us first in order to release the controller. */
4f6f62e4
CD
2401 if (m->type == UNIT_SLICE)
2402 (void) unit_realize_cgroup_now_disable(m, state);
2403
2404 target_mask = unit_get_target_mask(m);
2405 enable_mask = unit_get_enable_mask(m);
2406
defe63b0 2407 /* We can only disable in this direction, don't try to enable anything. */
4f6f62e4
CD
2408 if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
2409 continue;
2410
2411 new_target_mask = m->cgroup_realized_mask & target_mask;
2412 new_enable_mask = m->cgroup_enabled_mask & enable_mask;
2413
7b639614 2414 r = unit_update_cgroup(m, new_target_mask, new_enable_mask, state);
4f6f62e4
CD
2415 if (r < 0)
2416 return r;
2417 }
2418
2419 return 0;
2420}
a57669d2 2421
6414b7c9
DS
2422/* Check if necessary controllers and attributes for a unit are in place.
2423 *
a57669d2
CD
2424 * - If so, do nothing.
2425 * - If not, create paths, move processes over, and set attributes.
2426 *
2427 * Controllers can only be *enabled* in a breadth-first way, and *disabled* in
2428 * a depth-first way. As such the process looks like this:
2429 *
2430 * Suppose we have a cgroup hierarchy which looks like this:
2431 *
2432 * root
2433 * / \
2434 * / \
2435 * / \
2436 * a b
2437 * / \ / \
2438 * / \ / \
2439 * c d e f
2440 * / \ / \ / \ / \
2441 * h i j k l m n o
2442 *
2443 * 1. We want to realise cgroup "d" now.
c72703e2 2444 * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
a57669d2
CD
2445 * 3. cgroup "k" just started requesting the memory controller.
2446 *
2447 * To make this work we must do the following in order:
2448 *
2449 * 1. Disable CPU controller in k, j
2450 * 2. Disable CPU controller in d
2451 * 3. Enable memory controller in root
2452 * 4. Enable memory controller in a
2453 * 5. Enable memory controller in d
2454 * 6. Enable memory controller in k
2455 *
2456 * Notice that we need to touch j in one direction, but not the other. We also
2457 * don't go beyond d when disabling -- it's up to "a" to get realized if it
2458 * wants to disable further. The basic rules are therefore:
2459 *
2460 * - If you're disabling something, you need to realise all of the cgroups from
2461 * your recursive descendants to the root. This starts from the leaves.
2462 * - If you're enabling something, you need to realise from the root cgroup
2463 * downwards, but you don't need to iterate your recursive descendants.
6414b7c9
DS
2464 *
2465 * Returns 0 on success and < 0 on failure. */
db785129 2466static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
efdb0237 2467 CGroupMask target_mask, enable_mask;
12f64221 2468 Unit *slice;
6414b7c9 2469 int r;
64747e2d 2470
4ad49000 2471 assert(u);
64747e2d 2472
2aa57a65 2473 unit_remove_from_cgroup_realize_queue(u);
64747e2d 2474
efdb0237 2475 target_mask = unit_get_target_mask(u);
ccf78df1
TH
2476 enable_mask = unit_get_enable_mask(u);
2477
17f14955 2478 if (unit_has_mask_realized(u, target_mask, enable_mask))
0a1eb06d 2479 return 0;
64747e2d 2480
4f6f62e4
CD
2481 /* Disable controllers below us, if there are any */
2482 r = unit_realize_cgroup_now_disable(u, state);
2483 if (r < 0)
2484 return r;
2485
2486 /* Enable controllers above us, if there are any */
12f64221
LP
2487 slice = UNIT_GET_SLICE(u);
2488 if (slice) {
2489 r = unit_realize_cgroup_now_enable(slice, state);
6414b7c9
DS
2490 if (r < 0)
2491 return r;
2492 }
4ad49000 2493
0d2d6fbf 2494 /* Now actually deal with the cgroup we were trying to realise and set attributes */
7b639614 2495 r = unit_update_cgroup(u, target_mask, enable_mask, state);
6414b7c9
DS
2496 if (r < 0)
2497 return r;
2498
c2baf11c
LP
2499 /* Now, reset the invalidation mask */
2500 u->cgroup_invalidated_mask = 0;
6414b7c9 2501 return 0;
64747e2d
LP
2502}
2503
91a6073e 2504unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
db785129 2505 ManagerState state;
4ad49000 2506 unsigned n = 0;
db785129 2507 Unit *i;
6414b7c9 2508 int r;
ecedd90f 2509
91a6073e
LP
2510 assert(m);
2511
db785129
LP
2512 state = manager_state(m);
2513
91a6073e
LP
2514 while ((i = m->cgroup_realize_queue)) {
2515 assert(i->in_cgroup_realize_queue);
ecedd90f 2516
2aa57a65
LP
2517 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
2518 /* Maybe things changed, and the unit is not actually active anymore? */
2519 unit_remove_from_cgroup_realize_queue(i);
2520 continue;
2521 }
2522
db785129 2523 r = unit_realize_cgroup_now(i, state);
6414b7c9 2524 if (r < 0)
efdb0237 2525 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
0a1eb06d 2526
4ad49000
LP
2527 n++;
2528 }
ecedd90f 2529
4ad49000 2530 return n;
8e274523
LP
2531}
2532
4c591f39
MK
2533void unit_add_family_to_cgroup_realize_queue(Unit *u) {
2534 assert(u);
2535 assert(u->type == UNIT_SLICE);
ca949c9d 2536
4c591f39
MK
 2537         /* Family of a unit is defined as (immediate) children of the unit and immediate children of all
2538 * its ancestors.
2539 *
2540 * Ideally we would enqueue ancestor path only (bottom up). However, on cgroup-v1 scheduling becomes
2541 * very weird if two units that own processes reside in the same slice, but one is realized in the
2542 * "cpu" hierarchy and one is not (for example because one has CPUWeight= set and the other does
2543 * not), because that means individual processes need to be scheduled against whole cgroups. Let's
2544 * avoid this asymmetry by always ensuring that siblings of a unit are always realized in their v1
2545 * controller hierarchies too (if unit requires the controller to be realized).
e1e98911 2546 *
4c591f39
MK
2547 * The function must invalidate cgroup_members_mask of all ancestors in order to calculate up to date
2548 * masks. */
2549
2550 do {
4ad49000 2551 Unit *m;
8f53a7b8 2552
4c591f39
MK
2553 /* Children of u likely changed when we're called */
2554 u->cgroup_members_mask_valid = false;
f23ba94d 2555
d219a2b0 2556 UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
8e274523 2557
65f6b6bd 2558 /* No point in doing cgroup application for units without active processes. */
6414b7c9
DS
2559 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
2560 continue;
2561
e1e98911
LP
2562 /* We only enqueue siblings if they were realized once at least, in the main
2563 * hierarchy. */
2564 if (!m->cgroup_realized)
2565 continue;
2566
defe63b0
LP
2567 /* If the unit doesn't need any new controllers and has current ones
2568 * realized, it doesn't need any changes. */
906c06f6
DM
2569 if (unit_has_mask_realized(m,
2570 unit_get_target_mask(m),
17f14955 2571 unit_get_enable_mask(m)))
6414b7c9
DS
2572 continue;
2573
91a6073e 2574 unit_add_to_cgroup_realize_queue(m);
50159e6a
LP
2575 }
2576
4c591f39
MK
2577 /* Parent comes after children */
2578 unit_add_to_cgroup_realize_queue(u);
12f64221
LP
2579
2580 u = UNIT_GET_SLICE(u);
2581 } while (u);
4ad49000
LP
2582}
2583
0a1eb06d 2584int unit_realize_cgroup(Unit *u) {
12f64221
LP
2585 Unit *slice;
2586
4ad49000
LP
2587 assert(u);
2588
35b7ff80 2589 if (!UNIT_HAS_CGROUP_CONTEXT(u))
0a1eb06d 2590 return 0;
8e274523 2591
4c591f39
MK
2592 /* So, here's the deal: when realizing the cgroups for this unit, we need to first create all
2593 * parents, but there's more actually: for the weight-based controllers we also need to make sure
2594 * that all our siblings (i.e. units that are in the same slice as we are) have cgroups, too. On the
2595 * other hand, when a controller is removed from realized set, it may become unnecessary in siblings
2596 * and ancestors and they should be (de)realized too.
2597 *
2598 * This call will defer work on the siblings and derealized ancestors to the next event loop
2599 * iteration and synchronously creates the parent cgroups (unit_realize_cgroup_now). */
ca949c9d 2600
12f64221
LP
2601 slice = UNIT_GET_SLICE(u);
2602 if (slice)
2603 unit_add_family_to_cgroup_realize_queue(slice);
4ad49000 2604
6414b7c9 2605 /* And realize this one now (and apply the values) */
db785129 2606 return unit_realize_cgroup_now(u, manager_state(u->manager));
8e274523
LP
2607}
2608
efdb0237
LP
2609void unit_release_cgroup(Unit *u) {
2610 assert(u);
2611
8a0d5388
LP
2612 /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call
2613 * when we close down everything for reexecution, where we really want to leave the cgroup in place. */
efdb0237
LP
2614
2615 if (u->cgroup_path) {
2616 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
2617 u->cgroup_path = mfree(u->cgroup_path);
2618 }
2619
0bb814c2
LP
2620 if (u->cgroup_control_inotify_wd >= 0) {
2621 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_control_inotify_wd) < 0)
2622 log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", u->cgroup_control_inotify_wd, u->id);
efdb0237 2623
0bb814c2
LP
2624 (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd));
2625 u->cgroup_control_inotify_wd = -1;
efdb0237 2626 }
afcfaa69
LP
2627
2628 if (u->cgroup_memory_inotify_wd >= 0) {
2629 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0)
2630 log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id);
2631
2632 (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd));
2633 u->cgroup_memory_inotify_wd = -1;
2634 }
efdb0237
LP
2635}
2636
e08dabfe
AZ
2637bool unit_maybe_release_cgroup(Unit *u) {
2638 int r;
2639
2640 assert(u);
2641
2642 if (!u->cgroup_path)
2643 return true;
2644
2645 /* Don't release the cgroup if there are still processes under it. If we get notified later when all the
2646 * processes exit (e.g. the processes were in D-state and exited after the unit was marked as failed)
2647 * we need the cgroup paths to continue to be tracked by the manager so they can be looked up and cleaned
2648 * up later. */
2649 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
2650 if (r < 0)
2651 log_unit_debug_errno(u, r, "Error checking if the cgroup is recursively empty, ignoring: %m");
2652 else if (r == 1) {
2653 unit_release_cgroup(u);
2654 return true;
2655 }
2656
2657 return false;
2658}
2659
efdb0237 2660void unit_prune_cgroup(Unit *u) {
8e274523 2661 int r;
efdb0237 2662 bool is_root_slice;
8e274523 2663
4ad49000 2664 assert(u);
8e274523 2665
efdb0237
LP
2666 /* Removes the cgroup, if empty and possible, and stops watching it. */
2667
4ad49000
LP
2668 if (!u->cgroup_path)
2669 return;
8e274523 2670
fe700f46
LP
2671 (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
2672
efdb0237
LP
2673 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
2674
2675 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
0219b352
DB
2676 if (r < 0)
 2677                 /* One reason we could have failed here is that the cgroup still contains a process.
2678 * However, if the cgroup becomes removable at a later time, it might be removed when
2679 * the containing slice is stopped. So even if we failed now, this unit shouldn't assume
2680 * that the cgroup is still realized the next time it is started. Do not return early
2681 * on error, continue cleanup. */
6178e2f8 2682 log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
8e274523 2683
efdb0237
LP
2684 if (is_root_slice)
2685 return;
2686
e08dabfe
AZ
2687 if (!unit_maybe_release_cgroup(u)) /* Returns true if the cgroup was released */
2688 return;
0a1eb06d 2689
4ad49000 2690 u->cgroup_realized = false;
bc432dc7 2691 u->cgroup_realized_mask = 0;
ccf78df1 2692 u->cgroup_enabled_mask = 0;
084c7007
RG
2693
2694 u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed);
8e274523
LP
2695}
2696
efdb0237 2697int unit_search_main_pid(Unit *u, pid_t *ret) {
4ad49000 2698 _cleanup_fclose_ FILE *f = NULL;
4d051546 2699 pid_t pid = 0, npid;
efdb0237 2700 int r;
4ad49000
LP
2701
2702 assert(u);
efdb0237 2703 assert(ret);
4ad49000
LP
2704
2705 if (!u->cgroup_path)
efdb0237 2706 return -ENXIO;
4ad49000 2707
efdb0237
LP
2708 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
2709 if (r < 0)
2710 return r;
4ad49000 2711
4ad49000 2712 while (cg_read_pid(f, &npid) > 0) {
4ad49000
LP
2713
2714 if (npid == pid)
2715 continue;
8e274523 2716
4d051546 2717 if (pid_is_my_child(npid) == 0)
4ad49000 2718 continue;
8e274523 2719
efdb0237 2720 if (pid != 0)
4ad49000
LP
2721 /* Dang, there's more than one daemonized PID
2722 in this group, so we don't know what process
2723 is the main process. */
efdb0237
LP
2724
2725 return -ENODATA;
8e274523 2726
4ad49000 2727 pid = npid;
8e274523
LP
2728 }
2729
efdb0237
LP
2730 *ret = pid;
2731 return 0;
2732}
2733
2734static int unit_watch_pids_in_path(Unit *u, const char *path) {
b3c5bad3 2735 _cleanup_closedir_ DIR *d = NULL;
efdb0237
LP
2736 _cleanup_fclose_ FILE *f = NULL;
2737 int ret = 0, r;
2738
2739 assert(u);
2740 assert(path);
2741
2742 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
2743 if (r < 0)
2744 ret = r;
2745 else {
2746 pid_t pid;
2747
2748 while ((r = cg_read_pid(f, &pid)) > 0) {
f75f613d 2749 r = unit_watch_pid(u, pid, false);
efdb0237
LP
2750 if (r < 0 && ret >= 0)
2751 ret = r;
2752 }
2753
2754 if (r < 0 && ret >= 0)
2755 ret = r;
2756 }
2757
2758 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
2759 if (r < 0) {
2760 if (ret >= 0)
2761 ret = r;
2762 } else {
2763 char *fn;
2764
2765 while ((r = cg_read_subgroup(d, &fn)) > 0) {
2766 _cleanup_free_ char *p = NULL;
2767
95b21cff 2768 p = path_join(empty_to_root(path), fn);
efdb0237
LP
2769 free(fn);
2770
2771 if (!p)
2772 return -ENOMEM;
2773
2774 r = unit_watch_pids_in_path(u, p);
2775 if (r < 0 && ret >= 0)
2776 ret = r;
2777 }
2778
2779 if (r < 0 && ret >= 0)
2780 ret = r;
2781 }
2782
2783 return ret;
2784}
2785
11aef522
LP
2786int unit_synthesize_cgroup_empty_event(Unit *u) {
2787 int r;
2788
2789 assert(u);
2790
2791 /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
 2792          * support for non-unified systems where notifications aren't reliable, and hence we need to take whatever we can
2793 * get as notification source as soon as we stopped having any useful PIDs to watch for. */
2794
2795 if (!u->cgroup_path)
2796 return -ENOENT;
2797
2798 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2799 if (r < 0)
2800 return r;
2801 if (r > 0) /* On unified we have reliable notifications, and don't need this */
2802 return 0;
2803
2804 if (!set_isempty(u->pids))
2805 return 0;
2806
2807 unit_add_to_cgroup_empty_queue(u);
2808 return 0;
2809}
2810
efdb0237 2811int unit_watch_all_pids(Unit *u) {
b4cccbc1
LP
2812 int r;
2813
efdb0237
LP
2814 assert(u);
2815
2816 /* Adds all PIDs from our cgroup to the set of PIDs we
2817 * watch. This is a fallback logic for cases where we do not
2818 * get reliable cgroup empty notifications: we try to use
2819 * SIGCHLD as replacement. */
2820
2821 if (!u->cgroup_path)
2822 return -ENOENT;
2823
c22800e4 2824 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
b4cccbc1
LP
2825 if (r < 0)
2826 return r;
2827 if (r > 0) /* On unified we can use proper notifications */
efdb0237
LP
2828 return 0;
2829
2830 return unit_watch_pids_in_path(u, u->cgroup_path);
2831}
2832
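/* Illustrative sketch, not systemd code: the cg_enumerate_processes()/cg_read_pid() pair used by
 * unit_watch_pids_in_path() above corresponds, on the kernel side, to reading one decimal PID per
 * line from the group's "cgroup.procs" file. The helper below is hypothetical; its name and
 * callback shape are assumptions. */
#include <limits.h>
#include <stdio.h>
#include <sys/types.h>

static int example_for_each_pid(const char *cgroup_dir, void (*cb)(pid_t pid, void *userdata), void *userdata) {
        char path[PATH_MAX];
        long pid;
        FILE *f;

        snprintf(path, sizeof(path), "%s/cgroup.procs", cgroup_dir);
        f = fopen(path, "re");
        if (!f)
                return -1;

        while (fscanf(f, "%ld", &pid) == 1)
                cb((pid_t) pid, userdata);

        fclose(f);
        return 0;
}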
09e24654
LP
2833static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
2834 Manager *m = userdata;
2835 Unit *u;
efdb0237
LP
2836 int r;
2837
09e24654
LP
2838 assert(s);
2839 assert(m);
efdb0237 2840
09e24654
LP
2841 u = m->cgroup_empty_queue;
2842 if (!u)
efdb0237
LP
2843 return 0;
2844
09e24654
LP
2845 assert(u->in_cgroup_empty_queue);
2846 u->in_cgroup_empty_queue = false;
2847 LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
2848
2849 if (m->cgroup_empty_queue) {
2850 /* More stuff queued, let's make sure we remain enabled */
2851 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
2852 if (r < 0)
19a691a9 2853 log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m");
09e24654 2854 }
efdb0237
LP
2855
2856 unit_add_to_gc_queue(u);
2857
2858 if (UNIT_VTABLE(u)->notify_cgroup_empty)
2859 UNIT_VTABLE(u)->notify_cgroup_empty(u);
2860
2861 return 0;
2862}
2863
09e24654
LP
2864void unit_add_to_cgroup_empty_queue(Unit *u) {
2865 int r;
2866
2867 assert(u);
2868
2869 /* Note that there are four different ways how cgroup empty events reach us:
2870 *
2871 * 1. On the unified hierarchy we get an inotify event on the cgroup
2872 *
2873 * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
2874 *
2875 * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
2876 *
2877 * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
2878 * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
2879 *
2880 * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
2881 * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
2882 * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
2883 * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
2884 * case for scope units). */
2885
2886 if (u->in_cgroup_empty_queue)
2887 return;
2888
2889 /* Let's verify that the cgroup is really empty */
2890 if (!u->cgroup_path)
2891 return;
e1e98911 2892
09e24654
LP
2893 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
2894 if (r < 0) {
6178e2f8 2895 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(u->cgroup_path));
09e24654
LP
2896 return;
2897 }
2898 if (r == 0)
2899 return;
2900
2901 LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
2902 u->in_cgroup_empty_queue = true;
2903
2904 /* Trigger the defer event */
2905 r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
2906 if (r < 0)
2907 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
2908}
2909
d9e45bc3
MS
2910static void unit_remove_from_cgroup_empty_queue(Unit *u) {
2911 assert(u);
2912
2913 if (!u->in_cgroup_empty_queue)
2914 return;
2915
2916 LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
2917 u->in_cgroup_empty_queue = false;
2918}
2919
fe8d22fb
AZ
2920int unit_check_oomd_kill(Unit *u) {
2921 _cleanup_free_ char *value = NULL;
2922 bool increased;
2923 uint64_t n = 0;
2924 int r;
2925
2926 if (!u->cgroup_path)
2927 return 0;
2928
2929 r = cg_all_unified();
2930 if (r < 0)
2931 return log_unit_debug_errno(u, r, "Couldn't determine whether we are in all unified mode: %m");
2932 else if (r == 0)
2933 return 0;
2934
e3038333 2935 r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.oomd_kill", &value);
fe8d22fb
AZ
2936 if (r < 0 && r != -ENODATA)
2937 return r;
2938
2939 if (!isempty(value)) {
2940 r = safe_atou64(value, &n);
2941 if (r < 0)
2942 return r;
2943 }
2944
2945 increased = n > u->managed_oom_kill_last;
2946 u->managed_oom_kill_last = n;
2947
2948 if (!increased)
2949 return 0;
2950
2951 if (n > 0)
c2503e35
RH
2952 log_unit_struct(u, LOG_NOTICE,
2953 "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR,
2954 LOG_UNIT_INVOCATION_ID(u),
2955 LOG_UNIT_MESSAGE(u, "systemd-oomd killed %"PRIu64" process(es) in this unit.", n));
fe8d22fb
AZ
2956
2957 return 1;
2958}
2959
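/* Illustrative sketch, not systemd code: the "user.oomd_kill" value read via cg_get_xattr_malloc()
 * above is an ordinary extended attribute that systemd-oomd leaves on the cgroup directory; outside
 * of systemd it could be read with getxattr() as below. Helper name and buffer handling are
 * assumptions. */
#include <sys/types.h>
#include <sys/xattr.h>

static int example_read_oomd_kill_xattr(const char *cgroup_dir, char *buf, size_t size) {
        ssize_t n;

        n = getxattr(cgroup_dir, "user.oomd_kill", buf, size - 1);
        if (n < 0)
                return -1;   /* ENODATA simply means systemd-oomd never killed anything here */

        buf[n] = 0;          /* the value is a decimal kill counter, parsed above with safe_atou64() */
        return 0;
}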
2ba6ae6b 2960int unit_check_oom(Unit *u) {
afcfaa69
LP
2961 _cleanup_free_ char *oom_kill = NULL;
2962 bool increased;
2963 uint64_t c;
2964 int r;
2965
2966 if (!u->cgroup_path)
2967 return 0;
2968
2969 r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &oom_kill);
2970 if (r < 0)
2971 return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m");
2972
2973 r = safe_atou64(oom_kill, &c);
2974 if (r < 0)
2975 return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m");
2976
2977 increased = c > u->oom_kill_last;
2978 u->oom_kill_last = c;
2979
2980 if (!increased)
2981 return 0;
2982
c2503e35
RH
2983 log_unit_struct(u, LOG_NOTICE,
2984 "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR,
2985 LOG_UNIT_INVOCATION_ID(u),
2986 LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer."));
afcfaa69
LP
2987
2988 if (UNIT_VTABLE(u)->notify_cgroup_oom)
2989 UNIT_VTABLE(u)->notify_cgroup_oom(u);
2990
2991 return 1;
2992}
2993
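/* Illustrative sketch, not systemd code: "memory.events" is a flat keyed file, e.g.
 *
 *     low 0
 *     high 4
 *     max 0
 *     oom 1
 *     oom_kill 1
 *
 * unit_check_oom() above relies on cg_get_keyed_attribute() to pull out the "oom_kill" counter;
 * the hypothetical helper below shows a minimal equivalent with plain stdio. */
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>

static int example_read_oom_kill(const char *cgroup_dir, uint64_t *ret) {
        char path[PATH_MAX], key[64];
        uint64_t value;
        FILE *f;

        snprintf(path, sizeof(path), "%s/memory.events", cgroup_dir);
        f = fopen(path, "re");
        if (!f)
                return -1;

        while (fscanf(f, "%63s %" SCNu64, key, &value) == 2)
                if (strcmp(key, "oom_kill") == 0) {
                        *ret = value;
                        fclose(f);
                        return 0;
                }

        fclose(f);
        return -1; /* key not present on this kernel */
}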
2994static int on_cgroup_oom_event(sd_event_source *s, void *userdata) {
2995 Manager *m = userdata;
2996 Unit *u;
2997 int r;
2998
2999 assert(s);
3000 assert(m);
3001
3002 u = m->cgroup_oom_queue;
3003 if (!u)
3004 return 0;
3005
3006 assert(u->in_cgroup_oom_queue);
3007 u->in_cgroup_oom_queue = false;
3008 LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, u);
3009
3010 if (m->cgroup_oom_queue) {
3011 /* More stuff queued, let's make sure we remain enabled */
3012 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
3013 if (r < 0)
3014 log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m");
3015 }
3016
3017 (void) unit_check_oom(u);
3018 return 0;
3019}
3020
3021static void unit_add_to_cgroup_oom_queue(Unit *u) {
3022 int r;
3023
3024 assert(u);
3025
3026 if (u->in_cgroup_oom_queue)
3027 return;
3028 if (!u->cgroup_path)
3029 return;
3030
3031 LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
3032 u->in_cgroup_oom_queue = true;
3033
3034 /* Trigger the defer event */
3035 if (!u->manager->cgroup_oom_event_source) {
3036 _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
3037
3038 r = sd_event_add_defer(u->manager->event, &s, on_cgroup_oom_event, u->manager);
3039 if (r < 0) {
3040 log_error_errno(r, "Failed to create cgroup oom event source: %m");
3041 return;
3042 }
3043
3044 r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8);
3045 if (r < 0) {
3046 log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
3047 return;
3048 }
3049
3050 (void) sd_event_source_set_description(s, "cgroup-oom");
3051 u->manager->cgroup_oom_event_source = TAKE_PTR(s);
3052 }
3053
3054 r = sd_event_source_set_enabled(u->manager->cgroup_oom_event_source, SD_EVENT_ONESHOT);
3055 if (r < 0)
3056 log_error_errno(r, "Failed to enable cgroup oom event source: %m");
3057}
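/* Illustrative sketch (not part of the original file): the OOM queue above uses a lazily created
 * defer event source that is armed as ONESHOT and re-enabled while the queue is non-empty. A
 * reduced, standalone sd-event version of that pattern, assuming libsystemd headers are available: */
#include <stdio.h>
#include <systemd/sd-event.h>

static int on_defer(sd_event_source *s, void *userdata) {
        puts("queue dispatched");  /* here the manager would pop one unit off its queue */
        return sd_event_exit(sd_event_source_get_event(s), 0);
}

int main(void) {
        sd_event *e = NULL;
        sd_event_source *s = NULL;

        if (sd_event_default(&e) < 0)
                return 1;
        if (sd_event_add_defer(e, &s, on_defer, NULL) < 0)
                return 1;

        /* ONESHOT: fires once per enablement; re-enable it while work remains queued. */
        (void) sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
        (void) sd_event_loop(e);

        sd_event_source_unref(s);
        sd_event_unref(e);
        return 0;
}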
3058
d9e45bc3
MS
3059static int unit_check_cgroup_events(Unit *u) {
3060 char *values[2] = {};
3061 int r;
3062
3063 assert(u);
3064
869f52f2
DS
3065 if (!u->cgroup_path)
3066 return 0;
3067
d9e45bc3
MS
3068 r = cg_get_keyed_attribute_graceful(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events",
3069 STRV_MAKE("populated", "frozen"), values);
3070 if (r < 0)
3071 return r;
3072
3073 /* The cgroup.events notifications can be merged together, so act as though we saw the given state
3074 * for the first time. The functions we call to handle a given state are idempotent, which makes
3075 * them effectively remember the previous state. */
3076 if (values[0]) {
3077 if (streq(values[0], "1"))
3078 unit_remove_from_cgroup_empty_queue(u);
3079 else
3080 unit_add_to_cgroup_empty_queue(u);
3081 }
3082
3083 /* Disregard freezer state changes due to operations not initiated by us */
3084 if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING)) {
3085 if (streq(values[1], "0"))
3086 unit_thawed(u);
3087 else
3088 unit_frozen(u);
3089 }
3090
3091 free(values[0]);
3092 free(values[1]);
3093
3094 return 0;
3095}
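/* Illustrative sketch (not part of the original file): cgroup.events is a flat "key value" file,
 * e.g.:
 *
 *     populated 1
 *     frozen 0
 *
 * A plain-stdio equivalent of the keyed-attribute lookup above could look like this; the helper
 * name and the path in the usage note are just examples. */
#include <stdio.h>
#include <string.h>

static int read_cgroup_events(const char *path, int *populated, int *frozen) {
        char key[32];
        int value;
        FILE *f;

        f = fopen(path, "re");
        if (!f)
                return -1;

        while (fscanf(f, "%31s %d", key, &value) == 2) {
                if (strcmp(key, "populated") == 0)
                        *populated = value;
                else if (strcmp(key, "frozen") == 0)
                        *frozen = value;
        }

        fclose(f);
        return 0;
}

/* Example: read_cgroup_events("/sys/fs/cgroup/system.slice/foo.service/cgroup.events", &p, &fr); */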
3096
efdb0237
LP
3097static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
3098 Manager *m = userdata;
3099
3100 assert(s);
3101 assert(fd >= 0);
3102 assert(m);
3103
3104 for (;;) {
3105 union inotify_event_buffer buffer;
3106 struct inotify_event *e;
3107 ssize_t l;
3108
3109 l = read(fd, &buffer, sizeof(buffer));
3110 if (l < 0) {
47249640 3111 if (IN_SET(errno, EINTR, EAGAIN))
efdb0237
LP
3112 return 0;
3113
3114 return log_error_errno(errno, "Failed to read control group inotify events: %m");
3115 }
3116
3117 FOREACH_INOTIFY_EVENT(e, buffer, l) {
3118 Unit *u;
3119
3120 if (e->wd < 0)
3121 /* Queue overflow has no watch descriptor */
3122 continue;
3123
3124 if (e->mask & IN_IGNORED)
3125 /* The watch was just removed */
3126 continue;
3127
afcfaa69
LP
3128 /* Note that inotify might deliver events for a watch even after it was removed,
3129 * because they were queued before the removal. Let's safely ignore them here. */
3130
0bb814c2 3131 u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
afcfaa69 3132 if (u)
d9e45bc3 3133 unit_check_cgroup_events(u);
efdb0237 3134
afcfaa69
LP
3135 u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd));
3136 if (u)
3137 unit_add_to_cgroup_oom_queue(u);
efdb0237
LP
3138 }
3139 }
8e274523
LP
3140}
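/* Illustrative sketch (not part of the original file): a minimal standalone inotify watch on a
 * unit's cgroup.events file, the same kernel mechanism the unified-hierarchy notification path
 * above relies on. The watched path is an assumption for the example. */
#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

int main(void) {
        char buf[4096] __attribute__((aligned(__alignof__(struct inotify_event))));
        ssize_t l;
        int fd, wd;

        fd = inotify_init1(IN_CLOEXEC);
        if (fd < 0)
                return 1;

        /* cgroup.events is modified whenever the "populated"/"frozen" state of the group changes */
        wd = inotify_add_watch(fd, "/sys/fs/cgroup/system.slice/foo.service/cgroup.events", IN_MODIFY);
        if (wd < 0)
                return 1;

        l = read(fd, buf, sizeof(buf)); /* blocks until at least one event arrives */
        if (l > 0)
                printf("got %zd bytes of inotify events\n", l);

        close(fd);
        return 0;
}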
3141
17f14955
RG
3142static int cg_bpf_mask_supported(CGroupMask *ret) {
3143 CGroupMask mask = 0;
3144 int r;
3145
3146 /* BPF-based firewall */
3147 r = bpf_firewall_supported();
3148 if (r > 0)
3149 mask |= CGROUP_MASK_BPF_FIREWALL;
3150
084c7007
RG
3151 /* BPF-based device access control */
3152 r = bpf_devices_supported();
3153 if (r > 0)
3154 mask |= CGROUP_MASK_BPF_DEVICES;
3155
506ea51b
JK
3156 /* BPF pinned prog */
3157 r = bpf_foreign_supported();
3158 if (r > 0)
3159 mask |= CGROUP_MASK_BPF_FOREIGN;
3160
a8e5eb17 3161 /* BPF-based bind{4|6} hooks */
cd09a5f3 3162 r = bpf_socket_bind_supported();
a8e5eb17
JK
3163 if (r > 0)
3164 mask |= CGROUP_MASK_BPF_SOCKET_BIND;
3165
6f50d4f7
MV
3166 /* BPF-based cgroup_skb/{egress|ingress} hooks */
3167 r = restrict_network_interfaces_supported();
3168 if (r > 0)
3169 mask |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES;
3170
17f14955
RG
3171 *ret = mask;
3172 return 0;
3173}
3174
8e274523 3175int manager_setup_cgroup(Manager *m) {
9444b1f2 3176 _cleanup_free_ char *path = NULL;
10bd3e2e 3177 const char *scope_path;
b4cccbc1 3178 int r, all_unified;
17f14955 3179 CGroupMask mask;
efdb0237 3180 char *e;
8e274523
LP
3181
3182 assert(m);
3183
35d2e7ec 3184 /* 1. Determine hierarchy */
efdb0237 3185 m->cgroup_root = mfree(m->cgroup_root);
9444b1f2 3186 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
23bbb0de
MS
3187 if (r < 0)
3188 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
8e274523 3189
efdb0237
LP
3190 /* Chop off the init scope, if we are already located in it */
3191 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
0d8c31ff 3192
efdb0237
LP
3193 /* LEGACY: Also chop off the system slice if we are in
3194 * it. This is to support live upgrades from older systemd
3195 * versions where PID 1 was moved there. Also see
3196 * cg_get_root_path(). */
463d0d15 3197 if (!e && MANAGER_IS_SYSTEM(m)) {
9444b1f2 3198 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
15c60e99 3199 if (!e)
efdb0237 3200 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
0baf24dd 3201 }
efdb0237
LP
3202 if (e)
3203 *e = 0;
7ccfb64a 3204
7546145e
LP
3205 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
3206 * easily prepend it everywhere. */
3207 delete_trailing_chars(m->cgroup_root, "/");
8e274523 3208
35d2e7ec 3209 /* 2. Show data */
9444b1f2 3210 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
23bbb0de
MS
3211 if (r < 0)
3212 return log_error_errno(r, "Cannot find cgroup mount point: %m");
8e274523 3213
d4d99bc6 3214 r = cg_unified();
415fc41c
TH
3215 if (r < 0)
3216 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
5da38d07 3217
b4cccbc1 3218 all_unified = cg_all_unified();
d4c819ed
ZJS
3219 if (all_unified < 0)
3220 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
3221 if (all_unified > 0)
efdb0237 3222 log_debug("Unified cgroup hierarchy is located at %s.", path);
b4cccbc1 3223 else {
c22800e4 3224 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
b4cccbc1
LP
3225 if (r < 0)
3226 return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
3227 if (r > 0)
3228 log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
3229 else
3230 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
3231 }
efdb0237 3232
09e24654 3233 /* 3. Allocate cgroup empty defer event source */
5dcadb4c 3234 m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
09e24654
LP
3235 r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
3236 if (r < 0)
3237 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
3238
cbe83389
LP
3239 /* Schedule cgroup empty checks early, but after having processed service notification messages or
3240 * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
3241 * notification, after we have already collected the metadata that notifications and SIGCHLD offer. */
09e24654
LP
3242 r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
3243 if (r < 0)
3244 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
3245
3246 r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
3247 if (r < 0)
3248 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
3249
3250 (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
3251
3252 /* 4. Install notifier inotify object, or agent */
10bd3e2e 3253 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
c6c18be3 3254
09e24654 3255 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
efdb0237 3256
5dcadb4c 3257 m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
10bd3e2e 3258 safe_close(m->cgroup_inotify_fd);
efdb0237 3259
10bd3e2e
LP
3260 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
3261 if (m->cgroup_inotify_fd < 0)
3262 return log_error_errno(errno, "Failed to create control group inotify object: %m");
efdb0237 3263
10bd3e2e
LP
3264 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
3265 if (r < 0)
3266 return log_error_errno(r, "Failed to watch control group inotify object: %m");
efdb0237 3267
cbe83389
LP
3268 /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
3269 * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
3270 * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
3271 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9);
10bd3e2e
LP
3272 if (r < 0)
3273 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
efdb0237 3274
10bd3e2e 3275 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
efdb0237 3276
611c4f8a 3277 } else if (MANAGER_IS_SYSTEM(m) && manager_owns_host_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) {
efdb0237 3278
10bd3e2e
LP
3279 /* On the legacy hierarchy we only get notifications via cgroup agents. (This isn't really reliable,
3280 * since the release agent mechanism does not generate events when control groups with children run empty.) */
8e274523 3281
ce906769 3282 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUPS_AGENT_PATH);
23bbb0de 3283 if (r < 0)
10bd3e2e
LP
3284 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
3285 else if (r > 0)
3286 log_debug("Installed release agent.");
3287 else if (r == 0)
3288 log_debug("Release agent already installed.");
3289 }
efdb0237 3290
09e24654 3291 /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
10bd3e2e
LP
3292 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
3293 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
aa77e234
MS
3294 if (r >= 0) {
3295 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
3296 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
3297 if (r < 0)
3298 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
c6c18be3 3299
aa77e234
MS
3300 /* 6. And pin it, so that it cannot be unmounted */
3301 safe_close(m->pin_cgroupfs_fd);
3302 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
3303 if (m->pin_cgroupfs_fd < 0)
3304 return log_error_errno(errno, "Failed to open pin file: %m");
0d8c31ff 3305
638cece4 3306 } else if (!MANAGER_IS_TEST_RUN(m))
aa77e234 3307 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
10bd3e2e 3308
09e24654 3309 /* 7. Always enable hierarchical support if it exists... */
638cece4 3310 if (!all_unified && !MANAGER_IS_TEST_RUN(m))
10bd3e2e 3311 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
c6c18be3 3312
17f14955 3313 /* 8. Figure out which controllers are supported */
0fa7b500 3314 r = cg_mask_supported_subtree(m->cgroup_root, &m->cgroup_supported);
efdb0237
LP
3315 if (r < 0)
3316 return log_error_errno(r, "Failed to determine supported controllers: %m");
17f14955
RG
3317
3318 /* 9. Figure out which bpf-based pseudo-controllers are supported */
3319 r = cg_bpf_mask_supported(&mask);
3320 if (r < 0)
3321 return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m");
3322 m->cgroup_supported |= mask;
3323
3324 /* 10. Log which controllers are supported */
e8616626
ZJS
3325 for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
3326 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c),
3327 yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
9156e799 3328
a32360f1 3329 return 0;
8e274523
LP
3330}
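/* Illustrative sketch (not part of the original file): step 1 above asks which cgroup we are
 * running in. On an all-unified (cgroup2-only) system /proc/self/cgroup holds a single line of
 * the form "0::/some/path", which can be parsed directly; cg_pid_get_path() additionally copes
 * with the legacy and hybrid layouts, so this is only the simple case. */
#include <stdio.h>
#include <string.h>

static int get_own_cgroup_v2(char *ret, size_t size) {
        char line[4096];
        FILE *f;
        int r = -1;

        f = fopen("/proc/self/cgroup", "re");
        if (!f)
                return -1;

        while (fgets(line, sizeof(line), f)) {
                if (strncmp(line, "0::", 3) == 0) {
                        line[strcspn(line, "\n")] = 0;   /* chop trailing newline */
                        snprintf(ret, size, "%s", line + 3);
                        r = 0;
                        break;
                }
        }

        fclose(f);
        return r;
}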
3331
c6c18be3 3332void manager_shutdown_cgroup(Manager *m, bool delete) {
8e274523
LP
3333 assert(m);
3334
9444b1f2
LP
3335 /* We can't really delete the group, since we are in it. But
3336 * let's trim it. */
5dd2f5ff 3337 if (delete && m->cgroup_root && !FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL))
efdb0237
LP
3338 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
3339
5dcadb4c 3340 m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
09e24654 3341
0bb814c2 3342 m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit);
afcfaa69 3343 m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit);
efdb0237 3344
5dcadb4c 3345 m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
efdb0237 3346 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
8e274523 3347
03e334a1 3348 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
c6c18be3 3349
efdb0237 3350 m->cgroup_root = mfree(m->cgroup_root);
8e274523
LP
3351}
3352
4ad49000 3353Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
acb14d31 3354 char *p;
4ad49000 3355 Unit *u;
acb14d31
LP
3356
3357 assert(m);
3358 assert(cgroup);
acb14d31 3359
4ad49000
LP
3360 u = hashmap_get(m->cgroup_unit, cgroup);
3361 if (u)
3362 return u;
acb14d31 3363
8e70580b 3364 p = strdupa(cgroup);
acb14d31
LP
3365 for (;;) {
3366 char *e;
3367
3368 e = strrchr(p, '/');
efdb0237
LP
3369 if (!e || e == p)
3370 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
acb14d31
LP
3371
3372 *e = 0;
3373
4ad49000
LP
3374 u = hashmap_get(m->cgroup_unit, p);
3375 if (u)
3376 return u;
acb14d31
LP
3377 }
3378}
3379
b3ac818b 3380Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
4ad49000 3381 _cleanup_free_ char *cgroup = NULL;
8e274523 3382
8c47c732
LP
3383 assert(m);
3384
62a76913 3385 if (!pid_is_valid(pid))
b3ac818b
LP
3386 return NULL;
3387
62a76913 3388 if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
b3ac818b
LP
3389 return NULL;
3390
3391 return manager_get_unit_by_cgroup(m, cgroup);
3392}
3393
3394Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
62a76913 3395 Unit *u, **array;
b3ac818b
LP
3396
3397 assert(m);
3398
62a76913
LP
3399 /* Note that a process might be owned by multiple units; we return only one here, which is good enough for most
3400 * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
3401 * relevant one, since children of the process will be assigned to that one, too, before all else. */
3402
3403 if (!pid_is_valid(pid))
8c47c732
LP
3404 return NULL;
3405
2ca9d979 3406 if (pid == getpid_cached())
efdb0237
LP
3407 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
3408
62a76913 3409 u = manager_get_unit_by_pid_cgroup(m, pid);
5fe8876b
LP
3410 if (u)
3411 return u;
3412
62a76913 3413 u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
5fe8876b
LP
3414 if (u)
3415 return u;
3416
62a76913
LP
3417 array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
3418 if (array)
3419 return array[0];
3420
3421 return NULL;
6dde1f33 3422}
4fbf50b3 3423
4ad49000
LP
3424int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
3425 Unit *u;
4fbf50b3 3426
4ad49000
LP
3427 assert(m);
3428 assert(cgroup);
4fbf50b3 3429
09e24654
LP
3430 /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
3431 * or from the --system instance */
3432
d8fdc620
LP
3433 log_debug("Got cgroup empty notification for: %s", cgroup);
3434
4ad49000 3435 u = manager_get_unit_by_cgroup(m, cgroup);
5ad096b3
LP
3436 if (!u)
3437 return 0;
b56c28c3 3438
09e24654
LP
3439 unit_add_to_cgroup_empty_queue(u);
3440 return 1;
5ad096b3
LP
3441}
3442
93ff34e4
LB
3443int unit_get_memory_available(Unit *u, uint64_t *ret) {
3444 uint64_t unit_current, available = UINT64_MAX;
3445 CGroupContext *unit_context;
3446 const char *memory_file;
3447 int r;
3448
3449 assert(u);
3450 assert(ret);
3451
3452 /* If data from cgroups can be accessed, try to find out how much more memory a unit can
3453 * claim before hitting the configured cgroup limits (if any). Consider both MemoryHigh
3454 * and MemoryMax, and also any slice the unit might be nested below. */
3455
3456 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
3457 return -ENODATA;
3458
3459 if (!u->cgroup_path)
3460 return -ENODATA;
3461
3462 /* The root cgroup doesn't expose this information */
3463 if (unit_has_host_root_cgroup(u))
3464 return -ENODATA;
3465
3466 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
3467 return -ENODATA;
3468
3469 r = cg_all_unified();
3470 if (r < 0)
3471 return r;
3472 memory_file = r > 0 ? "memory.current" : "memory.usage_in_bytes";
3473
3474 r = cg_get_attribute_as_uint64("memory", u->cgroup_path, memory_file, &unit_current);
3475 if (r < 0)
3476 return r;
3477
3478 assert_se(unit_context = unit_get_cgroup_context(u));
3479
3480 if (unit_context->memory_max != UINT64_MAX || unit_context->memory_high != UINT64_MAX)
3481 available = LESS_BY(MIN(unit_context->memory_max, unit_context->memory_high), unit_current);
3482
3483 for (Unit *slice = UNIT_GET_SLICE(u); slice; slice = UNIT_GET_SLICE(slice)) {
3484 uint64_t slice_current, slice_available = UINT64_MAX;
3485 CGroupContext *slice_context;
3486
3487 /* No point in continuing if we can't go any lower */
3488 if (available == 0)
3489 break;
3490
3491 if (!slice->cgroup_path)
3492 continue;
3493
3494 slice_context = unit_get_cgroup_context(slice);
3495 if (!slice_context)
3496 continue;
3497
3498 if (slice_context->memory_max == UINT64_MAX && slice_context->memory_high == UINT64_MAX)
3499 continue;
3500
3501 r = cg_get_attribute_as_uint64("memory", slice->cgroup_path, memory_file, &slice_current);
3502 if (r < 0)
3503 continue;
3504
3505 slice_available = LESS_BY(MIN(slice_context->memory_max, slice_context->memory_high), slice_current);
3506 available = MIN(slice_available, available);
3507 }
3508
3509 *ret = available;
3510
3511 return 0;
3512}
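/* Illustrative sketch (not part of the original file): the walk above computes, per level,
 * LESS_BY(MIN(MemoryMax, MemoryHigh), current) and keeps the minimum across the unit and its
 * slices. A tiny standalone version of that arithmetic, with made-up limits and usage: */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t headroom(uint64_t max, uint64_t high, uint64_t current) {
        uint64_t limit = high < max ? high : max;
        return limit > current ? limit - current : 0;   /* saturating subtraction, like LESS_BY() */
}

int main(void) {
        uint64_t unit_room, slice_room, available;

        unit_room = headroom(512ULL * 1024 * 1024, UINT64_MAX, 200ULL * 1024 * 1024);    /* the unit itself */
        slice_room = headroom(UINT64_MAX, 1024ULL * 1024 * 1024, 900ULL * 1024 * 1024);  /* its parent slice */
        available = unit_room < slice_room ? unit_room : slice_room;

        printf("available: %" PRIu64 " bytes\n", available);   /* the slice is the tighter bound here */
        return 0;
}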
3513
5ad096b3 3514int unit_get_memory_current(Unit *u, uint64_t *ret) {
5ad096b3
LP
3515 int r;
3516
3517 assert(u);
3518 assert(ret);
3519
2e4025c0 3520 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
cf3b4be1
LP
3521 return -ENODATA;
3522
5ad096b3
LP
3523 if (!u->cgroup_path)
3524 return -ENODATA;
3525
1f73aa00 3526 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
611c4f8a 3527 if (unit_has_host_root_cgroup(u))
c482724a 3528 return procfs_memory_get_used(ret);
1f73aa00 3529
efdb0237 3530 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
5ad096b3
LP
3531 return -ENODATA;
3532
b4cccbc1
LP
3533 r = cg_all_unified();
3534 if (r < 0)
3535 return r;
5ad096b3 3536
613328c3 3537 return cg_get_attribute_as_uint64("memory", u->cgroup_path, r > 0 ? "memory.current" : "memory.usage_in_bytes", ret);
5ad096b3
LP
3538}
3539
03a7b521 3540int unit_get_tasks_current(Unit *u, uint64_t *ret) {
03a7b521
LP
3541 assert(u);
3542 assert(ret);
3543
2e4025c0 3544 if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
cf3b4be1
LP
3545 return -ENODATA;
3546
03a7b521
LP
3547 if (!u->cgroup_path)
3548 return -ENODATA;
3549
c36a69f4 3550 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
611c4f8a 3551 if (unit_has_host_root_cgroup(u))
c36a69f4
LP
3552 return procfs_tasks_get_current(ret);
3553
1f73aa00
LP
3554 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
3555 return -ENODATA;
3556
613328c3 3557 return cg_get_attribute_as_uint64("pids", u->cgroup_path, "pids.current", ret);
03a7b521
LP
3558}
3559
5ad096b3 3560static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
5ad096b3
LP
3561 uint64_t ns;
3562 int r;
3563
3564 assert(u);
3565 assert(ret);
3566
3567 if (!u->cgroup_path)
3568 return -ENODATA;
3569
1f73aa00 3570 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
611c4f8a 3571 if (unit_has_host_root_cgroup(u))
1f73aa00
LP
3572 return procfs_cpu_get_usage(ret);
3573
f98c2585
CD
3574 /* Requisite controllers for CPU accounting are not enabled */
3575 if ((get_cpu_accounting_mask() & ~u->cgroup_realized_mask) != 0)
3576 return -ENODATA;
3577
92a99304
LP
3578 r = cg_all_unified();
3579 if (r < 0)
3580 return r;
b4cccbc1 3581 if (r > 0) {
66ebf6c0
TH
3582 _cleanup_free_ char *val = NULL;
3583 uint64_t us;
5ad096b3 3584
b734a4ff 3585 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
b734a4ff
LP
3586 if (IN_SET(r, -ENOENT, -ENXIO))
3587 return -ENODATA;
d742f4b5
LP
3588 if (r < 0)
3589 return r;
66ebf6c0
TH
3590
3591 r = safe_atou64(val, &us);
3592 if (r < 0)
3593 return r;
3594
3595 ns = us * NSEC_PER_USEC;
613328c3
AZ
3596 } else
3597 return cg_get_attribute_as_uint64("cpuacct", u->cgroup_path, "cpuacct.usage", ret);
5ad096b3
LP
3598
3599 *ret = ns;
3600 return 0;
3601}
3602
3603int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
3604 nsec_t ns;
3605 int r;
3606
fe700f46
LP
3607 assert(u);
3608
3609 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
3610 * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
3611 * call this function with a NULL return value. */
3612
2e4025c0 3613 if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
cf3b4be1
LP
3614 return -ENODATA;
3615
5ad096b3 3616 r = unit_get_cpu_usage_raw(u, &ns);
fe700f46
LP
3617 if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
3618 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
3619 * cached value. */
3620
3621 if (ret)
3622 *ret = u->cpu_usage_last;
3623 return 0;
3624 }
5ad096b3
LP
3625 if (r < 0)
3626 return r;
3627
66ebf6c0
TH
3628 if (ns > u->cpu_usage_base)
3629 ns -= u->cpu_usage_base;
5ad096b3
LP
3630 else
3631 ns = 0;
3632
fe700f46
LP
3633 u->cpu_usage_last = ns;
3634 if (ret)
3635 *ret = ns;
3636
5ad096b3
LP
3637 return 0;
3638}
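/* Illustrative sketch (not part of the original file): on the unified hierarchy the raw counter
 * above comes from the "usage_usec" field of cpu.stat and is converted to nanoseconds. cpu.stat
 * looks roughly like:
 *
 *     usage_usec 2345678
 *     user_usec 2000000
 *     system_usec 345678
 *
 * A plain-stdio parse of that field (the helper name and path are just examples): */
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

static int read_cpu_usage_nsec(const char *path, uint64_t *ret_nsec) {
        char key[32];
        uint64_t value;
        FILE *f;
        int r = -1;

        f = fopen(path, "re");
        if (!f)
                return -1;

        while (fscanf(f, "%31s %" SCNu64, key, &value) == 2) {
                if (strcmp(key, "usage_usec") == 0) {
                        *ret_nsec = value * 1000;   /* usec -> nsec, as done above */
                        r = 0;
                        break;
                }
        }

        fclose(f);
        return r;
}

/* Example: read_cpu_usage_nsec("/sys/fs/cgroup/system.slice/foo.service/cpu.stat", &ns); */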
3639
906c06f6
DM
3640int unit_get_ip_accounting(
3641 Unit *u,
3642 CGroupIPAccountingMetric metric,
3643 uint64_t *ret) {
3644
6b659ed8 3645 uint64_t value;
906c06f6
DM
3646 int fd, r;
3647
3648 assert(u);
3649 assert(metric >= 0);
3650 assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
3651 assert(ret);
3652
2e4025c0 3653 if (!UNIT_CGROUP_BOOL(u, ip_accounting))
cf3b4be1
LP
3654 return -ENODATA;
3655
906c06f6
DM
3656 fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
3657 u->ip_accounting_ingress_map_fd :
3658 u->ip_accounting_egress_map_fd;
906c06f6
DM
3659 if (fd < 0)
3660 return -ENODATA;
3661
3662 if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
6b659ed8 3663 r = bpf_firewall_read_accounting(fd, &value, NULL);
906c06f6 3664 else
6b659ed8
LP
3665 r = bpf_firewall_read_accounting(fd, NULL, &value);
3666 if (r < 0)
3667 return r;
3668
3669 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
3670 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
3671 * ip_accounting_extra[] field, and add them in here transparently. */
3672
3673 *ret = value + u->ip_accounting_extra[metric];
906c06f6
DM
3674
3675 return r;
3676}
3677
fbe14fc9
LP
3678static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
3679 static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
3680 [CGROUP_IO_READ_BYTES] = "rbytes=",
3681 [CGROUP_IO_WRITE_BYTES] = "wbytes=",
3682 [CGROUP_IO_READ_OPERATIONS] = "rios=",
3683 [CGROUP_IO_WRITE_OPERATIONS] = "wios=",
3684 };
3685 uint64_t acc[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {};
3686 _cleanup_free_ char *path = NULL;
3687 _cleanup_fclose_ FILE *f = NULL;
3688 int r;
3689
3690 assert(u);
3691
3692 if (!u->cgroup_path)
3693 return -ENODATA;
3694
3695 if (unit_has_host_root_cgroup(u))
3696 return -ENODATA; /* TODO: return useful data for the top-level cgroup */
3697
3698 r = cg_all_unified();
3699 if (r < 0)
3700 return r;
3701 if (r == 0) /* TODO: support cgroupv1 */
3702 return -ENODATA;
3703
3704 if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_IO))
3705 return -ENODATA;
3706
3707 r = cg_get_path("io", u->cgroup_path, "io.stat", &path);
3708 if (r < 0)
3709 return r;
3710
3711 f = fopen(path, "re");
3712 if (!f)
3713 return -errno;
3714
3715 for (;;) {
3716 _cleanup_free_ char *line = NULL;
3717 const char *p;
3718
3719 r = read_line(f, LONG_LINE_MAX, &line);
3720 if (r < 0)
3721 return r;
3722 if (r == 0)
3723 break;
3724
3725 p = line;
3726 p += strcspn(p, WHITESPACE); /* Skip over device major/minor */
3727 p += strspn(p, WHITESPACE); /* Skip over following whitespace */
3728
3729 for (;;) {
3730 _cleanup_free_ char *word = NULL;
3731
3732 r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE);
3733 if (r < 0)
3734 return r;
3735 if (r == 0)
3736 break;
3737
3738 for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
3739 const char *x;
3740
3741 x = startswith(word, field_names[i]);
3742 if (x) {
3743 uint64_t w;
3744
3745 r = safe_atou64(x, &w);
3746 if (r < 0)
3747 return r;
3748
3749 /* Sum up the stats of all devices */
3750 acc[i] += w;
3751 break;
3752 }
3753 }
3754 }
3755 }
3756
3757 memcpy(ret, acc, sizeof(acc));
3758 return 0;
3759}
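/* Illustrative sketch (not part of the original file): io.stat carries one line per device, e.g.
 *
 *     8:0 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
 *
 * and the helper above sums each field over all devices. Summing two of the fields from a single
 * line could look like this (the helper name is just for the example): */
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

static void sum_io_line(const char *line, uint64_t *rbytes, uint64_t *wbytes) {
        const char *p;
        uint64_t v;

        p = strstr(line, "rbytes=");
        if (p && sscanf(p, "rbytes=%" SCNu64, &v) == 1)
                *rbytes += v;

        p = strstr(line, "wbytes=");
        if (p && sscanf(p, "wbytes=%" SCNu64, &v) == 1)
                *wbytes += v;
}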
3760
3761int unit_get_io_accounting(
3762 Unit *u,
3763 CGroupIOAccountingMetric metric,
3764 bool allow_cache,
3765 uint64_t *ret) {
3766
3767 uint64_t raw[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
3768 int r;
3769
3770 /* Retrieve an IO accounting parameter. This will subtract the counter taken when the unit was started. */
3771
3772 if (!UNIT_CGROUP_BOOL(u, io_accounting))
3773 return -ENODATA;
3774
3775 if (allow_cache && u->io_accounting_last[metric] != UINT64_MAX)
3776 goto done;
3777
3778 r = unit_get_io_accounting_raw(u, raw);
3779 if (r == -ENODATA && u->io_accounting_last[metric] != UINT64_MAX)
3780 goto done;
3781 if (r < 0)
3782 return r;
3783
3784 for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
3785 /* Saturated subtraction */
3786 if (raw[i] > u->io_accounting_base[i])
3787 u->io_accounting_last[i] = raw[i] - u->io_accounting_base[i];
3788 else
3789 u->io_accounting_last[i] = 0;
3790 }
3791
3792done:
3793 if (ret)
3794 *ret = u->io_accounting_last[metric];
3795
3796 return 0;
3797}
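/* Illustrative sketch (not part of the original file): the reset/get pair above implements a
 * "base + cached last value" scheme: reset snapshots the raw kernel counter as the base, get
 * returns raw minus base (saturated) and caches it so the value survives removal of the cgroup.
 * Reduced to a single counter, with a stand-in raw reader instead of the cgroup attribute read: */
#include <stdbool.h>
#include <stdint.h>

static uint64_t counter_base = 0;
static uint64_t counter_last = UINT64_MAX;            /* UINT64_MAX means "nothing cached yet" */

static bool read_raw_counter(uint64_t *ret) {         /* stand-in for the cgroup attribute read */
        static uint64_t fake = 1000;
        *ret = (fake += 250);
        return true;
}

static void counter_reset(void) {
        counter_last = UINT64_MAX;
        if (!read_raw_counter(&counter_base))
                counter_base = 0;
}

static uint64_t counter_get(void) {
        uint64_t raw;

        if (!read_raw_counter(&raw))                   /* cgroup gone? fall back to the cache */
                return counter_last != UINT64_MAX ? counter_last : 0;

        counter_last = raw > counter_base ? raw - counter_base : 0;   /* saturated subtraction */
        return counter_last;
}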
3798
906c06f6 3799int unit_reset_cpu_accounting(Unit *u) {
5ad096b3
LP
3800 int r;
3801
3802 assert(u);
3803
fe700f46
LP
3804 u->cpu_usage_last = NSEC_INFINITY;
3805
0bbff7d6 3806 r = unit_get_cpu_usage_raw(u, &u->cpu_usage_base);
5ad096b3 3807 if (r < 0) {
66ebf6c0 3808 u->cpu_usage_base = 0;
5ad096b3 3809 return r;
b56c28c3 3810 }
2633eb83 3811
4ad49000 3812 return 0;
4fbf50b3
LP
3813}
3814
906c06f6
DM
3815int unit_reset_ip_accounting(Unit *u) {
3816 int r = 0, q = 0;
3817
3818 assert(u);
3819
3820 if (u->ip_accounting_ingress_map_fd >= 0)
3821 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
3822
3823 if (u->ip_accounting_egress_map_fd >= 0)
3824 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
3825
6b659ed8
LP
3826 zero(u->ip_accounting_extra);
3827
906c06f6
DM
3828 return r < 0 ? r : q;
3829}
3830
fbe14fc9
LP
3831int unit_reset_io_accounting(Unit *u) {
3832 int r;
3833
3834 assert(u);
3835
3836 for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++)
3837 u->io_accounting_last[i] = UINT64_MAX;
3838
3839 r = unit_get_io_accounting_raw(u, u->io_accounting_base);
3840 if (r < 0) {
3841 zero(u->io_accounting_base);
3842 return r;
3843 }
3844
3845 return 0;
3846}
3847
9b2559a1 3848int unit_reset_accounting(Unit *u) {
fbe14fc9 3849 int r, q, v;
9b2559a1
LP
3850
3851 assert(u);
3852
3853 r = unit_reset_cpu_accounting(u);
fbe14fc9
LP
3854 q = unit_reset_io_accounting(u);
3855 v = unit_reset_ip_accounting(u);
9b2559a1 3856
fbe14fc9 3857 return r < 0 ? r : q < 0 ? q : v;
9b2559a1
LP
3858}
3859
e7ab4d1a
LP
3860void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
3861 assert(u);
3862
3863 if (!UNIT_HAS_CGROUP_CONTEXT(u))
3864 return;
3865
3866 if (m == 0)
3867 return;
3868
538b4852
TH
3869 /* always invalidate compat pairs together */
3870 if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
3871 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
3872
7cce4fb7
LP
3873 if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
3874 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
3875
e00068e7 3876 if (FLAGS_SET(u->cgroup_invalidated_mask, m)) /* NOP? */
e7ab4d1a
LP
3877 return;
3878
e00068e7 3879 u->cgroup_invalidated_mask |= m;
91a6073e 3880 unit_add_to_cgroup_realize_queue(u);
e7ab4d1a
LP
3881}
3882
906c06f6
DM
3883void unit_invalidate_cgroup_bpf(Unit *u) {
3884 assert(u);
3885
3886 if (!UNIT_HAS_CGROUP_CONTEXT(u))
3887 return;
3888
17f14955 3889 if (u->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */
906c06f6
DM
3890 return;
3891
17f14955 3892 u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
91a6073e 3893 unit_add_to_cgroup_realize_queue(u);
906c06f6
DM
3894
3895 /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
3896 * list of our children includes our own. */
3897 if (u->type == UNIT_SLICE) {
3898 Unit *member;
906c06f6 3899
d219a2b0 3900 UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
15ed3c3a 3901 unit_invalidate_cgroup_bpf(member);
906c06f6
DM
3902 }
3903}
3904
869f52f2
DS
3905void unit_cgroup_catchup(Unit *u) {
3906 assert(u);
3907
3908 if (!UNIT_HAS_CGROUP_CONTEXT(u))
3909 return;
3910
3911 /* We dropped the inotify watch during reexec/reload, so we need to
3912 * check these as they may have changed.
3913 * Note that (currently) the kernel doesn't actually update cgroup
3914 * file modification times, so we can't just serialize and then check
3915 * the mtime for file(s) we are interested in. */
3916 (void) unit_check_cgroup_events(u);
3917 unit_add_to_cgroup_oom_queue(u);
3918}
3919
1d9cc876
LP
3920bool unit_cgroup_delegate(Unit *u) {
3921 CGroupContext *c;
3922
3923 assert(u);
3924
3925 if (!UNIT_VTABLE(u)->can_delegate)
3926 return false;
3927
3928 c = unit_get_cgroup_context(u);
3929 if (!c)
3930 return false;
3931
3932 return c->delegate;
3933}
3934
e7ab4d1a 3935void manager_invalidate_startup_units(Manager *m) {
e7ab4d1a
LP
3936 Unit *u;
3937
3938 assert(m);
3939
90e74a66 3940 SET_FOREACH(u, m->startup_units)
13c31542 3941 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
e7ab4d1a
LP
3942}
3943
da8e1782
MO
3944static int unit_get_nice(Unit *u) {
3945 ExecContext *ec;
3946
3947 ec = unit_get_exec_context(u);
3948 return ec ? ec->nice : 0;
3949}
3950
3951static uint64_t unit_get_cpu_weight(Unit *u) {
3952 ManagerState state = manager_state(u->manager);
3953 CGroupContext *cc;
3954
3955 cc = unit_get_cgroup_context(u);
3956 return cc ? cgroup_context_cpu_weight(cc, state) : CGROUP_WEIGHT_DEFAULT;
3957}
3958
3959int compare_job_priority(const void *a, const void *b) {
3960 const Job *x = a, *y = b;
3961 int nice_x, nice_y;
3962 uint64_t weight_x, weight_y;
3963 int ret;
3964
217b7b33
ZJS
3965 if ((ret = CMP(x->unit->type, y->unit->type)) != 0)
3966 return -ret;
3967
da8e1782
MO
3968 weight_x = unit_get_cpu_weight(x->unit);
3969 weight_y = unit_get_cpu_weight(y->unit);
3970
217b7b33
ZJS
3971 if ((ret = CMP(weight_x, weight_y)) != 0)
3972 return -ret;
da8e1782
MO
3973
3974 nice_x = unit_get_nice(x->unit);
3975 nice_y = unit_get_nice(y->unit);
3976
3977 if ((ret = CMP(nice_x, nice_y)) != 0)
3978 return ret;
3979
da8e1782
MO
3980 return strcmp(x->unit->id, y->unit->id);
3981}
3982
d9e45bc3
MS
3983int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
3984 _cleanup_free_ char *path = NULL;
3985 FreezerState target, kernel = _FREEZER_STATE_INVALID;
3986 int r;
3987
3988 assert(u);
3989 assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
3990
9a1e90ae
MS
3991 if (!cg_freezer_supported())
3992 return 0;
3993
d9e45bc3
MS
3994 if (!u->cgroup_realized)
3995 return -EBUSY;
3996
3997 target = action == FREEZER_FREEZE ? FREEZER_FROZEN : FREEZER_RUNNING;
3998
3999 r = unit_freezer_state_kernel(u, &kernel);
4000 if (r < 0)
4001 log_unit_debug_errno(u, r, "Failed to obtain cgroup freezer state: %m");
4002
4003 if (target == kernel) {
4004 u->freezer_state = target;
4005 return 0;
4006 }
4007
4008 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.freeze", &path);
4009 if (r < 0)
4010 return r;
4011
4012 log_unit_debug(u, "%s unit.", action == FREEZER_FREEZE ? "Freezing" : "Thawing");
4013
4014 if (action == FREEZER_FREEZE)
4015 u->freezer_state = FREEZER_FREEZING;
4016 else
4017 u->freezer_state = FREEZER_THAWING;
4018
4019 r = write_string_file(path, one_zero(action == FREEZER_FREEZE), WRITE_STRING_FILE_DISABLE_BUFFER);
4020 if (r < 0)
4021 return r;
4022
d910f4c2 4023 return 1;
d9e45bc3
MS
4024}
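/* Illustrative sketch (not part of the original file): freezing or thawing boils down to writing
 * "1" or "0" into the group's cgroup.freeze file; the kernel then reports the completed transition
 * via the "frozen" field of cgroup.events, which unit_check_cgroup_events() above picks up. The
 * helper name and the path in the usage note are just examples. */
#include <stdbool.h>
#include <stdio.h>

static int cgroup_set_frozen(const char *cgroup_dir, bool freeze) {
        char path[4096];
        FILE *f;

        snprintf(path, sizeof(path), "%s/cgroup.freeze", cgroup_dir);

        f = fopen(path, "we");
        if (!f)
                return -1;

        fprintf(f, "%c\n", freeze ? '1' : '0');
        return fclose(f) == 0 ? 0 : -1;
}

/* Example: cgroup_set_frozen("/sys/fs/cgroup/system.slice/foo.service", true); */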
4025
047f5d63
PH
4026int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
4027 _cleanup_free_ char *v = NULL;
4028 int r;
4029
4030 assert(u);
4031 assert(cpus);
4032
4033 if (!u->cgroup_path)
4034 return -ENODATA;
4035
4036 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0)
4037 return -ENODATA;
4038
4039 r = cg_all_unified();
4040 if (r < 0)
4041 return r;
4042 if (r == 0)
4043 return -ENODATA;
48fd01e5
LP
4044
4045 r = cg_get_attribute("cpuset", u->cgroup_path, name, &v);
047f5d63
PH
4046 if (r == -ENOENT)
4047 return -ENODATA;
4048 if (r < 0)
4049 return r;
4050
4051 return parse_cpu_set_full(v, cpus, false, NULL, NULL, 0, NULL);
4052}
4053
4e806bfa
AZ
4054static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
4055 [CGROUP_DEVICE_POLICY_AUTO] = "auto",
4056 [CGROUP_DEVICE_POLICY_CLOSED] = "closed",
4057 [CGROUP_DEVICE_POLICY_STRICT] = "strict",
4058};
4059
4ad49000 4060DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
d9e45bc3
MS
4061
4062static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = {
4063 [FREEZER_FREEZE] = "freeze",
4064 [FREEZER_THAW] = "thaw",
4065};
4066
4067DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction);