1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2013 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <fcntl.h>
22 #include <fnmatch.h>
23
24 #include "alloc-util.h"
25 #include "blockdev-util.h"
26 #include "bpf-firewall.h"
27 #include "bus-error.h"
28 #include "cgroup-util.h"
29 #include "cgroup.h"
30 #include "fd-util.h"
31 #include "fileio.h"
32 #include "fs-util.h"
33 #include "parse-util.h"
34 #include "path-util.h"
35 #include "process-util.h"
36 #include "procfs-util.h"
37 #include "special.h"
38 #include "stdio-util.h"
39 #include "string-table.h"
40 #include "string-util.h"
41 #include "virt.h"
42
43 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
44
45 bool manager_owns_root_cgroup(Manager *m) {
46 assert(m);
47
48 /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
49 * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there
50 * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace, we instead just check if
51 * we run in any kind of container virtualization. */
52
53 if (detect_container() > 0)
54 return false;
55
56 return isempty(m->cgroup_root) || path_equal(m->cgroup_root, "/");
57 }
58
59 bool unit_has_root_cgroup(Unit *u) {
60 assert(u);
61
62 /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
63 * the manager manages the root cgroup. */
64
65 if (!manager_owns_root_cgroup(u->manager))
66 return false;
67
68 return unit_has_name(u, SPECIAL_ROOT_SLICE);
69 }
70
71 static void cgroup_compat_warn(void) {
72 static bool cgroup_compat_warned = false;
73
74 if (cgroup_compat_warned)
75 return;
76
77 log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
78 "See cgroup-compat debug messages for details.");
79
80 cgroup_compat_warned = true;
81 }
82
83 #define log_cgroup_compat(unit, fmt, ...) do { \
84 cgroup_compat_warn(); \
85 log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \
86 } while (false)
87
88 void cgroup_context_init(CGroupContext *c) {
89 assert(c);
90
91 /* Initialize everything to the kernel defaults, assuming the
92 * structure is preinitialized to 0 */
93
94 c->cpu_weight = CGROUP_WEIGHT_INVALID;
95 c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
96 c->cpu_quota_per_sec_usec = USEC_INFINITY;
97
98 c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
99 c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
100
101 c->memory_high = CGROUP_LIMIT_MAX;
102 c->memory_max = CGROUP_LIMIT_MAX;
103 c->memory_swap_max = CGROUP_LIMIT_MAX;
104
105 c->memory_limit = CGROUP_LIMIT_MAX;
106
107 c->io_weight = CGROUP_WEIGHT_INVALID;
108 c->startup_io_weight = CGROUP_WEIGHT_INVALID;
109
110 c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
111 c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
112
113 c->tasks_max = (uint64_t) -1;
114 }
115
116 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
117 assert(c);
118 assert(a);
119
120 LIST_REMOVE(device_allow, c->device_allow, a);
121 free(a->path);
122 free(a);
123 }
124
125 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
126 assert(c);
127 assert(w);
128
129 LIST_REMOVE(device_weights, c->io_device_weights, w);
130 free(w->path);
131 free(w);
132 }
133
134 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
135 assert(c);
136 assert(l);
137
138 LIST_REMOVE(device_limits, c->io_device_limits, l);
139 free(l->path);
140 free(l);
141 }
142
143 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
144 assert(c);
145 assert(w);
146
147 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
148 free(w->path);
149 free(w);
150 }
151
152 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
153 assert(c);
154 assert(b);
155
156 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
157 free(b->path);
158 free(b);
159 }
160
161 void cgroup_context_done(CGroupContext *c) {
162 assert(c);
163
164 while (c->io_device_weights)
165 cgroup_context_free_io_device_weight(c, c->io_device_weights);
166
167 while (c->io_device_limits)
168 cgroup_context_free_io_device_limit(c, c->io_device_limits);
169
170 while (c->blockio_device_weights)
171 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
172
173 while (c->blockio_device_bandwidths)
174 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
175
176 while (c->device_allow)
177 cgroup_context_free_device_allow(c, c->device_allow);
178
179 c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
180 c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
181 }
182
183 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
184 CGroupIODeviceLimit *il;
185 CGroupIODeviceWeight *iw;
186 CGroupBlockIODeviceBandwidth *b;
187 CGroupBlockIODeviceWeight *w;
188 CGroupDeviceAllow *a;
189 IPAddressAccessItem *iaai;
190 char u[FORMAT_TIMESPAN_MAX];
191
192 assert(c);
193 assert(f);
194
195 prefix = strempty(prefix);
196
197 fprintf(f,
198 "%sCPUAccounting=%s\n"
199 "%sIOAccounting=%s\n"
200 "%sBlockIOAccounting=%s\n"
201 "%sMemoryAccounting=%s\n"
202 "%sTasksAccounting=%s\n"
203 "%sIPAccounting=%s\n"
204 "%sCPUWeight=%" PRIu64 "\n"
205 "%sStartupCPUWeight=%" PRIu64 "\n"
206 "%sCPUShares=%" PRIu64 "\n"
207 "%sStartupCPUShares=%" PRIu64 "\n"
208 "%sCPUQuotaPerSecSec=%s\n"
209 "%sIOWeight=%" PRIu64 "\n"
210 "%sStartupIOWeight=%" PRIu64 "\n"
211 "%sBlockIOWeight=%" PRIu64 "\n"
212 "%sStartupBlockIOWeight=%" PRIu64 "\n"
213 "%sMemoryLow=%" PRIu64 "\n"
214 "%sMemoryHigh=%" PRIu64 "\n"
215 "%sMemoryMax=%" PRIu64 "\n"
216 "%sMemorySwapMax=%" PRIu64 "\n"
217 "%sMemoryLimit=%" PRIu64 "\n"
218 "%sTasksMax=%" PRIu64 "\n"
219 "%sDevicePolicy=%s\n"
220 "%sDelegate=%s\n",
221 prefix, yes_no(c->cpu_accounting),
222 prefix, yes_no(c->io_accounting),
223 prefix, yes_no(c->blockio_accounting),
224 prefix, yes_no(c->memory_accounting),
225 prefix, yes_no(c->tasks_accounting),
226 prefix, yes_no(c->ip_accounting),
227 prefix, c->cpu_weight,
228 prefix, c->startup_cpu_weight,
229 prefix, c->cpu_shares,
230 prefix, c->startup_cpu_shares,
231 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
232 prefix, c->io_weight,
233 prefix, c->startup_io_weight,
234 prefix, c->blockio_weight,
235 prefix, c->startup_blockio_weight,
236 prefix, c->memory_low,
237 prefix, c->memory_high,
238 prefix, c->memory_max,
239 prefix, c->memory_swap_max,
240 prefix, c->memory_limit,
241 prefix, c->tasks_max,
242 prefix, cgroup_device_policy_to_string(c->device_policy),
243 prefix, yes_no(c->delegate));
244
245 if (c->delegate) {
246 _cleanup_free_ char *t = NULL;
247
248 (void) cg_mask_to_string(c->delegate_controllers, &t);
249
250 fprintf(f, "%sDelegateControllers=%s\n",
251 prefix,
252 strempty(t));
253 }
254
255 LIST_FOREACH(device_allow, a, c->device_allow)
256 fprintf(f,
257 "%sDeviceAllow=%s %s%s%s\n",
258 prefix,
259 a->path,
260 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
261
262 LIST_FOREACH(device_weights, iw, c->io_device_weights)
263 fprintf(f,
264 "%sIODeviceWeight=%s %" PRIu64,
265 prefix,
266 iw->path,
267 iw->weight);
268
269 LIST_FOREACH(device_limits, il, c->io_device_limits) {
270 char buf[FORMAT_BYTES_MAX];
271 CGroupIOLimitType type;
272
273 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
274 if (il->limits[type] != cgroup_io_limit_defaults[type])
275 fprintf(f,
276 "%s%s=%s %s\n",
277 prefix,
278 cgroup_io_limit_type_to_string(type),
279 il->path,
280 format_bytes(buf, sizeof(buf), il->limits[type]));
281 }
282
283 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
284 fprintf(f,
285 "%sBlockIODeviceWeight=%s %" PRIu64,
286 prefix,
287 w->path,
288 w->weight);
289
290 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
291 char buf[FORMAT_BYTES_MAX];
292
293 if (b->rbps != CGROUP_LIMIT_MAX)
294 fprintf(f,
295 "%sBlockIOReadBandwidth=%s %s\n",
296 prefix,
297 b->path,
298 format_bytes(buf, sizeof(buf), b->rbps));
299 if (b->wbps != CGROUP_LIMIT_MAX)
300 fprintf(f,
301 "%sBlockIOWriteBandwidth=%s %s\n",
302 prefix,
303 b->path,
304 format_bytes(buf, sizeof(buf), b->wbps));
305 }
306
307 LIST_FOREACH(items, iaai, c->ip_address_allow) {
308 _cleanup_free_ char *k = NULL;
309
310 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
311 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
312 }
313
314 LIST_FOREACH(items, iaai, c->ip_address_deny) {
315 _cleanup_free_ char *k = NULL;
316
317 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
318 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
319 }
320 }
321
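/* Resolves a path to the dev_t used for per-device I/O settings: block device nodes are used directly,
 * other paths fall back to the block device backing their file system (partitions are resolved to the
 * whole disk where possible). */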
322 static int lookup_block_device(const char *p, dev_t *dev) {
323 struct stat st;
324 int r;
325
326 assert(p);
327 assert(dev);
328
329 r = stat(p, &st);
330 if (r < 0)
331 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
332
333 if (S_ISBLK(st.st_mode))
334 *dev = st.st_rdev;
335 else if (major(st.st_dev) != 0) {
336 /* If this is not a device node then find the block
337 * device this file is stored on */
338 *dev = st.st_dev;
339
340 /* If this is a partition, try to get the originating
341 * block device */
342 (void) block_get_whole_disk(*dev, dev);
343 } else {
344 log_warning("%s is not a block device, and the block device of its file system cannot be determined or is not local.", p);
345 return -ENODEV;
346 }
347
348 return 0;
349 }
350
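/* Adds a single device node to the legacy "devices" controller whitelist by writing a
 * "<c|b> <major>:<minor> <acc>" entry to devices.allow. A leading "-" in the node path means a
 * missing device is silently ignored. */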
351 static int whitelist_device(const char *path, const char *node, const char *acc) {
352 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
353 struct stat st;
354 bool ignore_notfound;
355 int r;
356
357 assert(path);
358 assert(acc);
359
360 if (node[0] == '-') {
361 /* Non-existent paths starting with "-" must be silently ignored */
362 node++;
363 ignore_notfound = true;
364 } else
365 ignore_notfound = false;
366
367 if (stat(node, &st) < 0) {
368 if (errno == ENOENT && ignore_notfound)
369 return 0;
370
371 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
372 }
373
374 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
375 log_warning("%s is not a device.", node);
376 return -ENODEV;
377 }
378
379 sprintf(buf,
380 "%c %u:%u %s",
381 S_ISCHR(st.st_mode) ? 'c' : 'b',
382 major(st.st_rdev), minor(st.st_rdev),
383 acc);
384
385 r = cg_set_attribute("devices", path, "devices.allow", buf);
386 if (r < 0)
387 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
388 "Failed to set devices.allow on %s: %m", path);
389
390 return r;
391 }
392
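/* Whitelists all devices with a given major number: the driver name is matched (via fnmatch()) against
 * /proc/devices and a "<type> <major>:* <acc>" entry is written to devices.allow. */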
393 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
394 _cleanup_fclose_ FILE *f = NULL;
395 char line[LINE_MAX];
396 bool good = false;
397 int r;
398
399 assert(path);
400 assert(acc);
401 assert(IN_SET(type, 'b', 'c'));
402
403 f = fopen("/proc/devices", "re");
404 if (!f)
405 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
406
407 FOREACH_LINE(line, f, goto fail) {
408 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
409 unsigned maj;
410
411 truncate_nl(line);
412
413 if (type == 'c' && streq(line, "Character devices:")) {
414 good = true;
415 continue;
416 }
417
418 if (type == 'b' && streq(line, "Block devices:")) {
419 good = true;
420 continue;
421 }
422
423 if (isempty(line)) {
424 good = false;
425 continue;
426 }
427
428 if (!good)
429 continue;
430
431 p = strstrip(line);
432
433 w = strpbrk(p, WHITESPACE);
434 if (!w)
435 continue;
436 *w = 0;
437
438 r = safe_atou(p, &maj);
439 if (r < 0)
440 continue;
441 if (maj <= 0)
442 continue;
443
444 w++;
445 w += strspn(w, WHITESPACE);
446
447 if (fnmatch(name, w, 0) != 0)
448 continue;
449
450 sprintf(buf,
451 "%c %u:* %s",
452 type,
453 maj,
454 acc);
455
456 r = cg_set_attribute("devices", path, "devices.allow", buf);
457 if (r < 0)
458 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
459 "Failed to set devices.allow on %s: %m", path);
460 }
461
462 return 0;
463
464 fail:
465 return log_warning_errno(errno, "Failed to read /proc/devices: %m");
466 }
467
468 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
469 return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
470 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
471 }
472
473 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
474 return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
475 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
476 }
477
478 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
479 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
480 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
481 return c->startup_cpu_weight;
482 else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
483 return c->cpu_weight;
484 else
485 return CGROUP_WEIGHT_DEFAULT;
486 }
487
488 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
489 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
490 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
491 return c->startup_cpu_shares;
492 else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
493 return c->cpu_shares;
494 else
495 return CGROUP_CPU_SHARES_DEFAULT;
496 }
497
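/* Writes the unified-hierarchy CPU attributes: cpu.weight, and cpu.max with the per-second quota scaled
 * to CGROUP_CPU_QUOTA_PERIOD_USEC (or "max" if no quota is configured). */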
498 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
499 char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
500 int r;
501
502 xsprintf(buf, "%" PRIu64 "\n", weight);
503 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
504 if (r < 0)
505 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
506 "Failed to set cpu.weight: %m");
507
508 if (quota != USEC_INFINITY)
509 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
510 quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
511 else
512 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
513
514 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
515
516 if (r < 0)
517 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
518 "Failed to set cpu.max: %m");
519 }
520
521 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
522 char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
523 int r;
524
525 xsprintf(buf, "%" PRIu64 "\n", shares);
526 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
527 if (r < 0)
528 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
529 "Failed to set cpu.shares: %m");
530
531 xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
532 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
533 if (r < 0)
534 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
535 "Failed to set cpu.cfs_period_us: %m");
536
537 if (quota != USEC_INFINITY) {
538 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
539 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
540 } else
541 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
542 if (r < 0)
543 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
544 "Failed to set cpu.cfs_quota_us: %m");
545 }
546
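/* The following two helpers convert between legacy cpu.shares and unified cpu.weight values, scaling
 * linearly so that the respective defaults map onto each other and clamping to the valid range. */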
547 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
548 return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
549 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
550 }
551
552 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
553 return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
554 CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
555 }
556
557 static bool cgroup_context_has_io_config(CGroupContext *c) {
558 return c->io_accounting ||
559 c->io_weight != CGROUP_WEIGHT_INVALID ||
560 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
561 c->io_device_weights ||
562 c->io_device_limits;
563 }
564
565 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
566 return c->blockio_accounting ||
567 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
568 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
569 c->blockio_device_weights ||
570 c->blockio_device_bandwidths;
571 }
572
573 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
574 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
575 c->startup_io_weight != CGROUP_WEIGHT_INVALID)
576 return c->startup_io_weight;
577 else if (c->io_weight != CGROUP_WEIGHT_INVALID)
578 return c->io_weight;
579 else
580 return CGROUP_WEIGHT_DEFAULT;
581 }
582
583 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
584 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
585 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
586 return c->startup_blockio_weight;
587 else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
588 return c->blockio_weight;
589 else
590 return CGROUP_BLKIO_WEIGHT_DEFAULT;
591 }
592
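/* Likewise for the I/O controllers: convert between legacy blkio weights and unified io weights. */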
593 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
594 return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
595 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
596 }
597
598 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
599 return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
600 CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
601 }
602
603 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
604 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
605 dev_t dev;
606 int r;
607
608 r = lookup_block_device(dev_path, &dev);
609 if (r < 0)
610 return;
611
612 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
613 r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
614 if (r < 0)
615 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
616 "Failed to set io.weight: %m");
617 }
618
619 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
620 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
621 dev_t dev;
622 int r;
623
624 r = lookup_block_device(dev_path, &dev);
625 if (r < 0)
626 return;
627
628 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
629 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
630 if (r < 0)
631 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
632 "Failed to set blkio.weight_device: %m");
633 }
634
635 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
636 char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
637 char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
638 CGroupIOLimitType type;
639 dev_t dev;
640 unsigned n = 0;
641 int r;
642
643 r = lookup_block_device(dev_path, &dev);
644 if (r < 0)
645 return 0;
646
647 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
648 if (limits[type] != cgroup_io_limit_defaults[type]) {
649 xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
650 n++;
651 } else {
652 xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
653 }
654 }
655
656 xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
657 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
658 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
659 r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
660 if (r < 0)
661 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
662 "Failed to set io.max: %m");
663 return n;
664 }
665
666 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
667 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
668 dev_t dev;
669 unsigned n = 0;
670 int r;
671
672 r = lookup_block_device(dev_path, &dev);
673 if (r < 0)
674 return 0;
675
676 if (rbps != CGROUP_LIMIT_MAX)
677 n++;
678 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
679 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
680 if (r < 0)
681 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
682 "Failed to set blkio.throttle.read_bps_device: %m");
683
684 if (wbps != CGROUP_LIMIT_MAX)
685 n++;
686 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
687 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
688 if (r < 0)
689 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
690 "Failed to set blkio.throttle.write_bps_device: %m");
691
692 return n;
693 }
694
695 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
696 return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
697 }
698
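/* Writes a single unified-hierarchy memory attribute (memory.low/high/max/swap.max), mapping
 * CGROUP_LIMIT_MAX to the special value "max". */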
699 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
700 char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
701 int r;
702
703 if (v != CGROUP_LIMIT_MAX)
704 xsprintf(buf, "%" PRIu64 "\n", v);
705
706 r = cg_set_attribute("memory", u->cgroup_path, file, buf);
707 if (r < 0)
708 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
709 "Failed to set %s: %m", file);
710 }
711
712 static void cgroup_apply_firewall(Unit *u) {
713 assert(u);
714
715 /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */
716
717 if (bpf_firewall_compile(u) < 0)
718 return;
719
720 (void) bpf_firewall_install(u);
721 }
722
723 static void cgroup_context_apply(
724 Unit *u,
725 CGroupMask apply_mask,
726 bool apply_bpf,
727 ManagerState state) {
728
729 const char *path;
730 CGroupContext *c;
731 bool is_root;
732 int r;
733
734 assert(u);
735
736 /* Nothing to do? Exit early! */
737 if (apply_mask == 0 && !apply_bpf)
738 return;
739
740 /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
741 is_root = unit_has_root_cgroup(u);
742
743 assert_se(c = unit_get_cgroup_context(u));
744 assert_se(path = u->cgroup_path);
745
746 if (is_root) /* Make sure we don't try to display messages with an empty path. */
747 path = "/";
748
749 /* We generally ignore errors caused by read-only mounted
750 * cgroup trees (assuming we are running in a container then),
751 * and missing cgroups, i.e. EROFS and ENOENT. */
752
753 if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
754 bool has_weight, has_shares;
755
756 has_weight = cgroup_context_has_cpu_weight(c);
757 has_shares = cgroup_context_has_cpu_shares(c);
758
759 if (cg_all_unified() > 0) {
760 uint64_t weight;
761
762 if (has_weight)
763 weight = cgroup_context_cpu_weight(c, state);
764 else if (has_shares) {
765 uint64_t shares = cgroup_context_cpu_shares(c, state);
766
767 weight = cgroup_cpu_shares_to_weight(shares);
768
769 log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
770 shares, weight, path);
771 } else
772 weight = CGROUP_WEIGHT_DEFAULT;
773
774 cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
775 } else {
776 uint64_t shares;
777
778 if (has_weight) {
779 uint64_t weight = cgroup_context_cpu_weight(c, state);
780
781 shares = cgroup_cpu_weight_to_shares(weight);
782
783 log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
784 weight, shares, path);
785 } else if (has_shares)
786 shares = cgroup_context_cpu_shares(c, state);
787 else
788 shares = CGROUP_CPU_SHARES_DEFAULT;
789
790 cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
791 }
792 }
793
794 if (apply_mask & CGROUP_MASK_IO) {
795 bool has_io = cgroup_context_has_io_config(c);
796 bool has_blockio = cgroup_context_has_blockio_config(c);
797
798 if (!is_root) {
799 char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
800 uint64_t weight;
801
802 if (has_io)
803 weight = cgroup_context_io_weight(c, state);
804 else if (has_blockio) {
805 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
806
807 weight = cgroup_weight_blkio_to_io(blkio_weight);
808
809 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
810 blkio_weight, weight);
811 } else
812 weight = CGROUP_WEIGHT_DEFAULT;
813
814 xsprintf(buf, "default %" PRIu64 "\n", weight);
815 r = cg_set_attribute("io", path, "io.weight", buf);
816 if (r < 0)
817 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
818 "Failed to set io.weight: %m");
819
820 if (has_io) {
821 CGroupIODeviceWeight *w;
822
823 /* FIXME: no way to reset this list */
824 LIST_FOREACH(device_weights, w, c->io_device_weights)
825 cgroup_apply_io_device_weight(u, w->path, w->weight);
826 } else if (has_blockio) {
827 CGroupBlockIODeviceWeight *w;
828
829 /* FIXME: no way to reset this list */
830 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
831 weight = cgroup_weight_blkio_to_io(w->weight);
832
833 log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
834 w->weight, weight, w->path);
835
836 cgroup_apply_io_device_weight(u, w->path, weight);
837 }
838 }
839 }
840
841 /* Apply limits and free ones without config. */
842 if (has_io) {
843 CGroupIODeviceLimit *l, *next;
844
845 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
846 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
847 cgroup_context_free_io_device_limit(c, l);
848 }
849 } else if (has_blockio) {
850 CGroupBlockIODeviceBandwidth *b, *next;
851
852 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
853 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
854 CGroupIOLimitType type;
855
856 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
857 limits[type] = cgroup_io_limit_defaults[type];
858
859 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
860 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
861
862 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
863 b->rbps, b->wbps, b->path);
864
865 if (!cgroup_apply_io_device_limit(u, b->path, limits))
866 cgroup_context_free_blockio_device_bandwidth(c, b);
867 }
868 }
869 }
870
871 if (apply_mask & CGROUP_MASK_BLKIO) {
872 bool has_io = cgroup_context_has_io_config(c);
873 bool has_blockio = cgroup_context_has_blockio_config(c);
874
875 if (!is_root) {
876 char buf[DECIMAL_STR_MAX(uint64_t)+1];
877 uint64_t weight;
878
879 if (has_io) {
880 uint64_t io_weight = cgroup_context_io_weight(c, state);
881
882 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
883
884 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
885 io_weight, weight);
886 } else if (has_blockio)
887 weight = cgroup_context_blkio_weight(c, state);
888 else
889 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
890
891 xsprintf(buf, "%" PRIu64 "\n", weight);
892 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
893 if (r < 0)
894 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
895 "Failed to set blkio.weight: %m");
896
897 if (has_io) {
898 CGroupIODeviceWeight *w;
899
900 /* FIXME: no way to reset this list */
901 LIST_FOREACH(device_weights, w, c->io_device_weights) {
902 weight = cgroup_weight_io_to_blkio(w->weight);
903
904 log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
905 w->weight, weight, w->path);
906
907 cgroup_apply_blkio_device_weight(u, w->path, weight);
908 }
909 } else if (has_blockio) {
910 CGroupBlockIODeviceWeight *w;
911
912 /* FIXME: no way to reset this list */
913 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
914 cgroup_apply_blkio_device_weight(u, w->path, w->weight);
915 }
916 }
917
918 /* Apply limits and free ones without config. */
919 if (has_io) {
920 CGroupIODeviceLimit *l, *next;
921
922 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
923 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
924 l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
925
926 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
927 cgroup_context_free_io_device_limit(c, l);
928 }
929 } else if (has_blockio) {
930 CGroupBlockIODeviceBandwidth *b, *next;
931
932 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
933 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
934 cgroup_context_free_blockio_device_bandwidth(c, b);
935 }
936 }
937
938 if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
939 if (cg_all_unified() > 0) {
940 uint64_t max, swap_max = CGROUP_LIMIT_MAX;
941
942 if (cgroup_context_has_unified_memory_config(c)) {
943 max = c->memory_max;
944 swap_max = c->memory_swap_max;
945 } else {
946 max = c->memory_limit;
947
948 if (max != CGROUP_LIMIT_MAX)
949 log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
950 }
951
952 cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
953 cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
954 cgroup_apply_unified_memory_limit(u, "memory.max", max);
955 cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
956 } else {
957 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
958 uint64_t val;
959
960 if (cgroup_context_has_unified_memory_config(c)) {
961 val = c->memory_max;
962 log_cgroup_compat(u, "Applying MemoryMax %" PRIu64 " as MemoryLimit", val);
963 } else
964 val = c->memory_limit;
965
966 if (val == CGROUP_LIMIT_MAX)
967 strncpy(buf, "-1\n", sizeof(buf));
968 else
969 xsprintf(buf, "%" PRIu64 "\n", val);
970
971 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
972 if (r < 0)
973 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
974 "Failed to set memory.limit_in_bytes: %m");
975 }
976 }
977
978 if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
979 CGroupDeviceAllow *a;
980
981 /* Changing the devices list of a populated cgroup
982 * might result in EINVAL, hence ignore EINVAL
983 * here. */
984
985 if (c->device_allow || c->device_policy != CGROUP_AUTO)
986 r = cg_set_attribute("devices", path, "devices.deny", "a");
987 else
988 r = cg_set_attribute("devices", path, "devices.allow", "a");
989 if (r < 0)
990 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
991 "Failed to reset devices.list: %m");
992
993 if (c->device_policy == CGROUP_CLOSED ||
994 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
995 static const char auto_devices[] =
996 "/dev/null\0" "rwm\0"
997 "/dev/zero\0" "rwm\0"
998 "/dev/full\0" "rwm\0"
999 "/dev/random\0" "rwm\0"
1000 "/dev/urandom\0" "rwm\0"
1001 "/dev/tty\0" "rwm\0"
1002 "/dev/ptmx\0" "rwm\0"
1003 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
1004 "-/run/systemd/inaccessible/chr\0" "rwm\0"
1005 "-/run/systemd/inaccessible/blk\0" "rwm\0";
1006
1007 const char *x, *y;
1008
1009 NULSTR_FOREACH_PAIR(x, y, auto_devices)
1010 whitelist_device(path, x, y);
1011
1012 /* PTS (/dev/pts) devices may not be duplicated, but accessed */
1013 whitelist_major(path, "pts", 'c', "rw");
1014 }
1015
1016 LIST_FOREACH(device_allow, a, c->device_allow) {
1017 char acc[4], *val;
1018 unsigned k = 0;
1019
1020 if (a->r)
1021 acc[k++] = 'r';
1022 if (a->w)
1023 acc[k++] = 'w';
1024 if (a->m)
1025 acc[k++] = 'm';
1026
1027 if (k == 0)
1028 continue;
1029
1030 acc[k++] = 0;
1031
1032 if (path_startswith(a->path, "/dev/"))
1033 whitelist_device(path, a->path, acc);
1034 else if ((val = startswith(a->path, "block-")))
1035 whitelist_major(path, val, 'b', acc);
1036 else if ((val = startswith(a->path, "char-")))
1037 whitelist_major(path, val, 'c', acc);
1038 else
1039 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
1040 }
1041 }
1042
1043 if (apply_mask & CGROUP_MASK_PIDS) {
1044
1045 if (is_root) {
1046 /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
1047 * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
1048 * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
1049 * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
1050 * exclusive ownership of the sysctls, but we still want to honour things if the user sets
1051 * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
1052 * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
1053 * it also counts. But if the user never set a limit through us (i.e. we are the default of
1054 * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
1055 * the first time we set a limit. Note that this boolean is flushed out on manager reload,
1056 * which is desirable so that there's an official way to release control of the sysctl from
1057 * systemd: set the limit to unbounded and reload. */
1058
1059 if (c->tasks_max != CGROUP_LIMIT_MAX) {
1060 u->manager->sysctl_pid_max_changed = true;
1061 r = procfs_tasks_set_limit(c->tasks_max);
1062 } else if (u->manager->sysctl_pid_max_changed)
1063 r = procfs_tasks_set_limit(TASKS_MAX);
1064 else
1065 r = 0;
1066
1067 if (r < 0)
1068 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1069 "Failed to write to tasks limit sysctls: %m");
1070
1071 } else {
1072 if (c->tasks_max != CGROUP_LIMIT_MAX) {
1073 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1074
1075 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1076 r = cg_set_attribute("pids", path, "pids.max", buf);
1077 } else
1078 r = cg_set_attribute("pids", path, "pids.max", "max");
1079 if (r < 0)
1080 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1081 "Failed to set pids.max: %m");
1082 }
1083 }
1084
1085 if (apply_bpf)
1086 cgroup_apply_firewall(u);
1087 }
1088
1089 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1090 CGroupMask mask = 0;
1091
1092 /* Figure out which controllers we need */
1093
1094 if (c->cpu_accounting ||
1095 cgroup_context_has_cpu_weight(c) ||
1096 cgroup_context_has_cpu_shares(c) ||
1097 c->cpu_quota_per_sec_usec != USEC_INFINITY)
1098 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
1099
1100 if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1101 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1102
1103 if (c->memory_accounting ||
1104 c->memory_limit != CGROUP_LIMIT_MAX ||
1105 cgroup_context_has_unified_memory_config(c))
1106 mask |= CGROUP_MASK_MEMORY;
1107
1108 if (c->device_allow ||
1109 c->device_policy != CGROUP_AUTO)
1110 mask |= CGROUP_MASK_DEVICES;
1111
1112 if (c->tasks_accounting ||
1113 c->tasks_max != CGROUP_LIMIT_MAX)
1114 mask |= CGROUP_MASK_PIDS;
1115
1116 return mask;
1117 }
1118
1119 CGroupMask unit_get_own_mask(Unit *u) {
1120 CGroupContext *c;
1121
1122 /* Returns the mask of controllers the unit needs for itself */
1123
1124 c = unit_get_cgroup_context(u);
1125 if (!c)
1126 return 0;
1127
1128 return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
1129 }
1130
1131 CGroupMask unit_get_delegate_mask(Unit *u) {
1132 CGroupContext *c;
1133
1134 /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1135 * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1136 *
1137 * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1138
1139 if (!unit_cgroup_delegate(u))
1140 return 0;
1141
1142 if (cg_all_unified() <= 0) {
1143 ExecContext *e;
1144
1145 e = unit_get_exec_context(u);
1146 if (e && !exec_context_maintains_privileges(e))
1147 return 0;
1148 }
1149
1150 assert_se(c = unit_get_cgroup_context(u));
1151 return c->delegate_controllers;
1152 }
1153
1154 CGroupMask unit_get_members_mask(Unit *u) {
1155 assert(u);
1156
1157 /* Returns the mask of controllers all of the unit's children require, merged */
1158
1159 if (u->cgroup_members_mask_valid)
1160 return u->cgroup_members_mask;
1161
1162 u->cgroup_members_mask = 0;
1163
1164 if (u->type == UNIT_SLICE) {
1165 void *v;
1166 Unit *member;
1167 Iterator i;
1168
1169 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1170
1171 if (member == u)
1172 continue;
1173
1174 if (UNIT_DEREF(member->slice) != u)
1175 continue;
1176
1177 u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1178 }
1179 }
1180
1181 u->cgroup_members_mask_valid = true;
1182 return u->cgroup_members_mask;
1183 }
1184
1185 CGroupMask unit_get_siblings_mask(Unit *u) {
1186 assert(u);
1187
1188 /* Returns the mask of controllers all of the unit's siblings
1189 * require, i.e. the members mask of the unit's parent slice
1190 * if there is one. */
1191
1192 if (UNIT_ISSET(u->slice))
1193 return unit_get_members_mask(UNIT_DEREF(u->slice));
1194
1195 return unit_get_subtree_mask(u); /* we are the top-level slice */
1196 }
1197
1198 CGroupMask unit_get_subtree_mask(Unit *u) {
1199
1200 /* Returns the mask of this subtree, meaning of the group
1201 * itself and its children. */
1202
1203 return unit_get_own_mask(u) | unit_get_members_mask(u);
1204 }
1205
1206 CGroupMask unit_get_target_mask(Unit *u) {
1207 CGroupMask mask;
1208
1209 /* This returns the cgroup mask of all controllers to enable
1210 * for a specific cgroup, i.e. everything it needs itself,
1211 * plus all that its children need, plus all that its siblings
1212 * need. This is primarily useful on the legacy cgroup
1213 * hierarchy, where we need to duplicate each cgroup in each
1214 * hierarchy that shall be enabled for it. */
1215
1216 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1217 mask &= u->manager->cgroup_supported;
1218
1219 return mask;
1220 }
1221
1222 CGroupMask unit_get_enable_mask(Unit *u) {
1223 CGroupMask mask;
1224
1225 /* This returns the cgroup mask of all controllers to enable
1226 * for the children of a specific cgroup. This is primarily
1227 * useful for the unified cgroup hierarchy, where each cgroup
1228 * controls which controllers are enabled for its children. */
1229
1230 mask = unit_get_members_mask(u);
1231 mask &= u->manager->cgroup_supported;
1232
1233 return mask;
1234 }
1235
1236 bool unit_get_needs_bpf(Unit *u) {
1237 CGroupContext *c;
1238 Unit *p;
1239 assert(u);
1240
1241 c = unit_get_cgroup_context(u);
1242 if (!c)
1243 return false;
1244
1245 if (c->ip_accounting ||
1246 c->ip_address_allow ||
1247 c->ip_address_deny)
1248 return true;
1249
1250 /* If any parent slice has an IP access list defined, it applies too */
1251 for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1252 c = unit_get_cgroup_context(p);
1253 if (!c)
1254 return false;
1255
1256 if (c->ip_address_allow ||
1257 c->ip_address_deny)
1258 return true;
1259 }
1260
1261 return false;
1262 }
1263
1264 /* Recurse from a unit up through its containing slices, propagating
1265 * mask bits upward. A unit is also a member of itself. */
1266 void unit_update_cgroup_members_masks(Unit *u) {
1267 CGroupMask m;
1268 bool more;
1269
1270 assert(u);
1271
1272 /* Calculate subtree mask */
1273 m = unit_get_subtree_mask(u);
1274
1275 /* See if anything changed from the previous invocation. If
1276 * not, we're done. */
1277 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
1278 return;
1279
1280 more =
1281 u->cgroup_subtree_mask_valid &&
1282 ((m & ~u->cgroup_subtree_mask) != 0) &&
1283 ((~m & u->cgroup_subtree_mask) == 0);
1284
1285 u->cgroup_subtree_mask = m;
1286 u->cgroup_subtree_mask_valid = true;
1287
1288 if (UNIT_ISSET(u->slice)) {
1289 Unit *s = UNIT_DEREF(u->slice);
1290
1291 if (more)
1292 /* There's more set now than before. We
1293 * propagate the new mask to the parent's mask
1294 * (not caring if it actually was valid or
1295 * not). */
1296
1297 s->cgroup_members_mask |= m;
1298
1299 else
1300 /* There's less set now than before (or we
1301 * don't know), we need to recalculate
1302 * everything, so let's invalidate the
1303 * parent's members mask */
1304
1305 s->cgroup_members_mask_valid = false;
1306
1307 /* And now make sure that this change also hits our
1308 * grandparents */
1309 unit_update_cgroup_members_masks(s);
1310 }
1311 }
1312
1313 const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
1314
1315 /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */
1316
1317 while (u) {
1318
1319 if (u->cgroup_path &&
1320 u->cgroup_realized &&
1321 (u->cgroup_realized_mask & mask) == mask)
1322 return u->cgroup_path;
1323
1324 u = UNIT_DEREF(u->slice);
1325 }
1326
1327 return NULL;
1328 }
1329
1330 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1331 return unit_get_realized_cgroup_path(userdata, mask);
1332 }
1333
1334 char *unit_default_cgroup_path(Unit *u) {
1335 _cleanup_free_ char *escaped = NULL, *slice = NULL;
1336 int r;
1337
1338 assert(u);
1339
1340 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1341 return strdup(u->manager->cgroup_root);
1342
1343 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1344 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1345 if (r < 0)
1346 return NULL;
1347 }
1348
1349 escaped = cg_escape(u->id);
1350 if (!escaped)
1351 return NULL;
1352
1353 if (slice)
1354 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1355 escaped);
1356 else
1357 return strjoin(u->manager->cgroup_root, "/", escaped);
1358 }
1359
1360 int unit_set_cgroup_path(Unit *u, const char *path) {
1361 _cleanup_free_ char *p = NULL;
1362 int r;
1363
1364 assert(u);
1365
1366 if (path) {
1367 p = strdup(path);
1368 if (!p)
1369 return -ENOMEM;
1370 } else
1371 p = NULL;
1372
1373 if (streq_ptr(u->cgroup_path, p))
1374 return 0;
1375
1376 if (p) {
1377 r = hashmap_put(u->manager->cgroup_unit, p, u);
1378 if (r < 0)
1379 return r;
1380 }
1381
1382 unit_release_cgroup(u);
1383
1384 u->cgroup_path = TAKE_PTR(p);
1385
1386 return 1;
1387 }
1388
1389 int unit_watch_cgroup(Unit *u) {
1390 _cleanup_free_ char *events = NULL;
1391 int r;
1392
1393 assert(u);
1394
1395 if (!u->cgroup_path)
1396 return 0;
1397
1398 if (u->cgroup_inotify_wd >= 0)
1399 return 0;
1400
1401 /* Only applies to the unified hierarchy */
1402 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1403 if (r < 0)
1404 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1405 if (r == 0)
1406 return 0;
1407
1408 /* Don't watch the root slice, it's pointless. */
1409 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1410 return 0;
1411
1412 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1413 if (r < 0)
1414 return log_oom();
1415
1416 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1417 if (r < 0)
1418 return log_oom();
1419
1420 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1421 if (u->cgroup_inotify_wd < 0) {
1422
1423 if (errno == ENOENT) /* If the directory is already
1424 * gone we don't need to track
1425 * it, so this is not an error */
1426 return 0;
1427
1428 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1429 }
1430
1431 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1432 if (r < 0)
1433 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1434
1435 return 0;
1436 }
1437
1438 int unit_pick_cgroup_path(Unit *u) {
1439 _cleanup_free_ char *path = NULL;
1440 int r;
1441
1442 assert(u);
1443
1444 if (u->cgroup_path)
1445 return 0;
1446
1447 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1448 return -EINVAL;
1449
1450 path = unit_default_cgroup_path(u);
1451 if (!path)
1452 return log_oom();
1453
1454 r = unit_set_cgroup_path(u, path);
1455 if (r == -EEXIST)
1456 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1457 if (r < 0)
1458 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1459
1460 return 0;
1461 }
1462
1463 static int unit_create_cgroup(
1464 Unit *u,
1465 CGroupMask target_mask,
1466 CGroupMask enable_mask,
1467 bool needs_bpf) {
1468
1469 CGroupContext *c;
1470 int r;
1471
1472 assert(u);
1473
1474 c = unit_get_cgroup_context(u);
1475 if (!c)
1476 return 0;
1477
1478 /* Figure out our cgroup path */
1479 r = unit_pick_cgroup_path(u);
1480 if (r < 0)
1481 return r;
1482
1483 /* First, create our own group */
1484 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1485 if (r < 0)
1486 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1487
1488 /* Start watching it */
1489 (void) unit_watch_cgroup(u);
1490
1491 /* Enable all controllers we need */
1492 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1493 if (r < 0)
1494 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1495
1496 /* Keep track that this is now realized */
1497 u->cgroup_realized = true;
1498 u->cgroup_realized_mask = target_mask;
1499 u->cgroup_enabled_mask = enable_mask;
1500 u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1501
1502 if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) {
1503
1504 /* Then, possibly move things over, but not if
1505 * subgroups may contain processes, which is the case
1506 * for slice and delegation units. */
1507 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1508 if (r < 0)
1509 log_unit_warning_errno(u, r, "Failed to migrate cgroup processes to %s, ignoring: %m", u->cgroup_path);
1510 }
1511
1512 return 0;
1513 }
1514
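/* Used by user managers that cannot move a process themselves: asks the system instance via the
 * AttachProcessesToUnit() bus call to attach the PID to the specified subcgroup of this unit. */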
1515 static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
1516 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1517 char *pp;
1518 int r;
1519
1520 assert(u);
1521
1522 if (MANAGER_IS_SYSTEM(u->manager))
1523 return -EINVAL;
1524
1525 if (!u->manager->system_bus)
1526 return -EIO;
1527
1528 if (!u->cgroup_path)
1529 return -EINVAL;
1530
1531 /* Determine this unit's cgroup path relative to our cgroup root */
1532 pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
1533 if (!pp)
1534 return -EINVAL;
1535
1536 pp = strjoina("/", pp, suffix_path);
1537 path_kill_slashes(pp);
1538
1539 r = sd_bus_call_method(u->manager->system_bus,
1540 "org.freedesktop.systemd1",
1541 "/org/freedesktop/systemd1",
1542 "org.freedesktop.systemd1.Manager",
1543 "AttachProcessesToUnit",
1544 &error, NULL,
1545 "ssau",
1546 NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
1547 if (r < 0)
1548 return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
1549
1550 return 0;
1551 }
1552
1553 int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
1554 CGroupMask delegated_mask;
1555 const char *p;
1556 Iterator i;
1557 void *pidp;
1558 int r, q;
1559
1560 assert(u);
1561
1562 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1563 return -EINVAL;
1564
1565 if (set_isempty(pids))
1566 return 0;
1567
1568 r = unit_realize_cgroup(u);
1569 if (r < 0)
1570 return r;
1571
1572 if (isempty(suffix_path))
1573 p = u->cgroup_path;
1574 else
1575 p = strjoina(u->cgroup_path, "/", suffix_path);
1576
1577 delegated_mask = unit_get_delegate_mask(u);
1578
1579 r = 0;
1580 SET_FOREACH(pidp, pids, i) {
1581 pid_t pid = PTR_TO_PID(pidp);
1582 CGroupController c;
1583
1584 /* First, attach the PID to the main cgroup hierarchy */
1585 q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
1586 if (q < 0) {
1587 log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p);
1588
1589 if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) {
1590 int z;
1591
1592 /* If we are in a user instance, and we can't move the process ourselves due to
1593 * permission problems, let's ask the system instance about it instead. Since it's more
1594 * privileged it might be able to move the process across the leaves of a subtree whose
1595 * top node is not owned by us. */
1596
1597 z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
1598 if (z < 0)
1599 log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p);
1600 else
1601 continue; /* When the bus thing worked via the bus we are fully done for this PID. */
1602 }
1603
1604 if (r >= 0)
1605 r = q; /* Remember first error */
1606
1607 continue;
1608 }
1609
1610 q = cg_all_unified();
1611 if (q < 0)
1612 return q;
1613 if (q > 0)
1614 continue;
1615
1616 /* In the legacy hierarchy, attach the process to the requested cgroup if possible, and if not to the
1617 * innermost realized one */
1618
1619 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1620 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
1621 const char *realized;
1622
1623 if (!(u->manager->cgroup_supported & bit))
1624 continue;
1625
1626 /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
1627 if (delegated_mask & u->cgroup_realized_mask & bit) {
1628 q = cg_attach(cgroup_controller_to_string(c), p, pid);
1629 if (q >= 0)
1630 continue; /* Success! */
1631
1632 log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
1633 pid, p, cgroup_controller_to_string(c));
1634 }
1635
1636 /* So this controller is either not delegated or not realized, or something else weird happened. In
1637 * that case let's attach the PID at least to the closest cgroup up the tree that is
1638 * realized. */
1639 realized = unit_get_realized_cgroup_path(u, bit);
1640 if (!realized)
1641 continue; /* Not even realized in the root slice? Then let's not bother */
1642
1643 q = cg_attach(cgroup_controller_to_string(c), realized, pid);
1644 if (q < 0)
1645 log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
1646 pid, realized, cgroup_controller_to_string(c));
1647 }
1648 }
1649
1650 return r;
1651 }
1652
1653 static void cgroup_xattr_apply(Unit *u) {
1654 char ids[SD_ID128_STRING_MAX];
1655 int r;
1656
1657 assert(u);
1658
1659 if (!MANAGER_IS_SYSTEM(u->manager))
1660 return;
1661
1662 if (sd_id128_is_null(u->invocation_id))
1663 return;
1664
1665 r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1666 "trusted.invocation_id",
1667 sd_id128_to_string(u->invocation_id, ids), 32,
1668 0);
1669 if (r < 0)
1670 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
1671 }
1672
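/* Returns true if the unit's cgroup is already realized with exactly the requested controller masks and
 * BPF state, i.e. there is nothing to (re)apply. */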
1673 static bool unit_has_mask_realized(
1674 Unit *u,
1675 CGroupMask target_mask,
1676 CGroupMask enable_mask,
1677 bool needs_bpf) {
1678
1679 assert(u);
1680
1681 return u->cgroup_realized &&
1682 u->cgroup_realized_mask == target_mask &&
1683 u->cgroup_enabled_mask == enable_mask &&
1684 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1685 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
1686 }
1687
1688 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1689 assert(u);
1690
1691 if (u->in_cgroup_realize_queue)
1692 return;
1693
1694 LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1695 u->in_cgroup_realize_queue = true;
1696 }
1697
1698 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1699 assert(u);
1700
1701 if (!u->in_cgroup_realize_queue)
1702 return;
1703
1704 LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1705 u->in_cgroup_realize_queue = false;
1706 }
1707
1708
1709 /* Check if necessary controllers and attributes for a unit are in place.
1710 *
1711 * If so, do nothing.
1712 * If not, create paths, move processes over, and set attributes.
1713 *
1714 * Returns 0 on success and < 0 on failure. */
1715 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1716 CGroupMask target_mask, enable_mask;
1717 bool needs_bpf, apply_bpf;
1718 int r;
1719
1720 assert(u);
1721
1722 unit_remove_from_cgroup_realize_queue(u);
1723
1724 target_mask = unit_get_target_mask(u);
1725 enable_mask = unit_get_enable_mask(u);
1726 needs_bpf = unit_get_needs_bpf(u);
1727
1728 if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1729 return 0;
1730
1731 /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1732 * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1733 * this will trickle down properly to cgroupfs. */
1734 apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1735
1736 /* First, realize parents */
1737 if (UNIT_ISSET(u->slice)) {
1738 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1739 if (r < 0)
1740 return r;
1741 }
1742
1743 /* And then do the real work */
1744 r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1745 if (r < 0)
1746 return r;
1747
1748 /* Finally, apply the necessary attributes. */
1749 cgroup_context_apply(u, target_mask, apply_bpf, state);
1750 cgroup_xattr_apply(u);
1751
1752 return 0;
1753 }
1754
1755 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1756 ManagerState state;
1757 unsigned n = 0;
1758 Unit *i;
1759 int r;
1760
1761 assert(m);
1762
1763 state = manager_state(m);
1764
1765 while ((i = m->cgroup_realize_queue)) {
1766 assert(i->in_cgroup_realize_queue);
1767
1768 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1769 /* Maybe things changed, and the unit is not actually active anymore? */
1770 unit_remove_from_cgroup_realize_queue(i);
1771 continue;
1772 }
1773
1774 r = unit_realize_cgroup_now(i, state);
1775 if (r < 0)
1776 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1777
1778 n++;
1779 }
1780
1781 return n;
1782 }
1783
1784 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1785 Unit *slice;
1786
1787 /* This adds the siblings of the specified unit and the
1788 * siblings of all parent units to the cgroup queue. (But
1789 * neither the specified unit itself nor the parents.) */
1790
1791 while ((slice = UNIT_DEREF(u->slice))) {
1792 Iterator i;
1793 Unit *m;
1794 void *v;
1795
1796 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1797 if (m == u)
1798 continue;
1799
1800 /* Skip units that have a dependency on the slice
1801 * but aren't actually in it. */
1802 if (UNIT_DEREF(m->slice) != slice)
1803 continue;
1804
1805 /* No point in doing cgroup application for units
1806 * without active processes. */
1807 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1808 continue;
1809
1810 /* If the unit doesn't need any new controllers
1811 * and has current ones realized, it doesn't need
1812 * any changes. */
1813 if (unit_has_mask_realized(m,
1814 unit_get_target_mask(m),
1815 unit_get_enable_mask(m),
1816 unit_get_needs_bpf(m)))
1817 continue;
1818
1819 unit_add_to_cgroup_realize_queue(m);
1820 }
1821
1822 u = slice;
1823 }
1824 }
1825
1826 int unit_realize_cgroup(Unit *u) {
1827 assert(u);
1828
1829 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1830 return 0;
1831
1832         /* So, here's the deal: when realizing the cgroups for this
1833          * unit, we first need to create all parents, but there's
1834          * actually more to it: for the weight-based controllers we
1835          * also need to make sure that all our siblings (i.e. units
1836          * that are in the same slice as we are) have cgroups, too.
1837          * Otherwise, things would become very uneven as each of their
1838          * processes would get as many resources as our whole group
1839          * together. This call will synchronously create the parent
1840          * cgroups, but will defer work on the siblings to the next
1841          * event loop iteration. */
1842
1843 /* Add all sibling slices to the cgroup queue. */
1844 unit_add_siblings_to_cgroup_realize_queue(u);
1845
1846 /* And realize this one now (and apply the values) */
1847 return unit_realize_cgroup_now(u, manager_state(u->manager));
1848 }
1849
1850 void unit_release_cgroup(Unit *u) {
1851 assert(u);
1852
1853         /* Forgets all cgroup details for this unit */
1854
1855 if (u->cgroup_path) {
1856 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1857 u->cgroup_path = mfree(u->cgroup_path);
1858 }
1859
1860 if (u->cgroup_inotify_wd >= 0) {
1861 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1862                         log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring: %m", u->cgroup_inotify_wd, u->id);
1863
1864 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1865 u->cgroup_inotify_wd = -1;
1866 }
1867 }
1868
1869 void unit_prune_cgroup(Unit *u) {
1870 int r;
1871 bool is_root_slice;
1872
1873 assert(u);
1874
1875 /* Removes the cgroup, if empty and possible, and stops watching it. */
1876
1877 if (!u->cgroup_path)
1878 return;
1879
1880 (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1881
1882 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1883
1884 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1885 if (r < 0) {
1886 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1887 return;
1888 }
1889
1890 if (is_root_slice)
1891 return;
1892
1893 unit_release_cgroup(u);
1894
1895 u->cgroup_realized = false;
1896 u->cgroup_realized_mask = 0;
1897 u->cgroup_enabled_mask = 0;
1898 }
1899
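/* Tries to guess the unit's main PID by scanning its cgroup: only processes that (still) have the
 * manager as their parent are considered; fails with -ENODATA if more than one candidate is found. */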
1900 int unit_search_main_pid(Unit *u, pid_t *ret) {
1901 _cleanup_fclose_ FILE *f = NULL;
1902 pid_t pid = 0, npid, mypid;
1903 int r;
1904
1905 assert(u);
1906 assert(ret);
1907
1908 if (!u->cgroup_path)
1909 return -ENXIO;
1910
1911 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1912 if (r < 0)
1913 return r;
1914
1915 mypid = getpid_cached();
1916 while (cg_read_pid(f, &npid) > 0) {
1917 pid_t ppid;
1918
1919 if (npid == pid)
1920 continue;
1921
1922 /* Ignore processes that aren't our kids */
1923 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1924 continue;
1925
1926 if (pid != 0)
1927                         /* Dang, there's more than one daemonized PID
1928                          * in this group, so we don't know what
1929                          * process is the main process. */
1930                         return -ENODATA;
1931
1932
1933 pid = npid;
1934 }
1935
1936 *ret = pid;
1937 return 0;
1938 }
1939
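/* Recursively adds every PID found in the given cgroup path and all of its sub-cgroups to the set of
 * PIDs watched for this unit. The first error encountered is returned, but enumeration continues. */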
1940 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1941 _cleanup_closedir_ DIR *d = NULL;
1942 _cleanup_fclose_ FILE *f = NULL;
1943 int ret = 0, r;
1944
1945 assert(u);
1946 assert(path);
1947
1948 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1949 if (r < 0)
1950 ret = r;
1951 else {
1952 pid_t pid;
1953
1954 while ((r = cg_read_pid(f, &pid)) > 0) {
1955 r = unit_watch_pid(u, pid);
1956 if (r < 0 && ret >= 0)
1957 ret = r;
1958 }
1959
1960 if (r < 0 && ret >= 0)
1961 ret = r;
1962 }
1963
1964 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1965 if (r < 0) {
1966 if (ret >= 0)
1967 ret = r;
1968 } else {
1969 char *fn;
1970
1971 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1972 _cleanup_free_ char *p = NULL;
1973
1974 p = strjoin(path, "/", fn);
1975 free(fn);
1976
1977 if (!p)
1978 return -ENOMEM;
1979
1980 r = unit_watch_pids_in_path(u, p);
1981 if (r < 0 && ret >= 0)
1982 ret = r;
1983 }
1984
1985 if (r < 0 && ret >= 0)
1986 ret = r;
1987 }
1988
1989 return ret;
1990 }
1991
1992 int unit_synthesize_cgroup_empty_event(Unit *u) {
1993 int r;
1994
1995 assert(u);
1996
1997 /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
1998          * support for non-unified systems where notifications aren't reliable, and hence we need to take whatever we
1999          * can get as a notification source as soon as we stop having any useful PIDs to watch for. */
2000
2001 if (!u->cgroup_path)
2002 return -ENOENT;
2003
2004 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2005 if (r < 0)
2006 return r;
2007 if (r > 0) /* On unified we have reliable notifications, and don't need this */
2008 return 0;
2009
2010 if (!set_isempty(u->pids))
2011 return 0;
2012
2013 unit_add_to_cgroup_empty_queue(u);
2014 return 0;
2015 }
2016
2017 int unit_watch_all_pids(Unit *u) {
2018 int r;
2019
2020 assert(u);
2021
2022 /* Adds all PIDs from our cgroup to the set of PIDs we
2023          * watch. This is fallback logic for cases where we do not
2024          * get reliable cgroup empty notifications: we try to use
2025          * SIGCHLD as a replacement. */
2026
2027 if (!u->cgroup_path)
2028 return -ENOENT;
2029
2030 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2031 if (r < 0)
2032 return r;
2033 if (r > 0) /* On unified we can use proper notifications */
2034 return 0;
2035
2036 return unit_watch_pids_in_path(u, u->cgroup_path);
2037 }
2038
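/* Defer event source callback: handles one unit from the cgroup empty queue per dispatch, re-arming
 * itself (SD_EVENT_ONESHOT) if further units remain queued. */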
2039 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
2040 Manager *m = userdata;
2041 Unit *u;
2042 int r;
2043
2044 assert(s);
2045 assert(m);
2046
2047 u = m->cgroup_empty_queue;
2048 if (!u)
2049 return 0;
2050
2051 assert(u->in_cgroup_empty_queue);
2052 u->in_cgroup_empty_queue = false;
2053 LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
2054
2055 if (m->cgroup_empty_queue) {
2056 /* More stuff queued, let's make sure we remain enabled */
2057 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
2058 if (r < 0)
2059 log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
2060 }
2061
2062 unit_add_to_gc_queue(u);
2063
2064 if (UNIT_VTABLE(u)->notify_cgroup_empty)
2065 UNIT_VTABLE(u)->notify_cgroup_empty(u);
2066
2067 return 0;
2068 }
2069
2070 void unit_add_to_cgroup_empty_queue(Unit *u) {
2071 int r;
2072
2073 assert(u);
2074
2075         /* Note that there are four different ways in which cgroup empty events reach us:
2076 *
2077 * 1. On the unified hierarchy we get an inotify event on the cgroup
2078 *
2079 * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
2080 *
2081 * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
2082 *
2083 * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
2084 * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
2085 *
2086          * Regardless of which way we got the notification, we'll verify it here, and then add it to a separate
2087 * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
2088 * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
2089          * (which might happen if the cgroup doesn't contain processes that are our own children, which is typically the
2090 * case for scope units). */
2091
2092 if (u->in_cgroup_empty_queue)
2093 return;
2094
2095 /* Let's verify that the cgroup is really empty */
2096 if (!u->cgroup_path)
2097 return;
2098 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
2099 if (r < 0) {
2100 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
2101 return;
2102 }
2103 if (r == 0)
2104 return;
2105
2106 LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
2107 u->in_cgroup_empty_queue = true;
2108
2109 /* Trigger the defer event */
2110 r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
2111 if (r < 0)
2112 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
2113 }
2114
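/* IO callback for the unified hierarchy: drains the inotify fd, maps each watch descriptor back to
 * its unit and queues a (verified) cgroup empty check for it. */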
2115 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
2116 Manager *m = userdata;
2117
2118 assert(s);
2119 assert(fd >= 0);
2120 assert(m);
2121
2122 for (;;) {
2123 union inotify_event_buffer buffer;
2124 struct inotify_event *e;
2125 ssize_t l;
2126
2127 l = read(fd, &buffer, sizeof(buffer));
2128 if (l < 0) {
2129 if (IN_SET(errno, EINTR, EAGAIN))
2130 return 0;
2131
2132 return log_error_errno(errno, "Failed to read control group inotify events: %m");
2133 }
2134
2135 FOREACH_INOTIFY_EVENT(e, buffer, l) {
2136 Unit *u;
2137
2138 if (e->wd < 0)
2139 /* Queue overflow has no watch descriptor */
2140 continue;
2141
2142 if (e->mask & IN_IGNORED)
2143 /* The watch was just removed */
2144 continue;
2145
2146 u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
2147                         if (!u) /* Note that inotify might deliver
2148 * events for a watch even after it
2149 * was removed, because it was queued
2150 * before the removal. Let's ignore
2151 * this here safely. */
2152 continue;
2153
2154 unit_add_to_cgroup_empty_queue(u);
2155 }
2156 }
2157 }
2158
2159 int manager_setup_cgroup(Manager *m) {
2160 _cleanup_free_ char *path = NULL;
2161 const char *scope_path;
2162 CGroupController c;
2163 int r, all_unified;
2164 char *e;
2165
2166 assert(m);
2167
2168 /* 1. Determine hierarchy */
2169 m->cgroup_root = mfree(m->cgroup_root);
2170 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2171 if (r < 0)
2172 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2173
2174 /* Chop off the init scope, if we are already located in it */
2175 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2176
2177 /* LEGACY: Also chop off the system slice if we are in
2178 * it. This is to support live upgrades from older systemd
2179 * versions where PID 1 was moved there. Also see
2180 * cg_get_root_path(). */
2181 if (!e && MANAGER_IS_SYSTEM(m)) {
2182 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2183 if (!e)
2184 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2185 }
2186 if (e)
2187 *e = 0;
2188
2189 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2190 * easily prepend it everywhere. */
2191 delete_trailing_chars(m->cgroup_root, "/");
2192
2193 /* 2. Show data */
2194 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2195 if (r < 0)
2196 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2197
2198 r = cg_unified_flush();
2199 if (r < 0)
2200 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2201
2202 all_unified = cg_all_unified();
2203 if (all_unified < 0)
2204 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2205 if (all_unified > 0)
2206 log_debug("Unified cgroup hierarchy is located at %s.", path);
2207 else {
2208 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2209 if (r < 0)
2210 return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2211 if (r > 0)
2212 log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2213 else
2214 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2215 }
2216
2217 /* 3. Allocate cgroup empty defer event source */
2218 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2219 r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2220 if (r < 0)
2221 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2222
2223 r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2224 if (r < 0)
2225 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2226
2227 r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2228 if (r < 0)
2229 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2230
2231 (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2232
2233 /* 4. Install notifier inotify object, or agent */
2234 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2235
2236 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2237
2238 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2239 safe_close(m->cgroup_inotify_fd);
2240
2241 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2242 if (m->cgroup_inotify_fd < 0)
2243 return log_error_errno(errno, "Failed to create control group inotify object: %m");
2244
2245 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2246 if (r < 0)
2247 return log_error_errno(r, "Failed to watch control group inotify object: %m");
2248
2249 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2250 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2251 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2252 if (r < 0)
2253 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2254
2255 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2256
2257 } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2258
2259 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2260                  * since it does not generate events when control groups with children run empty.) */
2261
2262 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2263 if (r < 0)
2264 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2265 else if (r > 0)
2266 log_debug("Installed release agent.");
2267 else if (r == 0)
2268 log_debug("Release agent already installed.");
2269 }
2270
2271 /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2272 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2273 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2274 if (r >= 0) {
2275 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2276 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2277 if (r < 0)
2278 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2279
2280 /* 6. And pin it, so that it cannot be unmounted */
2281 safe_close(m->pin_cgroupfs_fd);
2282 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2283 if (m->pin_cgroupfs_fd < 0)
2284 return log_error_errno(errno, "Failed to open pin file: %m");
2285
2286 } else if (r < 0 && !m->test_run_flags)
2287 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2288
2289 /* 7. Always enable hierarchical support if it exists... */
2290 if (!all_unified && m->test_run_flags == 0)
2291 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2292
2293 /* 8. Figure out which controllers are supported, and log about it */
2294 r = cg_mask_supported(&m->cgroup_supported);
2295 if (r < 0)
2296 return log_error_errno(r, "Failed to determine supported controllers: %m");
2297 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2298 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
2299
2300 return 0;
2301 }
2302
2303 void manager_shutdown_cgroup(Manager *m, bool delete) {
2304 assert(m);
2305
2306 /* We can't really delete the group, since we are in it. But
2307 * let's trim it. */
2308 if (delete && m->cgroup_root)
2309 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2310
2311 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2312
2313 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2314
2315 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2316 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2317
2318 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2319
2320 m->cgroup_root = mfree(m->cgroup_root);
2321 }
2322
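/* Maps a cgroup path to the innermost unit that owns it, chopping off trailing path components until
 * a match is found. For example, a (hypothetical) lookup of "/foo.slice/bar.service/baz" would try the
 * full path first, then "/foo.slice/bar.service", then "/foo.slice", and finally fall back to the root
 * slice. */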
2323 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2324 char *p;
2325 Unit *u;
2326
2327 assert(m);
2328 assert(cgroup);
2329
2330 u = hashmap_get(m->cgroup_unit, cgroup);
2331 if (u)
2332 return u;
2333
2334 p = strdupa(cgroup);
2335 for (;;) {
2336 char *e;
2337
2338 e = strrchr(p, '/');
2339 if (!e || e == p)
2340 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2341
2342 *e = 0;
2343
2344 u = hashmap_get(m->cgroup_unit, p);
2345 if (u)
2346 return u;
2347 }
2348 }
2349
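/* Resolves a PID to the unit owning it via the PID's cgroup path; returns NULL if the PID is invalid
 * or its cgroup cannot be determined. */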
2350 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2351 _cleanup_free_ char *cgroup = NULL;
2352
2353 assert(m);
2354
2355 if (!pid_is_valid(pid))
2356 return NULL;
2357
2358 if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
2359 return NULL;
2360
2361 return manager_get_unit_by_cgroup(m, cgroup);
2362 }
2363
2364 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2365 Unit *u, **array;
2366
2367 assert(m);
2368
2369         /* Note that a process might be owned by multiple units; we return only one here, which is good enough for most
2370          * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
2371          * relevant one: children of the process will be assigned to that one, too, before all else. */
2372
2373 if (!pid_is_valid(pid))
2374 return NULL;
2375
2376 if (pid == getpid_cached())
2377 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2378
2379 u = manager_get_unit_by_pid_cgroup(m, pid);
2380 if (u)
2381 return u;
2382
2383 u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
2384 if (u)
2385 return u;
2386
2387 array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
2388 if (array)
2389 return array[0];
2390
2391 return NULL;
2392 }
2393
2394 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2395 Unit *u;
2396
2397 assert(m);
2398 assert(cgroup);
2399
2400 /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2401 * or from the --system instance */
2402
2403 log_debug("Got cgroup empty notification for: %s", cgroup);
2404
2405 u = manager_get_unit_by_cgroup(m, cgroup);
2406 if (!u)
2407 return 0;
2408
2409 unit_add_to_cgroup_empty_queue(u);
2410 return 1;
2411 }
2412
2413 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2414 _cleanup_free_ char *v = NULL;
2415 int r;
2416
2417 assert(u);
2418 assert(ret);
2419
2420 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2421 return -ENODATA;
2422
2423 if (!u->cgroup_path)
2424 return -ENODATA;
2425
2426 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2427 if (unit_has_root_cgroup(u))
2428 return procfs_memory_get_current(ret);
2429
2430 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2431 return -ENODATA;
2432
2433 r = cg_all_unified();
2434 if (r < 0)
2435 return r;
2436 if (r > 0)
2437 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2438 else
2439 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2440 if (r == -ENOENT)
2441 return -ENODATA;
2442 if (r < 0)
2443 return r;
2444
2445 return safe_atou64(v, ret);
2446 }
2447
2448 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2449 _cleanup_free_ char *v = NULL;
2450 int r;
2451
2452 assert(u);
2453 assert(ret);
2454
2455 if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2456 return -ENODATA;
2457
2458 if (!u->cgroup_path)
2459 return -ENODATA;
2460
2461 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2462 if (unit_has_root_cgroup(u))
2463 return procfs_tasks_get_current(ret);
2464
2465 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2466 return -ENODATA;
2467
2468 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2469 if (r == -ENOENT)
2470 return -ENODATA;
2471 if (r < 0)
2472 return r;
2473
2474 return safe_atou64(v, ret);
2475 }
2476
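/* Reads the unit's raw CPU usage counter: "usage_usec" from cpu.stat on the unified hierarchy,
 * "cpuacct.usage" on the legacy one; the result is normalized to nanoseconds in both cases. */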
2477 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2478 _cleanup_free_ char *v = NULL;
2479 uint64_t ns;
2480 int r;
2481
2482 assert(u);
2483 assert(ret);
2484
2485 if (!u->cgroup_path)
2486 return -ENODATA;
2487
2488 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2489 if (unit_has_root_cgroup(u))
2490 return procfs_cpu_get_usage(ret);
2491
2492 r = cg_all_unified();
2493 if (r < 0)
2494 return r;
2495 if (r > 0) {
2496 _cleanup_free_ char *val = NULL;
2497 uint64_t us;
2498
2499 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2500 return -ENODATA;
2501
2502 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
2503 if (r < 0)
2504 return r;
2505 if (IN_SET(r, -ENOENT, -ENXIO))
2506 return -ENODATA;
2507
2508 r = safe_atou64(val, &us);
2509 if (r < 0)
2510 return r;
2511
2512 ns = us * NSEC_PER_USEC;
2513 } else {
2514 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2515 return -ENODATA;
2516
2517 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2518 if (r == -ENOENT)
2519 return -ENODATA;
2520 if (r < 0)
2521 return r;
2522
2523 r = safe_atou64(v, &ns);
2524 if (r < 0)
2525 return r;
2526 }
2527
2528 *ret = ns;
2529 return 0;
2530 }
2531
2532 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2533 nsec_t ns;
2534 int r;
2535
2536 assert(u);
2537
2538 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2539          * started. If the cgroup has been removed already, this returns the last cached value. To cache the value, simply
2540          * call this function with a NULL return parameter. */
2541
2542 if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2543 return -ENODATA;
2544
2545 r = unit_get_cpu_usage_raw(u, &ns);
2546 if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2547 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2548 * cached value. */
2549
2550 if (ret)
2551 *ret = u->cpu_usage_last;
2552 return 0;
2553 }
2554 if (r < 0)
2555 return r;
2556
2557 if (ns > u->cpu_usage_base)
2558 ns -= u->cpu_usage_base;
2559 else
2560 ns = 0;
2561
2562 u->cpu_usage_last = ns;
2563 if (ret)
2564 *ret = ns;
2565
2566 return 0;
2567 }
2568
2569 int unit_get_ip_accounting(
2570 Unit *u,
2571 CGroupIPAccountingMetric metric,
2572 uint64_t *ret) {
2573
2574 uint64_t value;
2575 int fd, r;
2576
2577 assert(u);
2578 assert(metric >= 0);
2579 assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2580 assert(ret);
2581
2582 if (!UNIT_CGROUP_BOOL(u, ip_accounting))
2583 return -ENODATA;
2584
2585 fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2586 u->ip_accounting_ingress_map_fd :
2587 u->ip_accounting_egress_map_fd;
2588 if (fd < 0)
2589 return -ENODATA;
2590
2591 if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2592 r = bpf_firewall_read_accounting(fd, &value, NULL);
2593 else
2594 r = bpf_firewall_read_accounting(fd, NULL, &value);
2595 if (r < 0)
2596 return r;
2597
2598 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2599 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2600 * ip_accounting_extra[] field, and add them in here transparently. */
2601
2602 *ret = value + u->ip_accounting_extra[metric];
2603
2604 return r;
2605 }
2606
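/* Resets CPU accounting: the cached usage value is discarded and the current raw counter becomes the
 * new base for subsequent measurements. */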
2607 int unit_reset_cpu_accounting(Unit *u) {
2608 nsec_t ns;
2609 int r;
2610
2611 assert(u);
2612
2613 u->cpu_usage_last = NSEC_INFINITY;
2614
2615 r = unit_get_cpu_usage_raw(u, &ns);
2616 if (r < 0) {
2617 u->cpu_usage_base = 0;
2618 return r;
2619 }
2620
2621 u->cpu_usage_base = ns;
2622 return 0;
2623 }
2624
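/* Resets the ingress and egress BPF accounting maps as well as the extra counters carried over from a
 * previous runtime; returns the first error encountered, if any. */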
2625 int unit_reset_ip_accounting(Unit *u) {
2626 int r = 0, q = 0;
2627
2628 assert(u);
2629
2630 if (u->ip_accounting_ingress_map_fd >= 0)
2631 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2632
2633 if (u->ip_accounting_egress_map_fd >= 0)
2634 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2635
2636 zero(u->ip_accounting_extra);
2637
2638 return r < 0 ? r : q;
2639 }
2640
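/* Marks the given controllers as no longer realized for this unit and queues it for re-realization, so
 * that the corresponding attributes are written out again. */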
2641 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2642 assert(u);
2643
2644 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2645 return;
2646
2647 if (m == 0)
2648 return;
2649
2650 /* always invalidate compat pairs together */
2651 if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2652 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2653
2654 if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2655 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2656
2657 if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2658 return;
2659
2660 u->cgroup_realized_mask &= ~m;
2661 unit_add_to_cgroup_realize_queue(u);
2662 }
2663
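/* Marks the unit's BPF program as out of date and queues the unit for re-realization; for slices this
 * is propagated to all direct member units, as explained below. */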
2664 void unit_invalidate_cgroup_bpf(Unit *u) {
2665 assert(u);
2666
2667 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2668 return;
2669
2670 if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2671 return;
2672
2673 u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2674 unit_add_to_cgroup_realize_queue(u);
2675
2676         /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
2677 * list of our children includes our own. */
2678 if (u->type == UNIT_SLICE) {
2679 Unit *member;
2680 Iterator i;
2681 void *v;
2682
2683 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2684 if (member == u)
2685 continue;
2686
2687 if (UNIT_DEREF(member->slice) != u)
2688 continue;
2689
2690 unit_invalidate_cgroup_bpf(member);
2691 }
2692 }
2693 }
2694
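/* Returns true only if the unit type supports delegation and delegation is enabled in its cgroup
 * context. */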
2695 bool unit_cgroup_delegate(Unit *u) {
2696 CGroupContext *c;
2697
2698 assert(u);
2699
2700 if (!UNIT_VTABLE(u)->can_delegate)
2701 return false;
2702
2703 c = unit_get_cgroup_context(u);
2704 if (!c)
2705 return false;
2706
2707 return c->delegate;
2708 }
2709
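/* Queues every unit in the manager's startup_units set for re-realization of the CPU, IO and BLKIO
 * controllers. */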
2710 void manager_invalidate_startup_units(Manager *m) {
2711 Iterator i;
2712 Unit *u;
2713
2714 assert(m);
2715
2716 SET_FOREACH(u, m->startup_units, i)
2717 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2718 }
2719
2720 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2721 [CGROUP_AUTO] = "auto",
2722 [CGROUP_CLOSED] = "closed",
2723 [CGROUP_STRICT] = "strict",
2724 };
2725
2726 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);