src/core/cgroup.c (systemd, commit b3cd12000e0f5385b1fc28ae8fd82d41774c51a5)
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2013 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <fcntl.h>
22 #include <fnmatch.h>
23
24 #include "alloc-util.h"
25 #include "blockdev-util.h"
26 #include "bpf-firewall.h"
27 #include "cgroup-util.h"
28 #include "cgroup.h"
29 #include "fd-util.h"
30 #include "fileio.h"
31 #include "fs-util.h"
32 #include "parse-util.h"
33 #include "path-util.h"
34 #include "process-util.h"
35 #include "special.h"
36 #include "stdio-util.h"
37 #include "string-table.h"
38 #include "string-util.h"
39
40 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
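/* All CPU quota below is programmed against this fixed 100ms CFS period. As a rough worked example
 * (assuming the usual CPUQuota= semantics): CPUQuota=50% stores cpu_quota_per_sec_usec == 500000,
 * which the apply functions further down turn into 500000 * 100000 / 1000000 == 50000us of runtime
 * per 100000us period, i.e. half a CPU's worth of time. */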
41
42 static void cgroup_compat_warn(void) {
43 static bool cgroup_compat_warned = false;
44
45 if (cgroup_compat_warned)
46 return;
47
48 log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
49 cgroup_compat_warned = true;
50 }
51
52 #define log_cgroup_compat(unit, fmt, ...) do { \
53 cgroup_compat_warn(); \
54 log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \
55 } while (false)
56
57 void cgroup_context_init(CGroupContext *c) {
58 assert(c);
59
60 /* Initialize everything to the kernel defaults, assuming the
61 * structure is preinitialized to 0 */
62
63 c->cpu_weight = CGROUP_WEIGHT_INVALID;
64 c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
65 c->cpu_quota_per_sec_usec = USEC_INFINITY;
66
67 c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
68 c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
69
70 c->memory_high = CGROUP_LIMIT_MAX;
71 c->memory_max = CGROUP_LIMIT_MAX;
72 c->memory_swap_max = CGROUP_LIMIT_MAX;
73
74 c->memory_limit = CGROUP_LIMIT_MAX;
75
76 c->io_weight = CGROUP_WEIGHT_INVALID;
77 c->startup_io_weight = CGROUP_WEIGHT_INVALID;
78
79 c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
80 c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
81
82 c->tasks_max = (uint64_t) -1;
83 }
84
85 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
86 assert(c);
87 assert(a);
88
89 LIST_REMOVE(device_allow, c->device_allow, a);
90 free(a->path);
91 free(a);
92 }
93
94 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
95 assert(c);
96 assert(w);
97
98 LIST_REMOVE(device_weights, c->io_device_weights, w);
99 free(w->path);
100 free(w);
101 }
102
103 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
104 assert(c);
105 assert(l);
106
107 LIST_REMOVE(device_limits, c->io_device_limits, l);
108 free(l->path);
109 free(l);
110 }
111
112 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
113 assert(c);
114 assert(w);
115
116 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
117 free(w->path);
118 free(w);
119 }
120
121 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
122 assert(c);
123 assert(b);
124
125 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
126 free(b->path);
127 free(b);
128 }
129
130 void cgroup_context_done(CGroupContext *c) {
131 assert(c);
132
133 while (c->io_device_weights)
134 cgroup_context_free_io_device_weight(c, c->io_device_weights);
135
136 while (c->io_device_limits)
137 cgroup_context_free_io_device_limit(c, c->io_device_limits);
138
139 while (c->blockio_device_weights)
140 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
141
142 while (c->blockio_device_bandwidths)
143 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
144
145 while (c->device_allow)
146 cgroup_context_free_device_allow(c, c->device_allow);
147
148 c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
149 c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
150 }
151
152 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
153 CGroupIODeviceLimit *il;
154 CGroupIODeviceWeight *iw;
155 CGroupBlockIODeviceBandwidth *b;
156 CGroupBlockIODeviceWeight *w;
157 CGroupDeviceAllow *a;
158 IPAddressAccessItem *iaai;
159 char u[FORMAT_TIMESPAN_MAX];
160
161 assert(c);
162 assert(f);
163
164 prefix = strempty(prefix);
165
166 fprintf(f,
167 "%sCPUAccounting=%s\n"
168 "%sIOAccounting=%s\n"
169 "%sBlockIOAccounting=%s\n"
170 "%sMemoryAccounting=%s\n"
171 "%sTasksAccounting=%s\n"
172 "%sIPAccounting=%s\n"
173 "%sCPUWeight=%" PRIu64 "\n"
174 "%sStartupCPUWeight=%" PRIu64 "\n"
175 "%sCPUShares=%" PRIu64 "\n"
176 "%sStartupCPUShares=%" PRIu64 "\n"
177 "%sCPUQuotaPerSecSec=%s\n"
178 "%sIOWeight=%" PRIu64 "\n"
179 "%sStartupIOWeight=%" PRIu64 "\n"
180 "%sBlockIOWeight=%" PRIu64 "\n"
181 "%sStartupBlockIOWeight=%" PRIu64 "\n"
182 "%sMemoryLow=%" PRIu64 "\n"
183 "%sMemoryHigh=%" PRIu64 "\n"
184 "%sMemoryMax=%" PRIu64 "\n"
185 "%sMemorySwapMax=%" PRIu64 "\n"
186 "%sMemoryLimit=%" PRIu64 "\n"
187 "%sTasksMax=%" PRIu64 "\n"
188 "%sDevicePolicy=%s\n"
189 "%sDelegate=%s\n",
190 prefix, yes_no(c->cpu_accounting),
191 prefix, yes_no(c->io_accounting),
192 prefix, yes_no(c->blockio_accounting),
193 prefix, yes_no(c->memory_accounting),
194 prefix, yes_no(c->tasks_accounting),
195 prefix, yes_no(c->ip_accounting),
196 prefix, c->cpu_weight,
197 prefix, c->startup_cpu_weight,
198 prefix, c->cpu_shares,
199 prefix, c->startup_cpu_shares,
200 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
201 prefix, c->io_weight,
202 prefix, c->startup_io_weight,
203 prefix, c->blockio_weight,
204 prefix, c->startup_blockio_weight,
205 prefix, c->memory_low,
206 prefix, c->memory_high,
207 prefix, c->memory_max,
208 prefix, c->memory_swap_max,
209 prefix, c->memory_limit,
210 prefix, c->tasks_max,
211 prefix, cgroup_device_policy_to_string(c->device_policy),
212 prefix, yes_no(c->delegate));
213
214 if (c->delegate) {
215 _cleanup_free_ char *t = NULL;
216
217 (void) cg_mask_to_string(c->delegate_controllers, &t);
218
219 fprintf(f, "%sDelegateControllers=%s\n",
220 prefix,
221 strempty(t));
222 }
223
224 LIST_FOREACH(device_allow, a, c->device_allow)
225 fprintf(f,
226 "%sDeviceAllow=%s %s%s%s\n",
227 prefix,
228 a->path,
229 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
230
231 LIST_FOREACH(device_weights, iw, c->io_device_weights)
232 fprintf(f,
233 "%sIODeviceWeight=%s %" PRIu64 "\n",
234 prefix,
235 iw->path,
236 iw->weight);
237
238 LIST_FOREACH(device_limits, il, c->io_device_limits) {
239 char buf[FORMAT_BYTES_MAX];
240 CGroupIOLimitType type;
241
242 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
243 if (il->limits[type] != cgroup_io_limit_defaults[type])
244 fprintf(f,
245 "%s%s=%s %s\n",
246 prefix,
247 cgroup_io_limit_type_to_string(type),
248 il->path,
249 format_bytes(buf, sizeof(buf), il->limits[type]));
250 }
251
252 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
253 fprintf(f,
254 "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
255 prefix,
256 w->path,
257 w->weight);
258
259 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
260 char buf[FORMAT_BYTES_MAX];
261
262 if (b->rbps != CGROUP_LIMIT_MAX)
263 fprintf(f,
264 "%sBlockIOReadBandwidth=%s %s\n",
265 prefix,
266 b->path,
267 format_bytes(buf, sizeof(buf), b->rbps));
268 if (b->wbps != CGROUP_LIMIT_MAX)
269 fprintf(f,
270 "%sBlockIOWriteBandwidth=%s %s\n",
271 prefix,
272 b->path,
273 format_bytes(buf, sizeof(buf), b->wbps));
274 }
275
276 LIST_FOREACH(items, iaai, c->ip_address_allow) {
277 _cleanup_free_ char *k = NULL;
278
279 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
280 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
281 }
282
283 LIST_FOREACH(items, iaai, c->ip_address_deny) {
284 _cleanup_free_ char *k = NULL;
285
286 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
287 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
288 }
289 }
290
291 static int lookup_block_device(const char *p, dev_t *dev) {
292 struct stat st;
293 int r;
294
295 assert(p);
296 assert(dev);
297
298 r = stat(p, &st);
299 if (r < 0)
300 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
301
302 if (S_ISBLK(st.st_mode))
303 *dev = st.st_rdev;
304 else if (major(st.st_dev) != 0) {
305 /* If this is not a device node then find the block
306 * device this file is stored on */
307 *dev = st.st_dev;
308
309 /* If this is a partition, try to get the originating
310 * block device */
311 (void) block_get_whole_disk(*dev, dev);
312 } else {
313 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
314 return -ENODEV;
315 }
316
317 return 0;
318 }
319
320 static int whitelist_device(const char *path, const char *node, const char *acc) {
321 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
322 struct stat st;
323 bool ignore_notfound;
324 int r;
325
326 assert(path);
327 assert(acc);
328
329 if (node[0] == '-') {
330 /* Non-existent paths starting with "-" must be silently ignored */
331 node++;
332 ignore_notfound = true;
333 } else
334 ignore_notfound = false;
335
336 if (stat(node, &st) < 0) {
337 if (errno == ENOENT && ignore_notfound)
338 return 0;
339
340 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
341 }
342
343 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
344 log_warning("%s is not a device.", node);
345 return -ENODEV;
346 }
347
348 sprintf(buf,
349 "%c %u:%u %s",
350 S_ISCHR(st.st_mode) ? 'c' : 'b',
351 major(st.st_rdev), minor(st.st_rdev),
352 acc);
353
354 r = cg_set_attribute("devices", path, "devices.allow", buf);
355 if (r < 0)
356 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
357 "Failed to set devices.allow on %s: %m", path);
358
359 return r;
360 }
361
362 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
363 _cleanup_fclose_ FILE *f = NULL;
364 char line[LINE_MAX];
365 bool good = false;
366 int r;
367
368 assert(path);
369 assert(acc);
370 assert(IN_SET(type, 'b', 'c'));
371
372 f = fopen("/proc/devices", "re");
373 if (!f)
374 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
375
376 FOREACH_LINE(line, f, goto fail) {
377 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
378 unsigned maj;
379
380 truncate_nl(line);
381
382 if (type == 'c' && streq(line, "Character devices:")) {
383 good = true;
384 continue;
385 }
386
387 if (type == 'b' && streq(line, "Block devices:")) {
388 good = true;
389 continue;
390 }
391
392 if (isempty(line)) {
393 good = false;
394 continue;
395 }
396
397 if (!good)
398 continue;
399
400 p = strstrip(line);
401
402 w = strpbrk(p, WHITESPACE);
403 if (!w)
404 continue;
405 *w = 0;
406
407 r = safe_atou(p, &maj);
408 if (r < 0)
409 continue;
410 if (maj == 0)
411 continue;
412
413 w++;
414 w += strspn(w, WHITESPACE);
415
416 if (fnmatch(name, w, 0) != 0)
417 continue;
418
419 sprintf(buf,
420 "%c %u:* %s",
421 type,
422 maj,
423 acc);
424
425 r = cg_set_attribute("devices", path, "devices.allow", buf);
426 if (r < 0)
427 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
428 "Failed to set devices.allow on %s: %m", path);
429 }
430
431 return 0;
432
433 fail:
434 return log_warning_errno(errno, "Failed to read /proc/devices: %m");
435 }
436
437 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
438 return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
439 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
440 }
441
442 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
443 return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
444 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
445 }
446
447 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
448 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
449 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
450 return c->startup_cpu_weight;
451 else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
452 return c->cpu_weight;
453 else
454 return CGROUP_WEIGHT_DEFAULT;
455 }
456
457 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
458 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
459 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
460 return c->startup_cpu_shares;
461 else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
462 return c->cpu_shares;
463 else
464 return CGROUP_CPU_SHARES_DEFAULT;
465 }
466
467 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
468 char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
469 int r;
470
471 xsprintf(buf, "%" PRIu64 "\n", weight);
472 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
473 if (r < 0)
474 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
475 "Failed to set cpu.weight: %m");
476
477 if (quota != USEC_INFINITY)
478 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
479 quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
480 else
481 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
482
483 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
484
485 if (r < 0)
486 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
487 "Failed to set cpu.max: %m");
488 }
489
490 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
491 char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
492 int r;
493
494 xsprintf(buf, "%" PRIu64 "\n", shares);
495 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
496 if (r < 0)
497 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
498 "Failed to set cpu.shares: %m");
499
500 xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
501 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
502 if (r < 0)
503 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
504 "Failed to set cpu.cfs_period_us: %m");
505
506 if (quota != USEC_INFINITY) {
507 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
508 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
509 } else
510 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
511 if (r < 0)
512 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
513 "Failed to set cpu.cfs_quota_us: %m");
514 }
515
516 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
517 return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
518 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
519 }
520
521 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
522 return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
523 CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
524 }
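/* The two helpers above just rescale linearly between the legacy cpu.shares range and the unified
 * cpu.weight range, clamping to the valid bounds. Assuming the usual defaults of 1024 shares and
 * weight 100, CPUShares=2048 maps to CPUWeight=200, and CPUWeight=50 maps back to CPUShares=512. */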
525
526 static bool cgroup_context_has_io_config(CGroupContext *c) {
527 return c->io_accounting ||
528 c->io_weight != CGROUP_WEIGHT_INVALID ||
529 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
530 c->io_device_weights ||
531 c->io_device_limits;
532 }
533
534 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
535 return c->blockio_accounting ||
536 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
537 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
538 c->blockio_device_weights ||
539 c->blockio_device_bandwidths;
540 }
541
542 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
543 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
544 c->startup_io_weight != CGROUP_WEIGHT_INVALID)
545 return c->startup_io_weight;
546 else if (c->io_weight != CGROUP_WEIGHT_INVALID)
547 return c->io_weight;
548 else
549 return CGROUP_WEIGHT_DEFAULT;
550 }
551
552 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
553 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
554 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
555 return c->startup_blockio_weight;
556 else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
557 return c->blockio_weight;
558 else
559 return CGROUP_BLKIO_WEIGHT_DEFAULT;
560 }
561
562 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
563 return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
564 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
565 }
566
567 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
568 return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
569 CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
570 }
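/* Same rescaling idea as for CPU above, but between the legacy blkio.weight range and the unified
 * io.weight range. Assuming the usual defaults of blkio weight 500 and io weight 100,
 * BlockIOWeight=1000 becomes IOWeight=200, while IOWeight=500 would map to 2500 and is then
 * clamped down to the legacy maximum of 1000. */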
571
572 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
573 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
574 dev_t dev;
575 int r;
576
577 r = lookup_block_device(dev_path, &dev);
578 if (r < 0)
579 return;
580
581 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
582 r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
583 if (r < 0)
584 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
585 "Failed to set io.weight: %m");
586 }
587
588 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
589 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
590 dev_t dev;
591 int r;
592
593 r = lookup_block_device(dev_path, &dev);
594 if (r < 0)
595 return;
596
597 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
598 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
599 if (r < 0)
600 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
601 "Failed to set blkio.weight_device: %m");
602 }
603
604 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
605 char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
606 char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
607 CGroupIOLimitType type;
608 dev_t dev;
609 unsigned n = 0;
610 int r;
611
612 r = lookup_block_device(dev_path, &dev);
613 if (r < 0)
614 return 0;
615
616 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
617 if (limits[type] != cgroup_io_limit_defaults[type]) {
618 xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
619 n++;
620 } else {
621 xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
622 }
623 }
624
625 xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
626 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
627 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
628 r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
629 if (r < 0)
630 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
631 "Failed to set io.max: %m");
632 return n;
633 }
634
635 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
636 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
637 dev_t dev;
638 unsigned n = 0;
639 int r;
640
641 r = lookup_block_device(dev_path, &dev);
642 if (r < 0)
643 return 0;
644
645 if (rbps != CGROUP_LIMIT_MAX)
646 n++;
647 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
648 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
649 if (r < 0)
650 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
651 "Failed to set blkio.throttle.read_bps_device: %m");
652
653 if (wbps != CGROUP_LIMIT_MAX)
654 n++;
655 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
656 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
657 if (r < 0)
658 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
659 "Failed to set blkio.throttle.write_bps_device: %m");
660
661 return n;
662 }
663
664 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
665 return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
666 }
667
668 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
669 char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
670 int r;
671
672 if (v != CGROUP_LIMIT_MAX)
673 xsprintf(buf, "%" PRIu64 "\n", v);
674
675 r = cg_set_attribute("memory", u->cgroup_path, file, buf);
676 if (r < 0)
677 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
678 "Failed to set %s: %m", file);
679 }
680
681 static void cgroup_apply_firewall(Unit *u) {
682 int r;
683
684 assert(u);
685
686 if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
687 * not recursive we don't ever touch the bpf on them */
688 return;
689
690 r = bpf_firewall_compile(u);
691 if (r < 0)
692 return;
693
694 (void) bpf_firewall_install(u);
695 return;
696 }
697
698 static void cgroup_context_apply(
699 Unit *u,
700 CGroupMask apply_mask,
701 bool apply_bpf,
702 ManagerState state) {
703
704 const char *path;
705 CGroupContext *c;
706 bool is_root;
707 int r;
708
709 assert(u);
710
711 c = unit_get_cgroup_context(u);
712 path = u->cgroup_path;
713
714 assert(c);
715 assert(path);
716
717 /* Nothing to do? Exit early! */
718 if (apply_mask == 0 && !apply_bpf)
719 return;
720
721 /* Some cgroup attributes are not supported on the root cgroup,
722 * hence silently ignore */
723 is_root = isempty(path) || path_equal(path, "/");
724 if (is_root)
725 /* Make sure we don't try to display messages with an empty path. */
726 path = "/";
727
728 /* We generally ignore errors caused by read-only mounted
729 * cgroup trees (assuming we are running in a container then),
730 * and missing cgroups, i.e. EROFS and ENOENT. */
731
732 if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
733 bool has_weight, has_shares;
734
735 has_weight = cgroup_context_has_cpu_weight(c);
736 has_shares = cgroup_context_has_cpu_shares(c);
737
738 if (cg_all_unified() > 0) {
739 uint64_t weight;
740
741 if (has_weight)
742 weight = cgroup_context_cpu_weight(c, state);
743 else if (has_shares) {
744 uint64_t shares = cgroup_context_cpu_shares(c, state);
745
746 weight = cgroup_cpu_shares_to_weight(shares);
747
748 log_cgroup_compat(u, "Applying [Startup]CPUShares %" PRIu64 " as [Startup]CPUWeight %" PRIu64 " on %s",
749 shares, weight, path);
750 } else
751 weight = CGROUP_WEIGHT_DEFAULT;
752
753 cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
754 } else {
755 uint64_t shares;
756
757 if (has_weight) {
758 uint64_t weight = cgroup_context_cpu_weight(c, state);
759
760 shares = cgroup_cpu_weight_to_shares(weight);
761
762 log_cgroup_compat(u, "Applying [Startup]CPUWeight %" PRIu64 " as [Startup]CPUShares %" PRIu64 " on %s",
763 weight, shares, path);
764 } else if (has_shares)
765 shares = cgroup_context_cpu_shares(c, state);
766 else
767 shares = CGROUP_CPU_SHARES_DEFAULT;
768
769 cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
770 }
771 }
772
773 if (apply_mask & CGROUP_MASK_IO) {
774 bool has_io = cgroup_context_has_io_config(c);
775 bool has_blockio = cgroup_context_has_blockio_config(c);
776
777 if (!is_root) {
778 char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
779 uint64_t weight;
780
781 if (has_io)
782 weight = cgroup_context_io_weight(c, state);
783 else if (has_blockio) {
784 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
785
786 weight = cgroup_weight_blkio_to_io(blkio_weight);
787
788 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
789 blkio_weight, weight);
790 } else
791 weight = CGROUP_WEIGHT_DEFAULT;
792
793 xsprintf(buf, "default %" PRIu64 "\n", weight);
794 r = cg_set_attribute("io", path, "io.weight", buf);
795 if (r < 0)
796 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
797 "Failed to set io.weight: %m");
798
799 if (has_io) {
800 CGroupIODeviceWeight *w;
801
802 /* FIXME: no way to reset this list */
803 LIST_FOREACH(device_weights, w, c->io_device_weights)
804 cgroup_apply_io_device_weight(u, w->path, w->weight);
805 } else if (has_blockio) {
806 CGroupBlockIODeviceWeight *w;
807
808 /* FIXME: no way to reset this list */
809 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
810 weight = cgroup_weight_blkio_to_io(w->weight);
811
812 log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
813 w->weight, weight, w->path);
814
815 cgroup_apply_io_device_weight(u, w->path, weight);
816 }
817 }
818 }
819
820 /* Apply limits and free ones without config. */
821 if (has_io) {
822 CGroupIODeviceLimit *l, *next;
823
824 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
825 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
826 cgroup_context_free_io_device_limit(c, l);
827 }
828 } else if (has_blockio) {
829 CGroupBlockIODeviceBandwidth *b, *next;
830
831 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
832 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
833 CGroupIOLimitType type;
834
835 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
836 limits[type] = cgroup_io_limit_defaults[type];
837
838 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
839 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
840
841 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
842 b->rbps, b->wbps, b->path);
843
844 if (!cgroup_apply_io_device_limit(u, b->path, limits))
845 cgroup_context_free_blockio_device_bandwidth(c, b);
846 }
847 }
848 }
849
850 if (apply_mask & CGROUP_MASK_BLKIO) {
851 bool has_io = cgroup_context_has_io_config(c);
852 bool has_blockio = cgroup_context_has_blockio_config(c);
853
854 if (!is_root) {
855 char buf[DECIMAL_STR_MAX(uint64_t)+1];
856 uint64_t weight;
857
858 if (has_io) {
859 uint64_t io_weight = cgroup_context_io_weight(c, state);
860
861 weight = cgroup_weight_io_to_blkio(io_weight);
862
863 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
864 io_weight, weight);
865 } else if (has_blockio)
866 weight = cgroup_context_blkio_weight(c, state);
867 else
868 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
869
870 xsprintf(buf, "%" PRIu64 "\n", weight);
871 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
872 if (r < 0)
873 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
874 "Failed to set blkio.weight: %m");
875
876 if (has_io) {
877 CGroupIODeviceWeight *w;
878
879 /* FIXME: no way to reset this list */
880 LIST_FOREACH(device_weights, w, c->io_device_weights) {
881 weight = cgroup_weight_io_to_blkio(w->weight);
882
883 log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
884 w->weight, weight, w->path);
885
886 cgroup_apply_blkio_device_weight(u, w->path, weight);
887 }
888 } else if (has_blockio) {
889 CGroupBlockIODeviceWeight *w;
890
891 /* FIXME: no way to reset this list */
892 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
893 cgroup_apply_blkio_device_weight(u, w->path, w->weight);
894 }
895 }
896
897 /* Apply limits and free ones without config. */
898 if (has_io) {
899 CGroupIODeviceLimit *l, *next;
900
901 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
902 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
903 l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
904
905 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
906 cgroup_context_free_io_device_limit(c, l);
907 }
908 } else if (has_blockio) {
909 CGroupBlockIODeviceBandwidth *b, *next;
910
911 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
912 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
913 cgroup_context_free_blockio_device_bandwidth(c, b);
914 }
915 }
916
917 if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
918 if (cg_all_unified() > 0) {
919 uint64_t max, swap_max = CGROUP_LIMIT_MAX;
920
921 if (cgroup_context_has_unified_memory_config(c)) {
922 max = c->memory_max;
923 swap_max = c->memory_swap_max;
924 } else {
925 max = c->memory_limit;
926
927 if (max != CGROUP_LIMIT_MAX)
928 log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
929 }
930
931 cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
932 cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
933 cgroup_apply_unified_memory_limit(u, "memory.max", max);
934 cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
935 } else {
936 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
937 uint64_t val;
938
939 if (cgroup_context_has_unified_memory_config(c)) {
940 val = c->memory_max;
941 log_cgroup_compat(u, "Applying MemoryMax %" PRIu64 " as MemoryLimit", val);
942 } else
943 val = c->memory_limit;
944
945 if (val == CGROUP_LIMIT_MAX)
946 strncpy(buf, "-1\n", sizeof(buf));
947 else
948 xsprintf(buf, "%" PRIu64 "\n", val);
949
950 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
951 if (r < 0)
952 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
953 "Failed to set memory.limit_in_bytes: %m");
954 }
955 }
956
957 if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
958 CGroupDeviceAllow *a;
959
960 /* Changing the devices list of a populated cgroup
961 * might result in EINVAL, hence ignore EINVAL
962 * here. */
963
964 if (c->device_allow || c->device_policy != CGROUP_AUTO)
965 r = cg_set_attribute("devices", path, "devices.deny", "a");
966 else
967 r = cg_set_attribute("devices", path, "devices.allow", "a");
968 if (r < 0)
969 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
970 "Failed to reset devices.list: %m");
971
972 if (c->device_policy == CGROUP_CLOSED ||
973 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
974 static const char auto_devices[] =
975 "/dev/null\0" "rwm\0"
976 "/dev/zero\0" "rwm\0"
977 "/dev/full\0" "rwm\0"
978 "/dev/random\0" "rwm\0"
979 "/dev/urandom\0" "rwm\0"
980 "/dev/tty\0" "rwm\0"
981 "/dev/ptmx\0" "rwm\0"
982 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
983 "-/run/systemd/inaccessible/chr\0" "rwm\0"
984 "-/run/systemd/inaccessible/blk\0" "rwm\0";
985
986 const char *x, *y;
987
988 NULSTR_FOREACH_PAIR(x, y, auto_devices)
989 whitelist_device(path, x, y);
990
991 /* PTS (/dev/pts) devices may not be duplicated, but accessed */
992 whitelist_major(path, "pts", 'c', "rw");
993 }
994
995 LIST_FOREACH(device_allow, a, c->device_allow) {
996 char acc[4], *val;
997 unsigned k = 0;
998
999 if (a->r)
1000 acc[k++] = 'r';
1001 if (a->w)
1002 acc[k++] = 'w';
1003 if (a->m)
1004 acc[k++] = 'm';
1005
1006 if (k == 0)
1007 continue;
1008
1009 acc[k++] = 0;
1010
1011 if (path_startswith(a->path, "/dev/"))
1012 whitelist_device(path, a->path, acc);
1013 else if ((val = startswith(a->path, "block-")))
1014 whitelist_major(path, val, 'b', acc);
1015 else if ((val = startswith(a->path, "char-")))
1016 whitelist_major(path, val, 'c', acc);
1017 else
1018 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
1019 }
1020 }
1021
1022 if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) {
1023
1024 if (c->tasks_max != CGROUP_LIMIT_MAX) {
1025 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1026
1027 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1028 r = cg_set_attribute("pids", path, "pids.max", buf);
1029 } else
1030 r = cg_set_attribute("pids", path, "pids.max", "max");
1031
1032 if (r < 0)
1033 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1034 "Failed to set pids.max: %m");
1035 }
1036
1037 if (apply_bpf)
1038 cgroup_apply_firewall(u);
1039 }
1040
1041 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1042 CGroupMask mask = 0;
1043
1044 /* Figure out which controllers we need */
1045
1046 if (c->cpu_accounting ||
1047 cgroup_context_has_cpu_weight(c) ||
1048 cgroup_context_has_cpu_shares(c) ||
1049 c->cpu_quota_per_sec_usec != USEC_INFINITY)
1050 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
1051
1052 if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1053 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1054
1055 if (c->memory_accounting ||
1056 c->memory_limit != CGROUP_LIMIT_MAX ||
1057 cgroup_context_has_unified_memory_config(c))
1058 mask |= CGROUP_MASK_MEMORY;
1059
1060 if (c->device_allow ||
1061 c->device_policy != CGROUP_AUTO)
1062 mask |= CGROUP_MASK_DEVICES;
1063
1064 if (c->tasks_accounting ||
1065 c->tasks_max != (uint64_t) -1)
1066 mask |= CGROUP_MASK_PIDS;
1067
1068 return mask;
1069 }
1070
1071 CGroupMask unit_get_own_mask(Unit *u) {
1072 CGroupContext *c;
1073
1074 /* Returns the mask of controllers the unit needs for itself */
1075
1076 c = unit_get_cgroup_context(u);
1077 if (!c)
1078 return 0;
1079
1080 return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
1081 }
1082
1083 CGroupMask unit_get_delegate_mask(Unit *u) {
1084 CGroupContext *c;
1085
1086 /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1087 * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1088 *
1089 * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1090
1091 if (u->type == UNIT_SLICE)
1092 return 0;
1093
1094 c = unit_get_cgroup_context(u);
1095 if (!c)
1096 return 0;
1097
1098 if (!c->delegate)
1099 return 0;
1100
1101 if (cg_all_unified() <= 0) {
1102 ExecContext *e;
1103
1104 e = unit_get_exec_context(u);
1105 if (e && !exec_context_maintains_privileges(e))
1106 return 0;
1107 }
1108
1109 return c->delegate_controllers;
1110 }
1111
1112 CGroupMask unit_get_members_mask(Unit *u) {
1113 assert(u);
1114
1115 /* Returns the mask of controllers all of the unit's children require, merged */
1116
1117 if (u->cgroup_members_mask_valid)
1118 return u->cgroup_members_mask;
1119
1120 u->cgroup_members_mask = 0;
1121
1122 if (u->type == UNIT_SLICE) {
1123 void *v;
1124 Unit *member;
1125 Iterator i;
1126
1127 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1128
1129 if (member == u)
1130 continue;
1131
1132 if (UNIT_DEREF(member->slice) != u)
1133 continue;
1134
1135 u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1136 }
1137 }
1138
1139 u->cgroup_members_mask_valid = true;
1140 return u->cgroup_members_mask;
1141 }
1142
1143 CGroupMask unit_get_siblings_mask(Unit *u) {
1144 assert(u);
1145
1146 /* Returns the mask of controllers all of the unit's siblings
1147 * require, i.e. the members mask of the unit's parent slice
1148 * if there is one. */
1149
1150 if (UNIT_ISSET(u->slice))
1151 return unit_get_members_mask(UNIT_DEREF(u->slice));
1152
1153 return unit_get_subtree_mask(u); /* we are the top-level slice */
1154 }
1155
1156 CGroupMask unit_get_subtree_mask(Unit *u) {
1157
1158 /* Returns the mask of this subtree, meaning of the group
1159 * itself and its children. */
1160
1161 return unit_get_own_mask(u) | unit_get_members_mask(u);
1162 }
1163
1164 CGroupMask unit_get_target_mask(Unit *u) {
1165 CGroupMask mask;
1166
1167 /* This returns the cgroup mask of all controllers to enable
1168 * for a specific cgroup, i.e. everything it needs itself,
1169 * plus all that its children need, plus all that its siblings
1170 * need. This is primarily useful on the legacy cgroup
1171 * hierarchy, where we need to duplicate each cgroup in each
1172 * hierarchy that shall be enabled for it. */
1173
1174 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1175 mask &= u->manager->cgroup_supported;
1176
1177 return mask;
1178 }
1179
1180 CGroupMask unit_get_enable_mask(Unit *u) {
1181 CGroupMask mask;
1182
1183 /* This returns the cgroup mask of all controllers to enable
1184 * for the children of a specific cgroup. This is primarily
1185 * useful for the unified cgroup hierarchy, where each cgroup
1186 * controls which controllers are enabled for its children. */
1187
1188 mask = unit_get_members_mask(u);
1189 mask &= u->manager->cgroup_supported;
1190
1191 return mask;
1192 }
1193
1194 bool unit_get_needs_bpf(Unit *u) {
1195 CGroupContext *c;
1196 Unit *p;
1197 assert(u);
1198
1199 /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
1200 * moment. */
1201 if (u->type == UNIT_SLICE)
1202 return false;
1203
1204 c = unit_get_cgroup_context(u);
1205 if (!c)
1206 return false;
1207
1208 if (c->ip_accounting ||
1209 c->ip_address_allow ||
1210 c->ip_address_deny)
1211 return true;
1212
1213 /* If any parent slice has an IP access list defined, it applies too */
1214 for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1215 c = unit_get_cgroup_context(p);
1216 if (!c)
1217 return false;
1218
1219 if (c->ip_address_allow ||
1220 c->ip_address_deny)
1221 return true;
1222 }
1223
1224 return false;
1225 }
1226
1227 /* Recurse from a unit up through its containing slices, propagating
1228 * mask bits upward. A unit is also a member of itself. */
1229 void unit_update_cgroup_members_masks(Unit *u) {
1230 CGroupMask m;
1231 bool more;
1232
1233 assert(u);
1234
1235 /* Calculate subtree mask */
1236 m = unit_get_subtree_mask(u);
1237
1238 /* See if anything changed from the previous invocation. If
1239 * not, we're done. */
1240 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
1241 return;
1242
1243 more =
1244 u->cgroup_subtree_mask_valid &&
1245 ((m & ~u->cgroup_subtree_mask) != 0) &&
1246 ((~m & u->cgroup_subtree_mask) == 0);
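/* In other words: "more" is true only if the new subtree mask is a strict superset of the
 * previous one, i.e. bits were only added. Only then can we get away with simply ORing the new
 * bits into the parent's members mask below; otherwise the parent has to recalculate from scratch. */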
1247
1248 u->cgroup_subtree_mask = m;
1249 u->cgroup_subtree_mask_valid = true;
1250
1251 if (UNIT_ISSET(u->slice)) {
1252 Unit *s = UNIT_DEREF(u->slice);
1253
1254 if (more)
1255 /* There's more set now than before. We
1256 * propagate the new mask to the parent's mask
1257 * (not caring if it actually was valid or
1258 * not). */
1259
1260 s->cgroup_members_mask |= m;
1261
1262 else
1263 /* There's less set now than before (or we
1264 * don't know), we need to recalculate
1265 * everything, so let's invalidate the
1266 * parent's members mask */
1267
1268 s->cgroup_members_mask_valid = false;
1269
1270 /* And now make sure that this change also hits our
1271 * grandparents */
1272 unit_update_cgroup_members_masks(s);
1273 }
1274 }
1275
1276 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1277 Unit *u = userdata;
1278
1279 assert(mask != 0);
1280 assert(u);
1281
1282 while (u) {
1283 if (u->cgroup_path &&
1284 u->cgroup_realized &&
1285 (u->cgroup_realized_mask & mask) == mask)
1286 return u->cgroup_path;
1287
1288 u = UNIT_DEREF(u->slice);
1289 }
1290
1291 return NULL;
1292 }
1293
1294 char *unit_default_cgroup_path(Unit *u) {
1295 _cleanup_free_ char *escaped = NULL, *slice = NULL;
1296 int r;
1297
1298 assert(u);
1299
1300 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1301 return strdup(u->manager->cgroup_root);
1302
1303 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1304 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1305 if (r < 0)
1306 return NULL;
1307 }
1308
1309 escaped = cg_escape(u->id);
1310 if (!escaped)
1311 return NULL;
1312
1313 if (slice)
1314 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1315 escaped);
1316 else
1317 return strjoin(u->manager->cgroup_root, "/", escaped);
1318 }
1319
1320 int unit_set_cgroup_path(Unit *u, const char *path) {
1321 _cleanup_free_ char *p = NULL;
1322 int r;
1323
1324 assert(u);
1325
1326 if (path) {
1327 p = strdup(path);
1328 if (!p)
1329 return -ENOMEM;
1330 } else
1331 p = NULL;
1332
1333 if (streq_ptr(u->cgroup_path, p))
1334 return 0;
1335
1336 if (p) {
1337 r = hashmap_put(u->manager->cgroup_unit, p, u);
1338 if (r < 0)
1339 return r;
1340 }
1341
1342 unit_release_cgroup(u);
1343
1344 u->cgroup_path = p;
1345 p = NULL;
1346
1347 return 1;
1348 }
1349
1350 int unit_watch_cgroup(Unit *u) {
1351 _cleanup_free_ char *events = NULL;
1352 int r;
1353
1354 assert(u);
1355
1356 if (!u->cgroup_path)
1357 return 0;
1358
1359 if (u->cgroup_inotify_wd >= 0)
1360 return 0;
1361
1362 /* Only applies to the unified hierarchy */
1363 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1364 if (r < 0)
1365 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1366 if (r == 0)
1367 return 0;
1368
1369 /* Don't watch the root slice, it's pointless. */
1370 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1371 return 0;
1372
1373 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1374 if (r < 0)
1375 return log_oom();
1376
1377 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1378 if (r < 0)
1379 return log_oom();
1380
1381 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1382 if (u->cgroup_inotify_wd < 0) {
1383
1384 if (errno == ENOENT) /* If the directory is already
1385 * gone we don't need to track
1386 * it, so this is not an error */
1387 return 0;
1388
1389 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1390 }
1391
1392 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1393 if (r < 0)
1394 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1395
1396 return 0;
1397 }
1398
1399 int unit_pick_cgroup_path(Unit *u) {
1400 _cleanup_free_ char *path = NULL;
1401 int r;
1402
1403 assert(u);
1404
1405 if (u->cgroup_path)
1406 return 0;
1407
1408 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1409 return -EINVAL;
1410
1411 path = unit_default_cgroup_path(u);
1412 if (!path)
1413 return log_oom();
1414
1415 r = unit_set_cgroup_path(u, path);
1416 if (r == -EEXIST)
1417 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1418 if (r < 0)
1419 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1420
1421 return 0;
1422 }
1423
1424 static int unit_create_cgroup(
1425 Unit *u,
1426 CGroupMask target_mask,
1427 CGroupMask enable_mask,
1428 bool needs_bpf) {
1429
1430 CGroupContext *c;
1431 int r;
1432
1433 assert(u);
1434
1435 c = unit_get_cgroup_context(u);
1436 if (!c)
1437 return 0;
1438
1439 /* Figure out our cgroup path */
1440 r = unit_pick_cgroup_path(u);
1441 if (r < 0)
1442 return r;
1443
1444 /* First, create our own group */
1445 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1446 if (r < 0)
1447 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1448
1449 /* Start watching it */
1450 (void) unit_watch_cgroup(u);
1451
1452 /* Enable all controllers we need */
1453 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1454 if (r < 0)
1455 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1456
1457 /* Keep track that this is now realized */
1458 u->cgroup_realized = true;
1459 u->cgroup_realized_mask = target_mask;
1460 u->cgroup_enabled_mask = enable_mask;
1461 u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1462
1463 if (u->type != UNIT_SLICE && !c->delegate) {
1464
1465 /* Then, possibly move things over, but not if
1466 * subgroups may contain processes, which is the case
1467 * for slice and delegation units. */
1468 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1469 if (r < 0)
1470 log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
1471 }
1472
1473 return 0;
1474 }
1475
1476 int unit_attach_pids_to_cgroup(Unit *u) {
1477 int r;
1478 assert(u);
1479
1480 r = unit_realize_cgroup(u);
1481 if (r < 0)
1482 return r;
1483
1484 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
1485 if (r < 0)
1486 return r;
1487
1488 return 0;
1489 }
1490
1491 static void cgroup_xattr_apply(Unit *u) {
1492 char ids[SD_ID128_STRING_MAX];
1493 int r;
1494
1495 assert(u);
1496
1497 if (!MANAGER_IS_SYSTEM(u->manager))
1498 return;
1499
1500 if (sd_id128_is_null(u->invocation_id))
1501 return;
1502
1503 r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1504 "trusted.invocation_id",
1505 sd_id128_to_string(u->invocation_id, ids), 32,
1506 0);
1507 if (r < 0)
1508 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
1509 }
1510
1511 static bool unit_has_mask_realized(
1512 Unit *u,
1513 CGroupMask target_mask,
1514 CGroupMask enable_mask,
1515 bool needs_bpf) {
1516
1517 assert(u);
1518
1519 return u->cgroup_realized &&
1520 u->cgroup_realized_mask == target_mask &&
1521 u->cgroup_enabled_mask == enable_mask &&
1522 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1523 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
1524 }
1525
1526 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1527 assert(u);
1528
1529 if (u->in_cgroup_realize_queue)
1530 return;
1531
1532 LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1533 u->in_cgroup_realize_queue = true;
1534 }
1535
1536 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1537 assert(u);
1538
1539 if (!u->in_cgroup_realize_queue)
1540 return;
1541
1542 LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1543 u->in_cgroup_realize_queue = false;
1544 }
1545
1546
1547 /* Check if necessary controllers and attributes for a unit are in place.
1548 *
1549 * If so, do nothing.
1550 * If not, create paths, move processes over, and set attributes.
1551 *
1552 * Returns 0 on success and < 0 on failure. */
1553 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1554 CGroupMask target_mask, enable_mask;
1555 bool needs_bpf, apply_bpf;
1556 int r;
1557
1558 assert(u);
1559
1560 unit_remove_from_cgroup_realize_queue(u);
1561
1562 target_mask = unit_get_target_mask(u);
1563 enable_mask = unit_get_enable_mask(u);
1564 needs_bpf = unit_get_needs_bpf(u);
1565
1566 if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1567 return 0;
1568
1569 /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1570 * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1571 * this will trickle down properly to cgroupfs. */
1572 apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1573
1574 /* First, realize parents */
1575 if (UNIT_ISSET(u->slice)) {
1576 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1577 if (r < 0)
1578 return r;
1579 }
1580
1581 /* And then do the real work */
1582 r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1583 if (r < 0)
1584 return r;
1585
1586 /* Finally, apply the necessary attributes. */
1587 cgroup_context_apply(u, target_mask, apply_bpf, state);
1588 cgroup_xattr_apply(u);
1589
1590 return 0;
1591 }
1592
1593 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1594 ManagerState state;
1595 unsigned n = 0;
1596 Unit *i;
1597 int r;
1598
1599 assert(m);
1600
1601 state = manager_state(m);
1602
1603 while ((i = m->cgroup_realize_queue)) {
1604 assert(i->in_cgroup_realize_queue);
1605
1606 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1607 /* Maybe things changed, and the unit is not actually active anymore? */
1608 unit_remove_from_cgroup_realize_queue(i);
1609 continue;
1610 }
1611
1612 r = unit_realize_cgroup_now(i, state);
1613 if (r < 0)
1614 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1615
1616 n++;
1617 }
1618
1619 return n;
1620 }
1621
1622 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1623 Unit *slice;
1624
1625 /* This adds the siblings of the specified unit and the
1626 * siblings of all parent units to the cgroup queue. (But
1627 * neither the specified unit itself nor the parents.) */
1628
1629 while ((slice = UNIT_DEREF(u->slice))) {
1630 Iterator i;
1631 Unit *m;
1632 void *v;
1633
1634 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1635 if (m == u)
1636 continue;
1637
1638 /* Skip units that have a dependency on the slice
1639 * but aren't actually in it. */
1640 if (UNIT_DEREF(m->slice) != slice)
1641 continue;
1642
1643 /* No point in doing cgroup application for units
1644 * without active processes. */
1645 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1646 continue;
1647
1648 /* If the unit doesn't need any new controllers
1649 * and has current ones realized, it doesn't need
1650 * any changes. */
1651 if (unit_has_mask_realized(m,
1652 unit_get_target_mask(m),
1653 unit_get_enable_mask(m),
1654 unit_get_needs_bpf(m)))
1655 continue;
1656
1657 unit_add_to_cgroup_realize_queue(m);
1658 }
1659
1660 u = slice;
1661 }
1662 }
1663
1664 int unit_realize_cgroup(Unit *u) {
1665 assert(u);
1666
1667 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1668 return 0;
1669
1670 /* So, here's the deal: when realizing the cgroups for this
1671 * unit, we need to first create all parents, but there's more
1672 * actually: for the weight-based controllers we also need to
1673 * make sure that all our siblings (i.e. units that are in the
1674 * same slice as we are) have cgroups, too. Otherwise, things
1675 * would become very uneven as each of their processes would
1676 * get as much resources as all our group together. This call
1677 * will synchronously create the parent cgroups, but will
1678 * defer work on the siblings to the next event loop
1679 * iteration. */
1680
1681 /* Add all sibling slices to the cgroup queue. */
1682 unit_add_siblings_to_cgroup_realize_queue(u);
1683
1684 /* And realize this one now (and apply the values) */
1685 return unit_realize_cgroup_now(u, manager_state(u->manager));
1686 }
1687
1688 void unit_release_cgroup(Unit *u) {
1689 assert(u);
1690
1691 /* Forgets all cgroup details for this cgroup */
1692
1693 if (u->cgroup_path) {
1694 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1695 u->cgroup_path = mfree(u->cgroup_path);
1696 }
1697
1698 if (u->cgroup_inotify_wd >= 0) {
1699 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1700 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1701
1702 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1703 u->cgroup_inotify_wd = -1;
1704 }
1705 }
1706
1707 void unit_prune_cgroup(Unit *u) {
1708 int r;
1709 bool is_root_slice;
1710
1711 assert(u);
1712
1713 /* Removes the cgroup, if empty and possible, and stops watching it. */
1714
1715 if (!u->cgroup_path)
1716 return;
1717
1718 (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1719
1720 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1721
1722 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1723 if (r < 0) {
1724 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1725 return;
1726 }
1727
1728 if (is_root_slice)
1729 return;
1730
1731 unit_release_cgroup(u);
1732
1733 u->cgroup_realized = false;
1734 u->cgroup_realized_mask = 0;
1735 u->cgroup_enabled_mask = 0;
1736 }
1737
1738 int unit_search_main_pid(Unit *u, pid_t *ret) {
1739 _cleanup_fclose_ FILE *f = NULL;
1740 pid_t pid = 0, npid, mypid;
1741 int r;
1742
1743 assert(u);
1744 assert(ret);
1745
1746 if (!u->cgroup_path)
1747 return -ENXIO;
1748
1749 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1750 if (r < 0)
1751 return r;
1752
1753 mypid = getpid_cached();
1754 while (cg_read_pid(f, &npid) > 0) {
1755 pid_t ppid;
1756
1757 if (npid == pid)
1758 continue;
1759
1760 /* Ignore processes that aren't our kids */
1761 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1762 continue;
1763
1764 if (pid != 0)
1765 /* Dang, there's more than one daemonized PID
1766 in this group, so we don't know what process
1767 is the main process. */
1768
1769 return -ENODATA;
1770
1771 pid = npid;
1772 }
1773
1774 *ret = pid;
1775 return 0;
1776 }
1777
1778 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1779 _cleanup_closedir_ DIR *d = NULL;
1780 _cleanup_fclose_ FILE *f = NULL;
1781 int ret = 0, r;
1782
1783 assert(u);
1784 assert(path);
1785
1786 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1787 if (r < 0)
1788 ret = r;
1789 else {
1790 pid_t pid;
1791
1792 while ((r = cg_read_pid(f, &pid)) > 0) {
1793 r = unit_watch_pid(u, pid);
1794 if (r < 0 && ret >= 0)
1795 ret = r;
1796 }
1797
1798 if (r < 0 && ret >= 0)
1799 ret = r;
1800 }
1801
1802 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1803 if (r < 0) {
1804 if (ret >= 0)
1805 ret = r;
1806 } else {
1807 char *fn;
1808
1809 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1810 _cleanup_free_ char *p = NULL;
1811
1812 p = strjoin(path, "/", fn);
1813 free(fn);
1814
1815 if (!p)
1816 return -ENOMEM;
1817
1818 r = unit_watch_pids_in_path(u, p);
1819 if (r < 0 && ret >= 0)
1820 ret = r;
1821 }
1822
1823 if (r < 0 && ret >= 0)
1824 ret = r;
1825 }
1826
1827 return ret;
1828 }
1829
1830 int unit_watch_all_pids(Unit *u) {
1831 int r;
1832
1833 assert(u);
1834
1835 /* Adds all PIDs from our cgroup to the set of PIDs we
1836 * watch. This is a fallback logic for cases where we do not
1837 * get reliable cgroup empty notifications: we try to use
1838 * SIGCHLD as replacement. */
1839
1840 if (!u->cgroup_path)
1841 return -ENOENT;
1842
1843 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1844 if (r < 0)
1845 return r;
1846 if (r > 0) /* On unified we can use proper notifications */
1847 return 0;
1848
1849 return unit_watch_pids_in_path(u, u->cgroup_path);
1850 }
1851
1852 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
1853 Manager *m = userdata;
1854 Unit *u;
1855 int r;
1856
1857 assert(s);
1858 assert(m);
1859
1860 u = m->cgroup_empty_queue;
1861 if (!u)
1862 return 0;
1863
1864 assert(u->in_cgroup_empty_queue);
1865 u->in_cgroup_empty_queue = false;
1866 LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
1867
1868 if (m->cgroup_empty_queue) {
1869 /* More stuff queued, let's make sure we remain enabled */
1870 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
1871 if (r < 0)
1872 log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
1873 }
1874
1875 unit_add_to_gc_queue(u);
1876
1877 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1878 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1879
1880 return 0;
1881 }
1882
1883 void unit_add_to_cgroup_empty_queue(Unit *u) {
1884 int r;
1885
1886 assert(u);
1887
1888 /* Note that there are four different ways how cgroup empty events reach us:
1889 *
1890 * 1. On the unified hierarchy we get an inotify event on the cgroup
1891 *
1892 * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
1893 *
1894 * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
1895 *
1896 * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
1897 * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
1898 *
1899 * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
1900 * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
1901 * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
1902          * (which might happen if the cgroup doesn't contain processes that are our own children, which is typically the
1903 * case for scope units). */
1904
1905 if (u->in_cgroup_empty_queue)
1906 return;
1907
1908 /* Let's verify that the cgroup is really empty */
1909 if (!u->cgroup_path)
1910 return;
1911 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1912 if (r < 0) {
1913 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
1914 return;
1915 }
1916 if (r == 0)
1917 return;
1918
1919 LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
1920 u->in_cgroup_empty_queue = true;
1921
1922 /* Trigger the defer event */
1923 r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
1924 if (r < 0)
1925 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
1926 }
1927
1928 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1929 Manager *m = userdata;
1930
1931 assert(s);
1932 assert(fd >= 0);
1933 assert(m);
1934
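                 /* Drain the inotify fd completely: keep reading event batches until read() reports EAGAIN (or is interrupted). */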
1935 for (;;) {
1936 union inotify_event_buffer buffer;
1937 struct inotify_event *e;
1938 ssize_t l;
1939
1940 l = read(fd, &buffer, sizeof(buffer));
1941 if (l < 0) {
1942 if (IN_SET(errno, EINTR, EAGAIN))
1943 return 0;
1944
1945 return log_error_errno(errno, "Failed to read control group inotify events: %m");
1946 }
1947
1948 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1949 Unit *u;
1950
1951 if (e->wd < 0)
1952 /* Queue overflow has no watch descriptor */
1953 continue;
1954
1955 if (e->mask & IN_IGNORED)
1956 /* The watch was just removed */
1957 continue;
1958
1959 u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1960                         if (!u) /* Note that inotify might deliver
1961 * events for a watch even after it
1962 * was removed, because it was queued
1963 * before the removal. Let's ignore
1964 * this here safely. */
1965 continue;
1966
1967 unit_add_to_cgroup_empty_queue(u);
1968 }
1969 }
1970 }
1971
1972 int manager_setup_cgroup(Manager *m) {
1973 _cleanup_free_ char *path = NULL;
1974 const char *scope_path;
1975 CGroupController c;
1976 int r, all_unified;
1977 char *e;
1978
1979 assert(m);
1980
1981 /* 1. Determine hierarchy */
1982 m->cgroup_root = mfree(m->cgroup_root);
1983 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
1984 if (r < 0)
1985 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
1986
1987 /* Chop off the init scope, if we are already located in it */
1988 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1989
1990 /* LEGACY: Also chop off the system slice if we are in
1991 * it. This is to support live upgrades from older systemd
1992 * versions where PID 1 was moved there. Also see
1993 * cg_get_root_path(). */
1994 if (!e && MANAGER_IS_SYSTEM(m)) {
1995 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
1996 if (!e)
1997 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
1998 }
1999 if (e)
2000 *e = 0;
2001
2002 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2003 * easily prepend it everywhere. */
2004 delete_trailing_chars(m->cgroup_root, "/");
2005
2006 /* 2. Show data */
2007 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2008 if (r < 0)
2009 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2010
2011 r = cg_unified_flush();
2012 if (r < 0)
2013 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2014
2015 all_unified = cg_all_unified();
2016 if (all_unified < 0)
2017 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2018 if (all_unified > 0)
2019 log_debug("Unified cgroup hierarchy is located at %s.", path);
2020 else {
2021 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2022 if (r < 0)
2023 return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2024 if (r > 0)
2025 log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2026 else
2027 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2028 }
2029
2030 /* 3. Allocate cgroup empty defer event source */
2031 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2032 r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2033 if (r < 0)
2034 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2035
2036 r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2037 if (r < 0)
2038 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2039
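         /* Leave the defer source disabled for now; unit_add_to_cgroup_empty_queue() switches it to ONESHOT whenever work gets queued. */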
2040 r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2041 if (r < 0)
2042 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2043
2044 (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2045
2046 /* 4. Install notifier inotify object, or agent */
2047 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2048
2049 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2050
2051 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2052 safe_close(m->cgroup_inotify_fd);
2053
2054 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2055 if (m->cgroup_inotify_fd < 0)
2056 return log_error_errno(errno, "Failed to create control group inotify object: %m");
2057
2058 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2059 if (r < 0)
2060 return log_error_errno(r, "Failed to watch control group inotify object: %m");
2061
2062 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2063 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2064 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2065 if (r < 0)
2066 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2067
2068 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2069
2070 } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2071
2072 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2073                  * since it does not generate events when control groups with children run empty.) */
2074
2075 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2076 if (r < 0)
2077 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2078 else if (r > 0)
2079 log_debug("Installed release agent.");
2080 else if (r == 0)
2081 log_debug("Release agent already installed.");
2082 }
2083
2084 /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2085 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2086 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2087 if (r < 0)
2088 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2089
2090 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2091 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2092 if (r < 0)
2093 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2094
2095 /* 6. And pin it, so that it cannot be unmounted */
2096 safe_close(m->pin_cgroupfs_fd);
2097 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2098 if (m->pin_cgroupfs_fd < 0)
2099 return log_error_errno(errno, "Failed to open pin file: %m");
2100
2101 /* 7. Always enable hierarchical support if it exists... */
2102 if (!all_unified && m->test_run_flags == 0)
2103 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2104
2105 /* 8. Figure out which controllers are supported, and log about it */
2106 r = cg_mask_supported(&m->cgroup_supported);
2107 if (r < 0)
2108 return log_error_errno(r, "Failed to determine supported controllers: %m");
2109 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2110 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
2111
2112 return 0;
2113 }
2114
2115 void manager_shutdown_cgroup(Manager *m, bool delete) {
2116 assert(m);
2117
2118 /* We can't really delete the group, since we are in it. But
2119 * let's trim it. */
2120 if (delete && m->cgroup_root)
2121 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2122
2123 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2124
2125 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2126
2127 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2128 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2129
2130 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2131
2132 m->cgroup_root = mfree(m->cgroup_root);
2133 }
2134
2135 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2136 char *p;
2137 Unit *u;
2138
2139 assert(m);
2140 assert(cgroup);
2141
2142 u = hashmap_get(m->cgroup_unit, cgroup);
2143 if (u)
2144 return u;
2145
2146 p = strdupa(cgroup);
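                 /* No exact match, hence walk up the cgroup path one component at a time until some unit owns the prefix, falling back to the root slice. */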
2147 for (;;) {
2148 char *e;
2149
2150 e = strrchr(p, '/');
2151 if (!e || e == p)
2152 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2153
2154 *e = 0;
2155
2156 u = hashmap_get(m->cgroup_unit, p);
2157 if (u)
2158 return u;
2159 }
2160 }
2161
2162 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2163 _cleanup_free_ char *cgroup = NULL;
2164 int r;
2165
2166 assert(m);
2167
2168 if (pid <= 0)
2169 return NULL;
2170
2171 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
2172 if (r < 0)
2173 return NULL;
2174
2175 return manager_get_unit_by_cgroup(m, cgroup);
2176 }
2177
2178 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2179 Unit *u;
2180
2181 assert(m);
2182
2183 if (pid <= 0)
2184 return NULL;
2185
2186 if (pid == getpid_cached())
2187 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2188
2189 u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
2190 if (u)
2191 return u;
2192
2193 u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
2194 if (u)
2195 return u;
2196
2197 return manager_get_unit_by_pid_cgroup(m, pid);
2198 }
2199
2200 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2201 Unit *u;
2202
2203 assert(m);
2204 assert(cgroup);
2205
2206 /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2207 * or from the --system instance */
2208
2209 log_debug("Got cgroup empty notification for: %s", cgroup);
2210
2211 u = manager_get_unit_by_cgroup(m, cgroup);
2212 if (!u)
2213 return 0;
2214
2215 unit_add_to_cgroup_empty_queue(u);
2216 return 1;
2217 }
2218
2219 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2220 _cleanup_free_ char *v = NULL;
2221 int r;
2222
2223 assert(u);
2224 assert(ret);
2225
2226 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2227 return -ENODATA;
2228
2229 if (!u->cgroup_path)
2230 return -ENODATA;
2231
2232 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2233 return -ENODATA;
2234
2235 r = cg_all_unified();
2236 if (r < 0)
2237 return r;
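                 /* The unified hierarchy exposes this counter as memory.current, the legacy memory controller as memory.usage_in_bytes. */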
2238 if (r > 0)
2239 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2240 else
2241 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2242 if (r == -ENOENT)
2243 return -ENODATA;
2244 if (r < 0)
2245 return r;
2246
2247 return safe_atou64(v, ret);
2248 }
2249
2250 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2251 _cleanup_free_ char *v = NULL;
2252 int r;
2253
2254 assert(u);
2255 assert(ret);
2256
2257 if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2258 return -ENODATA;
2259
2260 if (!u->cgroup_path)
2261 return -ENODATA;
2262
2263 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2264 return -ENODATA;
2265
2266 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2267 if (r == -ENOENT)
2268 return -ENODATA;
2269 if (r < 0)
2270 return r;
2271
2272 return safe_atou64(v, ret);
2273 }
2274
2275 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2276 _cleanup_free_ char *v = NULL;
2277 uint64_t ns;
2278 int r;
2279
2280 assert(u);
2281 assert(ret);
2282
2283 if (!u->cgroup_path)
2284 return -ENODATA;
2285
2286 r = cg_all_unified();
2287 if (r < 0)
2288 return r;
2289 if (r > 0) {
2290 const char *keys[] = { "usage_usec", NULL };
2291 _cleanup_free_ char *val = NULL;
2292 uint64_t us;
2293
2294 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2295 return -ENODATA;
2296
2297 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
2298 if (r < 0)
2299 return r;
2300
2301 r = safe_atou64(val, &us);
2302 if (r < 0)
2303 return r;
2304
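                         /* cpu.stat reports usage_usec in microseconds, hence convert to nanoseconds to match the legacy cpuacct.usage value read below. */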
2305 ns = us * NSEC_PER_USEC;
2306 } else {
2307 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2308 return -ENODATA;
2309
2310 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2311 if (r == -ENOENT)
2312 return -ENODATA;
2313 if (r < 0)
2314 return r;
2315
2316 r = safe_atou64(v, &ns);
2317 if (r < 0)
2318 return r;
2319 }
2320
2321 *ret = ns;
2322 return 0;
2323 }
2324
2325 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2326 nsec_t ns;
2327 int r;
2328
2329 assert(u);
2330
2331 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2332 * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2333 * call this function with a NULL return value. */
2334
2335 if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2336 return -ENODATA;
2337
2338 r = unit_get_cpu_usage_raw(u, &ns);
2339 if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2340 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2341 * cached value. */
2342
2343 if (ret)
2344 *ret = u->cpu_usage_last;
2345 return 0;
2346 }
2347 if (r < 0)
2348 return r;
2349
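         /* Subtract the base counter sampled when accounting was last reset, clamping at zero so the unsigned subtraction cannot wrap around. */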
2350 if (ns > u->cpu_usage_base)
2351 ns -= u->cpu_usage_base;
2352 else
2353 ns = 0;
2354
2355 u->cpu_usage_last = ns;
2356 if (ret)
2357 *ret = ns;
2358
2359 return 0;
2360 }
2361
2362 int unit_get_ip_accounting(
2363 Unit *u,
2364 CGroupIPAccountingMetric metric,
2365 uint64_t *ret) {
2366
2367 uint64_t value;
2368 int fd, r;
2369
2370 assert(u);
2371 assert(metric >= 0);
2372 assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2373 assert(ret);
2374
2375 /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
2376          * inner cgroup nodes with no processes directly attached, so their counters would be zero
2377          * anyway. And if we refuse this now, we can still open it up later, should the kernel learn recursive BPF cgroup
2378 * filters. */
2379 if (u->type == UNIT_SLICE)
2380 return -ENODATA;
2381
2382 if (!UNIT_CGROUP_BOOL(u, ip_accounting))
2383 return -ENODATA;
2384
2385 fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2386 u->ip_accounting_ingress_map_fd :
2387 u->ip_accounting_egress_map_fd;
2388 if (fd < 0)
2389 return -ENODATA;
2390
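         /* bpf_firewall_read_accounting() returns the byte counter via its second argument and the packet counter via its third, hence pick whichever matches the requested metric. */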
2391 if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2392 r = bpf_firewall_read_accounting(fd, &value, NULL);
2393 else
2394 r = bpf_firewall_read_accounting(fd, NULL, &value);
2395 if (r < 0)
2396 return r;
2397
2398 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2399 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2400 * ip_accounting_extra[] field, and add them in here transparently. */
2401
2402 *ret = value + u->ip_accounting_extra[metric];
2403
2404 return r;
2405 }
2406
2407 int unit_reset_cpu_accounting(Unit *u) {
2408 nsec_t ns;
2409 int r;
2410
2411 assert(u);
2412
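         /* NSEC_INFINITY marks the cached value as invalid; unit_get_cpu_usage() falls back to u->cpu_usage_last only when it is set to something else. */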
2413 u->cpu_usage_last = NSEC_INFINITY;
2414
2415 r = unit_get_cpu_usage_raw(u, &ns);
2416 if (r < 0) {
2417 u->cpu_usage_base = 0;
2418 return r;
2419 }
2420
2421 u->cpu_usage_base = ns;
2422 return 0;
2423 }
2424
2425 int unit_reset_ip_accounting(Unit *u) {
2426 int r = 0, q = 0;
2427
2428 assert(u);
2429
2430 if (u->ip_accounting_ingress_map_fd >= 0)
2431 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2432
2433 if (u->ip_accounting_egress_map_fd >= 0)
2434 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2435
2436 zero(u->ip_accounting_extra);
2437
2438 return r < 0 ? r : q;
2439 }
2440
2441 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2442 assert(u);
2443
2444 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2445 return;
2446
2447 if (m == 0)
2448 return;
2449
2450 /* always invalidate compat pairs together */
2451 if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2452 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2453
2454 if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2455 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2456
2457 if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2458 return;
2459
2460 u->cgroup_realized_mask &= ~m;
2461 unit_add_to_cgroup_realize_queue(u);
2462 }
2463
2464 void unit_invalidate_cgroup_bpf(Unit *u) {
2465 assert(u);
2466
2467 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2468 return;
2469
2470 if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2471 return;
2472
2473 u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2474 unit_add_to_cgroup_realize_queue(u);
2475
2476                 /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
2477 * list of our children includes our own. */
2478 if (u->type == UNIT_SLICE) {
2479 Unit *member;
2480 Iterator i;
2481 void *v;
2482
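                         /* Our member units are ordered after us, hence scan the UNIT_BEFORE dependencies and recurse into every unit whose slice is this one. */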
2483 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2484 if (member == u)
2485 continue;
2486
2487 if (UNIT_DEREF(member->slice) != u)
2488 continue;
2489
2490 unit_invalidate_cgroup_bpf(member);
2491 }
2492 }
2493 }
2494
2495 void manager_invalidate_startup_units(Manager *m) {
2496 Iterator i;
2497 Unit *u;
2498
2499 assert(m);
2500
2501 SET_FOREACH(u, m->startup_units, i)
2502 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2503 }
2504
2505 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2506 [CGROUP_AUTO] = "auto",
2507 [CGROUP_CLOSED] = "closed",
2508 [CGROUP_STRICT] = "strict",
2509 };
2510
2511 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);