/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "alloc-util.h"
#include "blockdev-util.h"
#include "bpf-firewall.h"
#include "bus-error.h"
#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "procfs-util.h"
#include "special.h"
#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"

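/* The scheduling period systemd assumes for CPU quota handling: 100 ms. A unit's
 * CPUQuotaPerSecUSec value (usec of CPU time per second of wall clock) is rescaled to this
 * period below; e.g. CPUQuota=50% becomes "50000 100000" in cpu.max on the unified
 * hierarchy, or cpu.cfs_quota_us=50000 against cpu.cfs_period_us=100000 on the legacy one. */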
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

bool unit_has_root_cgroup(Unit *u) {
        assert(u);

        /* Returns whether this unit manages the root cgroup. Note that this is different from being named "-.slice",
         * as inside of containers the root slice won't be identical to the root cgroup. */

        if (!u->cgroup_path)
                return false;

        return isempty(u->cgroup_path) || path_equal(u->cgroup_path, "/");
}

static void cgroup_compat_warn(void) {
        static bool cgroup_compat_warned = false;

        if (cgroup_compat_warned)
                return;

        log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
        cgroup_compat_warned = true;
}

#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_weight = CGROUP_WEIGHT_INVALID;
        c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;

        c->memory_high = CGROUP_LIMIT_MAX;
        c->memory_max = CGROUP_LIMIT_MAX;
        c->memory_swap_max = CGROUP_LIMIT_MAX;

        c->memory_limit = CGROUP_LIMIT_MAX;

        c->io_weight = CGROUP_WEIGHT_INVALID;
        c->startup_io_weight = CGROUP_WEIGHT_INVALID;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->io_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
        assert(c);
        assert(l);

        LIST_REMOVE(device_limits, c->io_device_limits, l);
        free(l->path);
        free(l);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->io_device_weights)
                cgroup_context_free_io_device_weight(c, c->io_device_weights);

        while (c->io_device_limits)
                cgroup_context_free_io_device_limit(c, c->io_device_limits);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);

        c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
        c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupIODeviceLimit *il;
        CGroupIODeviceWeight *iw;
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        IPAddressAccessItem *iaai;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sIOAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sIPAccounting=%s\n"
                "%sCPUWeight=%" PRIu64 "\n"
                "%sStartupCPUWeight=%" PRIu64 "\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sIOWeight=%" PRIu64 "\n"
                "%sStartupIOWeight=%" PRIu64 "\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLow=%" PRIu64 "\n"
                "%sMemoryHigh=%" PRIu64 "\n"
                "%sMemoryMax=%" PRIu64 "\n"
                "%sMemorySwapMax=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->io_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, yes_no(c->ip_accounting),
                prefix, c->cpu_weight,
                prefix, c->startup_cpu_weight,
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->io_weight,
                prefix, c->startup_io_weight,
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_low,
                prefix, c->memory_high,
                prefix, c->memory_max,
                prefix, c->memory_swap_max,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        if (c->delegate) {
                _cleanup_free_ char *t = NULL;

                (void) cg_mask_to_string(c->delegate_controllers, &t);

                fprintf(f, "%sDelegateControllers=%s\n",
                        prefix,
                        strempty(t));
        }

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, iw, c->io_device_weights)
                fprintf(f,
                        "%sIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        iw->path,
                        iw->weight);

        LIST_FOREACH(device_limits, il, c->io_device_limits) {
                char buf[FORMAT_BYTES_MAX];
                CGroupIOLimitType type;

                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                        if (il->limits[type] != cgroup_io_limit_defaults[type])
                                fprintf(f,
                                        "%s%s=%s %s\n",
                                        prefix,
                                        cgroup_io_limit_type_to_string(type),
                                        il->path,
                                        format_bytes(buf, sizeof(buf), il->limits[type]));
        }

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                if (b->rbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOReadBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->rbps));
                if (b->wbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOWriteBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->wbps));
        }

        LIST_FOREACH(items, iaai, c->ip_address_allow) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }

        LIST_FOREACH(items, iaai, c->ip_address_deny) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }
}

static int lookup_block_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                (void) block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        bool ignore_notfound;
        int r;

        assert(path);
        assert(acc);

        if (node[0] == '-') {
                /* Non-existent paths starting with "-" must be silently ignored */
                node++;
                ignore_notfound = true;
        } else
                ignore_notfound = false;

        if (stat(node, &st) < 0) {
                if (errno == ENOENT && ignore_notfound)
                        return 0;

                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}

static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(IN_SET(type, 'b', 'c'));

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        return log_warning_errno(errno, "Failed to read /proc/devices: %m");
}

static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
        return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
}

static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
        return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
                c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
}

static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_cpu_weight;
        else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->cpu_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->startup_cpu_shares;
        else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->cpu_shares;
        else
                return CGROUP_CPU_SHARES_DEFAULT;
}

static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", weight);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.weight: %m");

        if (quota != USEC_INFINITY)
                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
                         quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
        else
                xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);

        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.max: %m");
}

static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", shares);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.shares: %m");

        xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_period_us: %m");

        if (quota != USEC_INFINITY) {
                xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
        } else
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_quota_us: %m");
}

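/* Legacy CPUShares (default 1024, range [2, 262144]) and unified CPUWeight (default 100,
 * range [1, 10000]) are mapped onto each other linearly around their defaults, with
 * clamping to the target range: shares=1024 maps to weight=100, shares=512 to weight=50,
 * while an extreme value such as shares=262144 saturates at weight=10000. */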
static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
        return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
        return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
}

static bool cgroup_context_has_io_config(CGroupContext *c) {
        return c->io_accounting ||
                c->io_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
                c->io_device_weights ||
                c->io_device_limits;
}

static bool cgroup_context_has_blockio_config(CGroupContext *c) {
        return c->blockio_accounting ||
                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->blockio_device_weights ||
                c->blockio_device_bandwidths;
}

static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_io_weight;
        else if (c->io_weight != CGROUP_WEIGHT_INVALID)
                return c->io_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->startup_blockio_weight;
        else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->blockio_weight;
        else
                return CGROUP_BLKIO_WEIGHT_DEFAULT;
}

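/* The same linear mapping, this time between the legacy blkio weight (default 500, range
 * [10, 1000]) and the unified io weight (default 100, range [1, 10000]): blkio weight 500
 * corresponds to io weight 100, and blkio weight 250 to io weight 50. */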
static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
        return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
        return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
}

static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
        r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.weight: %m");
}

static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.weight_device: %m");
}

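/* Serializes the four per-device limits into a single io.max line of the form
 * "MAJOR:MINOR rbps=... wbps=... riops=... wiops=...", e.g. "8:0 rbps=1000000 wbps=max
 * riops=max wiops=max" for a 1 MB/s read limit on a device with major:minor 8:0. Returns
 * how many of the limits differ from the defaults, so the caller can free entries that no
 * longer carry any configuration. */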
static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        CGroupIOLimitType type;
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
                if (limits[type] != cgroup_io_limit_defaults[type]) {
                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
                        n++;
                } else {
                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
                }
        }

        xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
        r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.max: %m");
        return n;
}

static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        if (rbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.read_bps_device: %m");

        if (wbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.write_bps_device: %m");

        return n;
}

static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
        return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
}

static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
        int r;

        if (v != CGROUP_LIMIT_MAX)
                xsprintf(buf, "%" PRIu64 "\n", v);

        r = cg_set_attribute("memory", u->cgroup_path, file, buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set %s: %m", file);
}

static void cgroup_apply_firewall(Unit *u) {
        int r;

        assert(u);

        if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
                                    * not recursive we don't ever touch the bpf on them */
                return;

        r = bpf_firewall_compile(u);
        if (r < 0)
                return;

        (void) bpf_firewall_install(u);
}

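/* Applies all cgroup attributes selected in apply_mask (and the BPF firewall, if apply_bpf
 * is set) to the unit's cgroup, translating unified-hierarchy settings to their legacy
 * equivalents and vice versa where only the other hierarchy is available. */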
static void cgroup_context_apply(
                Unit *u,
                CGroupMask apply_mask,
                bool apply_bpf,
                ManagerState state) {

        const char *path;
        CGroupContext *c;
        bool is_root;
        int r;

        assert(u);

        /* Nothing to do? Exit early! */
        if (apply_mask == 0 && !apply_bpf)
                return;

        /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
        is_root = unit_has_root_cgroup(u);

        assert_se(c = unit_get_cgroup_context(u));
        assert_se(path = u->cgroup_path);

        if (is_root) /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
                bool has_weight, has_shares;

                has_weight = cgroup_context_has_cpu_weight(c);
                has_shares = cgroup_context_has_cpu_shares(c);

                if (cg_all_unified() > 0) {
                        uint64_t weight;

                        if (has_weight)
                                weight = cgroup_context_cpu_weight(c, state);
                        else if (has_shares) {
                                uint64_t shares = cgroup_context_cpu_shares(c, state);

                                weight = cgroup_cpu_shares_to_weight(shares);

                                log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
                                                  shares, weight, path);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
                } else {
                        uint64_t shares;

                        if (has_weight) {
                                uint64_t weight = cgroup_context_cpu_weight(c, state);

                                shares = cgroup_cpu_weight_to_shares(weight);

                                log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
                                                  weight, shares, path);
                        } else if (has_shares)
                                shares = cgroup_context_cpu_shares(c, state);
                        else
                                shares = CGROUP_CPU_SHARES_DEFAULT;

                        cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
                }
        }

        if (apply_mask & CGROUP_MASK_IO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io)
                                weight = cgroup_context_io_weight(c, state);
                        else if (has_blockio) {
                                uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);

                                weight = cgroup_weight_blkio_to_io(blkio_weight);

                                log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
                                                  blkio_weight, weight);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        xsprintf(buf, "default %" PRIu64 "\n", weight);
                        r = cg_set_attribute("io", path, "io.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set io.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights)
                                        cgroup_apply_io_device_weight(u, w->path, w->weight);
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                        weight = cgroup_weight_blkio_to_io(w->weight);

                                        log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_io_device_weight(u, w->path, weight);
                                }
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
                                uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
                                CGroupIOLimitType type;

                                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                                        limits[type] = cgroup_io_limit_defaults[type];

                                limits[CGROUP_IO_RBPS_MAX] = b->rbps;
                                limits[CGROUP_IO_WBPS_MAX] = b->wbps;

                                log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
                                                  b->rbps, b->wbps, b->path);

                                if (!cgroup_apply_io_device_limit(u, b->path, limits))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                        }
                }
        }

        if (apply_mask & CGROUP_MASK_BLKIO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io) {
                                uint64_t io_weight = cgroup_context_io_weight(c, state);

                                weight = cgroup_weight_io_to_blkio(io_weight);

                                log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
                                                  io_weight, weight);
                        } else if (has_blockio)
                                weight = cgroup_context_blkio_weight(c, state);
                        else
                                weight = CGROUP_BLKIO_WEIGHT_DEFAULT;

                        xsprintf(buf, "%" PRIu64 "\n", weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set blkio.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights) {
                                        weight = cgroup_weight_io_to_blkio(w->weight);

                                        log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_blkio_device_weight(u, w->path, weight);
                                }
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                                        cgroup_apply_blkio_device_weight(u, w->path, w->weight);
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
                                                  l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);

                                if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
                                if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                }
        }

        if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (cg_all_unified() > 0) {
                        uint64_t max, swap_max = CGROUP_LIMIT_MAX;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                max = c->memory_max;
                                swap_max = c->memory_swap_max;
                        } else {
                                max = c->memory_limit;

                                if (max != CGROUP_LIMIT_MAX)
                                        log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
                        }

                        cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
                        cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
                        cgroup_apply_unified_memory_limit(u, "memory.max", max);
                        cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
                } else {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];
                        uint64_t val;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                val = c->memory_max;
                                log_cgroup_compat(u, "Applying MemoryMax %" PRIu64 " as MemoryLimit", val);
                        } else
                                val = c->memory_limit;

                        if (val == CGROUP_LIMIT_MAX)
                                strncpy(buf, "-1\n", sizeof(buf));
                        else
                                xsprintf(buf, "%" PRIu64 "\n", val);

                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set memory.limit_in_bytes: %m");
                }
        }

        if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to reset devices.list: %m");

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/ptmx\0" "rwm\0"
                                /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
                                "-/run/systemd/inaccessible/chr\0" "rwm\0"
                                "-/run/systemd/inaccessible/blk\0" "rwm\0";

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        /* PTS (/dev/pts) devices may not be duplicated, but accessed */
                        whitelist_major(path, "pts", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4], *val;
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (path_startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if ((val = startswith(a->path, "block-")))
                                whitelist_major(path, val, 'b', acc);
                        else if ((val = startswith(a->path, "char-")))
                                whitelist_major(path, val, 'c', acc);
                        else
                                log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if (apply_mask & CGROUP_MASK_PIDS) {

                if (is_root) {
                        /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
                         * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
                         * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
                         * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
                         * exclusive ownership of the sysctls, but we still want to honour things if the user sets
                         * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
                         * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
                         * it also counts. But if the user never set a limit through us (i.e. we are the default of
                         * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
                         * the first time we set a limit. Note that this boolean is flushed out on manager reload,
                         * which is desirable so that there's an official way to release control of the sysctl from
                         * systemd: set the limit to unbounded and reload. */

                        if (c->tasks_max != CGROUP_LIMIT_MAX) {
                                u->manager->sysctl_pid_max_changed = true;
                                r = procfs_tasks_set_limit(c->tasks_max);
                        } else if (u->manager->sysctl_pid_max_changed)
                                r = procfs_tasks_set_limit(TASKS_MAX);
                        else
                                r = 0;

                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to write to tasks limit sysctls: %m");

                } else {
                        if (c->tasks_max != CGROUP_LIMIT_MAX) {
                                char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                                sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                                r = cg_set_attribute("pids", path, "pids.max", buf);
                        } else
                                r = cg_set_attribute("pids", path, "pids.max", "max");
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set pids.max: %m");
                }
        }

        if (apply_bpf)
                cgroup_apply_firewall(u);
}

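/* For example, a unit that only sets MemoryMax= and TasksMax= comes out of this as
 * CGROUP_MASK_MEMORY|CGROUP_MASK_PIDS. */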
CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            cgroup_context_has_cpu_weight(c) ||
            cgroup_context_has_cpu_shares(c) ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
                mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != CGROUP_LIMIT_MAX ||
            cgroup_context_has_unified_memory_config(c))
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != CGROUP_LIMIT_MAX)
                mask |= CGROUP_MASK_PIDS;

        return mask;
}

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
}

CGroupMask unit_get_delegate_mask(Unit *u) {
        CGroupContext *c;

        /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
         * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */

        if (!unit_cgroup_delegate(u))
                return 0;

        if (cg_all_unified() <= 0) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (e && !exec_context_maintains_privileges(e))
                        return 0;
        }

        assert_se(c = unit_get_cgroup_context(u));
        return c->delegate_controllers;
}

CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                void *v;
                Unit *member;
                Iterator i;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_subtree_mask(u); /* we are the top-level slice */
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

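/* BPF firewalling is needed if the unit itself configures IP accounting or an IP access
 * list, or if any of its ancestor slices configures an access list, since those lists
 * apply to the whole subtree but must be installed on the leaf cgroups. */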
bool unit_get_needs_bpf(Unit *u) {
        CGroupContext *c;
        Unit *p;
        assert(u);

        /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
         * moment. */
        if (u->type == UNIT_SLICE)
                return false;

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        if (c->ip_accounting ||
            c->ip_address_allow ||
            c->ip_address_deny)
                return true;

        /* If any parent slice has an IP access list defined, it applies too */
        for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
                c = unit_get_cgroup_context(p);
                if (!c)
                        return false;

                if (c->ip_address_allow ||
                    c->ip_address_deny)
                        return true;
        }

        return false;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {

        /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */

        while (u) {

                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

static const char *migrate_callback(CGroupMask mask, void *userdata) {
        return unit_get_realized_cgroup_path(userdata, mask);
}

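/* E.g. a unit "foo.service" in "app-bar.slice" maps to
 * "<cgroup-root>/app.slice/app-bar.slice/foo.service": cg_slice_to_path() expands the
 * dash-separated slice hierarchy and cg_escape() escapes the unit name where necessary. */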
char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/",
                               escaped);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped);
}

int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        } else
                p = NULL;

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 1;
}

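/* On the unified hierarchy a cgroup running empty is signalled through a change of the
 * "populated" field in its "cgroup.events" file, so an inotify watch on that file replaces
 * the release-agent notification mechanism of the legacy hierarchy. */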
int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *events = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}

int unit_pick_cgroup_path(Unit *u) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(u);

        if (u->cgroup_path)
                return 0;

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        path = unit_default_cgroup_path(u);
        if (!path)
                return log_oom();

        r = unit_set_cgroup_path(u, path);
        if (r == -EEXIST)
                return log_unit_error_errno(u, r, "Control group %s exists already.", path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);

        return 0;
}

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* Figure out our cgroup path */
        r = unit_pick_cgroup_path(u);
        if (r < 0)
                return r;

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;
        u->cgroup_enabled_mask = enable_mask;
        u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;

        if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

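/* Asks the system instance via D-Bus to move a process into this unit's cgroup, for user
 * managers that lack the privileges to write to the relevant cgroupfs files themselves. */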
static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
        char *pp;
        int r;

        assert(u);

        if (MANAGER_IS_SYSTEM(u->manager))
                return -EINVAL;

        if (!u->manager->system_bus)
                return -EIO;

        if (!u->cgroup_path)
                return -EINVAL;

        /* Determine this unit's cgroup path relative to our cgroup root */
        pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
        if (!pp)
                return -EINVAL;

        pp = strjoina("/", pp, suffix_path);
        path_kill_slashes(pp);

        r = sd_bus_call_method(u->manager->system_bus,
                               "org.freedesktop.systemd1",
                               "/org/freedesktop/systemd1",
                               "org.freedesktop.systemd1.Manager",
                               "AttachProcessesToUnit",
                               &error, NULL,
                               "ssau",
                               NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
        if (r < 0)
                return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
        CGroupMask delegated_mask;
        const char *p;
        Iterator i;
        void *pidp;
        int r, q;

        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        if (set_isempty(pids))
                return 0;

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        if (isempty(suffix_path))
                p = u->cgroup_path;
        else
                p = strjoina(u->cgroup_path, "/", suffix_path);

        delegated_mask = unit_get_delegate_mask(u);

        r = 0;
        SET_FOREACH(pidp, pids, i) {
                pid_t pid = PTR_TO_PID(pidp);
                CGroupController c;

                /* First, attach the PID to the main cgroup hierarchy */
                q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
                if (q < 0) {
                        log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p);

                        if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) {
                                int z;

                                /* If we are in a user instance, and we can't move the process ourselves due to
                                 * permission problems, let's ask the system instance about it instead. Since it's more
                                 * privileged it might be able to move the process across the leaves of a subtree whose
                                 * top node is not owned by us. */

                                z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
                                if (z < 0)
                                        log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p);
                                else
                                        continue; /* When this worked via the bus we are fully done for this PID. */
                        }

                        if (r >= 0)
                                r = q; /* Remember first error */

                        continue;
                }

                q = cg_all_unified();
                if (q < 0)
                        return q;
                if (q > 0)
                        continue;

                /* In the legacy hierarchy, attach the process to the requested cgroup if possible, and if not to the
                 * innermost realized one */

                for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                        CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                        const char *realized;

                        if (!(u->manager->cgroup_supported & bit))
                                continue;

                        /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
                        if (delegated_mask & u->cgroup_realized_mask & bit) {
                                q = cg_attach(cgroup_controller_to_string(c), p, pid);
                                if (q >= 0)
                                        continue; /* Success! */

                                log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
                                                     pid, p, cgroup_controller_to_string(c));
                        }

                        /* So this controller is either not delegated or not realized, or something else weird happened. In
                         * that case let's attach the PID at least to the closest cgroup up the tree that is
                         * realized. */
                        realized = unit_get_realized_cgroup_path(u, bit);
                        if (!realized)
                                continue; /* Not even realized in the root slice? Then let's not bother */

                        q = cg_attach(cgroup_controller_to_string(c), realized, pid);
                        if (q < 0)
                                log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
                                                     pid, realized, cgroup_controller_to_string(c));
                }
        }

        return r;
}

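/* Stamps the unit's invocation ID on its cgroup as the "trusted.invocation_id" extended
 * attribute, which allows other components (e.g. journald) to map processes back to a
 * specific runtime cycle of the unit. */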
4b58153d
LP
1648static void cgroup_xattr_apply(Unit *u) {
1649 char ids[SD_ID128_STRING_MAX];
1650 int r;
1651
1652 assert(u);
1653
1654 if (!MANAGER_IS_SYSTEM(u->manager))
1655 return;
1656
1657 if (sd_id128_is_null(u->invocation_id))
1658 return;
1659
1660 r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1661 "trusted.invocation_id",
1662 sd_id128_to_string(u->invocation_id, ids), 32,
1663 0);
1664 if (r < 0)
0fb84499 1665 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
4b58153d
LP
1666}
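
/* Illustrative sketch (not upstream code): an external tool could read the
 * invocation ID back with plain getxattr(2) on the unit's cgroup directory. The
 * path below is an assumed example; the 32 bytes match the unterminated sd_id128
 * string written above.
 *
 *     #include <sys/xattr.h>
 *
 *     char buf[33];
 *     ssize_t n;
 *
 *     n = getxattr("/sys/fs/cgroup/system.slice/foo.service", "trusted.invocation_id", buf, sizeof(buf) - 1);
 *     if (n >= 0) {
 *             buf[n] = 0;   // NUL-terminate the 32-character ID string
 *             printf("invocation ID: %s\n", buf);
 *     }
 */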

static bool unit_has_mask_realized(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        assert(u);

        return u->cgroup_realized &&
                u->cgroup_realized_mask == target_mask &&
                u->cgroup_enabled_mask == enable_mask &&
                ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
                 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
}

static void unit_add_to_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (u->in_cgroup_realize_queue)
                return;

        LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = true;
}

static void unit_remove_from_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (!u->in_cgroup_realize_queue)
                return;

        LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = false;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        bool needs_bpf, apply_bpf;
        int r;

        assert(u);

        unit_remove_from_cgroup_realize_queue(u);

        target_mask = unit_get_target_mask(u);
        enable_mask = unit_get_enable_mask(u);
        needs_bpf = unit_get_needs_bpf(u);

        if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
                return 0;

        /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
         * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
         * this will trickle down properly to cgroupfs. */
        apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(u, target_mask, apply_bpf, state);
        cgroup_xattr_apply(u);

        return 0;
}

unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        assert(m);

        state = manager_state(m);

        while ((i = m->cgroup_realize_queue)) {
                assert(i->in_cgroup_realize_queue);

                if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
                        /* Maybe things changed, and the unit is not actually active anymore? */
                        unit_remove_from_cgroup_realize_queue(i);
                        continue;
                }

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;
                void *v;

                HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m,
                                                   unit_get_target_mask(m),
                                                   unit_get_enable_mask(m),
                                                   unit_get_needs_bpf(m)))
                                continue;

                        unit_add_to_cgroup_realize_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as many resources as our whole group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_add_siblings_to_cgroup_realize_queue(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring: %m", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}

void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
        u->cgroup_enabled_mask = 0;
}

int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid_cached();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}

static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}

int unit_synthesize_cgroup_empty_event(Unit *u) {
        int r;

        assert(u);

        /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
         * support for non-unified systems where notifications aren't reliable, and hence we need to take whatever we
         * can get as a notification source as soon as we stopped having any useful PIDs to watch for. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we have reliable notifications, and don't need this */
                return 0;

        if (!set_isempty(u->pids))
                return 0;

        unit_add_to_cgroup_empty_queue(u);
        return 0;
}

int unit_watch_all_pids(Unit *u) {
        int r;

        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
        Manager *m = userdata;
        Unit *u;
        int r;

        assert(s);
        assert(m);

        u = m->cgroup_empty_queue;
        if (!u)
                return 0;

        assert(u->in_cgroup_empty_queue);
        u->in_cgroup_empty_queue = false;
        LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);

        if (m->cgroup_empty_queue) {
                /* More stuff queued, let's make sure we remain enabled */
                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
                if (r < 0)
                        log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
        }

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

void unit_add_to_cgroup_empty_queue(Unit *u) {
        int r;

        assert(u);

        /* Note that there are four different ways cgroup empty events can reach us:
         *
         * 1. On the unified hierarchy we get an inotify event on the cgroup
         *
         * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
         *
         * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
         *
         * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
         *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
         *
         * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
         * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
         * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
         * (which might happen if the cgroup doesn't contain processes that are our own children, which is typically
         * the case for scope units). */

        if (u->in_cgroup_empty_queue)
                return;

        /* Let's verify that the cgroup is really empty */
        if (!u->cgroup_path)
                return;
        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
                return;
        }
        if (r == 0)
                return;

        LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
        u->in_cgroup_empty_queue = true;

        /* Trigger the defer event */
        r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
        if (r < 0)
                log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
}

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (IN_SET(errno, EINTR, EAGAIN))
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        unit_add_to_cgroup_empty_queue(u);
                }
        }
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        const char *scope_path;
        CGroupController c;
        int r, all_unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && MANAGER_IS_SYSTEM(m)) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;

        /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
         * easily prepend it everywhere. */
        delete_trailing_chars(m->cgroup_root, "/");

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        r = cg_unified_flush();
        if (r < 0)
                return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");

        all_unified = cg_all_unified();
        if (all_unified < 0)
                return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
        if (all_unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else {
                r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
                if (r < 0)
                        return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
                if (r > 0)
                        log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
                else
                        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
        }

        /* 3. Allocate cgroup empty defer event source */
        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
        r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
        if (r < 0)
                return log_error_errno(r, "Failed to create cgroup empty event source: %m");

        r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
        if (r < 0)
                return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");

        r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
        if (r < 0)
                return log_error_errno(r, "Failed to disable cgroup empty event source: %m");

        (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");

        /* 4. Install notifier inotify object, or agent */
        if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {

                /* In the unified hierarchy we can get cgroup empty notifications via inotify. */

                m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                safe_close(m->cgroup_inotify_fd);

                m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                if (m->cgroup_inotify_fd < 0)
                        return log_error_errno(errno, "Failed to create control group inotify object: %m");

                r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                if (r < 0)
                        return log_error_errno(r, "Failed to watch control group inotify object: %m");

                /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
                 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

        } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {

                /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
                 * since it does not generate events when control groups with children run empty.) */

                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                else if (r > 0)
                        log_debug("Installed release agent.");
                else if (r == 0)
                        log_debug("Release agent already installed.");
        }

        /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
        scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to create %s control group: %m", scope_path);

        /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
        r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
        if (r < 0)
                log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");

        /* 6. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);
        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0)
                return log_error_errno(errno, "Failed to open pin file: %m");

        /* 7. Always enable hierarchical support if it exists... */
        if (!all_unified && m->test_run_flags == 0)
                (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        /* 8. Figure out which controllers are supported, and log about it */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");
        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}
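
/* Illustrative sketch (not upstream code): outside of systemd, the "are we on the
 * unified hierarchy?" question answered above by cg_unified_flush()/cg_all_unified()
 * can be approximated with statfs(2) on /sys/fs/cgroup, checking for
 * CGROUP2_SUPER_MAGIC. (On hybrid setups /sys/fs/cgroup is a tmpfs, so this only
 * detects the fully unified case.)
 *
 *     #include <sys/vfs.h>
 *     #include <linux/magic.h>
 *
 *     static int is_cgroup2(void) {
 *             struct statfs fs;
 *
 *             if (statfs("/sys/fs/cgroup", &fs) < 0)
 *                     return -errno;
 *
 *             return fs.f_type == CGROUP2_SUPER_MAGIC;
 *     }
 */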

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;

        assert(m);

        if (!pid_is_valid(pid))
                return NULL;

        if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u, **array;

        assert(m);

        /* Note that a process might be owned by multiple units; we return only one here, which is good enough for
         * most cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the
         * most relevant one, as children of the process will be assigned to that one, too, before all else. */

        if (!pid_is_valid(pid))
                return NULL;

        if (pid == getpid_cached())
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = manager_get_unit_by_pid_cgroup(m, pid);
        if (u)
                return u;

        u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
        if (u)
                return u;

        array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
        if (array)
                return array[0];

        return NULL;
}
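
/* Illustrative usage sketch (not upstream code): resolving the owning unit of an
 * arbitrary PID, e.g. for logging. Manager *m and pid_t pid are assumed to be in
 * scope; u->id is the unit's name.
 *
 *     Unit *u;
 *
 *     u = manager_get_unit_by_pid(m, pid);
 *     if (u)
 *             log_info("PID " PID_FMT " belongs to unit %s.", pid, u->id);
 *     else
 *             log_info("PID " PID_FMT " belongs to no known unit.", pid);
 */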

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent
         * process or from the --system instance */

        log_debug("Got cgroup empty notification for: %s", cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        unit_add_to_cgroup_empty_queue(u);
        return 1;
}

int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, memory_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}
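
/* Illustrative usage sketch (not upstream code): printing a unit's current memory
 * usage in human-readable form, using the format_bytes() helper from format-util.h.
 *
 *     uint64_t bytes;
 *     char buf[FORMAT_BYTES_MAX];
 *
 *     if (unit_get_memory_current(u, &bytes) >= 0)
 *             log_info("%s: memory usage is %s", u->id, format_bytes(buf, sizeof(buf), bytes));
 */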

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
        if (unit_has_root_cgroup(u))
                return procfs_tasks_get_current(ret);

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                const char *keys[] = { "usage_usec", NULL };
                _cleanup_free_ char *val = NULL;
                uint64_t us;

                if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
                        return -ENODATA;

                r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
                if (r < 0)
                        return r;

                r = safe_atou64(val, &us);
                if (r < 0)
                        return r;

                ns = us * NSEC_PER_USEC;
        } else {
                if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                        return -ENODATA;

                r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
                if (r == -ENOENT)
                        return -ENODATA;
                if (r < 0)
                        return r;

                r = safe_atou64(v, &ns);
                if (r < 0)
                        return r;
        }

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        assert(u);

        /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
         * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
         * call this function with a NULL return value. */

        if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
                return -ENODATA;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
                /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use
                 * our cached value. */

                if (ret)
                        *ret = u->cpu_usage_last;
                return 0;
        }
        if (r < 0)
                return r;

        if (ns > u->cpu_usage_base)
                ns -= u->cpu_usage_base;
        else
                ns = 0;

        u->cpu_usage_last = ns;
        if (ret)
                *ret = ns;

        return 0;
}
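
/* Illustrative usage sketch (not upstream code): reading the consumed CPU time and
 * formatting it with format_timespan() from time-util.h. Note that the nsec_t value
 * has to be scaled down to usec granularity first.
 *
 *     nsec_t ns;
 *     char buf[FORMAT_TIMESPAN_MAX];
 *
 *     if (unit_get_cpu_usage(u, &ns) >= 0)
 *             log_info("%s: consumed %s of CPU time", u->id, format_timespan(buf, sizeof(buf), ns / NSEC_PER_USEC, USEC_PER_MSEC));
 */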

int unit_get_ip_accounting(
                Unit *u,
                CGroupIPAccountingMetric metric,
                uint64_t *ret) {

        uint64_t value;
        int fd, r;

        assert(u);
        assert(metric >= 0);
        assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
        assert(ret);

        /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices
         * are inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero
         * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup
         * filters. */
        if (u->type == UNIT_SLICE)
                return -ENODATA;

        if (!UNIT_CGROUP_BOOL(u, ip_accounting))
                return -ENODATA;

        fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;
        if (fd < 0)
                return -ENODATA;

        if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
                r = bpf_firewall_read_accounting(fd, &value, NULL);
        else
                r = bpf_firewall_read_accounting(fd, NULL, &value);
        if (r < 0)
                return r;

        /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
         * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
         * ip_accounting_extra[] field, and add them in here transparently. */

        *ret = value + u->ip_accounting_extra[metric];

        return r;
}
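
/* Illustrative usage sketch (not upstream code): dumping all four IP accounting
 * counters of a unit. The metric enum values are the ones tested above; slices and
 * units without IP accounting enabled simply yield -ENODATA.
 *
 *     CGroupIPAccountingMetric metric;
 *     uint64_t v;
 *
 *     for (metric = 0; metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX; metric++)
 *             if (unit_get_ip_accounting(u, metric, &v) >= 0)
 *                     log_info("%s: metric %i = %" PRIu64, u->id, (int) metric, v);
 */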

int unit_reset_cpu_accounting(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        u->cpu_usage_last = NSEC_INFINITY;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpu_usage_base = 0;
                return r;
        }

        u->cpu_usage_base = ns;
        return 0;
}

int unit_reset_ip_accounting(Unit *u) {
        int r = 0, q = 0;

        assert(u);

        if (u->ip_accounting_ingress_map_fd >= 0)
                r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);

        if (u->ip_accounting_egress_map_fd >= 0)
                q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);

        zero(u->ip_accounting_extra);

        return r < 0 ? r : q;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        /* always invalidate compat pairs together */
        if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
                m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
                m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;

        if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_realize_queue(u);
}
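
/* Illustrative usage sketch (not upstream code): after changing a unit's memory
 * configuration at runtime, marking just the memory controller dirty re-queues the
 * unit so that the new limits are written out on the next realize pass:
 *
 *     unit_invalidate_cgroup(u, CGROUP_MASK_MEMORY);
 */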

void unit_invalidate_cgroup_bpf(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
                return;

        u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
        unit_add_to_cgroup_realize_queue(u);

        /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
         * list of our children includes our own. */
        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;
                void *v;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        unit_invalidate_cgroup_bpf(member);
                }
        }
}

bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        assert(u);

        if (!UNIT_VTABLE(u)->can_delegate)
                return false;

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);