/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "special.h"
#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

static void cgroup_compat_warn(void) {
        static bool cgroup_compat_warned = false;

        if (cgroup_compat_warned)
                return;

        log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
        cgroup_compat_warned = true;
}

#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_weight = CGROUP_WEIGHT_INVALID;
        c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;

        c->memory_high = CGROUP_LIMIT_MAX;
        c->memory_max = CGROUP_LIMIT_MAX;
        c->memory_swap_max = CGROUP_LIMIT_MAX;

        c->memory_limit = CGROUP_LIMIT_MAX;

        c->io_weight = CGROUP_WEIGHT_INVALID;
        c->startup_io_weight = CGROUP_WEIGHT_INVALID;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->io_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
        assert(c);
        assert(l);

        LIST_REMOVE(device_limits, c->io_device_limits, l);
        free(l->path);
        free(l);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->io_device_weights)
                cgroup_context_free_io_device_weight(c, c->io_device_weights);

        while (c->io_device_limits)
                cgroup_context_free_io_device_limit(c, c->io_device_limits);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);

        c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
        c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupIODeviceLimit *il;
        CGroupIODeviceWeight *iw;
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        IPAddressAccessItem *iaai;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sIOAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sIPAccounting=%s\n"
                "%sCPUWeight=%" PRIu64 "\n"
                "%sStartupCPUWeight=%" PRIu64 "\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sIOWeight=%" PRIu64 "\n"
                "%sStartupIOWeight=%" PRIu64 "\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLow=%" PRIu64 "\n"
                "%sMemoryHigh=%" PRIu64 "\n"
                "%sMemoryMax=%" PRIu64 "\n"
                "%sMemorySwapMax=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->io_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, yes_no(c->ip_accounting),
                prefix, c->cpu_weight,
                prefix, c->startup_cpu_weight,
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->io_weight,
                prefix, c->startup_io_weight,
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_low,
                prefix, c->memory_high,
                prefix, c->memory_max,
                prefix, c->memory_swap_max,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        if (c->delegate) {
                _cleanup_free_ char *t = NULL;

                (void) cg_mask_to_string(c->delegate_controllers, &t);

                fprintf(f, "%sDelegateControllers=%s\n",
                        prefix,
                        strempty(t));
        }

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, iw, c->io_device_weights)
                fprintf(f,
                        "%sIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        iw->path,
                        iw->weight);

        LIST_FOREACH(device_limits, il, c->io_device_limits) {
                char buf[FORMAT_BYTES_MAX];
                CGroupIOLimitType type;

                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                        if (il->limits[type] != cgroup_io_limit_defaults[type])
                                fprintf(f,
                                        "%s%s=%s %s\n",
                                        prefix,
                                        cgroup_io_limit_type_to_string(type),
                                        il->path,
                                        format_bytes(buf, sizeof(buf), il->limits[type]));
        }

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                if (b->rbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOReadBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->rbps));
                if (b->wbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOWriteBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->wbps));
        }

        LIST_FOREACH(items, iaai, c->ip_address_allow) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }

        LIST_FOREACH(items, iaai, c->ip_address_deny) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }
}

static int lookup_block_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        bool ignore_notfound;
        int r;

        assert(path);
        assert(acc);

        if (node[0] == '-') {
                /* Non-existent paths starting with "-" must be silently ignored */
                node++;
                ignore_notfound = true;
        } else
                ignore_notfound = false;

        if (stat(node, &st) < 0) {
                if (errno == ENOENT && ignore_notfound)
                        return 0;

                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}

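/* Worked example: for /dev/null, which is character device 1:3 on Linux,
 * whitelist_device(path, "/dev/null", "rwm") composes and writes the line
 *
 *         c 1:3 rwm
 *
 * to the cgroup's devices.allow attribute, granting read, write and mknod
 * access to that one device node. */
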
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(IN_SET(type, 'b', 'c'));

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        return log_warning_errno(errno, "Failed to read /proc/devices: %m");
}

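/* Worked example: whitelist_major(path, "pts", 'c', "rw") scans the
 * "Character devices:" section of /proc/devices for entries whose name matches
 * "pts". On a typical Linux system that section contains a line such as
 * "136 pts" (the exact major number may differ per machine), so the function
 * would write "c 136:* rw" to devices.allow, whitelisting every minor of that
 * major, i.e. all pseudo-terminal slave devices, at once. */
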
static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
        return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
}

static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
        return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
                c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
}

static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_cpu_weight;
        else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->cpu_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->startup_cpu_shares;
        else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->cpu_shares;
        else
                return CGROUP_CPU_SHARES_DEFAULT;
}

static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", weight);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.weight: %m");

        if (quota != USEC_INFINITY)
                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
                         quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
        else
                xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);

        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.max: %m");
}

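/* Worked example: with the 100ms period defined above
 * (CGROUP_CPU_QUOTA_PERIOD_USEC), CPUQuota=50% arrives here as quota=500000,
 * i.e. 500ms of CPU time per second. The value written to cpu.max is then
 *
 *         500000 * 100000 / 1000000 = 50000, giving the line "50000 100000",
 *
 * meaning 50ms of CPU time per 100ms period. Without a quota configured,
 * "max 100000" is written instead. */
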
static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", shares);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.shares: %m");

        xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_period_us: %m");

        if (quota != USEC_INFINITY) {
                xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
        } else
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_quota_us: %m");
}

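/* Worked example: the same CPUQuota=50% (quota=500000) on the legacy hierarchy
 * becomes cpu.cfs_period_us=100000 and cpu.cfs_quota_us=50000, the cgroup-v1
 * encoding of "50ms of CPU time per 100ms period"; an unset quota is encoded
 * as cpu.cfs_quota_us=-1. */
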
static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
        return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
        return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
}

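/* Worked example: with the usual constants (CGROUP_CPU_SHARES_DEFAULT=1024,
 * CGROUP_WEIGHT_DEFAULT=100), legacy shares of 1024 map to a unified weight of
 * 100, and shares of 2048 map to a weight of 200. The conversion is a simple
 * linear rescaling between the two defaults, clamped to each valid range. */
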
static bool cgroup_context_has_io_config(CGroupContext *c) {
        return c->io_accounting ||
                c->io_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
                c->io_device_weights ||
                c->io_device_limits;
}

static bool cgroup_context_has_blockio_config(CGroupContext *c) {
        return c->blockio_accounting ||
                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->blockio_device_weights ||
                c->blockio_device_bandwidths;
}

static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_io_weight;
        else if (c->io_weight != CGROUP_WEIGHT_INVALID)
                return c->io_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->startup_blockio_weight;
        else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->blockio_weight;
        else
                return CGROUP_BLKIO_WEIGHT_DEFAULT;
}

static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
        return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
        return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
}

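/* Worked example: CGROUP_BLKIO_WEIGHT_DEFAULT is 500 while
 * CGROUP_WEIGHT_DEFAULT is 100, so a legacy BlockIOWeight of 500 translates to
 * an IOWeight of 100, and BlockIOWeight=1000 to IOWeight=200, again by linear
 * rescaling between the two default values. */
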
static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
        r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.weight: %m");
}

static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.weight_device: %m");
}

static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        CGroupIOLimitType type;
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
                if (limits[type] != cgroup_io_limit_defaults[type]) {
                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
                        n++;
                } else {
                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
                }
        }

        xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
        r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.max: %m");
        return n;
}

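/* Worked example: IOReadBandwidthMax=/dev/sda 1M with no other limits set
 * would, on a system where /dev/sda is block device 8:0, write the line
 *
 *         8:0 rbps=1048576 wbps=max riops=max wiops=max
 *
 * to io.max, and return n=1 because exactly one limit deviates from its
 * default. A return of 0 tells the caller the entry carries no configuration
 * and can be freed. */
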
static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        if (rbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.read_bps_device: %m");

        if (wbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.write_bps_device: %m");

        return n;
}

static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
        return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
}

static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
        int r;

        if (v != CGROUP_LIMIT_MAX)
                xsprintf(buf, "%" PRIu64 "\n", v);

        r = cg_set_attribute("memory", u->cgroup_path, file, buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set %s: %m", file);
}

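/* Worked example: MemoryMax=1G leads to a call like
 * cgroup_apply_unified_memory_limit(u, "memory.max", 1073741824), which writes
 * "1073741824" to memory.max; the sentinel CGROUP_LIMIT_MAX leaves the buffer
 * at its preinitialized "max", lifting the limit. */
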
static void cgroup_apply_firewall(Unit *u, CGroupContext *c) {
        int r;

        if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
                                    * not recursive we don't ever touch the bpf on them */
                return;

        r = bpf_firewall_compile(u);
        if (r < 0)
                return;

        (void) bpf_firewall_install(u);
        return;
}

static void cgroup_context_apply(
                Unit *u,
                CGroupMask apply_mask,
                bool apply_bpf,
                ManagerState state) {

        const char *path;
        CGroupContext *c;
        bool is_root;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        path = u->cgroup_path;

        assert(c);
        assert(path);

        /* Nothing to do? Exit early! */
        if (apply_mask == 0 && !apply_bpf)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
                bool has_weight, has_shares;

                has_weight = cgroup_context_has_cpu_weight(c);
                has_shares = cgroup_context_has_cpu_shares(c);

                if (cg_all_unified() > 0) {
                        uint64_t weight;

                        if (has_weight)
                                weight = cgroup_context_cpu_weight(c, state);
                        else if (has_shares) {
                                uint64_t shares = cgroup_context_cpu_shares(c, state);

                                weight = cgroup_cpu_shares_to_weight(shares);

                                log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
                                                  shares, weight, path);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
                } else {
                        uint64_t shares;

                        if (has_weight) {
                                uint64_t weight = cgroup_context_cpu_weight(c, state);

                                shares = cgroup_cpu_weight_to_shares(weight);

                                log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
                                                  weight, shares, path);
                        } else if (has_shares)
                                shares = cgroup_context_cpu_shares(c, state);
                        else
                                shares = CGROUP_CPU_SHARES_DEFAULT;

                        cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
                }
        }

        if (apply_mask & CGROUP_MASK_IO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io)
                                weight = cgroup_context_io_weight(c, state);
                        else if (has_blockio) {
                                uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);

                                weight = cgroup_weight_blkio_to_io(blkio_weight);

                                log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
                                                  blkio_weight, weight);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        xsprintf(buf, "default %" PRIu64 "\n", weight);
                        r = cg_set_attribute("io", path, "io.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set io.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights)
                                        cgroup_apply_io_device_weight(u, w->path, w->weight);
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                        weight = cgroup_weight_blkio_to_io(w->weight);

                                        log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_io_device_weight(u, w->path, weight);
                                }
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
                                uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
                                CGroupIOLimitType type;

                                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                                        limits[type] = cgroup_io_limit_defaults[type];

                                limits[CGROUP_IO_RBPS_MAX] = b->rbps;
                                limits[CGROUP_IO_WBPS_MAX] = b->wbps;

                                log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
                                                  b->rbps, b->wbps, b->path);

                                if (!cgroup_apply_io_device_limit(u, b->path, limits))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                        }
                }
        }

        if (apply_mask & CGROUP_MASK_BLKIO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io) {
                                uint64_t io_weight = cgroup_context_io_weight(c, state);

                                weight = cgroup_weight_io_to_blkio(io_weight);

                                log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
                                                  io_weight, weight);
                        } else if (has_blockio)
                                weight = cgroup_context_blkio_weight(c, state);
                        else
                                weight = CGROUP_BLKIO_WEIGHT_DEFAULT;

                        xsprintf(buf, "%" PRIu64 "\n", weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set blkio.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights) {
                                        weight = cgroup_weight_io_to_blkio(w->weight);

                                        log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_blkio_device_weight(u, w->path, weight);
                                }
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                                        cgroup_apply_blkio_device_weight(u, w->path, w->weight);
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
                                                  l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);

                                if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
                                if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                }
        }

        if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (cg_all_unified() > 0) {
                        uint64_t max, swap_max = CGROUP_LIMIT_MAX;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                max = c->memory_max;
                                swap_max = c->memory_swap_max;
                        } else {
                                max = c->memory_limit;

                                if (max != CGROUP_LIMIT_MAX)
                                        log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
                        }

                        cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
                        cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
                        cgroup_apply_unified_memory_limit(u, "memory.max", max);
                        cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
                } else {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];
                        uint64_t val;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                val = c->memory_max;
                                log_cgroup_compat(u, "Applying MemoryMax %" PRIu64 " as MemoryLimit", val);
                        } else
                                val = c->memory_limit;

                        if (val == CGROUP_LIMIT_MAX)
                                strncpy(buf, "-1\n", sizeof(buf));
                        else
                                xsprintf(buf, "%" PRIu64 "\n", val);

                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set memory.limit_in_bytes: %m");
                }
        }

        if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to reset devices.list: %m");

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0" /* /dev/pts/ptmx may not be duplicated, but accessed */
                                /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
                                "-/run/systemd/inaccessible/chr\0" "rwm\0"
                                "-/run/systemd/inaccessible/blk\0" "rwm\0";

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4], *val;
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (path_startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if ((val = startswith(a->path, "block-")))
                                whitelist_major(path, val, 'b', acc);
                        else if ((val = startswith(a->path, "char-")))
                                whitelist_major(path, val, 'c', acc);
                        else
                                log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) {

                if (c->tasks_max != CGROUP_LIMIT_MAX) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to set pids.max: %m");
        }

        if (apply_bpf)
                cgroup_apply_firewall(u, c);
}

CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            cgroup_context_has_cpu_weight(c) ||
            cgroup_context_has_cpu_shares(c) ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
                mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != CGROUP_LIMIT_MAX ||
            cgroup_context_has_unified_memory_config(c))
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        return mask;
}

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        return cgroup_context_get_mask(c);
}

CGroupMask unit_get_delegate_mask(Unit *u) {
        CGroupContext *c;

        /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
         * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */

        if (u->type == UNIT_SLICE)
                return 0;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!c->delegate)
                return 0;

        if (cg_all_unified() <= 0) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (e && !exec_context_maintains_privileges(e))
                        return 0;
        }

        return c->delegate_controllers;
}

CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = unit_get_delegate_mask(u);

        if (u->type == UNIT_SLICE) {
                void *v;
                Unit *member;
                Iterator i;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_subtree_mask(u);
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

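/* Worked example: consider a service with MemoryMax= inside a slice that also
 * contains a sibling service with CPUWeight=. The service's own mask is
 * CGROUP_MASK_MEMORY; the siblings mask (the parent slice's members mask)
 * contributes CGROUP_MASK_CPU|CGROUP_MASK_CPUACCT from the sibling, so the
 * target mask covers both (before masking with the controllers the manager
 * actually supports), while the enable mask stays empty because the service
 * has no children of its own. */
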
bool unit_get_needs_bpf(Unit *u) {
        CGroupContext *c;
        Unit *p;
        assert(u);

        /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
         * moment. */
        if (u->type == UNIT_SLICE)
                return false;

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        if (c->ip_accounting ||
            c->ip_address_allow ||
            c->ip_address_deny)
                return true;

        /* If any parent slice has an IP access list defined, it applies too */
        for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
                c = unit_get_cgroup_context(p);
                if (!c)
                        return false;

                if (c->ip_address_allow ||
                    c->ip_address_deny)
                        return true;
        }

        return false;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also a member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/",
                               escaped);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped);
}

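/* Worked example: for a unit "foo.service" living in "system.slice",
 * cg_slice_to_path() renders the slice as "system.slice" (a nested slice such
 * as "foo-bar.slice" would become "foo.slice/foo-bar.slice"), so the default
 * path ends up as <cgroup_root>/system.slice/foo.service, where <cgroup_root>
 * stands for u->manager->cgroup_root. */
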
int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        } else
                p = NULL;

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 1;
}

int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *events = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;
        u->cgroup_enabled_mask = enable_mask;
        u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;
        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static void cgroup_xattr_apply(Unit *u) {
        char ids[SD_ID128_STRING_MAX];
        int r;

        assert(u);

        if (!MANAGER_IS_SYSTEM(u->manager))
                return;

        if (sd_id128_is_null(u->invocation_id))
                return;

        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
                         "trusted.invocation_id",
                         sd_id128_to_string(u->invocation_id, ids), 32,
                         0);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
}

static bool unit_has_mask_realized(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        assert(u);

        return u->cgroup_realized &&
                u->cgroup_realized_mask == target_mask &&
                u->cgroup_enabled_mask == enable_mask &&
                ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
                 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        bool needs_bpf, apply_bpf;
        int r;

        assert(u);

        if (u->in_cgroup_realize_queue) {
                LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
                u->in_cgroup_realize_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        enable_mask = unit_get_enable_mask(u);
        needs_bpf = unit_get_needs_bpf(u);

        if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
                return 0;

        /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
         * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
         * this will trickle down properly to cgroupfs. */
        apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(u, target_mask, apply_bpf, state);
        cgroup_xattr_apply(u);

        return 0;
}

static void unit_add_to_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (u->in_cgroup_realize_queue)
                return;

        LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = true;
}

unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        assert(m);

        state = manager_state(m);

        while ((i = m->cgroup_realize_queue)) {
                assert(i->in_cgroup_realize_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;
                void *v;

                HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m,
                                                   unit_get_target_mask(m),
                                                   unit_get_enable_mask(m),
                                                   unit_get_needs_bpf(m)))
                                continue;

                        unit_add_to_cgroup_realize_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as many resources as our whole group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_add_siblings_to_cgroup_realize_queue(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

efdb0237
LP
1654void unit_release_cgroup(Unit *u) {
1655 assert(u);
1656
1657 /* Forgets all cgroup details for this cgroup */
1658
1659 if (u->cgroup_path) {
1660 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1661 u->cgroup_path = mfree(u->cgroup_path);
1662 }
1663
1664 if (u->cgroup_inotify_wd >= 0) {
1665 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1666 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1667
1668 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1669 u->cgroup_inotify_wd = -1;
1670 }
1671}

void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
        u->cgroup_enabled_mask = 0;
}

int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid_cached();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID in this group, so we don't know what
                         * process is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}
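
/* Example (editor's sketch, not part of the original source): a hypothetical caller using
 * unit_search_main_pid() to guess a service's main PID. Kept under #if 0 since it is illustration
 * only; example_guess_main_pid() does not exist in the code base. */
#if 0
static void example_guess_main_pid(Unit *u) {
        pid_t pid;
        int r;

        r = unit_search_main_pid(u, &pid);
        if (r < 0) {
                /* -ENXIO: no cgroup realized yet; -ENODATA: more than one daemonized candidate */
                log_unit_debug_errno(u, r, "Cannot determine main PID: %m");
                return;
        }

        log_unit_debug(u, "Guessed main PID: "PID_FMT, pid);
}
#endif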

static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}

int unit_watch_all_pids(Unit *u) {
        int r;

        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we watch. This is fallback logic for cases where
         * we do not get reliable cgroup empty notifications: we try to use SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
        Manager *m = userdata;
        Unit *u;
        int r;

        assert(s);
        assert(m);

        u = m->cgroup_empty_queue;
        if (!u)
                return 0;

        assert(u->in_cgroup_empty_queue);
        u->in_cgroup_empty_queue = false;
        LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);

        if (m->cgroup_empty_queue) {
                /* More stuff queued, let's make sure we remain enabled */
                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
                if (r < 0)
                        log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
        }

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

void unit_add_to_cgroup_empty_queue(Unit *u) {
        int r;

        assert(u);

        /* Note that there are four different ways in which cgroup empty events can reach us:
         *
         * 1. On the unified hierarchy we get an inotify event on the cgroup
         *
         * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
         *
         * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
         *
         * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
         *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
         *
         * Regardless of which way we got the notification, we'll verify it here, and then add the unit to a
         * separate queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we
         * always use SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no
         * SIGCHLD pending (which might happen if the cgroup doesn't contain processes that are our own child,
         * which is typically the case for scope units). */

        if (u->in_cgroup_empty_queue)
                return;

        /* Let's verify that the cgroup is really empty */
        if (!u->cgroup_path)
                return;
        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
                return;
        }
        if (r == 0)
                return;

        LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
        u->in_cgroup_empty_queue = true;

        /* Trigger the defer event */
        r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
        if (r < 0)
                log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
}
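
/* Editor's sketch (assumption: a standalone sd-event consumer; all example_* names and the ExampleQueue
 * type are hypothetical): the pattern used above — a defer source armed SD_EVENT_ONESHOT whenever the
 * queue is non-empty, and re-armed by its own handler while items remain — processes one queue item per
 * event loop iteration, letting higher-priority sources (such as the SIGCHLD handler) interleave. */
#if 0
#include <systemd/sd-event.h>

static int example_pop_one(sd_event_source *s, void *userdata) {
        ExampleQueue *q = userdata;

        example_handle_one_item(q);            /* dequeue and process a single item */

        if (!example_queue_is_empty(q))        /* more work? stay enabled for one more iteration */
                (void) sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);

        return 0;
}
#endif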

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (IN_SET(errno, EINTR, EAGAIN))
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver events for a watch even after it was
                                 * removed, because it was queued before the removal. Let's safely ignore
                                 * this here. */
                                continue;

                        unit_add_to_cgroup_empty_queue(u);
                }
        }
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        const char *scope_path;
        CGroupController c;
        int r, all_unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in it. This is to support live upgrades from
         * older systemd versions where PID 1 was moved there. Also see cg_get_root_path(). */
        if (!e && MANAGER_IS_SYSTEM(m)) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;

        /* And make sure to store away the root value without trailing slash, even for the root dir, so that
         * we can easily prepend it everywhere. */
        delete_trailing_chars(m->cgroup_root, "/");

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        r = cg_unified_flush();
        if (r < 0)
                return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");

        all_unified = cg_all_unified();
        if (all_unified < 0)
                return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
        if (all_unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else {
                r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
                if (r < 0)
                        return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
                if (r > 0)
                        log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
                else
                        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
        }

        /* 3. Allocate cgroup empty defer event source */
        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
        r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
        if (r < 0)
                return log_error_errno(r, "Failed to create cgroup empty event source: %m");

        r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
        if (r < 0)
                return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");

        r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
        if (r < 0)
                return log_error_errno(r, "Failed to disable cgroup empty event source: %m");

        (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");

        /* 4. Install notifier inotify object, or agent */
        if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {

                /* In the unified hierarchy we can get cgroup empty notifications via inotify. */

                m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                safe_close(m->cgroup_inotify_fd);

                m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                if (m->cgroup_inotify_fd < 0)
                        return log_error_errno(errno, "Failed to create control group inotify object: %m");

                r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                if (r < 0)
                        return log_error_errno(r, "Failed to watch control group inotify object: %m");

                /* Process cgroup empty notifications early, but after service notifications and SIGCHLD.
                 * Also see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

        } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {

                /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really
                 * reliable, since it does not generate events when control groups with children run empty.) */

                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                else if (r > 0)
                        log_debug("Installed release agent.");
                else if (r == 0)
                        log_debug("Release agent already installed.");
        }

        /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
        scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to create %s control group: %m", scope_path);

        /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
        r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
        if (r < 0)
                log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");

        /* 6. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);
        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0)
                return log_error_errno(errno, "Failed to open pin file: %m");

        /* 7. Always enable hierarchical support if it exists... */
        if (!all_unified && m->test_run_flags == 0)
                (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        /* 8. Figure out which controllers are supported, and log about it */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");
        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}
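
/* Editor's illustration (not from the original source): on a unified-hierarchy system booted through this
 * setup, the result looks roughly like:
 *
 *   /sys/fs/cgroup/             ← m->cgroup_root, pinned open via m->pin_cgroupfs_fd
 *   /sys/fs/cgroup/init.scope/  ← PID 1 plus any stray userspace processes migrated from the root
 *
 * so the service manager itself lives in a regular unit cgroup (init.scope) like everything else. */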

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
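
/* Worked example (editor's note, not from the original source): for the cgroup path
 * "/system.slice/foo.service/sub" the lookups above are tried in order:
 *
 *   "/system.slice/foo.service/sub" → miss
 *   "/system.slice/foo.service"     → hit, foo.service is returned
 *
 * and if nothing matches all the way up, the root slice unit is returned. */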

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        assert(m);

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup
         * agent process or from the --system instance */

        log_debug("Got cgroup empty notification for: %s", cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        unit_add_to_cgroup_empty_queue(u);
        return 1;
}

int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, memory_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                const char *keys[] = { "usage_usec", NULL };
                _cleanup_free_ char *val = NULL;
                uint64_t us;

                if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
                        return -ENODATA;

                r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
                if (r < 0)
                        return r;

                r = safe_atou64(val, &us);
                if (r < 0)
                        return r;

                ns = us * NSEC_PER_USEC;
        } else {
                if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                        return -ENODATA;

                r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
                if (r == -ENOENT)
                        return -ENODATA;
                if (r < 0)
                        return r;

                r = safe_atou64(v, &ns);
                if (r < 0)
                        return r;
        }

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        assert(u);

        /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit
         * was started. If the cgroup has been removed already, returns the last cached value. To cache the
         * value, simply call this function with a NULL return parameter. */

        if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
                return -ENODATA;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
                /* If we can't get the CPU usage anymore (because the cgroup was already removed, for
                 * example), use our cached value. */

                if (ret)
                        *ret = u->cpu_usage_last;
                return 0;
        }
        if (r < 0)
                return r;

        if (ns > u->cpu_usage_base)
                ns -= u->cpu_usage_base;
        else
                ns = 0;

        u->cpu_usage_last = ns;
        if (ret)
                *ret = ns;

        return 0;
}

int unit_get_ip_accounting(
                Unit *u,
                CGroupIPAccountingMetric metric,
                uint64_t *ret) {

        uint64_t value;
        int fd, r;

        assert(u);
        assert(metric >= 0);
        assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
        assert(ret);

        /* IP accounting is currently not recursive, and hence we refuse to return any data for slice
         * nodes. Slices are inner cgroup nodes and hence have no processes directly attached, hence their
         * counters would be zero anyway. And if we block this now we can later open this up, if the kernel
         * learns recursive BPF cgroup filters. */
        if (u->type == UNIT_SLICE)
                return -ENODATA;

        if (!UNIT_CGROUP_BOOL(u, ip_accounting))
                return -ENODATA;

        fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        if (fd < 0)
                return -ENODATA;

        if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
                r = bpf_firewall_read_accounting(fd, &value, NULL);
        else
                r = bpf_firewall_read_accounting(fd, NULL, &value);
        if (r < 0)
                return r;

        /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon
         * we compile all BPF programs and maps anew, but serialize the old counters. When deserializing we
         * store them in the ip_accounting_extra[] field, and add them in here transparently. */

        *ret = value + u->ip_accounting_extra[metric];

        return r;
}
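
/* Example (editor's sketch, not part of the original source): a hypothetical helper dumping all
 * accounting data via the getters above. Each getter returns -ENODATA if the corresponding
 * *Accounting= setting is off or the controller isn't realized for the unit. */
#if 0
static void example_dump_accounting(Unit *u) {
        uint64_t memory, tasks, ingress;
        nsec_t cpu;

        if (unit_get_memory_current(u, &memory) >= 0)
                log_unit_debug(u, "Memory: %" PRIu64 " bytes", memory);
        if (unit_get_tasks_current(u, &tasks) >= 0)
                log_unit_debug(u, "Tasks: %" PRIu64, tasks);
        if (unit_get_cpu_usage(u, &cpu) >= 0)
                log_unit_debug(u, "CPU: %" PRIu64 " ns", (uint64_t) cpu);
        if (unit_get_ip_accounting(u, CGROUP_IP_INGRESS_BYTES, &ingress) >= 0)
                log_unit_debug(u, "IP ingress: %" PRIu64 " bytes", ingress);
}
#endif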

int unit_reset_cpu_accounting(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        u->cpu_usage_last = NSEC_INFINITY;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpu_usage_base = 0;
                return r;
        }

        u->cpu_usage_base = ns;
        return 0;
}

int unit_reset_ip_accounting(Unit *u) {
        int r = 0, q = 0;

        assert(u);

        if (u->ip_accounting_ingress_map_fd >= 0)
                r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);

        if (u->ip_accounting_egress_map_fd >= 0)
                q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);

        zero(u->ip_accounting_extra);

        return r < 0 ? r : q;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        /* always invalidate compat pairs together */
        if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
                m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
                m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;

        if ((u->cgroup_realized_mask & m) == 0)
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_realize_queue(u);
}
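
/* Editor's sketch (hypothetical caller, not part of the original source): a property setter would
 * invalidate the affected controller so the next realize pass rewrites the kernel attributes. */
#if 0
static void example_set_memory_max(Unit *u, uint64_t limit) {
        CGroupContext *c;

        c = unit_get_cgroup_context(u);
        if (!c)
                return;

        c->memory_max = limit;
        unit_invalidate_cgroup(u, CGROUP_MASK_MEMORY); /* requeues the unit for realization */
}
#endif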

void unit_invalidate_cgroup_bpf(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED)
                return;

        u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
        unit_add_to_cgroup_realize_queue(u);

        /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the
         * IP access list of our children includes our own. */
        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;
                void *v;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        unit_invalidate_cgroup_bpf(member);
                }
        }
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
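
/* Editor's note (illustration, not from the original source): DEFINE_STRING_TABLE_LOOKUP() generates
 * cgroup_device_policy_to_string() and cgroup_device_policy_from_string() from the table above, so
 * e.g. cgroup_device_policy_from_string("closed") yields CGROUP_CLOSED, while unknown strings map to
 * a negative value. */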