/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "special.h"
#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
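
/* All CPU quota handling below uses this fixed 100ms period: CPUQuotaPerSecUsec=
 * values are rescaled from "per second" to "per period" before being written to
 * cpu.max (unified hierarchy) or cpu.cfs_quota_us (legacy hierarchy). */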

static void cgroup_compat_warn(void) {
        static bool cgroup_compat_warned = false;

        if (cgroup_compat_warned)
                return;

        log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
        cgroup_compat_warned = true;
}

#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_weight = CGROUP_WEIGHT_INVALID;
        c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;

        c->memory_high = CGROUP_LIMIT_MAX;
        c->memory_max = CGROUP_LIMIT_MAX;
        c->memory_swap_max = CGROUP_LIMIT_MAX;

        c->memory_limit = CGROUP_LIMIT_MAX;

        c->io_weight = CGROUP_WEIGHT_INVALID;
        c->startup_io_weight = CGROUP_WEIGHT_INVALID;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->io_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
        assert(c);
        assert(l);

        LIST_REMOVE(device_limits, c->io_device_limits, l);
        free(l->path);
        free(l);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->io_device_weights)
                cgroup_context_free_io_device_weight(c, c->io_device_weights);

        while (c->io_device_limits)
                cgroup_context_free_io_device_limit(c, c->io_device_limits);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);

        c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
        c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupIODeviceLimit *il;
        CGroupIODeviceWeight *iw;
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        IPAddressAccessItem *iaai;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sIOAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sIPAccounting=%s\n"
                "%sCPUWeight=%" PRIu64 "\n"
                "%sStartupCPUWeight=%" PRIu64 "\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sIOWeight=%" PRIu64 "\n"
                "%sStartupIOWeight=%" PRIu64 "\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLow=%" PRIu64 "\n"
                "%sMemoryHigh=%" PRIu64 "\n"
                "%sMemoryMax=%" PRIu64 "\n"
                "%sMemorySwapMax=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->io_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, yes_no(c->ip_accounting),
                prefix, c->cpu_weight,
                prefix, c->startup_cpu_weight,
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->io_weight,
                prefix, c->startup_io_weight,
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_low,
                prefix, c->memory_high,
                prefix, c->memory_max,
                prefix, c->memory_swap_max,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        if (c->delegate) {
                _cleanup_free_ char *t = NULL;

                (void) cg_mask_to_string(c->delegate_controllers, &t);

                fprintf(f, "%sDelegateControllers=%s\n",
                        prefix,
                        strempty(t));
        }

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, iw, c->io_device_weights)
                fprintf(f,
                        "%sIODeviceWeight=%s %" PRIu64,
                        prefix,
                        iw->path,
                        iw->weight);

        LIST_FOREACH(device_limits, il, c->io_device_limits) {
                char buf[FORMAT_BYTES_MAX];
                CGroupIOLimitType type;

                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                        if (il->limits[type] != cgroup_io_limit_defaults[type])
                                fprintf(f,
                                        "%s%s=%s %s\n",
                                        prefix,
                                        cgroup_io_limit_type_to_string(type),
                                        il->path,
                                        format_bytes(buf, sizeof(buf), il->limits[type]));
        }

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64,
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                if (b->rbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOReadBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->rbps));
                if (b->wbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOWriteBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->wbps));
        }

        LIST_FOREACH(items, iaai, c->ip_address_allow) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }

        LIST_FOREACH(items, iaai, c->ip_address_deny) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }
}

static int lookup_block_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        bool ignore_notfound;
        int r;

        assert(path);
        assert(acc);

        if (node[0] == '-') {
                /* Non-existent paths starting with "-" must be silently ignored */
                node++;
                ignore_notfound = true;
        } else
                ignore_notfound = false;

        if (stat(node, &st) < 0) {
                if (errno == ENOENT && ignore_notfound)
                        return 0;

                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}
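
/* For illustration: whitelisting /dev/null with access "rwm" writes the string
 * "c 1:3 rwm" to devices.allow: device type, major:minor, then the access
 * flags (read, write, mknod). The 1:3 pair assumes the conventional /dev/null
 * device numbers. */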

static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(IN_SET(type, 'b', 'c'));

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        return log_warning_errno(errno, "Failed to read /proc/devices: %m");
}
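
/* For reference, /proc/devices is organized in sections of "major name"
 * entries, e.g.:
 *
 *   Character devices:
 *     1 mem
 *   136 pts
 *
 *   Block devices:
 *     8 sd
 *
 * so a DeviceAllow=char-pts rw setting matches the "pts" entry above and
 * results in "c 136:* rw" being written to devices.allow. (The major numbers
 * shown are the customary ones and may differ per system.) */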

static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
        return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
}

static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
        return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
                c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
}

static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_cpu_weight;
        else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->cpu_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->startup_cpu_shares;
        else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->cpu_shares;
        else
                return CGROUP_CPU_SHARES_DEFAULT;
}

static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", weight);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.weight: %m");

        if (quota != USEC_INFINITY)
                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
                         quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
        else
                xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);

        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.max: %m");
}
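
/* Worked example: CPUQuotaPerSecUsec=200ms (i.e. CPUQuota=20%) becomes
 * 200000 * 100000 / 1000000 = 20000, so "20000 100000" is written to cpu.max;
 * with no quota configured the file gets "max 100000". */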

static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", shares);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.shares: %m");

        xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_period_us: %m");

        if (quota != USEC_INFINITY) {
                xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
        } else
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_quota_us: %m");
}

static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
        return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
        return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
}
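
/* Worked example, assuming the usual defaults CGROUP_CPU_SHARES_DEFAULT=1024
 * and CGROUP_WEIGHT_DEFAULT=100: legacy CPUShares=512 translates to unified
 * weight 512 * 100 / 1024 = 50, and unified weight 200 translates back to
 * shares 200 * 1024 / 100 = 2048, clamped to the valid range in both
 * directions. */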

static bool cgroup_context_has_io_config(CGroupContext *c) {
        return c->io_accounting ||
                c->io_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
                c->io_device_weights ||
                c->io_device_limits;
}

static bool cgroup_context_has_blockio_config(CGroupContext *c) {
        return c->blockio_accounting ||
                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->blockio_device_weights ||
                c->blockio_device_bandwidths;
}

static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_io_weight;
        else if (c->io_weight != CGROUP_WEIGHT_INVALID)
                return c->io_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->startup_blockio_weight;
        else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->blockio_weight;
        else
                return CGROUP_BLKIO_WEIGHT_DEFAULT;
}

static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
        return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
        return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
}

static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
        r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.weight: %m");
}

static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.weight_device: %m");
}

static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        CGroupIOLimitType type;
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
                if (limits[type] != cgroup_io_limit_defaults[type]) {
                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
                        n++;
                } else {
                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
                }
        }

        xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
        r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.max: %m");
        return n;
}
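
/* Example io.max line for a device with major:minor 8:0, given
 * IOReadBandwidthMax=1M and nothing else configured:
 *
 *   8:0 rbps=1048576 wbps=max riops=max wiops=max
 *
 * (the 8:0 numbers are illustrative). */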

static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        if (rbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.read_bps_device: %m");

        if (wbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.write_bps_device: %m");

        return n;
}

static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
        return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
}

static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
        int r;

        if (v != CGROUP_LIMIT_MAX)
                xsprintf(buf, "%" PRIu64 "\n", v);

        r = cg_set_attribute("memory", u->cgroup_path, file, buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set %s: %m", file);
}
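
/* E.g. MemoryMax=512M ends up as "536870912" in memory.max, while an
 * unconfigured limit (CGROUP_LIMIT_MAX) keeps the attribute at the literal
 * string "max". */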

static void cgroup_apply_firewall(Unit *u, CGroupContext *c) {
        int r;

        if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
                                    * not recursive we don't ever touch the bpf on them */
                return;

        r = bpf_firewall_compile(u);
        if (r < 0)
                return;

        (void) bpf_firewall_install(u);
        return;
}

static void cgroup_context_apply(
                Unit *u,
                CGroupMask apply_mask,
                bool apply_bpf,
                ManagerState state) {

        const char *path;
        CGroupContext *c;
        bool is_root;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        path = u->cgroup_path;

        assert(c);
        assert(path);

        /* Nothing to do? Exit early! */
        if (apply_mask == 0 && !apply_bpf)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
                bool has_weight, has_shares;

                has_weight = cgroup_context_has_cpu_weight(c);
                has_shares = cgroup_context_has_cpu_shares(c);

                if (cg_all_unified() > 0) {
                        uint64_t weight;

                        if (has_weight)
                                weight = cgroup_context_cpu_weight(c, state);
                        else if (has_shares) {
                                uint64_t shares = cgroup_context_cpu_shares(c, state);

                                weight = cgroup_cpu_shares_to_weight(shares);

                                log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
                                                  shares, weight, path);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
                } else {
                        uint64_t shares;

                        if (has_weight) {
                                uint64_t weight = cgroup_context_cpu_weight(c, state);

                                shares = cgroup_cpu_weight_to_shares(weight);

                                log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
                                                  weight, shares, path);
                        } else if (has_shares)
                                shares = cgroup_context_cpu_shares(c, state);
                        else
                                shares = CGROUP_CPU_SHARES_DEFAULT;

                        cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
                }
        }

        if (apply_mask & CGROUP_MASK_IO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io)
                                weight = cgroup_context_io_weight(c, state);
                        else if (has_blockio) {
                                uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);

                                weight = cgroup_weight_blkio_to_io(blkio_weight);

                                log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
                                                  blkio_weight, weight);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        xsprintf(buf, "default %" PRIu64 "\n", weight);
                        r = cg_set_attribute("io", path, "io.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set io.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights)
                                        cgroup_apply_io_device_weight(u, w->path, w->weight);
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                        weight = cgroup_weight_blkio_to_io(w->weight);

                                        log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_io_device_weight(u, w->path, weight);
                                }
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
                                uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
                                CGroupIOLimitType type;

                                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                                        limits[type] = cgroup_io_limit_defaults[type];

                                limits[CGROUP_IO_RBPS_MAX] = b->rbps;
                                limits[CGROUP_IO_WBPS_MAX] = b->wbps;

                                log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
                                                  b->rbps, b->wbps, b->path);

                                if (!cgroup_apply_io_device_limit(u, b->path, limits))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                        }
                }
        }

        if (apply_mask & CGROUP_MASK_BLKIO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io) {
                                uint64_t io_weight = cgroup_context_io_weight(c, state);

                                weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));

                                log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
                                                  io_weight, weight);
                        } else if (has_blockio)
                                weight = cgroup_context_blkio_weight(c, state);
                        else
                                weight = CGROUP_BLKIO_WEIGHT_DEFAULT;

                        xsprintf(buf, "%" PRIu64 "\n", weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set blkio.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights) {
                                        weight = cgroup_weight_io_to_blkio(w->weight);

                                        log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_blkio_device_weight(u, w->path, weight);
                                }
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                                        cgroup_apply_blkio_device_weight(u, w->path, w->weight);
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
                                                  l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);

                                if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
                                if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                }
        }

        if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (cg_all_unified() > 0) {
                        uint64_t max, swap_max = CGROUP_LIMIT_MAX;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                max = c->memory_max;
                                swap_max = c->memory_swap_max;
                        } else {
                                max = c->memory_limit;

                                if (max != CGROUP_LIMIT_MAX)
                                        log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
                        }

                        cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
                        cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
                        cgroup_apply_unified_memory_limit(u, "memory.max", max);
                        cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
                } else {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];
                        uint64_t val;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                val = c->memory_max;
                                log_cgroup_compat(u, "Applying MemoryMax %" PRIu64 " as MemoryLimit", val);
                        } else
                                val = c->memory_limit;

                        if (val == CGROUP_LIMIT_MAX)
                                strncpy(buf, "-1\n", sizeof(buf));
                        else
                                xsprintf(buf, "%" PRIu64 "\n", val);

                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set memory.limit_in_bytes: %m");
                }
        }

        if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to reset devices.list: %m");

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0" /* /dev/pts/ptmx may not be duplicated, but accessed */
                                /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
                                "-/run/systemd/inaccessible/chr\0" "rwm\0"
                                "-/run/systemd/inaccessible/blk\0" "rwm\0";

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4], *val;
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (path_startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if ((val = startswith(a->path, "block-")))
                                whitelist_major(path, val, 'b', acc);
                        else if ((val = startswith(a->path, "char-")))
                                whitelist_major(path, val, 'c', acc);
                        else
                                log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) {

                if (c->tasks_max != CGROUP_LIMIT_MAX) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to set pids.max: %m");
        }

        if (apply_bpf)
                cgroup_apply_firewall(u, c);
}

CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            cgroup_context_has_cpu_weight(c) ||
            cgroup_context_has_cpu_shares(c) ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
                mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != CGROUP_LIMIT_MAX ||
            cgroup_context_has_unified_memory_config(c))
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        return mask;
}

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        return cgroup_context_get_mask(c);
}

CGroupMask unit_get_delegate_mask(Unit *u) {
        CGroupContext *c;

        /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
         * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */

        if (u->type == UNIT_SLICE)
                return 0;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!c->delegate)
                return 0;

        if (cg_all_unified() <= 0) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (e && !exec_context_maintains_privileges(e))
                        return 0;
        }

        return c->delegate_controllers;
}

CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = unit_get_delegate_mask(u);

        if (u->type == UNIT_SLICE) {
                void *v;
                Unit *member;
                Iterator i;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_subtree_mask(u);
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}
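
/* To summarize the mask getters above: "own" is what the unit itself needs,
 * "members" is the union of the subtrees of its children, "subtree" is
 * own | members, "siblings" is the members mask of the containing slice,
 * "target" is own | members | siblings (what to realize for this cgroup on
 * the legacy hierarchy), and "enable" is members (what to turn on for the
 * children via cgroup.subtree_control on the unified hierarchy). */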

bool unit_get_needs_bpf(Unit *u) {
        CGroupContext *c;
        Unit *p;
        assert(u);

        /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
         * moment. */
        if (u->type == UNIT_SLICE)
                return false;

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        if (c->ip_accounting ||
            c->ip_address_allow ||
            c->ip_address_deny)
                return true;

        /* If any parent slice has an IP access list defined, it applies too */
        for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
                c = unit_get_cgroup_context(p);
                if (!c)
                        return false;

                if (c->ip_address_allow ||
                    c->ip_address_deny)
                        return true;
        }

        return false;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/",
                               escaped);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped);
}
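
/* E.g. a unit "foo.service" in "bar.slice" maps to
 * "<cgroup-root>/bar.slice/foo.service"; cg_slice_to_path() expands nested
 * slice names, so "a-b.slice" becomes "a.slice/a-b.slice". */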

int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        } else
                p = NULL;

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 1;
}

int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *events = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}
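
/* On the unified hierarchy, cgroup.events is a flat key/value file whose
 * "populated" entry flips between 0 and 1 as the last process leaves or the
 * first process enters the cgroup's subtree; that transition is what the
 * IN_MODIFY watch above is interested in. */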

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;
        u->cgroup_enabled_mask = enable_mask;
        u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;
        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static void cgroup_xattr_apply(Unit *u) {
        char ids[SD_ID128_STRING_MAX];
        int r;

        assert(u);

        if (!MANAGER_IS_SYSTEM(u->manager))
                return;

        if (sd_id128_is_null(u->invocation_id))
                return;

        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
                         "trusted.invocation_id",
                         sd_id128_to_string(u->invocation_id, ids), 32,
                         0);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
}
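
/* The resulting attribute can be inspected from a shell, e.g. (path purely
 * illustrative):
 *
 *   getfattr -n trusted.invocation_id /sys/fs/cgroup/system.slice/foo.service
 *
 * Consumers such as journald can use it to map a cgroup back to a specific
 * service invocation. */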

static bool unit_has_mask_realized(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        assert(u);

        return u->cgroup_realized &&
                u->cgroup_realized_mask == target_mask &&
                u->cgroup_enabled_mask == enable_mask &&
                ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
                 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        bool needs_bpf, apply_bpf;
        int r;

        assert(u);

        if (u->in_cgroup_realize_queue) {
                LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
                u->in_cgroup_realize_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        enable_mask = unit_get_enable_mask(u);
        needs_bpf = unit_get_needs_bpf(u);

        if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
                return 0;

        /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
         * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
         * this will trickle down properly to cgroupfs. */
        apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(u, target_mask, apply_bpf, state);
        cgroup_xattr_apply(u);

        return 0;
}

static void unit_add_to_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (u->in_cgroup_realize_queue)
                return;

        LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = true;
}

unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        assert(m);

        state = manager_state(m);

        while ((i = m->cgroup_realize_queue)) {
                assert(i->in_cgroup_realize_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;
                void *v;

                HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m,
                                                   unit_get_target_mask(m),
                                                   unit_get_enable_mask(m),
                                                   unit_get_needs_bpf(m)))
                                continue;

                        unit_add_to_cgroup_realize_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_add_siblings_to_cgroup_realize_queue(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
efdb0237
LP
1653void unit_release_cgroup(Unit *u) {
1654 assert(u);
1655
1656 /* Forgets all cgroup details for this cgroup */
1657
1658 if (u->cgroup_path) {
1659 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1660 u->cgroup_path = mfree(u->cgroup_path);
1661 }
1662
1663 if (u->cgroup_inotify_wd >= 0) {
1664 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1665 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1666
1667 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1668 u->cgroup_inotify_wd = -1;
1669 }
1670}

void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
        u->cgroup_enabled_mask = 0;
}

int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid_cached();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}
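
/* Illustrative sketch, not part of the original source: callers get -ENXIO
 * if the unit has no cgroup, -ENODATA if more than one daemonized child
 * lives in the cgroup, and 0 otherwise, with *ret left at 0 if no candidate
 * was found at all:
 *
 *     pid_t main_pid;
 *     r = unit_search_main_pid(u, &main_pid);
 *     if (r >= 0 && main_pid > 0)
 *             // main_pid is the only daemonized child in the cgroup
 */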

static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}
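
/* Illustrative sketch, not part of the original source (paths are made up):
 * the recursion watches every PID below the given cgroup, depth first. For a
 * tree like
 *
 *     /foo.service
 *     /foo.service/sub
 *
 * the call unit_watch_pids_in_path(u, "/foo.service") first watches the PIDs
 * directly in "/foo.service", then recurses into "/foo.service/sub". The
 * first error is remembered in "ret" but does not stop the traversal. */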

int unit_watch_all_pids(Unit *u) {
        int r;

        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as a replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
        Manager *m = userdata;
        Unit *u;
        int r;

        assert(s);
        assert(m);

        u = m->cgroup_empty_queue;
        if (!u)
                return 0;

        assert(u->in_cgroup_empty_queue);
        u->in_cgroup_empty_queue = false;
        LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);

        if (m->cgroup_empty_queue) {
                /* More stuff queued, let's make sure we remain enabled */
                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
                if (r < 0)
                        log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
        }

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

void unit_add_to_cgroup_empty_queue(Unit *u) {
        int r;

        assert(u);

        /* Note that there are four different ways in which cgroup empty events reach us:
         *
         * 1. On the unified hierarchy we get an inotify event on the cgroup
         *
         * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
         *
         * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
         *
         * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
         *    soon as we get the first SIGCHLD, to deal with unreliable cgroup notifications.
         *
         * Regardless of which way we got the notification, we'll verify it here, and then add the unit to a separate
         * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
         * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
         * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
         * case for scope units). */

        if (u->in_cgroup_empty_queue)
                return;

        /* Let's verify that the cgroup is really empty */
        if (!u->cgroup_path)
                return;
        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
                return;
        }
        if (r == 0)
                return;

        LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
        u->in_cgroup_empty_queue = true;

        /* Trigger the defer event */
        r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
        if (r < 0)
                log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
}

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (IN_SET(errno, EINTR, EAGAIN))
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        unit_add_to_cgroup_empty_queue(u);
                }
        }
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        const char *scope_path;
        CGroupController c;
        int r, all_unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && MANAGER_IS_SYSTEM(m)) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;

        /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
         * easily prepend it everywhere. */
        delete_trailing_chars(m->cgroup_root, "/");

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        r = cg_unified_flush();
        if (r < 0)
                return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");

        all_unified = cg_all_unified();
        if (all_unified < 0)
                return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
        if (all_unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else {
                r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
                if (r < 0)
                        return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
                if (r > 0)
                        log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
                else
                        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
        }

        /* 3. Allocate cgroup empty defer event source */
        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
        r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
        if (r < 0)
                return log_error_errno(r, "Failed to create cgroup empty event source: %m");

        r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
        if (r < 0)
                return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");

        r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
        if (r < 0)
                return log_error_errno(r, "Failed to disable cgroup empty event source: %m");

        (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");

        /* 4. Install notifier inotify object, or agent */
        if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {

                /* In the unified hierarchy we can get cgroup empty notifications via inotify. */

                m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                safe_close(m->cgroup_inotify_fd);

                m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                if (m->cgroup_inotify_fd < 0)
                        return log_error_errno(errno, "Failed to create control group inotify object: %m");

                r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                if (r < 0)
                        return log_error_errno(r, "Failed to watch control group inotify object: %m");

                /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
                 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

        } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {

                /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
                 * since it does not generate events when control groups with children run empty.) */

                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                else if (r > 0)
                        log_debug("Installed release agent.");
                else if (r == 0)
                        log_debug("Release agent already installed.");
        }

        /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
        scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to create %s control group: %m", scope_path);

        /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
        r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
        if (r < 0)
                log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");

        /* 6. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);
        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0)
                return log_error_errno(errno, "Failed to open pin file: %m");

        /* 7. Always enable hierarchical support if it exists... */
        if (!all_unified && m->test_run_flags == 0)
                (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        /* 8. Figure out which controllers are supported, and log about it */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");
        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}
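
/* A note on ordering, added for clarity (assuming sd-event's usual
 * semantics, where numerically lower priorities dispatch first): with the
 * values chosen above, the "cgroup-empty" defer source
 * (SD_EVENT_PRIORITY_NORMAL-5) runs before the "cgroup-inotify" io source
 * (SD_EVENT_PRIORITY_NORMAL-4) whenever both are pending, so the empty queue
 * is drained before further inotify data is read. */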

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
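
/* Illustrative sketch, not part of the original source (the unit name is
 * made up): lookups walk the cgroup path upwards until a registered cgroup
 * matches, falling back to the root slice:
 *
 *     u = manager_get_unit_by_cgroup(m, "/system.slice/foo.service/sub");
 *     // tries "/system.slice/foo.service/sub",
 *     // then "/system.slice/foo.service", then "/system.slice",
 *     // and finally falls back to the -.slice unit
 */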

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        assert(m);

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}
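
/* Illustrative note, not part of the original source: PID-to-unit resolution
 * prefers the explicit watch maps over the cgroup-based lookup:
 *
 *     u = manager_get_unit_by_pid(m, pid);
 *     // order: PID 1 -> init.scope, then watch_pids1/watch_pids2,
 *     // then the unit owning the PID's cgroup
 */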

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
         * or from the --system instance */

        log_debug("Got cgroup empty notification for: %s", cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        unit_add_to_cgroup_empty_queue(u);
        return 1;
}

int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, memory_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                const char *keys[] = { "usage_usec", NULL };
                _cleanup_free_ char *val = NULL;
                uint64_t us;

                if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
                        return -ENODATA;

                r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
                if (r < 0)
                        return r;

                r = safe_atou64(val, &us);
                if (r < 0)
                        return r;

                ns = us * NSEC_PER_USEC;
        } else {
                if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                        return -ENODATA;

                r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
                if (r == -ENOENT)
                        return -ENODATA;
                if (r < 0)
                        return r;

                r = safe_atou64(v, &ns);
                if (r < 0)
                        return r;
        }

        *ret = ns;
        return 0;
}
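
/* Illustrative sketch, not part of the original source: on the unified
 * hierarchy the counter comes from the "usage_usec" field of cpu.stat and is
 * converted to nanoseconds, e.g. a cpu.stat line of
 *
 *     usage_usec 2500000
 *
 * yields ns = 2500000 * NSEC_PER_USEC = 2500000000. On the legacy hierarchy,
 * cpuacct.usage is already in nanoseconds and is parsed as-is. */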

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        assert(u);

        /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
         * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
         * call this function with a NULL return parameter. */

        if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
                return -ENODATA;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
                /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
                 * cached value. */

                if (ret)
                        *ret = u->cpu_usage_last;
                return 0;
        }
        if (r < 0)
                return r;

        if (ns > u->cpu_usage_base)
                ns -= u->cpu_usage_base;
        else
                ns = 0;

        u->cpu_usage_last = ns;
        if (ret)
                *ret = ns;

        return 0;
}
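
/* Illustrative sketch, not part of the original source: passing NULL caches
 * the counter without returning it, which is exactly what unit_prune_cgroup()
 * does before destroying the cgroup:
 *
 *     (void) unit_get_cpu_usage(u, NULL);   // refresh u->cpu_usage_last
 *     ...
 *     r = unit_get_cpu_usage(u, &ns);       // later reads fall back to the cache
 */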

int unit_get_ip_accounting(
                Unit *u,
                CGroupIPAccountingMetric metric,
                uint64_t *ret) {

        uint64_t value;
        int fd, r;

        assert(u);
        assert(metric >= 0);
        assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
        assert(ret);

        /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
         * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero
         * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup
         * filters. */
        if (u->type == UNIT_SLICE)
                return -ENODATA;

        if (!UNIT_CGROUP_BOOL(u, ip_accounting))
                return -ENODATA;

        fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        if (fd < 0)
                return -ENODATA;

        if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
                r = bpf_firewall_read_accounting(fd, &value, NULL);
        else
                r = bpf_firewall_read_accounting(fd, NULL, &value);
        if (r < 0)
                return r;

        /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
         * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
         * ip_accounting_extra[] field, and add them in here transparently. */

        *ret = value + u->ip_accounting_extra[metric];

        return r;
}
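
/* Illustrative sketch, not part of the original source: reading the total
 * egress byte counter for a unit, including the ip_accounting_extra[]
 * counters carried over from before a daemon reload:
 *
 *     uint64_t bytes;
 *     r = unit_get_ip_accounting(u, CGROUP_IP_EGRESS_BYTES, &bytes);
 *     if (r >= 0)
 *             log_unit_debug(u, "Egress: %" PRIu64 " bytes", bytes);
 */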

int unit_reset_cpu_accounting(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        u->cpu_usage_last = NSEC_INFINITY;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpu_usage_base = 0;
                return r;
        }

        u->cpu_usage_base = ns;
        return 0;
}

int unit_reset_ip_accounting(Unit *u) {
        int r = 0, q = 0;

        assert(u);

        if (u->ip_accounting_ingress_map_fd >= 0)
                r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);

        if (u->ip_accounting_egress_map_fd >= 0)
                q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);

        zero(u->ip_accounting_extra);

        return r < 0 ? r : q;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        /* always invalidate compat pairs together */
        if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
                m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
                m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;

        if ((u->cgroup_realized_mask & m) == 0)
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_realize_queue(u);
}
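
/* Illustrative sketch, not part of the original source: because settings are
 * translated between the unified and legacy hierarchies, the compat pairs are
 * widened before invalidation:
 *
 *     unit_invalidate_cgroup(u, CGROUP_MASK_IO);
 *     // effectively invalidates CGROUP_MASK_IO | CGROUP_MASK_BLKIO
 */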

void unit_invalidate_cgroup_bpf(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED)
                return;

        u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
        unit_add_to_cgroup_realize_queue(u);

        /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
         * list of our children includes our own. */
        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;
                void *v;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        unit_invalidate_cgroup_bpf(member);
                }
        }
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);