/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "alloc-util.h"
#include "blockdev-util.h"
#include "bpf-firewall.h"
#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "special.h"
#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

static void cgroup_compat_warn(void) {
        static bool cgroup_compat_warned = false;

        if (cgroup_compat_warned)
                return;

        log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
        cgroup_compat_warned = true;
}

#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)

4ad49000
LP
57void cgroup_context_init(CGroupContext *c) {
58 assert(c);
59
60 /* Initialize everything to the kernel defaults, assuming the
61 * structure is preinitialized to 0 */
62
66ebf6c0
TH
63 c->cpu_weight = CGROUP_WEIGHT_INVALID;
64 c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
65 c->cpu_quota_per_sec_usec = USEC_INFINITY;
66
d53d9474
LP
67 c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
68 c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
d53d9474 69
da4d897e
TH
70 c->memory_high = CGROUP_LIMIT_MAX;
71 c->memory_max = CGROUP_LIMIT_MAX;
96e131ea 72 c->memory_swap_max = CGROUP_LIMIT_MAX;
da4d897e
TH
73
74 c->memory_limit = CGROUP_LIMIT_MAX;
b2f8b02e 75
13c31542
TH
76 c->io_weight = CGROUP_WEIGHT_INVALID;
77 c->startup_io_weight = CGROUP_WEIGHT_INVALID;
78
d53d9474
LP
79 c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
80 c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
81
82 c->tasks_max = (uint64_t) -1;
4ad49000 83}
8e274523 84
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->io_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
        assert(c);
        assert(l);

        LIST_REMOVE(device_limits, c->io_device_limits, l);
        free(l->path);
        free(l);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->io_device_weights)
                cgroup_context_free_io_device_weight(c, c->io_device_weights);

        while (c->io_device_limits)
                cgroup_context_free_io_device_limit(c, c->io_device_limits);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);

        c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
        c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupIODeviceLimit *il;
        CGroupIODeviceWeight *iw;
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        IPAddressAccessItem *iaai;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sIOAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sIPAccounting=%s\n"
                "%sCPUWeight=%" PRIu64 "\n"
                "%sStartupCPUWeight=%" PRIu64 "\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sIOWeight=%" PRIu64 "\n"
                "%sStartupIOWeight=%" PRIu64 "\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLow=%" PRIu64 "\n"
                "%sMemoryHigh=%" PRIu64 "\n"
                "%sMemoryMax=%" PRIu64 "\n"
                "%sMemorySwapMax=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->io_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, yes_no(c->ip_accounting),
                prefix, c->cpu_weight,
                prefix, c->startup_cpu_weight,
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->io_weight,
                prefix, c->startup_io_weight,
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_low,
                prefix, c->memory_high,
                prefix, c->memory_max,
                prefix, c->memory_swap_max,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        if (c->delegate) {
                _cleanup_free_ char *t = NULL;

                (void) cg_mask_to_string(c->delegate_controllers, &t);

                fprintf(f, "%sDelegateControllers=%s\n",
                        prefix,
                        strempty(t));
        }

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, iw, c->io_device_weights)
                fprintf(f,
                        "%sIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        iw->path,
                        iw->weight);

        LIST_FOREACH(device_limits, il, c->io_device_limits) {
                char buf[FORMAT_BYTES_MAX];
                CGroupIOLimitType type;

                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                        if (il->limits[type] != cgroup_io_limit_defaults[type])
                                fprintf(f,
                                        "%s%s=%s %s\n",
                                        prefix,
                                        cgroup_io_limit_type_to_string(type),
                                        il->path,
                                        format_bytes(buf, sizeof(buf), il->limits[type]));
        }

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                if (b->rbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOReadBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->rbps));
                if (b->wbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOWriteBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->wbps));
        }

        LIST_FOREACH(items, iaai, c->ip_address_allow) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }

        LIST_FOREACH(items, iaai, c->ip_address_deny) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }
}

static int lookup_block_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                (void) block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        bool ignore_notfound;
        int r;

        assert(path);
        assert(acc);

        if (node[0] == '-') {
                /* Non-existent paths starting with "-" must be silently ignored */
                node++;
                ignore_notfound = true;
        } else
                ignore_notfound = false;

        if (stat(node, &st) < 0) {
                if (errno == ENOENT && ignore_notfound)
                        return 0;

                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}

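/* For reference, the attribute written above is a single line of the form
 * "<type> <major>:<minor> <access>". Whitelisting /dev/null (character
 * device 1:3) for read, write and mknod would thus write:
 *
 *     c 1:3 rwm
 */
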
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(IN_SET(type, 'b', 'c'));

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        return log_warning_errno(errno, "Failed to read /proc/devices: %m");
}

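/* Illustrative excerpt of the /proc/devices layout parsed above:
 *
 *     Character devices:
 *       1 mem
 *     136 pts
 *
 *     Block devices:
 *       8 sd
 *
 * A match on the "pts" entry with type 'c' and access "rw" results in the
 * wildcard entry "c 136:* rw", covering all minors of that major. */
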
static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
        return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
}

static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
        return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
                c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
}

static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_cpu_weight;
        else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->cpu_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->startup_cpu_shares;
        else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->cpu_shares;
        else
                return CGROUP_CPU_SHARES_DEFAULT;
}

static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", weight);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.weight: %m");

        if (quota != USEC_INFINITY)
                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
                         quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
        else
                xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);

        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.max: %m");
}

static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", shares);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.shares: %m");

        xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_period_us: %m");

        if (quota != USEC_INFINITY) {
                xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
        } else
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_quota_us: %m");
}

static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
        return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
        return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
}

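/* Worked example for the two conversions above, using the constants from
 * cgroup-util.h (weight range [1, 10000], default 100; shares range
 * [2, 262144], default 1024): CPUShares=512 maps to cpu.weight=50, since
 * 512 * 100 / 1024 = 50, and the two defaults map onto each other. Quota
 * handling is analogous in both appliers: with the fixed 100ms period,
 * CPUQuota=20% (i.e. 200ms of CPU time per second) becomes "20000 100000"
 * in cpu.max, or cpu.cfs_quota_us=20000 with cpu.cfs_period_us=100000. */
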
static bool cgroup_context_has_io_config(CGroupContext *c) {
        return c->io_accounting ||
                c->io_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
                c->io_device_weights ||
                c->io_device_limits;
}

static bool cgroup_context_has_blockio_config(CGroupContext *c) {
        return c->blockio_accounting ||
                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->blockio_device_weights ||
                c->blockio_device_bandwidths;
}

static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_io_weight;
        else if (c->io_weight != CGROUP_WEIGHT_INVALID)
                return c->io_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->startup_blockio_weight;
        else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->blockio_weight;
        else
                return CGROUP_BLKIO_WEIGHT_DEFAULT;
}

static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
        return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
        return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
}

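/* Illustrative conversion: the legacy blkio default weight of 500 maps to
 * the unified io.weight default of 100, so BlockIOWeight=1000 becomes
 * IOWeight=200, clamped to [1, 10000]; the reverse direction clamps into
 * the narrower legacy range [10, 1000]. */
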
static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
        r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.weight: %m");
}

static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.weight_device: %m");
}

static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        CGroupIOLimitType type;
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
                if (limits[type] != cgroup_io_limit_defaults[type]) {
                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
                        n++;
                } else {
                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
                }
        }

        xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
        r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.max: %m");
        return n;
}

static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        if (rbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.read_bps_device: %m");

        if (wbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.write_bps_device: %m");

        return n;
}

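/* For reference, a line written to io.max by cgroup_apply_io_device_limit()
 * above, limiting reads on a device with major:minor 8:0 (e.g. /dev/sda) to
 * 1 MiB/s while leaving the other three knobs unset, looks like:
 *
 *     8:0 rbps=1048576 wbps=max riops=max wiops=max
 */
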
static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
        return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
}

static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
        int r;

        if (v != CGROUP_LIMIT_MAX)
                xsprintf(buf, "%" PRIu64 "\n", v);

        r = cg_set_attribute("memory", u->cgroup_path, file, buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set %s: %m", file);
}

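/* Illustrative: MemoryMax=1G makes this write "1073741824" to memory.max,
 * while the CGROUP_LIMIT_MAX sentinel leaves the preinitialized "max"
 * string in place, which the kernel interprets as "no limit". The same
 * helper is reused for memory.low, memory.high and memory.swap.max. */
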
static void cgroup_apply_firewall(Unit *u) {
        int r;

        assert(u);

        if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
                                    * not recursive we don't ever touch the bpf on them */
                return;

        r = bpf_firewall_compile(u);
        if (r < 0)
                return;

        (void) bpf_firewall_install(u);
        return;
}

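/* Background: bpf_firewall_compile() turns the unit's IPAddressAllow=/
 * IPAddressDeny= lists into BPF programs, and bpf_firewall_install()
 * attaches them to the unit's cgroup as ingress/egress filters. That is
 * why slices are skipped above: with non-recursive cgroup/BPF the filters
 * must sit on the leaf cgroups that actually contain processes. */
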
static void cgroup_context_apply(
                Unit *u,
                CGroupMask apply_mask,
                bool apply_bpf,
                ManagerState state) {

        const char *path;
        CGroupContext *c;
        bool is_root;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        path = u->cgroup_path;

        assert(c);
        assert(path);

        /* Nothing to do? Exit early! */
        if (apply_mask == 0 && !apply_bpf)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
                bool has_weight, has_shares;

                has_weight = cgroup_context_has_cpu_weight(c);
                has_shares = cgroup_context_has_cpu_shares(c);

                if (cg_all_unified() > 0) {
                        uint64_t weight;

                        if (has_weight)
                                weight = cgroup_context_cpu_weight(c, state);
                        else if (has_shares) {
                                uint64_t shares = cgroup_context_cpu_shares(c, state);

                                weight = cgroup_cpu_shares_to_weight(shares);

                                log_cgroup_compat(u, "Applying [Startup]CPUShares %" PRIu64 " as [Startup]CPUWeight %" PRIu64 " on %s",
                                                  shares, weight, path);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
                } else {
                        uint64_t shares;

                        if (has_weight) {
                                uint64_t weight = cgroup_context_cpu_weight(c, state);

                                shares = cgroup_cpu_weight_to_shares(weight);

                                log_cgroup_compat(u, "Applying [Startup]CPUWeight %" PRIu64 " as [Startup]CPUShares %" PRIu64 " on %s",
                                                  weight, shares, path);
                        } else if (has_shares)
                                shares = cgroup_context_cpu_shares(c, state);
                        else
                                shares = CGROUP_CPU_SHARES_DEFAULT;

                        cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
                }
        }

        if (apply_mask & CGROUP_MASK_IO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io)
                                weight = cgroup_context_io_weight(c, state);
                        else if (has_blockio) {
                                uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);

                                weight = cgroup_weight_blkio_to_io(blkio_weight);

                                log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
                                                  blkio_weight, weight);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        xsprintf(buf, "default %" PRIu64 "\n", weight);
                        r = cg_set_attribute("io", path, "io.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set io.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights)
                                        cgroup_apply_io_device_weight(u, w->path, w->weight);
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                        weight = cgroup_weight_blkio_to_io(w->weight);

                                        log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_io_device_weight(u, w->path, weight);
                                }
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
                                uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
                                CGroupIOLimitType type;

                                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                                        limits[type] = cgroup_io_limit_defaults[type];

                                limits[CGROUP_IO_RBPS_MAX] = b->rbps;
                                limits[CGROUP_IO_WBPS_MAX] = b->wbps;

                                log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
                                                  b->rbps, b->wbps, b->path);

                                if (!cgroup_apply_io_device_limit(u, b->path, limits))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                        }
                }
        }

        if (apply_mask & CGROUP_MASK_BLKIO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io) {
                                uint64_t io_weight = cgroup_context_io_weight(c, state);

                                weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));

                                log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
                                                  io_weight, weight);
                        } else if (has_blockio)
                                weight = cgroup_context_blkio_weight(c, state);
                        else
                                weight = CGROUP_BLKIO_WEIGHT_DEFAULT;

                        xsprintf(buf, "%" PRIu64 "\n", weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set blkio.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights) {
                                        weight = cgroup_weight_io_to_blkio(w->weight);

                                        log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_blkio_device_weight(u, w->path, weight);
                                }
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                                        cgroup_apply_blkio_device_weight(u, w->path, w->weight);
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
                                                  l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);

                                if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
                                if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                }
        }

        if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (cg_all_unified() > 0) {
                        uint64_t max, swap_max = CGROUP_LIMIT_MAX;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                max = c->memory_max;
                                swap_max = c->memory_swap_max;
                        } else {
                                max = c->memory_limit;

                                if (max != CGROUP_LIMIT_MAX)
                                        log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
                        }

                        cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
                        cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
                        cgroup_apply_unified_memory_limit(u, "memory.max", max);
                        cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
                } else {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];
                        uint64_t val;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                val = c->memory_max;
                                log_cgroup_compat(u, "Applying MemoryMax %" PRIu64 " as MemoryLimit", val);
                        } else
                                val = c->memory_limit;

                        if (val == CGROUP_LIMIT_MAX)
                                strncpy(buf, "-1\n", sizeof(buf));
                        else
                                xsprintf(buf, "%" PRIu64 "\n", val);

                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set memory.limit_in_bytes: %m");
                }
        }

        if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to reset devices.list: %m");

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/ptmx\0" "rwm\0"
                                /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
                                "-/run/systemd/inaccessible/chr\0" "rwm\0"
                                "-/run/systemd/inaccessible/blk\0" "rwm\0";

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        /* PTS (/dev/pts) devices may not be duplicated, but accessed */
                        whitelist_major(path, "pts", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4], *val;
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (path_startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if ((val = startswith(a->path, "block-")))
                                whitelist_major(path, val, 'b', acc);
                        else if ((val = startswith(a->path, "char-")))
                                whitelist_major(path, val, 'c', acc);
                        else
                                log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) {

                if (c->tasks_max != CGROUP_LIMIT_MAX) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to set pids.max: %m");
        }

        if (apply_bpf)
                cgroup_apply_firewall(u);
}

CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            cgroup_context_has_cpu_weight(c) ||
            cgroup_context_has_cpu_shares(c) ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
                mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != CGROUP_LIMIT_MAX ||
            cgroup_context_has_unified_memory_config(c))
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        return mask;
}

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
}

CGroupMask unit_get_delegate_mask(Unit *u) {
        CGroupContext *c;

        /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
         * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */

        if (u->type == UNIT_SLICE)
                return 0;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!c->delegate)
                return 0;

        if (cg_all_unified() <= 0) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (e && !exec_context_maintains_privileges(e))
                        return 0;
        }

        return c->delegate_controllers;
}

CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                void *v;
                Unit *member;
                Iterator i;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_subtree_mask(u); /* we are the top-level slice */
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

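/* Example of how these masks compose, for a hypothetical a.service in
 * b.slice where a.service sets MemoryMax= and a sibling sets TasksMax=
 * (and nothing else is configured): a.service's own mask is MEMORY, its
 * siblings mask is MEMORY|PIDS, so its target mask becomes MEMORY|PIDS,
 * intersected with what the kernel supports; b.slice's enable mask is
 * likewise MEMORY|PIDS, which on the unified hierarchy ends up in its
 * cgroup.subtree_control. */
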
bool unit_get_needs_bpf(Unit *u) {
        CGroupContext *c;
        Unit *p;
        assert(u);

        /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
         * moment. */
        if (u->type == UNIT_SLICE)
                return false;

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        if (c->ip_accounting ||
            c->ip_address_allow ||
            c->ip_address_deny)
                return true;

        /* If any parent slice has an IP access list defined, it applies too */
        for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
                c = unit_get_cgroup_context(p);
                if (!c)
                        return false;

                if (c->ip_address_allow ||
                    c->ip_address_deny)
                        return true;
        }

        return false;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/",
                               escaped);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped);
}

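/* Illustrative result: cg_slice_to_path() expands nested slice names along
 * their dashes, so a hypothetical foo.service placed in foo-bar.slice ends
 * up at "<cgroup_root>/foo.slice/foo-bar.slice/foo.service". */
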
int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        } else
                p = NULL;

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 1;
}

int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *events = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}

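/* Background: on the unified hierarchy cgroup.events is a keyed file
 * containing e.g. "populated 1", and the kernel emits an inotify IN_MODIFY
 * event whenever a key changes. Watching it is how the manager learns that
 * a unit's cgroup ran empty, replacing the legacy release_agent
 * notification scheme. */
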
int unit_pick_cgroup_path(Unit *u) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(u);

        if (u->cgroup_path)
                return 0;

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        path = unit_default_cgroup_path(u);
        if (!path)
                return log_oom();

        r = unit_set_cgroup_path(u, path);
        if (r == -EEXIST)
                return log_unit_error_errno(u, r, "Control group %s exists already.", path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);

        return 0;
}

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* Figure out our cgroup path */
        r = unit_pick_cgroup_path(u);
        if (r < 0)
                return r;

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;
        u->cgroup_enabled_mask = enable_mask;
        u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;
        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static void cgroup_xattr_apply(Unit *u) {
        char ids[SD_ID128_STRING_MAX];
        int r;

        assert(u);

        if (!MANAGER_IS_SYSTEM(u->manager))
                return;

        if (sd_id128_is_null(u->invocation_id))
                return;

        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
                         "trusted.invocation_id",
                         sd_id128_to_string(u->invocation_id, ids), 32,
                         0);
        if (r < 0)
                log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
}

static bool unit_has_mask_realized(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        assert(u);

        return u->cgroup_realized &&
                u->cgroup_realized_mask == target_mask &&
                u->cgroup_enabled_mask == enable_mask &&
                ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
                 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
}

static void unit_add_to_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (u->in_cgroup_realize_queue)
                return;

        LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = true;
}

static void unit_remove_from_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (!u->in_cgroup_realize_queue)
                return;

        LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = false;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        bool needs_bpf, apply_bpf;
        int r;

        assert(u);

        unit_remove_from_cgroup_realize_queue(u);

        target_mask = unit_get_target_mask(u);
        enable_mask = unit_get_enable_mask(u);
        needs_bpf = unit_get_needs_bpf(u);

        if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
                return 0;

        /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
         * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
         * this will trickle down properly to cgroupfs. */
        apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(u, target_mask, apply_bpf, state);
        cgroup_xattr_apply(u);

        return 0;
}

unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        assert(m);

        state = manager_state(m);

        while ((i = m->cgroup_realize_queue)) {
                assert(i->in_cgroup_realize_queue);

                if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
                        /* Maybe things changed, and the unit is not actually active anymore? */
                        unit_remove_from_cgroup_realize_queue(i);
                        continue;
                }

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;
                void *v;

                HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m,
                                                   unit_get_target_mask(m),
                                                   unit_get_enable_mask(m),
                                                   unit_get_needs_bpf(m)))
                                continue;

                        unit_add_to_cgroup_realize_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as many resources as our whole group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_add_siblings_to_cgroup_realize_queue(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

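/* Illustrative arithmetic only (hypothetical helper, not used by the manager):
 * with weight-based controllers every realized sibling cgroup receives
 * weight/total of the contended resource. E.g. three siblings at weight 100
 * each get ~333 per mille; if one sibling were left unrealized, its processes
 * would instead compete directly inside the parent and each be weighted like
 * a whole sibling, which is the unevenness described above. */
static uint64_t example_share_per_mille(uint64_t weight, uint64_t total_weight) {
        if (total_weight == 0)
                return 0;

        return weight * 1000 / total_weight;
}
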
void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring: %m", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}

void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
        u->cgroup_enabled_mask = 0;
}

int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid_cached();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                           in this group, so we don't know what process
                           is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}

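/* Usage sketch (hypothetical caller): guess a unit's main PID, tolerating the
 * documented error cases of the function above. */
static void example_log_main_pid(Unit *u) {
        pid_t pid;

        if (unit_search_main_pid(u, &pid) < 0)
                return; /* no cgroup (-ENXIO), several candidates (-ENODATA), or read error */

        log_unit_debug(u, "Guessed main PID: "PID_FMT, pid);
}
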
static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}

int unit_synthesize_cgroup_empty_event(Unit *u) {
        int r;

        assert(u);

        /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
         * support for non-unified systems where notifications aren't reliable, and hence we need to take whatever we
         * can get as a notification source as soon as we stop having any useful PIDs to watch for. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we have reliable notifications, and don't need this */
                return 0;

        if (!set_isempty(u->pids))
                return 0;

        unit_add_to_cgroup_empty_queue(u);
        return 0;
}

int unit_watch_all_pids(Unit *u) {
        int r;

        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

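/* Usage sketch matching the commit this blame is shown for: on the legacy
 * hierarchy, a unit's SIGCHLD handling would re-enumerate the cgroup so every
 * remaining process is watched, then synthesize an empty event once nothing
 * useful is left to watch. (Hypothetical, simplified call site.) */
static void example_refresh_pid_watches(Unit *u) {
        (void) unit_watch_all_pids(u);
        (void) unit_synthesize_cgroup_empty_event(u);
}
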
static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
        Manager *m = userdata;
        Unit *u;
        int r;

        assert(s);
        assert(m);

        u = m->cgroup_empty_queue;
        if (!u)
                return 0;

        assert(u->in_cgroup_empty_queue);
        u->in_cgroup_empty_queue = false;
        LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);

        if (m->cgroup_empty_queue) {
                /* More stuff queued, let's make sure we remain enabled */
                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
                if (r < 0)
                        log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
        }

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

void unit_add_to_cgroup_empty_queue(Unit *u) {
        int r;

        assert(u);

        /* Note that there are four different ways how cgroup empty events reach us:
         *
         * 1. On the unified hierarchy we get an inotify event on the cgroup
         *
         * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
         *
         * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
         *
         * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
         *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
         *
         * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
         * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
         * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
         * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
         * case for scope units). */

        if (u->in_cgroup_empty_queue)
                return;

        /* Let's verify that the cgroup is really empty */
        if (!u->cgroup_path)
                return;
        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
                return;
        }
        if (r == 0)
                return;

        LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
        u->in_cgroup_empty_queue = true;

        /* Trigger the defer event */
        r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
        if (r < 0)
                log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
}

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (IN_SET(errno, EINTR, EAGAIN))
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        unit_add_to_cgroup_empty_queue(u);
                }
        }
}

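/* Minimal standalone sketch (plain libc; hypothetical path handling): how a
 * unified-hierarchy cgroup empty notification can be consumed outside of
 * PID 1. The kernel modifies the cgroup.events file whenever the populated
 * state of the cgroup flips, so an IN_MODIFY watch on it suffices. Assumes
 * <limits.h> is available for PATH_MAX. */
static int example_watch_cgroup_events(const char *cgroup_dir) {
        char p[PATH_MAX];
        int fd, wd;

        fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
        if (fd < 0)
                return -errno;

        snprintf(p, sizeof(p), "%s/cgroup.events", cgroup_dir);
        wd = inotify_add_watch(fd, p, IN_MODIFY);
        if (wd < 0) {
                int r = -errno;
                safe_close(fd);
                return r;
        }

        return fd; /* the caller reads struct inotify_event records, as above */
}
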
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        const char *scope_path;
        CGroupController c;
        int r, all_unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && MANAGER_IS_SYSTEM(m)) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;

        /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
         * easily prepend it everywhere. */
        delete_trailing_chars(m->cgroup_root, "/");

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        r = cg_unified_flush();
        if (r < 0)
                return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");

        all_unified = cg_all_unified();
        if (all_unified < 0)
                return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
        if (all_unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else {
                r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
                if (r < 0)
                        return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
                if (r > 0)
                        log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
                else
                        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
        }

        /* 3. Allocate cgroup empty defer event source */
        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
        r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
        if (r < 0)
                return log_error_errno(r, "Failed to create cgroup empty event source: %m");

        r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
        if (r < 0)
                return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");

        r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
        if (r < 0)
                return log_error_errno(r, "Failed to disable cgroup empty event source: %m");

        (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");

        /* 4. Install notifier inotify object, or agent */
        if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {

                /* In the unified hierarchy we can get cgroup empty notifications via inotify. */

                m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                safe_close(m->cgroup_inotify_fd);

                m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                if (m->cgroup_inotify_fd < 0)
                        return log_error_errno(errno, "Failed to create control group inotify object: %m");

                r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                if (r < 0)
                        return log_error_errno(r, "Failed to watch control group inotify object: %m");

                /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
                 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

        } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {

                /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
                 * since it does not generate events when control groups with children run empty.) */

                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                else if (r > 0)
                        log_debug("Installed release agent.");
                else if (r == 0)
                        log_debug("Release agent already installed.");
        }

        /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
        scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to create %s control group: %m", scope_path);

        /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
        r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
        if (r < 0)
                log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");

        /* 6. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);
        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0)
                return log_error_errno(errno, "Failed to open pin file: %m");

        /* 7. Always enable hierarchical support if it exists... */
        if (!all_unified && m->test_run_flags == 0)
                (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        /* 8. Figure out which controllers are supported, and log about it */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");
        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

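/* Worked example of the walk above (hypothetical input): the lookups are
 * tried from the most to the least specific prefix, e.g.
 * "/system.slice/foo.service/ctrl", then "/system.slice/foo.service", then
 * "/system.slice", before falling back to the root slice. */
static void example_prefix_walk(void) {
        char p[] = "/system.slice/foo.service/ctrl"; /* hypothetical cgroup path */
        char *e;

        for (;;) {
                log_debug("Would look up: %s", p);

                e = strrchr(p, '/');
                if (!e || e == p)
                        break; /* root slice fallback */

                *e = 0;
        }
}
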
Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        assert(m);

        if (pid <= 0)
                return NULL;

        if (pid == getpid_cached())
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
         * or from the --system instance */

        log_debug("Got cgroup empty notification for: %s", cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        unit_add_to_cgroup_empty_queue(u);
        return 1;
}

int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, memory_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

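/* Usage sketch (hypothetical caller): report a unit's current memory usage,
 * treating -ENODATA (accounting off, no cgroup, controller not realized) as
 * "nothing to report". */
static void example_show_memory(Unit *u) {
        uint64_t bytes;
        int r;

        r = unit_get_memory_current(u, &bytes);
        if (r < 0)
                return;

        log_unit_debug(u, "Current memory usage: %" PRIu64 " bytes", bytes);
}
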
int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                const char *keys[] = { "usage_usec", NULL };
                _cleanup_free_ char *val = NULL;
                uint64_t us;

                if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
                        return -ENODATA;

                r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
                if (r < 0)
                        return r;

                r = safe_atou64(val, &us);
                if (r < 0)
                        return r;

                ns = us * NSEC_PER_USEC;
        } else {
                if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                        return -ENODATA;

                r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
                if (r == -ENOENT)
                        return -ENODATA;
                if (r < 0)
                        return r;

                r = safe_atou64(v, &ns);
                if (r < 0)
                        return r;
        }

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        assert(u);

        /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
         * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
         * call this function with a NULL return value. */

        if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
                return -ENODATA;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
                /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
                 * cached value. */

                if (ret)
                        *ret = u->cpu_usage_last;
                return 0;
        }
        if (r < 0)
                return r;

        if (ns > u->cpu_usage_base)
                ns -= u->cpu_usage_base;
        else
                ns = 0;

        u->cpu_usage_last = ns;
        if (ret)
                *ret = ns;

        return 0;
}

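/* Worked example of the base subtraction above (made-up numbers): if the raw
 * counter read 5,000,000,000 ns when the unit started (cpu_usage_base) and
 * reads 7,500,000,000 ns now, the unit consumed 2.5 s of CPU time. If the raw
 * counter ever reads below the base (e.g. after the controller was cycled),
 * the result is clamped to 0 instead of underflowing. */
static nsec_t example_cpu_usage_delta(nsec_t raw, nsec_t base) {
        return raw > base ? raw - base : 0;
}
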
int unit_get_ip_accounting(
                Unit *u,
                CGroupIPAccountingMetric metric,
                uint64_t *ret) {

        uint64_t value;
        int fd, r;

        assert(u);
        assert(metric >= 0);
        assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
        assert(ret);

        /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
         * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero
         * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup
         * filters. */
        if (u->type == UNIT_SLICE)
                return -ENODATA;

        if (!UNIT_CGROUP_BOOL(u, ip_accounting))
                return -ENODATA;

        fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;
        if (fd < 0)
                return -ENODATA;

        if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
                r = bpf_firewall_read_accounting(fd, &value, NULL);
        else
                r = bpf_firewall_read_accounting(fd, NULL, &value);
        if (r < 0)
                return r;

        /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
         * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
         * ip_accounting_extra[] field, and add them in here transparently. */

        *ret = value + u->ip_accounting_extra[metric];

        return r;
}

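/* Usage sketch (hypothetical helper): combine both IP byte counters of a
 * unit into a single total. */
static int example_total_ip_bytes(Unit *u, uint64_t *ret) {
        uint64_t ingress, egress;
        int r;

        r = unit_get_ip_accounting(u, CGROUP_IP_INGRESS_BYTES, &ingress);
        if (r < 0)
                return r;

        r = unit_get_ip_accounting(u, CGROUP_IP_EGRESS_BYTES, &egress);
        if (r < 0)
                return r;

        *ret = ingress + egress;
        return 0;
}
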
int unit_reset_cpu_accounting(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        u->cpu_usage_last = NSEC_INFINITY;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpu_usage_base = 0;
                return r;
        }

        u->cpu_usage_base = ns;
        return 0;
}

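/* Behavioural sketch (assumed semantics of the two functions above): right
 * after a reset, unit_get_cpu_usage() reports (close to) zero, because the
 * base was just set to the current raw counter. */
static void example_reset_and_sample(Unit *u) {
        nsec_t ns;

        (void) unit_reset_cpu_accounting(u);

        if (unit_get_cpu_usage(u, &ns) >= 0)
                log_unit_debug(u, "CPU usage since reset: " NSEC_FMT " ns", ns);
}
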
int unit_reset_ip_accounting(Unit *u) {
        int r = 0, q = 0;

        assert(u);

        if (u->ip_accounting_ingress_map_fd >= 0)
                r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);

        if (u->ip_accounting_egress_map_fd >= 0)
                q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);

        zero(u->ip_accounting_extra);

        return r < 0 ? r : q;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        /* always invalidate compat pairs together */
        if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
                m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
                m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;

        if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_realize_queue(u);
}

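/* Usage sketch (hypothetical caller): invalidating just the "io" controller
 * still queues "blkio" as well, because the compat pairs are widened above
 * before the realized mask is checked. */
static void example_invalidate_io(Unit *u) {
        unit_invalidate_cgroup(u, CGROUP_MASK_IO); /* effectively IO|BLKIO */
}
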
void unit_invalidate_cgroup_bpf(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
                return;

        u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
        unit_add_to_cgroup_realize_queue(u);

        /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
         * list of our children includes our own. */
        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;
                void *v;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        unit_invalidate_cgroup_bpf(member);
                }
        }
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);