/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "special.h"
#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"

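/* The fixed CFS period used when translating CPUQuotaPerSecUSec into the kernel's quota/period
 * attributes below: 100ms. */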
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

static void cgroup_compat_warn(void) {
        static bool cgroup_compat_warned = false;

        if (cgroup_compat_warned)
                return;

        log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
        cgroup_compat_warned = true;
}

#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_weight = CGROUP_WEIGHT_INVALID;
        c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;

        c->memory_high = CGROUP_LIMIT_MAX;
        c->memory_max = CGROUP_LIMIT_MAX;
        c->memory_swap_max = CGROUP_LIMIT_MAX;

        c->memory_limit = CGROUP_LIMIT_MAX;

        c->io_weight = CGROUP_WEIGHT_INVALID;
        c->startup_io_weight = CGROUP_WEIGHT_INVALID;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->io_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
        assert(c);
        assert(l);

        LIST_REMOVE(device_limits, c->io_device_limits, l);
        free(l->path);
        free(l);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->io_device_weights)
                cgroup_context_free_io_device_weight(c, c->io_device_weights);

        while (c->io_device_limits)
                cgroup_context_free_io_device_limit(c, c->io_device_limits);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);

        c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
        c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupIODeviceLimit *il;
        CGroupIODeviceWeight *iw;
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        IPAddressAccessItem *iaai;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sIOAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sIPAccounting=%s\n"
                "%sCPUWeight=%" PRIu64 "\n"
                "%sStartupCPUWeight=%" PRIu64 "\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sIOWeight=%" PRIu64 "\n"
                "%sStartupIOWeight=%" PRIu64 "\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLow=%" PRIu64 "\n"
                "%sMemoryHigh=%" PRIu64 "\n"
                "%sMemoryMax=%" PRIu64 "\n"
                "%sMemorySwapMax=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->io_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, yes_no(c->ip_accounting),
                prefix, c->cpu_weight,
                prefix, c->startup_cpu_weight,
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->io_weight,
                prefix, c->startup_io_weight,
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_low,
                prefix, c->memory_high,
                prefix, c->memory_max,
                prefix, c->memory_swap_max,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        if (c->delegate) {
                _cleanup_free_ char *t = NULL;

                (void) cg_mask_to_string(c->delegate_controllers, &t);

                fprintf(f, "%sDelegateControllers=%s\n",
                        prefix,
                        strempty(t));
        }

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, iw, c->io_device_weights)
                fprintf(f,
                        "%sIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        iw->path,
                        iw->weight);

        LIST_FOREACH(device_limits, il, c->io_device_limits) {
                char buf[FORMAT_BYTES_MAX];
                CGroupIOLimitType type;

                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                        if (il->limits[type] != cgroup_io_limit_defaults[type])
                                fprintf(f,
                                        "%s%s=%s %s\n",
                                        prefix,
                                        cgroup_io_limit_type_to_string(type),
                                        il->path,
                                        format_bytes(buf, sizeof(buf), il->limits[type]));
        }

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                if (b->rbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOReadBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->rbps));
                if (b->wbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOWriteBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->wbps));
        }

        LIST_FOREACH(items, iaai, c->ip_address_allow) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }

        LIST_FOREACH(items, iaai, c->ip_address_deny) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }
}

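/* Resolves a path to the block device backing it: block device nodes resolve to their own major/minor,
 * regular paths to the device of the file system they live on, with partitions resolved to the whole
 * disk. */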
static int lookup_block_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

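/* Adds a single device node to the devices.allow whitelist of the given cgroup, in the
 * "<type> <major>:<minor> <access>" format the kernel expects. A leading "-" in the node path marks it
 * as optional: ENOENT from stat() is then silently ignored. */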
static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        bool ignore_notfound;
        int r;

        assert(path);
        assert(acc);

        if (node[0] == '-') {
                /* Non-existent paths starting with "-" must be silently ignored */
                node++;
                ignore_notfound = true;
        } else
                ignore_notfound = false;

        if (stat(node, &st) < 0) {
                if (errno == ENOENT && ignore_notfound)
                        return 0;

                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}

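/* Whitelists all devices of a major number, resolving the driver name (an fnmatch() pattern) via the
 * "Character devices:"/"Block devices:" sections of /proc/devices and writing a "<type> <major>:* <access>"
 * entry for each matching major. */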
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(IN_SET(type, 'b', 'c'));

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        return log_warning_errno(errno, "Failed to read /proc/devices: %m");
}

static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
        return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
}

static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
        return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
                c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
}

static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_cpu_weight;
        else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->cpu_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->startup_cpu_shares;
        else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->cpu_shares;
        else
                return CGROUP_CPU_SHARES_DEFAULT;
}

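/* On the unified hierarchy the weight goes into cpu.weight, and the per-second quota is rescaled to the
 * fixed period and written to cpu.max as "<quota> <period>" (or "max <period>" when no quota is set). */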
static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", weight);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.weight: %m");

        if (quota != USEC_INFINITY)
                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
                         quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
        else
                xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);

        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.max: %m");
}

static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", shares);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.shares: %m");

        xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_period_us: %m");

        if (quota != USEC_INFINITY) {
                xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
        } else
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_quota_us: %m");
}

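/* Legacy cpu.shares and unified cpu.weight use different scales, so translate between them linearly,
 * mapping the default of one onto the default of the other and clamping to the valid range. */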
static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
        return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
        return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
}

static bool cgroup_context_has_io_config(CGroupContext *c) {
        return c->io_accounting ||
                c->io_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
                c->io_device_weights ||
                c->io_device_limits;
}

static bool cgroup_context_has_blockio_config(CGroupContext *c) {
        return c->blockio_accounting ||
                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->blockio_device_weights ||
                c->blockio_device_bandwidths;
}

static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_io_weight;
        else if (c->io_weight != CGROUP_WEIGHT_INVALID)
                return c->io_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->startup_blockio_weight;
        else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->blockio_weight;
        else
                return CGROUP_BLKIO_WEIGHT_DEFAULT;
}

static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
        return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
        return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
}

static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
        r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.weight: %m");
}

static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.weight_device: %m");
}

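/* Writes all four limits for one device to io.max in a single
 * "<major>:<minor> rbps=... wbps=... riops=... wiops=..." line. Returns how many of the limits are
 * non-default, so that callers can free entries that don't limit anything anymore. */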
static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        CGroupIOLimitType type;
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
                if (limits[type] != cgroup_io_limit_defaults[type]) {
                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
                        n++;
                } else {
                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
                }
        }

        xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
        r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.max: %m");
        return n;
}

static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        if (rbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.read_bps_device: %m");

        if (wbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.write_bps_device: %m");

        return n;
}

static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
        return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
}

static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
        int r;

        if (v != CGROUP_LIMIT_MAX)
                xsprintf(buf, "%" PRIu64 "\n", v);

        r = cg_set_attribute("memory", u->cgroup_path, file, buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set %s: %m", file);
}

static void cgroup_apply_firewall(Unit *u) {
        int r;

        assert(u);

        if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
                                    * not recursive we don't ever touch the bpf on them */
                return;

        r = bpf_firewall_compile(u);
        if (r < 0)
                return;

        (void) bpf_firewall_install(u);
}

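/* Writes out the cgroup attributes for every controller requested in apply_mask (plus the BPF firewall if
 * apply_bpf is set), translating between unified and legacy hierarchy settings where necessary. */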
static void cgroup_context_apply(
                Unit *u,
                CGroupMask apply_mask,
                bool apply_bpf,
                ManagerState state) {

        const char *path;
        CGroupContext *c;
        bool is_root;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        path = u->cgroup_path;

        assert(c);
        assert(path);

        /* Nothing to do? Exit early! */
        if (apply_mask == 0 && !apply_bpf)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore them */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
                bool has_weight, has_shares;

                has_weight = cgroup_context_has_cpu_weight(c);
                has_shares = cgroup_context_has_cpu_shares(c);

                if (cg_all_unified() > 0) {
                        uint64_t weight;

                        if (has_weight)
                                weight = cgroup_context_cpu_weight(c, state);
                        else if (has_shares) {
                                uint64_t shares = cgroup_context_cpu_shares(c, state);

                                weight = cgroup_cpu_shares_to_weight(shares);

                                log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
                                                  shares, weight, path);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
                } else {
                        uint64_t shares;

                        if (has_weight) {
                                uint64_t weight = cgroup_context_cpu_weight(c, state);

                                shares = cgroup_cpu_weight_to_shares(weight);

                                log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
                                                  weight, shares, path);
                        } else if (has_shares)
                                shares = cgroup_context_cpu_shares(c, state);
                        else
                                shares = CGROUP_CPU_SHARES_DEFAULT;

                        cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
                }
        }

        if (apply_mask & CGROUP_MASK_IO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io)
                                weight = cgroup_context_io_weight(c, state);
                        else if (has_blockio) {
                                uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);

                                weight = cgroup_weight_blkio_to_io(blkio_weight);

                                log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
                                                  blkio_weight, weight);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        xsprintf(buf, "default %" PRIu64 "\n", weight);
                        r = cg_set_attribute("io", path, "io.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set io.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights)
                                        cgroup_apply_io_device_weight(u, w->path, w->weight);
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                        weight = cgroup_weight_blkio_to_io(w->weight);

                                        log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_io_device_weight(u, w->path, weight);
                                }
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
                                uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
                                CGroupIOLimitType type;

                                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                                        limits[type] = cgroup_io_limit_defaults[type];

                                limits[CGROUP_IO_RBPS_MAX] = b->rbps;
                                limits[CGROUP_IO_WBPS_MAX] = b->wbps;

                                log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
                                                  b->rbps, b->wbps, b->path);

                                if (!cgroup_apply_io_device_limit(u, b->path, limits))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                        }
                }
        }

        if (apply_mask & CGROUP_MASK_BLKIO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io) {
                                uint64_t io_weight = cgroup_context_io_weight(c, state);

                                weight = cgroup_weight_io_to_blkio(io_weight);

                                log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
                                                  io_weight, weight);
                        } else if (has_blockio)
                                weight = cgroup_context_blkio_weight(c, state);
                        else
                                weight = CGROUP_BLKIO_WEIGHT_DEFAULT;

                        xsprintf(buf, "%" PRIu64 "\n", weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set blkio.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights) {
                                        weight = cgroup_weight_io_to_blkio(w->weight);

                                        log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_blkio_device_weight(u, w->path, weight);
                                }
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                                        cgroup_apply_blkio_device_weight(u, w->path, w->weight);
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
                                                  l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);

                                if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
                                if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                }
        }

        if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (cg_all_unified() > 0) {
                        uint64_t max, swap_max = CGROUP_LIMIT_MAX;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                max = c->memory_max;
                                swap_max = c->memory_swap_max;
                        } else {
                                max = c->memory_limit;

                                if (max != CGROUP_LIMIT_MAX)
                                        log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
                        }

                        cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
                        cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
                        cgroup_apply_unified_memory_limit(u, "memory.max", max);
                        cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
                } else {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];
                        uint64_t val;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                val = c->memory_max;
                                log_cgroup_compat(u, "Applying MemoryMax %" PRIu64 " as MemoryLimit", val);
                        } else
                                val = c->memory_limit;

                        if (val == CGROUP_LIMIT_MAX)
                                strncpy(buf, "-1\n", sizeof(buf));
                        else
                                xsprintf(buf, "%" PRIu64 "\n", val);

                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set memory.limit_in_bytes: %m");
                }
        }

        if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to reset devices.list: %m");

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0" /* /dev/pts/ptmx may not be duplicated, but accessed */
                                /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
                                "-/run/systemd/inaccessible/chr\0" "rwm\0"
                                "-/run/systemd/inaccessible/blk\0" "rwm\0";

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4], *val;
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (path_startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if ((val = startswith(a->path, "block-")))
                                whitelist_major(path, val, 'b', acc);
                        else if ((val = startswith(a->path, "char-")))
                                whitelist_major(path, val, 'c', acc);
                        else
                                log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) {

                if (c->tasks_max != CGROUP_LIMIT_MAX) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to set pids.max: %m");
        }

        if (apply_bpf)
                cgroup_apply_firewall(u);
}

CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            cgroup_context_has_cpu_weight(c) ||
            cgroup_context_has_cpu_shares(c) ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
                mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != CGROUP_LIMIT_MAX ||
            cgroup_context_has_unified_memory_config(c))
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        return mask;
}

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
}

CGroupMask unit_get_delegate_mask(Unit *u) {
        CGroupContext *c;

        /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
         * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */

        if (u->type == UNIT_SLICE)
                return 0;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!c->delegate)
                return 0;

        if (cg_all_unified() <= 0) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (e && !exec_context_maintains_privileges(e))
                        return 0;
        }

        return c->delegate_controllers;
}

CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                void *v;
                Unit *member;
                Iterator i;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_subtree_mask(u); /* we are the top-level slice */
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

bool unit_get_needs_bpf(Unit *u) {
        CGroupContext *c;
        Unit *p;
        assert(u);

        /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
         * moment. */
        if (u->type == UNIT_SLICE)
                return false;

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        if (c->ip_accounting ||
            c->ip_address_allow ||
            c->ip_address_deny)
                return true;

        /* If any parent slice has an IP access list defined, it applies too */
        for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
                c = unit_get_cgroup_context(p);
                if (!c)
                        return false;

                if (c->ip_address_allow ||
                    c->ip_address_deny)
                        return true;
        }

        return false;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also a member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

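/* Migration callback for cg_migrate_everywhere()/cg_attach_many_everywhere(): walks up the slice tree
 * until it finds a realized cgroup that covers the requested controller mask. */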
static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/",
                               escaped);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped);
}

int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        } else
                p = NULL;

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 1;
}

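/* On the unified hierarchy, watches the unit's cgroup.events file via inotify, so that we are notified
 * when the cgroup runs empty instead of having to poll for it. */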
int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *events = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;
        u->cgroup_enabled_mask = enable_mask;
        u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup processes to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;
        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

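/* Stores the unit's invocation ID in the trusted.invocation_id extended attribute of its cgroup, so that
 * other programs (such as journald) can map the cgroup back to the invocation. */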
static void cgroup_xattr_apply(Unit *u) {
        char ids[SD_ID128_STRING_MAX];
        int r;

        assert(u);

        if (!MANAGER_IS_SYSTEM(u->manager))
                return;

        if (sd_id128_is_null(u->invocation_id))
                return;

        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
                         "trusted.invocation_id",
                         sd_id128_to_string(u->invocation_id, ids), 32,
                         0);
        if (r < 0)
                log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
}

static bool unit_has_mask_realized(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        assert(u);

        return u->cgroup_realized &&
                u->cgroup_realized_mask == target_mask &&
                u->cgroup_enabled_mask == enable_mask &&
                ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
                 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
}

static void unit_add_to_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (u->in_cgroup_realize_queue)
                return;

        LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = true;
}

static void unit_remove_from_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (!u->in_cgroup_realize_queue)
                return;

        LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = false;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        bool needs_bpf, apply_bpf;
        int r;

        assert(u);

        unit_remove_from_cgroup_realize_queue(u);

        target_mask = unit_get_target_mask(u);
        enable_mask = unit_get_enable_mask(u);
        needs_bpf = unit_get_needs_bpf(u);

        if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
                return 0;

        /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
         * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
         * this will trickle down properly to cgroupfs. */
        apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(u, target_mask, apply_bpf, state);
        cgroup_xattr_apply(u);

        return 0;
}

unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        assert(m);

        state = manager_state(m);

        while ((i = m->cgroup_realize_queue)) {
                assert(i->in_cgroup_realize_queue);

                if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
                        /* Maybe things changed, and the unit is not actually active anymore? */
                        unit_remove_from_cgroup_realize_queue(i);
                        continue;
                }

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;
                void *v;

                HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m,
                                                   unit_get_target_mask(m),
                                                   unit_get_enable_mask(m),
                                                   unit_get_needs_bpf(m)))
                                continue;

                        unit_add_to_cgroup_realize_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * to it: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_add_siblings_to_cgroup_realize_queue(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

efdb0237
LP
1670void unit_release_cgroup(Unit *u) {
1671 assert(u);
1672
1673 /* Forgets all cgroup details for this cgroup */
1674
1675 if (u->cgroup_path) {
1676 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1677 u->cgroup_path = mfree(u->cgroup_path);
1678 }
1679
1680 if (u->cgroup_inotify_wd >= 0) {
1681 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1682 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1683
1684 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1685 u->cgroup_inotify_wd = -1;
1686 }
1687}

void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
        u->cgroup_enabled_mask = 0;
}

int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid_cached();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID in this
                         * group, so we don't know what process is the main
                         * process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}
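
/* Hedged caller sketch, not taken from the sources; it assumes "u" is a
 * forking service whose cgroup contains exactly one daemonized process
 * (PID_FMT is systemd's pid_t format macro):
 *
 *     pid_t pid;
 *     if (unit_search_main_pid(u, &pid) >= 0)
 *             log_unit_debug(u, "Guessed main PID "PID_FMT".", pid);
 */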

static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}
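
/* Note (commentary, not from the sources): the recursive walk above
 * deliberately keeps going on errors; "ret" latches the first failure while
 * the remaining PIDs and subgroups are still visited, so one unreadable
 * subtree doesn't prevent watching the rest of the hierarchy. */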

int unit_watch_all_pids(Unit *u) {
        int r;

        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we watch. This
         * is a fallback logic for cases where we do not get reliable cgroup
         * empty notifications: we try to use SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
        Manager *m = userdata;
        Unit *u;
        int r;

        assert(s);
        assert(m);

        u = m->cgroup_empty_queue;
        if (!u)
                return 0;

        assert(u->in_cgroup_empty_queue);
        u->in_cgroup_empty_queue = false;
        LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);

        if (m->cgroup_empty_queue) {
                /* More stuff queued, let's make sure we remain enabled */
                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
                if (r < 0)
                        log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
        }

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

void unit_add_to_cgroup_empty_queue(Unit *u) {
        int r;

        assert(u);

        /* Note that there are four different ways in which cgroup empty events reach us:
         *
         * 1. On the unified hierarchy we get an inotify event on the cgroup
         *
         * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
         *
         * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
         *
         * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
         *    soon as we get the first SIGCHLD, to deal with unreliable cgroup notifications.
         *
         * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
         * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
         * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
         * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
         * case for scope units). */

        if (u->in_cgroup_empty_queue)
                return;

        /* Let's verify that the cgroup is really empty */
        if (!u->cgroup_path)
                return;
        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
                return;
        }
        if (r == 0)
                return;

        LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
        u->in_cgroup_empty_queue = true;

        /* Trigger the defer event */
        r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
        if (r < 0)
                log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
}
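
/* Flow sketch (commentary derived from this file only): all notification
 * transports funnel into the same queue and defer event source:
 *
 *     inotify event  --> on_cgroup_inotify_event() -------+
 *     agent datagram --> manager_notify_cgroup_empty() ---+
 *                                                         |
 *                                                         v
 *                          unit_add_to_cgroup_empty_queue()
 *                                                         |
 *                                                         v
 *                  on_cgroup_empty_event() --> unit_add_to_gc_queue()
 */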

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (IN_SET(errno, EINTR, EAGAIN))
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver events for
                                 * a watch even after it was removed, because
                                 * it was queued before the removal. Let's
                                 * ignore this here safely. */
                                continue;

                        unit_add_to_cgroup_empty_queue(u);
                }
        }
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        const char *scope_path;
        CGroupController c;
        int r, all_unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && MANAGER_IS_SYSTEM(m)) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;

        /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
         * easily prepend it everywhere. */
        delete_trailing_chars(m->cgroup_root, "/");
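
        /* Illustrative example (commentary, not from the sources): if PID 1
         * runs in "/init.scope" the root ends up as ""; a root of
         * "/system.slice" left behind by an older systemd gets chopped the
         * same way. */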

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        r = cg_unified_flush();
        if (r < 0)
                return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");

        all_unified = cg_all_unified();
        if (all_unified < 0)
                return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
        if (all_unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else {
                r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
                if (r < 0)
                        return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
                if (r > 0)
                        log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
                else
                        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
        }

        /* 3. Allocate cgroup empty defer event source */
        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
        r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
        if (r < 0)
                return log_error_errno(r, "Failed to create cgroup empty event source: %m");

        r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
        if (r < 0)
                return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");

        r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
        if (r < 0)
                return log_error_errno(r, "Failed to disable cgroup empty event source: %m");

        (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");

        /* 4. Install notifier inotify object, or agent */
        if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {

                /* In the unified hierarchy we can get cgroup empty notifications via inotify. */

                m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                safe_close(m->cgroup_inotify_fd);

                m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                if (m->cgroup_inotify_fd < 0)
                        return log_error_errno(errno, "Failed to create control group inotify object: %m");

                r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                if (r < 0)
                        return log_error_errno(r, "Failed to watch control group inotify object: %m");

                /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
                 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

        } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {

                /* On the legacy hierarchy we only get notifications via cgroup agents (which isn't really reliable,
                 * since it does not generate events when control groups with children run empty). */

                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                else if (r > 0)
                        log_debug("Installed release agent.");
                else if (r == 0)
                        log_debug("Release agent already installed.");
        }

        /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
        scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to create %s control group: %m", scope_path);

        /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
        r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
        if (r < 0)
                log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");

        /* 6. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);
        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0)
                return log_error_errno(errno, "Failed to open pin file: %m");

        /* 7. Always enable hierarchical support if it exists... */
        if (!all_unified && m->test_run_flags == 0)
                (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        /* 8. Figure out which controllers are supported, and log about it */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");
        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        assert(m);

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
         * or from the --system instance */

        log_debug("Got cgroup empty notification for: %s", cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        unit_add_to_cgroup_empty_queue(u);
        return 1;
}

int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, memory_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                const char *keys[] = { "usage_usec", NULL };
                _cleanup_free_ char *val = NULL;
                uint64_t us;

                if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
                        return -ENODATA;

                r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
                if (r < 0)
                        return r;

                r = safe_atou64(val, &us);
                if (r < 0)
                        return r;

                ns = us * NSEC_PER_USEC;
        } else {
                if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                        return -ENODATA;

                r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
                if (r == -ENOENT)
                        return -ENODATA;
                if (r < 0)
                        return r;

                r = safe_atou64(v, &ns);
                if (r < 0)
                        return r;
        }

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        assert(u);

        /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
         * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
         * call this function with a NULL return value. */

        if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
                return -ENODATA;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
                /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
                 * cached value. */

                if (ret)
                        *ret = u->cpu_usage_last;
                return 0;
        }
        if (r < 0)
                return r;

        if (ns > u->cpu_usage_base)
                ns -= u->cpu_usage_base;
        else
                ns = 0;

        u->cpu_usage_last = ns;
        if (ret)
                *ret = ns;

        return 0;
}
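
/* Hedged usage sketch (commentary, not from the sources): cache the counter
 * right before the cgroup goes away, then read the cached value later, just
 * like unit_prune_cgroup() above does:
 *
 *     (void) unit_get_cpu_usage(u, NULL);    // caches into u->cpu_usage_last
 *     ...
 *     nsec_t ns;
 *     if (unit_get_cpu_usage(u, &ns) >= 0)   // served from the cache by now
 *             log_unit_debug(u, "Consumed %" PRIu64 " ns of CPU time.", (uint64_t) ns);
 */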

int unit_get_ip_accounting(
                Unit *u,
                CGroupIPAccountingMetric metric,
                uint64_t *ret) {

        uint64_t value;
        int fd, r;

        assert(u);
        assert(metric >= 0);
        assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
        assert(ret);

        /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
         * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero
         * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup
         * filters. */
        if (u->type == UNIT_SLICE)
                return -ENODATA;

        if (!UNIT_CGROUP_BOOL(u, ip_accounting))
                return -ENODATA;

        fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;
        if (fd < 0)
                return -ENODATA;

        if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
                r = bpf_firewall_read_accounting(fd, &value, NULL);
        else
                r = bpf_firewall_read_accounting(fd, NULL, &value);
        if (r < 0)
                return r;

        /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
         * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
         * ip_accounting_extra[] field, and add them in here transparently. */

        *ret = value + u->ip_accounting_extra[metric];

        return r;
}
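
/* Hedged caller sketch (commentary, not from the sources; assumes
 * IPAccounting= is enabled for "u" and the unit is not a slice):
 *
 *     uint64_t bytes;
 *     if (unit_get_ip_accounting(u, CGROUP_IP_INGRESS_BYTES, &bytes) >= 0)
 *             log_unit_debug(u, "Received %" PRIu64 " bytes so far.", bytes);
 */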

int unit_reset_cpu_accounting(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        u->cpu_usage_last = NSEC_INFINITY;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpu_usage_base = 0;
                return r;
        }

        u->cpu_usage_base = ns;
        return 0;
}

int unit_reset_ip_accounting(Unit *u) {
        int r = 0, q = 0;

        assert(u);

        if (u->ip_accounting_ingress_map_fd >= 0)
                r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);

        if (u->ip_accounting_egress_map_fd >= 0)
                q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);

        zero(u->ip_accounting_extra);

        return r < 0 ? r : q;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        /* always invalidate compat pairs together */
        if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
                m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
                m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;

        if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_realize_queue(u);
}

void unit_invalidate_cgroup_bpf(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
                return;

        u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
        unit_add_to_cgroup_realize_queue(u);

        /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
         * list of our children includes our own. */
        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;
                void *v;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        unit_invalidate_cgroup_bpf(member);
                }
        }
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);