/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "alloc-util.h"
#include "blockdev-util.h"
#include "bpf-firewall.h"
#include "bus-error.h"
#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "procfs-util.h"
#include "special.h"
#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"
#include "virt.h"

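/* The CFS bandwidth period CPUQuota= is applied against: 100ms, expressed in microseconds. */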
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

bool manager_owns_root_cgroup(Manager *m) {
        assert(m);

        /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
         * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there
         * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if
         * we run in any kind of container virtualization. */

        if (detect_container() > 0)
                return false;

        return isempty(m->cgroup_root) || path_equal(m->cgroup_root, "/");
}

bool unit_has_root_cgroup(Unit *u) {
        assert(u);

        /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
         * the manager manages the root cgroup. */

        if (!manager_owns_root_cgroup(u->manager))
                return false;

        return unit_has_name(u, SPECIAL_ROOT_SLICE);
}

static void cgroup_compat_warn(void) {
        static bool cgroup_compat_warned = false;

        if (cgroup_compat_warned)
                return;

        log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
                    "See cgroup-compat debug messages for details.");

        cgroup_compat_warned = true;
}

#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_weight = CGROUP_WEIGHT_INVALID;
        c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;

        c->memory_high = CGROUP_LIMIT_MAX;
        c->memory_max = CGROUP_LIMIT_MAX;
        c->memory_swap_max = CGROUP_LIMIT_MAX;

        c->memory_limit = CGROUP_LIMIT_MAX;

        c->io_weight = CGROUP_WEIGHT_INVALID;
        c->startup_io_weight = CGROUP_WEIGHT_INVALID;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->io_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
        assert(c);
        assert(l);

        LIST_REMOVE(device_limits, c->io_device_limits, l);
        free(l->path);
        free(l);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->io_device_weights)
                cgroup_context_free_io_device_weight(c, c->io_device_weights);

        while (c->io_device_limits)
                cgroup_context_free_io_device_limit(c, c->io_device_limits);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);

        c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
        c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
}

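/* Dumps the context as one "Key=value" line per directive, each prefixed with the caller-supplied
 * prefix, e.g. "CPUWeight=100" or "IODeviceWeight=/dev/sda 500" (example values). */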
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupIODeviceLimit *il;
        CGroupIODeviceWeight *iw;
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        IPAddressAccessItem *iaai;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sIOAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sIPAccounting=%s\n"
                "%sCPUWeight=%" PRIu64 "\n"
                "%sStartupCPUWeight=%" PRIu64 "\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sIOWeight=%" PRIu64 "\n"
                "%sStartupIOWeight=%" PRIu64 "\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLow=%" PRIu64 "\n"
                "%sMemoryHigh=%" PRIu64 "\n"
                "%sMemoryMax=%" PRIu64 "\n"
                "%sMemorySwapMax=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->io_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, yes_no(c->ip_accounting),
                prefix, c->cpu_weight,
                prefix, c->startup_cpu_weight,
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->io_weight,
                prefix, c->startup_io_weight,
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_low,
                prefix, c->memory_high,
                prefix, c->memory_max,
                prefix, c->memory_swap_max,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        if (c->delegate) {
                _cleanup_free_ char *t = NULL;

                (void) cg_mask_to_string(c->delegate_controllers, &t);

                fprintf(f, "%sDelegateControllers=%s\n",
                        prefix,
                        strempty(t));
        }

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, iw, c->io_device_weights)
                fprintf(f,
                        "%sIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        iw->path,
                        iw->weight);

        LIST_FOREACH(device_limits, il, c->io_device_limits) {
                char buf[FORMAT_BYTES_MAX];
                CGroupIOLimitType type;

                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                        if (il->limits[type] != cgroup_io_limit_defaults[type])
                                fprintf(f,
                                        "%s%s=%s %s\n",
                                        prefix,
                                        cgroup_io_limit_type_to_string(type),
                                        il->path,
                                        format_bytes(buf, sizeof(buf), il->limits[type]));
        }

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                if (b->rbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOReadBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->rbps));
                if (b->wbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOWriteBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->wbps));
        }

        LIST_FOREACH(items, iaai, c->ip_address_allow) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }

        LIST_FOREACH(items, iaai, c->ip_address_deny) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }
}

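/* Resolves a path to the block device backing it: either the device node itself, or the device the
 * containing file system lives on. A zero st_dev major denotes a virtual (unnamed) file system with
 * no resolvable backing device. */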
static int lookup_block_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                (void) block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

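/* Adds one rule to the legacy "devices" controller. devices.allow takes entries of the form
 * "<type> <major>:<minor> <access>", e.g. "c 1:3 rwm" to allow read/write/mknod on /dev/null. */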
static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        bool ignore_notfound;
        int r;

        assert(path);
        assert(acc);

        if (node[0] == '-') {
                /* Non-existent paths starting with "-" must be silently ignored */
                node++;
                ignore_notfound = true;
        } else
                ignore_notfound = false;

        if (stat(node, &st) < 0) {
                if (errno == ENOENT && ignore_notfound)
                        return 0;

                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}

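/* Whitelists every device sharing one major number, resolved by scanning /proc/devices. That file
 * consists of "Character devices:" and "Block devices:" sections with one "<major> <name>" entry per
 * line; "name" may be an fnmatch() pattern here. */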
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(IN_SET(type, 'b', 'c'));

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        return log_warning_errno(errno, "Failed to read /proc/devices: %m");
}

467
66ebf6c0
TH
468static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
469 return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
470 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
471}
472
473static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
474 return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
475 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
476}
477
478static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
479 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
480 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
481 return c->startup_cpu_weight;
482 else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
483 return c->cpu_weight;
484 else
485 return CGROUP_WEIGHT_DEFAULT;
486}
487
488static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
489 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
490 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
491 return c->startup_cpu_shares;
492 else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
493 return c->cpu_shares;
494 else
495 return CGROUP_CPU_SHARES_DEFAULT;
496}
497
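/* On the unified hierarchy the quota is written to cpu.max as "<quota> <period>" (both in µs), or
 * "max <period>" if unlimited; the per-second quota is rescaled to the 100ms period defined above. */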
static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", weight);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.weight: %m");

        if (quota != USEC_INFINITY)
                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
                         quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
        else
                xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);

        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.max: %m");
}

static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", shares);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.shares: %m");

        xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_period_us: %m");

        if (quota != USEC_INFINITY) {
                xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
        } else
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_quota_us: %m");
}

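/* The legacy cpu.shares range [2, 262144] (default 1024) and the unified cpu.weight range [1, 10000]
 * (default 100) are mapped onto each other proportionally around their respective defaults. */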
static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
        return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
        return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
}

static bool cgroup_context_has_io_config(CGroupContext *c) {
        return c->io_accounting ||
                c->io_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
                c->io_device_weights ||
                c->io_device_limits;
}

static bool cgroup_context_has_blockio_config(CGroupContext *c) {
        return c->blockio_accounting ||
                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->blockio_device_weights ||
                c->blockio_device_bandwidths;
}

static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_io_weight;
        else if (c->io_weight != CGROUP_WEIGHT_INVALID)
                return c->io_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->startup_blockio_weight;
        else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->blockio_weight;
        else
                return CGROUP_BLKIO_WEIGHT_DEFAULT;
}

static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
        return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
        return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
}

static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
        r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.weight: %m");
}

static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.weight_device: %m");
}

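/* Writes all four throttling knobs for one device as a single io.max line, e.g.
 * "8:0 rbps=1000000 wbps=max riops=max wiops=max" (example values). Returns the number of limits set
 * to a non-default value, so callers can drop entries that configure nothing. */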
static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        CGroupIOLimitType type;
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
                if (limits[type] != cgroup_io_limit_defaults[type]) {
                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
                        n++;
                } else {
                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
                }
        }

        xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
        r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.max: %m");
        return n;
}

static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        if (rbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.read_bps_device: %m");

        if (wbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.write_bps_device: %m");

        return n;
}

static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
        return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
}

static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
        int r;

        if (v != CGROUP_LIMIT_MAX)
                xsprintf(buf, "%" PRIu64 "\n", v);

        r = cg_set_attribute("memory", u->cgroup_path, file, buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set %s: %m", file);
}

static void cgroup_apply_firewall(Unit *u) {
        assert(u);

        /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */

        if (bpf_firewall_compile(u) < 0)
                return;

        (void) bpf_firewall_install(u);
}

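/* Translates the unit's CGroupContext into concrete cgroupfs attribute writes, one block per
 * controller in apply_mask, translating between unified (v2) and legacy (v1) knobs where needed. */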
static void cgroup_context_apply(
                Unit *u,
                CGroupMask apply_mask,
                bool apply_bpf,
                ManagerState state) {

        const char *path;
        CGroupContext *c;
        bool is_root;
        int r;

        assert(u);

        /* Nothing to do? Exit early! */
        if (apply_mask == 0 && !apply_bpf)
                return;

        /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
        is_root = unit_has_root_cgroup(u);

        assert_se(c = unit_get_cgroup_context(u));
        assert_se(path = u->cgroup_path);

        if (is_root) /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
                bool has_weight, has_shares;

                has_weight = cgroup_context_has_cpu_weight(c);
                has_shares = cgroup_context_has_cpu_shares(c);

                if (cg_all_unified() > 0) {
                        uint64_t weight;

                        if (has_weight)
                                weight = cgroup_context_cpu_weight(c, state);
                        else if (has_shares) {
                                uint64_t shares = cgroup_context_cpu_shares(c, state);

                                weight = cgroup_cpu_shares_to_weight(shares);

                                log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
                                                  shares, weight, path);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
                } else {
                        uint64_t shares;

                        if (has_weight) {
                                uint64_t weight = cgroup_context_cpu_weight(c, state);

                                shares = cgroup_cpu_weight_to_shares(weight);

                                log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
                                                  weight, shares, path);
                        } else if (has_shares)
                                shares = cgroup_context_cpu_shares(c, state);
                        else
                                shares = CGROUP_CPU_SHARES_DEFAULT;

                        cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
                }
        }

        if (apply_mask & CGROUP_MASK_IO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io)
                                weight = cgroup_context_io_weight(c, state);
                        else if (has_blockio) {
                                uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);

                                weight = cgroup_weight_blkio_to_io(blkio_weight);

                                log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
                                                  blkio_weight, weight);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        xsprintf(buf, "default %" PRIu64 "\n", weight);
                        r = cg_set_attribute("io", path, "io.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set io.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights)
                                        cgroup_apply_io_device_weight(u, w->path, w->weight);
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                        weight = cgroup_weight_blkio_to_io(w->weight);

                                        log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_io_device_weight(u, w->path, weight);
                                }
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
                                uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
                                CGroupIOLimitType type;

                                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                                        limits[type] = cgroup_io_limit_defaults[type];

                                limits[CGROUP_IO_RBPS_MAX] = b->rbps;
                                limits[CGROUP_IO_WBPS_MAX] = b->wbps;

                                log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
                                                  b->rbps, b->wbps, b->path);

                                if (!cgroup_apply_io_device_limit(u, b->path, limits))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                        }
                }
        }

        if (apply_mask & CGROUP_MASK_BLKIO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io) {
                                uint64_t io_weight = cgroup_context_io_weight(c, state);

                                weight = cgroup_weight_io_to_blkio(io_weight);

                                log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
                                                  io_weight, weight);
                        } else if (has_blockio)
                                weight = cgroup_context_blkio_weight(c, state);
                        else
                                weight = CGROUP_BLKIO_WEIGHT_DEFAULT;

                        xsprintf(buf, "%" PRIu64 "\n", weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set blkio.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights) {
                                        weight = cgroup_weight_io_to_blkio(w->weight);

                                        log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_blkio_device_weight(u, w->path, weight);
                                }
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                                        cgroup_apply_blkio_device_weight(u, w->path, w->weight);
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
                                                  l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);

                                if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
                                if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                }
        }

        if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (cg_all_unified() > 0) {
                        uint64_t max, swap_max = CGROUP_LIMIT_MAX;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                max = c->memory_max;
                                swap_max = c->memory_swap_max;
                        } else {
                                max = c->memory_limit;

                                if (max != CGROUP_LIMIT_MAX)
                                        log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
                        }

                        cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
                        cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
                        cgroup_apply_unified_memory_limit(u, "memory.max", max);
                        cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
                } else {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];
                        uint64_t val;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                val = c->memory_max;
                                log_cgroup_compat(u, "Applying MemoryMax %" PRIu64 " as MemoryLimit", val);
                        } else
                                val = c->memory_limit;

                        if (val == CGROUP_LIMIT_MAX)
                                strncpy(buf, "-1\n", sizeof(buf));
                        else
                                xsprintf(buf, "%" PRIu64 "\n", val);

                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set memory.limit_in_bytes: %m");
                }
        }

        if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to reset devices.list: %m");

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/ptmx\0" "rwm\0"
                                /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
                                "-/run/systemd/inaccessible/chr\0" "rwm\0"
                                "-/run/systemd/inaccessible/blk\0" "rwm\0";

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        /* PTS (/dev/pts) devices may not be duplicated, but accessed */
                        whitelist_major(path, "pts", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4], *val;
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (path_startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if ((val = startswith(a->path, "block-")))
                                whitelist_major(path, val, 'b', acc);
                        else if ((val = startswith(a->path, "char-")))
                                whitelist_major(path, val, 'c', acc);
                        else
                                log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if (apply_mask & CGROUP_MASK_PIDS) {

                if (is_root) {
                        /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
                         * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
                         * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
                         * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
                         * exclusive ownership of the sysctls, but we still want to honour things if the user sets
                         * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
                         * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
                         * it also counts. But if the user never set a limit through us (i.e. we are the default of
                         * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
                         * the first time we set a limit. Note that this boolean is flushed out on manager reload,
                         * which is desirable so that there's an official way to release control of the sysctl from
                         * systemd: set the limit to unbounded and reload. */

                        if (c->tasks_max != CGROUP_LIMIT_MAX) {
                                u->manager->sysctl_pid_max_changed = true;
                                r = procfs_tasks_set_limit(c->tasks_max);
                        } else if (u->manager->sysctl_pid_max_changed)
                                r = procfs_tasks_set_limit(TASKS_MAX);
                        else
                                r = 0;

                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to write to tasks limit sysctls: %m");

                } else {
                        if (c->tasks_max != CGROUP_LIMIT_MAX) {
                                char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                                sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                                r = cg_set_attribute("pids", path, "pids.max", buf);
                        } else
                                r = cg_set_attribute("pids", path, "pids.max", "max");
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set pids.max: %m");
                }
        }

        if (apply_bpf)
                cgroup_apply_firewall(u);
}

CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            cgroup_context_has_cpu_weight(c) ||
            cgroup_context_has_cpu_shares(c) ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
                mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != CGROUP_LIMIT_MAX ||
            cgroup_context_has_unified_memory_config(c))
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != CGROUP_LIMIT_MAX)
                mask |= CGROUP_MASK_PIDS;

        return mask;
}

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
}

CGroupMask unit_get_delegate_mask(Unit *u) {
        CGroupContext *c;

        /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
         * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */

        if (!unit_cgroup_delegate(u))
                return 0;

        if (cg_all_unified() <= 0) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (e && !exec_context_maintains_privileges(e))
                        return 0;
        }

        assert_se(c = unit_get_cgroup_context(u));
        return c->delegate_controllers;
}

CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                void *v;
                Unit *member;
                Iterator i;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_subtree_mask(u); /* we are the top-level slice */
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

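/* BPF is required whenever IP accounting or an IP access list is configured, either on the unit
 * itself or on any slice it is nested in. */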
bool unit_get_needs_bpf(Unit *u) {
        CGroupContext *c;
        Unit *p;
        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        if (c->ip_accounting ||
            c->ip_address_allow ||
            c->ip_address_deny)
                return true;

        /* If any parent slice has an IP access list defined, it applies too */
        for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
                c = unit_get_cgroup_context(p);
                if (!c)
                        return false;

                if (c->ip_address_allow ||
                    c->ip_address_deny)
                        return true;
        }

        return false;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also a member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {

        /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */

        while (u) {

                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

static const char *migrate_callback(CGroupMask mask, void *userdata) {
        return unit_get_realized_cgroup_path(userdata, mask);
}

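/* Computes the default cgroup path for a unit, e.g. "<root>/system.slice/foo.service"; nested slices
 * expand to nested directories ("foo-bar.slice" -> "foo.slice/foo-bar.slice"). */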
char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/",
                               escaped);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped);
}

int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        } else
                p = NULL;

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 1;
}

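/* Watches the unit's cgroup for emptiness: on the unified hierarchy the kernel-provided
 * "cgroup.events" file flips its "populated" field when the last process exits, and changes to it
 * are delivered via inotify, so no polling is needed. */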
int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *events = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}

int unit_pick_cgroup_path(Unit *u) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(u);

        if (u->cgroup_path)
                return 0;

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        path = unit_default_cgroup_path(u);
        if (!path)
                return log_oom();

        r = unit_set_cgroup_path(u, path);
        if (r == -EEXIST)
                return log_unit_error_errno(u, r, "Control group %s exists already.", path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);

        return 0;
}

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* Figure out our cgroup path */
        r = unit_pick_cgroup_path(u);
        if (r < 0)
                return r;

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;
        u->cgroup_enabled_mask = enable_mask;
        u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;

        if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
        char *pp;
        int r;

        assert(u);

        if (MANAGER_IS_SYSTEM(u->manager))
                return -EINVAL;

        if (!u->manager->system_bus)
                return -EIO;

        if (!u->cgroup_path)
                return -EINVAL;

        /* Determine this unit's cgroup path relative to our cgroup root */
        pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
        if (!pp)
                return -EINVAL;

        pp = strjoina("/", pp, suffix_path);
        path_kill_slashes(pp);

        r = sd_bus_call_method(u->manager->system_bus,
                               "org.freedesktop.systemd1",
                               "/org/freedesktop/systemd1",
                               "org.freedesktop.systemd1.Manager",
                               "AttachProcessesToUnit",
                               &error, NULL,
                               "ssau",
                               NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
        if (r < 0)
                return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));

        return 0;
}

1553

int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
        CGroupMask delegated_mask;
        const char *p;
        Iterator i;
        void *pidp;
        int r, q;

        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        if (set_isempty(pids))
                return 0;

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        if (isempty(suffix_path))
                p = u->cgroup_path;
        else
                p = strjoina(u->cgroup_path, "/", suffix_path);

        delegated_mask = unit_get_delegate_mask(u);

        r = 0;
        SET_FOREACH(pidp, pids, i) {
                pid_t pid = PTR_TO_PID(pidp);
                CGroupController c;

                /* First, attach the PID to the main cgroup hierarchy */
                q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
                if (q < 0) {
                        log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p);

                        if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) {
                                int z;

                                /* If we are in a user instance, and we can't move the process ourselves due to
                                 * permission problems, let's ask the system instance about it instead. Since it's more
                                 * privileged it might be able to move the process across the leaves of a subtree whose
                                 * top node is not owned by us. */

                                z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
                                if (z < 0)
                                        log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p);
                                else
                                        continue; /* When attaching via the bus worked, we are fully done for this PID. */
                        }

                        if (r >= 0)
                                r = q; /* Remember first error */

                        continue;
                }

                q = cg_all_unified();
                if (q < 0)
                        return q;
                if (q > 0)
                        continue;

                /* In the legacy hierarchy, attach the process to the requested cgroup if possible, and if not to the
                 * innermost realized one */

                for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                        CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                        const char *realized;

                        if (!(u->manager->cgroup_supported & bit))
                                continue;

                        /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
                        if (delegated_mask & u->cgroup_realized_mask & bit) {
                                q = cg_attach(cgroup_controller_to_string(c), p, pid);
                                if (q >= 0)
                                        continue; /* Success! */

                                log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
                                                     pid, p, cgroup_controller_to_string(c));
                        }

                        /* So this controller is either not delegated or not realized, or something else weird happened.
                         * In that case let's attach the PID at least to the closest cgroup up the tree that is
                         * realized. */
                        realized = unit_get_realized_cgroup_path(u, bit);
                        if (!realized)
                                continue; /* Not even realized in the root slice? Then let's not bother */

                        q = cg_attach(cgroup_controller_to_string(c), realized, pid);
                        if (q < 0)
                                log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
                                                     pid, realized, cgroup_controller_to_string(c));
                }
        }

        return r;
}
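
/* For illustration: on the unified hierarchy, the attach step of cg_attach() above
 * boils down to appending the PID to the target cgroup's "cgroup.procs" file. A
 * minimal standalone sketch of that; the example_* name and path handling are
 * illustrative, not part of the codebase: */
#include <errno.h>
#include <stdio.h>
#include <sys/types.h>

static int example_attach_pid(const char *cgroup_fs_path, pid_t pid) {
        char procs[4096];
        FILE *f;
        int r;

        if (snprintf(procs, sizeof(procs), "%s/cgroup.procs", cgroup_fs_path) >= (int) sizeof(procs))
                return -ENAMETOOLONG;

        f = fopen(procs, "we");
        if (!f)
                return -errno;

        /* Writing a PID to cgroup.procs moves the whole process; writing "0"
         * would move the writing process itself. */
        r = fprintf(f, "%ld\n", (long) pid);
        if (fclose(f) == EOF || r < 0)
                return -EIO;

        return 0;
}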

static void cgroup_xattr_apply(Unit *u) {
        char ids[SD_ID128_STRING_MAX];
        int r;

        assert(u);

        if (!MANAGER_IS_SYSTEM(u->manager))
                return;

        if (sd_id128_is_null(u->invocation_id))
                return;

        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
                         "trusted.invocation_id",
                         sd_id128_to_string(u->invocation_id, ids), 32,
                         0);
        if (r < 0)
                log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
}
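
/* The invocation ID stored above can be read back from the unit's cgroup
 * directory with plain getxattr(2). A minimal sketch; the path and helper name
 * are illustrative: */
#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

static void example_read_invocation_id(const char *cgroup_fs_path) {
        char id[33] = {}; /* 32 hex characters plus NUL */
        ssize_t n;

        n = getxattr(cgroup_fs_path, "trusted.invocation_id", id, sizeof(id) - 1);
        if (n < 0)
                perror("getxattr");
        else
                printf("invocation ID: %.*s\n", (int) n, id);
}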

static bool unit_has_mask_realized(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        assert(u);

        return u->cgroup_realized &&
                u->cgroup_realized_mask == target_mask &&
                u->cgroup_enabled_mask == enable_mask &&
                ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
                 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
}

static void unit_add_to_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (u->in_cgroup_realize_queue)
                return;

        LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = true;
}

static void unit_remove_from_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (!u->in_cgroup_realize_queue)
                return;

        LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = false;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        bool needs_bpf, apply_bpf;
        int r;

        assert(u);

        unit_remove_from_cgroup_realize_queue(u);

        target_mask = unit_get_target_mask(u);
        enable_mask = unit_get_enable_mask(u);
        needs_bpf = unit_get_needs_bpf(u);

        if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
                return 0;

        /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
         * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
         * this will trickle down properly to cgroupfs. */
        apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(u, target_mask, apply_bpf, state);
        cgroup_xattr_apply(u);

        return 0;
}
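
/* At the filesystem level, the parent-first recursion above amounts to "mkdir -p"
 * on cgroupfs: every ancestor slice must exist before the leaf. A reduced sketch of
 * just that step, assuming a unified hierarchy; real code must additionally enable
 * controllers via cgroup.subtree_control on each parent. The helper name is
 * illustrative: */
#include <errno.h>
#include <string.h>
#include <sys/stat.h>

static int example_realize_path(const char *path) {
        char buf[4096];
        size_t n = strlen(path);

        if (n >= sizeof(buf))
                return -ENAMETOOLONG;
        memcpy(buf, path, n + 1);

        /* Create ancestors first, mirroring the recursion into u->slice above. */
        for (char *p = buf + 1; *p; p++)
                if (*p == '/') {
                        *p = 0;
                        if (mkdir(buf, 0755) < 0 && errno != EEXIST)
                                return -errno;
                        *p = '/';
                }

        if (mkdir(buf, 0755) < 0 && errno != EEXIST)
                return -errno;

        return 0;
}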

unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        assert(m);

        state = manager_state(m);

        while ((i = m->cgroup_realize_queue)) {
                assert(i->in_cgroup_realize_queue);

                if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
                        /* Maybe things changed, and the unit is not actually active anymore? */
                        unit_remove_from_cgroup_realize_queue(i);
                        continue;
                }

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the siblings of all parent units to the cgroup
         * queue. (But neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;
                void *v;

                HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m,
                                                   unit_get_target_mask(m),
                                                   unit_get_enable_mask(m),
                                                   unit_get_needs_bpf(m)))
                                continue;

                        unit_add_to_cgroup_realize_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * to it: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as many resources as all of our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_add_siblings_to_cgroup_realize_queue(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}

void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
        u->cgroup_enabled_mask = 0;
}

int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid_cached();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                           in this group, so we don't know what process
                           is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}
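
/* What cg_enumerate_processes()/cg_read_pid() boil down to on cgroupfs: reading
 * PIDs line by line from the cgroup's "cgroup.procs" file. A minimal sketch with
 * an illustrative helper name and reduced error handling: */
#include <stdio.h>

static int example_for_each_pid(const char *procs_path, void (*cb)(long pid)) {
        FILE *f = fopen(procs_path, "re");
        long pid;

        if (!f)
                return -1;

        /* cgroup.procs holds one decimal PID per line. */
        while (fscanf(f, "%ld", &pid) == 1)
                cb(pid);

        fclose(f);
        return 0;
}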

static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}

int unit_synthesize_cgroup_empty_event(Unit *u) {
        int r;

        assert(u);

        /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
         * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can
         * get as notification source as soon as we stopped having any useful PIDs to watch for. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we have reliable notifications, and don't need this */
                return 0;

        if (!set_isempty(u->pids))
                return 0;

        unit_add_to_cgroup_empty_queue(u);
        return 0;
}

int unit_watch_all_pids(Unit *u) {
        int r;

        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
        Manager *m = userdata;
        Unit *u;
        int r;

        assert(s);
        assert(m);

        u = m->cgroup_empty_queue;
        if (!u)
                return 0;

        assert(u->in_cgroup_empty_queue);
        u->in_cgroup_empty_queue = false;
        LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);

        if (m->cgroup_empty_queue) {
                /* More stuff queued, let's make sure we remain enabled */
                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
                if (r < 0)
                        log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
        }

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

void unit_add_to_cgroup_empty_queue(Unit *u) {
        int r;

        assert(u);

        /* Note that there are four different ways how cgroup empty events reach us:
         *
         * 1. On the unified hierarchy we get an inotify event on the cgroup
         *
         * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
         *
         * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
         *
         * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
         *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
         *
         * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
         * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
         * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
         * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
         * case for scope units). */

        if (u->in_cgroup_empty_queue)
                return;

        /* Let's verify that the cgroup is really empty */
        if (!u->cgroup_path)
                return;
        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
                return;
        }
        if (r == 0)
                return;

        LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
        u->in_cgroup_empty_queue = true;

        /* Trigger the defer event */
        r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
        if (r < 0)
                log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
}
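
/* On the unified hierarchy, the emptiness check that cg_is_empty_recursive()
 * performs can rest on the "populated" field of cgroup.events, which already
 * covers the whole subtree. A standalone sketch; the helper name and path
 * handling are illustrative: */
#include <stdio.h>
#include <string.h>

static int example_cgroup_is_empty(const char *events_path) {
        char key[32];
        int val, empty = -1;
        FILE *f = fopen(events_path, "re");

        if (!f)
                return -1;

        /* cgroup.events is a flat "key value" file. */
        while (fscanf(f, "%31s %d", key, &val) == 2)
                if (strcmp(key, "populated") == 0) {
                        empty = !val;
                        break;
                }

        fclose(f);
        return empty; /* 1 = empty, 0 = populated, -1 = unknown */
}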

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (IN_SET(errno, EINTR, EAGAIN))
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        unit_add_to_cgroup_empty_queue(u);
                }
        }
}
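
/* The notifications handled above arrive because cgroup.events is modified
 * whenever the "populated" state flips. Setting up such a watch by hand looks
 * roughly like this; the helper name is illustrative and no event loop is
 * wired up: */
#include <sys/inotify.h>
#include <unistd.h>

static int example_watch_cgroup_events(const char *events_path) {
        int fd, wd;

        fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
        if (fd < 0)
                return -1;

        /* IN_MODIFY fires when the populated state recorded in cgroup.events changes. */
        wd = inotify_add_watch(fd, events_path, IN_MODIFY);
        if (wd < 0) {
                close(fd);
                return -1;
        }

        return fd; /* poll/read this fd, then re-read cgroup.events */
}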

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        const char *scope_path;
        CGroupController c;
        int r, all_unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && MANAGER_IS_SYSTEM(m)) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;

        /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
         * easily prepend it everywhere. */
        delete_trailing_chars(m->cgroup_root, "/");

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        r = cg_unified_flush();
        if (r < 0)
                return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");

        all_unified = cg_all_unified();
        if (all_unified < 0)
                return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
        if (all_unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else {
                r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
                if (r < 0)
                        return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
                if (r > 0)
                        log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
                else
                        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
        }

        /* 3. Allocate cgroup empty defer event source */
        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
        r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
        if (r < 0)
                return log_error_errno(r, "Failed to create cgroup empty event source: %m");

        r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
        if (r < 0)
                return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");

        r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
        if (r < 0)
                return log_error_errno(r, "Failed to disable cgroup empty event source: %m");

        (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");

        /* 4. Install notifier inotify object, or agent */
        if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {

                /* In the unified hierarchy we can get cgroup empty notifications via inotify. */

                m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                safe_close(m->cgroup_inotify_fd);

                m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                if (m->cgroup_inotify_fd < 0)
                        return log_error_errno(errno, "Failed to create control group inotify object: %m");

                r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                if (r < 0)
                        return log_error_errno(r, "Failed to watch control group inotify object: %m");

                /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
                 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

        } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {

                /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
                 * since it does not generate events when control groups with children run empty.) */

                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                else if (r > 0)
                        log_debug("Installed release agent.");
                else if (r == 0)
                        log_debug("Release agent already installed.");
        }

        /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
        scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
        if (r >= 0) {
                /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
                r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
                if (r < 0)
                        log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");

                /* 6. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);
                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

        } else if (r < 0 && !m->test_run_flags)
                return log_error_errno(r, "Failed to create %s control group: %m", scope_path);

        /* 7. Always enable hierarchical support if it exists... */
        if (!all_unified && m->test_run_flags == 0)
                (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        /* 8. Figure out which controllers are supported, and log about it */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");
        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}
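
/* The unified-vs-legacy decision that cg_unified_flush() makes rests on the
 * filesystem magic of /sys/fs/cgroup: cgroup2fs for the full unified hierarchy,
 * tmpfs for the legacy/hybrid per-controller layout. A reduced sketch of that
 * check; the helper name is illustrative: */
#include <linux/magic.h>
#include <sys/vfs.h>

static int example_all_unified(void) {
        struct statfs fs;

        if (statfs("/sys/fs/cgroup", &fs) < 0)
                return -1;

        if (fs.f_type == CGROUP2_SUPER_MAGIC)
                return 1;  /* full unified hierarchy (cgroup v2) */
        if (fs.f_type == TMPFS_MAGIC)
                return 0;  /* legacy or hybrid: per-controller mounts on tmpfs */

        return -1; /* unexpected filesystem type */
}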

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
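
/* The fallback loop above walks the cgroup path upwards, chopping one trailing
 * component per iteration until a lookup succeeds. The chopping itself, in
 * isolation, with an illustrative helper name and callback: */
#include <string.h>

static void example_walk_up(char *p, int (*try_lookup)(const char *path)) {
        char *e;

        for (;;) {
                if (try_lookup(p) > 0)
                        return; /* matched a unit */

                e = strrchr(p, '/');
                if (!e || e == p)
                        return; /* only the root component is left */

                *e = 0; /* chop off the last component and retry */
        }
}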

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;

        assert(m);

        if (!pid_is_valid(pid))
                return NULL;

        if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u, **array;

        assert(m);

        /* Note that a process might be owned by multiple units, we return only one here, which is good enough for most
         * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
         * relevant one as children of the process will be assigned to that one, too, before all else. */

        if (!pid_is_valid(pid))
                return NULL;

        if (pid == getpid_cached())
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = manager_get_unit_by_pid_cgroup(m, pid);
        if (u)
                return u;

        u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
        if (u)
                return u;

        array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
        if (array)
                return array[0];

        return NULL;
}
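
/* cg_pid_get_path() ultimately parses /proc/<pid>/cgroup. On the unified
 * hierarchy that file contains a single "0::<path>" line; a minimal parser
 * sketch for the calling process, with an illustrative helper name: */
#include <stdio.h>
#include <string.h>

static int example_pid_cgroup_path(char *buf, size_t size) {
        char line[4096];
        FILE *f = fopen("/proc/self/cgroup", "re");
        int r = -1;

        if (!f)
                return -1;

        while (fgets(line, sizeof(line), f))
                if (strncmp(line, "0::", 3) == 0) {
                        line[strcspn(line, "\n")] = 0; /* drop trailing newline */
                        snprintf(buf, size, "%s", line + 3);
                        r = 0;
                        break;
                }

        fclose(f);
        return r;
}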

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
         * or from the --system instance */

        log_debug("Got cgroup empty notification for: %s", cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        unit_add_to_cgroup_empty_queue(u);
        return 1;
}

int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, memory_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
        if (unit_has_root_cgroup(u))
                return procfs_memory_get_current(ret);

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}
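
/* Reading the same counter by hand: on the unified hierarchy, "memory.current"
 * holds a single decimal byte count. A sketch with an illustrative helper name
 * and reduced error handling: */
#include <inttypes.h>
#include <stdio.h>

static int example_memory_current(const char *cgroup_fs_path, uint64_t *ret) {
        char path[4096];
        FILE *f;
        int ok;

        snprintf(path, sizeof(path), "%s/memory.current", cgroup_fs_path);

        f = fopen(path, "re");
        if (!f)
                return -1;

        ok = fscanf(f, "%" SCNu64, ret) == 1;
        fclose(f);

        return ok ? 0 : -1;
}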

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
        if (unit_has_root_cgroup(u))
                return procfs_tasks_get_current(ret);

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
        if (unit_has_root_cgroup(u))
                return procfs_cpu_get_usage(ret);

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                _cleanup_free_ char *val = NULL;
                uint64_t us;

                if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
                        return -ENODATA;

                r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
                if (IN_SET(r, -ENOENT, -ENXIO))
                        return -ENODATA;
                if (r < 0)
                        return r;

                r = safe_atou64(val, &us);
                if (r < 0)
                        return r;

                ns = us * NSEC_PER_USEC;
        } else {
                if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                        return -ENODATA;

                r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
                if (r == -ENOENT)
                        return -ENODATA;
                if (r < 0)
                        return r;

                r = safe_atou64(v, &ns);
                if (r < 0)
                        return r;
        }

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        assert(u);

        /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
         * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
         * call this function with a NULL return value. */

        if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
                return -ENODATA;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
                /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
                 * cached value. */

                if (ret)
                        *ret = u->cpu_usage_last;
                return 0;
        }
        if (r < 0)
                return r;

        if (ns > u->cpu_usage_base)
                ns -= u->cpu_usage_base;
        else
                ns = 0;

        u->cpu_usage_last = ns;
        if (ret)
                *ret = ns;

        return 0;
}
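
/* The v2 branch of unit_get_cpu_usage_raw() pulls "usage_usec" out of the flat
 * key/value file cpu.stat and scales it to nanoseconds, as with NSEC_PER_USEC.
 * The same parsing in isolation; helper name illustrative: */
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

static int example_cpu_usage_nsec(const char *stat_path, uint64_t *ret) {
        char key[64];
        uint64_t val;
        FILE *f = fopen(stat_path, "re");
        int r = -1;

        if (!f)
                return -1;

        while (fscanf(f, "%63s %" SCNu64, key, &val) == 2)
                if (strcmp(key, "usage_usec") == 0) {
                        *ret = val * 1000; /* usec -> nsec */
                        r = 0;
                        break;
                }

        fclose(f);
        return r;
}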

int unit_get_ip_accounting(
                Unit *u,
                CGroupIPAccountingMetric metric,
                uint64_t *ret) {

        uint64_t value;
        int fd, r;

        assert(u);
        assert(metric >= 0);
        assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, ip_accounting))
                return -ENODATA;

        fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;
        if (fd < 0)
                return -ENODATA;

        if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
                r = bpf_firewall_read_accounting(fd, &value, NULL);
        else
                r = bpf_firewall_read_accounting(fd, NULL, &value);
        if (r < 0)
                return r;

        /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
         * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
         * ip_accounting_extra[] field, and add them in here transparently. */

        *ret = value + u->ip_accounting_extra[metric];

        return r;
}

int unit_reset_cpu_accounting(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        u->cpu_usage_last = NSEC_INFINITY;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpu_usage_base = 0;
                return r;
        }

        u->cpu_usage_base = ns;
        return 0;
}

int unit_reset_ip_accounting(Unit *u) {
        int r = 0, q = 0;

        assert(u);

        if (u->ip_accounting_ingress_map_fd >= 0)
                r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);

        if (u->ip_accounting_egress_map_fd >= 0)
                q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);

        zero(u->ip_accounting_extra);

        return r < 0 ? r : q;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        /* always invalidate compat pairs together */
        if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
                m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
                m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;

        if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_realize_queue(u);
}

void unit_invalidate_cgroup_bpf(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
                return;

        u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
        unit_add_to_cgroup_realize_queue(u);

        /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
         * list of our children includes our own. */
        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;
                void *v;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        unit_invalidate_cgroup_bpf(member);
                }
        }
}

bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        assert(u);

        if (!UNIT_VTABLE(u)->can_delegate)
                return false;

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);