1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2013 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <fcntl.h>
22 #include <fnmatch.h>
23
24 #include "alloc-util.h"
25 #include "blockdev-util.h"
26 #include "bpf-firewall.h"
27 #include "cgroup-util.h"
28 #include "cgroup.h"
29 #include "fd-util.h"
30 #include "fileio.h"
31 #include "fs-util.h"
32 #include "parse-util.h"
33 #include "path-util.h"
34 #include "process-util.h"
35 #include "special.h"
36 #include "stdio-util.h"
37 #include "string-table.h"
38 #include "string-util.h"
39
40 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
41
42 static void cgroup_compat_warn(void) {
43 static bool cgroup_compat_warned = false;
44
45 if (cgroup_compat_warned)
46 return;
47
48 log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
49 cgroup_compat_warned = true;
50 }
51
52 #define log_cgroup_compat(unit, fmt, ...) do { \
53 cgroup_compat_warn(); \
54 log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \
55 } while (false)
56
57 void cgroup_context_init(CGroupContext *c) {
58 assert(c);
59
60 /* Initialize everything to the kernel defaults, assuming the
61 * structure is preinitialized to 0 */
62
63 c->cpu_weight = CGROUP_WEIGHT_INVALID;
64 c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
65 c->cpu_quota_per_sec_usec = USEC_INFINITY;
66
67 c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
68 c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
69
70 c->memory_high = CGROUP_LIMIT_MAX;
71 c->memory_max = CGROUP_LIMIT_MAX;
72 c->memory_swap_max = CGROUP_LIMIT_MAX;
73
74 c->memory_limit = CGROUP_LIMIT_MAX;
75
76 c->io_weight = CGROUP_WEIGHT_INVALID;
77 c->startup_io_weight = CGROUP_WEIGHT_INVALID;
78
79 c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
80 c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
81
82 c->tasks_max = (uint64_t) -1;
83 }
84
85 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
86 assert(c);
87 assert(a);
88
89 LIST_REMOVE(device_allow, c->device_allow, a);
90 free(a->path);
91 free(a);
92 }
93
94 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
95 assert(c);
96 assert(w);
97
98 LIST_REMOVE(device_weights, c->io_device_weights, w);
99 free(w->path);
100 free(w);
101 }
102
103 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
104 assert(c);
105 assert(l);
106
107 LIST_REMOVE(device_limits, c->io_device_limits, l);
108 free(l->path);
109 free(l);
110 }
111
112 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
113 assert(c);
114 assert(w);
115
116 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
117 free(w->path);
118 free(w);
119 }
120
121 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
122 assert(c);
123 assert(b);
124
125 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
126 free(b->path);
127 free(b);
128 }
129
130 void cgroup_context_done(CGroupContext *c) {
131 assert(c);
132
133 while (c->io_device_weights)
134 cgroup_context_free_io_device_weight(c, c->io_device_weights);
135
136 while (c->io_device_limits)
137 cgroup_context_free_io_device_limit(c, c->io_device_limits);
138
139 while (c->blockio_device_weights)
140 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
141
142 while (c->blockio_device_bandwidths)
143 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
144
145 while (c->device_allow)
146 cgroup_context_free_device_allow(c, c->device_allow);
147
148 c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
149 c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
150 }
151
152 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
153 CGroupIODeviceLimit *il;
154 CGroupIODeviceWeight *iw;
155 CGroupBlockIODeviceBandwidth *b;
156 CGroupBlockIODeviceWeight *w;
157 CGroupDeviceAllow *a;
158 IPAddressAccessItem *iaai;
159 char u[FORMAT_TIMESPAN_MAX];
160
161 assert(c);
162 assert(f);
163
164 prefix = strempty(prefix);
165
166 fprintf(f,
167 "%sCPUAccounting=%s\n"
168 "%sIOAccounting=%s\n"
169 "%sBlockIOAccounting=%s\n"
170 "%sMemoryAccounting=%s\n"
171 "%sTasksAccounting=%s\n"
172 "%sIPAccounting=%s\n"
173 "%sCPUWeight=%" PRIu64 "\n"
174 "%sStartupCPUWeight=%" PRIu64 "\n"
175 "%sCPUShares=%" PRIu64 "\n"
176 "%sStartupCPUShares=%" PRIu64 "\n"
177 "%sCPUQuotaPerSecSec=%s\n"
178 "%sIOWeight=%" PRIu64 "\n"
179 "%sStartupIOWeight=%" PRIu64 "\n"
180 "%sBlockIOWeight=%" PRIu64 "\n"
181 "%sStartupBlockIOWeight=%" PRIu64 "\n"
182 "%sMemoryLow=%" PRIu64 "\n"
183 "%sMemoryHigh=%" PRIu64 "\n"
184 "%sMemoryMax=%" PRIu64 "\n"
185 "%sMemorySwapMax=%" PRIu64 "\n"
186 "%sMemoryLimit=%" PRIu64 "\n"
187 "%sTasksMax=%" PRIu64 "\n"
188 "%sDevicePolicy=%s\n"
189 "%sDelegate=%s\n",
190 prefix, yes_no(c->cpu_accounting),
191 prefix, yes_no(c->io_accounting),
192 prefix, yes_no(c->blockio_accounting),
193 prefix, yes_no(c->memory_accounting),
194 prefix, yes_no(c->tasks_accounting),
195 prefix, yes_no(c->ip_accounting),
196 prefix, c->cpu_weight,
197 prefix, c->startup_cpu_weight,
198 prefix, c->cpu_shares,
199 prefix, c->startup_cpu_shares,
200 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
201 prefix, c->io_weight,
202 prefix, c->startup_io_weight,
203 prefix, c->blockio_weight,
204 prefix, c->startup_blockio_weight,
205 prefix, c->memory_low,
206 prefix, c->memory_high,
207 prefix, c->memory_max,
208 prefix, c->memory_swap_max,
209 prefix, c->memory_limit,
210 prefix, c->tasks_max,
211 prefix, cgroup_device_policy_to_string(c->device_policy),
212 prefix, yes_no(c->delegate));
213
214 if (c->delegate) {
215 _cleanup_free_ char *t = NULL;
216
217 (void) cg_mask_to_string(c->delegate_controllers, &t);
218
219 fprintf(f, "%sDelegateControllers=%s\n",
220 prefix,
221 strempty(t));
222 }
223
224 LIST_FOREACH(device_allow, a, c->device_allow)
225 fprintf(f,
226 "%sDeviceAllow=%s %s%s%s\n",
227 prefix,
228 a->path,
229 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
230
231 LIST_FOREACH(device_weights, iw, c->io_device_weights)
232 fprintf(f,
233 "%sIODeviceWeight=%s %" PRIu64,
234 prefix,
235 iw->path,
236 iw->weight);
237
238 LIST_FOREACH(device_limits, il, c->io_device_limits) {
239 char buf[FORMAT_BYTES_MAX];
240 CGroupIOLimitType type;
241
242 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
243 if (il->limits[type] != cgroup_io_limit_defaults[type])
244 fprintf(f,
245 "%s%s=%s %s\n",
246 prefix,
247 cgroup_io_limit_type_to_string(type),
248 il->path,
249 format_bytes(buf, sizeof(buf), il->limits[type]));
250 }
251
252 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
253 fprintf(f,
254 "%sBlockIODeviceWeight=%s %" PRIu64,
255 prefix,
256 w->path,
257 w->weight);
258
259 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
260 char buf[FORMAT_BYTES_MAX];
261
262 if (b->rbps != CGROUP_LIMIT_MAX)
263 fprintf(f,
264 "%sBlockIOReadBandwidth=%s %s\n",
265 prefix,
266 b->path,
267 format_bytes(buf, sizeof(buf), b->rbps));
268 if (b->wbps != CGROUP_LIMIT_MAX)
269 fprintf(f,
270 "%sBlockIOWriteBandwidth=%s %s\n",
271 prefix,
272 b->path,
273 format_bytes(buf, sizeof(buf), b->wbps));
274 }
275
276 LIST_FOREACH(items, iaai, c->ip_address_allow) {
277 _cleanup_free_ char *k = NULL;
278
279 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
280 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
281 }
282
283 LIST_FOREACH(items, iaai, c->ip_address_deny) {
284 _cleanup_free_ char *k = NULL;
285
286 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
287 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
288 }
289 }
290
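/* Resolve a path to the block device (dev_t) backing it: the node itself if it is a block device,
 * otherwise the device of the file system it lives on, preferring the whole disk over a partition. */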
291 static int lookup_block_device(const char *p, dev_t *dev) {
292 struct stat st;
293 int r;
294
295 assert(p);
296 assert(dev);
297
298 r = stat(p, &st);
299 if (r < 0)
300 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
301
302 if (S_ISBLK(st.st_mode))
303 *dev = st.st_rdev;
304 else if (major(st.st_dev) != 0) {
305 /* If this is not a device node then find the block
306 * device this file is stored on */
307 *dev = st.st_dev;
308
309 /* If this is a partition, try to get the originating
310 * block device */
311 (void) block_get_whole_disk(*dev, dev);
312 } else {
313 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
314 return -ENODEV;
315 }
316
317 return 0;
318 }
319
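/* Add a single device node to the legacy "devices" controller whitelist by writing a "<c|b> <major>:<minor> <acc>"
 * entry to devices.allow. A leading '-' in the node path means a missing node is silently skipped. */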
320 static int whitelist_device(const char *path, const char *node, const char *acc) {
321 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
322 struct stat st;
323 bool ignore_notfound;
324 int r;
325
326 assert(path);
327 assert(acc);
328
329 if (node[0] == '-') {
330 /* Non-existent paths starting with "-" must be silently ignored */
331 node++;
332 ignore_notfound = true;
333 } else
334 ignore_notfound = false;
335
336 if (stat(node, &st) < 0) {
337 if (errno == ENOENT && ignore_notfound)
338 return 0;
339
340 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
341 }
342
343 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
344 log_warning("%s is not a device.", node);
345 return -ENODEV;
346 }
347
348 sprintf(buf,
349 "%c %u:%u %s",
350 S_ISCHR(st.st_mode) ? 'c' : 'b',
351 major(st.st_rdev), minor(st.st_rdev),
352 acc);
353
354 r = cg_set_attribute("devices", path, "devices.allow", buf);
355 if (r < 0)
356 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
357 "Failed to set devices.allow on %s: %m", path);
358
359 return r;
360 }
361
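/* Whitelist all minors of every major whose driver name matches the given glob, by parsing
 * /proc/devices and writing "<type> <major>:* <acc>" entries to devices.allow. */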
362 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
363 _cleanup_fclose_ FILE *f = NULL;
364 char line[LINE_MAX];
365 bool good = false;
366 int r;
367
368 assert(path);
369 assert(acc);
370 assert(IN_SET(type, 'b', 'c'));
371
372 f = fopen("/proc/devices", "re");
373 if (!f)
374 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
375
376 FOREACH_LINE(line, f, goto fail) {
377 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
378 unsigned maj;
379
380 truncate_nl(line);
381
382 if (type == 'c' && streq(line, "Character devices:")) {
383 good = true;
384 continue;
385 }
386
387 if (type == 'b' && streq(line, "Block devices:")) {
388 good = true;
389 continue;
390 }
391
392 if (isempty(line)) {
393 good = false;
394 continue;
395 }
396
397 if (!good)
398 continue;
399
400 p = strstrip(line);
401
402 w = strpbrk(p, WHITESPACE);
403 if (!w)
404 continue;
405 *w = 0;
406
407 r = safe_atou(p, &maj);
408 if (r < 0)
409 continue;
410 if (maj <= 0)
411 continue;
412
413 w++;
414 w += strspn(w, WHITESPACE);
415
416 if (fnmatch(name, w, 0) != 0)
417 continue;
418
419 sprintf(buf,
420 "%c %u:* %s",
421 type,
422 maj,
423 acc);
424
425 r = cg_set_attribute("devices", path, "devices.allow", buf);
426 if (r < 0)
427 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
428 "Failed to set devices.allow on %s: %m", path);
429 }
430
431 return 0;
432
433 fail:
434 return log_warning_errno(errno, "Failed to read /proc/devices: %m");
435 }
436
437 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
438 return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
439 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
440 }
441
442 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
443 return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
444 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
445 }
446
447 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
448 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
449 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
450 return c->startup_cpu_weight;
451 else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
452 return c->cpu_weight;
453 else
454 return CGROUP_WEIGHT_DEFAULT;
455 }
456
457 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
458 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
459 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
460 return c->startup_cpu_shares;
461 else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
462 return c->cpu_shares;
463 else
464 return CGROUP_CPU_SHARES_DEFAULT;
465 }
466
467 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
468 char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
469 int r;
470
471 xsprintf(buf, "%" PRIu64 "\n", weight);
472 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
473 if (r < 0)
474 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
475 "Failed to set cpu.weight: %m");
476
477 if (quota != USEC_INFINITY)
478 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
479 quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
480 else
481 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
482
483 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
484
485 if (r < 0)
486 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
487 "Failed to set cpu.max: %m");
488 }
489
490 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
491 char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
492 int r;
493
494 xsprintf(buf, "%" PRIu64 "\n", shares);
495 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
496 if (r < 0)
497 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
498 "Failed to set cpu.shares: %m");
499
500 xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
501 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
502 if (r < 0)
503 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
504 "Failed to set cpu.cfs_period_us: %m");
505
506 if (quota != USEC_INFINITY) {
507 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
508 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
509 } else
510 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
511 if (r < 0)
512 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
513 "Failed to set cpu.cfs_quota_us: %m");
514 }
515
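/* Conversions between legacy cpu.shares and unified cpu.weight values: both scale linearly
 * around their respective defaults and are clamped to the valid range. */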
516 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
517 return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
518 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
519 }
520
521 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
522 return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
523 CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
524 }
525
526 static bool cgroup_context_has_io_config(CGroupContext *c) {
527 return c->io_accounting ||
528 c->io_weight != CGROUP_WEIGHT_INVALID ||
529 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
530 c->io_device_weights ||
531 c->io_device_limits;
532 }
533
534 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
535 return c->blockio_accounting ||
536 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
537 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
538 c->blockio_device_weights ||
539 c->blockio_device_bandwidths;
540 }
541
542 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
543 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
544 c->startup_io_weight != CGROUP_WEIGHT_INVALID)
545 return c->startup_io_weight;
546 else if (c->io_weight != CGROUP_WEIGHT_INVALID)
547 return c->io_weight;
548 else
549 return CGROUP_WEIGHT_DEFAULT;
550 }
551
552 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
553 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
554 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
555 return c->startup_blockio_weight;
556 else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
557 return c->blockio_weight;
558 else
559 return CGROUP_BLKIO_WEIGHT_DEFAULT;
560 }
561
562 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
563 return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
564 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
565 }
566
567 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
568 return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
569 CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
570 }
571
572 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
573 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
574 dev_t dev;
575 int r;
576
577 r = lookup_block_device(dev_path, &dev);
578 if (r < 0)
579 return;
580
581 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
582 r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
583 if (r < 0)
584 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
585 "Failed to set io.weight: %m");
586 }
587
588 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
589 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
590 dev_t dev;
591 int r;
592
593 r = lookup_block_device(dev_path, &dev);
594 if (r < 0)
595 return;
596
597 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
598 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
599 if (r < 0)
600 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
601 "Failed to set blkio.weight_device: %m");
602 }
603
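/* Write a per-device io.max line (rbps/wbps/riops/wiops) on the unified hierarchy. Returns the number
 * of limits that deviate from the defaults, so that callers can free entries that configure nothing. */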
604 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
605 char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
606 char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
607 CGroupIOLimitType type;
608 dev_t dev;
609 unsigned n = 0;
610 int r;
611
612 r = lookup_block_device(dev_path, &dev);
613 if (r < 0)
614 return 0;
615
616 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
617 if (limits[type] != cgroup_io_limit_defaults[type]) {
618 xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
619 n++;
620 } else {
621 xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
622 }
623 }
624
625 xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
626 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
627 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
628 r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
629 if (r < 0)
630 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
631 "Failed to set io.max: %m");
632 return n;
633 }
634
635 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
636 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
637 dev_t dev;
638 unsigned n = 0;
639 int r;
640
641 r = lookup_block_device(dev_path, &dev);
642 if (r < 0)
643 return 0;
644
645 if (rbps != CGROUP_LIMIT_MAX)
646 n++;
647 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
648 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
649 if (r < 0)
650 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
651 "Failed to set blkio.throttle.read_bps_device: %m");
652
653 if (wbps != CGROUP_LIMIT_MAX)
654 n++;
655 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
656 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
657 if (r < 0)
658 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
659 "Failed to set blkio.throttle.write_bps_device: %m");
660
661 return n;
662 }
663
664 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
665 return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
666 }
667
668 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
669 char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
670 int r;
671
672 if (v != CGROUP_LIMIT_MAX)
673 xsprintf(buf, "%" PRIu64 "\n", v);
674
675 r = cg_set_attribute("memory", u->cgroup_path, file, buf);
676 if (r < 0)
677 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
678 "Failed to set %s: %m", file);
679 }
680
681 static void cgroup_apply_firewall(Unit *u) {
682 int r;
683
684 assert(u);
685
686 if (u->type == UNIT_SLICE) /* Skip this for slice units: they are inner cgroup nodes, and since bpf/cgroup is
687 * not recursive, we don't ever touch the bpf on them */
688 return;
689
690 r = bpf_firewall_compile(u);
691 if (r < 0)
692 return;
693
694 (void) bpf_firewall_install(u);
695 return;
696 }
697
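/* Apply the unit's CGroupContext to the kernel for all controllers selected in apply_mask, translating
 * between unified and legacy attributes as necessary, and optionally (re)install the BPF firewall. */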
698 static void cgroup_context_apply(
699 Unit *u,
700 CGroupMask apply_mask,
701 bool apply_bpf,
702 ManagerState state) {
703
704 const char *path;
705 CGroupContext *c;
706 bool is_root;
707 int r;
708
709 assert(u);
710
711 c = unit_get_cgroup_context(u);
712 path = u->cgroup_path;
713
714 assert(c);
715 assert(path);
716
717 /* Nothing to do? Exit early! */
718 if (apply_mask == 0 && !apply_bpf)
719 return;
720
721 /* Some cgroup attributes are not supported on the root cgroup,
722 * hence silently ignore */
723 is_root = isempty(path) || path_equal(path, "/");
724 if (is_root)
725 /* Make sure we don't try to display messages with an empty path. */
726 path = "/";
727
728 /* We generally ignore errors caused by read-only mounted
729 * cgroup trees (assuming we are running in a container then),
730 * and missing cgroups, i.e. EROFS and ENOENT. */
731
732 if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
733 bool has_weight, has_shares;
734
735 has_weight = cgroup_context_has_cpu_weight(c);
736 has_shares = cgroup_context_has_cpu_shares(c);
737
738 if (cg_all_unified() > 0) {
739 uint64_t weight;
740
741 if (has_weight)
742 weight = cgroup_context_cpu_weight(c, state);
743 else if (has_shares) {
744 uint64_t shares = cgroup_context_cpu_shares(c, state);
745
746 weight = cgroup_cpu_shares_to_weight(shares);
747
748 log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
749 shares, weight, path);
750 } else
751 weight = CGROUP_WEIGHT_DEFAULT;
752
753 cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
754 } else {
755 uint64_t shares;
756
757 if (has_weight) {
758 uint64_t weight = cgroup_context_cpu_weight(c, state);
759
760 shares = cgroup_cpu_weight_to_shares(weight);
761
762 log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
763 weight, shares, path);
764 } else if (has_shares)
765 shares = cgroup_context_cpu_shares(c, state);
766 else
767 shares = CGROUP_CPU_SHARES_DEFAULT;
768
769 cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
770 }
771 }
772
773 if (apply_mask & CGROUP_MASK_IO) {
774 bool has_io = cgroup_context_has_io_config(c);
775 bool has_blockio = cgroup_context_has_blockio_config(c);
776
777 if (!is_root) {
778 char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
779 uint64_t weight;
780
781 if (has_io)
782 weight = cgroup_context_io_weight(c, state);
783 else if (has_blockio) {
784 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
785
786 weight = cgroup_weight_blkio_to_io(blkio_weight);
787
788 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
789 blkio_weight, weight);
790 } else
791 weight = CGROUP_WEIGHT_DEFAULT;
792
793 xsprintf(buf, "default %" PRIu64 "\n", weight);
794 r = cg_set_attribute("io", path, "io.weight", buf);
795 if (r < 0)
796 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
797 "Failed to set io.weight: %m");
798
799 if (has_io) {
800 CGroupIODeviceWeight *w;
801
802 /* FIXME: no way to reset this list */
803 LIST_FOREACH(device_weights, w, c->io_device_weights)
804 cgroup_apply_io_device_weight(u, w->path, w->weight);
805 } else if (has_blockio) {
806 CGroupBlockIODeviceWeight *w;
807
808 /* FIXME: no way to reset this list */
809 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
810 weight = cgroup_weight_blkio_to_io(w->weight);
811
812 log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
813 w->weight, weight, w->path);
814
815 cgroup_apply_io_device_weight(u, w->path, weight);
816 }
817 }
818 }
819
820 /* Apply limits and free ones without config. */
821 if (has_io) {
822 CGroupIODeviceLimit *l, *next;
823
824 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
825 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
826 cgroup_context_free_io_device_limit(c, l);
827 }
828 } else if (has_blockio) {
829 CGroupBlockIODeviceBandwidth *b, *next;
830
831 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
832 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
833 CGroupIOLimitType type;
834
835 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
836 limits[type] = cgroup_io_limit_defaults[type];
837
838 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
839 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
840
841 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
842 b->rbps, b->wbps, b->path);
843
844 if (!cgroup_apply_io_device_limit(u, b->path, limits))
845 cgroup_context_free_blockio_device_bandwidth(c, b);
846 }
847 }
848 }
849
850 if (apply_mask & CGROUP_MASK_BLKIO) {
851 bool has_io = cgroup_context_has_io_config(c);
852 bool has_blockio = cgroup_context_has_blockio_config(c);
853
854 if (!is_root) {
855 char buf[DECIMAL_STR_MAX(uint64_t)+1];
856 uint64_t weight;
857
858 if (has_io) {
859 uint64_t io_weight = cgroup_context_io_weight(c, state);
860
861 weight = cgroup_weight_io_to_blkio(io_weight);
862
863 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
864 io_weight, weight);
865 } else if (has_blockio)
866 weight = cgroup_context_blkio_weight(c, state);
867 else
868 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
869
870 xsprintf(buf, "%" PRIu64 "\n", weight);
871 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
872 if (r < 0)
873 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
874 "Failed to set blkio.weight: %m");
875
876 if (has_io) {
877 CGroupIODeviceWeight *w;
878
879 /* FIXME: no way to reset this list */
880 LIST_FOREACH(device_weights, w, c->io_device_weights) {
881 weight = cgroup_weight_io_to_blkio(w->weight);
882
883 log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
884 w->weight, weight, w->path);
885
886 cgroup_apply_blkio_device_weight(u, w->path, weight);
887 }
888 } else if (has_blockio) {
889 CGroupBlockIODeviceWeight *w;
890
891 /* FIXME: no way to reset this list */
892 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
893 cgroup_apply_blkio_device_weight(u, w->path, w->weight);
894 }
895 }
896
897 /* Apply limits and free ones without config. */
898 if (has_io) {
899 CGroupIODeviceLimit *l, *next;
900
901 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
902 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
903 l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
904
905 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
906 cgroup_context_free_io_device_limit(c, l);
907 }
908 } else if (has_blockio) {
909 CGroupBlockIODeviceBandwidth *b, *next;
910
911 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
912 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
913 cgroup_context_free_blockio_device_bandwidth(c, b);
914 }
915 }
916
917 if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
918 if (cg_all_unified() > 0) {
919 uint64_t max, swap_max = CGROUP_LIMIT_MAX;
920
921 if (cgroup_context_has_unified_memory_config(c)) {
922 max = c->memory_max;
923 swap_max = c->memory_swap_max;
924 } else {
925 max = c->memory_limit;
926
927 if (max != CGROUP_LIMIT_MAX)
928 log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
929 }
930
931 cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
932 cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
933 cgroup_apply_unified_memory_limit(u, "memory.max", max);
934 cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
935 } else {
936 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
937 uint64_t val;
938
939 if (cgroup_context_has_unified_memory_config(c)) {
940 val = c->memory_max;
941 log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", val);
942 } else
943 val = c->memory_limit;
944
945 if (val == CGROUP_LIMIT_MAX)
946 strncpy(buf, "-1\n", sizeof(buf));
947 else
948 xsprintf(buf, "%" PRIu64 "\n", val);
949
950 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
951 if (r < 0)
952 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
953 "Failed to set memory.limit_in_bytes: %m");
954 }
955 }
956
957 if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
958 CGroupDeviceAllow *a;
959
960 /* Changing the devices list of a populated cgroup
961 * might result in EINVAL, hence ignore EINVAL
962 * here. */
963
964 if (c->device_allow || c->device_policy != CGROUP_AUTO)
965 r = cg_set_attribute("devices", path, "devices.deny", "a");
966 else
967 r = cg_set_attribute("devices", path, "devices.allow", "a");
968 if (r < 0)
969 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
970 "Failed to reset devices.list: %m");
971
972 if (c->device_policy == CGROUP_CLOSED ||
973 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
974 static const char auto_devices[] =
975 "/dev/null\0" "rwm\0"
976 "/dev/zero\0" "rwm\0"
977 "/dev/full\0" "rwm\0"
978 "/dev/random\0" "rwm\0"
979 "/dev/urandom\0" "rwm\0"
980 "/dev/tty\0" "rwm\0"
981 "/dev/ptmx\0" "rwm\0"
982 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
983 "-/run/systemd/inaccessible/chr\0" "rwm\0"
984 "-/run/systemd/inaccessible/blk\0" "rwm\0";
985
986 const char *x, *y;
987
988 NULSTR_FOREACH_PAIR(x, y, auto_devices)
989 whitelist_device(path, x, y);
990
991 /* PTS (/dev/pts) devices may not be duplicated, but accessed */
992 whitelist_major(path, "pts", 'c', "rw");
993 }
994
995 LIST_FOREACH(device_allow, a, c->device_allow) {
996 char acc[4], *val;
997 unsigned k = 0;
998
999 if (a->r)
1000 acc[k++] = 'r';
1001 if (a->w)
1002 acc[k++] = 'w';
1003 if (a->m)
1004 acc[k++] = 'm';
1005
1006 if (k == 0)
1007 continue;
1008
1009 acc[k++] = 0;
1010
1011 if (path_startswith(a->path, "/dev/"))
1012 whitelist_device(path, a->path, acc);
1013 else if ((val = startswith(a->path, "block-")))
1014 whitelist_major(path, val, 'b', acc);
1015 else if ((val = startswith(a->path, "char-")))
1016 whitelist_major(path, val, 'c', acc);
1017 else
1018 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
1019 }
1020 }
1021
1022 if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) {
1023
1024 if (c->tasks_max != CGROUP_LIMIT_MAX) {
1025 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1026
1027 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1028 r = cg_set_attribute("pids", path, "pids.max", buf);
1029 } else
1030 r = cg_set_attribute("pids", path, "pids.max", "max");
1031
1032 if (r < 0)
1033 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1034 "Failed to set pids.max: %m");
1035 }
1036
1037 if (apply_bpf)
1038 cgroup_apply_firewall(u);
1039 }
1040
1041 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1042 CGroupMask mask = 0;
1043
1044 /* Figure out which controllers we need */
1045
1046 if (c->cpu_accounting ||
1047 cgroup_context_has_cpu_weight(c) ||
1048 cgroup_context_has_cpu_shares(c) ||
1049 c->cpu_quota_per_sec_usec != USEC_INFINITY)
1050 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
1051
1052 if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1053 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1054
1055 if (c->memory_accounting ||
1056 c->memory_limit != CGROUP_LIMIT_MAX ||
1057 cgroup_context_has_unified_memory_config(c))
1058 mask |= CGROUP_MASK_MEMORY;
1059
1060 if (c->device_allow ||
1061 c->device_policy != CGROUP_AUTO)
1062 mask |= CGROUP_MASK_DEVICES;
1063
1064 if (c->tasks_accounting ||
1065 c->tasks_max != (uint64_t) -1)
1066 mask |= CGROUP_MASK_PIDS;
1067
1068 return mask;
1069 }
1070
1071 CGroupMask unit_get_own_mask(Unit *u) {
1072 CGroupContext *c;
1073
1074 /* Returns the mask of controllers the unit needs for itself */
1075
1076 c = unit_get_cgroup_context(u);
1077 if (!c)
1078 return 0;
1079
1080 return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
1081 }
1082
1083 CGroupMask unit_get_delegate_mask(Unit *u) {
1084 CGroupContext *c;
1085
1086 /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1087 * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1088 *
1089 * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1090
1091 if (u->type == UNIT_SLICE)
1092 return 0;
1093
1094 c = unit_get_cgroup_context(u);
1095 if (!c)
1096 return 0;
1097
1098 if (!c->delegate)
1099 return 0;
1100
1101 if (cg_all_unified() <= 0) {
1102 ExecContext *e;
1103
1104 e = unit_get_exec_context(u);
1105 if (e && !exec_context_maintains_privileges(e))
1106 return 0;
1107 }
1108
1109 return c->delegate_controllers;
1110 }
1111
1112 CGroupMask unit_get_members_mask(Unit *u) {
1113 assert(u);
1114
1115 /* Returns the mask of controllers all of the unit's children require, merged */
1116
1117 if (u->cgroup_members_mask_valid)
1118 return u->cgroup_members_mask;
1119
1120 u->cgroup_members_mask = 0;
1121
1122 if (u->type == UNIT_SLICE) {
1123 void *v;
1124 Unit *member;
1125 Iterator i;
1126
1127 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1128
1129 if (member == u)
1130 continue;
1131
1132 if (UNIT_DEREF(member->slice) != u)
1133 continue;
1134
1135 u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1136 }
1137 }
1138
1139 u->cgroup_members_mask_valid = true;
1140 return u->cgroup_members_mask;
1141 }
1142
1143 CGroupMask unit_get_siblings_mask(Unit *u) {
1144 assert(u);
1145
1146 /* Returns the mask of controllers all of the unit's siblings
1147 * require, i.e. the members mask of the unit's parent slice
1148 * if there is one. */
1149
1150 if (UNIT_ISSET(u->slice))
1151 return unit_get_members_mask(UNIT_DEREF(u->slice));
1152
1153 return unit_get_subtree_mask(u); /* we are the top-level slice */
1154 }
1155
1156 CGroupMask unit_get_subtree_mask(Unit *u) {
1157
1158 /* Returns the mask of this subtree, meaning of the group
1159 * itself and its children. */
1160
1161 return unit_get_own_mask(u) | unit_get_members_mask(u);
1162 }
1163
1164 CGroupMask unit_get_target_mask(Unit *u) {
1165 CGroupMask mask;
1166
1167 /* This returns the cgroup mask of all controllers to enable
1168 * for a specific cgroup, i.e. everything it needs itself,
1169 * plus all that its children need, plus all that its siblings
1170 * need. This is primarily useful on the legacy cgroup
1171 * hierarchy, where we need to duplicate each cgroup in each
1172 * hierarchy that shall be enabled for it. */
1173
1174 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1175 mask &= u->manager->cgroup_supported;
1176
1177 return mask;
1178 }
1179
1180 CGroupMask unit_get_enable_mask(Unit *u) {
1181 CGroupMask mask;
1182
1183 /* This returns the cgroup mask of all controllers to enable
1184 * for the children of a specific cgroup. This is primarily
1185 * useful for the unified cgroup hierarchy, where each cgroup
1186 * controls which controllers are enabled for its children. */
1187
1188 mask = unit_get_members_mask(u);
1189 mask &= u->manager->cgroup_supported;
1190
1191 return mask;
1192 }
1193
1194 bool unit_get_needs_bpf(Unit *u) {
1195 CGroupContext *c;
1196 Unit *p;
1197 assert(u);
1198
1199 /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
1200 * moment. */
1201 if (u->type == UNIT_SLICE)
1202 return false;
1203
1204 c = unit_get_cgroup_context(u);
1205 if (!c)
1206 return false;
1207
1208 if (c->ip_accounting ||
1209 c->ip_address_allow ||
1210 c->ip_address_deny)
1211 return true;
1212
1213 /* If any parent slice has an IP access list defined, it applies too */
1214 for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1215 c = unit_get_cgroup_context(p);
1216 if (!c)
1217 return false;
1218
1219 if (c->ip_address_allow ||
1220 c->ip_address_deny)
1221 return true;
1222 }
1223
1224 return false;
1225 }
1226
1227 /* Recurse from a unit up through its containing slices, propagating
1228 * mask bits upward. A unit is also member of itself. */
1229 void unit_update_cgroup_members_masks(Unit *u) {
1230 CGroupMask m;
1231 bool more;
1232
1233 assert(u);
1234
1235 /* Calculate subtree mask */
1236 m = unit_get_subtree_mask(u);
1237
1238 /* See if anything changed from the previous invocation. If
1239 * not, we're done. */
1240 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
1241 return;
1242
1243 more =
1244 u->cgroup_subtree_mask_valid &&
1245 ((m & ~u->cgroup_subtree_mask) != 0) &&
1246 ((~m & u->cgroup_subtree_mask) == 0);
1247
1248 u->cgroup_subtree_mask = m;
1249 u->cgroup_subtree_mask_valid = true;
1250
1251 if (UNIT_ISSET(u->slice)) {
1252 Unit *s = UNIT_DEREF(u->slice);
1253
1254 if (more)
1255 /* There's more set now than before. We
1256 * propagate the new mask to the parent's mask
1257 * (not caring if it actually was valid or
1258 * not). */
1259
1260 s->cgroup_members_mask |= m;
1261
1262 else
1263 /* There's less set now than before (or we
1264 * don't know), we need to recalculate
1265 * everything, so let's invalidate the
1266 * parent's members mask */
1267
1268 s->cgroup_members_mask_valid = false;
1269
1270 /* And now make sure that this change also hits our
1271 * grandparents */
1272 unit_update_cgroup_members_masks(s);
1273 }
1274 }
1275
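/* Migration callback: walk up the slice tree and return the closest cgroup path that is realized
 * for all controllers in the given mask. */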
1276 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1277 Unit *u = userdata;
1278
1279 assert(mask != 0);
1280 assert(u);
1281
1282 while (u) {
1283 if (u->cgroup_path &&
1284 u->cgroup_realized &&
1285 (u->cgroup_realized_mask & mask) == mask)
1286 return u->cgroup_path;
1287
1288 u = UNIT_DEREF(u->slice);
1289 }
1290
1291 return NULL;
1292 }
1293
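/* Compute the default cgroup path of a unit below the manager's cgroup root, escaping the unit name
 * and prepending the parent slice path, if any. */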
1294 char *unit_default_cgroup_path(Unit *u) {
1295 _cleanup_free_ char *escaped = NULL, *slice = NULL;
1296 int r;
1297
1298 assert(u);
1299
1300 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1301 return strdup(u->manager->cgroup_root);
1302
1303 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1304 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1305 if (r < 0)
1306 return NULL;
1307 }
1308
1309 escaped = cg_escape(u->id);
1310 if (!escaped)
1311 return NULL;
1312
1313 if (slice)
1314 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1315 escaped);
1316 else
1317 return strjoin(u->manager->cgroup_root, "/", escaped);
1318 }
1319
1320 int unit_set_cgroup_path(Unit *u, const char *path) {
1321 _cleanup_free_ char *p = NULL;
1322 int r;
1323
1324 assert(u);
1325
1326 if (path) {
1327 p = strdup(path);
1328 if (!p)
1329 return -ENOMEM;
1330 } else
1331 p = NULL;
1332
1333 if (streq_ptr(u->cgroup_path, p))
1334 return 0;
1335
1336 if (p) {
1337 r = hashmap_put(u->manager->cgroup_unit, p, u);
1338 if (r < 0)
1339 return r;
1340 }
1341
1342 unit_release_cgroup(u);
1343
1344 u->cgroup_path = p;
1345 p = NULL;
1346
1347 return 1;
1348 }
1349
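/* On the unified hierarchy, add an inotify watch on the unit's cgroup.events file so that we are
 * notified when the cgroup becomes empty. */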
1350 int unit_watch_cgroup(Unit *u) {
1351 _cleanup_free_ char *events = NULL;
1352 int r;
1353
1354 assert(u);
1355
1356 if (!u->cgroup_path)
1357 return 0;
1358
1359 if (u->cgroup_inotify_wd >= 0)
1360 return 0;
1361
1362 /* Only applies to the unified hierarchy */
1363 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1364 if (r < 0)
1365 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1366 if (r == 0)
1367 return 0;
1368
1369 /* Don't watch the root slice, it's pointless. */
1370 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1371 return 0;
1372
1373 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1374 if (r < 0)
1375 return log_oom();
1376
1377 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1378 if (r < 0)
1379 return log_oom();
1380
1381 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1382 if (u->cgroup_inotify_wd < 0) {
1383
1384 if (errno == ENOENT) /* If the directory is already
1385 * gone we don't need to track
1386 * it, so this is not an error */
1387 return 0;
1388
1389 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1390 }
1391
1392 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1393 if (r < 0)
1394 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1395
1396 return 0;
1397 }
1398
1399 int unit_pick_cgroup_path(Unit *u) {
1400 _cleanup_free_ char *path = NULL;
1401 int r;
1402
1403 assert(u);
1404
1405 if (u->cgroup_path)
1406 return 0;
1407
1408 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1409 return -EINVAL;
1410
1411 path = unit_default_cgroup_path(u);
1412 if (!path)
1413 return log_oom();
1414
1415 r = unit_set_cgroup_path(u, path);
1416 if (r == -EEXIST)
1417 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1418 if (r < 0)
1419 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1420
1421 return 0;
1422 }
1423
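/* Create the unit's cgroup in all hierarchies, enable the requested controllers, and migrate our
 * processes into it, unless the unit is a slice or uses delegation, in which case subgroups may own the processes. */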
1424 static int unit_create_cgroup(
1425 Unit *u,
1426 CGroupMask target_mask,
1427 CGroupMask enable_mask,
1428 bool needs_bpf) {
1429
1430 CGroupContext *c;
1431 int r;
1432
1433 assert(u);
1434
1435 c = unit_get_cgroup_context(u);
1436 if (!c)
1437 return 0;
1438
1439 /* Figure out our cgroup path */
1440 r = unit_pick_cgroup_path(u);
1441 if (r < 0)
1442 return r;
1443
1444 /* First, create our own group */
1445 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1446 if (r < 0)
1447 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1448
1449 /* Start watching it */
1450 (void) unit_watch_cgroup(u);
1451
1452 /* Enable all controllers we need */
1453 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1454 if (r < 0)
1455 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1456
1457 /* Keep track that this is now realized */
1458 u->cgroup_realized = true;
1459 u->cgroup_realized_mask = target_mask;
1460 u->cgroup_enabled_mask = enable_mask;
1461 u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1462
1463 if (u->type != UNIT_SLICE && !c->delegate) {
1464
1465 /* Then, possibly move things over, but not if
1466 * subgroups may contain processes, which is the case
1467 * for slice and delegation units. */
1468 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1469 if (r < 0)
1470 log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
1471 }
1472
1473 return 0;
1474 }
1475
1476 int unit_attach_pids_to_cgroup(Unit *u) {
1477 int r;
1478 assert(u);
1479
1480 r = unit_realize_cgroup(u);
1481 if (r < 0)
1482 return r;
1483
1484 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
1485 if (r < 0)
1486 return r;
1487
1488 return 0;
1489 }
1490
1491 static void cgroup_xattr_apply(Unit *u) {
1492 char ids[SD_ID128_STRING_MAX];
1493 int r;
1494
1495 assert(u);
1496
1497 if (!MANAGER_IS_SYSTEM(u->manager))
1498 return;
1499
1500 if (sd_id128_is_null(u->invocation_id))
1501 return;
1502
1503 r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1504 "trusted.invocation_id",
1505 sd_id128_to_string(u->invocation_id, ids), 32,
1506 0);
1507 if (r < 0)
1508 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
1509 }
1510
1511 static bool unit_has_mask_realized(
1512 Unit *u,
1513 CGroupMask target_mask,
1514 CGroupMask enable_mask,
1515 bool needs_bpf) {
1516
1517 assert(u);
1518
1519 return u->cgroup_realized &&
1520 u->cgroup_realized_mask == target_mask &&
1521 u->cgroup_enabled_mask == enable_mask &&
1522 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1523 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
1524 }
1525
1526 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1527 assert(u);
1528
1529 if (u->in_cgroup_realize_queue)
1530 return;
1531
1532 LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1533 u->in_cgroup_realize_queue = true;
1534 }
1535
1536 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1537 assert(u);
1538
1539 if (!u->in_cgroup_realize_queue)
1540 return;
1541
1542 LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1543 u->in_cgroup_realize_queue = false;
1544 }
1545
1546
1547 /* Check if necessary controllers and attributes for a unit are in place.
1548 *
1549 * If so, do nothing.
1550 * If not, create paths, move processes over, and set attributes.
1551 *
1552 * Returns 0 on success and < 0 on failure. */
1553 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1554 CGroupMask target_mask, enable_mask;
1555 bool needs_bpf, apply_bpf;
1556 int r;
1557
1558 assert(u);
1559
1560 unit_remove_from_cgroup_realize_queue(u);
1561
1562 target_mask = unit_get_target_mask(u);
1563 enable_mask = unit_get_enable_mask(u);
1564 needs_bpf = unit_get_needs_bpf(u);
1565
1566 if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1567 return 0;
1568
1569 /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1570 * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1571 * this will trickle down properly to cgroupfs. */
1572 apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1573
1574 /* First, realize parents */
1575 if (UNIT_ISSET(u->slice)) {
1576 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1577 if (r < 0)
1578 return r;
1579 }
1580
1581 /* And then do the real work */
1582 r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1583 if (r < 0)
1584 return r;
1585
1586 /* Finally, apply the necessary attributes. */
1587 cgroup_context_apply(u, target_mask, apply_bpf, state);
1588 cgroup_xattr_apply(u);
1589
1590 return 0;
1591 }
1592
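/* Drain the cgroup realize queue: realize the cgroup of every queued unit that is still active,
 * and return the number of units for which realization was attempted. */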
1593 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1594 ManagerState state;
1595 unsigned n = 0;
1596 Unit *i;
1597 int r;
1598
1599 assert(m);
1600
1601 state = manager_state(m);
1602
1603 while ((i = m->cgroup_realize_queue)) {
1604 assert(i->in_cgroup_realize_queue);
1605
1606 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1607 /* Maybe things changed, and the unit is not actually active anymore? */
1608 unit_remove_from_cgroup_realize_queue(i);
1609 continue;
1610 }
1611
1612 r = unit_realize_cgroup_now(i, state);
1613 if (r < 0)
1614 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1615
1616 n++;
1617 }
1618
1619 return n;
1620 }
1621
1622 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1623 Unit *slice;
1624
1625 /* This adds the siblings of the specified unit and the
1626 * siblings of all parent units to the cgroup queue. (But
1627 * neither the specified unit itself nor the parents.) */
1628
1629 while ((slice = UNIT_DEREF(u->slice))) {
1630 Iterator i;
1631 Unit *m;
1632 void *v;
1633
1634 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1635 if (m == u)
1636 continue;
1637
1638 /* Skip units that have a dependency on the slice
1639 * but aren't actually in it. */
1640 if (UNIT_DEREF(m->slice) != slice)
1641 continue;
1642
1643 /* No point in doing cgroup application for units
1644 * without active processes. */
1645 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1646 continue;
1647
1648 /* If the unit doesn't need any new controllers
1649 * and has current ones realized, it doesn't need
1650 * any changes. */
1651 if (unit_has_mask_realized(m,
1652 unit_get_target_mask(m),
1653 unit_get_enable_mask(m),
1654 unit_get_needs_bpf(m)))
1655 continue;
1656
1657 unit_add_to_cgroup_realize_queue(m);
1658 }
1659
1660 u = slice;
1661 }
1662 }
1663
1664 int unit_realize_cgroup(Unit *u) {
1665 assert(u);
1666
1667 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1668 return 0;
1669
1670 /* So, here's the deal: when realizing the cgroups for this
1671 * unit, we need to first create all parents, but there's more
1672 * actually: for the weight-based controllers we also need to
1673 * make sure that all our siblings (i.e. units that are in the
1674 * same slice as we are) have cgroups, too. Otherwise, things
1675 * would become very uneven as each of their processes would
1676 * get as much resources as all our group together. This call
1677 * will synchronously create the parent cgroups, but will
1678 * defer work on the siblings to the next event loop
1679 * iteration. */
1680
1681 /* Add all sibling slices to the cgroup queue. */
1682 unit_add_siblings_to_cgroup_realize_queue(u);
1683
1684 /* And realize this one now (and apply the values) */
1685 return unit_realize_cgroup_now(u, manager_state(u->manager));
1686 }
1687
1688 void unit_release_cgroup(Unit *u) {
1689 assert(u);
1690
1691 /* Forgets all cgroup details for this cgroup */
1692
1693 if (u->cgroup_path) {
1694 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1695 u->cgroup_path = mfree(u->cgroup_path);
1696 }
1697
1698 if (u->cgroup_inotify_wd >= 0) {
1699 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1700 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1701
1702 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1703 u->cgroup_inotify_wd = -1;
1704 }
1705 }
1706
1707 void unit_prune_cgroup(Unit *u) {
1708 int r;
1709 bool is_root_slice;
1710
1711 assert(u);
1712
1713 /* Removes the cgroup, if empty and possible, and stops watching it. */
1714
1715 if (!u->cgroup_path)
1716 return;
1717
1718 (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1719
1720 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1721
1722 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1723 if (r < 0) {
1724 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1725 return;
1726 }
1727
1728 if (is_root_slice)
1729 return;
1730
1731 unit_release_cgroup(u);
1732
1733 u->cgroup_realized = false;
1734 u->cgroup_realized_mask = 0;
1735 u->cgroup_enabled_mask = 0;
1736 }
1737
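/* Try to determine the main PID of a unit by scanning its cgroup for processes whose parent is the
 * manager; fails with -ENODATA if there is more than one candidate. */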
1738 int unit_search_main_pid(Unit *u, pid_t *ret) {
1739 _cleanup_fclose_ FILE *f = NULL;
1740 pid_t pid = 0, npid, mypid;
1741 int r;
1742
1743 assert(u);
1744 assert(ret);
1745
1746 if (!u->cgroup_path)
1747 return -ENXIO;
1748
1749 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1750 if (r < 0)
1751 return r;
1752
1753 mypid = getpid_cached();
1754 while (cg_read_pid(f, &npid) > 0) {
1755 pid_t ppid;
1756
1757 if (npid == pid)
1758 continue;
1759
1760 /* Ignore processes that aren't our kids */
1761 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1762 continue;
1763
1764 if (pid != 0)
1765 /* Dang, there's more than one daemonized PID
1766 in this group, so we don't know what process
1767 is the main process. */
1768
1769 return -ENODATA;
1770
1771 pid = npid;
1772 }
1773
1774 *ret = pid;
1775 return 0;
1776 }
1777
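/* Recursively add all PIDs found in the given cgroup path and its subgroups to the set of PIDs
 * the unit watches. */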
1778 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1779 _cleanup_closedir_ DIR *d = NULL;
1780 _cleanup_fclose_ FILE *f = NULL;
1781 int ret = 0, r;
1782
1783 assert(u);
1784 assert(path);
1785
1786 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1787 if (r < 0)
1788 ret = r;
1789 else {
1790 pid_t pid;
1791
1792 while ((r = cg_read_pid(f, &pid)) > 0) {
1793 r = unit_watch_pid(u, pid);
1794 if (r < 0 && ret >= 0)
1795 ret = r;
1796 }
1797
1798 if (r < 0 && ret >= 0)
1799 ret = r;
1800 }
1801
1802 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1803 if (r < 0) {
1804 if (ret >= 0)
1805 ret = r;
1806 } else {
1807 char *fn;
1808
1809 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1810 _cleanup_free_ char *p = NULL;
1811
1812 p = strjoin(path, "/", fn);
1813 free(fn);
1814
1815 if (!p)
1816 return -ENOMEM;
1817
1818 r = unit_watch_pids_in_path(u, p);
1819 if (r < 0 && ret >= 0)
1820 ret = r;
1821 }
1822
1823 if (r < 0 && ret >= 0)
1824 ret = r;
1825 }
1826
1827 return ret;
1828 }
1829
1830 int unit_synthesize_cgroup_empty_event(Unit *u) {
1831 int r;
1832
1833 assert(u);
1834
1835 /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
1836 * support for non-unified systems where notifications aren't reliable, and hence we need to take whatever
1837 * we can get as a notification source as soon as we stop having any useful PIDs to watch for. */
1838
1839 if (!u->cgroup_path)
1840 return -ENOENT;
1841
1842 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1843 if (r < 0)
1844 return r;
1845 if (r > 0) /* On unified we have reliable notifications, and don't need this */
1846 return 0;
1847
1848 if (!set_isempty(u->pids))
1849 return 0;
1850
1851 unit_add_to_cgroup_empty_queue(u);
1852 return 0;
1853 }
1854
1855 int unit_watch_all_pids(Unit *u) {
1856 int r;
1857
1858 assert(u);
1859
1860 /* Adds all PIDs from our cgroup to the set of PIDs we
1861 * watch. This is a fallback logic for cases where we do not
1862 * get reliable cgroup empty notifications: we try to use
1863 * SIGCHLD as replacement. */
1864
1865 if (!u->cgroup_path)
1866 return -ENOENT;
1867
1868 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1869 if (r < 0)
1870 return r;
1871 if (r > 0) /* On unified we can use proper notifications */
1872 return 0;
1873
1874 return unit_watch_pids_in_path(u, u->cgroup_path);
1875 }
1876
1877 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
1878 Manager *m = userdata;
1879 Unit *u;
1880 int r;
1881
1882 assert(s);
1883 assert(m);
1884
1885 u = m->cgroup_empty_queue;
1886 if (!u)
1887 return 0;
1888
1889 assert(u->in_cgroup_empty_queue);
1890 u->in_cgroup_empty_queue = false;
1891 LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
1892
1893 if (m->cgroup_empty_queue) {
1894 /* More stuff queued, let's make sure we remain enabled */
1895 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
1896 if (r < 0)
1897 log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
1898 }
1899
1900 unit_add_to_gc_queue(u);
1901
1902 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1903 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1904
1905 return 0;
1906 }
1907
1908 void unit_add_to_cgroup_empty_queue(Unit *u) {
1909 int r;
1910
1911 assert(u);
1912
1913 /* Note that there are four different ways cgroup empty events can reach us:
1914 *
1915 * 1. On the unified hierarchy we get an inotify event on the cgroup
1916 *
1917 * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
1918 *
1919 * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
1920 *
1921 * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
1922 * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
1923 *
1924 * Regardless of which way we got the notification, we'll verify it here, and then add the unit to a separate
1925 * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
1926 * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
1927 * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
1928 * case for scope units). */
1929
1930 if (u->in_cgroup_empty_queue)
1931 return;
1932
1933 /* Let's verify that the cgroup is really empty */
1934 if (!u->cgroup_path)
1935 return;
1936 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1937 if (r < 0) {
1938 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
1939 return;
1940 }
1941 if (r == 0)
1942 return;
1943
1944 LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
1945 u->in_cgroup_empty_queue = true;
1946
1947 /* Trigger the defer event */
1948 r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
1949 if (r < 0)
1950 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
1951 }
1952
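/* IO callback for the cgroup inotify fd used on the unified hierarchy: drain all pending inotify events,
 * map each watch descriptor back to its unit via the cgroup_inotify_wd_unit hashmap, and queue a cgroup
 * empty check for that unit. */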
1953 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1954 Manager *m = userdata;
1955
1956 assert(s);
1957 assert(fd >= 0);
1958 assert(m);
1959
1960 for (;;) {
1961 union inotify_event_buffer buffer;
1962 struct inotify_event *e;
1963 ssize_t l;
1964
1965 l = read(fd, &buffer, sizeof(buffer));
1966 if (l < 0) {
1967 if (IN_SET(errno, EINTR, EAGAIN))
1968 return 0;
1969
1970 return log_error_errno(errno, "Failed to read control group inotify events: %m");
1971 }
1972
1973 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1974 Unit *u;
1975
1976 if (e->wd < 0)
1977 /* Queue overflow has no watch descriptor */
1978 continue;
1979
1980 if (e->mask & IN_IGNORED)
1981 /* The watch was just removed */
1982 continue;
1983
1984 u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1985 if (!u) /* Note that inotify might deliver
1986 * events for a watch even after it
1987 * was removed, because it was queued
1988 * before the removal. Let's ignore
1989 * this here safely. */
1990 continue;
1991
1992 unit_add_to_cgroup_empty_queue(u);
1993 }
1994 }
1995 }
1996
1997 int manager_setup_cgroup(Manager *m) {
1998 _cleanup_free_ char *path = NULL;
1999 const char *scope_path;
2000 CGroupController c;
2001 int r, all_unified;
2002 char *e;
2003
2004 assert(m);
2005
2006 /* 1. Determine hierarchy */
2007 m->cgroup_root = mfree(m->cgroup_root);
2008 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2009 if (r < 0)
2010 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2011
2012 /* Chop off the init scope, if we are already located in it */
2013 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2014
2015 /* LEGACY: Also chop off the system slice if we are in
2016 * it. This is to support live upgrades from older systemd
2017 * versions where PID 1 was moved there. Also see
2018 * cg_get_root_path(). */
2019 if (!e && MANAGER_IS_SYSTEM(m)) {
2020 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2021 if (!e)
2022 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2023 }
2024 if (e)
2025 *e = 0;
2026
2027 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2028 * easily prepend it everywhere. */
2029 delete_trailing_chars(m->cgroup_root, "/");
2030
2031 /* 2. Show data */
2032 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2033 if (r < 0)
2034 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2035
2036 r = cg_unified_flush();
2037 if (r < 0)
2038 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2039
2040 all_unified = cg_all_unified();
2041 if (all_unified < 0)
2042 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2043 if (all_unified > 0)
2044 log_debug("Unified cgroup hierarchy is located at %s.", path);
2045 else {
2046 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2047 if (r < 0)
2048 return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2049 if (r > 0)
2050 log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2051 else
2052 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2053 }
2054
2055 /* 3. Allocate cgroup empty defer event source */
2056 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2057 r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2058 if (r < 0)
2059 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2060
2061 r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2062 if (r < 0)
2063 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2064
2065 r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2066 if (r < 0)
2067 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2068
2069 (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2070
2071 /* 4. Install notifier inotify object, or agent */
2072 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2073
2074 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
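/* The fd allocated here only hooks inotify into the event loop; the per-unit watches that feed the
 * cgroup_inotify_wd_unit hashmap used in on_cgroup_inotify_event() are registered separately, as the
 * individual unit cgroups get set up. */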
2075
2076 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2077 safe_close(m->cgroup_inotify_fd);
2078
2079 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2080 if (m->cgroup_inotify_fd < 0)
2081 return log_error_errno(errno, "Failed to create control group inotify object: %m");
2082
2083 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2084 if (r < 0)
2085 return log_error_errno(r, "Failed to watch control group inotify object: %m");
2086
2087 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2088 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2089 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2090 if (r < 0)
2091 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2092
2093 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2094
2095 } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2096
2097 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2098 * since it does not generate events when control groups with children run empty.) */
2099
2100 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2101 if (r < 0)
2102 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2103 else if (r > 0)
2104 log_debug("Installed release agent.");
2105 else if (r == 0)
2106 log_debug("Release agent already installed.");
2107 }
2108
2109 /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2110 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2111 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2112 if (r < 0)
2113 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2114
2115 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2116 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2117 if (r < 0)
2118 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2119
2120 /* 6. And pin it, so that it cannot be unmounted */
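/* Keeping an open directory fd on the cgroupfs mount point keeps the mount busy, so a regular (non-lazy)
 * unmount of it will fail with EBUSY for as long as we hold on to the fd. */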
2121 safe_close(m->pin_cgroupfs_fd);
2122 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2123 if (m->pin_cgroupfs_fd < 0)
2124 return log_error_errno(errno, "Failed to open pin file: %m");
2125
2126 /* 7. Always enable hierarchical support if it exists... */
2127 if (!all_unified && m->test_run_flags == 0)
2128 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2129
2130 /* 8. Figure out which controllers are supported, and log about it */
2131 r = cg_mask_supported(&m->cgroup_supported);
2132 if (r < 0)
2133 return log_error_errno(r, "Failed to determine supported controllers: %m");
2134 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2135 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
2136
2137 return 0;
2138 }
2139
2140 void manager_shutdown_cgroup(Manager *m, bool delete) {
2141 assert(m);
2142
2143 /* We can't really delete the group, since we are in it. But
2144 * let's trim it. */
2145 if (delete && m->cgroup_root)
2146 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2147
2148 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2149
2150 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2151
2152 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2153 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2154
2155 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2156
2157 m->cgroup_root = mfree(m->cgroup_root);
2158 }
2159
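/* Map a cgroup path to the unit it belongs to: try the full path first, then walk upwards one path
 * component at a time. Purely illustrative example (hypothetical unit): for
 * "/system.slice/foo.service/control" we would try the full path, then "/system.slice/foo.service", then
 * "/system.slice", and finally fall back to the root slice. */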
2160 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2161 char *p;
2162 Unit *u;
2163
2164 assert(m);
2165 assert(cgroup);
2166
2167 u = hashmap_get(m->cgroup_unit, cgroup);
2168 if (u)
2169 return u;
2170
2171 p = strdupa(cgroup);
2172 for (;;) {
2173 char *e;
2174
2175 e = strrchr(p, '/');
2176 if (!e || e == p)
2177 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2178
2179 *e = 0;
2180
2181 u = hashmap_get(m->cgroup_unit, p);
2182 if (u)
2183 return u;
2184 }
2185 }
2186
2187 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2188 _cleanup_free_ char *cgroup = NULL;
2189 int r;
2190
2191 assert(m);
2192
2193 if (pid <= 0)
2194 return NULL;
2195
2196 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
2197 if (r < 0)
2198 return NULL;
2199
2200 return manager_get_unit_by_cgroup(m, cgroup);
2201 }
2202
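/* Resolve a PID to its unit: the manager's own PID maps to the init.scope unit; otherwise consult the two
 * watched-PID maps first and only then fall back to the slower cgroup-path-based lookup above. */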
2203 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2204 Unit *u;
2205
2206 assert(m);
2207
2208 if (pid <= 0)
2209 return NULL;
2210
2211 if (pid == getpid_cached())
2212 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2213
2214 u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
2215 if (u)
2216 return u;
2217
2218 u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
2219 if (u)
2220 return u;
2221
2222 return manager_get_unit_by_pid_cgroup(m, pid);
2223 }
2224
2225 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2226 Unit *u;
2227
2228 assert(m);
2229 assert(cgroup);
2230
2231 /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2232 * or from the --system instance */
2233
2234 log_debug("Got cgroup empty notification for: %s", cgroup);
2235
2236 u = manager_get_unit_by_cgroup(m, cgroup);
2237 if (!u)
2238 return 0;
2239
2240 unit_add_to_cgroup_empty_queue(u);
2241 return 1;
2242 }
2243
2244 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2245 _cleanup_free_ char *v = NULL;
2246 int r;
2247
2248 assert(u);
2249 assert(ret);
2250
2251 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2252 return -ENODATA;
2253
2254 if (!u->cgroup_path)
2255 return -ENODATA;
2256
2257 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2258 return -ENODATA;
2259
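/* The attribute name differs between hierarchies: "memory.current" on the unified hierarchy,
 * "memory.usage_in_bytes" on the legacy "memory" controller. Both report current usage in bytes. */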
2260 r = cg_all_unified();
2261 if (r < 0)
2262 return r;
2263 if (r > 0)
2264 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2265 else
2266 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2267 if (r == -ENOENT)
2268 return -ENODATA;
2269 if (r < 0)
2270 return r;
2271
2272 return safe_atou64(v, ret);
2273 }
2274
2275 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2276 _cleanup_free_ char *v = NULL;
2277 int r;
2278
2279 assert(u);
2280 assert(ret);
2281
2282 if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2283 return -ENODATA;
2284
2285 if (!u->cgroup_path)
2286 return -ENODATA;
2287
2288 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2289 return -ENODATA;
2290
2291 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2292 if (r == -ENOENT)
2293 return -ENODATA;
2294 if (r < 0)
2295 return r;
2296
2297 return safe_atou64(v, ret);
2298 }
2299
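/* Read the raw, unadjusted CPU time consumed by the unit's cgroup, in nanoseconds. On the unified
 * hierarchy this comes from the "usage_usec" field of cpu.stat (microseconds, converted below); on the
 * legacy hierarchy it comes from cpuacct.usage, which is already in nanoseconds. */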
2300 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2301 _cleanup_free_ char *v = NULL;
2302 uint64_t ns;
2303 int r;
2304
2305 assert(u);
2306 assert(ret);
2307
2308 if (!u->cgroup_path)
2309 return -ENODATA;
2310
2311 r = cg_all_unified();
2312 if (r < 0)
2313 return r;
2314 if (r > 0) {
2315 const char *keys[] = { "usage_usec", NULL };
2316 _cleanup_free_ char *val = NULL;
2317 uint64_t us;
2318
2319 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2320 return -ENODATA;
2321
2322 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
2323 if (r < 0)
2324 return r;
2325
2326 r = safe_atou64(val, &us);
2327 if (r < 0)
2328 return r;
2329
2330 ns = us * NSEC_PER_USEC;
2331 } else {
2332 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2333 return -ENODATA;
2334
2335 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2336 if (r == -ENOENT)
2337 return -ENODATA;
2338 if (r < 0)
2339 return r;
2340
2341 r = safe_atou64(v, &ns);
2342 if (r < 0)
2343 return r;
2344 }
2345
2346 *ret = ns;
2347 return 0;
2348 }
2349
2350 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2351 nsec_t ns;
2352 int r;
2353
2354 assert(u);
2355
2356 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2357  * started. If the cgroup has been removed already, this returns the last cached value. To refresh the cached
2358  * value without retrieving it, simply call this function with NULL as the return parameter. */
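/* Purely illustrative example (made-up numbers): if cpu_usage_base was sampled as 1000000000 ns when the
 * unit started and the raw counter now reads 3500000000 ns, this function returns 2500000000 ns, i.e. 2.5s
 * of CPU time consumed since the unit was started. */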
2359
2360 if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2361 return -ENODATA;
2362
2363 r = unit_get_cpu_usage_raw(u, &ns);
2364 if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2365 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2366 * cached value. */
2367
2368 if (ret)
2369 *ret = u->cpu_usage_last;
2370 return 0;
2371 }
2372 if (r < 0)
2373 return r;
2374
2375 if (ns > u->cpu_usage_base)
2376 ns -= u->cpu_usage_base;
2377 else
2378 ns = 0;
2379
2380 u->cpu_usage_last = ns;
2381 if (ret)
2382 *ret = ns;
2383
2384 return 0;
2385 }
2386
2387 int unit_get_ip_accounting(
2388 Unit *u,
2389 CGroupIPAccountingMetric metric,
2390 uint64_t *ret) {
2391
2392 uint64_t value;
2393 int fd, r;
2394
2395 assert(u);
2396 assert(metric >= 0);
2397 assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2398 assert(ret);
2399
2400 /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
2401  * inner cgroup nodes and have no processes attached directly, so their counters would be zero
2402  * anyway. And if we refuse this now, we can open it up later, should the kernel learn recursive BPF cgroup
2403  * filters. */
2404 if (u->type == UNIT_SLICE)
2405 return -ENODATA;
2406
2407 if (!UNIT_CGROUP_BOOL(u, ip_accounting))
2408 return -ENODATA;
2409
2410 fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2411 u->ip_accounting_ingress_map_fd :
2412 u->ip_accounting_egress_map_fd;
2413 if (fd < 0)
2414 return -ENODATA;
2415
2416 if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2417 r = bpf_firewall_read_accounting(fd, &value, NULL);
2418 else
2419 r = bpf_firewall_read_accounting(fd, NULL, &value);
2420 if (r < 0)
2421 return r;
2422
2423 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2424 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2425 * ip_accounting_extra[] field, and add them in here transparently. */
2426
2427 *ret = value + u->ip_accounting_extra[metric];
2428
2429 return r;
2430 }
2431
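/* Restart CPU accounting at the current counter value: drop the cached last value, sample the raw counter
 * and store it as the new base. If the counter cannot be read, the base is reset to 0 so that the next
 * successful reading is counted from scratch. */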
2432 int unit_reset_cpu_accounting(Unit *u) {
2433 nsec_t ns;
2434 int r;
2435
2436 assert(u);
2437
2438 u->cpu_usage_last = NSEC_INFINITY;
2439
2440 r = unit_get_cpu_usage_raw(u, &ns);
2441 if (r < 0) {
2442 u->cpu_usage_base = 0;
2443 return r;
2444 }
2445
2446 u->cpu_usage_base = ns;
2447 return 0;
2448 }
2449
2450 int unit_reset_ip_accounting(Unit *u) {
2451 int r = 0, q = 0;
2452
2453 assert(u);
2454
2455 if (u->ip_accounting_ingress_map_fd >= 0)
2456 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2457
2458 if (u->ip_accounting_egress_map_fd >= 0)
2459 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2460
2461 zero(u->ip_accounting_extra);
2462
2463 return r < 0 ? r : q;
2464 }
2465
2466 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2467 assert(u);
2468
2469 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2470 return;
2471
2472 if (m == 0)
2473 return;
2474
2475 /* always invalidate compat pairs together */
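/* (io/blkio and cpu/cpuacct are the unified and legacy counterparts backing the same high-level settings,
 * hence they are always realized together and must be invalidated together.) */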
2476 if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2477 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2478
2479 if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2480 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2481
2482 if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2483 return;
2484
2485 u->cgroup_realized_mask &= ~m;
2486 unit_add_to_cgroup_realize_queue(u);
2487 }
2488
2489 void unit_invalidate_cgroup_bpf(Unit *u) {
2490 assert(u);
2491
2492 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2493 return;
2494
2495 if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2496 return;
2497
2498 u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2499 unit_add_to_cgroup_realize_queue(u);
2500
2501 /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
2502  * list of our children includes our own. */
2503 if (u->type == UNIT_SLICE) {
2504 Unit *member;
2505 Iterator i;
2506 void *v;
2507
2508 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2509 if (member == u)
2510 continue;
2511
2512 if (UNIT_DEREF(member->slice) != u)
2513 continue;
2514
2515 unit_invalidate_cgroup_bpf(member);
2516 }
2517 }
2518 }
2519
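/* Queue a cgroup re-realization of the CPU, IO and block IO settings for all units in the manager's
 * startup set, so that the switch between Startup*= and regular weights gets applied. */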
2520 void manager_invalidate_startup_units(Manager *m) {
2521 Iterator i;
2522 Unit *u;
2523
2524 assert(m);
2525
2526 SET_FOREACH(u, m->startup_units, i)
2527 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2528 }
2529
2530 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2531 [CGROUP_AUTO] = "auto",
2532 [CGROUP_CLOSED] = "closed",
2533 [CGROUP_STRICT] = "strict",
2534 };
2535
2536 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);