]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/cgroup.c
core: add startup resource control option
[thirdparty/systemd.git] / src / core / cgroup.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2013 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <fcntl.h>
23 #include <fnmatch.h>
24
25 #include "path-util.h"
26 #include "special.h"
27 #include "cgroup-util.h"
28 #include "cgroup.h"
29
30 void cgroup_context_init(CGroupContext *c) {
31 assert(c);
32
33 /* Initialize everything to the kernel defaults, assuming the
34 * structure is preinitialized to 0 */
35
36 c->cpu_shares = 1024;
37 c->startup_cpu_shares = 1024;
38 c->startup_cpu_shares_set = false;
39 c->memory_limit = (uint64_t) -1;
40 c->blockio_weight = 1000;
41 c->startup_blockio_weight = 1000;
42 c->startup_blockio_weight_set = false;
43
44 c->cpu_quota_per_sec_usec = (usec_t) -1;
45 c->cpu_quota_usec = (usec_t) -1;
46 c->cpu_quota_period_usec = 100*USEC_PER_MSEC;
47 }
48
/* Unlink a DeviceAllow entry from the context's list and free it,
 * including its owned path string. */
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}
57
/* Unlink a per-device block-IO weight entry from the context's list
 * and free it, including its owned path string. */
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}
66
/* Unlink a per-device block-IO bandwidth entry from the context's
 * list and free it, including its owned path string. */
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}
75
/* Release all list entries owned by the context. The CGroupContext
 * structure itself is not freed here; the caller owns it. */
void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}
88
89 usec_t cgroup_context_get_cpu_quota_usec(CGroupContext *c) {
90 assert(c);
91
92 /* Returns the absolute CPU quota */
93
94 if (c->cpu_quota_usec != (usec_t) -1)
95 return c->cpu_quota_usec;
96 else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
97 return c->cpu_quota_per_sec_usec*c->cpu_quota_period_usec/USEC_PER_SEC;
98 else
99 return (usec_t) -1;
100 }
101
102 usec_t cgroup_context_get_cpu_quota_per_sec_usec(CGroupContext *c) {
103 assert(c);
104
105 /* Returns the CPU quota relative to 1s */
106
107 if (c->cpu_quota_usec != (usec_t) -1)
108 return c->cpu_quota_usec*USEC_PER_SEC/c->cpu_quota_period_usec;
109 else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
110 return c->cpu_quota_per_sec_usec;
111 else
112 return (usec_t) -1;
113 }
114
115 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
116 CGroupBlockIODeviceBandwidth *b;
117 CGroupBlockIODeviceWeight *w;
118 CGroupDeviceAllow *a;
119 char t[FORMAT_TIMESPAN_MAX], s[FORMAT_TIMESPAN_MAX], u[FORMAT_TIMESPAN_MAX];
120
121 assert(c);
122 assert(f);
123
124 prefix = strempty(prefix);
125
126 fprintf(f,
127 "%sCPUAccounting=%s\n"
128 "%sBlockIOAccounting=%s\n"
129 "%sMemoryAccounting=%s\n"
130 "%sCPUShares=%lu\n"
131 "%sStartupCPUShares=%lu\n"
132 "%sCPUQuota=%s\n"
133 "%sCPUQuotaPerSecSec=%s\n"
134 "%sCPUQuotaPeriodSec=%s\n"
135 "%sBlockIOWeight=%lu\n"
136 "%sStartupBlockIOWeight=%lu\n"
137 "%sMemoryLimit=%" PRIu64 "\n"
138 "%sDevicePolicy=%s\n",
139 prefix, yes_no(c->cpu_accounting),
140 prefix, yes_no(c->blockio_accounting),
141 prefix, yes_no(c->memory_accounting),
142 prefix, c->cpu_shares,
143 prefix, c->startup_cpu_shares,
144 prefix, strna(format_timespan(u, sizeof(u), cgroup_context_get_cpu_quota_usec(c), 1)),
145 prefix, strna(format_timespan(t, sizeof(t), cgroup_context_get_cpu_quota_per_sec_usec(c), 1)),
146 prefix, strna(format_timespan(s, sizeof(s), c->cpu_quota_period_usec, 1)),
147 prefix, c->blockio_weight,
148 prefix, c->startup_blockio_weight,
149 prefix, c->memory_limit,
150 prefix, cgroup_device_policy_to_string(c->device_policy));
151
152 LIST_FOREACH(device_allow, a, c->device_allow)
153 fprintf(f,
154 "%sDeviceAllow=%s %s%s%s\n",
155 prefix,
156 a->path,
157 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
158
159 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
160 fprintf(f,
161 "%sBlockIODeviceWeight=%s %lu",
162 prefix,
163 w->path,
164 w->weight);
165
166 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
167 char buf[FORMAT_BYTES_MAX];
168
169 fprintf(f,
170 "%s%s=%s %s\n",
171 prefix,
172 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
173 b->path,
174 format_bytes(buf, sizeof(buf), b->bandwidth));
175 }
176 }
177
/* Resolve a path to the block device (dev_t) that backs it. If p is a
 * block device node, its own device number is used; otherwise the
 * device of the containing file system is used, mapped to the whole
 * disk if it refers to a partition. Returns 0 on success, negative
 * errno-style error otherwise. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0) {
                log_warning("Couldn't stat device %s: %m", p);
                return -errno;
        }

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device; best-effort, failure keeps st_dev. */
                block_get_whole_disk(*dev, dev);
        } else {
                /* major 0 means a virtual/network file system with no
                 * backing block device we could throttle. */
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
208
209 static int whitelist_device(const char *path, const char *node, const char *acc) {
210 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
211 struct stat st;
212 int r;
213
214 assert(path);
215 assert(acc);
216
217 if (stat(node, &st) < 0) {
218 log_warning("Couldn't stat device %s", node);
219 return -errno;
220 }
221
222 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
223 log_warning("%s is not a device.", node);
224 return -ENODEV;
225 }
226
227 sprintf(buf,
228 "%c %u:%u %s",
229 S_ISCHR(st.st_mode) ? 'c' : 'b',
230 major(st.st_rdev), minor(st.st_rdev),
231 acc);
232
233 r = cg_set_attribute("devices", path, "devices.allow", buf);
234 if (r < 0)
235 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
236
237 return r;
238 }
239
/* Whitelist every device of a given type ('c' or 'b') whose driver
 * name in /proc/devices matches the fnmatch() pattern 'name', by
 * writing "<type> <major>:* <acc>" entries to devices.allow of the
 * cgroup at 'path'. Returns 0 unless /proc/devices cannot be read;
 * per-entry write failures are only logged. */
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f) {
                log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
                return -errno;
        }

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                /* /proc/devices lists character devices and block
                 * devices in separate sections, each introduced by a
                 * header line and terminated by an empty line; 'good'
                 * tracks whether we are inside the wanted section. */
                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                /* Each entry is "<major> <driver-name>" */
                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                /* Match driver name against the caller's pattern */
                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
        }

        return 0;

fail:
        log_warning("Failed to read /proc/devices: %m");
        return -errno;
}
316
/* Write the cgroup attributes described by context 'c' to the cgroup
 * at 'path', limited to the controllers selected in 'mask'. During
 * manager startup the Startup* variants of shares/weights are used.
 * All writes are best-effort: failures are logged, never returned. */
void cgroup_context_apply(Manager *m, CGroupContext *c, CGroupControllerMask mask, const char *path) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");

        if ((mask & CGROUP_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
                usec_t q;

                /* Startup shares apply only while the manager is
                 * still starting up. */
                sprintf(buf, "%lu\n", manager_state(m) == MANAGER_STARTING
                        ? c->startup_cpu_shares
                        : c->cpu_shares);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));

                sprintf(buf, USEC_FMT "\n", c->cpu_quota_period_usec);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));

                /* (usec_t) -1 means no quota configured; write "-1" to
                 * disable the CFS quota in that case. */
                q = cgroup_context_get_cpu_quota_usec(c);
                if (q != (usec_t) -1) {
                        sprintf(buf, USEC_FMT "\n", q);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
        }

        if (mask & CGROUP_BLKIO) {
                /* Sized for the largest of the three line formats
                 * written below. */
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%lu\n", manager_state(m) == MANAGER_STARTING
                                ? c->startup_blockio_weight
                                : c->blockio_weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
                }
        }

        if (mask & CGROUP_MEMORY) {
                /* (uint64_t) -1 means no limit; "-1" resets the knob */
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* Start from deny-all if any whitelist entries or a
                 * non-default policy are configured, allow-all
                 * otherwise. */
                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_warning("Failed to reset devices.list on %s: %s", path, strerror(-r));

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        /* Standard pseudo-devices that basically every
                         * service needs access to. */
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        /* Build the access string from the r/w/m flags */
                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        /* Entries may be a literal /dev path, or a
                         * "block-<name>"/"char-<name>" major lookup. */
                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}
476
/* Figure out which cgroup controllers the context needs, based on
 * which settings deviate from the kernel defaults. NOTE(review): this
 * also has a side effect — once the manager has left the STARTING
 * state, the startup_*_set flags are cleared, so the startup values
 * stop keeping the controller enabled on later calls. */
CGroupControllerMask cgroup_context_get_mask(Manager *m, CGroupContext *c) {
        CGroupControllerMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            (manager_state(m) == MANAGER_STARTING ? c->startup_cpu_shares : c->cpu_shares) != 1024 ||
            (manager_state(m) != MANAGER_STARTING && c->startup_cpu_shares_set && c->startup_cpu_shares != c->cpu_shares) ||
            c->cpu_quota_usec != (usec_t) -1 ||
            c->cpu_quota_per_sec_usec != (usec_t) -1) {
                mask |= CGROUP_CPUACCT | CGROUP_CPU;
                if (manager_state(m) != MANAGER_STARTING)
                        c->startup_cpu_shares_set = false;
        }

        if (c->blockio_accounting ||
            (manager_state(m) == MANAGER_STARTING ? c->startup_blockio_weight : c->blockio_weight) != 1000 ||
            (manager_state(m) != MANAGER_STARTING && c->startup_blockio_weight_set && c->startup_blockio_weight != c->blockio_weight) ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths) {
                mask |= CGROUP_BLKIO;
                if (manager_state(m) != MANAGER_STARTING)
                        c->startup_blockio_weight_set = false;
        }

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MEMORY;

        if (c->device_allow || c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_DEVICE;

        return mask;
}
511
512 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
513 CGroupContext *c;
514
515 c = unit_get_cgroup_context(u);
516 if (!c)
517 return 0;
518
519 return cgroup_context_get_mask(u->manager, c);
520 }
521
/* Controllers required by the unit's direct and transitive members.
 * Only slices have members; for any other unit type the mask is 0.
 * The result is cached in u->cgroup_members_mask until invalidated. */
CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                /* Members of a slice carry a Before= dependency on it;
                 * filter further by the actual slice membership. */
                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
551
552 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
553 CGroupControllerMask m;
554
555 assert(u);
556
557 if (UNIT_ISSET(u->slice))
558 m = unit_get_members_mask(UNIT_DEREF(u->slice));
559 else
560 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
561
562 /* Sibling propagation is only relevant for weight-based
563 * controllers, so let's mask out everything else */
564 return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
565 }
566
567 CGroupControllerMask unit_get_target_mask(Unit *u) {
568 CGroupControllerMask mask;
569
570 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
571 mask &= u->manager->cgroup_supported;
572
573 return mask;
574 }
575
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        /* 'more' means bits were only added, never removed, compared
         * to the cached subtree mask — the cheap propagation case. */
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
624
625 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
626 Unit *u = userdata;
627
628 assert(mask != 0);
629 assert(u);
630
631 while (u) {
632 if (u->cgroup_path &&
633 u->cgroup_realized &&
634 (u->cgroup_realized_mask & mask) == mask)
635 return u->cgroup_path;
636
637 u = UNIT_DEREF(u->slice);
638 }
639
640 return NULL;
641 }
642
643 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
644 _cleanup_free_ char *path = NULL;
645 int r;
646
647 assert(u);
648
649 path = unit_default_cgroup_path(u);
650 if (!path)
651 return log_oom();
652
653 r = hashmap_put(u->manager->cgroup_unit, path, u);
654 if (r < 0) {
655 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
656 return r;
657 }
658 if (r > 0) {
659 u->cgroup_path = path;
660 path = NULL;
661 }
662
663 /* First, create our own group */
664 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
665 if (r < 0) {
666 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
667 return r;
668 }
669
670 /* Keep track that this is now realized */
671 u->cgroup_realized = true;
672 u->cgroup_realized_mask = mask;
673
674 /* Then, possibly move things over */
675 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
676 if (r < 0)
677 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
678
679 return 0;
680 }
681
682 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
683 assert(u);
684
685 return u->cgroup_realized && u->cgroup_realized_mask == mask;
686 }
687
688 /* Check if necessary controllers and attributes for a unit are in place.
689 *
690 * If so, do nothing.
691 * If not, create paths, move processes over, and set attributes.
692 *
693 * Returns 0 on success and < 0 on failure. */
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        /* Realizing now makes any pending queued realization moot. */
        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice));
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(u->manager, unit_get_cgroup_context(u), mask, u->cgroup_path);

        return 0;
}
727
728 static void unit_add_to_cgroup_queue(Unit *u) {
729
730 if (u->in_cgroup_queue)
731 return;
732
733 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
734 u->in_cgroup_queue = true;
735 }
736
737 unsigned manager_dispatch_cgroup_queue(Manager *m) {
738 Unit *i;
739 unsigned n = 0;
740 int r;
741
742 while ((i = m->cgroup_queue)) {
743 assert(i->in_cgroup_queue);
744
745 r = unit_realize_cgroup_now(i);
746 if (r < 0)
747 log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
748
749 n++;
750 }
751
752 return n;
753 }
754
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                /* Candidate siblings carry a Before= dependency on
                 * the slice. */
                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                /* Move up one level and repeat for the parent's siblings */
                u = slice;
        }
}
792
/* Public entry point: realize the unit's cgroup synchronously, and
 * queue its siblings for deferred realization. Returns 0 if the unit
 * has no cgroup context, otherwise the result of realization. */
int unit_realize_cgroup(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u);
}
819
/* Tear down the unit's cgroup: trim it in all hierarchies, drop it
 * from the manager's cgroup→unit map and reset the unit's realization
 * state. No-op for units without a cgroup path. */
void unit_destroy_cgroup(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        /* The root slice's own cgroup must not be removed, only its
         * children; hence the special-case on the last argument. */
        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0)
                log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));

        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;

}
840
/* Heuristically determine the unit's main PID by scanning its cgroup:
 * the main process is the single process whose parent is not us (i.e.
 * a daemonized child). Returns 0 when there is no cgroup, the cgroup
 * cannot be enumerated, or the candidate is ambiguous. */
pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                /* Skip duplicates of the current candidate */
                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                        in this group, so we don't know what process
                        is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}
877
/* Initialize the manager's cgroup setup: determine and normalize our
 * root cgroup, install the release agent (system instance only), move
 * ourselves into the root group, pin the cgroupfs mount and probe
 * supported controllers. Returns 0 on success, negative errno-style
 * error otherwise. */
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        char *e;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0) {
                log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
                return r;
        }

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == SYSTEMD_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0) {
                log_error("Cannot find cgroup mount point: %s", strerror(-r));
                return r;
        }

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        /* 3. Install agent */
        if (m->running_as == SYSTEMD_SYSTEM) {
                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
                else if (r > 0)
                        log_debug("Installed release agent.");
                else
                        log_debug("Release agent already installed.");
        }

        /* 4. Make sure we are in the root cgroup */
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
        if (r < 0) {
                log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
                return r;
        }

        /* 5. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);

        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0) {
                log_error("Failed to open pin file: %m");
                return -errno;
        }

        /* 6. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        /* 7. Always enable hierarchical support if it exists... */
        cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        return 0;
}
956
/* Release the manager's cgroup state on shutdown: optionally trim our
 * (non-removable) root group, close the cgroupfs pin fd and free the
 * stored root path. */
void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        free(m->cgroup_root);
        m->cgroup_root = NULL;
}
970
/* Map a cgroup path to the unit that owns it. If no unit owns the
 * exact path, walk up the path component by component so processes in
 * sub-cgroups are attributed to the owning unit. Returns NULL if no
 * ancestor matches either. */
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        /* Mutable stack copy so we can truncate it in place */
        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (e == p || !e)
                        return NULL;

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
997
998 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
999 _cleanup_free_ char *cgroup = NULL;
1000 int r;
1001
1002 assert(m);
1003
1004 if (pid <= 1)
1005 return NULL;
1006
1007 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1008 if (r < 0)
1009 return NULL;
1010
1011 return manager_get_unit_by_cgroup(m, cgroup);
1012 }
1013
/* Handle a cgroup-empty notification: map the cgroup to its unit and,
 * after re-verifying the whole subtree really is empty, invoke the
 * unit type's notify_cgroup_empty hook and queue it for GC. Unknown
 * cgroups are silently ignored. Always returns 0. */
int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;
        int r;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (u) {
                /* Notifications can race with new processes being
                 * spawned, so double-check emptiness recursively. */
                r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
                if (r > 0) {
                        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                                UNIT_VTABLE(u)->notify_cgroup_empty(u);

                        unit_add_to_gc_queue(u);
                }
        }

        return 0;
}
1034
/* String names for CGroupDevicePolicy, used when parsing/formatting
 * the DevicePolicy= unit-file setting. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);