]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/cgroup.c
core: check the right variable for failed open()
[thirdparty/systemd.git] / src / core / cgroup.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2013 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <fcntl.h>
23 #include <fnmatch.h>
24
25 #include "path-util.h"
26 #include "special.h"
27 #include "cgroup-util.h"
28 #include "cgroup.h"
29
30 void cgroup_context_init(CGroupContext *c) {
31 assert(c);
32
33 /* Initialize everything to the kernel defaults, assuming the
34 * structure is preinitialized to 0 */
35
36 c->cpu_shares = 1024;
37 c->memory_limit = (uint64_t) -1;
38 c->blockio_weight = 1000;
39
40 c->cpu_quota_per_sec_usec = (usec_t) -1;
41 c->cpu_quota_usec = (usec_t) -1;
42 c->cpu_quota_period_usec = 100*USEC_PER_MSEC;
43 }
44
/* Unlinks one DeviceAllow= entry from the context and releases its
 * memory. The entry must be a member of c->device_allow. */
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        /* Detach from the list first, so the context never points at
         * freed memory. */
        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}
53
/* Unlinks one BlockIODeviceWeight= entry from the context and
 * releases its memory. The entry must be a member of
 * c->blockio_device_weights. */
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        /* Detach from the list before freeing. */
        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}
62
/* Unlinks one BlockIO{Read,Write}Bandwidth= entry from the context
 * and releases its memory. The entry must be a member of
 * c->blockio_device_bandwidths. */
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        /* Detach from the list before freeing. */
        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}
71
72 void cgroup_context_done(CGroupContext *c) {
73 assert(c);
74
75 while (c->blockio_device_weights)
76 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
77
78 while (c->blockio_device_bandwidths)
79 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
80
81 while (c->device_allow)
82 cgroup_context_free_device_allow(c, c->device_allow);
83 }
84
85 usec_t cgroup_context_get_cpu_quota_usec(CGroupContext *c) {
86 assert(c);
87
88 /* Returns the absolute CPU quota */
89
90 if (c->cpu_quota_usec != (usec_t) -1)
91 return c->cpu_quota_usec;
92 else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
93 return c->cpu_quota_per_sec_usec*c->cpu_quota_period_usec/USEC_PER_SEC;
94 else
95 return (usec_t) -1;
96 }
97
98 usec_t cgroup_context_get_cpu_quota_per_sec_usec(CGroupContext *c) {
99 assert(c);
100
101 /* Returns the CPU quota relative to 1s */
102
103 if (c->cpu_quota_usec != (usec_t) -1)
104 return c->cpu_quota_usec*USEC_PER_SEC/c->cpu_quota_period_usec;
105 else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
106 return c->cpu_quota_per_sec_usec;
107 else
108 return (usec_t) -1;
109 }
110
111 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
112 CGroupBlockIODeviceBandwidth *b;
113 CGroupBlockIODeviceWeight *w;
114 CGroupDeviceAllow *a;
115 char t[FORMAT_TIMESPAN_MAX], s[FORMAT_TIMESPAN_MAX], u[FORMAT_TIMESPAN_MAX];
116
117 assert(c);
118 assert(f);
119
120 prefix = strempty(prefix);
121
122 fprintf(f,
123 "%sCPUAccounting=%s\n"
124 "%sBlockIOAccounting=%s\n"
125 "%sMemoryAccounting=%s\n"
126 "%sCPUShares=%lu\n"
127 "%sCPUQuota=%s\n"
128 "%sCPUQuotaPerSecSec=%s\n"
129 "%sCPUQuotaPeriodSec=%s\n"
130 "%sBlockIOWeight=%lu\n"
131 "%sMemoryLimit=%" PRIu64 "\n"
132 "%sDevicePolicy=%s\n",
133 prefix, yes_no(c->cpu_accounting),
134 prefix, yes_no(c->blockio_accounting),
135 prefix, yes_no(c->memory_accounting),
136 prefix, c->cpu_shares,
137 prefix, strna(format_timespan(u, sizeof(u), cgroup_context_get_cpu_quota_usec(c), 1)),
138 prefix, strna(format_timespan(t, sizeof(t), cgroup_context_get_cpu_quota_per_sec_usec(c), 1)),
139 prefix, strna(format_timespan(s, sizeof(s), c->cpu_quota_period_usec, 1)),
140 prefix, c->blockio_weight,
141 prefix, c->memory_limit,
142 prefix, cgroup_device_policy_to_string(c->device_policy));
143
144 LIST_FOREACH(device_allow, a, c->device_allow)
145 fprintf(f,
146 "%sDeviceAllow=%s %s%s%s\n",
147 prefix,
148 a->path,
149 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
150
151 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
152 fprintf(f,
153 "%sBlockIODeviceWeight=%s %lu",
154 prefix,
155 w->path,
156 w->weight);
157
158 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
159 char buf[FORMAT_BYTES_MAX];
160
161 fprintf(f,
162 "%s%s=%s %s\n",
163 prefix,
164 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
165 b->path,
166 format_bytes(buf, sizeof(buf), b->bandwidth));
167 }
168 }
169
/* Resolves p to the dev_t of the block device that blkio limits
 * should be applied to: either p itself (if it is a block device
 * node), or the whole-disk device backing the file system p lives
 * on. Returns 0 on success, negative errno-style error otherwise. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;

        assert(p);
        assert(dev);

        if (stat(p, &st) < 0) {
                log_warning("Couldn't stat device %s: %m", p);
                return -errno;
        }

        if (S_ISBLK(st.st_mode)) {
                *dev = st.st_rdev;
                return 0;
        }

        if (major(st.st_dev) != 0) {
                /* Not a device node, hence resolve the block device
                 * the file is stored on. */
                *dev = st.st_dev;

                /* If that is a partition, try to get the originating
                 * whole-disk device; failure is non-fatal and leaves
                 * the partition device in place. */
                block_get_whole_disk(*dev, dev);
                return 0;
        }

        log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
        return -ENODEV;
}
200
201 static int whitelist_device(const char *path, const char *node, const char *acc) {
202 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
203 struct stat st;
204 int r;
205
206 assert(path);
207 assert(acc);
208
209 if (stat(node, &st) < 0) {
210 log_warning("Couldn't stat device %s", node);
211 return -errno;
212 }
213
214 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
215 log_warning("%s is not a device.", node);
216 return -ENODEV;
217 }
218
219 sprintf(buf,
220 "%c %u:%u %s",
221 S_ISCHR(st.st_mode) ? 'c' : 'b',
222 major(st.st_rdev), minor(st.st_rdev),
223 acc);
224
225 r = cg_set_attribute("devices", path, "devices.allow", buf);
226 if (r < 0)
227 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
228
229 return r;
230 }
231
/* Whitelists all devices of a given major number in the cgroup at
 * path, by matching the driver name against /proc/devices.
 *
 * name: fnmatch() pattern matched against the driver name column
 *       (e.g. "pts", "kdbus/*")
 * type: 'c' for character devices, 'b' for block devices
 * acc:  access string, a combination of "r", "w", "m"
 *
 * Returns 0 if /proc/devices was parsed (individual attribute write
 * failures are only logged), negative errno-style error otherwise. */
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f) {
                log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
                return -errno;
        }

        /* /proc/devices lists "Character devices:" and "Block
         * devices:" sections; "good" tracks whether we are currently
         * inside the section matching the requested type. */
        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                /* A blank line terminates a section */
                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                /* Entry lines look like "  4 tty"; split into the
                 * major number and the driver name. */
                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                /* maj is unsigned, so this only skips major 0 */
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                /* devices.allow syntax with a wildcard minor:
                 * "<c|b> <major>:* <access>" */
                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
        }

        return 0;

fail:
        log_warning("Failed to read /proc/devices: %m");
        return -errno;
}
308
/* Writes the context's resource settings into the cgroup attribute
 * files of the cgroup at path, for each controller requested in
 * mask. Failures to set individual attributes are logged but not
 * propagated: we apply as much as possible. */
void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root
         * cgroup, hence silently ignore them there. */
        is_root = isempty(path) || path_equal(path, "/");

        if ((mask & CGROUP_CPU) && !is_root) {
                /* Large enough for either an unsigned long shares
                 * value or a usec_t, plus the newline. */
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
                usec_t q;

                sprintf(buf, "%lu\n", c->cpu_shares);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));

                sprintf(buf, USEC_FMT "\n", c->cpu_quota_period_usec);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));

                /* An unset quota maps to "-1", which disables the
                 * CFS bandwidth limit in the kernel. */
                q = cgroup_context_get_cpu_quota_usec(c);
                if (q != (usec_t) -1) {
                        sprintf(buf, USEC_FMT "\n", q);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
        }

        if (mask & CGROUP_BLKIO) {
                /* Sized for the largest of the three value formats
                 * written below: a weight, "major:minor weight", or
                 * "major:minor bandwidth". */
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%lu\n", c->blockio_weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                /* Skip entries whose device cannot be
                                 * resolved; the lookup already logged. */
                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
                }
        }

        if (mask & CGROUP_MEMORY) {
                /* (uint64_t) -1 means "no limit configured", which
                 * maps to writing "-1" to remove any kernel limit. */
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* Reset the device list: deny everything first if a
                 * whitelist or a restrictive policy is configured,
                 * otherwise allow everything. */
                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_warning("Failed to reset devices.list on %s: %s", path, strerror(-r));

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        /* Standard pseudo-devices every service needs */
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        /* Build the access string from the r/w/m flags */
                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        /* Entries may be a node path, or a
                         * "block-<name>"/"char-<name>" major lookup */
                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}
464
465 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
466 CGroupControllerMask mask = 0;
467
468 /* Figure out which controllers we need */
469
470 if (c->cpu_accounting ||
471 c->cpu_shares != 1024 ||
472 c->cpu_quota_usec != (usec_t) -1 ||
473 c->cpu_quota_per_sec_usec != (usec_t) -1)
474 mask |= CGROUP_CPUACCT | CGROUP_CPU;
475
476 if (c->blockio_accounting ||
477 c->blockio_weight != 1000 ||
478 c->blockio_device_weights ||
479 c->blockio_device_bandwidths)
480 mask |= CGROUP_BLKIO;
481
482 if (c->memory_accounting ||
483 c->memory_limit != (uint64_t) -1)
484 mask |= CGROUP_MEMORY;
485
486 if (c->device_allow || c->device_policy != CGROUP_AUTO)
487 mask |= CGROUP_DEVICE;
488
489 return mask;
490 }
491
492 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
493 CGroupContext *c;
494
495 c = unit_get_cgroup_context(u);
496 if (!c)
497 return 0;
498
499 return cgroup_context_get_mask(c);
500 }
501
/* Returns the combined controller mask of all units contained in
 * this unit (only slices contain other units). The result is cached
 * in u->cgroup_members_mask until invalidated. */
CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                /* Slice members show up among the slice's
                 * UNIT_BEFORE dependencies; the slice check below
                 * filters out unrelated ordering dependencies. */
                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        /* Skip units ordered against us that aren't
                         * actually in this slice */
                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        /* Recurse: a member slice contributes its
                         * own members too */
                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
531
532 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
533 CGroupControllerMask m;
534
535 assert(u);
536
537 if (UNIT_ISSET(u->slice))
538 m = unit_get_members_mask(UNIT_DEREF(u->slice));
539 else
540 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
541
542 /* Sibling propagation is only relevant for weight-based
543 * controllers, so let's mask out everything else */
544 return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
545 }
546
547 CGroupControllerMask unit_get_target_mask(Unit *u) {
548 CGroupControllerMask mask;
549
550 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
551 mask &= u->manager->cgroup_supported;
552
553 return mask;
554 }
555
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        /* "more" is true only if the new mask strictly adds bits:
         * the old mask was valid, at least one new bit is set, and
         * no previously-set bit was cleared. Only in that case can
         * we cheaply OR the parent's cache instead of invalidating
         * it. */
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
604
605 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
606 Unit *u = userdata;
607
608 assert(mask != 0);
609 assert(u);
610
611 while (u) {
612 if (u->cgroup_path &&
613 u->cgroup_realized &&
614 (u->cgroup_realized_mask & mask) == mask)
615 return u->cgroup_path;
616
617 u = UNIT_DEREF(u->slice);
618 }
619
620 return NULL;
621 }
622
623 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
624 _cleanup_free_ char *path = NULL;
625 int r;
626
627 assert(u);
628
629 path = unit_default_cgroup_path(u);
630 if (!path)
631 return log_oom();
632
633 r = hashmap_put(u->manager->cgroup_unit, path, u);
634 if (r < 0) {
635 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
636 return r;
637 }
638 if (r > 0) {
639 u->cgroup_path = path;
640 path = NULL;
641 }
642
643 /* First, create our own group */
644 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
645 if (r < 0) {
646 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
647 return r;
648 }
649
650 /* Keep track that this is now realized */
651 u->cgroup_realized = true;
652 u->cgroup_realized_mask = mask;
653
654 /* Then, possibly move things over */
655 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
656 if (r < 0)
657 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
658
659 return 0;
660 }
661
662 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
663 assert(u);
664
665 return u->cgroup_realized && u->cgroup_realized_mask == mask;
666 }
667
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        /* Drop the unit from the work queue first — this is what
         * lets manager_dispatch_cgroup_queue()'s loop terminate. */
        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        /* Already realized with exactly this controller set? */
        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents — cgroups must exist top-down */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice));
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path);

        return 0;
}
707
708 static void unit_add_to_cgroup_queue(Unit *u) {
709
710 if (u->in_cgroup_queue)
711 return;
712
713 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
714 u->in_cgroup_queue = true;
715 }
716
717 unsigned manager_dispatch_cgroup_queue(Manager *m) {
718 Unit *i;
719 unsigned n = 0;
720 int r;
721
722 while ((i = m->cgroup_queue)) {
723 assert(i->in_cgroup_queue);
724
725 r = unit_realize_cgroup_now(i);
726 if (r < 0)
727 log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
728
729 n++;
730 }
731
732 return n;
733 }
734
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                /* Slice members appear among the slice's UNIT_BEFORE
                 * dependencies; the checks below filter that set
                 * down to actual members worth queuing. */
                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                /* Move one level up and queue that slice's siblings too */
                u = slice;
        }
}
772
773 int unit_realize_cgroup(Unit *u) {
774 CGroupContext *c;
775
776 assert(u);
777
778 c = unit_get_cgroup_context(u);
779 if (!c)
780 return 0;
781
782 /* So, here's the deal: when realizing the cgroups for this
783 * unit, we need to first create all parents, but there's more
784 * actually: for the weight-based controllers we also need to
785 * make sure that all our siblings (i.e. units that are in the
786 * same slice as we are) have cgroups, too. Otherwise, things
787 * would become very uneven as each of their processes would
788 * get as much resources as all our group together. This call
789 * will synchronously create the parent cgroups, but will
790 * defer work on the siblings to the next event loop
791 * iteration. */
792
793 /* Add all sibling slices to the cgroup queue. */
794 unit_queue_siblings(u);
795
796 /* And realize this one now (and apply the values) */
797 return unit_realize_cgroup_now(u);
798 }
799
800 void unit_destroy_cgroup(Unit *u) {
801 int r;
802
803 assert(u);
804
805 if (!u->cgroup_path)
806 return;
807
808 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
809 if (r < 0)
810 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
811
812 hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
813
814 free(u->cgroup_path);
815 u->cgroup_path = NULL;
816 u->cgroup_realized = false;
817 u->cgroup_realized_mask = 0;
818
819 }
820
/* Tries to determine the unit's main PID by scanning its cgroup:
 * returns the PID if exactly one child of ours lives there, and 0 if
 * there is no cgroup, no candidate, or the choice is ambiguous. */
pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                /* Skip the candidate we already picked */
                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                        in this group, so we don't know what process
                        is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}
857
/* One-time cgroup setup for the manager: determines the hierarchy
 * root, installs the release agent (system instance only), attaches
 * ourselves to the root cgroup and pins the cgroupfs mount.
 * Returns 0 on success, negative errno-style error on failure. */
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        char *e;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0) {
                log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
                return r;
        }

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == SYSTEMD_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0) {
                log_error("Cannot find cgroup mount point: %s", strerror(-r));
                return r;
        }

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        /* 3. Install agent */
        if (m->running_as == SYSTEMD_SYSTEM) {
                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
                else if (r > 0)
                        log_debug("Installed release agent.");
                else
                        log_debug("Release agent already installed.");
        }

        /* 4. Make sure we are in the root cgroup */
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
        if (r < 0) {
                log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
                return r;
        }

        /* 5. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);

        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0) {
                log_error("Failed to open pin file: %m");
                return -errno;
        }

        /* 6. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        /* 7. Always enable hierarchical support if it exists... */
        cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        return 0;
}
936
937 void manager_shutdown_cgroup(Manager *m, bool delete) {
938 assert(m);
939
940 /* We can't really delete the group, since we are in it. But
941 * let's trim it. */
942 if (delete && m->cgroup_root)
943 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
944
945 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
946
947 free(m->cgroup_root);
948 m->cgroup_root = NULL;
949 }
950
951 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
952 char *p;
953 Unit *u;
954
955 assert(m);
956 assert(cgroup);
957
958 u = hashmap_get(m->cgroup_unit, cgroup);
959 if (u)
960 return u;
961
962 p = strdupa(cgroup);
963 for (;;) {
964 char *e;
965
966 e = strrchr(p, '/');
967 if (e == p || !e)
968 return NULL;
969
970 *e = 0;
971
972 u = hashmap_get(m->cgroup_unit, p);
973 if (u)
974 return u;
975 }
976 }
977
978 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
979 _cleanup_free_ char *cgroup = NULL;
980 int r;
981
982 assert(m);
983
984 if (pid <= 1)
985 return NULL;
986
987 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
988 if (r < 0)
989 return NULL;
990
991 return manager_get_unit_by_cgroup(m, cgroup);
992 }
993
994 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
995 Unit *u;
996 int r;
997
998 assert(m);
999 assert(cgroup);
1000
1001 u = manager_get_unit_by_cgroup(m, cgroup);
1002 if (u) {
1003 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
1004 if (r > 0) {
1005 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1006 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1007
1008 unit_add_to_gc_queue(u);
1009 }
1010 }
1011
1012 return 0;
1013 }
1014
/* Human-readable names for CGroupDevicePolicy, as accepted by the
 * DevicePolicy= unit-file setting. The macro below generates the
 * *_to_string() / *_from_string() conversion helpers. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);