]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/cgroup.c
cgroups: simplify CPUQuota= logic
[thirdparty/systemd.git] / src / core / cgroup.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2013 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <fcntl.h>
23 #include <fnmatch.h>
24
25 #include "path-util.h"
26 #include "special.h"
27 #include "cgroup-util.h"
28 #include "cgroup.h"
29
30 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
31
32 void cgroup_context_init(CGroupContext *c) {
33 assert(c);
34
35 /* Initialize everything to the kernel defaults, assuming the
36 * structure is preinitialized to 0 */
37
38 c->cpu_shares = (unsigned long) -1;
39 c->startup_cpu_shares = (unsigned long) -1;
40 c->memory_limit = (uint64_t) -1;
41 c->blockio_weight = (unsigned long) -1;
42 c->startup_blockio_weight = (unsigned long) -1;
43
44 c->cpu_quota_per_sec_usec = (usec_t) -1;
45 }
46
/* Unlinks @a from @c's device whitelist and frees it. */
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}
55
/* Unlinks @w from @c's per-device blkio weight list and frees it. */
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}
64
/* Unlinks @b from @c's per-device blkio bandwidth list and frees it. */
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}
73
74 void cgroup_context_done(CGroupContext *c) {
75 assert(c);
76
77 while (c->blockio_device_weights)
78 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
79
80 while (c->blockio_device_bandwidths)
81 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
82
83 while (c->device_allow)
84 cgroup_context_free_device_allow(c, c->device_allow);
85 }
86
87 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
88 CGroupBlockIODeviceBandwidth *b;
89 CGroupBlockIODeviceWeight *w;
90 CGroupDeviceAllow *a;
91 char u[FORMAT_TIMESPAN_MAX];
92
93 assert(c);
94 assert(f);
95
96 prefix = strempty(prefix);
97
98 fprintf(f,
99 "%sCPUAccounting=%s\n"
100 "%sBlockIOAccounting=%s\n"
101 "%sMemoryAccounting=%s\n"
102 "%sCPUShares=%lu\n"
103 "%sStartupCPUShares=%lu\n"
104 "%sCPUQuotaPerSecSec=%s\n"
105 "%sBlockIOWeight=%lu\n"
106 "%sStartupBlockIOWeight=%lu\n"
107 "%sMemoryLimit=%" PRIu64 "\n"
108 "%sDevicePolicy=%s\n",
109 prefix, yes_no(c->cpu_accounting),
110 prefix, yes_no(c->blockio_accounting),
111 prefix, yes_no(c->memory_accounting),
112 prefix, c->cpu_shares,
113 prefix, c->startup_cpu_shares,
114 prefix, strna(format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1)),
115 prefix, c->blockio_weight,
116 prefix, c->startup_blockio_weight,
117 prefix, c->memory_limit,
118 prefix, cgroup_device_policy_to_string(c->device_policy));
119
120 LIST_FOREACH(device_allow, a, c->device_allow)
121 fprintf(f,
122 "%sDeviceAllow=%s %s%s%s\n",
123 prefix,
124 a->path,
125 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
126
127 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
128 fprintf(f,
129 "%sBlockIODeviceWeight=%s %lu",
130 prefix,
131 w->path,
132 w->weight);
133
134 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
135 char buf[FORMAT_BYTES_MAX];
136
137 fprintf(f,
138 "%s%s=%s %s\n",
139 prefix,
140 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
141 b->path,
142 format_bytes(buf, sizeof(buf), b->bandwidth));
143 }
144 }
145
/* Resolves @p to the block device (major:minor in *@dev) relevant for
 * blkio configuration: a block device node resolves to itself; any
 * other file resolves to the device backing the file system it lives
 * on (whole disk rather than partition, where determinable).
 * Returns 0 on success, negative errno-style code on failure. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0) {
                log_warning("Couldn't stat device %s: %m", p);
                return -errno;
        }

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                /* st_dev with major 0 indicates a virtual or network
                 * file system with no local backing block device. */
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
176
177 static int whitelist_device(const char *path, const char *node, const char *acc) {
178 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
179 struct stat st;
180 int r;
181
182 assert(path);
183 assert(acc);
184
185 if (stat(node, &st) < 0) {
186 log_warning("Couldn't stat device %s", node);
187 return -errno;
188 }
189
190 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
191 log_warning("%s is not a device.", node);
192 return -ENODEV;
193 }
194
195 sprintf(buf,
196 "%c %u:%u %s",
197 S_ISCHR(st.st_mode) ? 'c' : 'b',
198 major(st.st_rdev), minor(st.st_rdev),
199 acc);
200
201 r = cg_set_attribute("devices", path, "devices.allow", buf);
202 if (r < 0)
203 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
204
205 return r;
206 }
207
/* Whitelists an entire major number: scans /proc/devices for drivers
 * of type @type ('b' for block, 'c' for character) whose name matches
 * the shell glob @name, and adds a "<type> <maj>:* <acc>" entry to
 * devices.allow of the cgroup @path for each match. Returns 0 on
 * success (even if nothing matched), negative errno on read failure. */
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f) {
                log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
                return -errno;
        }

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                /* /proc/devices has two sections; 'good' tracks
                 * whether we are inside the one matching @type. */
                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                /* A blank line terminates the current section. */
                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                /* Each entry is "<major> <driver name>". */
                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                /* Match the driver name against the requested glob. */
                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
        }

        return 0;

fail:
        log_warning("Failed to read /proc/devices: %m");
        return -errno;
}
284
/* Writes the configuration in @c into the cgroup attribute files of
 * the cgroup @path, for each controller requested in @mask. @state
 * selects whether the Startup* variants of the CPU/blkio weights
 * apply (during MANAGER_STARTING). Failures to write individual
 * attributes are logged but not propagated (best-effort). */
void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not support on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");

        if ((mask & CGROUP_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];

                /* Prefer StartupCPUShares during boot, then CPUShares,
                 * falling back to the kernel default of 1024. */
                sprintf(buf, "%lu\n",
                        state == MANAGER_STARTING && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
                        c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));

                /* CPUQuotaPerSecUSec is scaled to the fixed CFS
                 * period; unset resets the quota ("-1"). */
                if (c->cpu_quota_per_sec_usec != (usec_t) -1) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
        }

        if (mask & CGROUP_BLKIO) {
                /* Sized for the largest of the three formats written
                 * below (weight, per-device weight, per-device bandwidth). */
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        /* Same precedence as cpu.shares above; kernel
                         * default weight is 1000. */
                        sprintf(buf, "%lu\n", state == MANAGER_STARTING && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
                                c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
                }
        }

        if (mask & CGROUP_MEMORY) {
                /* Unset limit resets the attribute ("-1" = unlimited). */
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* With an explicit whitelist or a non-auto policy we
                 * start from "deny all", otherwise from "allow all". */
                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_warning("Failed to reset devices.list on %s: %s", path, strerror(-r));

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        /* Standard pseudo-devices every service is
                         * expected to be able to use. */
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        /* Build the access string from the r/w/m flags. */
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        /* Paths name a specific node; "block-"/"char-"
                         * prefixes name a whole major by driver name. */
                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}
441
442 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
443 CGroupControllerMask mask = 0;
444
445 /* Figure out which controllers we need */
446
447 if (c->cpu_accounting ||
448 c->cpu_shares != (unsigned long) -1 ||
449 c->startup_cpu_shares != (unsigned long) -1 ||
450 c->cpu_quota_per_sec_usec != (usec_t) -1)
451 mask |= CGROUP_CPUACCT | CGROUP_CPU;
452
453 if (c->blockio_accounting ||
454 c->blockio_weight != (unsigned long) -1 ||
455 c->startup_blockio_weight != (unsigned long) -1 ||
456 c->blockio_device_weights ||
457 c->blockio_device_bandwidths)
458 mask |= CGROUP_BLKIO;
459
460 if (c->memory_accounting ||
461 c->memory_limit != (uint64_t) -1)
462 mask |= CGROUP_MEMORY;
463
464 if (c->device_allow || c->device_policy != CGROUP_AUTO)
465 mask |= CGROUP_DEVICE;
466
467 return mask;
468 }
469
470 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
471 CGroupContext *c;
472
473 c = unit_get_cgroup_context(u);
474 if (!c)
475 return 0;
476
477 return cgroup_context_get_mask(c);
478 }
479
/* Returns the union of the controller masks of all units contained in
 * @u. Only slices contain other units; for everything else this is 0.
 * The result is cached in u->cgroup_members_mask until invalidated via
 * cgroup_members_mask_valid. */
CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                /* Members of a slice are ordered Before= it; filter to
                 * those whose slice reference actually points at us. */
                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
509
510 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
511 assert(u);
512
513 if (UNIT_ISSET(u->slice))
514 return unit_get_members_mask(UNIT_DEREF(u->slice));
515
516 return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
517 }
518
519 CGroupControllerMask unit_get_target_mask(Unit *u) {
520 CGroupControllerMask mask;
521
522 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
523 mask &= u->manager->cgroup_supported;
524
525 return mask;
526 }
527
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        /* 'more' is true only if the previous mask was valid and bits
         * were exclusively added (some new bit set, no old bit
         * cleared); only then can we cheaply OR into the parent. */
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
576
577 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
578 Unit *u = userdata;
579
580 assert(mask != 0);
581 assert(u);
582
583 while (u) {
584 if (u->cgroup_path &&
585 u->cgroup_realized &&
586 (u->cgroup_realized_mask & mask) == mask)
587 return u->cgroup_path;
588
589 u = UNIT_DEREF(u->slice);
590 }
591
592 return NULL;
593 }
594
595 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
596 _cleanup_free_ char *path = NULL;
597 int r;
598
599 assert(u);
600
601 path = unit_default_cgroup_path(u);
602 if (!path)
603 return log_oom();
604
605 r = hashmap_put(u->manager->cgroup_unit, path, u);
606 if (r < 0) {
607 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
608 return r;
609 }
610 if (r > 0) {
611 u->cgroup_path = path;
612 path = NULL;
613 }
614
615 /* First, create our own group */
616 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
617 if (r < 0) {
618 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
619 return r;
620 }
621
622 /* Keep track that this is now realized */
623 u->cgroup_realized = true;
624 u->cgroup_realized_mask = mask;
625
626 /* Then, possibly move things over */
627 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
628 if (r < 0)
629 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
630
631 return 0;
632 }
633
634 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
635 assert(u);
636
637 return u->cgroup_realized && u->cgroup_realized_mask == mask;
638 }
639
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        /* We are handling this unit now; take it off the pending
         * realization queue so the dispatcher doesn't see it again. */
        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                /* Recursion depth is bounded by slice nesting depth. */
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);

        return 0;
}
679
680 static void unit_add_to_cgroup_queue(Unit *u) {
681
682 if (u->in_cgroup_queue)
683 return;
684
685 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
686 u->in_cgroup_queue = true;
687 }
688
689 unsigned manager_dispatch_cgroup_queue(Manager *m) {
690 ManagerState state;
691 unsigned n = 0;
692 Unit *i;
693 int r;
694
695 state = manager_state(m);
696
697 while ((i = m->cgroup_queue)) {
698 assert(i->in_cgroup_queue);
699
700 r = unit_realize_cgroup_now(i, state);
701 if (r < 0)
702 log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
703
704 n++;
705 }
706
707 return n;
708 }
709
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                /* Sibling candidates are units ordered Before= the
                 * slice; see unit_get_members_mask(). */
                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                /* Move up one level and queue that level's siblings too. */
                u = slice;
        }
}
747
/* Public entry point: realizes @u's cgroup synchronously (including
 * its parents) and schedules its siblings for deferred realization.
 * Returns 0 for units without a cgroup context, otherwise the result
 * of unit_realize_cgroup_now(). */
int unit_realize_cgroup(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
774
/* Trims @u's cgroup from all hierarchies, unregisters it from the
 * manager's cgroup→unit map and resets all realization state. A
 * no-op for units without a cgroup path. */
void unit_destroy_cgroup(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        /* Never remove the root slice's own cgroup, only its children. */
        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0)
                log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));

        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        /* Forget the path and all realization state. */
        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;

}
795
/* Scans @u's cgroup for a single daemonized main process: a process
 * whose parent is us (PID 1). Returns that PID, or 0 if there is no
 * candidate or more than one (ambiguous). */
pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                /* Skip repeated occurrences of the candidate we
                 * already found. */
                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                        in this group, so we don't know what process
                        is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}
832
/* One-time cgroup setup for the manager: determines our root cgroup,
 * installs the release agent (system instance only), moves us into
 * the root group, pins the cgroupfs mount and probes which
 * controllers the kernel supports. Returns 0 on success, negative
 * error code on failure. */
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        char *e;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0) {
                log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
                return r;
        }

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == SYSTEMD_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0) {
                log_error("Cannot find cgroup mount point: %s", strerror(-r));
                return r;
        }

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        /* 3. Install agent */
        if (m->running_as == SYSTEMD_SYSTEM) {
                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
                else if (r > 0)
                        log_debug("Installed release agent.");
                else
                        log_debug("Release agent already installed.");
        }

        /* 4. Make sure we are in the root cgroup */
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
        if (r < 0) {
                log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
                return r;
        }

        /* 5. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);

        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0) {
                log_error("Failed to open pin file: %m");
                return -errno;
        }

        /* 6. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        /* 7. Always enable hierarchial support if it exists... */
        cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        return 0;
}
911
/* Shutdown counterpart to manager_setup_cgroup(): optionally trims
 * our root cgroup (we cannot delete the group we live in), releases
 * the pinning fd and frees the stored root path. */
void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        free(m->cgroup_root);
        m->cgroup_root = NULL;
}
925
926 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
927 char *p;
928 Unit *u;
929
930 assert(m);
931 assert(cgroup);
932
933 u = hashmap_get(m->cgroup_unit, cgroup);
934 if (u)
935 return u;
936
937 p = strdupa(cgroup);
938 for (;;) {
939 char *e;
940
941 e = strrchr(p, '/');
942 if (e == p || !e)
943 return NULL;
944
945 *e = 0;
946
947 u = hashmap_get(m->cgroup_unit, p);
948 if (u)
949 return u;
950 }
951 }
952
953 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
954 _cleanup_free_ char *cgroup = NULL;
955 int r;
956
957 assert(m);
958
959 if (pid <= 1)
960 return NULL;
961
962 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
963 if (r < 0)
964 return NULL;
965
966 return manager_get_unit_by_cgroup(m, cgroup);
967 }
968
969 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
970 Unit *u;
971 int r;
972
973 assert(m);
974 assert(cgroup);
975
976 u = manager_get_unit_by_cgroup(m, cgroup);
977 if (u) {
978 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
979 if (r > 0) {
980 if (UNIT_VTABLE(u)->notify_cgroup_empty)
981 UNIT_VTABLE(u)->notify_cgroup_empty(u);
982
983 unit_add_to_gc_queue(u);
984 }
985 }
986
987 return 0;
988 }
989
/* String representations for DevicePolicy= values, plus the generated
 * cgroup_device_policy_{to,from}_string() lookup helpers. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);