/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "path-util.h"
#include "special.h"
#include "cgroup-util.h"
#include "cgroup.h"

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = (unsigned long) -1;
        c->startup_cpu_shares = (unsigned long) -1;
        c->memory_limit = (uint64_t) -1;
        c->blockio_weight = (unsigned long) -1;
        c->startup_blockio_weight = (unsigned long) -1;

        c->cpu_quota_per_sec_usec = (usec_t) -1;
        c->cpu_quota_usec = (usec_t) -1;
        c->cpu_quota_period_usec = 100*USEC_PER_MSEC;
}

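/* For illustration: throughout this file a value of (unsigned long) -1,
 * (uint64_t) -1 or (usec_t) -1 acts as an "unset" marker, meaning the
 * kernel default stays in effect. E.g., right after cgroup_context_init():
 *
 *     CGroupContext c = {};
 *     cgroup_context_init(&c);
 *     assert(c.cpu_shares == (unsigned long) -1);            // unset
 *     assert(c.cpu_quota_period_usec == 100*USEC_PER_MSEC);  // 100ms default
 */
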
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

usec_t cgroup_context_get_cpu_quota_usec(CGroupContext *c) {
        assert(c);

        /* Returns the absolute CPU quota */

        if (c->cpu_quota_usec != (usec_t) -1)
                return c->cpu_quota_usec;
        else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
                return c->cpu_quota_per_sec_usec*c->cpu_quota_period_usec/USEC_PER_SEC;
        else
                return (usec_t) -1;
}

usec_t cgroup_context_get_cpu_quota_per_sec_usec(CGroupContext *c) {
        assert(c);

        /* Returns the CPU quota relative to 1s */

        if (c->cpu_quota_usec != (usec_t) -1)
                return c->cpu_quota_usec*USEC_PER_SEC/c->cpu_quota_period_usec;
        else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
                return c->cpu_quota_per_sec_usec;
        else
                return (usec_t) -1;
}

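/* Worked example for the two quota accessors above: with the default
 * 100ms period, an absolute quota of 20ms corresponds to a per-second
 * quota of 200ms, and vice versa:
 *
 *     c->cpu_quota_period_usec = 100*USEC_PER_MSEC;
 *     c->cpu_quota_usec = 20*USEC_PER_MSEC;
 *     // 20ms * 1s / 100ms = 200ms
 *     assert(cgroup_context_get_cpu_quota_per_sec_usec(c) == 200*USEC_PER_MSEC);
 *
 *     c->cpu_quota_usec = (usec_t) -1;
 *     c->cpu_quota_per_sec_usec = 200*USEC_PER_MSEC;
 *     // 200ms * 100ms / 1s = 20ms
 *     assert(cgroup_context_get_cpu_quota_usec(c) == 20*USEC_PER_MSEC);
 */
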
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char t[FORMAT_TIMESPAN_MAX], s[FORMAT_TIMESPAN_MAX], u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sCPUShares=%lu\n"
                "%sStartupCPUShares=%lu\n"
                "%sCPUQuota=%s\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sCPUQuotaPeriodSec=%s\n"
                "%sBlockIOWeight=%lu\n"
                "%sStartupBlockIOWeight=%lu\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, strna(format_timespan(u, sizeof(u), cgroup_context_get_cpu_quota_usec(c), 1)),
                prefix, strna(format_timespan(t, sizeof(t), cgroup_context_get_cpu_quota_per_sec_usec(c), 1)),
                prefix, strna(format_timespan(s, sizeof(s), c->cpu_quota_period_usec, 1)),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, cgroup_device_policy_to_string(c->device_policy));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %lu\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}

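/* For illustration, a context with explicit CPUShares= and one read
 * bandwidth entry dumps roughly like this (values hypothetical, middle
 * lines elided):
 *
 *     CPUAccounting=no
 *     BlockIOAccounting=no
 *     MemoryAccounting=no
 *     CPUShares=1500
 *     ...
 *     DevicePolicy=auto
 *     BlockIOReadBandwidth=/var/log 1.0M
 */
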
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0) {
                log_warning("Couldn't stat device %s: %m", p);
                return -errno;
        }

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

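/* For illustration: passing "/dev/sda" yields that node's st_rdev
 * directly; passing a regular path such as "/var/log" (hypothetical)
 * yields the st_dev of the backing file system, which
 * block_get_whole_disk() then resolves from a partition (e.g. sda1)
 * to the whole disk (sda). */
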
static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0) {
                log_warning("Couldn't stat device %s: %m", node);
                return -errno;
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));

        return r;
}

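/* For illustration: whitelist_device(path, "/dev/null", "rwm") stats
 * /dev/null (character device 1:3) and writes the line
 *
 *     c 1:3 rwm
 *
 * into that cgroup's devices.allow, granting read, write and mknod
 * access to the node. */
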
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f) {
                log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
                return -errno;
        }

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj == 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
        }

        return 0;

fail:
        log_warning("Failed to read /proc/devices: %m");
        return -errno;
}

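/* For illustration, /proc/devices looks like this (majors vary by
 * system):
 *
 *     Character devices:
 *       1 mem
 *       5 /dev/tty
 *     136 pts
 *
 *     Block devices:
 *       8 sd
 *
 * so whitelist_major(path, "pts", 'c', "rw") matches the "pts" entry
 * and writes "c 136:* rw" to devices.allow, covering all pseudo
 * terminal slaves at once. */
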
void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore them there */
        is_root = isempty(path) || path_equal(path, "/");

        if ((mask & CGROUP_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
                usec_t q;

                sprintf(buf, "%lu\n",
                        state == MANAGER_STARTING && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
                        c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));

                sprintf(buf, USEC_FMT "\n", c->cpu_quota_period_usec);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));

                q = cgroup_context_get_cpu_quota_usec(c);
                if (q != (usec_t) -1) {
                        sprintf(buf, USEC_FMT "\n", q);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
        }

        if (mask & CGROUP_BLKIO) {
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%lu\n", state == MANAGER_STARTING && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
                                c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
                }
        }

        if (mask & CGROUP_MEMORY) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_warning("Failed to reset devices.list on %s: %s", path, strerror(-r));

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}

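/* For illustration: applying a context with StartupCPUShares=2000 while
 * the manager is still starting up boils down to these attribute writes
 * (path hypothetical):
 *
 *     cg_set_attribute("cpu", "/system.slice/foo.service", "cpu.shares", "2000\n");
 *     cg_set_attribute("cpu", "/system.slice/foo.service", "cpu.cfs_period_us", "100000\n");
 *     cg_set_attribute("cpu", "/system.slice/foo.service", "cpu.cfs_quota_us", "-1");
 *
 * Once startup is over the same context is re-applied and cpu.shares
 * falls back to CPUShares= (or the kernel default of 1024). */
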
CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupControllerMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != (unsigned long) -1 ||
            c->startup_cpu_shares != (unsigned long) -1 ||
            c->cpu_quota_usec != (usec_t) -1 ||
            c->cpu_quota_per_sec_usec != (usec_t) -1)
                mask |= CGROUP_CPUACCT | CGROUP_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != (unsigned long) -1 ||
            c->startup_blockio_weight != (unsigned long) -1 ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MEMORY;

        if (c->device_allow || c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_DEVICE;

        return mask;
}

CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
        CGroupContext *c;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        return cgroup_context_get_mask(c);
}

CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

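/* For illustration, with a (hypothetical) tree
 *
 *     system.slice
 *     ├── a.service   (CPUShares=500   → CGROUP_CPU|CGROUP_CPUACCT)
 *     └── b.service   (MemoryLimit=1G  → CGROUP_MEMORY)
 *
 * unit_get_members_mask(system.slice) recurses over both members and
 * returns CGROUP_CPU|CGROUP_CPUACCT|CGROUP_MEMORY, cached in
 * u->cgroup_members_mask until invalidated. */
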
CGroupControllerMask unit_get_siblings_mask(Unit *u) {
        CGroupControllerMask m;

        assert(u);

        if (UNIT_ISSET(u->slice))
                m = unit_get_members_mask(UNIT_DEREF(u->slice));
        else
                m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* Sibling propagation is only relevant for weight-based
         * controllers, so let's mask out everything else */
        return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
}

CGroupControllerMask unit_get_target_mask(Unit *u) {
        CGroupControllerMask mask;

        mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also a member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

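/* Worked example for the "more" test above: it is true iff the new mask
 * is a strict superset of the old one. With old mask CGROUP_CPU and new
 * mask CGROUP_CPU|CGROUP_MEMORY:
 *
 *     m & ~old == CGROUP_MEMORY   (!= 0, something was added)
 *     ~m & old == 0               (nothing was removed)
 *
 * so the new bits can simply be ORed into the parent. If any bit had
 * been dropped instead, the parent's cached members mask would have to
 * be invalidated and recomputed from scratch. */
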
static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(u);

        path = unit_default_cgroup_path(u);
        if (!path)
                return log_oom();

        r = hashmap_put(u->manager->cgroup_unit, path, u);
        if (r < 0) {
                log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
                return r;
        }
        if (r > 0) {
                u->cgroup_path = path;
                path = NULL;
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
        if (r < 0) {
                log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
                return r;
        }

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = mask;

        /* Then, possibly move things over */
        r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
        if (r < 0)
                log_warning("Failed to migrate cgroup to %s: %s", u->cgroup_path, strerror(-r));

        return 0;
}

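/* For illustration: for a (hypothetical) unit foo.service in
 * system.slice, unit_default_cgroup_path() yields
 * "/system.slice/foo.service" and the group is created in all supported
 * hierarchies. For controllers the unit itself has not realized,
 * migrate_callback() walks up the slice chain and returns the closest
 * realized ancestor's path as the migration target. */
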
static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == mask;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);

        return 0;
}

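/* For illustration: realizing a (hypothetical) unit in
 * foo.slice/bar.service proceeds root-first via the recursion above:
 * first -.slice and foo.slice are realized, then bar.service itself,
 * and only then are the attributes applied, so every write lands in a
 * cgroup that already exists. */
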
static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as many resources as our whole group together. This
         * call will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_destroy_cgroup(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0)
                log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));

        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}

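/* For illustration: if the unit's cgroup contains PIDs 100, 101 and 102
 * (hypothetical), where only 100 has the manager as its parent and
 * 101/102 are children of 100, the loop settles on 100 as the main PID.
 * If two candidates were direct children of the manager, the result
 * would be 0, i.e. "ambiguous, no main PID". */
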
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        char *e;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0) {
                log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
                return r;
        }

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == SYSTEMD_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0) {
                log_error("Cannot find cgroup mount point: %s", strerror(-r));
                return r;
        }

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        /* 3. Install agent */
        if (m->running_as == SYSTEMD_SYSTEM) {
                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
                else if (r > 0)
                        log_debug("Installed release agent.");
                else
                        log_debug("Release agent already installed.");
        }

        /* 4. Make sure we are in the root cgroup */
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
        if (r < 0) {
                log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
                return r;
        }

        /* 5. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);

        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0) {
                log_error("Failed to open pin file: %m");
                return -errno;
        }

        /* 6. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        /* 7. Always enable hierarchical support if it exists... */
        cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        return 0;
}

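/* For illustration: if PID 1 finds itself in "/system" (a pre-slice
 * layout), the legacy branch trims that suffix so m->cgroup_root
 * becomes "". Storing the root without a trailing slash means paths
 * can be built by plain concatenation, e.g. "" + "/system.slice" ->
 * "/system.slice". */
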
void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        free(m->cgroup_root);
        m->cgroup_root = NULL;
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (e == p || !e)
                        return NULL;

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

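/* For illustration: looking up "/system.slice/foo.service/control"
 * (hypothetical) first misses in the hashmap, then the loop strips one
 * path component at a time and retries:
 *
 *     /system.slice/foo.service/control   -> miss
 *     /system.slice/foo.service           -> hit, foo.service
 *
 * so processes in sub-cgroups are still attributed to the owning
 * unit. */
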
Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 1)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;
        int r;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (u) {
                r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
                if (r > 0) {
                        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                                UNIT_VTABLE(u)->notify_cgroup_empty(u);

                        unit_add_to_gc_queue(u);
                }
        }

        return 0;
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);