]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/cgroup.c
core: introduce new Delegate=yes/no property controlling creation of cgroup subhierar...
[thirdparty/systemd.git] / src / core / cgroup.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2013 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <fcntl.h>
23 #include <fnmatch.h>
24
25 #include "path-util.h"
26 #include "special.h"
27 #include "cgroup-util.h"
28 #include "cgroup.h"
29
/* Period (100ms) that is written to cpu.cfs_period_us; the per-second CPU
 * quota is scaled down to this period before it is written to
 * cpu.cfs_quota_us. */
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
31
32 void cgroup_context_init(CGroupContext *c) {
33 assert(c);
34
35 /* Initialize everything to the kernel defaults, assuming the
36 * structure is preinitialized to 0 */
37
38 c->cpu_shares = (unsigned long) -1;
39 c->startup_cpu_shares = (unsigned long) -1;
40 c->memory_limit = (uint64_t) -1;
41 c->blockio_weight = (unsigned long) -1;
42 c->startup_blockio_weight = (unsigned long) -1;
43
44 c->cpu_quota_per_sec_usec = USEC_INFINITY;
45 }
46
47 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
48 assert(c);
49 assert(a);
50
51 LIST_REMOVE(device_allow, c->device_allow, a);
52 free(a->path);
53 free(a);
54 }
55
56 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
57 assert(c);
58 assert(w);
59
60 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
61 free(w->path);
62 free(w);
63 }
64
65 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
66 assert(c);
67 assert(b);
68
69 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
70 free(b->path);
71 free(b);
72 }
73
74 void cgroup_context_done(CGroupContext *c) {
75 assert(c);
76
77 while (c->blockio_device_weights)
78 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
79
80 while (c->blockio_device_bandwidths)
81 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
82
83 while (c->device_allow)
84 cgroup_context_free_device_allow(c, c->device_allow);
85 }
86
87 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
88 CGroupBlockIODeviceBandwidth *b;
89 CGroupBlockIODeviceWeight *w;
90 CGroupDeviceAllow *a;
91 char u[FORMAT_TIMESPAN_MAX];
92
93 assert(c);
94 assert(f);
95
96 prefix = strempty(prefix);
97
98 fprintf(f,
99 "%sCPUAccounting=%s\n"
100 "%sBlockIOAccounting=%s\n"
101 "%sMemoryAccounting=%s\n"
102 "%sCPUShares=%lu\n"
103 "%sStartupCPUShares=%lu\n"
104 "%sCPUQuotaPerSecSec=%s\n"
105 "%sBlockIOWeight=%lu\n"
106 "%sStartupBlockIOWeight=%lu\n"
107 "%sMemoryLimit=%" PRIu64 "\n"
108 "%sDevicePolicy=%s\n"
109 "%sDelegate=%s\n",
110 prefix, yes_no(c->cpu_accounting),
111 prefix, yes_no(c->blockio_accounting),
112 prefix, yes_no(c->memory_accounting),
113 prefix, c->cpu_shares,
114 prefix, c->startup_cpu_shares,
115 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
116 prefix, c->blockio_weight,
117 prefix, c->startup_blockio_weight,
118 prefix, c->memory_limit,
119 prefix, cgroup_device_policy_to_string(c->device_policy),
120 prefix, yes_no(c->delegate));
121
122 LIST_FOREACH(device_allow, a, c->device_allow)
123 fprintf(f,
124 "%sDeviceAllow=%s %s%s%s\n",
125 prefix,
126 a->path,
127 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
128
129 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
130 fprintf(f,
131 "%sBlockIODeviceWeight=%s %lu",
132 prefix,
133 w->path,
134 w->weight);
135
136 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
137 char buf[FORMAT_BYTES_MAX];
138
139 fprintf(f,
140 "%s%s=%s %s\n",
141 prefix,
142 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
143 b->path,
144 format_bytes(buf, sizeof(buf), b->bandwidth));
145 }
146 }
147
/* Resolves a path to the block device number to use for block IO settings.
 *
 * If 'p' is a block device node its own device number is returned. For any
 * other file the device of the file system it is stored on is used, and an
 * attempt is made to replace a partition by its whole disk.
 *
 * Returns 0 on success (with *dev set) and a negative errno-style value on
 * failure. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;

        assert(p);
        assert(dev);

        if (stat(p, &st) < 0) {
                log_warning("Couldn't stat device %s: %m", p);
                return -errno;
        }

        /* A block device node: take its number directly */
        if (S_ISBLK(st.st_mode)) {
                *dev = st.st_rdev;
                return 0;
        }

        /* Major 0 means there is no backing block device we could use */
        if (major(st.st_dev) == 0) {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        /* Not a device node: use the block device the file is stored
         * on, and try to map a partition to its originating disk. The
         * result of that mapping attempt is deliberately ignored. */
        *dev = st.st_dev;
        block_get_whole_disk(*dev, dev);

        return 0;
}
178
179 static int whitelist_device(const char *path, const char *node, const char *acc) {
180 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
181 struct stat st;
182 int r;
183
184 assert(path);
185 assert(acc);
186
187 if (stat(node, &st) < 0) {
188 log_warning("Couldn't stat device %s", node);
189 return -errno;
190 }
191
192 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
193 log_warning("%s is not a device.", node);
194 return -ENODEV;
195 }
196
197 sprintf(buf,
198 "%c %u:%u %s",
199 S_ISCHR(st.st_mode) ? 'c' : 'b',
200 major(st.st_rdev), minor(st.st_rdev),
201 acc);
202
203 r = cg_set_attribute("devices", path, "devices.allow", buf);
204 if (r < 0)
205 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
206
207 return r;
208 }
209
210 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
211 _cleanup_fclose_ FILE *f = NULL;
212 char line[LINE_MAX];
213 bool good = false;
214 int r;
215
216 assert(path);
217 assert(acc);
218 assert(type == 'b' || type == 'c');
219
220 f = fopen("/proc/devices", "re");
221 if (!f) {
222 log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
223 return -errno;
224 }
225
226 FOREACH_LINE(line, f, goto fail) {
227 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
228 unsigned maj;
229
230 truncate_nl(line);
231
232 if (type == 'c' && streq(line, "Character devices:")) {
233 good = true;
234 continue;
235 }
236
237 if (type == 'b' && streq(line, "Block devices:")) {
238 good = true;
239 continue;
240 }
241
242 if (isempty(line)) {
243 good = false;
244 continue;
245 }
246
247 if (!good)
248 continue;
249
250 p = strstrip(line);
251
252 w = strpbrk(p, WHITESPACE);
253 if (!w)
254 continue;
255 *w = 0;
256
257 r = safe_atou(p, &maj);
258 if (r < 0)
259 continue;
260 if (maj <= 0)
261 continue;
262
263 w++;
264 w += strspn(w, WHITESPACE);
265
266 if (fnmatch(name, w, 0) != 0)
267 continue;
268
269 sprintf(buf,
270 "%c %u:* %s",
271 type,
272 maj,
273 acc);
274
275 r = cg_set_attribute("devices", path, "devices.allow", buf);
276 if (r < 0)
277 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
278 }
279
280 return 0;
281
282 fail:
283 log_warning("Failed to read /proc/devices: %m");
284 return -errno;
285 }
286
287 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
288 bool is_root;
289 int r;
290
291 assert(c);
292 assert(path);
293
294 if (mask == 0)
295 return;
296
297 /* Some cgroup attributes are not support on the root cgroup,
298 * hence silently ignore */
299 is_root = isempty(path) || path_equal(path, "/");
300
301 if ((mask & CGROUP_CPU) && !is_root) {
302 char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
303
304 sprintf(buf, "%lu\n",
305 IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
306 c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
307 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
308 if (r < 0)
309 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.shares on %s: %s", path, strerror(-r));
310
311 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
312 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
313 if (r < 0)
314 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));
315
316 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
317 sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
318 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
319 } else
320 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
321 if (r < 0)
322 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
323 }
324
325 if (mask & CGROUP_BLKIO) {
326 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
327 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
328 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
329 CGroupBlockIODeviceWeight *w;
330 CGroupBlockIODeviceBandwidth *b;
331
332 if (!is_root) {
333 sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
334 c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
335 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
336 if (r < 0)
337 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight on %s: %s", path, strerror(-r));
338
339 /* FIXME: no way to reset this list */
340 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
341 dev_t dev;
342
343 r = lookup_blkio_device(w->path, &dev);
344 if (r < 0)
345 continue;
346
347 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
348 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
349 if (r < 0)
350 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
351 }
352 }
353
354 /* FIXME: no way to reset this list */
355 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
356 const char *a;
357 dev_t dev;
358
359 r = lookup_blkio_device(b->path, &dev);
360 if (r < 0)
361 continue;
362
363 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
364
365 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
366 r = cg_set_attribute("blkio", path, a, buf);
367 if (r < 0)
368 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set %s on %s: %s", a, path, strerror(-r));
369 }
370 }
371
372 if (mask & CGROUP_MEMORY) {
373 if (c->memory_limit != (uint64_t) -1) {
374 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
375
376 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
377 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
378 } else
379 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
380
381 if (r < 0)
382 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
383 }
384
385 if ((mask & CGROUP_DEVICE) && !is_root) {
386 CGroupDeviceAllow *a;
387
388 if (c->device_allow || c->device_policy != CGROUP_AUTO)
389 r = cg_set_attribute("devices", path, "devices.deny", "a");
390 else
391 r = cg_set_attribute("devices", path, "devices.allow", "a");
392 if (r < 0)
393 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to reset devices.list on %s: %s", path, strerror(-r));
394
395 if (c->device_policy == CGROUP_CLOSED ||
396 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
397 static const char auto_devices[] =
398 "/dev/null\0" "rwm\0"
399 "/dev/zero\0" "rwm\0"
400 "/dev/full\0" "rwm\0"
401 "/dev/random\0" "rwm\0"
402 "/dev/urandom\0" "rwm\0"
403 "/dev/tty\0" "rwm\0"
404 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
405
406 const char *x, *y;
407
408 NULSTR_FOREACH_PAIR(x, y, auto_devices)
409 whitelist_device(path, x, y);
410
411 whitelist_major(path, "pts", 'c', "rw");
412 whitelist_major(path, "kdbus", 'c', "rw");
413 whitelist_major(path, "kdbus/*", 'c', "rw");
414 }
415
416 LIST_FOREACH(device_allow, a, c->device_allow) {
417 char acc[4];
418 unsigned k = 0;
419
420 if (a->r)
421 acc[k++] = 'r';
422 if (a->w)
423 acc[k++] = 'w';
424 if (a->m)
425 acc[k++] = 'm';
426
427 if (k == 0)
428 continue;
429
430 acc[k++] = 0;
431
432 if (startswith(a->path, "/dev/"))
433 whitelist_device(path, a->path, acc);
434 else if (startswith(a->path, "block-"))
435 whitelist_major(path, a->path + 6, 'b', acc);
436 else if (startswith(a->path, "char-"))
437 whitelist_major(path, a->path + 5, 'c', acc);
438 else
439 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
440 }
441 }
442 }
443
444 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
445 CGroupControllerMask mask = 0;
446
447 /* Figure out which controllers we need */
448
449 if (c->cpu_accounting ||
450 c->cpu_shares != (unsigned long) -1 ||
451 c->startup_cpu_shares != (unsigned long) -1 ||
452 c->cpu_quota_per_sec_usec != USEC_INFINITY)
453 mask |= CGROUP_CPUACCT | CGROUP_CPU;
454
455 if (c->blockio_accounting ||
456 c->blockio_weight != (unsigned long) -1 ||
457 c->startup_blockio_weight != (unsigned long) -1 ||
458 c->blockio_device_weights ||
459 c->blockio_device_bandwidths)
460 mask |= CGROUP_BLKIO;
461
462 if (c->memory_accounting ||
463 c->memory_limit != (uint64_t) -1)
464 mask |= CGROUP_MEMORY;
465
466 if (c->device_allow ||
467 c->device_policy != CGROUP_AUTO)
468 mask |= CGROUP_DEVICE;
469
470 return mask;
471 }
472
473 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
474 CGroupContext *c;
475
476 c = unit_get_cgroup_context(u);
477 if (!c)
478 return 0;
479
480 /* If delegation is turned on, then turn on all cgroups,
481 * unless the process we fork into it is known to drop
482 * privileges anyway, and shouldn't get access to the
483 * controllers anyway. */
484
485 if (c->delegate) {
486 ExecContext *e;
487
488 e = unit_get_exec_context(u);
489 if (!e || exec_context_maintains_privileges(e))
490 return _CGROUP_CONTROLLER_MASK_ALL;
491 }
492
493 return cgroup_context_get_mask(c);
494 }
495
496 CGroupControllerMask unit_get_members_mask(Unit *u) {
497 assert(u);
498
499 if (u->cgroup_members_mask_valid)
500 return u->cgroup_members_mask;
501
502 u->cgroup_members_mask = 0;
503
504 if (u->type == UNIT_SLICE) {
505 Unit *member;
506 Iterator i;
507
508 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
509
510 if (member == u)
511 continue;
512
513 if (UNIT_DEREF(member->slice) != u)
514 continue;
515
516 u->cgroup_members_mask |=
517 unit_get_cgroup_mask(member) |
518 unit_get_members_mask(member);
519 }
520 }
521
522 u->cgroup_members_mask_valid = true;
523 return u->cgroup_members_mask;
524 }
525
526 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
527 assert(u);
528
529 if (UNIT_ISSET(u->slice))
530 return unit_get_members_mask(UNIT_DEREF(u->slice));
531
532 return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
533 }
534
535 CGroupControllerMask unit_get_target_mask(Unit *u) {
536 CGroupControllerMask mask;
537
538 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
539 mask &= u->manager->cgroup_supported;
540
541 return mask;
542 }
543
544 /* Recurse from a unit up through its containing slices, propagating
545 * mask bits upward. A unit is also member of itself. */
546 void unit_update_cgroup_members_masks(Unit *u) {
547 CGroupControllerMask m;
548 bool more;
549
550 assert(u);
551
552 /* Calculate subtree mask */
553 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
554
555 /* See if anything changed from the previous invocation. If
556 * not, we're done. */
557 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
558 return;
559
560 more =
561 u->cgroup_subtree_mask_valid &&
562 ((m & ~u->cgroup_subtree_mask) != 0) &&
563 ((~m & u->cgroup_subtree_mask) == 0);
564
565 u->cgroup_subtree_mask = m;
566 u->cgroup_subtree_mask_valid = true;
567
568 if (UNIT_ISSET(u->slice)) {
569 Unit *s = UNIT_DEREF(u->slice);
570
571 if (more)
572 /* There's more set now than before. We
573 * propagate the new mask to the parent's mask
574 * (not caring if it actually was valid or
575 * not). */
576
577 s->cgroup_members_mask |= m;
578
579 else
580 /* There's less set now than before (or we
581 * don't know), we need to recalculate
582 * everything, so let's invalidate the
583 * parent's members mask */
584
585 s->cgroup_members_mask_valid = false;
586
587 /* And now make sure that this change also hits our
588 * grandparents */
589 unit_update_cgroup_members_masks(s);
590 }
591 }
592
593 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
594 Unit *u = userdata;
595
596 assert(mask != 0);
597 assert(u);
598
599 while (u) {
600 if (u->cgroup_path &&
601 u->cgroup_realized &&
602 (u->cgroup_realized_mask & mask) == mask)
603 return u->cgroup_path;
604
605 u = UNIT_DEREF(u->slice);
606 }
607
608 return NULL;
609 }
610
611 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
612 _cleanup_free_ char *path = NULL;
613 int r;
614
615 assert(u);
616
617 path = unit_default_cgroup_path(u);
618 if (!path)
619 return log_oom();
620
621 r = hashmap_put(u->manager->cgroup_unit, path, u);
622 if (r < 0) {
623 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
624 return r;
625 }
626 if (r > 0) {
627 u->cgroup_path = path;
628 path = NULL;
629 }
630
631 /* First, create our own group */
632 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
633 if (r < 0) {
634 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
635 return r;
636 }
637
638 /* Keep track that this is now realized */
639 u->cgroup_realized = true;
640 u->cgroup_realized_mask = mask;
641
642 /* Then, possibly move things over */
643 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
644 if (r < 0)
645 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
646
647 return 0;
648 }
649
650 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
651 assert(u);
652
653 return u->cgroup_realized && u->cgroup_realized_mask == mask;
654 }
655
656 /* Check if necessary controllers and attributes for a unit are in place.
657 *
658 * If so, do nothing.
659 * If not, create paths, move processes over, and set attributes.
660 *
661 * Returns 0 on success and < 0 on failure. */
662 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
663 CGroupControllerMask mask;
664 int r;
665
666 assert(u);
667
668 if (u->in_cgroup_queue) {
669 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
670 u->in_cgroup_queue = false;
671 }
672
673 mask = unit_get_target_mask(u);
674
675 if (unit_has_mask_realized(u, mask))
676 return 0;
677
678 /* First, realize parents */
679 if (UNIT_ISSET(u->slice)) {
680 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
681 if (r < 0)
682 return r;
683 }
684
685 /* And then do the real work */
686 r = unit_create_cgroups(u, mask);
687 if (r < 0)
688 return r;
689
690 /* Finally, apply the necessary attributes. */
691 cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);
692
693 return 0;
694 }
695
696 static void unit_add_to_cgroup_queue(Unit *u) {
697
698 if (u->in_cgroup_queue)
699 return;
700
701 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
702 u->in_cgroup_queue = true;
703 }
704
705 unsigned manager_dispatch_cgroup_queue(Manager *m) {
706 ManagerState state;
707 unsigned n = 0;
708 Unit *i;
709 int r;
710
711 state = manager_state(m);
712
713 while ((i = m->cgroup_queue)) {
714 assert(i->in_cgroup_queue);
715
716 r = unit_realize_cgroup_now(i, state);
717 if (r < 0)
718 log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
719
720 n++;
721 }
722
723 return n;
724 }
725
726 static void unit_queue_siblings(Unit *u) {
727 Unit *slice;
728
729 /* This adds the siblings of the specified unit and the
730 * siblings of all parent units to the cgroup queue. (But
731 * neither the specified unit itself nor the parents.) */
732
733 while ((slice = UNIT_DEREF(u->slice))) {
734 Iterator i;
735 Unit *m;
736
737 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
738 if (m == u)
739 continue;
740
741 /* Skip units that have a dependency on the slice
742 * but aren't actually in it. */
743 if (UNIT_DEREF(m->slice) != slice)
744 continue;
745
746 /* No point in doing cgroup application for units
747 * without active processes. */
748 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
749 continue;
750
751 /* If the unit doesn't need any new controllers
752 * and has current ones realized, it doesn't need
753 * any changes. */
754 if (unit_has_mask_realized(m, unit_get_target_mask(m)))
755 continue;
756
757 unit_add_to_cgroup_queue(m);
758 }
759
760 u = slice;
761 }
762 }
763
764 int unit_realize_cgroup(Unit *u) {
765 CGroupContext *c;
766
767 assert(u);
768
769 c = unit_get_cgroup_context(u);
770 if (!c)
771 return 0;
772
773 /* So, here's the deal: when realizing the cgroups for this
774 * unit, we need to first create all parents, but there's more
775 * actually: for the weight-based controllers we also need to
776 * make sure that all our siblings (i.e. units that are in the
777 * same slice as we are) have cgroups, too. Otherwise, things
778 * would become very uneven as each of their processes would
779 * get as much resources as all our group together. This call
780 * will synchronously create the parent cgroups, but will
781 * defer work on the siblings to the next event loop
782 * iteration. */
783
784 /* Add all sibling slices to the cgroup queue. */
785 unit_queue_siblings(u);
786
787 /* And realize this one now (and apply the values) */
788 return unit_realize_cgroup_now(u, manager_state(u->manager));
789 }
790
791 void unit_destroy_cgroup(Unit *u) {
792 int r;
793
794 assert(u);
795
796 if (!u->cgroup_path)
797 return;
798
799 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
800 if (r < 0)
801 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
802
803 hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
804
805 free(u->cgroup_path);
806 u->cgroup_path = NULL;
807 u->cgroup_realized = false;
808 u->cgroup_realized_mask = 0;
809
810 }
811
812 pid_t unit_search_main_pid(Unit *u) {
813 _cleanup_fclose_ FILE *f = NULL;
814 pid_t pid = 0, npid, mypid;
815
816 assert(u);
817
818 if (!u->cgroup_path)
819 return 0;
820
821 if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
822 return 0;
823
824 mypid = getpid();
825 while (cg_read_pid(f, &npid) > 0) {
826 pid_t ppid;
827
828 if (npid == pid)
829 continue;
830
831 /* Ignore processes that aren't our kids */
832 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
833 continue;
834
835 if (pid != 0) {
836 /* Dang, there's more than one daemonized PID
837 in this group, so we don't know what process
838 is the main process. */
839 pid = 0;
840 break;
841 }
842
843 pid = npid;
844 }
845
846 return pid;
847 }
848
849 int manager_setup_cgroup(Manager *m) {
850 _cleanup_free_ char *path = NULL;
851 int r;
852
853 assert(m);
854
855 /* 1. Determine hierarchy */
856 free(m->cgroup_root);
857 m->cgroup_root = NULL;
858
859 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
860 if (r < 0) {
861 log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
862 return r;
863 }
864
865 /* LEGACY: Already in /system.slice? If so, let's cut this
866 * off. This is to support live upgrades from older systemd
867 * versions where PID 1 was moved there. */
868 if (m->running_as == SYSTEMD_SYSTEM) {
869 char *e;
870
871 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
872 if (!e)
873 e = endswith(m->cgroup_root, "/system");
874 if (e)
875 *e = 0;
876 }
877
878 /* And make sure to store away the root value without trailing
879 * slash, even for the root dir, so that we can easily prepend
880 * it everywhere. */
881 if (streq(m->cgroup_root, "/"))
882 m->cgroup_root[0] = 0;
883
884 /* 2. Show data */
885 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
886 if (r < 0) {
887 log_error("Cannot find cgroup mount point: %s", strerror(-r));
888 return r;
889 }
890
891 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
892 if (!m->test_run) {
893
894 /* 3. Install agent */
895 if (m->running_as == SYSTEMD_SYSTEM) {
896 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
897 if (r < 0)
898 log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
899 else if (r > 0)
900 log_debug("Installed release agent.");
901 else
902 log_debug("Release agent already installed.");
903 }
904
905 /* 4. Make sure we are in the root cgroup */
906 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
907 if (r < 0) {
908 log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
909 return r;
910 }
911
912 /* 5. And pin it, so that it cannot be unmounted */
913 safe_close(m->pin_cgroupfs_fd);
914
915 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
916 if (m->pin_cgroupfs_fd < 0) {
917 log_error("Failed to open pin file: %m");
918 return -errno;
919 }
920
921 /* 6. Always enable hierarchial support if it exists... */
922 cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
923 }
924
925 /* 7. Figure out which controllers are supported */
926 m->cgroup_supported = cg_mask_supported();
927
928 return 0;
929 }
930
931 void manager_shutdown_cgroup(Manager *m, bool delete) {
932 assert(m);
933
934 /* We can't really delete the group, since we are in it. But
935 * let's trim it. */
936 if (delete && m->cgroup_root)
937 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
938
939 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
940
941 free(m->cgroup_root);
942 m->cgroup_root = NULL;
943 }
944
945 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
946 char *p;
947 Unit *u;
948
949 assert(m);
950 assert(cgroup);
951
952 u = hashmap_get(m->cgroup_unit, cgroup);
953 if (u)
954 return u;
955
956 p = strdupa(cgroup);
957 for (;;) {
958 char *e;
959
960 e = strrchr(p, '/');
961 if (e == p || !e)
962 return NULL;
963
964 *e = 0;
965
966 u = hashmap_get(m->cgroup_unit, p);
967 if (u)
968 return u;
969 }
970 }
971
972 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
973 _cleanup_free_ char *cgroup = NULL;
974 int r;
975
976 assert(m);
977
978 if (pid <= 1)
979 return NULL;
980
981 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
982 if (r < 0)
983 return NULL;
984
985 return manager_get_unit_by_cgroup(m, cgroup);
986 }
987
988 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
989 Unit *u;
990 int r;
991
992 assert(m);
993 assert(cgroup);
994
995 u = manager_get_unit_by_cgroup(m, cgroup);
996 if (u) {
997 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
998 if (r > 0) {
999 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1000 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1001
1002 unit_add_to_gc_queue(u);
1003 }
1004 }
1005
1006 return 0;
1007 }
1008
1009 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1010 [CGROUP_AUTO] = "auto",
1011 [CGROUP_CLOSED] = "closed",
1012 [CGROUP_STRICT] = "strict",
1013 };
1014
1015 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);