]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/cgroup.c
cgroup: downgrade log messages about non-existant cgroup attributes to LOG_DEBUG
[thirdparty/systemd.git] / src / core / cgroup.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2013 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <fcntl.h>
23 #include <fnmatch.h>
24
25 #include "path-util.h"
26 #include "special.h"
27 #include "cgroup-util.h"
28 #include "cgroup.h"
29
30 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
31
32 void cgroup_context_init(CGroupContext *c) {
33 assert(c);
34
35 /* Initialize everything to the kernel defaults, assuming the
36 * structure is preinitialized to 0 */
37
38 c->cpu_shares = (unsigned long) -1;
39 c->startup_cpu_shares = (unsigned long) -1;
40 c->memory_limit = (uint64_t) -1;
41 c->blockio_weight = (unsigned long) -1;
42 c->startup_blockio_weight = (unsigned long) -1;
43
44 c->cpu_quota_per_sec_usec = USEC_INFINITY;
45 }
46
/* Unlinks one DeviceAllow= entry from the context's list and frees it. */
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}
55
/* Unlinks one BlockIODeviceWeight= entry from the context's list and frees it. */
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}
64
/* Unlinks one BlockIO{Read,Write}Bandwidth= entry from the context's list and frees it. */
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}
73
/* Releases all dynamically allocated lists hung off a CGroupContext.
 * The context structure itself is not freed (it is embedded in the
 * unit's exec context). */
void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}
86
87 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
88 CGroupBlockIODeviceBandwidth *b;
89 CGroupBlockIODeviceWeight *w;
90 CGroupDeviceAllow *a;
91 char u[FORMAT_TIMESPAN_MAX];
92
93 assert(c);
94 assert(f);
95
96 prefix = strempty(prefix);
97
98 fprintf(f,
99 "%sCPUAccounting=%s\n"
100 "%sBlockIOAccounting=%s\n"
101 "%sMemoryAccounting=%s\n"
102 "%sCPUShares=%lu\n"
103 "%sStartupCPUShares=%lu\n"
104 "%sCPUQuotaPerSecSec=%s\n"
105 "%sBlockIOWeight=%lu\n"
106 "%sStartupBlockIOWeight=%lu\n"
107 "%sMemoryLimit=%" PRIu64 "\n"
108 "%sDevicePolicy=%s\n",
109 prefix, yes_no(c->cpu_accounting),
110 prefix, yes_no(c->blockio_accounting),
111 prefix, yes_no(c->memory_accounting),
112 prefix, c->cpu_shares,
113 prefix, c->startup_cpu_shares,
114 prefix, strna(format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1)),
115 prefix, c->blockio_weight,
116 prefix, c->startup_blockio_weight,
117 prefix, c->memory_limit,
118 prefix, cgroup_device_policy_to_string(c->device_policy));
119
120 LIST_FOREACH(device_allow, a, c->device_allow)
121 fprintf(f,
122 "%sDeviceAllow=%s %s%s%s\n",
123 prefix,
124 a->path,
125 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
126
127 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
128 fprintf(f,
129 "%sBlockIODeviceWeight=%s %lu",
130 prefix,
131 w->path,
132 w->weight);
133
134 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
135 char buf[FORMAT_BYTES_MAX];
136
137 fprintf(f,
138 "%s%s=%s %s\n",
139 prefix,
140 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
141 b->path,
142 format_bytes(buf, sizeof(buf), b->bandwidth));
143 }
144 }
145
/* Resolves a path to the block device (dev_t) that blkio attributes
 * shall apply to. Accepts either a block device node directly, or an
 * ordinary file/directory, in which case the block device backing its
 * file system is used. Returns 0 on success, negative errno on error. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0) {
                log_warning("Couldn't stat device %s: %m", p);
                return -errno;
        }

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev); /* best-effort: on failure the partition device stays in place */
        } else {
                /* major 0 on st_dev means a virtual file system
                 * (tmpfs, proc, network fs, ...) with no local
                 * backing block device. */
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
176
177 static int whitelist_device(const char *path, const char *node, const char *acc) {
178 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
179 struct stat st;
180 int r;
181
182 assert(path);
183 assert(acc);
184
185 if (stat(node, &st) < 0) {
186 log_warning("Couldn't stat device %s", node);
187 return -errno;
188 }
189
190 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
191 log_warning("%s is not a device.", node);
192 return -ENODEV;
193 }
194
195 sprintf(buf,
196 "%c %u:%u %s",
197 S_ISCHR(st.st_mode) ? 'c' : 'b',
198 major(st.st_rdev), minor(st.st_rdev),
199 acc);
200
201 r = cg_set_attribute("devices", path, "devices.allow", buf);
202 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
203
204 return r;
205 }
206
207 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
208 _cleanup_fclose_ FILE *f = NULL;
209 char line[LINE_MAX];
210 bool good = false;
211 int r;
212
213 assert(path);
214 assert(acc);
215 assert(type == 'b' || type == 'c');
216
217 f = fopen("/proc/devices", "re");
218 if (!f) {
219 log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
220 return -errno;
221 }
222
223 FOREACH_LINE(line, f, goto fail) {
224 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
225 unsigned maj;
226
227 truncate_nl(line);
228
229 if (type == 'c' && streq(line, "Character devices:")) {
230 good = true;
231 continue;
232 }
233
234 if (type == 'b' && streq(line, "Block devices:")) {
235 good = true;
236 continue;
237 }
238
239 if (isempty(line)) {
240 good = false;
241 continue;
242 }
243
244 if (!good)
245 continue;
246
247 p = strstrip(line);
248
249 w = strpbrk(p, WHITESPACE);
250 if (!w)
251 continue;
252 *w = 0;
253
254 r = safe_atou(p, &maj);
255 if (r < 0)
256 continue;
257 if (maj <= 0)
258 continue;
259
260 w++;
261 w += strspn(w, WHITESPACE);
262
263 if (fnmatch(name, w, 0) != 0)
264 continue;
265
266 sprintf(buf,
267 "%c %u:* %s",
268 type,
269 maj,
270 acc);
271
272 r = cg_set_attribute("devices", path, "devices.allow", buf);
273 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
274 }
275
276 return 0;
277
278 fail:
279 log_warning("Failed to read /proc/devices: %m");
280 return -errno;
281 }
282
283 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
284 bool is_root;
285 int r;
286
287 assert(c);
288 assert(path);
289
290 if (mask == 0)
291 return;
292
293 /* Some cgroup attributes are not support on the root cgroup,
294 * hence silently ignore */
295 is_root = isempty(path) || path_equal(path, "/");
296
297 if ((mask & CGROUP_CPU) && !is_root) {
298 char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
299
300 sprintf(buf, "%lu\n",
301 state == MANAGER_STARTING && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
302 c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
303 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
304 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.shares on %s: %s", path, strerror(-r));
305
306 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
307 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
308 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));
309
310 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
311 sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
312 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
313 } else
314 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
315 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
316 }
317
318 if (mask & CGROUP_BLKIO) {
319 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
320 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
321 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
322 CGroupBlockIODeviceWeight *w;
323 CGroupBlockIODeviceBandwidth *b;
324
325 if (!is_root) {
326 sprintf(buf, "%lu\n", state == MANAGER_STARTING && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
327 c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
328 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
329 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight on %s: %s", path, strerror(-r));
330
331 /* FIXME: no way to reset this list */
332 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
333 dev_t dev;
334
335 r = lookup_blkio_device(w->path, &dev);
336 if (r < 0)
337 continue;
338
339 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
340 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
341 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
342 }
343 }
344
345 /* FIXME: no way to reset this list */
346 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
347 const char *a;
348 dev_t dev;
349
350 r = lookup_blkio_device(b->path, &dev);
351 if (r < 0)
352 continue;
353
354 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
355
356 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
357 r = cg_set_attribute("blkio", path, a, buf);
358 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set %s on %s: %s", a, path, strerror(-r));
359 }
360 }
361
362 if (mask & CGROUP_MEMORY) {
363 if (c->memory_limit != (uint64_t) -1) {
364 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
365
366 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
367 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
368 } else
369 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
370
371 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
372 }
373
374 if ((mask & CGROUP_DEVICE) && !is_root) {
375 CGroupDeviceAllow *a;
376
377 if (c->device_allow || c->device_policy != CGROUP_AUTO)
378 r = cg_set_attribute("devices", path, "devices.deny", "a");
379 else
380 r = cg_set_attribute("devices", path, "devices.allow", "a");
381 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to reset devices.list on %s: %s", path, strerror(-r));
382
383 if (c->device_policy == CGROUP_CLOSED ||
384 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
385 static const char auto_devices[] =
386 "/dev/null\0" "rwm\0"
387 "/dev/zero\0" "rwm\0"
388 "/dev/full\0" "rwm\0"
389 "/dev/random\0" "rwm\0"
390 "/dev/urandom\0" "rwm\0"
391 "/dev/tty\0" "rwm\0"
392 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
393
394 const char *x, *y;
395
396 NULSTR_FOREACH_PAIR(x, y, auto_devices)
397 whitelist_device(path, x, y);
398
399 whitelist_major(path, "pts", 'c', "rw");
400 whitelist_major(path, "kdbus", 'c', "rw");
401 whitelist_major(path, "kdbus/*", 'c', "rw");
402 }
403
404 LIST_FOREACH(device_allow, a, c->device_allow) {
405 char acc[4];
406 unsigned k = 0;
407
408 if (a->r)
409 acc[k++] = 'r';
410 if (a->w)
411 acc[k++] = 'w';
412 if (a->m)
413 acc[k++] = 'm';
414
415 if (k == 0)
416 continue;
417
418 acc[k++] = 0;
419
420 if (startswith(a->path, "/dev/"))
421 whitelist_device(path, a->path, acc);
422 else if (startswith(a->path, "block-"))
423 whitelist_major(path, a->path + 6, 'b', acc);
424 else if (startswith(a->path, "char-"))
425 whitelist_major(path, a->path + 5, 'c', acc);
426 else
427 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
428 }
429 }
430 }
431
432 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
433 CGroupControllerMask mask = 0;
434
435 /* Figure out which controllers we need */
436
437 if (c->cpu_accounting ||
438 c->cpu_shares != (unsigned long) -1 ||
439 c->startup_cpu_shares != (unsigned long) -1 ||
440 c->cpu_quota_per_sec_usec != USEC_INFINITY)
441 mask |= CGROUP_CPUACCT | CGROUP_CPU;
442
443 if (c->blockio_accounting ||
444 c->blockio_weight != (unsigned long) -1 ||
445 c->startup_blockio_weight != (unsigned long) -1 ||
446 c->blockio_device_weights ||
447 c->blockio_device_bandwidths)
448 mask |= CGROUP_BLKIO;
449
450 if (c->memory_accounting ||
451 c->memory_limit != (uint64_t) -1)
452 mask |= CGROUP_MEMORY;
453
454 if (c->device_allow || c->device_policy != CGROUP_AUTO)
455 mask |= CGROUP_DEVICE;
456
457 return mask;
458 }
459
460 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
461 CGroupContext *c;
462
463 c = unit_get_cgroup_context(u);
464 if (!c)
465 return 0;
466
467 return cgroup_context_get_mask(c);
468 }
469
/* Returns the combined controller mask needed by all units contained
 * in this unit (only slices have members). The result is cached in
 * u->cgroup_members_mask until explicitly invalidated. */
CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Fast path: reuse the cached value if it is still valid */
        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                /* Members of a slice are ordered Before= it, hence
                 * iterate the UNIT_BEFORE dependency set. */
                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        /* Skip units that merely have an ordering
                         * dependency but are not actually placed in
                         * this slice. */
                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        /* Accumulate the member's own needs plus,
                         * recursively, those of its members (for
                         * nested slices). */
                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
499
500 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
501 assert(u);
502
503 if (UNIT_ISSET(u->slice))
504 return unit_get_members_mask(UNIT_DEREF(u->slice));
505
506 return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
507 }
508
509 CGroupControllerMask unit_get_target_mask(Unit *u) {
510 CGroupControllerMask mask;
511
512 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
513 mask &= u->manager->cgroup_supported;
514
515 return mask;
516 }
517
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        /* "more" is true only when bits were strictly added relative
         * to the previous valid mask and none were removed; only then
         * can the parent's cached mask be widened in place. */
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
566
567 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
568 Unit *u = userdata;
569
570 assert(mask != 0);
571 assert(u);
572
573 while (u) {
574 if (u->cgroup_path &&
575 u->cgroup_realized &&
576 (u->cgroup_realized_mask & mask) == mask)
577 return u->cgroup_path;
578
579 u = UNIT_DEREF(u->slice);
580 }
581
582 return NULL;
583 }
584
585 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
586 _cleanup_free_ char *path = NULL;
587 int r;
588
589 assert(u);
590
591 path = unit_default_cgroup_path(u);
592 if (!path)
593 return log_oom();
594
595 r = hashmap_put(u->manager->cgroup_unit, path, u);
596 if (r < 0) {
597 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
598 return r;
599 }
600 if (r > 0) {
601 u->cgroup_path = path;
602 path = NULL;
603 }
604
605 /* First, create our own group */
606 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
607 if (r < 0) {
608 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
609 return r;
610 }
611
612 /* Keep track that this is now realized */
613 u->cgroup_realized = true;
614 u->cgroup_realized_mask = mask;
615
616 /* Then, possibly move things over */
617 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
618 if (r < 0)
619 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
620
621 return 0;
622 }
623
624 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
625 assert(u);
626
627 return u->cgroup_realized && u->cgroup_realized_mask == mask;
628 }
629
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        /* Drop the unit from the pending queue, since we are handling
         * it now — manager_dispatch_cgroup_queue() relies on this. */
        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        /* Already realized with exactly the needed controllers? */
        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);

        return 0;
}
669
670 static void unit_add_to_cgroup_queue(Unit *u) {
671
672 if (u->in_cgroup_queue)
673 return;
674
675 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
676 u->in_cgroup_queue = true;
677 }
678
/* Realizes the cgroups of all units currently sitting in the
 * manager's cgroup queue. Returns the number of units processed. */
unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                /* unit_realize_cgroup_now() removes the unit from the
                 * queue as its first step, so this loop terminates. */
                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));

                n++;
        }

        return n;
}
699
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                /* Siblings are the members of our slice, i.e. units
                 * ordered Before= it. */
                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                /* Move up one level and queue that slice's siblings too */
                u = slice;
        }
}
737
/* Public entry point for realizing a unit's cgroup: synchronously
 * creates the unit's (and its parents') cgroups and applies the
 * configured attributes, while deferring sibling realization to a
 * later event loop iteration. Returns 0 on success or if the unit
 * has no cgroup context, < 0 on failure. */
int unit_realize_cgroup(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
764
/* Trims the unit's cgroup in all hierarchies (deleting it unless this
 * is the root slice), unregisters it from the manager's map, and
 * resets the unit's realization state. */
void unit_destroy_cgroup(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0)
                log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));

        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;

}
785
/* Scans the unit's cgroup for a process that qualifies as its "main"
 * process: a direct child of the manager that is the only such
 * process in the group. Returns its PID, or 0 if there is none or
 * the choice is ambiguous. */
pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0)  {
                pid_t ppid;

                /* Skip repeated entries for the current candidate */
                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                        in this group, so we don't know what process
                        is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}
822
/* One-time cgroup setup for the manager: determines the root cgroup
 * we run in, installs the release agent (system instance only), moves
 * us to the root cgroup, pins the cgroupfs mount, and probes which
 * controllers the kernel supports. Returns 0 on success, < 0 on
 * fatal failure. */
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0) {
                log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
                return r;
        }

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == SYSTEMD_SYSTEM) {
                char *e;

                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0) {
                log_error("Cannot find cgroup mount point: %s", strerror(-r));
                return r;
        }

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
        if (!m->test_run) {

                /* 3. Install agent */
                if (m->running_as == SYSTEMD_SYSTEM) {
                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the root cgroup */
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
                if (r < 0) {
                        log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
                        return r;
                }

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);

                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0) {
                        log_error("Failed to open pin file: %m");
                        return -errno;
                }

                /* 6. Always enable hierarchial support if it exists... */
                /* best-effort: older kernels may lack this attribute */
                cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        return 0;
}
904
/* Counterpart to manager_setup_cgroup(): optionally trims our root
 * cgroup, releases the cgroupfs pin fd, and frees the stored root
 * path. */
void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        free(m->cgroup_root);
        m->cgroup_root = NULL;
}
918
/* Maps a cgroup path to the unit it belongs to. If no unit is
 * registered for the exact path, walks up the cgroup hierarchy until
 * a registered prefix is found (processes may live in sub-cgroups of
 * a unit's own group). Returns NULL if nothing matches. */
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        /* Work on a stack copy we can truncate in place */
        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (e == p || !e)
                        return NULL; /* reached the root without a match */

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
945
/* Maps a PID to the unit whose cgroup it runs in. Returns NULL for
 * invalid PIDs, for PID 1 (the manager itself), and on lookup
 * failure. */
Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 1)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}
961
/* Called when the kernel (via the release agent or D-Bus) reports a
 * cgroup as empty: verifies the unit's cgroup really is empty
 * recursively, and if so lets the unit type react and queues the
 * unit for garbage collection. Always returns 0. */
int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;
        int r;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (u) {
                /* Re-check: the notification may be stale and new
                 * processes may have been spawned meanwhile. */
                r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
                if (r > 0) {
                        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                                UNIT_VTABLE(u)->notify_cgroup_empty(u);

                        unit_add_to_gc_queue(u);
                }
        }

        return 0;
}
982
/* Mapping between CGroupDevicePolicy enum values and their
 * DevicePolicy= configuration string representations; the lookup
 * functions cgroup_device_policy_{to,from}_string() are generated by
 * the macro below. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);