/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "path-util.h"
#include "special.h"
#include "cgroup-util.h"
#include "cgroup.h"

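/* All CFS quota handling below is expressed relative to this fixed
 * scheduling period: cpu_quota_per_sec_usec holds the quota per second of
 * wall-clock time, and is scaled down to this period when written out,
 * i.e. quota = per_sec * period / USEC_PER_SEC. With illustrative numbers,
 * a 20% quota (200ms per second) becomes cpu.cfs_quota_us=20000 against
 * cpu.cfs_period_us=100000. */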
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = (unsigned long) -1;
        c->startup_cpu_shares = (unsigned long) -1;
        c->memory_limit = (uint64_t) -1;
        c->blockio_weight = (unsigned long) -1;
        c->startup_blockio_weight = (unsigned long) -1;

        c->cpu_quota_per_sec_usec = USEC_INFINITY;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sCPUShares=%lu\n"
                "%sStartupCPUShares=%lu\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sBlockIOWeight=%lu\n"
                "%sStartupBlockIOWeight=%lu\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %lu\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}

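/* Resolve a path to the block device major:minor the blkio controller
 * expects: a block device node is used as-is, while for a regular file or
 * directory we take the device backing its file system, resolving a
 * partition to the whole disk, so that an illustrative path on /dev/sda2
 * (8:2) would be accounted against the disk itself (8:0). */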
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

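/* Append one device node to the cgroup's device whitelist. The kernel's
 * devices controller takes entries of the form
 * "<c|b> <major>:<minor> <r|w|m...>", so for example (illustrative)
 * whitelisting /dev/null read-write would write "c 1:3 rw" to
 * devices.allow. */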
static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}

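/* Whitelist all devices sharing a major number, resolved from a driver
 * name (or fnmatch() pattern) via /proc/devices, which lists majors in
 * sections, roughly:
 *
 *   Character devices:
 *     1 mem
 *   136 pts
 *
 *   Block devices:
 *     8 sd
 *
 * (illustrative excerpt; contents vary by kernel). A match on "pts" would
 * then write e.g. "c 136:* rw" to devices.allow. */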
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}

void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%lu\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
                        c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }

        if (mask & CGROUP_BLKIO) {
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
                                c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }

        if ((mask & CGROUP_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes on %s: %m", path);
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}

CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupControllerMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != (unsigned long) -1 ||
            c->startup_cpu_shares != (unsigned long) -1 ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_CPUACCT | CGROUP_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != (unsigned long) -1 ||
            c->startup_blockio_weight != (unsigned long) -1 ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_DEVICE;

        return mask;
}

CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
        CGroupContext *c;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all controllers,
         * unless the process we fork into the cgroup is known to drop
         * privileges and hence shouldn't get access to the
         * controllers anyway. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e || exec_context_maintains_privileges(e))
                        return _CGROUP_CONTROLLER_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}

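/* The mask helpers below are related as follows: a unit's own mask is
 * whatever its cgroup context requires; its members mask is the union of
 * the masks of everything contained in it (only non-empty for slices);
 * its siblings mask is the members mask of its containing slice; and its
 * target mask is the union of all three, limited to what the kernel
 * actually supports. E.g. if a.service needs CPU and b.service needs
 * memory, and both sit in s.slice, the members mask of s.slice (and hence
 * the siblings mask of both services) covers CPU and memory. */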
CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupControllerMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
}

CGroupControllerMask unit_get_target_mask(Unit *u) {
        CGroupControllerMask mask;

        mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also a member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

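        /* "more" is true when the new subtree mask only adds bits over
         * the previous one and drops none; only in that case can we
         * simply OR the change into the parent's members mask instead
         * of invalidating it. */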
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

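/* Used as callback by cg_migrate_everywhere() and
 * cg_attach_many_everywhere() below: for a controller mask our own cgroup
 * isn't realized for, fall back to the cgroup of the nearest ancestor
 * slice that is realized for it. */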
static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = hashmap_put(u->manager->cgroup_unit, path, u);
                if (r < 0) {
                        log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
                        return r;
                }
                if (r > 0) {
                        u->cgroup_path = path;
                        path = NULL;
                }
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
        if (r < 0)
                return log_error_errno(r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_warning_errno(r, "Failed to migrate cgroup to %s: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;
        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == mask;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);

        return 0;
}

static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

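                /* unit_realize_cgroup_now() unlinks the unit from the
                 * queue (see above), hence this loop makes progress
                 * even if realization fails. */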
                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s: %m", i->id);

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as many resources as our whole group together. This
         * call will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_destroy_cgroup_if_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s: %m", u->cgroup_path);
                return;
        }

        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

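/* Guess the main PID of a unit by scanning its cgroup: we only accept a
 * process that was re-parented to us, i.e. a daemonized child of the
 * manager, and only if it is unambiguous; with two or more candidates we
 * give up and return 0. */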
pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == SYSTEMD_SYSTEM) {
                char *e;

                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
        if (!m->test_run) {

                /* 3. Install agent */
                if (m->running_as == SYSTEMD_SYSTEM) {
                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the root cgroup */
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to create root cgroup hierarchy: %m");

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);

                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        free(m->cgroup_root);
        m->cgroup_root = NULL;
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

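        /* No exact match: walk up the cgroup path one component at a
         * time and return the innermost unit whose cgroup is a prefix
         * of ours; e.g. (illustrative path) for
         * ".../foo.service/control" we would try ".../foo.service"
         * next. */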
        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (e == p || !e)
                        return NULL;

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 1)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;
        int r;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (u) {
                r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
                if (r > 0) {
                        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                                UNIT_VTABLE(u)->notify_cgroup_empty(u);

                        unit_add_to_gc_queue(u);
                }
        }

        return 0;
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);