/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "process-util.h"
#include "path-util.h"
#include "special.h"
#include "cgroup-util.h"
#include "cgroup.h"

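/* The CFS period we program is fixed at 100ms. In cgroup_context_apply()
 * the configured per-second quota is scaled to it as
 *
 *         cpu.cfs_quota_us = cpu_quota_per_sec_usec * period / USEC_PER_SEC
 *
 * so e.g. a quota of 200ms of CPU time per second comes out as 20ms of
 * runtime per 100ms period. */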
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = (unsigned long) -1;
        c->startup_cpu_shares = (unsigned long) -1;
        c->memory_limit = (uint64_t) -1;
        c->blockio_weight = (unsigned long) -1;
        c->startup_blockio_weight = (unsigned long) -1;

        c->cpu_quota_per_sec_usec = USEC_INFINITY;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sCPUShares=%lu\n"
                "%sStartupCPUShares=%lu\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sBlockIOWeight=%lu\n"
                "%sStartupBlockIOWeight=%lu\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %lu\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}

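/* Resolve a path to the block device backing it: if the path is a
 * block device node, use it directly; otherwise take the device of
 * the file system the file lives on, reduced to the whole disk if it
 * happens to be a partition. */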
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

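/* Add a single device node to the "devices" cgroup whitelist, by
 * writing an entry of the form "c MAJOR:MINOR rwm" (or "b ..." for
 * block devices) to devices.allow. */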
static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}

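/* Whitelist all devices sharing a major number: scan /proc/devices
 * for majors whose name matches the given fnmatch() pattern, and
 * write a "TYPE MAJOR:* acc" entry to devices.allow for each. */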
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}

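/* Write the attributes described by the CGroupContext to the kernel,
 * for every hierarchy selected in the mask. Attributes that don't
 * exist on the root cgroup are silently skipped there. */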
void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%lu\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
                        c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }

        if (mask & CGROUP_BLKIO) {
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
                                c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }

        if ((mask & CGROUP_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes on %s: %m", path);
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}

CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupControllerMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != (unsigned long) -1 ||
            c->startup_cpu_shares != (unsigned long) -1 ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_CPUACCT | CGROUP_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != (unsigned long) -1 ||
            c->startup_blockio_weight != (unsigned long) -1 ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_DEVICE;

        return mask;
}

CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
        CGroupContext *c;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroup
         * controllers, unless the process we fork into the cgroup is
         * known to drop privileges and hence shouldn't get access to
         * the controllers anyway. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e || exec_context_maintains_privileges(e))
                        return _CGROUP_CONTROLLER_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}

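/* Returns the combined controller mask needed by all units contained
 * in this slice; for non-slice units the mask is empty. The result is
 * cached in u->cgroup_members_mask and invalidated again by
 * unit_update_cgroup_members_masks() below. */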
CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

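/* Returns the controller mask needed by all units in the same slice
 * as this one (including the unit itself); at the top of the tree,
 * where there is no containing slice, fall back to the unit's own
 * mask plus that of its members. */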
CGroupControllerMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
}

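/* The mask the unit's cgroup should actually be realized with: its
 * own needs, its children's and its siblings', limited to the
 * controllers the kernel actually supports. */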
CGroupControllerMask unit_get_target_mask(Unit *u) {
        CGroupControllerMask mask;

        mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also a member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

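/* Callback for cg_migrate_everywhere(): returns the cgroup path of
 * the nearest ancestor (including the unit itself) that has been
 * realized with all controllers in the mask, so processes always have
 * a valid migration target even if the unit's own cgroup isn't ready
 * in some hierarchy. */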
static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = hashmap_put(u->manager->cgroup_unit, path, u);
                if (r < 0) {
                        log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
                        return r;
                }
                if (r > 0) {
                        u->cgroup_path = path;
                        path = NULL;
                }
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
        if (r < 0)
                return log_error_errno(r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_warning_errno(r, "Failed to migrate cgroup processes to %s: %m", u->cgroup_path);
        }

        return 0;
}

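/* Realize the unit's cgroup (if that hasn't happened yet) and attach
 * the PIDs recorded in u->pids to it, in all supported hierarchies. */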
int unit_attach_pids_to_cgroup(Unit *u) {
        int r;
        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == mask;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);

        return 0;
}

static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s: %m", i->id);

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as many resources as our whole group put together. This
         * call will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_destroy_cgroup_if_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s: %m", u->cgroup_path);
                return;
        }

        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

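/* Try to guess the main PID of a unit by scanning its cgroup: only
 * direct children of ours qualify, and if more than one candidate
 * remains we can't tell which is the main process and return 0. */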
pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what
                         * process is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == MANAGER_SYSTEM) {
                char *e;

                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
        if (!m->test_run) {

                /* 3. Install agent */
                if (m->running_as == MANAGER_SYSTEM) {
                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the root cgroup */
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to create root cgroup hierarchy: %m");

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);

                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        free(m->cgroup_root);
        m->cgroup_root = NULL;
}

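/* Map a cgroup path to the unit it belongs to: first try an exact
 * match in the cgroup_unit hashmap, then walk up the path one
 * component at a time, so that a process sitting in a subgroup is
 * still attributed to the unit owning the enclosing cgroup. */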
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (e == p || !e)
                        return NULL;

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        Unit *u;
        int r;

        assert(m);

        if (pid <= 1)
                return NULL;

        u = hashmap_get(m->watch_pids1, LONG_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, LONG_TO_PTR(pid));
        if (u)
                return u;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;
        int r;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r <= 0)
                return r;

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        unit_add_to_gc_queue(u);
        return 0;
}

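/* The accounting getters below read back a single attribute file from
 * the kernel and map "not available" (no cgroup realized, controller
 * not enabled, attribute missing) to -ENODATA, so that callers can
 * tell that apart from real errors. */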
int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MEMORY) == 0)
                return -ENODATA;

        r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}

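/* CPU usage is reported relative to u->cpuacct_usage_base, which
 * unit_reset_cpu_usage() latches to the current raw counter value;
 * this implements per-unit reset without touching the kernel's
 * monotonic counter. */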
int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0)
                return r;

        if (ns > u->cpuacct_usage_base)
                ns -= u->cpuacct_usage_base;
        else
                ns = 0;

        *ret = ns;
        return 0;
}

int unit_reset_cpu_usage(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpuacct_usage_base = 0;
                return r;
        }

        u->cpuacct_usage_base = ns;
        return 0;
}

bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);