]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/cgroup.c
core: require cgroups filesystem to be available
[thirdparty/systemd.git] / src / core / cgroup.c
CommitLineData
d6c9574f 1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
8e274523
LP
2
3/***
4 This file is part of systemd.
5
4ad49000 6 Copyright 2013 Lennart Poettering
8e274523
LP
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
8e274523
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
8e274523 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
8e274523
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
c6c18be3 22#include <fcntl.h>
e41969e3 23#include <fnmatch.h>
8c6db833 24
9eb977db 25#include "path-util.h"
9444b1f2 26#include "special.h"
4ad49000
LP
27#include "cgroup-util.h"
28#include "cgroup.h"
8e274523 29
4ad49000
LP
30void cgroup_context_init(CGroupContext *c) {
31 assert(c);
32
33 /* Initialize everything to the kernel defaults, assuming the
34 * structure is preinitialized to 0 */
35
36 c->cpu_shares = 1024;
ddca82ac 37 c->memory_limit = (uint64_t) -1;
4ad49000 38 c->blockio_weight = 1000;
b2f8b02e
LP
39
40 c->cpu_quota_per_sec_usec = (usec_t) -1;
41 c->cpu_quota_usec = (usec_t) -1;
42 c->cpu_quota_period_usec = 100*USEC_PER_MSEC;
4ad49000 43}
8e274523 44
4ad49000
LP
45void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
46 assert(c);
47 assert(a);
48
71fda00f 49 LIST_REMOVE(device_allow, c->device_allow, a);
4ad49000
LP
50 free(a->path);
51 free(a);
52}
53
54void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
55 assert(c);
56 assert(w);
57
71fda00f 58 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
4ad49000
LP
59 free(w->path);
60 free(w);
61}
62
63void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
64 assert(c);
8e274523 65 assert(b);
8e274523 66
71fda00f 67 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
4ad49000
LP
68 free(b->path);
69 free(b);
70}
71
72void cgroup_context_done(CGroupContext *c) {
73 assert(c);
74
75 while (c->blockio_device_weights)
76 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
77
78 while (c->blockio_device_bandwidths)
79 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
80
81 while (c->device_allow)
82 cgroup_context_free_device_allow(c, c->device_allow);
83}
84
b2f8b02e
LP
85usec_t cgroup_context_get_cpu_quota_usec(CGroupContext *c) {
86 assert(c);
87
88 /* Returns the absolute CPU quota */
89
90 if (c->cpu_quota_usec != (usec_t) -1)
91 return c->cpu_quota_usec;
92 else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
93 return c->cpu_quota_per_sec_usec*c->cpu_quota_period_usec/USEC_PER_SEC;
94 else
95 return (usec_t) -1;
96}
97
98usec_t cgroup_context_get_cpu_quota_per_sec_usec(CGroupContext *c) {
99 assert(c);
100
101 /* Returns the CPU quota relative to 1s */
102
103 if (c->cpu_quota_usec != (usec_t) -1)
104 return c->cpu_quota_usec*USEC_PER_SEC/c->cpu_quota_period_usec;
105 else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
106 return c->cpu_quota_per_sec_usec;
107 else
108 return (usec_t) -1;
109}
110
4ad49000
LP
111void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
112 CGroupBlockIODeviceBandwidth *b;
113 CGroupBlockIODeviceWeight *w;
114 CGroupDeviceAllow *a;
b2f8b02e 115 char t[FORMAT_TIMESPAN_MAX], s[FORMAT_TIMESPAN_MAX], u[FORMAT_TIMESPAN_MAX];
4ad49000
LP
116
117 assert(c);
118 assert(f);
119
120 prefix = strempty(prefix);
121
122 fprintf(f,
123 "%sCPUAccounting=%s\n"
124 "%sBlockIOAccounting=%s\n"
125 "%sMemoryAccounting=%s\n"
126 "%sCPUShares=%lu\n"
b2f8b02e
LP
127 "%sCPUQuota=%s\n"
128 "%sCPUQuotaPerSecSec=%s\n"
129 "%sCPUQuotaPeriodSec=%s\n"
112a7f46 130 "%sBlockIOWeight=%lu\n"
4ad49000 131 "%sMemoryLimit=%" PRIu64 "\n"
4ad49000
LP
132 "%sDevicePolicy=%s\n",
133 prefix, yes_no(c->cpu_accounting),
134 prefix, yes_no(c->blockio_accounting),
135 prefix, yes_no(c->memory_accounting),
136 prefix, c->cpu_shares,
b2f8b02e
LP
137 prefix, strna(format_timespan(u, sizeof(u), cgroup_context_get_cpu_quota_usec(c), 1)),
138 prefix, strna(format_timespan(t, sizeof(t), cgroup_context_get_cpu_quota_per_sec_usec(c), 1)),
139 prefix, strna(format_timespan(s, sizeof(s), c->cpu_quota_period_usec, 1)),
4ad49000
LP
140 prefix, c->blockio_weight,
141 prefix, c->memory_limit,
4ad49000
LP
142 prefix, cgroup_device_policy_to_string(c->device_policy));
143
144 LIST_FOREACH(device_allow, a, c->device_allow)
145 fprintf(f,
146 "%sDeviceAllow=%s %s%s%s\n",
147 prefix,
148 a->path,
149 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
150
151 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
152 fprintf(f,
8e7076ca 153 "%sBlockIODeviceWeight=%s %lu",
4ad49000
LP
154 prefix,
155 w->path,
156 w->weight);
157
158 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
159 char buf[FORMAT_BYTES_MAX];
160
161 fprintf(f,
162 "%s%s=%s %s\n",
163 prefix,
164 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
165 b->path,
166 format_bytes(buf, sizeof(buf), b->bandwidth));
167 }
168}
169
/* Resolve path p to a block device number, stored in *dev.
 *
 * If p is a block device node, its own device number is used.
 * Otherwise the device the file lives on is used, provided its major
 * number is non-zero; block_get_whole_disk() is then called on it to
 * try to replace a partition with its originating disk.
 *
 * Returns 0 on success, -errno if stat() fails, and -ENODEV if no
 * backing block device can be determined. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;

        assert(p);
        assert(dev);

        if (stat(p, &st) < 0) {
                log_warning("Couldn't stat device %s: %m", p);
                return -errno;
        }

        /* A device node: take it as is */
        if (S_ISBLK(st.st_mode)) {
                *dev = st.st_rdev;
                return 0;
        }

        if (major(st.st_dev) == 0) {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        /* Not a device node: use the block device this file is
         * stored on ... */
        *dev = st.st_dev;

        /* ... and, if that is a partition, try to get the originating
         * block device. The result is deliberately not checked. */
        block_get_whole_disk(*dev, dev);

        return 0;
}
200
4ad49000
LP
201static int whitelist_device(const char *path, const char *node, const char *acc) {
202 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
203 struct stat st;
8c6db833 204 int r;
8e274523 205
4ad49000
LP
206 assert(path);
207 assert(acc);
8e274523 208
4ad49000
LP
209 if (stat(node, &st) < 0) {
210 log_warning("Couldn't stat device %s", node);
211 return -errno;
212 }
213
214 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
215 log_warning("%s is not a device.", node);
216 return -ENODEV;
217 }
218
219 sprintf(buf,
220 "%c %u:%u %s",
221 S_ISCHR(st.st_mode) ? 'c' : 'b',
222 major(st.st_rdev), minor(st.st_rdev),
223 acc);
224
225 r = cg_set_attribute("devices", path, "devices.allow", buf);
226 if (r < 0)
227 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
228
229 return r;
8e274523
LP
230}
231
90060676
LP
232static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
233 _cleanup_fclose_ FILE *f = NULL;
234 char line[LINE_MAX];
235 bool good = false;
236 int r;
237
238 assert(path);
239 assert(acc);
240 assert(type == 'b' || type == 'c');
241
242 f = fopen("/proc/devices", "re");
243 if (!f) {
244 log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
245 return -errno;
246 }
247
248 FOREACH_LINE(line, f, goto fail) {
249 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
250 unsigned maj;
251
252 truncate_nl(line);
253
254 if (type == 'c' && streq(line, "Character devices:")) {
255 good = true;
256 continue;
257 }
258
259 if (type == 'b' && streq(line, "Block devices:")) {
260 good = true;
261 continue;
262 }
263
264 if (isempty(line)) {
265 good = false;
266 continue;
267 }
268
269 if (!good)
270 continue;
271
272 p = strstrip(line);
273
274 w = strpbrk(p, WHITESPACE);
275 if (!w)
276 continue;
277 *w = 0;
278
279 r = safe_atou(p, &maj);
280 if (r < 0)
281 continue;
282 if (maj <= 0)
283 continue;
284
285 w++;
286 w += strspn(w, WHITESPACE);
e41969e3
LP
287
288 if (fnmatch(name, w, 0) != 0)
90060676
LP
289 continue;
290
291 sprintf(buf,
292 "%c %u:* %s",
293 type,
294 maj,
295 acc);
296
297 r = cg_set_attribute("devices", path, "devices.allow", buf);
298 if (r < 0)
299 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
300 }
301
302 return 0;
303
304fail:
305 log_warning("Failed to read /proc/devices: %m");
306 return -errno;
307}
308
4ad49000 309void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path) {
01efdf13 310 bool is_root;
4ad49000
LP
311 int r;
312
313 assert(c);
314 assert(path);
8e274523 315
4ad49000
LP
316 if (mask == 0)
317 return;
8e274523 318
01efdf13
LP
319 /* Some cgroup attributes are not support on the root cgroup,
320 * hence silently ignore */
321 is_root = isempty(path) || path_equal(path, "/");
322
323 if ((mask & CGROUP_CPU) && !is_root) {
b2f8b02e
LP
324 char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
325 usec_t q;
8e274523 326
4ad49000
LP
327 sprintf(buf, "%lu\n", c->cpu_shares);
328 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
329 if (r < 0)
330 log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));
b2f8b02e
LP
331
332 sprintf(buf, USEC_FMT "\n", c->cpu_quota_period_usec);
333 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
334 if (r < 0)
335 log_warning("Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));
336
337 q = cgroup_context_get_cpu_quota_usec(c);
338 if (q != (usec_t) -1) {
339 sprintf(buf, USEC_FMT "\n", q);
340 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
341 } else
342 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
343 if (r < 0)
344 log_warning("Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
4ad49000
LP
345 }
346
347 if (mask & CGROUP_BLKIO) {
348 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
349 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
350 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
351 CGroupBlockIODeviceWeight *w;
352 CGroupBlockIODeviceBandwidth *b;
353
01efdf13
LP
354 if (!is_root) {
355 sprintf(buf, "%lu\n", c->blockio_weight);
356 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
357 if (r < 0)
358 log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));
4ad49000 359
01efdf13
LP
360 /* FIXME: no way to reset this list */
361 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
362 dev_t dev;
4ad49000 363
01efdf13
LP
364 r = lookup_blkio_device(w->path, &dev);
365 if (r < 0)
366 continue;
8e274523 367
01efdf13
LP
368 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
369 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
370 if (r < 0)
371 log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
372 }
4ad49000
LP
373 }
374
375 /* FIXME: no way to reset this list */
376 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
377 const char *a;
378 dev_t dev;
379
380 r = lookup_blkio_device(b->path, &dev);
381 if (r < 0)
382 continue;
383
384 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
385
386 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
387 r = cg_set_attribute("blkio", path, a, buf);
388 if (r < 0)
389 log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
d686d8a9 390 }
8e274523
LP
391 }
392
4ad49000 393 if (mask & CGROUP_MEMORY) {
6a94f2e9 394 if (c->memory_limit != (uint64_t) -1) {
e58cec11
LP
395 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
396
6a94f2e9
G
397 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
398 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
399 } else
400 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
8e274523 401
4ad49000
LP
402 if (r < 0)
403 log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
4ad49000 404 }
8e274523 405
01efdf13 406 if ((mask & CGROUP_DEVICE) && !is_root) {
4ad49000 407 CGroupDeviceAllow *a;
8e274523 408
4ad49000
LP
409 if (c->device_allow || c->device_policy != CGROUP_AUTO)
410 r = cg_set_attribute("devices", path, "devices.deny", "a");
411 else
412 r = cg_set_attribute("devices", path, "devices.allow", "a");
413 if (r < 0)
01efdf13 414 log_warning("Failed to reset devices.list on %s: %s", path, strerror(-r));
fb385181 415
4ad49000
LP
416 if (c->device_policy == CGROUP_CLOSED ||
417 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
418 static const char auto_devices[] =
7d711efb
LP
419 "/dev/null\0" "rwm\0"
420 "/dev/zero\0" "rwm\0"
421 "/dev/full\0" "rwm\0"
422 "/dev/random\0" "rwm\0"
423 "/dev/urandom\0" "rwm\0"
424 "/dev/tty\0" "rwm\0"
425 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
4ad49000
LP
426
427 const char *x, *y;
428
429 NULSTR_FOREACH_PAIR(x, y, auto_devices)
430 whitelist_device(path, x, y);
7d711efb
LP
431
432 whitelist_major(path, "pts", 'c', "rw");
433 whitelist_major(path, "kdbus", 'c', "rw");
434 whitelist_major(path, "kdbus/*", 'c', "rw");
4ad49000
LP
435 }
436
437 LIST_FOREACH(device_allow, a, c->device_allow) {
438 char acc[4];
439 unsigned k = 0;
440
441 if (a->r)
442 acc[k++] = 'r';
443 if (a->w)
444 acc[k++] = 'w';
445 if (a->m)
446 acc[k++] = 'm';
fb385181 447
4ad49000
LP
448 if (k == 0)
449 continue;
fb385181 450
4ad49000 451 acc[k++] = 0;
90060676
LP
452
453 if (startswith(a->path, "/dev/"))
454 whitelist_device(path, a->path, acc);
455 else if (startswith(a->path, "block-"))
456 whitelist_major(path, a->path + 6, 'b', acc);
457 else if (startswith(a->path, "char-"))
458 whitelist_major(path, a->path + 5, 'c', acc);
459 else
460 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
4ad49000
LP
461 }
462 }
fb385181
LP
463}
464
4ad49000
LP
465CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
466 CGroupControllerMask mask = 0;
8e274523 467
4ad49000 468 /* Figure out which controllers we need */
8e274523 469
b2f8b02e
LP
470 if (c->cpu_accounting ||
471 c->cpu_shares != 1024 ||
472 c->cpu_quota_usec != (usec_t) -1 ||
473 c->cpu_quota_per_sec_usec != (usec_t) -1)
4ad49000 474 mask |= CGROUP_CPUACCT | CGROUP_CPU;
ecedd90f 475
4ad49000
LP
476 if (c->blockio_accounting ||
477 c->blockio_weight != 1000 ||
478 c->blockio_device_weights ||
479 c->blockio_device_bandwidths)
480 mask |= CGROUP_BLKIO;
ecedd90f 481
4ad49000 482 if (c->memory_accounting ||
ddca82ac 483 c->memory_limit != (uint64_t) -1)
4ad49000 484 mask |= CGROUP_MEMORY;
8e274523 485
4ad49000
LP
486 if (c->device_allow || c->device_policy != CGROUP_AUTO)
487 mask |= CGROUP_DEVICE;
488
489 return mask;
8e274523
LP
490}
491
bc432dc7 492CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
4ad49000 493 CGroupContext *c;
8e274523 494
4ad49000
LP
495 c = unit_get_cgroup_context(u);
496 if (!c)
497 return 0;
8e274523 498
4ad49000 499 return cgroup_context_get_mask(c);
8e274523
LP
500}
501
bc432dc7 502CGroupControllerMask unit_get_members_mask(Unit *u) {
4ad49000 503 assert(u);
bc432dc7
LP
504
505 if (u->cgroup_members_mask_valid)
506 return u->cgroup_members_mask;
507
508 u->cgroup_members_mask = 0;
509
510 if (u->type == UNIT_SLICE) {
511 Unit *member;
512 Iterator i;
513
514 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
515
516 if (member == u)
517 continue;
518
d4fdc205 519 if (UNIT_DEREF(member->slice) != u)
bc432dc7
LP
520 continue;
521
522 u->cgroup_members_mask |=
523 unit_get_cgroup_mask(member) |
524 unit_get_members_mask(member);
525 }
526 }
527
528 u->cgroup_members_mask_valid = true;
6414b7c9 529 return u->cgroup_members_mask;
246aa6dd
LP
530}
531
bc432dc7
LP
532CGroupControllerMask unit_get_siblings_mask(Unit *u) {
533 CGroupControllerMask m;
534
4ad49000 535 assert(u);
246aa6dd 536
bc432dc7
LP
537 if (UNIT_ISSET(u->slice))
538 m = unit_get_members_mask(UNIT_DEREF(u->slice));
539 else
540 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
4ad49000
LP
541
542 /* Sibling propagation is only relevant for weight-based
543 * controllers, so let's mask out everything else */
bc432dc7 544 return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
246aa6dd
LP
545}
546
bc432dc7 547CGroupControllerMask unit_get_target_mask(Unit *u) {
6414b7c9
DS
548 CGroupControllerMask mask;
549
550 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
551 mask &= u->manager->cgroup_supported;
552
553 return mask;
554}
555
556/* Recurse from a unit up through its containing slices, propagating
557 * mask bits upward. A unit is also member of itself. */
bc432dc7
LP
558void unit_update_cgroup_members_masks(Unit *u) {
559 CGroupControllerMask m;
560 bool more;
561
562 assert(u);
563
564 /* Calculate subtree mask */
565 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
566
567 /* See if anything changed from the previous invocation. If
568 * not, we're done. */
569 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
570 return;
571
572 more =
573 u->cgroup_subtree_mask_valid &&
574 ((m & ~u->cgroup_subtree_mask) != 0) &&
575 ((~m & u->cgroup_subtree_mask) == 0);
576
577 u->cgroup_subtree_mask = m;
578 u->cgroup_subtree_mask_valid = true;
579
6414b7c9
DS
580 if (UNIT_ISSET(u->slice)) {
581 Unit *s = UNIT_DEREF(u->slice);
bc432dc7
LP
582
583 if (more)
584 /* There's more set now than before. We
585 * propagate the new mask to the parent's mask
586 * (not caring if it actually was valid or
587 * not). */
588
589 s->cgroup_members_mask |= m;
590
591 else
592 /* There's less set now than before (or we
593 * don't know), we need to recalculate
594 * everything, so let's invalidate the
595 * parent's members mask */
596
597 s->cgroup_members_mask_valid = false;
598
599 /* And now make sure that this change also hits our
600 * grandparents */
601 unit_update_cgroup_members_masks(s);
6414b7c9
DS
602 }
603}
604
03b90d4b
LP
605static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
606 Unit *u = userdata;
607
608 assert(mask != 0);
609 assert(u);
610
611 while (u) {
612 if (u->cgroup_path &&
613 u->cgroup_realized &&
614 (u->cgroup_realized_mask & mask) == mask)
615 return u->cgroup_path;
616
617 u = UNIT_DEREF(u->slice);
618 }
619
620 return NULL;
621}
622
4ad49000 623static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
03b90d4b 624 _cleanup_free_ char *path = NULL;
bc432dc7 625 int r;
64747e2d 626
4ad49000 627 assert(u);
64747e2d 628
4ad49000
LP
629 path = unit_default_cgroup_path(u);
630 if (!path)
a94042fa 631 return log_oom();
64747e2d 632
0a1eb06d 633 r = hashmap_put(u->manager->cgroup_unit, path, u);
03b90d4b
LP
634 if (r < 0) {
635 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
0a1eb06d 636 return r;
b58b8e11 637 }
03b90d4b 638 if (r > 0) {
b58b8e11 639 u->cgroup_path = path;
a94042fa 640 path = NULL;
b58b8e11
HH
641 }
642
03b90d4b
LP
643 /* First, create our own group */
644 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
645 if (r < 0) {
646 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
647 return r;
648 }
649
650 /* Keep track that this is now realized */
4ad49000 651 u->cgroup_realized = true;
bc432dc7 652 u->cgroup_realized_mask = mask;
4ad49000 653
03b90d4b
LP
654 /* Then, possibly move things over */
655 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
656 if (r < 0)
657 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
658
64747e2d
LP
659 return 0;
660}
661
6414b7c9 662static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
bc432dc7
LP
663 assert(u);
664
665 return u->cgroup_realized && u->cgroup_realized_mask == mask;
6414b7c9
DS
666}
667
668/* Check if necessary controllers and attributes for a unit are in place.
669 *
670 * If so, do nothing.
671 * If not, create paths, move processes over, and set attributes.
672 *
673 * Returns 0 on success and < 0 on failure. */
0a1eb06d 674static int unit_realize_cgroup_now(Unit *u) {
4ad49000 675 CGroupControllerMask mask;
6414b7c9 676 int r;
64747e2d 677
4ad49000 678 assert(u);
64747e2d 679
4ad49000 680 if (u->in_cgroup_queue) {
71fda00f 681 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
4ad49000
LP
682 u->in_cgroup_queue = false;
683 }
64747e2d 684
6414b7c9 685 mask = unit_get_target_mask(u);
64747e2d 686
6414b7c9 687 if (unit_has_mask_realized(u, mask))
0a1eb06d 688 return 0;
64747e2d 689
4ad49000 690 /* First, realize parents */
6414b7c9
DS
691 if (UNIT_ISSET(u->slice)) {
692 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice));
693 if (r < 0)
694 return r;
695 }
4ad49000
LP
696
697 /* And then do the real work */
6414b7c9
DS
698 r = unit_create_cgroups(u, mask);
699 if (r < 0)
700 return r;
701
702 /* Finally, apply the necessary attributes. */
703 cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path);
704
705 return 0;
64747e2d
LP
706}
707
4ad49000 708static void unit_add_to_cgroup_queue(Unit *u) {
ecedd90f 709
4ad49000
LP
710 if (u->in_cgroup_queue)
711 return;
8e274523 712
71fda00f 713 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
4ad49000
LP
714 u->in_cgroup_queue = true;
715}
8c6db833 716
4ad49000
LP
717unsigned manager_dispatch_cgroup_queue(Manager *m) {
718 Unit *i;
719 unsigned n = 0;
6414b7c9 720 int r;
ecedd90f 721
4ad49000
LP
722 while ((i = m->cgroup_queue)) {
723 assert(i->in_cgroup_queue);
ecedd90f 724
6414b7c9
DS
725 r = unit_realize_cgroup_now(i);
726 if (r < 0)
727 log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
0a1eb06d 728
4ad49000
LP
729 n++;
730 }
ecedd90f 731
4ad49000 732 return n;
8e274523
LP
733}
734
4ad49000
LP
735static void unit_queue_siblings(Unit *u) {
736 Unit *slice;
ca949c9d 737
4ad49000
LP
738 /* This adds the siblings of the specified unit and the
739 * siblings of all parent units to the cgroup queue. (But
740 * neither the specified unit itself nor the parents.) */
741
742 while ((slice = UNIT_DEREF(u->slice))) {
743 Iterator i;
744 Unit *m;
8f53a7b8 745
4ad49000
LP
746 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
747 if (m == u)
748 continue;
8e274523 749
6414b7c9
DS
750 /* Skip units that have a dependency on the slice
751 * but aren't actually in it. */
4ad49000 752 if (UNIT_DEREF(m->slice) != slice)
50159e6a 753 continue;
8e274523 754
6414b7c9
DS
755 /* No point in doing cgroup application for units
756 * without active processes. */
757 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
758 continue;
759
760 /* If the unit doesn't need any new controllers
761 * and has current ones realized, it doesn't need
762 * any changes. */
763 if (unit_has_mask_realized(m, unit_get_target_mask(m)))
764 continue;
765
4ad49000 766 unit_add_to_cgroup_queue(m);
50159e6a
LP
767 }
768
4ad49000 769 u = slice;
8e274523 770 }
4ad49000
LP
771}
772
0a1eb06d 773int unit_realize_cgroup(Unit *u) {
4ad49000
LP
774 CGroupContext *c;
775
776 assert(u);
777
778 c = unit_get_cgroup_context(u);
779 if (!c)
0a1eb06d 780 return 0;
8e274523 781
4ad49000
LP
782 /* So, here's the deal: when realizing the cgroups for this
783 * unit, we need to first create all parents, but there's more
784 * actually: for the weight-based controllers we also need to
785 * make sure that all our siblings (i.e. units that are in the
73e231ab 786 * same slice as we are) have cgroups, too. Otherwise, things
4ad49000
LP
787 * would become very uneven as each of their processes would
788 * get as much resources as all our group together. This call
789 * will synchronously create the parent cgroups, but will
790 * defer work on the siblings to the next event loop
791 * iteration. */
ca949c9d 792
4ad49000
LP
793 /* Add all sibling slices to the cgroup queue. */
794 unit_queue_siblings(u);
795
6414b7c9 796 /* And realize this one now (and apply the values) */
bc432dc7 797 return unit_realize_cgroup_now(u);
8e274523
LP
798}
799
4ad49000 800void unit_destroy_cgroup(Unit *u) {
8e274523
LP
801 int r;
802
4ad49000 803 assert(u);
8e274523 804
4ad49000
LP
805 if (!u->cgroup_path)
806 return;
8e274523 807
13b84ec7 808 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
4ad49000 809 if (r < 0)
376dd21d 810 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
8e274523 811
0a1eb06d
LP
812 hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
813
4ad49000
LP
814 free(u->cgroup_path);
815 u->cgroup_path = NULL;
816 u->cgroup_realized = false;
bc432dc7 817 u->cgroup_realized_mask = 0;
0a1eb06d 818
8e274523
LP
819}
820
4ad49000
LP
821pid_t unit_search_main_pid(Unit *u) {
822 _cleanup_fclose_ FILE *f = NULL;
823 pid_t pid = 0, npid, mypid;
824
825 assert(u);
826
827 if (!u->cgroup_path)
828 return 0;
829
830 if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
831 return 0;
832
833 mypid = getpid();
834 while (cg_read_pid(f, &npid) > 0) {
835 pid_t ppid;
836
837 if (npid == pid)
838 continue;
8e274523 839
4ad49000
LP
840 /* Ignore processes that aren't our kids */
841 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
842 continue;
8e274523 843
4ad49000
LP
844 if (pid != 0) {
845 /* Dang, there's more than one daemonized PID
846 in this group, so we don't know what process
847 is the main process. */
848 pid = 0;
849 break;
850 }
8e274523 851
4ad49000 852 pid = npid;
8e274523
LP
853 }
854
4ad49000 855 return pid;
8e274523
LP
856}
857
8e274523 858int manager_setup_cgroup(Manager *m) {
9444b1f2 859 _cleanup_free_ char *path = NULL;
15c60e99 860 char *e;
8e274523 861 int r;
8e274523
LP
862
863 assert(m);
864
35d2e7ec 865 /* 1. Determine hierarchy */
9444b1f2
LP
866 free(m->cgroup_root);
867 m->cgroup_root = NULL;
868
869 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
9156e799 870 if (r < 0) {
12235040 871 log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
a32360f1 872 return r;
12235040 873 }
8e274523 874
15c60e99
LP
875 /* LEGACY: Already in /system.slice? If so, let's cut this
876 * off. This is to support live upgrades from older systemd
877 * versions where PID 1 was moved there. */
9444b1f2
LP
878 if (m->running_as == SYSTEMD_SYSTEM) {
879 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
15c60e99
LP
880 if (!e)
881 e = endswith(m->cgroup_root, "/system");
9444b1f2
LP
882 if (e)
883 *e = 0;
0baf24dd 884 }
7ccfb64a 885
9444b1f2
LP
886 /* And make sure to store away the root value without trailing
887 * slash, even for the root dir, so that we can easily prepend
888 * it everywhere. */
889 if (streq(m->cgroup_root, "/"))
890 m->cgroup_root[0] = 0;
8e274523 891
35d2e7ec 892 /* 2. Show data */
9444b1f2 893 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
3474ae3c 894 if (r < 0) {
12235040 895 log_error("Cannot find cgroup mount point: %s", strerror(-r));
a32360f1 896 return r;
12235040 897 }
8e274523 898
c6c18be3
LP
899 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
900
35d2e7ec 901 /* 3. Install agent */
a32360f1
LP
902 if (m->running_as == SYSTEMD_SYSTEM) {
903 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
904 if (r < 0)
905 log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
906 else if (r > 0)
907 log_debug("Installed release agent.");
908 else
909 log_debug("Release agent already installed.");
910 }
8e274523 911
15c60e99
LP
912 /* 4. Make sure we are in the root cgroup */
913 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
9156e799 914 if (r < 0) {
8e274523 915 log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
a32360f1 916 return r;
c6c18be3
LP
917 }
918
35d2e7ec 919 /* 5. And pin it, so that it cannot be unmounted */
03e334a1 920 safe_close(m->pin_cgroupfs_fd);
c6c18be3 921
9156e799
LP
922 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
923 if (r < 0) {
12235040 924 log_error("Failed to open pin file: %m");
a32360f1 925 return -errno;
c6c18be3
LP
926 }
927
4ad49000
LP
928 /* 6. Figure out which controllers are supported */
929 m->cgroup_supported = cg_mask_supported();
9156e799 930
e58cec11
LP
931 /* 7. Always enable hierarchial support if it exists... */
932 cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
933
a32360f1 934 return 0;
8e274523
LP
935}
936
c6c18be3 937void manager_shutdown_cgroup(Manager *m, bool delete) {
8e274523
LP
938 assert(m);
939
9444b1f2
LP
940 /* We can't really delete the group, since we are in it. But
941 * let's trim it. */
942 if (delete && m->cgroup_root)
943 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
8e274523 944
03e334a1 945 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
c6c18be3 946
9444b1f2
LP
947 free(m->cgroup_root);
948 m->cgroup_root = NULL;
8e274523
LP
949}
950
4ad49000 951Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
acb14d31 952 char *p;
4ad49000 953 Unit *u;
acb14d31
LP
954
955 assert(m);
956 assert(cgroup);
acb14d31 957
4ad49000
LP
958 u = hashmap_get(m->cgroup_unit, cgroup);
959 if (u)
960 return u;
acb14d31 961
8e70580b 962 p = strdupa(cgroup);
acb14d31
LP
963 for (;;) {
964 char *e;
965
966 e = strrchr(p, '/');
4ad49000
LP
967 if (e == p || !e)
968 return NULL;
acb14d31
LP
969
970 *e = 0;
971
4ad49000
LP
972 u = hashmap_get(m->cgroup_unit, p);
973 if (u)
974 return u;
acb14d31
LP
975 }
976}
977
4ad49000
LP
978Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
979 _cleanup_free_ char *cgroup = NULL;
acb14d31 980 int r;
8e274523 981
8c47c732
LP
982 assert(m);
983
984 if (pid <= 1)
985 return NULL;
986
4ad49000
LP
987 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
988 if (r < 0)
6dde1f33
LP
989 return NULL;
990
4ad49000 991 return manager_get_unit_by_cgroup(m, cgroup);
6dde1f33 992}
4fbf50b3 993
4ad49000
LP
994int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
995 Unit *u;
996 int r;
4fbf50b3 997
4ad49000
LP
998 assert(m);
999 assert(cgroup);
4fbf50b3 1000
4ad49000 1001 u = manager_get_unit_by_cgroup(m, cgroup);
b56c28c3 1002 if (u) {
06025d91
LP
1003 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
1004 if (r > 0) {
1005 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1006 UNIT_VTABLE(u)->notify_cgroup_empty(u);
b56c28c3 1007
06025d91
LP
1008 unit_add_to_gc_queue(u);
1009 }
b56c28c3 1010 }
2633eb83 1011
4ad49000 1012 return 0;
4fbf50b3
LP
1013}
1014
4ad49000
LP
1015static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1016 [CGROUP_AUTO] = "auto",
1017 [CGROUP_CLOSED] = "closed",
1018 [CGROUP_STRICT] = "strict",
1019};
4fbf50b3 1020
4ad49000 1021DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);