]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/cgroup.c
core: add startup resource control option
[thirdparty/systemd.git] / src / core / cgroup.c
CommitLineData
d6c9574f 1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
8e274523
LP
2
3/***
4 This file is part of systemd.
5
4ad49000 6 Copyright 2013 Lennart Poettering
8e274523
LP
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
8e274523
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
8e274523 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
8e274523
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
c6c18be3 22#include <fcntl.h>
e41969e3 23#include <fnmatch.h>
8c6db833 24
9eb977db 25#include "path-util.h"
9444b1f2 26#include "special.h"
4ad49000
LP
27#include "cgroup-util.h"
28#include "cgroup.h"
8e274523 29
4ad49000
LP
30void cgroup_context_init(CGroupContext *c) {
31 assert(c);
32
33 /* Initialize everything to the kernel defaults, assuming the
34 * structure is preinitialized to 0 */
35
36 c->cpu_shares = 1024;
95ae05c0
WC
37 c->startup_cpu_shares = 1024;
38 c->startup_cpu_shares_set = false;
ddca82ac 39 c->memory_limit = (uint64_t) -1;
4ad49000 40 c->blockio_weight = 1000;
95ae05c0
WC
41 c->startup_blockio_weight = 1000;
42 c->startup_blockio_weight_set = false;
b2f8b02e
LP
43
44 c->cpu_quota_per_sec_usec = (usec_t) -1;
45 c->cpu_quota_usec = (usec_t) -1;
46 c->cpu_quota_period_usec = 100*USEC_PER_MSEC;
4ad49000 47}
8e274523 48
4ad49000
LP
49void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
50 assert(c);
51 assert(a);
52
71fda00f 53 LIST_REMOVE(device_allow, c->device_allow, a);
4ad49000
LP
54 free(a->path);
55 free(a);
56}
57
58void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
59 assert(c);
60 assert(w);
61
71fda00f 62 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
4ad49000
LP
63 free(w->path);
64 free(w);
65}
66
67void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
68 assert(c);
8e274523 69 assert(b);
8e274523 70
71fda00f 71 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
4ad49000
LP
72 free(b->path);
73 free(b);
74}
75
76void cgroup_context_done(CGroupContext *c) {
77 assert(c);
78
79 while (c->blockio_device_weights)
80 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
81
82 while (c->blockio_device_bandwidths)
83 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
84
85 while (c->device_allow)
86 cgroup_context_free_device_allow(c, c->device_allow);
87}
88
b2f8b02e
LP
89usec_t cgroup_context_get_cpu_quota_usec(CGroupContext *c) {
90 assert(c);
91
92 /* Returns the absolute CPU quota */
93
94 if (c->cpu_quota_usec != (usec_t) -1)
95 return c->cpu_quota_usec;
96 else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
97 return c->cpu_quota_per_sec_usec*c->cpu_quota_period_usec/USEC_PER_SEC;
98 else
99 return (usec_t) -1;
100}
101
102usec_t cgroup_context_get_cpu_quota_per_sec_usec(CGroupContext *c) {
103 assert(c);
104
105 /* Returns the CPU quota relative to 1s */
106
107 if (c->cpu_quota_usec != (usec_t) -1)
108 return c->cpu_quota_usec*USEC_PER_SEC/c->cpu_quota_period_usec;
109 else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
110 return c->cpu_quota_per_sec_usec;
111 else
112 return (usec_t) -1;
113}
114
4ad49000
LP
115void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
116 CGroupBlockIODeviceBandwidth *b;
117 CGroupBlockIODeviceWeight *w;
118 CGroupDeviceAllow *a;
b2f8b02e 119 char t[FORMAT_TIMESPAN_MAX], s[FORMAT_TIMESPAN_MAX], u[FORMAT_TIMESPAN_MAX];
4ad49000
LP
120
121 assert(c);
122 assert(f);
123
124 prefix = strempty(prefix);
125
126 fprintf(f,
127 "%sCPUAccounting=%s\n"
128 "%sBlockIOAccounting=%s\n"
129 "%sMemoryAccounting=%s\n"
130 "%sCPUShares=%lu\n"
95ae05c0 131 "%sStartupCPUShares=%lu\n"
b2f8b02e
LP
132 "%sCPUQuota=%s\n"
133 "%sCPUQuotaPerSecSec=%s\n"
134 "%sCPUQuotaPeriodSec=%s\n"
112a7f46 135 "%sBlockIOWeight=%lu\n"
95ae05c0 136 "%sStartupBlockIOWeight=%lu\n"
4ad49000 137 "%sMemoryLimit=%" PRIu64 "\n"
4ad49000
LP
138 "%sDevicePolicy=%s\n",
139 prefix, yes_no(c->cpu_accounting),
140 prefix, yes_no(c->blockio_accounting),
141 prefix, yes_no(c->memory_accounting),
142 prefix, c->cpu_shares,
95ae05c0 143 prefix, c->startup_cpu_shares,
b2f8b02e
LP
144 prefix, strna(format_timespan(u, sizeof(u), cgroup_context_get_cpu_quota_usec(c), 1)),
145 prefix, strna(format_timespan(t, sizeof(t), cgroup_context_get_cpu_quota_per_sec_usec(c), 1)),
146 prefix, strna(format_timespan(s, sizeof(s), c->cpu_quota_period_usec, 1)),
4ad49000 147 prefix, c->blockio_weight,
95ae05c0 148 prefix, c->startup_blockio_weight,
4ad49000 149 prefix, c->memory_limit,
4ad49000
LP
150 prefix, cgroup_device_policy_to_string(c->device_policy));
151
152 LIST_FOREACH(device_allow, a, c->device_allow)
153 fprintf(f,
154 "%sDeviceAllow=%s %s%s%s\n",
155 prefix,
156 a->path,
157 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
158
159 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
160 fprintf(f,
8e7076ca 161 "%sBlockIODeviceWeight=%s %lu",
4ad49000
LP
162 prefix,
163 w->path,
164 w->weight);
165
166 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
167 char buf[FORMAT_BYTES_MAX];
168
169 fprintf(f,
170 "%s%s=%s %s\n",
171 prefix,
172 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
173 b->path,
174 format_bytes(buf, sizeof(buf), b->bandwidth));
175 }
176}
177
/* Resolves the path 'p' to the block device number to use for blkio
 * attributes and stores it in '*dev'.
 *
 * If 'p' is a block device node its own device number is used. For any
 * other file the device backing its file system is taken, and
 * block_get_whole_disk() is then asked to replace a partition by its
 * originating disk (its result is deliberately not checked).
 *
 * Returns 0 on success, -errno if stat() fails, and -ENODEV if the file
 * is not a block device and its backing device has major number 0. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;

        assert(p);
        assert(dev);

        if (stat(p, &st) < 0) {
                log_warning("Couldn't stat device %s: %m", p);
                return -errno;
        }

        if (S_ISBLK(st.st_mode)) {
                *dev = st.st_rdev;
                return 0;
        }

        if (major(st.st_dev) == 0) {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        /* Not a device node: use the block device the file is stored
         * on, preferring the whole disk over a partition. */
        *dev = st.st_dev;
        block_get_whole_disk(*dev, dev);

        return 0;
}
208
4ad49000
LP
209static int whitelist_device(const char *path, const char *node, const char *acc) {
210 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
211 struct stat st;
8c6db833 212 int r;
8e274523 213
4ad49000
LP
214 assert(path);
215 assert(acc);
8e274523 216
4ad49000
LP
217 if (stat(node, &st) < 0) {
218 log_warning("Couldn't stat device %s", node);
219 return -errno;
220 }
221
222 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
223 log_warning("%s is not a device.", node);
224 return -ENODEV;
225 }
226
227 sprintf(buf,
228 "%c %u:%u %s",
229 S_ISCHR(st.st_mode) ? 'c' : 'b',
230 major(st.st_rdev), minor(st.st_rdev),
231 acc);
232
233 r = cg_set_attribute("devices", path, "devices.allow", buf);
234 if (r < 0)
235 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
236
237 return r;
8e274523
LP
238}
239
90060676
LP
240static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
241 _cleanup_fclose_ FILE *f = NULL;
242 char line[LINE_MAX];
243 bool good = false;
244 int r;
245
246 assert(path);
247 assert(acc);
248 assert(type == 'b' || type == 'c');
249
250 f = fopen("/proc/devices", "re");
251 if (!f) {
252 log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
253 return -errno;
254 }
255
256 FOREACH_LINE(line, f, goto fail) {
257 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
258 unsigned maj;
259
260 truncate_nl(line);
261
262 if (type == 'c' && streq(line, "Character devices:")) {
263 good = true;
264 continue;
265 }
266
267 if (type == 'b' && streq(line, "Block devices:")) {
268 good = true;
269 continue;
270 }
271
272 if (isempty(line)) {
273 good = false;
274 continue;
275 }
276
277 if (!good)
278 continue;
279
280 p = strstrip(line);
281
282 w = strpbrk(p, WHITESPACE);
283 if (!w)
284 continue;
285 *w = 0;
286
287 r = safe_atou(p, &maj);
288 if (r < 0)
289 continue;
290 if (maj <= 0)
291 continue;
292
293 w++;
294 w += strspn(w, WHITESPACE);
e41969e3
LP
295
296 if (fnmatch(name, w, 0) != 0)
90060676
LP
297 continue;
298
299 sprintf(buf,
300 "%c %u:* %s",
301 type,
302 maj,
303 acc);
304
305 r = cg_set_attribute("devices", path, "devices.allow", buf);
306 if (r < 0)
307 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
308 }
309
310 return 0;
311
312fail:
313 log_warning("Failed to read /proc/devices: %m");
314 return -errno;
315}
316
95ae05c0 317void cgroup_context_apply(Manager *m, CGroupContext *c, CGroupControllerMask mask, const char *path) {
01efdf13 318 bool is_root;
4ad49000
LP
319 int r;
320
321 assert(c);
322 assert(path);
8e274523 323
4ad49000
LP
324 if (mask == 0)
325 return;
8e274523 326
01efdf13
LP
327 /* Some cgroup attributes are not support on the root cgroup,
328 * hence silently ignore */
329 is_root = isempty(path) || path_equal(path, "/");
330
331 if ((mask & CGROUP_CPU) && !is_root) {
b2f8b02e
LP
332 char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
333 usec_t q;
8e274523 334
95ae05c0
WC
335 sprintf(buf, "%lu\n", manager_state(m) == MANAGER_STARTING
336 ? c->startup_cpu_shares
337 : c->cpu_shares);
4ad49000
LP
338 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
339 if (r < 0)
340 log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));
b2f8b02e
LP
341
342 sprintf(buf, USEC_FMT "\n", c->cpu_quota_period_usec);
343 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
344 if (r < 0)
345 log_warning("Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));
346
347 q = cgroup_context_get_cpu_quota_usec(c);
348 if (q != (usec_t) -1) {
349 sprintf(buf, USEC_FMT "\n", q);
350 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
351 } else
352 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
353 if (r < 0)
354 log_warning("Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
4ad49000
LP
355 }
356
357 if (mask & CGROUP_BLKIO) {
358 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
359 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
360 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
361 CGroupBlockIODeviceWeight *w;
362 CGroupBlockIODeviceBandwidth *b;
363
01efdf13 364 if (!is_root) {
95ae05c0
WC
365 sprintf(buf, "%lu\n", manager_state(m) == MANAGER_STARTING
366 ? c->startup_blockio_weight
367 : c->blockio_weight);
01efdf13
LP
368 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
369 if (r < 0)
370 log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));
4ad49000 371
01efdf13
LP
372 /* FIXME: no way to reset this list */
373 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
374 dev_t dev;
4ad49000 375
01efdf13
LP
376 r = lookup_blkio_device(w->path, &dev);
377 if (r < 0)
378 continue;
8e274523 379
01efdf13
LP
380 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
381 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
382 if (r < 0)
383 log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
384 }
4ad49000
LP
385 }
386
387 /* FIXME: no way to reset this list */
388 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
389 const char *a;
390 dev_t dev;
391
392 r = lookup_blkio_device(b->path, &dev);
393 if (r < 0)
394 continue;
395
396 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
397
398 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
399 r = cg_set_attribute("blkio", path, a, buf);
400 if (r < 0)
401 log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
d686d8a9 402 }
8e274523
LP
403 }
404
4ad49000 405 if (mask & CGROUP_MEMORY) {
6a94f2e9 406 if (c->memory_limit != (uint64_t) -1) {
e58cec11
LP
407 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
408
6a94f2e9
G
409 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
410 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
411 } else
412 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
8e274523 413
4ad49000
LP
414 if (r < 0)
415 log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
4ad49000 416 }
8e274523 417
01efdf13 418 if ((mask & CGROUP_DEVICE) && !is_root) {
4ad49000 419 CGroupDeviceAllow *a;
8e274523 420
4ad49000
LP
421 if (c->device_allow || c->device_policy != CGROUP_AUTO)
422 r = cg_set_attribute("devices", path, "devices.deny", "a");
423 else
424 r = cg_set_attribute("devices", path, "devices.allow", "a");
425 if (r < 0)
01efdf13 426 log_warning("Failed to reset devices.list on %s: %s", path, strerror(-r));
fb385181 427
4ad49000
LP
428 if (c->device_policy == CGROUP_CLOSED ||
429 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
430 static const char auto_devices[] =
7d711efb
LP
431 "/dev/null\0" "rwm\0"
432 "/dev/zero\0" "rwm\0"
433 "/dev/full\0" "rwm\0"
434 "/dev/random\0" "rwm\0"
435 "/dev/urandom\0" "rwm\0"
436 "/dev/tty\0" "rwm\0"
437 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
4ad49000
LP
438
439 const char *x, *y;
440
441 NULSTR_FOREACH_PAIR(x, y, auto_devices)
442 whitelist_device(path, x, y);
7d711efb
LP
443
444 whitelist_major(path, "pts", 'c', "rw");
445 whitelist_major(path, "kdbus", 'c', "rw");
446 whitelist_major(path, "kdbus/*", 'c', "rw");
4ad49000
LP
447 }
448
449 LIST_FOREACH(device_allow, a, c->device_allow) {
450 char acc[4];
451 unsigned k = 0;
452
453 if (a->r)
454 acc[k++] = 'r';
455 if (a->w)
456 acc[k++] = 'w';
457 if (a->m)
458 acc[k++] = 'm';
fb385181 459
4ad49000
LP
460 if (k == 0)
461 continue;
fb385181 462
4ad49000 463 acc[k++] = 0;
90060676
LP
464
465 if (startswith(a->path, "/dev/"))
466 whitelist_device(path, a->path, acc);
467 else if (startswith(a->path, "block-"))
468 whitelist_major(path, a->path + 6, 'b', acc);
469 else if (startswith(a->path, "char-"))
470 whitelist_major(path, a->path + 5, 'c', acc);
471 else
472 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
4ad49000
LP
473 }
474 }
fb385181
LP
475}
476
95ae05c0 477CGroupControllerMask cgroup_context_get_mask(Manager *m, CGroupContext *c) {
4ad49000 478 CGroupControllerMask mask = 0;
8e274523 479
4ad49000 480 /* Figure out which controllers we need */
8e274523 481
b2f8b02e 482 if (c->cpu_accounting ||
95ae05c0
WC
483 (manager_state(m) == MANAGER_STARTING ? c->startup_cpu_shares : c->cpu_shares) != 1024 ||
484 (manager_state(m) != MANAGER_STARTING && c->startup_cpu_shares_set && c->startup_cpu_shares != c->cpu_shares) ||
b2f8b02e 485 c->cpu_quota_usec != (usec_t) -1 ||
95ae05c0 486 c->cpu_quota_per_sec_usec != (usec_t) -1) {
4ad49000 487 mask |= CGROUP_CPUACCT | CGROUP_CPU;
95ae05c0
WC
488 if (manager_state(m) != MANAGER_STARTING)
489 c->startup_cpu_shares_set = false;
490 }
ecedd90f 491
4ad49000 492 if (c->blockio_accounting ||
95ae05c0
WC
493 (manager_state(m) == MANAGER_STARTING ? c->startup_blockio_weight : c->blockio_weight) != 1000 ||
494 (manager_state(m) != MANAGER_STARTING && c->startup_blockio_weight_set && c->startup_blockio_weight != c->blockio_weight) ||
4ad49000 495 c->blockio_device_weights ||
95ae05c0 496 c->blockio_device_bandwidths) {
4ad49000 497 mask |= CGROUP_BLKIO;
95ae05c0
WC
498 if (manager_state(m) != MANAGER_STARTING)
499 c->startup_blockio_weight_set = false;
500 }
ecedd90f 501
4ad49000 502 if (c->memory_accounting ||
ddca82ac 503 c->memory_limit != (uint64_t) -1)
4ad49000 504 mask |= CGROUP_MEMORY;
8e274523 505
4ad49000
LP
506 if (c->device_allow || c->device_policy != CGROUP_AUTO)
507 mask |= CGROUP_DEVICE;
508
509 return mask;
8e274523
LP
510}
511
bc432dc7 512CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
4ad49000 513 CGroupContext *c;
8e274523 514
4ad49000
LP
515 c = unit_get_cgroup_context(u);
516 if (!c)
517 return 0;
8e274523 518
95ae05c0 519 return cgroup_context_get_mask(u->manager, c);
8e274523
LP
520}
521
bc432dc7 522CGroupControllerMask unit_get_members_mask(Unit *u) {
4ad49000 523 assert(u);
bc432dc7
LP
524
525 if (u->cgroup_members_mask_valid)
526 return u->cgroup_members_mask;
527
528 u->cgroup_members_mask = 0;
529
530 if (u->type == UNIT_SLICE) {
531 Unit *member;
532 Iterator i;
533
534 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
535
536 if (member == u)
537 continue;
538
d4fdc205 539 if (UNIT_DEREF(member->slice) != u)
bc432dc7
LP
540 continue;
541
542 u->cgroup_members_mask |=
543 unit_get_cgroup_mask(member) |
544 unit_get_members_mask(member);
545 }
546 }
547
548 u->cgroup_members_mask_valid = true;
6414b7c9 549 return u->cgroup_members_mask;
246aa6dd
LP
550}
551
bc432dc7
LP
552CGroupControllerMask unit_get_siblings_mask(Unit *u) {
553 CGroupControllerMask m;
554
4ad49000 555 assert(u);
246aa6dd 556
bc432dc7
LP
557 if (UNIT_ISSET(u->slice))
558 m = unit_get_members_mask(UNIT_DEREF(u->slice));
559 else
560 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
4ad49000
LP
561
562 /* Sibling propagation is only relevant for weight-based
563 * controllers, so let's mask out everything else */
bc432dc7 564 return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
246aa6dd
LP
565}
566
bc432dc7 567CGroupControllerMask unit_get_target_mask(Unit *u) {
6414b7c9
DS
568 CGroupControllerMask mask;
569
570 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
571 mask &= u->manager->cgroup_supported;
572
573 return mask;
574}
575
576/* Recurse from a unit up through its containing slices, propagating
577 * mask bits upward. A unit is also member of itself. */
bc432dc7
LP
578void unit_update_cgroup_members_masks(Unit *u) {
579 CGroupControllerMask m;
580 bool more;
581
582 assert(u);
583
584 /* Calculate subtree mask */
585 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
586
587 /* See if anything changed from the previous invocation. If
588 * not, we're done. */
589 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
590 return;
591
592 more =
593 u->cgroup_subtree_mask_valid &&
594 ((m & ~u->cgroup_subtree_mask) != 0) &&
595 ((~m & u->cgroup_subtree_mask) == 0);
596
597 u->cgroup_subtree_mask = m;
598 u->cgroup_subtree_mask_valid = true;
599
6414b7c9
DS
600 if (UNIT_ISSET(u->slice)) {
601 Unit *s = UNIT_DEREF(u->slice);
bc432dc7
LP
602
603 if (more)
604 /* There's more set now than before. We
605 * propagate the new mask to the parent's mask
606 * (not caring if it actually was valid or
607 * not). */
608
609 s->cgroup_members_mask |= m;
610
611 else
612 /* There's less set now than before (or we
613 * don't know), we need to recalculate
614 * everything, so let's invalidate the
615 * parent's members mask */
616
617 s->cgroup_members_mask_valid = false;
618
619 /* And now make sure that this change also hits our
620 * grandparents */
621 unit_update_cgroup_members_masks(s);
6414b7c9
DS
622 }
623}
624
03b90d4b
LP
625static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
626 Unit *u = userdata;
627
628 assert(mask != 0);
629 assert(u);
630
631 while (u) {
632 if (u->cgroup_path &&
633 u->cgroup_realized &&
634 (u->cgroup_realized_mask & mask) == mask)
635 return u->cgroup_path;
636
637 u = UNIT_DEREF(u->slice);
638 }
639
640 return NULL;
641}
642
4ad49000 643static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
03b90d4b 644 _cleanup_free_ char *path = NULL;
bc432dc7 645 int r;
64747e2d 646
4ad49000 647 assert(u);
64747e2d 648
4ad49000
LP
649 path = unit_default_cgroup_path(u);
650 if (!path)
a94042fa 651 return log_oom();
64747e2d 652
0a1eb06d 653 r = hashmap_put(u->manager->cgroup_unit, path, u);
03b90d4b
LP
654 if (r < 0) {
655 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
0a1eb06d 656 return r;
b58b8e11 657 }
03b90d4b 658 if (r > 0) {
b58b8e11 659 u->cgroup_path = path;
a94042fa 660 path = NULL;
b58b8e11
HH
661 }
662
03b90d4b
LP
663 /* First, create our own group */
664 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
665 if (r < 0) {
666 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
667 return r;
668 }
669
670 /* Keep track that this is now realized */
4ad49000 671 u->cgroup_realized = true;
bc432dc7 672 u->cgroup_realized_mask = mask;
4ad49000 673
03b90d4b
LP
674 /* Then, possibly move things over */
675 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
676 if (r < 0)
677 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
678
64747e2d
LP
679 return 0;
680}
681
6414b7c9 682static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
bc432dc7
LP
683 assert(u);
684
685 return u->cgroup_realized && u->cgroup_realized_mask == mask;
6414b7c9
DS
686}
687
688/* Check if necessary controllers and attributes for a unit are in place.
689 *
690 * If so, do nothing.
691 * If not, create paths, move processes over, and set attributes.
692 *
693 * Returns 0 on success and < 0 on failure. */
0a1eb06d 694static int unit_realize_cgroup_now(Unit *u) {
4ad49000 695 CGroupControllerMask mask;
6414b7c9 696 int r;
64747e2d 697
4ad49000 698 assert(u);
64747e2d 699
4ad49000 700 if (u->in_cgroup_queue) {
71fda00f 701 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
4ad49000
LP
702 u->in_cgroup_queue = false;
703 }
64747e2d 704
6414b7c9 705 mask = unit_get_target_mask(u);
64747e2d 706
6414b7c9 707 if (unit_has_mask_realized(u, mask))
0a1eb06d 708 return 0;
64747e2d 709
4ad49000 710 /* First, realize parents */
6414b7c9
DS
711 if (UNIT_ISSET(u->slice)) {
712 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice));
713 if (r < 0)
714 return r;
715 }
4ad49000
LP
716
717 /* And then do the real work */
6414b7c9
DS
718 r = unit_create_cgroups(u, mask);
719 if (r < 0)
720 return r;
721
722 /* Finally, apply the necessary attributes. */
95ae05c0 723 cgroup_context_apply(u->manager, unit_get_cgroup_context(u), mask, u->cgroup_path);
6414b7c9
DS
724
725 return 0;
64747e2d
LP
726}
727
4ad49000 728static void unit_add_to_cgroup_queue(Unit *u) {
ecedd90f 729
4ad49000
LP
730 if (u->in_cgroup_queue)
731 return;
8e274523 732
71fda00f 733 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
4ad49000
LP
734 u->in_cgroup_queue = true;
735}
8c6db833 736
4ad49000
LP
737unsigned manager_dispatch_cgroup_queue(Manager *m) {
738 Unit *i;
739 unsigned n = 0;
6414b7c9 740 int r;
ecedd90f 741
4ad49000
LP
742 while ((i = m->cgroup_queue)) {
743 assert(i->in_cgroup_queue);
ecedd90f 744
6414b7c9
DS
745 r = unit_realize_cgroup_now(i);
746 if (r < 0)
747 log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
0a1eb06d 748
4ad49000
LP
749 n++;
750 }
ecedd90f 751
4ad49000 752 return n;
8e274523
LP
753}
754
4ad49000
LP
755static void unit_queue_siblings(Unit *u) {
756 Unit *slice;
ca949c9d 757
4ad49000
LP
758 /* This adds the siblings of the specified unit and the
759 * siblings of all parent units to the cgroup queue. (But
760 * neither the specified unit itself nor the parents.) */
761
762 while ((slice = UNIT_DEREF(u->slice))) {
763 Iterator i;
764 Unit *m;
8f53a7b8 765
4ad49000
LP
766 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
767 if (m == u)
768 continue;
8e274523 769
6414b7c9
DS
770 /* Skip units that have a dependency on the slice
771 * but aren't actually in it. */
4ad49000 772 if (UNIT_DEREF(m->slice) != slice)
50159e6a 773 continue;
8e274523 774
6414b7c9
DS
775 /* No point in doing cgroup application for units
776 * without active processes. */
777 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
778 continue;
779
780 /* If the unit doesn't need any new controllers
781 * and has current ones realized, it doesn't need
782 * any changes. */
783 if (unit_has_mask_realized(m, unit_get_target_mask(m)))
784 continue;
785
4ad49000 786 unit_add_to_cgroup_queue(m);
50159e6a
LP
787 }
788
4ad49000 789 u = slice;
8e274523 790 }
4ad49000
LP
791}
792
0a1eb06d 793int unit_realize_cgroup(Unit *u) {
4ad49000
LP
794 CGroupContext *c;
795
796 assert(u);
797
798 c = unit_get_cgroup_context(u);
799 if (!c)
0a1eb06d 800 return 0;
8e274523 801
4ad49000
LP
802 /* So, here's the deal: when realizing the cgroups for this
803 * unit, we need to first create all parents, but there's more
804 * actually: for the weight-based controllers we also need to
805 * make sure that all our siblings (i.e. units that are in the
73e231ab 806 * same slice as we are) have cgroups, too. Otherwise, things
4ad49000
LP
807 * would become very uneven as each of their processes would
808 * get as much resources as all our group together. This call
809 * will synchronously create the parent cgroups, but will
810 * defer work on the siblings to the next event loop
811 * iteration. */
ca949c9d 812
4ad49000
LP
813 /* Add all sibling slices to the cgroup queue. */
814 unit_queue_siblings(u);
815
6414b7c9 816 /* And realize this one now (and apply the values) */
bc432dc7 817 return unit_realize_cgroup_now(u);
8e274523
LP
818}
819
4ad49000 820void unit_destroy_cgroup(Unit *u) {
8e274523
LP
821 int r;
822
4ad49000 823 assert(u);
8e274523 824
4ad49000
LP
825 if (!u->cgroup_path)
826 return;
8e274523 827
13b84ec7 828 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
4ad49000 829 if (r < 0)
376dd21d 830 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
8e274523 831
0a1eb06d
LP
832 hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
833
4ad49000
LP
834 free(u->cgroup_path);
835 u->cgroup_path = NULL;
836 u->cgroup_realized = false;
bc432dc7 837 u->cgroup_realized_mask = 0;
0a1eb06d 838
8e274523
LP
839}
840
4ad49000
LP
841pid_t unit_search_main_pid(Unit *u) {
842 _cleanup_fclose_ FILE *f = NULL;
843 pid_t pid = 0, npid, mypid;
844
845 assert(u);
846
847 if (!u->cgroup_path)
848 return 0;
849
850 if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
851 return 0;
852
853 mypid = getpid();
854 while (cg_read_pid(f, &npid) > 0) {
855 pid_t ppid;
856
857 if (npid == pid)
858 continue;
8e274523 859
4ad49000
LP
860 /* Ignore processes that aren't our kids */
861 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
862 continue;
8e274523 863
4ad49000
LP
864 if (pid != 0) {
865 /* Dang, there's more than one daemonized PID
866 in this group, so we don't know what process
867 is the main process. */
868 pid = 0;
869 break;
870 }
8e274523 871
4ad49000 872 pid = npid;
8e274523
LP
873 }
874
4ad49000 875 return pid;
8e274523
LP
876}
877
8e274523 878int manager_setup_cgroup(Manager *m) {
9444b1f2 879 _cleanup_free_ char *path = NULL;
15c60e99 880 char *e;
8e274523 881 int r;
8e274523
LP
882
883 assert(m);
884
35d2e7ec 885 /* 1. Determine hierarchy */
9444b1f2
LP
886 free(m->cgroup_root);
887 m->cgroup_root = NULL;
888
889 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
9156e799 890 if (r < 0) {
12235040 891 log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
a32360f1 892 return r;
12235040 893 }
8e274523 894
15c60e99
LP
895 /* LEGACY: Already in /system.slice? If so, let's cut this
896 * off. This is to support live upgrades from older systemd
897 * versions where PID 1 was moved there. */
9444b1f2
LP
898 if (m->running_as == SYSTEMD_SYSTEM) {
899 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
15c60e99
LP
900 if (!e)
901 e = endswith(m->cgroup_root, "/system");
9444b1f2
LP
902 if (e)
903 *e = 0;
0baf24dd 904 }
7ccfb64a 905
9444b1f2
LP
906 /* And make sure to store away the root value without trailing
907 * slash, even for the root dir, so that we can easily prepend
908 * it everywhere. */
909 if (streq(m->cgroup_root, "/"))
910 m->cgroup_root[0] = 0;
8e274523 911
35d2e7ec 912 /* 2. Show data */
9444b1f2 913 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
3474ae3c 914 if (r < 0) {
12235040 915 log_error("Cannot find cgroup mount point: %s", strerror(-r));
a32360f1 916 return r;
12235040 917 }
8e274523 918
c6c18be3
LP
919 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
920
35d2e7ec 921 /* 3. Install agent */
a32360f1
LP
922 if (m->running_as == SYSTEMD_SYSTEM) {
923 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
924 if (r < 0)
925 log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
926 else if (r > 0)
927 log_debug("Installed release agent.");
928 else
929 log_debug("Release agent already installed.");
930 }
8e274523 931
15c60e99
LP
932 /* 4. Make sure we are in the root cgroup */
933 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
9156e799 934 if (r < 0) {
8e274523 935 log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
a32360f1 936 return r;
c6c18be3
LP
937 }
938
35d2e7ec 939 /* 5. And pin it, so that it cannot be unmounted */
03e334a1 940 safe_close(m->pin_cgroupfs_fd);
c6c18be3 941
9156e799 942 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
cd7affae 943 if (m->pin_cgroupfs_fd < 0) {
12235040 944 log_error("Failed to open pin file: %m");
a32360f1 945 return -errno;
c6c18be3
LP
946 }
947
4ad49000
LP
948 /* 6. Figure out which controllers are supported */
949 m->cgroup_supported = cg_mask_supported();
9156e799 950
e58cec11
LP
951 /* 7. Always enable hierarchial support if it exists... */
952 cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
953
a32360f1 954 return 0;
8e274523
LP
955}
956
c6c18be3 957void manager_shutdown_cgroup(Manager *m, bool delete) {
8e274523
LP
958 assert(m);
959
9444b1f2
LP
960 /* We can't really delete the group, since we are in it. But
961 * let's trim it. */
962 if (delete && m->cgroup_root)
963 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
8e274523 964
03e334a1 965 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
c6c18be3 966
9444b1f2
LP
967 free(m->cgroup_root);
968 m->cgroup_root = NULL;
8e274523
LP
969}
970
4ad49000 971Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
acb14d31 972 char *p;
4ad49000 973 Unit *u;
acb14d31
LP
974
975 assert(m);
976 assert(cgroup);
acb14d31 977
4ad49000
LP
978 u = hashmap_get(m->cgroup_unit, cgroup);
979 if (u)
980 return u;
acb14d31 981
8e70580b 982 p = strdupa(cgroup);
acb14d31
LP
983 for (;;) {
984 char *e;
985
986 e = strrchr(p, '/');
4ad49000
LP
987 if (e == p || !e)
988 return NULL;
acb14d31
LP
989
990 *e = 0;
991
4ad49000
LP
992 u = hashmap_get(m->cgroup_unit, p);
993 if (u)
994 return u;
acb14d31
LP
995 }
996}
997
4ad49000
LP
998Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
999 _cleanup_free_ char *cgroup = NULL;
acb14d31 1000 int r;
8e274523 1001
8c47c732
LP
1002 assert(m);
1003
1004 if (pid <= 1)
1005 return NULL;
1006
4ad49000
LP
1007 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1008 if (r < 0)
6dde1f33
LP
1009 return NULL;
1010
4ad49000 1011 return manager_get_unit_by_cgroup(m, cgroup);
6dde1f33 1012}
4fbf50b3 1013
4ad49000
LP
1014int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1015 Unit *u;
1016 int r;
4fbf50b3 1017
4ad49000
LP
1018 assert(m);
1019 assert(cgroup);
4fbf50b3 1020
4ad49000 1021 u = manager_get_unit_by_cgroup(m, cgroup);
b56c28c3 1022 if (u) {
06025d91
LP
1023 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
1024 if (r > 0) {
1025 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1026 UNIT_VTABLE(u)->notify_cgroup_empty(u);
b56c28c3 1027
06025d91
LP
1028 unit_add_to_gc_queue(u);
1029 }
b56c28c3 1030 }
2633eb83 1031
4ad49000 1032 return 0;
4fbf50b3
LP
1033}
1034
4ad49000
LP
1035static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1036 [CGROUP_AUTO] = "auto",
1037 [CGROUP_CLOSED] = "closed",
1038 [CGROUP_STRICT] = "strict",
1039};
4fbf50b3 1040
4ad49000 1041DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);