]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/cgroup.c
core: unified cgroup hierarchy support
[thirdparty/systemd.git] / src / core / cgroup.c
CommitLineData
d6c9574f 1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
8e274523
LP
2
3/***
4 This file is part of systemd.
5
4ad49000 6 Copyright 2013 Lennart Poettering
8e274523
LP
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
8e274523
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
8e274523 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
8e274523
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
c6c18be3 22#include <fcntl.h>
e41969e3 23#include <fnmatch.h>
8c6db833 24
0b452006 25#include "process-util.h"
9eb977db 26#include "path-util.h"
9444b1f2 27#include "special.h"
4ad49000
LP
28#include "cgroup-util.h"
29#include "cgroup.h"
8e274523 30
9a054909
LP
31#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
32
4ad49000
LP
/* Initialize a CGroupContext to the kernel's default values.
 *
 * Assumes the structure has already been zeroed; only the fields whose
 * "unset" representation is not zero are touched here.  (unsigned long) -1
 * and (uint64_t) -1 serve as "not configured" sentinels throughout this
 * file. */
void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = (unsigned long) -1;
        c->startup_cpu_shares = (unsigned long) -1;
        c->memory_limit = (uint64_t) -1;
        c->blockio_weight = (unsigned long) -1;
        c->startup_blockio_weight = (unsigned long) -1;

        /* USEC_INFINITY == no CPU quota configured */
        c->cpu_quota_per_sec_usec = USEC_INFINITY;
}
8e274523 47
4ad49000
LP
/* Unlink one DeviceAllow= entry from the context's list and free it.
 * The caller must pass an entry that is actually on c->device_allow. */
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}
56
/* Unlink one BlockIODeviceWeight= entry from the context's list and free it. */
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}
65
/* Unlink one BlockIO{Read,Write}Bandwidth= entry from the context's list and
 * free it. */
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}
74
/* Release all heap-allocated members of a CGroupContext.  The structure
 * itself is owned by the caller and is not freed.  Each helper pops the
 * list head, so these loops terminate when the lists are empty. */
void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}
87
88void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
89 CGroupBlockIODeviceBandwidth *b;
90 CGroupBlockIODeviceWeight *w;
91 CGroupDeviceAllow *a;
9a054909 92 char u[FORMAT_TIMESPAN_MAX];
4ad49000
LP
93
94 assert(c);
95 assert(f);
96
97 prefix = strempty(prefix);
98
99 fprintf(f,
100 "%sCPUAccounting=%s\n"
101 "%sBlockIOAccounting=%s\n"
102 "%sMemoryAccounting=%s\n"
103 "%sCPUShares=%lu\n"
95ae05c0 104 "%sStartupCPUShares=%lu\n"
b2f8b02e 105 "%sCPUQuotaPerSecSec=%s\n"
112a7f46 106 "%sBlockIOWeight=%lu\n"
95ae05c0 107 "%sStartupBlockIOWeight=%lu\n"
4ad49000 108 "%sMemoryLimit=%" PRIu64 "\n"
a931ad47
LP
109 "%sDevicePolicy=%s\n"
110 "%sDelegate=%s\n",
4ad49000
LP
111 prefix, yes_no(c->cpu_accounting),
112 prefix, yes_no(c->blockio_accounting),
113 prefix, yes_no(c->memory_accounting),
114 prefix, c->cpu_shares,
95ae05c0 115 prefix, c->startup_cpu_shares,
b1d6dcf5 116 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
4ad49000 117 prefix, c->blockio_weight,
95ae05c0 118 prefix, c->startup_blockio_weight,
4ad49000 119 prefix, c->memory_limit,
a931ad47
LP
120 prefix, cgroup_device_policy_to_string(c->device_policy),
121 prefix, yes_no(c->delegate));
4ad49000
LP
122
123 LIST_FOREACH(device_allow, a, c->device_allow)
124 fprintf(f,
125 "%sDeviceAllow=%s %s%s%s\n",
126 prefix,
127 a->path,
128 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
129
130 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
131 fprintf(f,
8e7076ca 132 "%sBlockIODeviceWeight=%s %lu",
4ad49000
LP
133 prefix,
134 w->path,
135 w->weight);
136
137 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
138 char buf[FORMAT_BYTES_MAX];
139
140 fprintf(f,
141 "%s%s=%s %s\n",
142 prefix,
143 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
144 b->path,
145 format_bytes(buf, sizeof(buf), b->bandwidth));
146 }
147}
148
/* Resolve a path (device node or regular file) to the block device dev_t
 * that blkio attributes should be written for.
 *
 * - If p is a block device node, its st_rdev is used directly.
 * - Otherwise, if p lives on a local file system (non-zero st_dev major),
 *   the backing device is used; partitions are mapped to their whole disk
 *   via block_get_whole_disk() on a best-effort basis (its failure is
 *   ignored, leaving the partition's dev_t in place).
 *
 * Returns 0 on success, negative errno on failure. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                /* major 0 means a virtual/network fs — no useful device */
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
177
4ad49000
LP
178static int whitelist_device(const char *path, const char *node, const char *acc) {
179 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
180 struct stat st;
8c6db833 181 int r;
8e274523 182
4ad49000
LP
183 assert(path);
184 assert(acc);
8e274523 185
4ad49000
LP
186 if (stat(node, &st) < 0) {
187 log_warning("Couldn't stat device %s", node);
188 return -errno;
189 }
190
191 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
192 log_warning("%s is not a device.", node);
193 return -ENODEV;
194 }
195
196 sprintf(buf,
197 "%c %u:%u %s",
198 S_ISCHR(st.st_mode) ? 'c' : 'b',
199 major(st.st_rdev), minor(st.st_rdev),
200 acc);
201
202 r = cg_set_attribute("devices", path, "devices.allow", buf);
1aeab12b 203 if (r < 0)
714e2e1d
LP
204 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
205 "Failed to set devices.allow on %s: %m", path);
4ad49000
LP
206
207 return r;
8e274523
LP
208}
209
90060676
LP
/* Whitelist all devices of a given major-number class by name pattern.
 *
 * Parses /proc/devices, which consists of a "Character devices:" section
 * and a "Block devices:" section, each listing "MAJOR name" lines and
 * separated by a blank line.  For every entry in the section matching
 * 'type' ('c' or 'b') whose name matches the fnmatch() pattern 'name',
 * writes "TYPE MAJ:* acc" to devices.allow under 'path'.
 *
 * Per-entry write failures are logged but do not abort the scan; returns 0
 * after a full scan, or -errno if reading /proc/devices fails. */
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;    /* inside the section matching 'type'? */
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                /* Section headers toggle whether subsequent lines are
                 * relevant for the requested device type. */
                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                /* Blank line ends the current section. */
                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                /* Split "MAJOR name" at the first whitespace. */
                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                /* 'name' is the pattern, 'w' is the driver name from the file */
                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
285
/* Write the configured cgroup attributes for the controllers in 'mask' to
 * the cgroup at 'path'.  'state' selects whether the Startup*= variants of
 * weights/shares apply (during boot/initialization).
 *
 * Several attributes are not available on the root cgroup; those sections
 * are skipped there.  Errors from read-only cgroup trees (containers) and
 * from vanished cgroups — EROFS/ENOENT — are demoted to debug logging. */
void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                /* Buffer is sized for either a shares value or a usec value. */
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];

                /* StartupCPUShares= wins while starting up, then CPUShares=,
                 * then the kernel default of 1024. */
                sprintf(buf, "%lu\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
                        c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                /* Always pin the CFS period so the quota below is relative
                 * to a known value. */
                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                /* Scale the per-second quota to the period; "-1" disables
                 * the quota in the kernel. */
                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }

        if (mask & CGROUP_MASK_BLKIO) {
                /* One buffer for all three formats written below. */
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        /* StartupBlockIOWeight= wins while starting up, then
                         * BlockIOWeight=, then the kernel default of 1000. */
                        sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
                                c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue; /* already logged by helper */

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }

        if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);

                        /* Attribute name differs between legacy (v1) and
                         * unified (v2) hierarchies. */
                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        else
                                r = cg_set_attribute("memory", path, "memory.max", buf);

                } else {
                        /* Reset to "no limit"; spelling differs per hierarchy. */
                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
                        else
                                r = cg_set_attribute("memory", path, "memory.max", "max");
                }

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
        }

        if ((mask & CGROUP_MASK_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                /* Default-deny when a policy or explicit entries exist,
                 * default-allow otherwise. */
                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        /* Standard pseudo-devices every service may use. */
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        /* Build the access string ("r", "w", "m" subset). */
                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        /* Entries may be literal node paths, or "block-"/
                         * "char-" prefixed driver-name patterns. */
                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}
470
efdb0237
LP
471CGroupMask cgroup_context_get_mask(CGroupContext *c) {
472 CGroupMask mask = 0;
8e274523 473
4ad49000 474 /* Figure out which controllers we need */
8e274523 475
b2f8b02e 476 if (c->cpu_accounting ||
db785129
LP
477 c->cpu_shares != (unsigned long) -1 ||
478 c->startup_cpu_shares != (unsigned long) -1 ||
3a43da28 479 c->cpu_quota_per_sec_usec != USEC_INFINITY)
efdb0237 480 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
ecedd90f 481
4ad49000 482 if (c->blockio_accounting ||
db785129
LP
483 c->blockio_weight != (unsigned long) -1 ||
484 c->startup_blockio_weight != (unsigned long) -1 ||
4ad49000 485 c->blockio_device_weights ||
db785129 486 c->blockio_device_bandwidths)
efdb0237 487 mask |= CGROUP_MASK_BLKIO;
ecedd90f 488
4ad49000 489 if (c->memory_accounting ||
ddca82ac 490 c->memory_limit != (uint64_t) -1)
efdb0237 491 mask |= CGROUP_MASK_MEMORY;
8e274523 492
a931ad47
LP
493 if (c->device_allow ||
494 c->device_policy != CGROUP_AUTO)
efdb0237 495 mask |= CGROUP_MASK_DEVICE;
4ad49000
LP
496
497 return mask;
8e274523
LP
498}
499
CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0; /* unit type without a cgroup context */

        /* If delegation is turned on, then turn on all cgroups,
         * unless the process we fork into it is known to drop
         * privileges anyway, and shouldn't get access to the
         * controllers anyway. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e || exec_context_maintains_privileges(e))
                        return _CGROUP_MASK_ALL;
        }

        /* Otherwise derive the mask from the configured settings. */
        return cgroup_context_get_mask(c);
}
524
CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children
         * require, merged */

        /* Cached: recomputed lazily when invalidated by
         * unit_update_cgroup_members_masks(). */
        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        /* Only slices have cgroup children. */
        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                /* Members are modelled as Before= dependencies of the
                 * slice; filter to units whose slice really is u. */
                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        /* Recurses into sub-slices via members mask. */
                        u->cgroup_members_mask |=
                                unit_get_own_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
557
efdb0237 558CGroupMask unit_get_siblings_mask(Unit *u) {
4ad49000 559 assert(u);
246aa6dd 560
efdb0237
LP
561 /* Returns the mask of controllers all of the unit's siblings
562 * require, i.e. the members mask of the unit's parent slice
563 * if there is one. */
564
bc432dc7 565 if (UNIT_ISSET(u->slice))
637f421e 566 return unit_get_members_mask(UNIT_DEREF(u->slice));
4ad49000 567
efdb0237 568 return unit_get_own_mask(u) | unit_get_members_mask(u);
246aa6dd
LP
569}
570
efdb0237
LP
571CGroupMask unit_get_subtree_mask(Unit *u) {
572
573 /* Returns the mask of this subtree, meaning of the group
574 * itself and its children. */
575
576 return unit_get_own_mask(u) | unit_get_members_mask(u);
577}
578
579CGroupMask unit_get_target_mask(Unit *u) {
580 CGroupMask mask;
581
582 /* This returns the cgroup mask of all controllers to enable
583 * for a specific cgroup, i.e. everything it needs itself,
584 * plus all that its children need, plus all that its siblings
585 * need. This is primarily useful on the legacy cgroup
586 * hierarchy, where we need to duplicate each cgroup in each
587 * hierarchy that shall be enabled for it. */
6414b7c9 588
efdb0237
LP
589 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
590 mask &= u->manager->cgroup_supported;
591
592 return mask;
593}
594
595CGroupMask unit_get_enable_mask(Unit *u) {
596 CGroupMask mask;
597
598 /* This returns the cgroup mask of all controllers to enable
599 * for the children of a specific cgroup. This is primarily
600 * useful for the unified cgroup hierarchy, where each cgroup
601 * controls which controllers are enabled for its children. */
602
603 mask = unit_get_members_mask(u);
6414b7c9
DS
604 mask &= u->manager->cgroup_supported;
605
606 return mask;
607}
608
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        /* 'more' is true only when the new mask strictly adds bits and
         * removes none — that case can be propagated upward cheaply. */
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
657
efdb0237 658static const char *migrate_callback(CGroupMask mask, void *userdata) {
03b90d4b
LP
659 Unit *u = userdata;
660
661 assert(mask != 0);
662 assert(u);
663
664 while (u) {
665 if (u->cgroup_path &&
666 u->cgroup_realized &&
667 (u->cgroup_realized_mask & mask) == mask)
668 return u->cgroup_path;
669
670 u = UNIT_DEREF(u->slice);
671 }
672
673 return NULL;
674}
675
efdb0237
LP
/* Compute the default cgroup path for a unit: the manager's cgroup root,
 * followed by the escaped slice path (if the unit is in a non-root slice),
 * followed by the escaped unit name.  Returns a newly allocated string
 * owned by the caller, or NULL on OOM / slice-path conversion failure. */
char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        /* The root slice maps to the cgroup root itself. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        /* Escape the unit name so it is safe as a cgroup dir name. */
        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}
700
701int unit_set_cgroup_path(Unit *u, const char *path) {
702 _cleanup_free_ char *p = NULL;
703 int r;
704
705 assert(u);
706
707 if (path) {
708 p = strdup(path);
709 if (!p)
710 return -ENOMEM;
711 } else
712 p = NULL;
713
714 if (streq_ptr(u->cgroup_path, p))
715 return 0;
716
717 if (p) {
718 r = hashmap_put(u->manager->cgroup_unit, p, u);
719 if (r < 0)
720 return r;
721 }
722
723 unit_release_cgroup(u);
724
725 u->cgroup_path = p;
726 p = NULL;
727
728 return 1;
729}
730
731int unit_watch_cgroup(Unit *u) {
732 _cleanup_free_ char *populated = NULL;
733 int r;
734
735 assert(u);
736
737 if (!u->cgroup_path)
738 return 0;
739
740 if (u->cgroup_inotify_wd >= 0)
741 return 0;
742
743 /* Only applies to the unified hierarchy */
744 r = cg_unified();
745 if (r < 0)
746 return log_unit_error_errno(u, r, "Failed detect wether the unified hierarchy is used: %m");
747 if (r == 0)
748 return 0;
749
750 /* Don't watch the root slice, it's pointless. */
751 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
752 return 0;
753
754 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
755 if (r < 0)
756 return log_oom();
757
758 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
759 if (r < 0)
760 return log_oom();
761
762 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
763 if (u->cgroup_inotify_wd < 0) {
764
765 if (errno == ENOENT) /* If the directory is already
766 * gone we don't need to track
767 * it, so this is not an error */
768 return 0;
769
770 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
771 }
772
773 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
774 if (r < 0)
775 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
776
777 return 0;
778}
779
780static int unit_create_cgroup(
781 Unit *u,
782 CGroupMask target_mask,
783 CGroupMask enable_mask) {
784
0cd385d3 785 CGroupContext *c;
bc432dc7 786 int r;
64747e2d 787
4ad49000 788 assert(u);
64747e2d 789
0cd385d3
LP
790 c = unit_get_cgroup_context(u);
791 if (!c)
792 return 0;
793
7b3fd631
LP
794 if (!u->cgroup_path) {
795 _cleanup_free_ char *path = NULL;
64747e2d 796
7b3fd631
LP
797 path = unit_default_cgroup_path(u);
798 if (!path)
799 return log_oom();
800
efdb0237
LP
801 r = unit_set_cgroup_path(u, path);
802 if (r == -EEXIST)
803 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
804 if (r < 0)
805 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
b58b8e11
HH
806 }
807
03b90d4b 808 /* First, create our own group */
efdb0237 809 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
23bbb0de 810 if (r < 0)
efdb0237
LP
811 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
812
813 /* Start watching it */
814 (void) unit_watch_cgroup(u);
815
816 /* Enable all controllers we need */
817 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
818 if (r < 0)
819 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
03b90d4b
LP
820
821 /* Keep track that this is now realized */
4ad49000 822 u->cgroup_realized = true;
efdb0237 823 u->cgroup_realized_mask = target_mask;
4ad49000 824
0cd385d3
LP
825 if (u->type != UNIT_SLICE && !c->delegate) {
826
827 /* Then, possibly move things over, but not if
828 * subgroups may contain processes, which is the case
829 * for slice and delegation units. */
830 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
831 if (r < 0)
efdb0237 832 log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
0cd385d3 833 }
03b90d4b 834
64747e2d
LP
835 return 0;
836}
837
7b3fd631
LP
838int unit_attach_pids_to_cgroup(Unit *u) {
839 int r;
840 assert(u);
841
842 r = unit_realize_cgroup(u);
843 if (r < 0)
844 return r;
845
846 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
847 if (r < 0)
848 return r;
849
850 return 0;
851}
852
efdb0237 853static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
bc432dc7
LP
854 assert(u);
855
efdb0237 856 return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
6414b7c9
DS
857}
858
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        assert(u);

        /* We're realizing it now, so drop it from the deferred queue. */
        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        if (unit_has_mask_realized(u, target_mask))
                return 0; /* nothing changed */

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        enable_mask = unit_get_enable_mask(u);
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, state);

        return 0;
}
898
4ad49000 899static void unit_add_to_cgroup_queue(Unit *u) {
ecedd90f 900
4ad49000
LP
901 if (u->in_cgroup_queue)
902 return;
8e274523 903
71fda00f 904 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
4ad49000
LP
905 u->in_cgroup_queue = true;
906}
8c6db833 907
/* Drain the manager's deferred cgroup realization queue, realizing each
 * queued unit for the current manager state.  Failures are logged and
 * skipped.  Returns the number of units processed.
 *
 * Note: unit_realize_cgroup_now() removes the unit from the queue itself,
 * which is what advances this loop. */
unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}
928
4ad49000
LP
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                /* Siblings are modelled as Before= dependencies of the
                 * containing slice. */
                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                /* Walk upward: queue the parents' siblings too. */
                u = slice;
        }
}
966
/* Public entry point: realize the cgroup for a unit (and, asynchronously,
 * its siblings).  Returns 0 on success or for units without cgroup
 * support, < 0 on failure. */
int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
990
efdb0237
LP
void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                /* Drop the path → unit mapping; mfree frees and returns NULL. */
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                /* Best effort: the watch may already be gone with the cgroup. */
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}
1009
1010void unit_prune_cgroup(Unit *u) {
8e274523 1011 int r;
efdb0237 1012 bool is_root_slice;
8e274523 1013
4ad49000 1014 assert(u);
8e274523 1015
efdb0237
LP
1016 /* Removes the cgroup, if empty and possible, and stops watching it. */
1017
4ad49000
LP
1018 if (!u->cgroup_path)
1019 return;
8e274523 1020
efdb0237
LP
1021 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1022
1023 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
dab5bf85 1024 if (r < 0) {
efdb0237 1025 log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
dab5bf85
RL
1026 return;
1027 }
8e274523 1028
efdb0237
LP
1029 if (is_root_slice)
1030 return;
1031
1032 unit_release_cgroup(u);
0a1eb06d 1033
4ad49000 1034 u->cgroup_realized = false;
bc432dc7 1035 u->cgroup_realized_mask = 0;
8e274523
LP
1036}
1037
efdb0237 1038int unit_search_main_pid(Unit *u, pid_t *ret) {
4ad49000
LP
1039 _cleanup_fclose_ FILE *f = NULL;
1040 pid_t pid = 0, npid, mypid;
efdb0237 1041 int r;
4ad49000
LP
1042
1043 assert(u);
efdb0237 1044 assert(ret);
4ad49000
LP
1045
1046 if (!u->cgroup_path)
efdb0237 1047 return -ENXIO;
4ad49000 1048
efdb0237
LP
1049 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1050 if (r < 0)
1051 return r;
4ad49000
LP
1052
1053 mypid = getpid();
1054 while (cg_read_pid(f, &npid) > 0) {
1055 pid_t ppid;
1056
1057 if (npid == pid)
1058 continue;
8e274523 1059
4ad49000
LP
1060 /* Ignore processes that aren't our kids */
1061 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
1062 continue;
8e274523 1063
efdb0237 1064 if (pid != 0)
4ad49000
LP
1065 /* Dang, there's more than one daemonized PID
1066 in this group, so we don't know what process
1067 is the main process. */
efdb0237
LP
1068
1069 return -ENODATA;
8e274523 1070
4ad49000 1071 pid = npid;
8e274523
LP
1072 }
1073
efdb0237
LP
1074 *ret = pid;
1075 return 0;
1076}
1077
/* Recursively adds every PID found in the cgroup at @path (and all of
 * its subgroups) to the unit's watched-PID set via unit_watch_pid().
 *
 * Error handling is best-effort: the first error encountered is
 * remembered in `ret` and returned at the end, but the walk keeps
 * going so as many PIDs as possible get watched. The only hard stop
 * is -ENOMEM while building a child path. */
static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        /* Keep only the first error; keep iterating. */
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                /* cg_read_pid() itself may have failed mid-iteration. */
                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        /* cg_read_subgroup() hands us ownership of fn;
                         * free it as soon as it is joined into p. */
                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        /* Recurse into the child group. */
                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}
1129
1130int unit_watch_all_pids(Unit *u) {
1131 assert(u);
1132
1133 /* Adds all PIDs from our cgroup to the set of PIDs we
1134 * watch. This is a fallback logic for cases where we do not
1135 * get reliable cgroup empty notifications: we try to use
1136 * SIGCHLD as replacement. */
1137
1138 if (!u->cgroup_path)
1139 return -ENOENT;
1140
1141 if (cg_unified() > 0) /* On unified we can use proper notifications */
1142 return 0;
1143
1144 return unit_watch_pids_in_path(u, u->cgroup_path);
1145}
1146
1147int unit_notify_cgroup_empty(Unit *u) {
1148 int r;
1149
1150 assert(u);
1151
1152 if (!u->cgroup_path)
1153 return 0;
1154
1155 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1156 if (r <= 0)
1157 return r;
1158
1159 unit_add_to_gc_queue(u);
1160
1161 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1162 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1163
1164 return 0;
1165}
1166
/* sd-event I/O handler for the manager's cgroup inotify fd (set up in
 * manager_setup_cgroup()). Drains all pending inotify events and, for
 * each watch that maps back to a unit, checks whether that unit's
 * cgroup has become empty. Returns 0 once the fd is drained (EAGAIN),
 * or a negative errno on read failure. */
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        /* EAGAIN: fully drained (fd is non-blocking). */
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}
1210
8e274523 1211int manager_setup_cgroup(Manager *m) {
9444b1f2 1212 _cleanup_free_ char *path = NULL;
efdb0237
LP
1213 CGroupController c;
1214 int r, unified;
1215 char *e;
8e274523
LP
1216
1217 assert(m);
1218
35d2e7ec 1219 /* 1. Determine hierarchy */
efdb0237 1220 m->cgroup_root = mfree(m->cgroup_root);
9444b1f2 1221 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
23bbb0de
MS
1222 if (r < 0)
1223 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
8e274523 1224
efdb0237
LP
1225 /* Chop off the init scope, if we are already located in it */
1226 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
0d8c31ff 1227
efdb0237
LP
1228 /* LEGACY: Also chop off the system slice if we are in
1229 * it. This is to support live upgrades from older systemd
1230 * versions where PID 1 was moved there. Also see
1231 * cg_get_root_path(). */
1232 if (!e && m->running_as == MANAGER_SYSTEM) {
9444b1f2 1233 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
15c60e99 1234 if (!e)
efdb0237 1235 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
0baf24dd 1236 }
efdb0237
LP
1237 if (e)
1238 *e = 0;
7ccfb64a 1239
9444b1f2
LP
1240 /* And make sure to store away the root value without trailing
1241 * slash, even for the root dir, so that we can easily prepend
1242 * it everywhere. */
efdb0237
LP
1243 while ((e = endswith(m->cgroup_root, "/")))
1244 *e = 0;
8e274523 1245
35d2e7ec 1246 /* 2. Show data */
9444b1f2 1247 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
23bbb0de
MS
1248 if (r < 0)
1249 return log_error_errno(r, "Cannot find cgroup mount point: %m");
8e274523 1250
efdb0237
LP
1251 unified = cg_unified();
1252 if (unified < 0)
1253 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
1254 if (unified > 0)
1255 log_debug("Unified cgroup hierarchy is located at %s.", path);
1256 else
1257 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
1258
0d8c31ff 1259 if (!m->test_run) {
efdb0237 1260 const char *scope_path;
c6c18be3 1261
0d8c31ff 1262 /* 3. Install agent */
efdb0237
LP
1263 if (unified) {
1264
1265 /* In the unified hierarchy we can can get
1266 * cgroup empty notifications via inotify. */
1267
1268 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1269 safe_close(m->cgroup_inotify_fd);
1270
1271 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
1272 if (m->cgroup_inotify_fd < 0)
1273 return log_error_errno(errno, "Failed to create control group inotify object: %m");
1274
1275 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
1276 if (r < 0)
1277 return log_error_errno(r, "Failed to watch control group inotify object: %m");
1278
1279 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
1280 if (r < 0)
1281 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
1282
1283 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
1284
1285 } else if (m->running_as == MANAGER_SYSTEM) {
1286
1287 /* On the legacy hierarchy we only get
1288 * notifications via cgroup agents. (Which
1289 * isn't really reliable, since it does not
1290 * generate events when control groups with
1291 * children run empty. */
1292
0d8c31ff
ZJS
1293 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
1294 if (r < 0)
da927ba9 1295 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
0d8c31ff
ZJS
1296 else if (r > 0)
1297 log_debug("Installed release agent.");
efdb0237 1298 else if (r == 0)
0d8c31ff
ZJS
1299 log_debug("Release agent already installed.");
1300 }
8e274523 1301
efdb0237
LP
1302 /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
1303 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1304 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
23bbb0de 1305 if (r < 0)
efdb0237
LP
1306 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
1307
1308 /* also, move all other userspace processes remaining
1309 * in the root cgroup into that scope. */
1310 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
1311 if (r < 0)
1312 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
c6c18be3 1313
0d8c31ff
ZJS
1314 /* 5. And pin it, so that it cannot be unmounted */
1315 safe_close(m->pin_cgroupfs_fd);
0d8c31ff 1316 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1317 if (m->pin_cgroupfs_fd < 0)
1318 return log_error_errno(errno, "Failed to open pin file: %m");
0d8c31ff 1319
cc98b302 1320 /* 6. Always enable hierarchical support if it exists... */
efdb0237
LP
1321 if (!unified)
1322 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
c6c18be3
LP
1323 }
1324
0d8c31ff 1325 /* 7. Figure out which controllers are supported */
efdb0237
LP
1326 r = cg_mask_supported(&m->cgroup_supported);
1327 if (r < 0)
1328 return log_error_errno(r, "Failed to determine supported controllers: %m");
1329
1330 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
1331 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & c));
9156e799 1332
a32360f1 1333 return 0;
8e274523
LP
1334}
1335
c6c18be3 1336void manager_shutdown_cgroup(Manager *m, bool delete) {
8e274523
LP
1337 assert(m);
1338
9444b1f2
LP
1339 /* We can't really delete the group, since we are in it. But
1340 * let's trim it. */
1341 if (delete && m->cgroup_root)
efdb0237
LP
1342 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
1343
1344 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
1345
1346 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1347 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
8e274523 1348
03e334a1 1349 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
c6c18be3 1350
efdb0237 1351 m->cgroup_root = mfree(m->cgroup_root);
8e274523
LP
1352}
1353
4ad49000 1354Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
acb14d31 1355 char *p;
4ad49000 1356 Unit *u;
acb14d31
LP
1357
1358 assert(m);
1359 assert(cgroup);
acb14d31 1360
4ad49000
LP
1361 u = hashmap_get(m->cgroup_unit, cgroup);
1362 if (u)
1363 return u;
acb14d31 1364
8e70580b 1365 p = strdupa(cgroup);
acb14d31
LP
1366 for (;;) {
1367 char *e;
1368
1369 e = strrchr(p, '/');
efdb0237
LP
1370 if (!e || e == p)
1371 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
acb14d31
LP
1372
1373 *e = 0;
1374
4ad49000
LP
1375 u = hashmap_get(m->cgroup_unit, p);
1376 if (u)
1377 return u;
acb14d31
LP
1378 }
1379}
1380
4ad49000
LP
1381Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1382 _cleanup_free_ char *cgroup = NULL;
5fe8876b 1383 Unit *u;
acb14d31 1384 int r;
8e274523 1385
8c47c732
LP
1386 assert(m);
1387
efdb0237 1388 if (pid <= 0)
8c47c732
LP
1389 return NULL;
1390
efdb0237
LP
1391 if (pid == 1)
1392 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
1393
5fe8876b
LP
1394 u = hashmap_get(m->watch_pids1, LONG_TO_PTR(pid));
1395 if (u)
1396 return u;
1397
1398 u = hashmap_get(m->watch_pids2, LONG_TO_PTR(pid));
1399 if (u)
1400 return u;
1401
4ad49000
LP
1402 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1403 if (r < 0)
6dde1f33
LP
1404 return NULL;
1405
4ad49000 1406 return manager_get_unit_by_cgroup(m, cgroup);
6dde1f33 1407}
4fbf50b3 1408
4ad49000
LP
1409int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1410 Unit *u;
4fbf50b3 1411
4ad49000
LP
1412 assert(m);
1413 assert(cgroup);
4fbf50b3 1414
4ad49000 1415 u = manager_get_unit_by_cgroup(m, cgroup);
5ad096b3
LP
1416 if (!u)
1417 return 0;
b56c28c3 1418
efdb0237 1419 return unit_notify_cgroup_empty(u);
5ad096b3
LP
1420}
1421
1422int unit_get_memory_current(Unit *u, uint64_t *ret) {
1423 _cleanup_free_ char *v = NULL;
1424 int r;
1425
1426 assert(u);
1427 assert(ret);
1428
1429 if (!u->cgroup_path)
1430 return -ENODATA;
1431
efdb0237 1432 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
5ad096b3
LP
1433 return -ENODATA;
1434
efdb0237
LP
1435 if (cg_unified() <= 0)
1436 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
1437 else
1438 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
5ad096b3
LP
1439 if (r == -ENOENT)
1440 return -ENODATA;
1441 if (r < 0)
1442 return r;
1443
1444 return safe_atou64(v, ret);
1445}
1446
1447static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
1448 _cleanup_free_ char *v = NULL;
1449 uint64_t ns;
1450 int r;
1451
1452 assert(u);
1453 assert(ret);
1454
1455 if (!u->cgroup_path)
1456 return -ENODATA;
1457
efdb0237 1458 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
5ad096b3
LP
1459 return -ENODATA;
1460
1461 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
1462 if (r == -ENOENT)
1463 return -ENODATA;
1464 if (r < 0)
1465 return r;
1466
1467 r = safe_atou64(v, &ns);
1468 if (r < 0)
1469 return r;
1470
1471 *ret = ns;
1472 return 0;
1473}
1474
1475int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
1476 nsec_t ns;
1477 int r;
1478
1479 r = unit_get_cpu_usage_raw(u, &ns);
1480 if (r < 0)
1481 return r;
1482
1483 if (ns > u->cpuacct_usage_base)
1484 ns -= u->cpuacct_usage_base;
1485 else
1486 ns = 0;
1487
1488 *ret = ns;
1489 return 0;
1490}
1491
1492int unit_reset_cpu_usage(Unit *u) {
1493 nsec_t ns;
1494 int r;
1495
1496 assert(u);
1497
1498 r = unit_get_cpu_usage_raw(u, &ns);
1499 if (r < 0) {
1500 u->cpuacct_usage_base = 0;
1501 return r;
b56c28c3 1502 }
2633eb83 1503
5ad096b3 1504 u->cpuacct_usage_base = ns;
4ad49000 1505 return 0;
4fbf50b3
LP
1506}
1507
e9db43d5
LP
1508bool unit_cgroup_delegate(Unit *u) {
1509 CGroupContext *c;
1510
1511 assert(u);
1512
1513 c = unit_get_cgroup_context(u);
1514 if (!c)
1515 return false;
1516
1517 return c->delegate;
1518}
1519
4ad49000
LP
/* String names for CGroupDevicePolicy values, as used in unit files
 * and serialization; the lookup functions are generated below. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);