]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/cgroup.c
Merge pull request #1190 from poettering/rework-virt
[thirdparty/systemd.git] / src / core / cgroup.c
CommitLineData
d6c9574f 1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
8e274523
LP
2
3/***
4 This file is part of systemd.
5
4ad49000 6 Copyright 2013 Lennart Poettering
8e274523
LP
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
8e274523
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
8e274523 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
8e274523
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
c6c18be3 22#include <fcntl.h>
e41969e3 23#include <fnmatch.h>
8c6db833 24
0b452006 25#include "process-util.h"
9eb977db 26#include "path-util.h"
9444b1f2 27#include "special.h"
4ad49000
LP
28#include "cgroup-util.h"
29#include "cgroup.h"
8e274523 30
9a054909
LP
31#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
32
4ad49000
LP
33void cgroup_context_init(CGroupContext *c) {
34 assert(c);
35
36 /* Initialize everything to the kernel defaults, assuming the
37 * structure is preinitialized to 0 */
38
db785129
LP
39 c->cpu_shares = (unsigned long) -1;
40 c->startup_cpu_shares = (unsigned long) -1;
ddca82ac 41 c->memory_limit = (uint64_t) -1;
db785129
LP
42 c->blockio_weight = (unsigned long) -1;
43 c->startup_blockio_weight = (unsigned long) -1;
b2f8b02e 44
3a43da28 45 c->cpu_quota_per_sec_usec = USEC_INFINITY;
4ad49000 46}
8e274523 47
4ad49000
LP
48void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
49 assert(c);
50 assert(a);
51
71fda00f 52 LIST_REMOVE(device_allow, c->device_allow, a);
4ad49000
LP
53 free(a->path);
54 free(a);
55}
56
57void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
58 assert(c);
59 assert(w);
60
71fda00f 61 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
4ad49000
LP
62 free(w->path);
63 free(w);
64}
65
66void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
67 assert(c);
8e274523 68 assert(b);
8e274523 69
71fda00f 70 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
4ad49000
LP
71 free(b->path);
72 free(b);
73}
74
75void cgroup_context_done(CGroupContext *c) {
76 assert(c);
77
78 while (c->blockio_device_weights)
79 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
80
81 while (c->blockio_device_bandwidths)
82 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
83
84 while (c->device_allow)
85 cgroup_context_free_device_allow(c, c->device_allow);
86}
87
88void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
89 CGroupBlockIODeviceBandwidth *b;
90 CGroupBlockIODeviceWeight *w;
91 CGroupDeviceAllow *a;
9a054909 92 char u[FORMAT_TIMESPAN_MAX];
4ad49000
LP
93
94 assert(c);
95 assert(f);
96
97 prefix = strempty(prefix);
98
99 fprintf(f,
100 "%sCPUAccounting=%s\n"
101 "%sBlockIOAccounting=%s\n"
102 "%sMemoryAccounting=%s\n"
103 "%sCPUShares=%lu\n"
95ae05c0 104 "%sStartupCPUShares=%lu\n"
b2f8b02e 105 "%sCPUQuotaPerSecSec=%s\n"
112a7f46 106 "%sBlockIOWeight=%lu\n"
95ae05c0 107 "%sStartupBlockIOWeight=%lu\n"
4ad49000 108 "%sMemoryLimit=%" PRIu64 "\n"
a931ad47
LP
109 "%sDevicePolicy=%s\n"
110 "%sDelegate=%s\n",
4ad49000
LP
111 prefix, yes_no(c->cpu_accounting),
112 prefix, yes_no(c->blockio_accounting),
113 prefix, yes_no(c->memory_accounting),
114 prefix, c->cpu_shares,
95ae05c0 115 prefix, c->startup_cpu_shares,
b1d6dcf5 116 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
4ad49000 117 prefix, c->blockio_weight,
95ae05c0 118 prefix, c->startup_blockio_weight,
4ad49000 119 prefix, c->memory_limit,
a931ad47
LP
120 prefix, cgroup_device_policy_to_string(c->device_policy),
121 prefix, yes_no(c->delegate));
4ad49000
LP
122
123 LIST_FOREACH(device_allow, a, c->device_allow)
124 fprintf(f,
125 "%sDeviceAllow=%s %s%s%s\n",
126 prefix,
127 a->path,
128 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
129
130 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
131 fprintf(f,
8e7076ca 132 "%sBlockIODeviceWeight=%s %lu",
4ad49000
LP
133 prefix,
134 w->path,
135 w->weight);
136
137 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
138 char buf[FORMAT_BYTES_MAX];
139
140 fprintf(f,
141 "%s%s=%s %s\n",
142 prefix,
143 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
144 b->path,
145 format_bytes(buf, sizeof(buf), b->bandwidth));
146 }
147}
148
/* Determines the device number to use for block IO settings on the
 * path 'p' and stores it in *dev.
 *
 * Returns 0 on success, the negative error of stat() if the path
 * cannot be inspected, and -ENODEV if no device can be derived. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;

        assert(p);
        assert(dev);

        if (stat(p, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        /* A block device node names its device directly */
        if (S_ISBLK(st.st_mode)) {
                *dev = st.st_rdev;
                return 0;
        }

        /* Not a device node, and the containing file system has no
         * usable backing device number: nothing we can do */
        if (major(st.st_dev) == 0) {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        /* Otherwise use the block device this file is stored on ... */
        *dev = st.st_dev;

        /* ... and, if that is a partition, try to get the originating
         * block device. The result of this call is not checked, as
         * before. */
        block_get_whole_disk(*dev, dev);

        return 0;
}
177
4ad49000
LP
178static int whitelist_device(const char *path, const char *node, const char *acc) {
179 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
180 struct stat st;
8c6db833 181 int r;
8e274523 182
4ad49000
LP
183 assert(path);
184 assert(acc);
8e274523 185
4ad49000
LP
186 if (stat(node, &st) < 0) {
187 log_warning("Couldn't stat device %s", node);
188 return -errno;
189 }
190
191 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
192 log_warning("%s is not a device.", node);
193 return -ENODEV;
194 }
195
196 sprintf(buf,
197 "%c %u:%u %s",
198 S_ISCHR(st.st_mode) ? 'c' : 'b',
199 major(st.st_rdev), minor(st.st_rdev),
200 acc);
201
202 r = cg_set_attribute("devices", path, "devices.allow", buf);
1aeab12b 203 if (r < 0)
714e2e1d
LP
204 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
205 "Failed to set devices.allow on %s: %m", path);
4ad49000
LP
206
207 return r;
8e274523
LP
208}
209
90060676
LP
210static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
211 _cleanup_fclose_ FILE *f = NULL;
212 char line[LINE_MAX];
213 bool good = false;
214 int r;
215
216 assert(path);
217 assert(acc);
218 assert(type == 'b' || type == 'c');
219
220 f = fopen("/proc/devices", "re");
4a62c710
MS
221 if (!f)
222 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
90060676
LP
223
224 FOREACH_LINE(line, f, goto fail) {
225 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
226 unsigned maj;
227
228 truncate_nl(line);
229
230 if (type == 'c' && streq(line, "Character devices:")) {
231 good = true;
232 continue;
233 }
234
235 if (type == 'b' && streq(line, "Block devices:")) {
236 good = true;
237 continue;
238 }
239
240 if (isempty(line)) {
241 good = false;
242 continue;
243 }
244
245 if (!good)
246 continue;
247
248 p = strstrip(line);
249
250 w = strpbrk(p, WHITESPACE);
251 if (!w)
252 continue;
253 *w = 0;
254
255 r = safe_atou(p, &maj);
256 if (r < 0)
257 continue;
258 if (maj <= 0)
259 continue;
260
261 w++;
262 w += strspn(w, WHITESPACE);
e41969e3
LP
263
264 if (fnmatch(name, w, 0) != 0)
90060676
LP
265 continue;
266
267 sprintf(buf,
268 "%c %u:* %s",
269 type,
270 maj,
271 acc);
272
273 r = cg_set_attribute("devices", path, "devices.allow", buf);
1aeab12b 274 if (r < 0)
714e2e1d
LP
275 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
276 "Failed to set devices.allow on %s: %m", path);
90060676
LP
277 }
278
279 return 0;
280
281fail:
56f64d95 282 log_warning_errno(errno, "Failed to read /proc/devices: %m");
90060676
LP
283 return -errno;
284}
285
efdb0237 286void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state) {
01efdf13 287 bool is_root;
4ad49000
LP
288 int r;
289
290 assert(c);
291 assert(path);
8e274523 292
4ad49000
LP
293 if (mask == 0)
294 return;
8e274523 295
71c26873 296 /* Some cgroup attributes are not supported on the root cgroup,
01efdf13
LP
297 * hence silently ignore */
298 is_root = isempty(path) || path_equal(path, "/");
6da13913
ZJS
299 if (is_root)
300 /* Make sure we don't try to display messages with an empty path. */
301 path = "/";
01efdf13 302
714e2e1d
LP
303 /* We generally ignore errors caused by read-only mounted
304 * cgroup trees (assuming we are running in a container then),
305 * and missing cgroups, i.e. EROFS and ENOENT. */
306
efdb0237 307 if ((mask & CGROUP_MASK_CPU) && !is_root) {
b2f8b02e 308 char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
8e274523 309
db785129 310 sprintf(buf, "%lu\n",
d81afec1 311 IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
db785129 312 c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
4ad49000 313 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
1aeab12b 314 if (r < 0)
714e2e1d
LP
315 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
316 "Failed to set cpu.shares on %s: %m", path);
b2f8b02e 317
9a054909 318 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
b2f8b02e 319 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
1aeab12b 320 if (r < 0)
714e2e1d
LP
321 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
322 "Failed to set cpu.cfs_period_us on %s: %m", path);
b2f8b02e 323
3a43da28 324 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
9a054909 325 sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
b2f8b02e
LP
326 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
327 } else
328 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
1aeab12b 329 if (r < 0)
714e2e1d
LP
330 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
331 "Failed to set cpu.cfs_quota_us on %s: %m", path);
4ad49000
LP
332 }
333
efdb0237 334 if (mask & CGROUP_MASK_BLKIO) {
4ad49000
LP
335 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
336 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
337 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
338 CGroupBlockIODeviceWeight *w;
339 CGroupBlockIODeviceBandwidth *b;
340
01efdf13 341 if (!is_root) {
d81afec1 342 sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
db785129 343 c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
01efdf13 344 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
1aeab12b 345 if (r < 0)
714e2e1d
LP
346 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
347 "Failed to set blkio.weight on %s: %m", path);
4ad49000 348
01efdf13
LP
349 /* FIXME: no way to reset this list */
350 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
351 dev_t dev;
4ad49000 352
01efdf13
LP
353 r = lookup_blkio_device(w->path, &dev);
354 if (r < 0)
355 continue;
8e274523 356
01efdf13
LP
357 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
358 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
1aeab12b 359 if (r < 0)
714e2e1d
LP
360 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
361 "Failed to set blkio.weight_device on %s: %m", path);
01efdf13 362 }
4ad49000
LP
363 }
364
365 /* FIXME: no way to reset this list */
366 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
367 const char *a;
368 dev_t dev;
369
370 r = lookup_blkio_device(b->path, &dev);
371 if (r < 0)
372 continue;
373
374 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
375
376 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
377 r = cg_set_attribute("blkio", path, a, buf);
1aeab12b 378 if (r < 0)
714e2e1d
LP
379 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
380 "Failed to set %s on %s: %m", a, path);
d686d8a9 381 }
8e274523
LP
382 }
383
efdb0237 384 if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
6a94f2e9 385 if (c->memory_limit != (uint64_t) -1) {
e58cec11
LP
386 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
387
6a94f2e9 388 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
efdb0237
LP
389
390 if (cg_unified() <= 0)
391 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
392 else
393 r = cg_set_attribute("memory", path, "memory.max", buf);
394
395 } else {
396 if (cg_unified() <= 0)
397 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
398 else
399 r = cg_set_attribute("memory", path, "memory.max", "max");
400 }
8e274523 401
1aeab12b 402 if (r < 0)
714e2e1d 403 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
efdb0237 404 "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
4ad49000 405 }
8e274523 406
efdb0237 407 if ((mask & CGROUP_MASK_DEVICE) && !is_root) {
4ad49000 408 CGroupDeviceAllow *a;
8e274523 409
714e2e1d
LP
410 /* Changing the devices list of a populated cgroup
411 * might result in EINVAL, hence ignore EINVAL
412 * here. */
413
4ad49000
LP
414 if (c->device_allow || c->device_policy != CGROUP_AUTO)
415 r = cg_set_attribute("devices", path, "devices.deny", "a");
416 else
417 r = cg_set_attribute("devices", path, "devices.allow", "a");
1aeab12b 418 if (r < 0)
714e2e1d
LP
419 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
420 "Failed to reset devices.list on %s: %m", path);
fb385181 421
4ad49000
LP
422 if (c->device_policy == CGROUP_CLOSED ||
423 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
424 static const char auto_devices[] =
7d711efb
LP
425 "/dev/null\0" "rwm\0"
426 "/dev/zero\0" "rwm\0"
427 "/dev/full\0" "rwm\0"
428 "/dev/random\0" "rwm\0"
429 "/dev/urandom\0" "rwm\0"
430 "/dev/tty\0" "rwm\0"
431 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
4ad49000
LP
432
433 const char *x, *y;
434
435 NULSTR_FOREACH_PAIR(x, y, auto_devices)
436 whitelist_device(path, x, y);
7d711efb
LP
437
438 whitelist_major(path, "pts", 'c', "rw");
439 whitelist_major(path, "kdbus", 'c', "rw");
440 whitelist_major(path, "kdbus/*", 'c', "rw");
4ad49000
LP
441 }
442
443 LIST_FOREACH(device_allow, a, c->device_allow) {
444 char acc[4];
445 unsigned k = 0;
446
447 if (a->r)
448 acc[k++] = 'r';
449 if (a->w)
450 acc[k++] = 'w';
451 if (a->m)
452 acc[k++] = 'm';
fb385181 453
4ad49000
LP
454 if (k == 0)
455 continue;
fb385181 456
4ad49000 457 acc[k++] = 0;
90060676
LP
458
459 if (startswith(a->path, "/dev/"))
460 whitelist_device(path, a->path, acc);
461 else if (startswith(a->path, "block-"))
462 whitelist_major(path, a->path + 6, 'b', acc);
463 else if (startswith(a->path, "char-"))
464 whitelist_major(path, a->path + 5, 'c', acc);
465 else
466 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
4ad49000
LP
467 }
468 }
fb385181
LP
469}
470
efdb0237
LP
471CGroupMask cgroup_context_get_mask(CGroupContext *c) {
472 CGroupMask mask = 0;
8e274523 473
4ad49000 474 /* Figure out which controllers we need */
8e274523 475
b2f8b02e 476 if (c->cpu_accounting ||
db785129
LP
477 c->cpu_shares != (unsigned long) -1 ||
478 c->startup_cpu_shares != (unsigned long) -1 ||
3a43da28 479 c->cpu_quota_per_sec_usec != USEC_INFINITY)
efdb0237 480 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
ecedd90f 481
4ad49000 482 if (c->blockio_accounting ||
db785129
LP
483 c->blockio_weight != (unsigned long) -1 ||
484 c->startup_blockio_weight != (unsigned long) -1 ||
4ad49000 485 c->blockio_device_weights ||
db785129 486 c->blockio_device_bandwidths)
efdb0237 487 mask |= CGROUP_MASK_BLKIO;
ecedd90f 488
4ad49000 489 if (c->memory_accounting ||
ddca82ac 490 c->memory_limit != (uint64_t) -1)
efdb0237 491 mask |= CGROUP_MASK_MEMORY;
8e274523 492
a931ad47
LP
493 if (c->device_allow ||
494 c->device_policy != CGROUP_AUTO)
efdb0237 495 mask |= CGROUP_MASK_DEVICE;
4ad49000
LP
496
497 return mask;
8e274523
LP
498}
499
efdb0237 500CGroupMask unit_get_own_mask(Unit *u) {
4ad49000 501 CGroupContext *c;
8e274523 502
efdb0237
LP
503 /* Returns the mask of controllers the unit needs for itself */
504
4ad49000
LP
505 c = unit_get_cgroup_context(u);
506 if (!c)
507 return 0;
8e274523 508
a931ad47 509 /* If delegation is turned on, then turn on all cgroups,
19af675e
LP
510 * unless we are on the legacy hierarchy and the process we
511 * fork into it is known to drop privileges, and hence
512 * shouldn't get access to the controllers.
513 *
514 * Note that on the unified hierarchy it is safe to delegate
515 * controllers to unprivileged services. */
a931ad47
LP
516
517 if (c->delegate) {
518 ExecContext *e;
519
520 e = unit_get_exec_context(u);
19af675e
LP
521 if (!e ||
522 exec_context_maintains_privileges(e) ||
523 cg_unified() > 0)
efdb0237 524 return _CGROUP_MASK_ALL;
a931ad47
LP
525 }
526
db785129 527 return cgroup_context_get_mask(c);
8e274523
LP
528}
529
efdb0237 530CGroupMask unit_get_members_mask(Unit *u) {
4ad49000 531 assert(u);
bc432dc7 532
efdb0237
LP
533 /* Returns the mask of controllers all of the unit's children
534 * require, merged */
535
bc432dc7
LP
536 if (u->cgroup_members_mask_valid)
537 return u->cgroup_members_mask;
538
539 u->cgroup_members_mask = 0;
540
541 if (u->type == UNIT_SLICE) {
542 Unit *member;
543 Iterator i;
544
545 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
546
547 if (member == u)
548 continue;
549
d4fdc205 550 if (UNIT_DEREF(member->slice) != u)
bc432dc7
LP
551 continue;
552
553 u->cgroup_members_mask |=
efdb0237 554 unit_get_own_mask(member) |
bc432dc7
LP
555 unit_get_members_mask(member);
556 }
557 }
558
559 u->cgroup_members_mask_valid = true;
6414b7c9 560 return u->cgroup_members_mask;
246aa6dd
LP
561}
562
efdb0237 563CGroupMask unit_get_siblings_mask(Unit *u) {
4ad49000 564 assert(u);
246aa6dd 565
efdb0237
LP
566 /* Returns the mask of controllers all of the unit's siblings
567 * require, i.e. the members mask of the unit's parent slice
568 * if there is one. */
569
bc432dc7 570 if (UNIT_ISSET(u->slice))
637f421e 571 return unit_get_members_mask(UNIT_DEREF(u->slice));
4ad49000 572
efdb0237 573 return unit_get_own_mask(u) | unit_get_members_mask(u);
246aa6dd
LP
574}
575
efdb0237
LP
576CGroupMask unit_get_subtree_mask(Unit *u) {
577
578 /* Returns the mask of this subtree, meaning of the group
579 * itself and its children. */
580
581 return unit_get_own_mask(u) | unit_get_members_mask(u);
582}
583
584CGroupMask unit_get_target_mask(Unit *u) {
585 CGroupMask mask;
586
587 /* This returns the cgroup mask of all controllers to enable
588 * for a specific cgroup, i.e. everything it needs itself,
589 * plus all that its children need, plus all that its siblings
590 * need. This is primarily useful on the legacy cgroup
591 * hierarchy, where we need to duplicate each cgroup in each
592 * hierarchy that shall be enabled for it. */
6414b7c9 593
efdb0237
LP
594 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
595 mask &= u->manager->cgroup_supported;
596
597 return mask;
598}
599
600CGroupMask unit_get_enable_mask(Unit *u) {
601 CGroupMask mask;
602
603 /* This returns the cgroup mask of all controllers to enable
604 * for the children of a specific cgroup. This is primarily
605 * useful for the unified cgroup hierarchy, where each cgroup
606 * controls which controllers are enabled for its children. */
607
608 mask = unit_get_members_mask(u);
6414b7c9
DS
609 mask &= u->manager->cgroup_supported;
610
611 return mask;
612}
613
614/* Recurse from a unit up through its containing slices, propagating
615 * mask bits upward. A unit is also member of itself. */
bc432dc7 616void unit_update_cgroup_members_masks(Unit *u) {
efdb0237 617 CGroupMask m;
bc432dc7
LP
618 bool more;
619
620 assert(u);
621
622 /* Calculate subtree mask */
efdb0237 623 m = unit_get_subtree_mask(u);
bc432dc7
LP
624
625 /* See if anything changed from the previous invocation. If
626 * not, we're done. */
627 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
628 return;
629
630 more =
631 u->cgroup_subtree_mask_valid &&
632 ((m & ~u->cgroup_subtree_mask) != 0) &&
633 ((~m & u->cgroup_subtree_mask) == 0);
634
635 u->cgroup_subtree_mask = m;
636 u->cgroup_subtree_mask_valid = true;
637
6414b7c9
DS
638 if (UNIT_ISSET(u->slice)) {
639 Unit *s = UNIT_DEREF(u->slice);
bc432dc7
LP
640
641 if (more)
642 /* There's more set now than before. We
643 * propagate the new mask to the parent's mask
644 * (not caring if it actually was valid or
645 * not). */
646
647 s->cgroup_members_mask |= m;
648
649 else
650 /* There's less set now than before (or we
651 * don't know), we need to recalculate
652 * everything, so let's invalidate the
653 * parent's members mask */
654
655 s->cgroup_members_mask_valid = false;
656
657 /* And now make sure that this change also hits our
658 * grandparents */
659 unit_update_cgroup_members_masks(s);
6414b7c9
DS
660 }
661}
662
efdb0237 663static const char *migrate_callback(CGroupMask mask, void *userdata) {
03b90d4b
LP
664 Unit *u = userdata;
665
666 assert(mask != 0);
667 assert(u);
668
669 while (u) {
670 if (u->cgroup_path &&
671 u->cgroup_realized &&
672 (u->cgroup_realized_mask & mask) == mask)
673 return u->cgroup_path;
674
675 u = UNIT_DEREF(u->slice);
676 }
677
678 return NULL;
679}
680
efdb0237
LP
681char *unit_default_cgroup_path(Unit *u) {
682 _cleanup_free_ char *escaped = NULL, *slice = NULL;
683 int r;
684
685 assert(u);
686
687 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
688 return strdup(u->manager->cgroup_root);
689
690 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
691 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
692 if (r < 0)
693 return NULL;
694 }
695
696 escaped = cg_escape(u->id);
697 if (!escaped)
698 return NULL;
699
700 if (slice)
701 return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
702 else
703 return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
704}
705
706int unit_set_cgroup_path(Unit *u, const char *path) {
707 _cleanup_free_ char *p = NULL;
708 int r;
709
710 assert(u);
711
712 if (path) {
713 p = strdup(path);
714 if (!p)
715 return -ENOMEM;
716 } else
717 p = NULL;
718
719 if (streq_ptr(u->cgroup_path, p))
720 return 0;
721
722 if (p) {
723 r = hashmap_put(u->manager->cgroup_unit, p, u);
724 if (r < 0)
725 return r;
726 }
727
728 unit_release_cgroup(u);
729
730 u->cgroup_path = p;
731 p = NULL;
732
733 return 1;
734}
735
736int unit_watch_cgroup(Unit *u) {
737 _cleanup_free_ char *populated = NULL;
738 int r;
739
740 assert(u);
741
742 if (!u->cgroup_path)
743 return 0;
744
745 if (u->cgroup_inotify_wd >= 0)
746 return 0;
747
748 /* Only applies to the unified hierarchy */
749 r = cg_unified();
750 if (r < 0)
751 return log_unit_error_errno(u, r, "Failed detect wether the unified hierarchy is used: %m");
752 if (r == 0)
753 return 0;
754
755 /* Don't watch the root slice, it's pointless. */
756 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
757 return 0;
758
759 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
760 if (r < 0)
761 return log_oom();
762
763 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
764 if (r < 0)
765 return log_oom();
766
767 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
768 if (u->cgroup_inotify_wd < 0) {
769
770 if (errno == ENOENT) /* If the directory is already
771 * gone we don't need to track
772 * it, so this is not an error */
773 return 0;
774
775 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
776 }
777
778 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
779 if (r < 0)
780 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
781
782 return 0;
783}
784
785static int unit_create_cgroup(
786 Unit *u,
787 CGroupMask target_mask,
788 CGroupMask enable_mask) {
789
0cd385d3 790 CGroupContext *c;
bc432dc7 791 int r;
64747e2d 792
4ad49000 793 assert(u);
64747e2d 794
0cd385d3
LP
795 c = unit_get_cgroup_context(u);
796 if (!c)
797 return 0;
798
7b3fd631
LP
799 if (!u->cgroup_path) {
800 _cleanup_free_ char *path = NULL;
64747e2d 801
7b3fd631
LP
802 path = unit_default_cgroup_path(u);
803 if (!path)
804 return log_oom();
805
efdb0237
LP
806 r = unit_set_cgroup_path(u, path);
807 if (r == -EEXIST)
808 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
809 if (r < 0)
810 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
b58b8e11
HH
811 }
812
03b90d4b 813 /* First, create our own group */
efdb0237 814 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
23bbb0de 815 if (r < 0)
efdb0237
LP
816 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
817
818 /* Start watching it */
819 (void) unit_watch_cgroup(u);
820
821 /* Enable all controllers we need */
822 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
823 if (r < 0)
824 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
03b90d4b
LP
825
826 /* Keep track that this is now realized */
4ad49000 827 u->cgroup_realized = true;
efdb0237 828 u->cgroup_realized_mask = target_mask;
4ad49000 829
0cd385d3
LP
830 if (u->type != UNIT_SLICE && !c->delegate) {
831
832 /* Then, possibly move things over, but not if
833 * subgroups may contain processes, which is the case
834 * for slice and delegation units. */
835 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
836 if (r < 0)
efdb0237 837 log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
0cd385d3 838 }
03b90d4b 839
64747e2d
LP
840 return 0;
841}
842
7b3fd631
LP
843int unit_attach_pids_to_cgroup(Unit *u) {
844 int r;
845 assert(u);
846
847 r = unit_realize_cgroup(u);
848 if (r < 0)
849 return r;
850
851 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
852 if (r < 0)
853 return r;
854
855 return 0;
856}
857
efdb0237 858static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
bc432dc7
LP
859 assert(u);
860
efdb0237 861 return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
6414b7c9
DS
862}
863
864/* Check if necessary controllers and attributes for a unit are in place.
865 *
866 * If so, do nothing.
867 * If not, create paths, move processes over, and set attributes.
868 *
869 * Returns 0 on success and < 0 on failure. */
db785129 870static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
efdb0237 871 CGroupMask target_mask, enable_mask;
6414b7c9 872 int r;
64747e2d 873
4ad49000 874 assert(u);
64747e2d 875
4ad49000 876 if (u->in_cgroup_queue) {
71fda00f 877 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
4ad49000
LP
878 u->in_cgroup_queue = false;
879 }
64747e2d 880
efdb0237
LP
881 target_mask = unit_get_target_mask(u);
882 if (unit_has_mask_realized(u, target_mask))
0a1eb06d 883 return 0;
64747e2d 884
4ad49000 885 /* First, realize parents */
6414b7c9 886 if (UNIT_ISSET(u->slice)) {
db785129 887 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
6414b7c9
DS
888 if (r < 0)
889 return r;
890 }
4ad49000
LP
891
892 /* And then do the real work */
efdb0237
LP
893 enable_mask = unit_get_enable_mask(u);
894 r = unit_create_cgroup(u, target_mask, enable_mask);
6414b7c9
DS
895 if (r < 0)
896 return r;
897
898 /* Finally, apply the necessary attributes. */
efdb0237 899 cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, state);
6414b7c9
DS
900
901 return 0;
64747e2d
LP
902}
903
4ad49000 904static void unit_add_to_cgroup_queue(Unit *u) {
ecedd90f 905
4ad49000
LP
906 if (u->in_cgroup_queue)
907 return;
8e274523 908
71fda00f 909 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
4ad49000
LP
910 u->in_cgroup_queue = true;
911}
8c6db833 912
4ad49000 913unsigned manager_dispatch_cgroup_queue(Manager *m) {
db785129 914 ManagerState state;
4ad49000 915 unsigned n = 0;
db785129 916 Unit *i;
6414b7c9 917 int r;
ecedd90f 918
db785129
LP
919 state = manager_state(m);
920
4ad49000
LP
921 while ((i = m->cgroup_queue)) {
922 assert(i->in_cgroup_queue);
ecedd90f 923
db785129 924 r = unit_realize_cgroup_now(i, state);
6414b7c9 925 if (r < 0)
efdb0237 926 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
0a1eb06d 927
4ad49000
LP
928 n++;
929 }
ecedd90f 930
4ad49000 931 return n;
8e274523
LP
932}
933
4ad49000
LP
934static void unit_queue_siblings(Unit *u) {
935 Unit *slice;
ca949c9d 936
4ad49000
LP
937 /* This adds the siblings of the specified unit and the
938 * siblings of all parent units to the cgroup queue. (But
939 * neither the specified unit itself nor the parents.) */
940
941 while ((slice = UNIT_DEREF(u->slice))) {
942 Iterator i;
943 Unit *m;
8f53a7b8 944
4ad49000
LP
945 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
946 if (m == u)
947 continue;
8e274523 948
6414b7c9
DS
949 /* Skip units that have a dependency on the slice
950 * but aren't actually in it. */
4ad49000 951 if (UNIT_DEREF(m->slice) != slice)
50159e6a 952 continue;
8e274523 953
6414b7c9
DS
954 /* No point in doing cgroup application for units
955 * without active processes. */
956 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
957 continue;
958
959 /* If the unit doesn't need any new controllers
960 * and has current ones realized, it doesn't need
961 * any changes. */
962 if (unit_has_mask_realized(m, unit_get_target_mask(m)))
963 continue;
964
4ad49000 965 unit_add_to_cgroup_queue(m);
50159e6a
LP
966 }
967
4ad49000 968 u = slice;
8e274523 969 }
4ad49000
LP
970}
971
0a1eb06d 972int unit_realize_cgroup(Unit *u) {
4ad49000
LP
973 assert(u);
974
35b7ff80 975 if (!UNIT_HAS_CGROUP_CONTEXT(u))
0a1eb06d 976 return 0;
8e274523 977
4ad49000
LP
978 /* So, here's the deal: when realizing the cgroups for this
979 * unit, we need to first create all parents, but there's more
980 * actually: for the weight-based controllers we also need to
981 * make sure that all our siblings (i.e. units that are in the
73e231ab 982 * same slice as we are) have cgroups, too. Otherwise, things
4ad49000
LP
983 * would become very uneven as each of their processes would
984 * get as much resources as all our group together. This call
985 * will synchronously create the parent cgroups, but will
986 * defer work on the siblings to the next event loop
987 * iteration. */
ca949c9d 988
4ad49000
LP
989 /* Add all sibling slices to the cgroup queue. */
990 unit_queue_siblings(u);
991
6414b7c9 992 /* And realize this one now (and apply the values) */
db785129 993 return unit_realize_cgroup_now(u, manager_state(u->manager));
8e274523
LP
994}
995
efdb0237
LP
996void unit_release_cgroup(Unit *u) {
997 assert(u);
998
999 /* Forgets all cgroup details for this cgroup */
1000
1001 if (u->cgroup_path) {
1002 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1003 u->cgroup_path = mfree(u->cgroup_path);
1004 }
1005
1006 if (u->cgroup_inotify_wd >= 0) {
1007 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1008 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1009
1010 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1011 u->cgroup_inotify_wd = -1;
1012 }
1013}
1014
1015void unit_prune_cgroup(Unit *u) {
8e274523 1016 int r;
efdb0237 1017 bool is_root_slice;
8e274523 1018
4ad49000 1019 assert(u);
8e274523 1020
efdb0237
LP
1021 /* Removes the cgroup, if empty and possible, and stops watching it. */
1022
4ad49000
LP
1023 if (!u->cgroup_path)
1024 return;
8e274523 1025
efdb0237
LP
1026 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1027
1028 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
dab5bf85 1029 if (r < 0) {
efdb0237 1030 log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
dab5bf85
RL
1031 return;
1032 }
8e274523 1033
efdb0237
LP
1034 if (is_root_slice)
1035 return;
1036
1037 unit_release_cgroup(u);
0a1eb06d 1038
4ad49000 1039 u->cgroup_realized = false;
bc432dc7 1040 u->cgroup_realized_mask = 0;
8e274523
LP
1041}
1042
efdb0237 1043int unit_search_main_pid(Unit *u, pid_t *ret) {
4ad49000
LP
1044 _cleanup_fclose_ FILE *f = NULL;
1045 pid_t pid = 0, npid, mypid;
efdb0237 1046 int r;
4ad49000
LP
1047
1048 assert(u);
efdb0237 1049 assert(ret);
4ad49000
LP
1050
1051 if (!u->cgroup_path)
efdb0237 1052 return -ENXIO;
4ad49000 1053
efdb0237
LP
1054 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1055 if (r < 0)
1056 return r;
4ad49000
LP
1057
1058 mypid = getpid();
1059 while (cg_read_pid(f, &npid) > 0) {
1060 pid_t ppid;
1061
1062 if (npid == pid)
1063 continue;
8e274523 1064
4ad49000
LP
1065 /* Ignore processes that aren't our kids */
1066 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
1067 continue;
8e274523 1068
efdb0237 1069 if (pid != 0)
4ad49000
LP
1070 /* Dang, there's more than one daemonized PID
1071 in this group, so we don't know what process
1072 is the main process. */
efdb0237
LP
1073
1074 return -ENODATA;
8e274523 1075
4ad49000 1076 pid = npid;
8e274523
LP
1077 }
1078
efdb0237
LP
1079 *ret = pid;
1080 return 0;
1081}
1082
1083static int unit_watch_pids_in_path(Unit *u, const char *path) {
b3c5bad3 1084 _cleanup_closedir_ DIR *d = NULL;
efdb0237
LP
1085 _cleanup_fclose_ FILE *f = NULL;
1086 int ret = 0, r;
1087
1088 assert(u);
1089 assert(path);
1090
1091 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1092 if (r < 0)
1093 ret = r;
1094 else {
1095 pid_t pid;
1096
1097 while ((r = cg_read_pid(f, &pid)) > 0) {
1098 r = unit_watch_pid(u, pid);
1099 if (r < 0 && ret >= 0)
1100 ret = r;
1101 }
1102
1103 if (r < 0 && ret >= 0)
1104 ret = r;
1105 }
1106
1107 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1108 if (r < 0) {
1109 if (ret >= 0)
1110 ret = r;
1111 } else {
1112 char *fn;
1113
1114 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1115 _cleanup_free_ char *p = NULL;
1116
1117 p = strjoin(path, "/", fn, NULL);
1118 free(fn);
1119
1120 if (!p)
1121 return -ENOMEM;
1122
1123 r = unit_watch_pids_in_path(u, p);
1124 if (r < 0 && ret >= 0)
1125 ret = r;
1126 }
1127
1128 if (r < 0 && ret >= 0)
1129 ret = r;
1130 }
1131
1132 return ret;
1133}
1134
1135int unit_watch_all_pids(Unit *u) {
1136 assert(u);
1137
1138 /* Adds all PIDs from our cgroup to the set of PIDs we
1139 * watch. This is a fallback logic for cases where we do not
1140 * get reliable cgroup empty notifications: we try to use
1141 * SIGCHLD as replacement. */
1142
1143 if (!u->cgroup_path)
1144 return -ENOENT;
1145
1146 if (cg_unified() > 0) /* On unified we can use proper notifications */
1147 return 0;
1148
1149 return unit_watch_pids_in_path(u, u->cgroup_path);
1150}
1151
1152int unit_notify_cgroup_empty(Unit *u) {
1153 int r;
1154
1155 assert(u);
1156
1157 if (!u->cgroup_path)
1158 return 0;
1159
1160 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1161 if (r <= 0)
1162 return r;
1163
1164 unit_add_to_gc_queue(u);
1165
1166 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1167 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1168
1169 return 0;
1170}
1171
1172static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1173 Manager *m = userdata;
1174
1175 assert(s);
1176 assert(fd >= 0);
1177 assert(m);
1178
1179 for (;;) {
1180 union inotify_event_buffer buffer;
1181 struct inotify_event *e;
1182 ssize_t l;
1183
1184 l = read(fd, &buffer, sizeof(buffer));
1185 if (l < 0) {
1186 if (errno == EINTR || errno == EAGAIN)
1187 return 0;
1188
1189 return log_error_errno(errno, "Failed to read control group inotify events: %m");
1190 }
1191
1192 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1193 Unit *u;
1194
1195 if (e->wd < 0)
1196 /* Queue overflow has no watch descriptor */
1197 continue;
1198
1199 if (e->mask & IN_IGNORED)
1200 /* The watch was just removed */
1201 continue;
1202
1203 u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1204 if (!u) /* Not that inotify might deliver
1205 * events for a watch even after it
1206 * was removed, because it was queued
1207 * before the removal. Let's ignore
1208 * this here safely. */
1209 continue;
1210
1211 (void) unit_notify_cgroup_empty(u);
1212 }
1213 }
8e274523
LP
1214}
1215
8e274523 1216int manager_setup_cgroup(Manager *m) {
9444b1f2 1217 _cleanup_free_ char *path = NULL;
efdb0237
LP
1218 CGroupController c;
1219 int r, unified;
1220 char *e;
8e274523
LP
1221
1222 assert(m);
1223
35d2e7ec 1224 /* 1. Determine hierarchy */
efdb0237 1225 m->cgroup_root = mfree(m->cgroup_root);
9444b1f2 1226 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
23bbb0de
MS
1227 if (r < 0)
1228 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
8e274523 1229
efdb0237
LP
1230 /* Chop off the init scope, if we are already located in it */
1231 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
0d8c31ff 1232
efdb0237
LP
1233 /* LEGACY: Also chop off the system slice if we are in
1234 * it. This is to support live upgrades from older systemd
1235 * versions where PID 1 was moved there. Also see
1236 * cg_get_root_path(). */
1237 if (!e && m->running_as == MANAGER_SYSTEM) {
9444b1f2 1238 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
15c60e99 1239 if (!e)
efdb0237 1240 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
0baf24dd 1241 }
efdb0237
LP
1242 if (e)
1243 *e = 0;
7ccfb64a 1244
9444b1f2
LP
1245 /* And make sure to store away the root value without trailing
1246 * slash, even for the root dir, so that we can easily prepend
1247 * it everywhere. */
efdb0237
LP
1248 while ((e = endswith(m->cgroup_root, "/")))
1249 *e = 0;
8e274523 1250
35d2e7ec 1251 /* 2. Show data */
9444b1f2 1252 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
23bbb0de
MS
1253 if (r < 0)
1254 return log_error_errno(r, "Cannot find cgroup mount point: %m");
8e274523 1255
efdb0237
LP
1256 unified = cg_unified();
1257 if (unified < 0)
1258 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
1259 if (unified > 0)
1260 log_debug("Unified cgroup hierarchy is located at %s.", path);
1261 else
1262 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
1263
0d8c31ff 1264 if (!m->test_run) {
efdb0237 1265 const char *scope_path;
c6c18be3 1266
0d8c31ff 1267 /* 3. Install agent */
efdb0237
LP
1268 if (unified) {
1269
1270 /* In the unified hierarchy we can can get
1271 * cgroup empty notifications via inotify. */
1272
1273 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1274 safe_close(m->cgroup_inotify_fd);
1275
1276 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
1277 if (m->cgroup_inotify_fd < 0)
1278 return log_error_errno(errno, "Failed to create control group inotify object: %m");
1279
1280 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
1281 if (r < 0)
1282 return log_error_errno(r, "Failed to watch control group inotify object: %m");
1283
1284 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
1285 if (r < 0)
1286 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
1287
1288 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
1289
1290 } else if (m->running_as == MANAGER_SYSTEM) {
1291
1292 /* On the legacy hierarchy we only get
1293 * notifications via cgroup agents. (Which
1294 * isn't really reliable, since it does not
1295 * generate events when control groups with
1296 * children run empty. */
1297
0d8c31ff
ZJS
1298 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
1299 if (r < 0)
da927ba9 1300 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
0d8c31ff
ZJS
1301 else if (r > 0)
1302 log_debug("Installed release agent.");
efdb0237 1303 else if (r == 0)
0d8c31ff
ZJS
1304 log_debug("Release agent already installed.");
1305 }
8e274523 1306
efdb0237
LP
1307 /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
1308 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1309 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
23bbb0de 1310 if (r < 0)
efdb0237
LP
1311 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
1312
1313 /* also, move all other userspace processes remaining
1314 * in the root cgroup into that scope. */
1315 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
1316 if (r < 0)
1317 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
c6c18be3 1318
0d8c31ff
ZJS
1319 /* 5. And pin it, so that it cannot be unmounted */
1320 safe_close(m->pin_cgroupfs_fd);
0d8c31ff 1321 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1322 if (m->pin_cgroupfs_fd < 0)
1323 return log_error_errno(errno, "Failed to open pin file: %m");
0d8c31ff 1324
cc98b302 1325 /* 6. Always enable hierarchical support if it exists... */
efdb0237
LP
1326 if (!unified)
1327 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
c6c18be3
LP
1328 }
1329
0d8c31ff 1330 /* 7. Figure out which controllers are supported */
efdb0237
LP
1331 r = cg_mask_supported(&m->cgroup_supported);
1332 if (r < 0)
1333 return log_error_errno(r, "Failed to determine supported controllers: %m");
1334
1335 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
1336 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & c));
9156e799 1337
a32360f1 1338 return 0;
8e274523
LP
1339}
1340
c6c18be3 1341void manager_shutdown_cgroup(Manager *m, bool delete) {
8e274523
LP
1342 assert(m);
1343
9444b1f2
LP
1344 /* We can't really delete the group, since we are in it. But
1345 * let's trim it. */
1346 if (delete && m->cgroup_root)
efdb0237
LP
1347 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
1348
1349 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
1350
1351 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1352 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
8e274523 1353
03e334a1 1354 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
c6c18be3 1355
efdb0237 1356 m->cgroup_root = mfree(m->cgroup_root);
8e274523
LP
1357}
1358
4ad49000 1359Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
acb14d31 1360 char *p;
4ad49000 1361 Unit *u;
acb14d31
LP
1362
1363 assert(m);
1364 assert(cgroup);
acb14d31 1365
4ad49000
LP
1366 u = hashmap_get(m->cgroup_unit, cgroup);
1367 if (u)
1368 return u;
acb14d31 1369
8e70580b 1370 p = strdupa(cgroup);
acb14d31
LP
1371 for (;;) {
1372 char *e;
1373
1374 e = strrchr(p, '/');
efdb0237
LP
1375 if (!e || e == p)
1376 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
acb14d31
LP
1377
1378 *e = 0;
1379
4ad49000
LP
1380 u = hashmap_get(m->cgroup_unit, p);
1381 if (u)
1382 return u;
acb14d31
LP
1383 }
1384}
1385
b3ac818b 1386Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
4ad49000 1387 _cleanup_free_ char *cgroup = NULL;
acb14d31 1388 int r;
8e274523 1389
8c47c732
LP
1390 assert(m);
1391
b3ac818b
LP
1392 if (pid <= 0)
1393 return NULL;
1394
1395 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1396 if (r < 0)
1397 return NULL;
1398
1399 return manager_get_unit_by_cgroup(m, cgroup);
1400}
1401
1402Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1403 Unit *u;
1404
1405 assert(m);
1406
efdb0237 1407 if (pid <= 0)
8c47c732
LP
1408 return NULL;
1409
efdb0237
LP
1410 if (pid == 1)
1411 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
1412
fea72cc0 1413 u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
5fe8876b
LP
1414 if (u)
1415 return u;
1416
fea72cc0 1417 u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
5fe8876b
LP
1418 if (u)
1419 return u;
1420
b3ac818b 1421 return manager_get_unit_by_pid_cgroup(m, pid);
6dde1f33 1422}
4fbf50b3 1423
4ad49000
LP
1424int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1425 Unit *u;
4fbf50b3 1426
4ad49000
LP
1427 assert(m);
1428 assert(cgroup);
4fbf50b3 1429
4ad49000 1430 u = manager_get_unit_by_cgroup(m, cgroup);
5ad096b3
LP
1431 if (!u)
1432 return 0;
b56c28c3 1433
efdb0237 1434 return unit_notify_cgroup_empty(u);
5ad096b3
LP
1435}
1436
1437int unit_get_memory_current(Unit *u, uint64_t *ret) {
1438 _cleanup_free_ char *v = NULL;
1439 int r;
1440
1441 assert(u);
1442 assert(ret);
1443
1444 if (!u->cgroup_path)
1445 return -ENODATA;
1446
efdb0237 1447 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
5ad096b3
LP
1448 return -ENODATA;
1449
efdb0237
LP
1450 if (cg_unified() <= 0)
1451 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
1452 else
1453 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
5ad096b3
LP
1454 if (r == -ENOENT)
1455 return -ENODATA;
1456 if (r < 0)
1457 return r;
1458
1459 return safe_atou64(v, ret);
1460}
1461
1462static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
1463 _cleanup_free_ char *v = NULL;
1464 uint64_t ns;
1465 int r;
1466
1467 assert(u);
1468 assert(ret);
1469
1470 if (!u->cgroup_path)
1471 return -ENODATA;
1472
efdb0237 1473 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
5ad096b3
LP
1474 return -ENODATA;
1475
1476 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
1477 if (r == -ENOENT)
1478 return -ENODATA;
1479 if (r < 0)
1480 return r;
1481
1482 r = safe_atou64(v, &ns);
1483 if (r < 0)
1484 return r;
1485
1486 *ret = ns;
1487 return 0;
1488}
1489
1490int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
1491 nsec_t ns;
1492 int r;
1493
1494 r = unit_get_cpu_usage_raw(u, &ns);
1495 if (r < 0)
1496 return r;
1497
1498 if (ns > u->cpuacct_usage_base)
1499 ns -= u->cpuacct_usage_base;
1500 else
1501 ns = 0;
1502
1503 *ret = ns;
1504 return 0;
1505}
1506
1507int unit_reset_cpu_usage(Unit *u) {
1508 nsec_t ns;
1509 int r;
1510
1511 assert(u);
1512
1513 r = unit_get_cpu_usage_raw(u, &ns);
1514 if (r < 0) {
1515 u->cpuacct_usage_base = 0;
1516 return r;
b56c28c3 1517 }
2633eb83 1518
5ad096b3 1519 u->cpuacct_usage_base = ns;
4ad49000 1520 return 0;
4fbf50b3
LP
1521}
1522
e9db43d5
LP
1523bool unit_cgroup_delegate(Unit *u) {
1524 CGroupContext *c;
1525
1526 assert(u);
1527
1528 c = unit_get_cgroup_context(u);
1529 if (!c)
1530 return false;
1531
1532 return c->delegate;
1533}
1534
4ad49000
LP
1535static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1536 [CGROUP_AUTO] = "auto",
1537 [CGROUP_CLOSED] = "closed",
1538 [CGROUP_STRICT] = "strict",
1539};
4fbf50b3 1540
4ad49000 1541DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);