]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/cgroup.c
shared: add formats-util.h
[thirdparty/systemd.git] / src / core / cgroup.c
CommitLineData
d6c9574f 1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
8e274523
LP
2
3/***
4 This file is part of systemd.
5
4ad49000 6 Copyright 2013 Lennart Poettering
8e274523
LP
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
8e274523
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
8e274523 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
8e274523
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
c6c18be3 22#include <fcntl.h>
e41969e3 23#include <fnmatch.h>
8c6db833 24
9eb977db 25#include "path-util.h"
9444b1f2 26#include "special.h"
4ad49000
LP
27#include "cgroup-util.h"
28#include "cgroup.h"
8e274523 29
9a054909
LP
/* Value written to cpu.cfs_period_us, and the period the per-second CPU quota
 * is scaled to before being written to cpu.cfs_quota_us: 100 msec, in usec. */
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
31
4ad49000
LP
32void cgroup_context_init(CGroupContext *c) {
33 assert(c);
34
35 /* Initialize everything to the kernel defaults, assuming the
36 * structure is preinitialized to 0 */
37
db785129
LP
38 c->cpu_shares = (unsigned long) -1;
39 c->startup_cpu_shares = (unsigned long) -1;
ddca82ac 40 c->memory_limit = (uint64_t) -1;
db785129
LP
41 c->blockio_weight = (unsigned long) -1;
42 c->startup_blockio_weight = (unsigned long) -1;
b2f8b02e 43
3a43da28 44 c->cpu_quota_per_sec_usec = USEC_INFINITY;
4ad49000 45}
8e274523 46
4ad49000
LP
47void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
48 assert(c);
49 assert(a);
50
71fda00f 51 LIST_REMOVE(device_allow, c->device_allow, a);
4ad49000
LP
52 free(a->path);
53 free(a);
54}
55
56void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
57 assert(c);
58 assert(w);
59
71fda00f 60 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
4ad49000
LP
61 free(w->path);
62 free(w);
63}
64
65void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
66 assert(c);
8e274523 67 assert(b);
8e274523 68
71fda00f 69 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
4ad49000
LP
70 free(b->path);
71 free(b);
72}
73
74void cgroup_context_done(CGroupContext *c) {
75 assert(c);
76
77 while (c->blockio_device_weights)
78 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
79
80 while (c->blockio_device_bandwidths)
81 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
82
83 while (c->device_allow)
84 cgroup_context_free_device_allow(c, c->device_allow);
85}
86
87void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
88 CGroupBlockIODeviceBandwidth *b;
89 CGroupBlockIODeviceWeight *w;
90 CGroupDeviceAllow *a;
9a054909 91 char u[FORMAT_TIMESPAN_MAX];
4ad49000
LP
92
93 assert(c);
94 assert(f);
95
96 prefix = strempty(prefix);
97
98 fprintf(f,
99 "%sCPUAccounting=%s\n"
100 "%sBlockIOAccounting=%s\n"
101 "%sMemoryAccounting=%s\n"
102 "%sCPUShares=%lu\n"
95ae05c0 103 "%sStartupCPUShares=%lu\n"
b2f8b02e 104 "%sCPUQuotaPerSecSec=%s\n"
112a7f46 105 "%sBlockIOWeight=%lu\n"
95ae05c0 106 "%sStartupBlockIOWeight=%lu\n"
4ad49000 107 "%sMemoryLimit=%" PRIu64 "\n"
a931ad47
LP
108 "%sDevicePolicy=%s\n"
109 "%sDelegate=%s\n",
4ad49000
LP
110 prefix, yes_no(c->cpu_accounting),
111 prefix, yes_no(c->blockio_accounting),
112 prefix, yes_no(c->memory_accounting),
113 prefix, c->cpu_shares,
95ae05c0 114 prefix, c->startup_cpu_shares,
b1d6dcf5 115 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
4ad49000 116 prefix, c->blockio_weight,
95ae05c0 117 prefix, c->startup_blockio_weight,
4ad49000 118 prefix, c->memory_limit,
a931ad47
LP
119 prefix, cgroup_device_policy_to_string(c->device_policy),
120 prefix, yes_no(c->delegate));
4ad49000
LP
121
122 LIST_FOREACH(device_allow, a, c->device_allow)
123 fprintf(f,
124 "%sDeviceAllow=%s %s%s%s\n",
125 prefix,
126 a->path,
127 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
128
129 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
130 fprintf(f,
8e7076ca 131 "%sBlockIODeviceWeight=%s %lu",
4ad49000
LP
132 prefix,
133 w->path,
134 w->weight);
135
136 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
137 char buf[FORMAT_BYTES_MAX];
138
139 fprintf(f,
140 "%s%s=%s %s\n",
141 prefix,
142 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
143 b->path,
144 format_bytes(buf, sizeof(buf), b->bandwidth));
145 }
146}
147
/* Determines the device number to use for block IO settings that refer to
 * the path p and stores it in *dev.
 *
 * If p is a block device node its own device number is used. Otherwise the
 * device number of the file system p is stored on is taken and handed to
 * block_get_whole_disk(), whose result is not checked.
 *
 * Returns 0 on success, the (negative) result of log_warning_errno() if p
 * cannot be stat()ed, and -ENODEV if p is not a block device and the major
 * number of its backing device is 0. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;

        assert(p);
        assert(dev);

        if (stat(p, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode)) {
                *dev = st.st_rdev;
                return 0;
        }

        if (major(st.st_dev) == 0) {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        /* Not a device node, hence use the block device this file is
         * stored on */
        *dev = st.st_dev;

        /* If this is a partition, try to get the originating block device */
        block_get_whole_disk(*dev, dev);

        return 0;
}
176
4ad49000
LP
177static int whitelist_device(const char *path, const char *node, const char *acc) {
178 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
179 struct stat st;
8c6db833 180 int r;
8e274523 181
4ad49000
LP
182 assert(path);
183 assert(acc);
8e274523 184
4ad49000
LP
185 if (stat(node, &st) < 0) {
186 log_warning("Couldn't stat device %s", node);
187 return -errno;
188 }
189
190 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
191 log_warning("%s is not a device.", node);
192 return -ENODEV;
193 }
194
195 sprintf(buf,
196 "%c %u:%u %s",
197 S_ISCHR(st.st_mode) ? 'c' : 'b',
198 major(st.st_rdev), minor(st.st_rdev),
199 acc);
200
201 r = cg_set_attribute("devices", path, "devices.allow", buf);
1aeab12b 202 if (r < 0)
714e2e1d
LP
203 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
204 "Failed to set devices.allow on %s: %m", path);
4ad49000
LP
205
206 return r;
8e274523
LP
207}
208
90060676
LP
209static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
210 _cleanup_fclose_ FILE *f = NULL;
211 char line[LINE_MAX];
212 bool good = false;
213 int r;
214
215 assert(path);
216 assert(acc);
217 assert(type == 'b' || type == 'c');
218
219 f = fopen("/proc/devices", "re");
4a62c710
MS
220 if (!f)
221 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
90060676
LP
222
223 FOREACH_LINE(line, f, goto fail) {
224 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
225 unsigned maj;
226
227 truncate_nl(line);
228
229 if (type == 'c' && streq(line, "Character devices:")) {
230 good = true;
231 continue;
232 }
233
234 if (type == 'b' && streq(line, "Block devices:")) {
235 good = true;
236 continue;
237 }
238
239 if (isempty(line)) {
240 good = false;
241 continue;
242 }
243
244 if (!good)
245 continue;
246
247 p = strstrip(line);
248
249 w = strpbrk(p, WHITESPACE);
250 if (!w)
251 continue;
252 *w = 0;
253
254 r = safe_atou(p, &maj);
255 if (r < 0)
256 continue;
257 if (maj <= 0)
258 continue;
259
260 w++;
261 w += strspn(w, WHITESPACE);
e41969e3
LP
262
263 if (fnmatch(name, w, 0) != 0)
90060676
LP
264 continue;
265
266 sprintf(buf,
267 "%c %u:* %s",
268 type,
269 maj,
270 acc);
271
272 r = cg_set_attribute("devices", path, "devices.allow", buf);
1aeab12b 273 if (r < 0)
714e2e1d
LP
274 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
275 "Failed to set devices.allow on %s: %m", path);
90060676
LP
276 }
277
278 return 0;
279
280fail:
56f64d95 281 log_warning_errno(errno, "Failed to read /proc/devices: %m");
90060676
LP
282 return -errno;
283}
284
db785129 285void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
01efdf13 286 bool is_root;
4ad49000
LP
287 int r;
288
289 assert(c);
290 assert(path);
8e274523 291
4ad49000
LP
292 if (mask == 0)
293 return;
8e274523 294
71c26873 295 /* Some cgroup attributes are not supported on the root cgroup,
01efdf13
LP
296 * hence silently ignore */
297 is_root = isempty(path) || path_equal(path, "/");
6da13913
ZJS
298 if (is_root)
299 /* Make sure we don't try to display messages with an empty path. */
300 path = "/";
01efdf13 301
714e2e1d
LP
302 /* We generally ignore errors caused by read-only mounted
303 * cgroup trees (assuming we are running in a container then),
304 * and missing cgroups, i.e. EROFS and ENOENT. */
305
01efdf13 306 if ((mask & CGROUP_CPU) && !is_root) {
b2f8b02e 307 char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
8e274523 308
db785129 309 sprintf(buf, "%lu\n",
d81afec1 310 IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
db785129 311 c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
4ad49000 312 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
1aeab12b 313 if (r < 0)
714e2e1d
LP
314 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
315 "Failed to set cpu.shares on %s: %m", path);
b2f8b02e 316
9a054909 317 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
b2f8b02e 318 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
1aeab12b 319 if (r < 0)
714e2e1d
LP
320 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
321 "Failed to set cpu.cfs_period_us on %s: %m", path);
b2f8b02e 322
3a43da28 323 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
9a054909 324 sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
b2f8b02e
LP
325 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
326 } else
327 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
1aeab12b 328 if (r < 0)
714e2e1d
LP
329 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
330 "Failed to set cpu.cfs_quota_us on %s: %m", path);
4ad49000
LP
331 }
332
333 if (mask & CGROUP_BLKIO) {
334 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
335 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
336 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
337 CGroupBlockIODeviceWeight *w;
338 CGroupBlockIODeviceBandwidth *b;
339
01efdf13 340 if (!is_root) {
d81afec1 341 sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
db785129 342 c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
01efdf13 343 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
1aeab12b 344 if (r < 0)
714e2e1d
LP
345 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
346 "Failed to set blkio.weight on %s: %m", path);
4ad49000 347
01efdf13
LP
348 /* FIXME: no way to reset this list */
349 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
350 dev_t dev;
4ad49000 351
01efdf13
LP
352 r = lookup_blkio_device(w->path, &dev);
353 if (r < 0)
354 continue;
8e274523 355
01efdf13
LP
356 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
357 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
1aeab12b 358 if (r < 0)
714e2e1d
LP
359 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
360 "Failed to set blkio.weight_device on %s: %m", path);
01efdf13 361 }
4ad49000
LP
362 }
363
364 /* FIXME: no way to reset this list */
365 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
366 const char *a;
367 dev_t dev;
368
369 r = lookup_blkio_device(b->path, &dev);
370 if (r < 0)
371 continue;
372
373 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
374
375 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
376 r = cg_set_attribute("blkio", path, a, buf);
1aeab12b 377 if (r < 0)
714e2e1d
LP
378 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
379 "Failed to set %s on %s: %m", a, path);
d686d8a9 380 }
8e274523
LP
381 }
382
a3bd89ea 383 if ((mask & CGROUP_MEMORY) && !is_root) {
6a94f2e9 384 if (c->memory_limit != (uint64_t) -1) {
e58cec11
LP
385 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
386
6a94f2e9
G
387 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
388 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
389 } else
390 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
8e274523 391
1aeab12b 392 if (r < 0)
714e2e1d
LP
393 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
394 "Failed to set memory.limit_in_bytes on %s: %m", path);
4ad49000 395 }
8e274523 396
01efdf13 397 if ((mask & CGROUP_DEVICE) && !is_root) {
4ad49000 398 CGroupDeviceAllow *a;
8e274523 399
714e2e1d
LP
400 /* Changing the devices list of a populated cgroup
401 * might result in EINVAL, hence ignore EINVAL
402 * here. */
403
4ad49000
LP
404 if (c->device_allow || c->device_policy != CGROUP_AUTO)
405 r = cg_set_attribute("devices", path, "devices.deny", "a");
406 else
407 r = cg_set_attribute("devices", path, "devices.allow", "a");
1aeab12b 408 if (r < 0)
714e2e1d
LP
409 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
410 "Failed to reset devices.list on %s: %m", path);
fb385181 411
4ad49000
LP
412 if (c->device_policy == CGROUP_CLOSED ||
413 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
414 static const char auto_devices[] =
7d711efb
LP
415 "/dev/null\0" "rwm\0"
416 "/dev/zero\0" "rwm\0"
417 "/dev/full\0" "rwm\0"
418 "/dev/random\0" "rwm\0"
419 "/dev/urandom\0" "rwm\0"
420 "/dev/tty\0" "rwm\0"
421 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
4ad49000
LP
422
423 const char *x, *y;
424
425 NULSTR_FOREACH_PAIR(x, y, auto_devices)
426 whitelist_device(path, x, y);
7d711efb
LP
427
428 whitelist_major(path, "pts", 'c', "rw");
429 whitelist_major(path, "kdbus", 'c', "rw");
430 whitelist_major(path, "kdbus/*", 'c', "rw");
4ad49000
LP
431 }
432
433 LIST_FOREACH(device_allow, a, c->device_allow) {
434 char acc[4];
435 unsigned k = 0;
436
437 if (a->r)
438 acc[k++] = 'r';
439 if (a->w)
440 acc[k++] = 'w';
441 if (a->m)
442 acc[k++] = 'm';
fb385181 443
4ad49000
LP
444 if (k == 0)
445 continue;
fb385181 446
4ad49000 447 acc[k++] = 0;
90060676
LP
448
449 if (startswith(a->path, "/dev/"))
450 whitelist_device(path, a->path, acc);
451 else if (startswith(a->path, "block-"))
452 whitelist_major(path, a->path + 6, 'b', acc);
453 else if (startswith(a->path, "char-"))
454 whitelist_major(path, a->path + 5, 'c', acc);
455 else
456 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
4ad49000
LP
457 }
458 }
fb385181
LP
459}
460
db785129 461CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
4ad49000 462 CGroupControllerMask mask = 0;
8e274523 463
4ad49000 464 /* Figure out which controllers we need */
8e274523 465
b2f8b02e 466 if (c->cpu_accounting ||
db785129
LP
467 c->cpu_shares != (unsigned long) -1 ||
468 c->startup_cpu_shares != (unsigned long) -1 ||
3a43da28 469 c->cpu_quota_per_sec_usec != USEC_INFINITY)
4ad49000 470 mask |= CGROUP_CPUACCT | CGROUP_CPU;
ecedd90f 471
4ad49000 472 if (c->blockio_accounting ||
db785129
LP
473 c->blockio_weight != (unsigned long) -1 ||
474 c->startup_blockio_weight != (unsigned long) -1 ||
4ad49000 475 c->blockio_device_weights ||
db785129 476 c->blockio_device_bandwidths)
4ad49000 477 mask |= CGROUP_BLKIO;
ecedd90f 478
4ad49000 479 if (c->memory_accounting ||
ddca82ac 480 c->memory_limit != (uint64_t) -1)
4ad49000 481 mask |= CGROUP_MEMORY;
8e274523 482
a931ad47
LP
483 if (c->device_allow ||
484 c->device_policy != CGROUP_AUTO)
4ad49000
LP
485 mask |= CGROUP_DEVICE;
486
487 return mask;
8e274523
LP
488}
489
bc432dc7 490CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
4ad49000 491 CGroupContext *c;
8e274523 492
4ad49000
LP
493 c = unit_get_cgroup_context(u);
494 if (!c)
495 return 0;
8e274523 496
a931ad47
LP
497 /* If delegation is turned on, then turn on all cgroups,
498 * unless the process we fork into it is known to drop
499 * privileges anyway, and shouldn't get access to the
500 * controllers anyway. */
501
502 if (c->delegate) {
503 ExecContext *e;
504
505 e = unit_get_exec_context(u);
506 if (!e || exec_context_maintains_privileges(e))
507 return _CGROUP_CONTROLLER_MASK_ALL;
508 }
509
db785129 510 return cgroup_context_get_mask(c);
8e274523
LP
511}
512
bc432dc7 513CGroupControllerMask unit_get_members_mask(Unit *u) {
4ad49000 514 assert(u);
bc432dc7
LP
515
516 if (u->cgroup_members_mask_valid)
517 return u->cgroup_members_mask;
518
519 u->cgroup_members_mask = 0;
520
521 if (u->type == UNIT_SLICE) {
522 Unit *member;
523 Iterator i;
524
525 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
526
527 if (member == u)
528 continue;
529
d4fdc205 530 if (UNIT_DEREF(member->slice) != u)
bc432dc7
LP
531 continue;
532
533 u->cgroup_members_mask |=
534 unit_get_cgroup_mask(member) |
535 unit_get_members_mask(member);
536 }
537 }
538
539 u->cgroup_members_mask_valid = true;
6414b7c9 540 return u->cgroup_members_mask;
246aa6dd
LP
541}
542
bc432dc7 543CGroupControllerMask unit_get_siblings_mask(Unit *u) {
4ad49000 544 assert(u);
246aa6dd 545
bc432dc7 546 if (UNIT_ISSET(u->slice))
637f421e 547 return unit_get_members_mask(UNIT_DEREF(u->slice));
4ad49000 548
637f421e 549 return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
246aa6dd
LP
550}
551
bc432dc7 552CGroupControllerMask unit_get_target_mask(Unit *u) {
6414b7c9
DS
553 CGroupControllerMask mask;
554
555 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
556 mask &= u->manager->cgroup_supported;
557
558 return mask;
559}
560
561/* Recurse from a unit up through its containing slices, propagating
562 * mask bits upward. A unit is also member of itself. */
bc432dc7
LP
563void unit_update_cgroup_members_masks(Unit *u) {
564 CGroupControllerMask m;
565 bool more;
566
567 assert(u);
568
569 /* Calculate subtree mask */
570 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
571
572 /* See if anything changed from the previous invocation. If
573 * not, we're done. */
574 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
575 return;
576
577 more =
578 u->cgroup_subtree_mask_valid &&
579 ((m & ~u->cgroup_subtree_mask) != 0) &&
580 ((~m & u->cgroup_subtree_mask) == 0);
581
582 u->cgroup_subtree_mask = m;
583 u->cgroup_subtree_mask_valid = true;
584
6414b7c9
DS
585 if (UNIT_ISSET(u->slice)) {
586 Unit *s = UNIT_DEREF(u->slice);
bc432dc7
LP
587
588 if (more)
589 /* There's more set now than before. We
590 * propagate the new mask to the parent's mask
591 * (not caring if it actually was valid or
592 * not). */
593
594 s->cgroup_members_mask |= m;
595
596 else
597 /* There's less set now than before (or we
598 * don't know), we need to recalculate
599 * everything, so let's invalidate the
600 * parent's members mask */
601
602 s->cgroup_members_mask_valid = false;
603
604 /* And now make sure that this change also hits our
605 * grandparents */
606 unit_update_cgroup_members_masks(s);
6414b7c9
DS
607 }
608}
609
03b90d4b
LP
610static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
611 Unit *u = userdata;
612
613 assert(mask != 0);
614 assert(u);
615
616 while (u) {
617 if (u->cgroup_path &&
618 u->cgroup_realized &&
619 (u->cgroup_realized_mask & mask) == mask)
620 return u->cgroup_path;
621
622 u = UNIT_DEREF(u->slice);
623 }
624
625 return NULL;
626}
627
4ad49000 628static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
0cd385d3 629 CGroupContext *c;
bc432dc7 630 int r;
64747e2d 631
4ad49000 632 assert(u);
64747e2d 633
0cd385d3
LP
634 c = unit_get_cgroup_context(u);
635 if (!c)
636 return 0;
637
7b3fd631
LP
638 if (!u->cgroup_path) {
639 _cleanup_free_ char *path = NULL;
64747e2d 640
7b3fd631
LP
641 path = unit_default_cgroup_path(u);
642 if (!path)
643 return log_oom();
644
645 r = hashmap_put(u->manager->cgroup_unit, path, u);
646 if (r < 0) {
647 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
648 return r;
649 }
650 if (r > 0) {
651 u->cgroup_path = path;
652 path = NULL;
653 }
b58b8e11
HH
654 }
655
03b90d4b
LP
656 /* First, create our own group */
657 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
23bbb0de
MS
658 if (r < 0)
659 return log_error_errno(r, "Failed to create cgroup %s: %m", u->cgroup_path);
03b90d4b
LP
660
661 /* Keep track that this is now realized */
4ad49000 662 u->cgroup_realized = true;
bc432dc7 663 u->cgroup_realized_mask = mask;
4ad49000 664
0cd385d3
LP
665 if (u->type != UNIT_SLICE && !c->delegate) {
666
667 /* Then, possibly move things over, but not if
668 * subgroups may contain processes, which is the case
669 * for slice and delegation units. */
670 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
671 if (r < 0)
672 log_warning_errno(r, "Failed to migrate cgroup from to %s: %m", u->cgroup_path);
673 }
03b90d4b 674
64747e2d
LP
675 return 0;
676}
677
7b3fd631
LP
678int unit_attach_pids_to_cgroup(Unit *u) {
679 int r;
680 assert(u);
681
682 r = unit_realize_cgroup(u);
683 if (r < 0)
684 return r;
685
686 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
687 if (r < 0)
688 return r;
689
690 return 0;
691}
692
6414b7c9 693static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
bc432dc7
LP
694 assert(u);
695
696 return u->cgroup_realized && u->cgroup_realized_mask == mask;
6414b7c9
DS
697}
698
699/* Check if necessary controllers and attributes for a unit are in place.
700 *
701 * If so, do nothing.
702 * If not, create paths, move processes over, and set attributes.
703 *
704 * Returns 0 on success and < 0 on failure. */
db785129 705static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
4ad49000 706 CGroupControllerMask mask;
6414b7c9 707 int r;
64747e2d 708
4ad49000 709 assert(u);
64747e2d 710
4ad49000 711 if (u->in_cgroup_queue) {
71fda00f 712 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
4ad49000
LP
713 u->in_cgroup_queue = false;
714 }
64747e2d 715
6414b7c9 716 mask = unit_get_target_mask(u);
64747e2d 717
6414b7c9 718 if (unit_has_mask_realized(u, mask))
0a1eb06d 719 return 0;
64747e2d 720
4ad49000 721 /* First, realize parents */
6414b7c9 722 if (UNIT_ISSET(u->slice)) {
db785129 723 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
6414b7c9
DS
724 if (r < 0)
725 return r;
726 }
4ad49000
LP
727
728 /* And then do the real work */
6414b7c9
DS
729 r = unit_create_cgroups(u, mask);
730 if (r < 0)
731 return r;
732
733 /* Finally, apply the necessary attributes. */
db785129 734 cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);
6414b7c9
DS
735
736 return 0;
64747e2d
LP
737}
738
4ad49000 739static void unit_add_to_cgroup_queue(Unit *u) {
ecedd90f 740
4ad49000
LP
741 if (u->in_cgroup_queue)
742 return;
8e274523 743
71fda00f 744 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
4ad49000
LP
745 u->in_cgroup_queue = true;
746}
8c6db833 747
4ad49000 748unsigned manager_dispatch_cgroup_queue(Manager *m) {
db785129 749 ManagerState state;
4ad49000 750 unsigned n = 0;
db785129 751 Unit *i;
6414b7c9 752 int r;
ecedd90f 753
db785129
LP
754 state = manager_state(m);
755
4ad49000
LP
756 while ((i = m->cgroup_queue)) {
757 assert(i->in_cgroup_queue);
ecedd90f 758
db785129 759 r = unit_realize_cgroup_now(i, state);
6414b7c9 760 if (r < 0)
da927ba9 761 log_warning_errno(r, "Failed to realize cgroups for queued unit %s: %m", i->id);
0a1eb06d 762
4ad49000
LP
763 n++;
764 }
ecedd90f 765
4ad49000 766 return n;
8e274523
LP
767}
768
4ad49000
LP
769static void unit_queue_siblings(Unit *u) {
770 Unit *slice;
ca949c9d 771
4ad49000
LP
772 /* This adds the siblings of the specified unit and the
773 * siblings of all parent units to the cgroup queue. (But
774 * neither the specified unit itself nor the parents.) */
775
776 while ((slice = UNIT_DEREF(u->slice))) {
777 Iterator i;
778 Unit *m;
8f53a7b8 779
4ad49000
LP
780 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
781 if (m == u)
782 continue;
8e274523 783
6414b7c9
DS
784 /* Skip units that have a dependency on the slice
785 * but aren't actually in it. */
4ad49000 786 if (UNIT_DEREF(m->slice) != slice)
50159e6a 787 continue;
8e274523 788
6414b7c9
DS
789 /* No point in doing cgroup application for units
790 * without active processes. */
791 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
792 continue;
793
794 /* If the unit doesn't need any new controllers
795 * and has current ones realized, it doesn't need
796 * any changes. */
797 if (unit_has_mask_realized(m, unit_get_target_mask(m)))
798 continue;
799
4ad49000 800 unit_add_to_cgroup_queue(m);
50159e6a
LP
801 }
802
4ad49000 803 u = slice;
8e274523 804 }
4ad49000
LP
805}
806
0a1eb06d 807int unit_realize_cgroup(Unit *u) {
4ad49000
LP
808 CGroupContext *c;
809
810 assert(u);
811
812 c = unit_get_cgroup_context(u);
813 if (!c)
0a1eb06d 814 return 0;
8e274523 815
4ad49000
LP
816 /* So, here's the deal: when realizing the cgroups for this
817 * unit, we need to first create all parents, but there's more
818 * actually: for the weight-based controllers we also need to
819 * make sure that all our siblings (i.e. units that are in the
73e231ab 820 * same slice as we are) have cgroups, too. Otherwise, things
4ad49000
LP
821 * would become very uneven as each of their processes would
822 * get as much resources as all our group together. This call
823 * will synchronously create the parent cgroups, but will
824 * defer work on the siblings to the next event loop
825 * iteration. */
ca949c9d 826
4ad49000
LP
827 /* Add all sibling slices to the cgroup queue. */
828 unit_queue_siblings(u);
829
6414b7c9 830 /* And realize this one now (and apply the values) */
db785129 831 return unit_realize_cgroup_now(u, manager_state(u->manager));
8e274523
LP
832}
833
b1491eba 834void unit_destroy_cgroup_if_empty(Unit *u) {
8e274523
LP
835 int r;
836
4ad49000 837 assert(u);
8e274523 838
4ad49000
LP
839 if (!u->cgroup_path)
840 return;
8e274523 841
13b84ec7 842 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
dab5bf85 843 if (r < 0) {
da927ba9 844 log_debug_errno(r, "Failed to destroy cgroup %s: %m", u->cgroup_path);
dab5bf85
RL
845 return;
846 }
8e274523 847
0a1eb06d
LP
848 hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
849
4ad49000
LP
850 free(u->cgroup_path);
851 u->cgroup_path = NULL;
852 u->cgroup_realized = false;
bc432dc7 853 u->cgroup_realized_mask = 0;
8e274523
LP
854}
855
4ad49000
LP
856pid_t unit_search_main_pid(Unit *u) {
857 _cleanup_fclose_ FILE *f = NULL;
858 pid_t pid = 0, npid, mypid;
859
860 assert(u);
861
862 if (!u->cgroup_path)
863 return 0;
864
865 if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
866 return 0;
867
868 mypid = getpid();
869 while (cg_read_pid(f, &npid) > 0) {
870 pid_t ppid;
871
872 if (npid == pid)
873 continue;
8e274523 874
4ad49000
LP
875 /* Ignore processes that aren't our kids */
876 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
877 continue;
8e274523 878
4ad49000
LP
879 if (pid != 0) {
880 /* Dang, there's more than one daemonized PID
881 in this group, so we don't know what process
882 is the main process. */
883 pid = 0;
884 break;
885 }
8e274523 886
4ad49000 887 pid = npid;
8e274523
LP
888 }
889
4ad49000 890 return pid;
8e274523
LP
891}
892
8e274523 893int manager_setup_cgroup(Manager *m) {
9444b1f2 894 _cleanup_free_ char *path = NULL;
8e274523 895 int r;
8e274523
LP
896
897 assert(m);
898
35d2e7ec 899 /* 1. Determine hierarchy */
9444b1f2
LP
900 free(m->cgroup_root);
901 m->cgroup_root = NULL;
902
903 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
23bbb0de
MS
904 if (r < 0)
905 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
8e274523 906
15c60e99
LP
907 /* LEGACY: Already in /system.slice? If so, let's cut this
908 * off. This is to support live upgrades from older systemd
909 * versions where PID 1 was moved there. */
9444b1f2 910 if (m->running_as == SYSTEMD_SYSTEM) {
0d8c31ff
ZJS
911 char *e;
912
9444b1f2 913 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
15c60e99
LP
914 if (!e)
915 e = endswith(m->cgroup_root, "/system");
9444b1f2
LP
916 if (e)
917 *e = 0;
0baf24dd 918 }
7ccfb64a 919
9444b1f2
LP
920 /* And make sure to store away the root value without trailing
921 * slash, even for the root dir, so that we can easily prepend
922 * it everywhere. */
923 if (streq(m->cgroup_root, "/"))
924 m->cgroup_root[0] = 0;
8e274523 925
35d2e7ec 926 /* 2. Show data */
9444b1f2 927 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
23bbb0de
MS
928 if (r < 0)
929 return log_error_errno(r, "Cannot find cgroup mount point: %m");
8e274523 930
c6c18be3 931 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
0d8c31ff 932 if (!m->test_run) {
c6c18be3 933
0d8c31ff
ZJS
934 /* 3. Install agent */
935 if (m->running_as == SYSTEMD_SYSTEM) {
936 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
937 if (r < 0)
da927ba9 938 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
0d8c31ff
ZJS
939 else if (r > 0)
940 log_debug("Installed release agent.");
941 else
942 log_debug("Release agent already installed.");
943 }
8e274523 944
0d8c31ff
ZJS
945 /* 4. Make sure we are in the root cgroup */
946 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
23bbb0de
MS
947 if (r < 0)
948 return log_error_errno(r, "Failed to create root cgroup hierarchy: %m");
c6c18be3 949
0d8c31ff
ZJS
950 /* 5. And pin it, so that it cannot be unmounted */
951 safe_close(m->pin_cgroupfs_fd);
c6c18be3 952
0d8c31ff 953 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
954 if (m->pin_cgroupfs_fd < 0)
955 return log_error_errno(errno, "Failed to open pin file: %m");
0d8c31ff 956
cc98b302 957 /* 6. Always enable hierarchical support if it exists... */
0d8c31ff 958 cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
c6c18be3
LP
959 }
960
0d8c31ff 961 /* 7. Figure out which controllers are supported */
4ad49000 962 m->cgroup_supported = cg_mask_supported();
9156e799 963
a32360f1 964 return 0;
8e274523
LP
965}
966
c6c18be3 967void manager_shutdown_cgroup(Manager *m, bool delete) {
8e274523
LP
968 assert(m);
969
9444b1f2
LP
970 /* We can't really delete the group, since we are in it. But
971 * let's trim it. */
972 if (delete && m->cgroup_root)
973 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
8e274523 974
03e334a1 975 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
c6c18be3 976
9444b1f2
LP
977 free(m->cgroup_root);
978 m->cgroup_root = NULL;
8e274523
LP
979}
980
4ad49000 981Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
acb14d31 982 char *p;
4ad49000 983 Unit *u;
acb14d31
LP
984
985 assert(m);
986 assert(cgroup);
acb14d31 987
4ad49000
LP
988 u = hashmap_get(m->cgroup_unit, cgroup);
989 if (u)
990 return u;
acb14d31 991
8e70580b 992 p = strdupa(cgroup);
acb14d31
LP
993 for (;;) {
994 char *e;
995
996 e = strrchr(p, '/');
4ad49000
LP
997 if (e == p || !e)
998 return NULL;
acb14d31
LP
999
1000 *e = 0;
1001
4ad49000
LP
1002 u = hashmap_get(m->cgroup_unit, p);
1003 if (u)
1004 return u;
acb14d31
LP
1005 }
1006}
1007
4ad49000
LP
1008Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1009 _cleanup_free_ char *cgroup = NULL;
acb14d31 1010 int r;
8e274523 1011
8c47c732
LP
1012 assert(m);
1013
1014 if (pid <= 1)
1015 return NULL;
1016
4ad49000
LP
1017 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1018 if (r < 0)
6dde1f33
LP
1019 return NULL;
1020
4ad49000 1021 return manager_get_unit_by_cgroup(m, cgroup);
6dde1f33 1022}
4fbf50b3 1023
4ad49000
LP
1024int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1025 Unit *u;
1026 int r;
4fbf50b3 1027
4ad49000
LP
1028 assert(m);
1029 assert(cgroup);
4fbf50b3 1030
4ad49000 1031 u = manager_get_unit_by_cgroup(m, cgroup);
5ad096b3
LP
1032 if (!u)
1033 return 0;
b56c28c3 1034
5ad096b3
LP
1035 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
1036 if (r <= 0)
1037 return r;
1038
1039 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1040 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1041
1042 unit_add_to_gc_queue(u);
1043 return 0;
1044}
1045
1046int unit_get_memory_current(Unit *u, uint64_t *ret) {
1047 _cleanup_free_ char *v = NULL;
1048 int r;
1049
1050 assert(u);
1051 assert(ret);
1052
1053 if (!u->cgroup_path)
1054 return -ENODATA;
1055
1056 if ((u->cgroup_realized_mask & CGROUP_MEMORY) == 0)
1057 return -ENODATA;
1058
1059 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
1060 if (r == -ENOENT)
1061 return -ENODATA;
1062 if (r < 0)
1063 return r;
1064
1065 return safe_atou64(v, ret);
1066}
1067
1068static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
1069 _cleanup_free_ char *v = NULL;
1070 uint64_t ns;
1071 int r;
1072
1073 assert(u);
1074 assert(ret);
1075
1076 if (!u->cgroup_path)
1077 return -ENODATA;
1078
1079 if ((u->cgroup_realized_mask & CGROUP_CPUACCT) == 0)
1080 return -ENODATA;
1081
1082 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
1083 if (r == -ENOENT)
1084 return -ENODATA;
1085 if (r < 0)
1086 return r;
1087
1088 r = safe_atou64(v, &ns);
1089 if (r < 0)
1090 return r;
1091
1092 *ret = ns;
1093 return 0;
1094}
1095
1096int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
1097 nsec_t ns;
1098 int r;
1099
1100 r = unit_get_cpu_usage_raw(u, &ns);
1101 if (r < 0)
1102 return r;
1103
1104 if (ns > u->cpuacct_usage_base)
1105 ns -= u->cpuacct_usage_base;
1106 else
1107 ns = 0;
1108
1109 *ret = ns;
1110 return 0;
1111}
1112
1113int unit_reset_cpu_usage(Unit *u) {
1114 nsec_t ns;
1115 int r;
1116
1117 assert(u);
1118
1119 r = unit_get_cpu_usage_raw(u, &ns);
1120 if (r < 0) {
1121 u->cpuacct_usage_base = 0;
1122 return r;
b56c28c3 1123 }
2633eb83 1124
5ad096b3 1125 u->cpuacct_usage_base = ns;
4ad49000 1126 return 0;
4fbf50b3
LP
1127}
1128
4ad49000
LP
1129static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1130 [CGROUP_AUTO] = "auto",
1131 [CGROUP_CLOSED] = "closed",
1132 [CGROUP_STRICT] = "strict",
1133};
4fbf50b3 1134
4ad49000 1135DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);