/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "path-util.h"
#include "process-util.h"
#include "special.h"
#include "string-util.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->memory_limit = (uint64_t) -1;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;

        c->netclass_type = CGROUP_NETCLASS_TYPE_NONE;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
143 "%sBlockIODeviceWeight=%s %" PRIu64,
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}

static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
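
/* Note on the resolution above, with a worked example (paths hypothetical):
 * for a regular file such as "/var/log/journal" stored on /dev/sda2, stat()
 * yields the backing file system's st_dev, and block_get_whole_disk() then
 * maps the partition to its whole disk, so subsequent blkio settings apply
 * to sda rather than sda2. */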

static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}
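
/* A minimal standalone sketch (illustration only, compiled out) of what
 * whitelist_device() amounts to on the legacy hierarchy: writing a
 * "type major:minor access" line into the cgroup's devices.allow file.
 * The cgroup path below is a hypothetical example. */
#if 0
#include <stdio.h>

static int example_allow_dev_null(void) {
        const char *cgroup = "/sys/fs/cgroup/devices/system.slice/foo.service";
        char p[4096];
        FILE *f;

        snprintf(p, sizeof(p), "%s/devices.allow", cgroup);

        f = fopen(p, "we");
        if (!f)
                return -1;

        /* "c 1:3 rw" == character device, major 1, minor 3 (/dev/null),
         * allow read and write */
        fputs("c 1:3 rw", f);

        fclose(f);
        return 0;
}
#endif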

static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
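
/* For reference, the /proc/devices content parsed above looks roughly like
 * this (a header per device class, then "major name" pairs, with a blank
 * line between the sections):
 *
 *   Character devices:
 *     1 mem
 *     5 /dev/tty
 *   136 pts
 *
 *   Block devices:
 *     8 sd
 *   253 device-mapper
 */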

void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, uint32_t netclass, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%" PRIu64 "\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
                        c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }
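
        /* Worked example for the quota math above: CPUQuota=50% is stored as
         * cpu_quota_per_sec_usec == 500000. With the fixed 100ms period this
         * yields cpu.cfs_quota_us = 500000 * 100000 / 1000000 = 50000, i.e.
         * 50ms of CPU time per 100ms period. */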

        if (mask & CGROUP_MASK_BLKIO) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
                             DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%" PRIu64 "\n",
                                IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
                                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }
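
        /* The attribute lines written above use the kernel's
         * "major:minor value" syntax, e.g. "8:0 500" to give /dev/sda a
         * weight of 500, or "8:0 10485760" to throttle reads from it to
         * 10 MiB/s. */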

        if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);

                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        else
                                r = cg_set_attribute("memory", path, "memory.max", buf);

                } else {
                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
                        else
                                r = cg_set_attribute("memory", path, "memory.max", "max");
                }

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
        }

        if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if ((mask & CGROUP_MASK_PIDS) && !is_root) {

                if (c->tasks_max != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set pids.max on %s: %m", path);
        }

        if (mask & CGROUP_MASK_NET_CLS) {
                char buf[DECIMAL_STR_MAX(uint32_t)];

                sprintf(buf, "%" PRIu32, netclass);

                r = cg_set_attribute("net_cls", path, "net_cls.classid", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set net_cls.classid on %s: %m", path);
        }
}

CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        if (c->netclass_type != CGROUP_NETCLASS_TYPE_NONE)
                mask |= CGROUP_MASK_NET_CLS;

        return mask;
}

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroups,
         * unless we are on the legacy hierarchy and the process we
         * fork into it is known to drop privileges, and hence
         * shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate
         * controllers to unprivileged services. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e ||
                    exec_context_maintains_privileges(e) ||
                    cg_unified() > 0)
                        return _CGROUP_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}

CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children
         * require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_own_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}
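
/* Worked example (assuming an empty cgroup root): a unit "foo.service"
 * placed in "bar-baz.slice" maps to "/bar.slice/bar-baz.slice/foo.service",
 * since cg_slice_to_path() expands the dash-separated slice name into
 * nested directories and cg_escape() prefixes names that would collide
 * with kernel-reserved files. */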

int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        } else
                p = NULL;

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 1;
}

int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *populated = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified();
        if (r < 0)
795 return log_unit_error_errno(u, r, "Failed detect wether the unified hierarchy is used: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}
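
/* A self-contained sketch (illustration only; the path below is a
 * hypothetical example, and the block is compiled out) of the mechanism
 * unit_watch_cgroup() relies on: on the unified hierarchy, an IN_MODIFY
 * inotify watch on a cgroup's "cgroup.populated" file fires whenever the
 * subtree gains or loses its last process, which is how we get empty
 * notifications without a release agent. The real implementation above
 * feeds the watch into the manager's sd-event loop instead. */
#if 0
#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

static int example_watch_populated(const char *cgroup_dir) {
        char path[4096], buf[4096];
        int fd, wd;

        /* e.g. cgroup_dir = "/sys/fs/cgroup/system.slice/foo.service" */
        snprintf(path, sizeof(path), "%s/cgroup.populated", cgroup_dir);

        fd = inotify_init1(IN_CLOEXEC);
        if (fd < 0)
                return -1;

        wd = inotify_add_watch(fd, path, IN_MODIFY);
        if (wd < 0) {
                close(fd);
                return -1;
        }

        /* Blocks until the populated state flips; a real caller would
         * poll() this fd from an event loop and then re-read the file. */
        (void) read(fd, buf, sizeof(buf));

        close(fd);
        return 0;
}
#endif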

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
881 log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;
        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
}

static int unit_find_free_netclass_cgroup(Unit *u, uint32_t *ret) {

        uint32_t start, i;
        Manager *m;

        assert(u);

        m = u->manager;

        i = start = m->cgroup_netclass_registry_last;

        do {
                i++;

                if (!hashmap_get(m->cgroup_netclass_registry, UINT_TO_PTR(i))) {
                        m->cgroup_netclass_registry_last = i;
                        *ret = i;
                        return 0;
                }

                if (i == UINT32_MAX)
                        i = CGROUP_NETCLASS_FIXED_MAX;

        } while (i != start);

        return -ENOBUFS;
}
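
/* The loop above is a simple wrap-around allocator: it starts one past the
 * last ID handed out, skips IDs still present in the registry hashmap, wraps
 * from UINT32_MAX back to CGROUP_NETCLASS_FIXED_MAX (keeping the low range
 * reserved for fixed, user-configured IDs), and gives up with -ENOBUFS only
 * after a full cycle. */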

int unit_add_to_netclass_cgroup(Unit *u) {

        CGroupContext *cc;
        Unit *first;
        void *key;
        int r;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;

        switch (cc->netclass_type) {
        case CGROUP_NETCLASS_TYPE_NONE:
                return 0;

        case CGROUP_NETCLASS_TYPE_FIXED:
                u->cgroup_netclass_id = cc->netclass_id;
                break;

        case CGROUP_NETCLASS_TYPE_AUTO:
                /* Allocate a new ID in case it was requested and not done yet */
                if (u->cgroup_netclass_id == 0) {
                        r = unit_find_free_netclass_cgroup(u, &u->cgroup_netclass_id);
                        if (r < 0)
                                return r;

                        log_debug("Dynamically assigned netclass cgroup id %" PRIu32 " to %s", u->cgroup_netclass_id, u->id);
                }

                break;
        }

        r = hashmap_ensure_allocated(&u->manager->cgroup_netclass_registry, &trivial_hash_ops);
        if (r < 0)
                return r;

        key = UINT32_TO_PTR(u->cgroup_netclass_id);
        first = hashmap_get(u->manager->cgroup_netclass_registry, key);

        if (first) {
                LIST_PREPEND(cgroup_netclass, first, u);
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, u);
        }

        return hashmap_put(u->manager->cgroup_netclass_registry, key, u);
}

int unit_remove_from_netclass_cgroup(Unit *u) {

        Unit *head;
        void *key;

        assert(u);

        key = UINT32_TO_PTR(u->cgroup_netclass_id);

        LIST_FIND_HEAD(cgroup_netclass, u, head);
        LIST_REMOVE(cgroup_netclass, head, u);

        if (head)
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, head);

        hashmap_remove(u->manager->cgroup_netclass_registry, key);

        return 0;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        if (unit_has_mask_realized(u, target_mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        enable_mask = unit_get_enable_mask(u);
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, u->cgroup_netclass_id, state);

        return 0;
}

static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}

void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                           in this group, so we don't know what process
                           is the main process. */

                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}

static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}

int unit_watch_all_pids(Unit *u) {
        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        if (cg_unified() > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

int unit_notify_cgroup_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r <= 0)
                return r;

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        CGroupController c;
        int r, unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && m->running_as == MANAGER_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        while ((e = endswith(m->cgroup_root, "/")))
                *e = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        unified = cg_unified();
        if (unified < 0)
1399 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
        if (unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else
                log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        if (!m->test_run) {
                const char *scope_path;

                /* 3. Install agent */
                if (unified) {

                        /* In the unified hierarchy we can get
                         * cgroup empty notifications via inotify. */

                        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                        safe_close(m->cgroup_inotify_fd);

                        m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                        if (m->cgroup_inotify_fd < 0)
                                return log_error_errno(errno, "Failed to create control group inotify object: %m");

                        r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                        if (r < 0)
                                return log_error_errno(r, "Failed to watch control group inotify object: %m");

                        r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
                        if (r < 0)
                                return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                        (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

                } else if (m->running_as == MANAGER_SYSTEM) {

                        /* On the legacy hierarchy we only get
                         * notifications via cgroup agents. (Which
                         * isn't really reliable, since it does not
                         * generate events when control groups with
                         * children run empty.) */

                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else if (r == 0)
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
                scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to create %s control group: %m", scope_path);

                /* also, move all other userspace processes remaining
                 * in the root cgroup into that scope. */
                r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
                if (r < 0)
                        log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);
                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                if (!unified)
                        (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
1477 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & c));

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        assert(m);

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        return unit_notify_cgroup_empty(u);
}

int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        if (cg_unified() <= 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0)
                return r;

        if (ns > u->cpuacct_usage_base)
                ns -= u->cpuacct_usage_base;
        else
                ns = 0;

        *ret = ns;
        return 0;
}

int unit_reset_cpu_usage(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpuacct_usage_base = 0;
                return r;
        }

        u->cpuacct_usage_base = ns;
        return 0;
}
1685
1686 bool unit_cgroup_delegate(Unit *u) {
1687 CGroupContext *c;
1688
1689 assert(u);
1690
1691 c = unit_get_cgroup_context(u);
1692 if (!c)
1693 return false;
1694
1695 return c->delegate;
1696 }
1697
1698 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
1699 assert(u);
1700
1701 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1702 return;
1703
1704 if (m == 0)
1705 return;
1706
1707 if ((u->cgroup_realized_mask & m) == 0)
1708 return;
1709
1710 u->cgroup_realized_mask &= ~m;
1711 unit_add_to_cgroup_queue(u);
1712 }
1713
1714 void manager_invalidate_startup_units(Manager *m) {
1715 Iterator i;
1716 Unit *u;
1717
1718 assert(m);
1719
1720 SET_FOREACH(u, m->startup_units, i)
1721 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
1722 }
1723
1724 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1725 [CGROUP_AUTO] = "auto",
1726 [CGROUP_CLOSED] = "closed",
1727 [CGROUP_STRICT] = "strict",
1728 };
1729
1730 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);