/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "special.h"
#include "string-util.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->memory_limit = (uint64_t) -1;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;

        c->netclass_type = CGROUP_NETCLASS_TYPE_NONE;
}

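/* Helpers to unlink a single entry from its per-context list and free it. */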
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

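/* Writes the context's settings to f in unit-file assignment syntax,
 * one setting per line, each line prepended with the given prefix. */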
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}

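/* Resolves a path to the block device (major:minor) to apply blkio
 * attributes to: the device itself if the path refers to a block device
 * node, otherwise the whole disk backing the file system the path
 * resides on. */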
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

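/* Adds a devices.allow entry for the given device node, granting the
 * access modes listed in acc (some combination of "r", "w" and "m"). */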
static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}

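/* Looks up all majors whose name matches the given glob in
 * /proc/devices and adds a devices.allow entry covering every minor of
 * each matching major. */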
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}

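/* Translates the high-level CGroupContext settings into writes to the
 * low-level attribute files of every controller included in the mask. */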
void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, uint32_t netclass, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%" PRIu64 "\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
                        c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }

        if (mask & CGROUP_MASK_BLKIO) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
                             DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%" PRIu64 "\n",
                                IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
                                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }

        if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);

                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        else
                                r = cg_set_attribute("memory", path, "memory.max", buf);

                } else {
                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
                        else
                                r = cg_set_attribute("memory", path, "memory.max", "max");
                }

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
        }

        if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if ((mask & CGROUP_MASK_PIDS) && !is_root) {

                if (c->tasks_max != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set pids.max on %s: %m", path);
        }

        if (mask & CGROUP_MASK_NET_CLS) {
                char buf[DECIMAL_STR_MAX(uint32_t)];

                sprintf(buf, "%" PRIu32, netclass);

                r = cg_set_attribute("net_cls", path, "net_cls.classid", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set net_cls.classid on %s: %m", path);
        }
}

CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        if (c->netclass_type != CGROUP_NETCLASS_TYPE_NONE)
                mask |= CGROUP_MASK_NET_CLS;

        return mask;
}

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroups,
         * unless we are on the legacy hierarchy and the process we
         * fork into it is known to drop privileges, and hence
         * shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate
         * controllers to unprivileged services. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e ||
                    exec_context_maintains_privileges(e) ||
                    cg_unified() > 0)
                        return _CGROUP_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}

CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children
         * require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_own_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

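/* Callback for cgroup migration: walks up from the unit through its
 * slices and returns the path of the closest cgroup that is already
 * realized with all the controllers in mask, to be used as the
 * migration target. */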
static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

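/* Computes the default cgroup path for a unit: the manager's cgroup
 * root, followed by the escaped slice path (if any) and the escaped
 * unit name. */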
char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}

int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        } else
                p = NULL;

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 1;
}

int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *populated = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified();
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to detect whether the unified hierarchy is used: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;

        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
}

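/* Finds an unused net_cls class ID, scanning upwards from the most
 * recently assigned ID and wrapping around into the dynamic range
 * above the fixed IDs when UINT32_MAX is reached. */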
static int unit_find_free_netclass_cgroup(Unit *u, uint32_t *ret) {

        uint32_t start, i;
        Manager *m;

        assert(u);

        m = u->manager;

        i = start = m->cgroup_netclass_registry_last;

        do {
                i++;

                if (!hashmap_get(m->cgroup_netclass_registry, UINT_TO_PTR(i))) {
                        m->cgroup_netclass_registry_last = i;
                        *ret = i;
                        return 0;
                }

                if (i == UINT32_MAX)
                        i = CGROUP_NETCLASS_FIXED_MAX;

        } while (i != start);

        return -ENOBUFS;
}

int unit_add_to_netclass_cgroup(Unit *u) {

        CGroupContext *cc;
        Unit *first;
        void *key;
        int r;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;

        switch (cc->netclass_type) {
        case CGROUP_NETCLASS_TYPE_NONE:
                return 0;

        case CGROUP_NETCLASS_TYPE_FIXED:
                u->cgroup_netclass_id = cc->netclass_id;
                break;

        case CGROUP_NETCLASS_TYPE_AUTO:
                /* Allocate a new ID in case it was requested and not done yet */
                if (u->cgroup_netclass_id == 0) {
                        r = unit_find_free_netclass_cgroup(u, &u->cgroup_netclass_id);
                        if (r < 0)
                                return r;

                        log_debug("Dynamically assigned netclass cgroup id %" PRIu32 " to %s", u->cgroup_netclass_id, u->id);
                }

                break;
        }

        r = hashmap_ensure_allocated(&u->manager->cgroup_netclass_registry, &trivial_hash_ops);
        if (r < 0)
                return r;

        key = UINT32_TO_PTR(u->cgroup_netclass_id);
        first = hashmap_get(u->manager->cgroup_netclass_registry, key);

        if (first) {
                LIST_PREPEND(cgroup_netclass, first, u);
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, u);
        }

        return hashmap_put(u->manager->cgroup_netclass_registry, key, u);
}

int unit_remove_from_netclass_cgroup(Unit *u) {

        Unit *head;
        void *key;

        assert(u);

        key = UINT32_TO_PTR(u->cgroup_netclass_id);

        LIST_FIND_HEAD(cgroup_netclass, u, head);
        LIST_REMOVE(cgroup_netclass, head, u);

        if (head)
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, head);

        hashmap_remove(u->manager->cgroup_netclass_registry, key);

        return 0;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        if (unit_has_mask_realized(u, target_mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        enable_mask = unit_get_enable_mask(u);
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, u->cgroup_netclass_id, state);

        return 0;
}

static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

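/* Realizes the cgroups of all units currently in the cgroup queue and
 * returns the number of units processed. */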
unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}

void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

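/* Attempts to determine the main PID of a unit by enumerating its
 * cgroup: only processes that are direct children of the manager are
 * considered, and if more than one such process is found the main
 * process is ambiguous, so -ENODATA is returned. */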
int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                           in this group, so we don't know what process
                           is the main process. */

                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}

static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}

int unit_watch_all_pids(Unit *u) {
        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        if (cg_unified() > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

int unit_notify_cgroup_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r <= 0)
                return r;

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        CGroupController c;
        int r, unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && m->running_as == MANAGER_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        while ((e = endswith(m->cgroup_root, "/")))
                *e = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        unified = cg_unified();
        if (unified < 0)
                return log_error_errno(unified, "Couldn't determine if we are running in the unified hierarchy: %m");
        if (unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else
                log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        if (!m->test_run) {
                const char *scope_path;

                /* 3. Install agent */
                if (unified) {

                        /* In the unified hierarchy we can get
                         * cgroup empty notifications via inotify. */

                        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                        safe_close(m->cgroup_inotify_fd);

                        m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                        if (m->cgroup_inotify_fd < 0)
                                return log_error_errno(errno, "Failed to create control group inotify object: %m");

                        r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                        if (r < 0)
                                return log_error_errno(r, "Failed to watch control group inotify object: %m");

                        r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
                        if (r < 0)
                                return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                        (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

                } else if (m->running_as == MANAGER_SYSTEM) {

                        /* On the legacy hierarchy we only get
                         * notifications via cgroup agents. (Which
                         * isn't really reliable, since it does not
                         * generate events when control groups with
                         * children run empty.) */

                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else if (r == 0)
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
                scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to create %s control group: %m", scope_path);

                /* Also, move all other userspace processes remaining
                 * in the root cgroup into that scope. */
                r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
                if (r < 0)
                        log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);
                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                if (!unified)
                        (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        assert(m);

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        return unit_notify_cgroup_empty(u);
}

int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        if (cg_unified() <= 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}

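/* Returns the CPU time consumed since the last counter reset, i.e. the
 * raw cpuacct usage minus the stored base value. */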
int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0)
                return r;

        if (ns > u->cpuacct_usage_base)
                ns -= u->cpuacct_usage_base;
        else
                ns = 0;

        *ret = ns;
        return 0;
}

int unit_reset_cpu_usage(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpuacct_usage_base = 0;
                return r;
        }

        u->cpuacct_usage_base = ns;
        return 0;
}

bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}

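/* Marks the given controllers as no longer realized for the unit and
 * queues it for re-realization, so that the corresponding attributes
 * are reapplied on the next cgroup queue dispatch. */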
void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        if ((u->cgroup_realized_mask & m) == 0)
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_queue(u);
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);