/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "cgroup-util.h"
#include "path-util.h"
#include "process-util.h"
#include "special.h"
#include "string-util.h"
#include "cgroup.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
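/* A fixed 100ms CFS period is used below; CPUQuota= is stored as usec of CPU
 * time per second and scaled to this period when written to cpu.cfs_quota_us.
 * Worked example (illustrative): CPUQuota=20% gives
 * cpu_quota_per_sec_usec == 200000, and 200000 * 100000 / 1000000 == 20000us
 * of runtime per 100000us period. */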

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->memory_limit = (uint64_t) -1;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;

        c->netclass_type = CGROUP_NETCLASS_TYPE_NONE;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}

static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
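/* Illustrative example: given a regular file that lives on /dev/sda3,
 * st.st_dev identifies the sda3 partition, and block_get_whole_disk() maps
 * that to the whole disk sda, which is what the blkio attributes expect. */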

static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0) {
                log_warning_errno(errno, "Couldn't stat device %s: %m", node);
                return -errno;
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}
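/* Illustrative example: whitelisting /dev/null (char device 1:3) for
 * read/write/mknod ends up writing the line "c 1:3 rwm" to devices.allow. */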

static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
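/* The parser above walks /proc/devices, which looks roughly like this
 * (illustrative excerpt; exact contents depend on the running kernel):
 *
 *   Character devices:
 *     1 mem
 *     5 /dev/tty
 *   136 pts
 *
 *   Block devices:
 *     8 sd
 *   254 device-mapper
 *
 * Matching "pts" with type 'c' would thus write "c 136:* rw" to devices.allow. */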

void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, uint32_t netclass, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%" PRIu64 "\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
                        c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }

        if (mask & CGROUP_MASK_BLKIO) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
                             DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%" PRIu64 "\n",
                                IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
                                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }

        if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);

                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        else
                                r = cg_set_attribute("memory", path, "memory.max", buf);

                } else {
                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
                        else
                                r = cg_set_attribute("memory", path, "memory.max", "max");
                }

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
        }

        if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if ((mask & CGROUP_MASK_PIDS) && !is_root) {

                if (c->tasks_max != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set pids.max on %s: %m", path);
        }

        if (mask & CGROUP_MASK_NET_CLS) {
                char buf[DECIMAL_STR_MAX(uint32_t)];

                sprintf(buf, "%" PRIu32, netclass);

                r = cg_set_attribute("net_cls", path, "net_cls.classid", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set net_cls.classid on %s: %m", path);
        }
}
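/* Illustrative summary of what cgroup_context_apply() ends up writing for a
 * unit with CPUShares=512, BlockIOWeight=500 and TasksMax=4096 (hypothetical
 * values), relative to the unit's cgroup path in each controller hierarchy:
 *
 *   cpu.shares        <- "512"
 *   cpu.cfs_period_us <- "100000"
 *   cpu.cfs_quota_us  <- "-1"       (no CPUQuota= configured)
 *   blkio.weight      <- "500"
 *   pids.max          <- "4096"
 */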

CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        if (c->netclass_type != CGROUP_NETCLASS_TYPE_NONE)
                mask |= CGROUP_MASK_NET_CLS;

        return mask;
}
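/* For example (illustrative): a context with only memory_limit set and
 * cpu_accounting enabled yields
 * CGROUP_MASK_MEMORY|CGROUP_MASK_CPUACCT|CGROUP_MASK_CPU here, and no other
 * controller hierarchies are touched for the unit. */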

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroups,
         * unless we are on the legacy hierarchy and the process we
         * fork into it is known to drop privileges, and hence
         * shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate
         * controllers to unprivileged services. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e ||
                    exec_context_maintains_privileges(e) ||
                    cg_unified() > 0)
                        return _CGROUP_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}

CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children
         * require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_own_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

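        /* "more" is true when the new subtree mask is a strict superset of
         * the previous one: new bits appeared and none were dropped. In that
         * case the parent's members mask can simply be OR-ed with the new
         * bits; otherwise it has to be recomputed from scratch below. */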
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

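/* Used as the migration callback for cg_migrate_everywhere() below: walks up
 * the slice chain and returns the cgroup path of the closest unit that is
 * realized for all controllers in the requested mask, or NULL if none is. */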
static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}
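/* For example (illustrative): for a unit foo.service in system.slice, with an
 * empty cgroup root, this yields "/system.slice/foo.service"; for a nested
 * slice like foo-bar.slice, cg_slice_to_path() first expands the slice name
 * to "foo.slice/foo-bar.slice". */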

int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        } else
                p = NULL;

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 1;
}

int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *populated = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified();
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to detect whether the unified hierarchy is used: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}
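/* Background (a sketch, as of the unified hierarchy at the time of this
 * code): each cgroup exposes a "cgroup.populated" file whose content flips
 * between "0" and "1" as the cgroup gains or loses processes, and the kernel
 * generates inotify IN_MODIFY events on it, which is what the watch set up
 * above relies on. */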

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;
        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
}

static int unit_find_free_netclass_cgroup(Unit *u, uint32_t *ret) {

        uint32_t start, i;
        Manager *m;

        assert(u);

        m = u->manager;

        i = start = m->cgroup_netclass_registry_last;

        do {
                i++;

                if (!hashmap_get(m->cgroup_netclass_registry, UINT_TO_PTR(i))) {
                        m->cgroup_netclass_registry_last = i;
                        *ret = i;
                        return 0;
                }

                if (i == UINT32_MAX)
                        i = CGROUP_NETCLASS_FIXED_MAX;

        } while (i != start);

        return -ENOBUFS;
}
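/* A sketch of the allocation scheme above: dynamic net_cls class IDs are
 * handed out from the range above CGROUP_NETCLASS_FIXED_MAX, scanning upward
 * from the last ID given out and wrapping around at UINT32_MAX, so values at
 * or below the fixed maximum stay reserved for explicitly configured IDs. */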

int unit_add_to_netclass_cgroup(Unit *u) {

        CGroupContext *cc;
        Unit *first;
        void *key;
        int r;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;

        switch (cc->netclass_type) {
        case CGROUP_NETCLASS_TYPE_NONE:
                return 0;

        case CGROUP_NETCLASS_TYPE_FIXED:
                u->cgroup_netclass_id = cc->netclass_id;
                break;

        case CGROUP_NETCLASS_TYPE_AUTO:
                /* Allocate a new ID in case it was requested and not done yet */
                if (u->cgroup_netclass_id == 0) {
                        r = unit_find_free_netclass_cgroup(u, &u->cgroup_netclass_id);
                        if (r < 0)
                                return r;

                        log_debug("Dynamically assigned netclass cgroup id %" PRIu32 " to %s", u->cgroup_netclass_id, u->id);
                }

                break;
        }

        r = hashmap_ensure_allocated(&u->manager->cgroup_netclass_registry, &trivial_hash_ops);
        if (r < 0)
                return r;

        key = UINT32_TO_PTR(u->cgroup_netclass_id);
        first = hashmap_get(u->manager->cgroup_netclass_registry, key);

        if (first) {
                LIST_PREPEND(cgroup_netclass, first, u);
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, u);
        }

        return hashmap_put(u->manager->cgroup_netclass_registry, key, u);
}

int unit_remove_from_netclass_cgroup(Unit *u) {

        Unit *head;
        void *key;

        assert(u);

        key = UINT32_TO_PTR(u->cgroup_netclass_id);

        LIST_FIND_HEAD(cgroup_netclass, u, head);
        LIST_REMOVE(cgroup_netclass, head, u);

        if (head)
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, head);

        hashmap_remove(u->manager->cgroup_netclass_registry, key);

        return 0;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        if (unit_has_mask_realized(u, target_mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        enable_mask = unit_get_enable_mask(u);
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, u->cgroup_netclass_id, state);

        return 0;
}

static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}

void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}

static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}

int unit_watch_all_pids(Unit *u) {
        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        if (cg_unified() > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

int unit_notify_cgroup_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r <= 0)
                return r;

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        CGroupController c;
        int r, unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && m->running_as == MANAGER_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        while ((e = endswith(m->cgroup_root, "/")))
                *e = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        unified = cg_unified();
        if (unified < 0)
                return log_error_errno(unified, "Couldn't determine if we are running in the unified hierarchy: %m");
        if (unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else
                log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        if (!m->test_run) {
                const char *scope_path;

                /* 3. Install agent */
                if (unified) {

                        /* In the unified hierarchy we can get
                         * cgroup empty notifications via inotify. */

                        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                        safe_close(m->cgroup_inotify_fd);

                        m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                        if (m->cgroup_inotify_fd < 0)
                                return log_error_errno(errno, "Failed to create control group inotify object: %m");

                        r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                        if (r < 0)
                                return log_error_errno(r, "Failed to watch control group inotify object: %m");

                        r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
                        if (r < 0)
                                return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                        (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

                } else if (m->running_as == MANAGER_SYSTEM) {

                        /* On the legacy hierarchy we only get
                         * notifications via cgroup agents. (Which
                         * isn't really reliable, since it does not
                         * generate events when control groups with
                         * children run empty.) */

                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else if (r == 0)
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
                scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to create %s control group: %m", scope_path);

                /* Also, move all other userspace processes remaining
                 * in the root cgroup into that scope. */
                r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
                if (r < 0)
                        log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);
                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                if (!unified)
                        (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        assert(m);

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        return unit_notify_cgroup_empty(u);
}

int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        if (cg_unified() <= 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0)
                return r;

        if (ns > u->cpuacct_usage_base)
                ns -= u->cpuacct_usage_base;
        else
                ns = 0;

        *ret = ns;
        return 0;
}

int unit_reset_cpu_usage(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpuacct_usage_base = 0;
                return r;
        }

        u->cpuacct_usage_base = ns;
        return 0;
}

bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        if ((u->cgroup_realized_mask & m) == 0)
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_queue(u);
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);