/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "fileio.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "special.h"
#include "string-table.h"
#include "string-util.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
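
/* A quick sketch of how a per-second CPU quota maps onto the CFS
 * attributes written further below (illustrative numbers, not from this
 * file): cpu.cfs_period_us is fixed at 100ms, and cpu.cfs_quota_us is
 * scaled from the per-second value, i.e.
 *
 *     quota_us = cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC
 *
 * For example, CPUQuota=200% is stored as cpu_quota_per_sec_usec == 2s,
 * which yields 2000000 * 100000 / 1000000 = 200000, i.e. 200ms of CPU
 * time per 100ms period, spread across all CPUs. */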

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->memory_limit = (uint64_t) -1;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;

        c->netclass_type = CGROUP_NETCLASS_TYPE_NONE;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecUSec=%s\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}
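
/* Purely illustrative: for a unit with CPUQuota=50% configured and
 * everything else left at its defaults, the dump above prints, among
 * other lines (unset shares/limits print as the raw (uint64_t) -1, i.e.
 * 18446744073709551615):
 *
 *     CPUAccounting=no
 *     CPUShares=18446744073709551615
 *     CPUQuotaPerSecUSec=500ms
 *     MemoryLimit=18446744073709551615
 *     DevicePolicy=auto
 *     Delegate=no
 */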

static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0) {
                log_warning_errno(errno, "Couldn't stat device %s: %m", node);
                return -errno;
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}
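
/* Illustrative example (not taken from this file): whitelisting
 * read/write/mknod access to /dev/null, which is character device 1:3,
 * boils down to writing the following to the legacy "devices" controller:
 *
 *     echo "c 1:3 rwm" > /sys/fs/cgroup/devices/<path>/devices.allow
 */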

static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
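
/* For reference, /proc/devices (which whitelist_major() parses above)
 * looks roughly like this, one "major name" pair per line:
 *
 *     Character devices:
 *       1 mem
 *       5 /dev/tty
 *     136 pts
 *
 *     Block devices:
 *       8 sd
 *
 * Hence a call like whitelist_major(path, "pts", 'c', "rw") matches the
 * "pts" line via fnmatch() and writes "c 136:* rw" to devices.allow,
 * whitelisting all pseudo-terminal slaves in one go. */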

void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, uint32_t netclass, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%" PRIu64 "\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
                        c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }

        if (mask & CGROUP_MASK_BLKIO) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
                             DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%" PRIu64 "\n",
                                IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
                                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }

        if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);

                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        else
                                r = cg_set_attribute("memory", path, "memory.max", buf);

                } else {
                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
                        else
                                r = cg_set_attribute("memory", path, "memory.max", "max");
                }

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
        }

        if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if ((mask & CGROUP_MASK_PIDS) && !is_root) {

                if (c->tasks_max != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set pids.max on %s: %m", path);
        }

        if (mask & CGROUP_MASK_NET_CLS) {
                char buf[DECIMAL_STR_MAX(uint32_t)];

                sprintf(buf, "%" PRIu32, netclass);

                r = cg_set_attribute("net_cls", path, "net_cls.classid", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set net_cls.classid on %s: %m", path);
        }
}

CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        if (c->netclass_type != CGROUP_NETCLASS_TYPE_NONE)
                mask |= CGROUP_MASK_NET_CLS;

        return mask;
}
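
/* Illustrative example (values made up): a unit that sets MemoryLimit=1G
 * and CPUShares=512, and leaves everything else at its defaults, yields
 * CGROUP_MASK_MEMORY|CGROUP_MASK_CPUACCT|CGROUP_MASK_CPU here, i.e. only
 * the "memory", "cpuacct" and "cpu" hierarchies need to be realized for
 * it. */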

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroups,
         * unless we are on the legacy hierarchy and the process we
         * fork into it is known to drop privileges, and hence
         * shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate
         * controllers to unprivileged services. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e ||
                    exec_context_maintains_privileges(e) ||
                    cg_unified() > 0)
                        return _CGROUP_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}

CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children
         * require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_own_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}
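
/* To make the mask family above concrete, consider a hypothetical tree
 * system.slice -> foo.service, where foo.service sets MemoryLimit= and a
 * sibling bar.service sets CPUShares=. Then, roughly:
 *
 *   own mask of foo.service      = MEMORY
 *   members mask of system.slice = MEMORY|CPU|CPUACCT (merged from both children)
 *   siblings mask of foo.service = members mask of system.slice
 *   target mask of foo.service   = own|members|siblings = MEMORY|CPU|CPUACCT
 *
 * i.e. on the legacy hierarchy every sibling gets realized in every
 * controller any one of them needs, so that weight-based resources are
 * split up evenly. */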

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also a member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

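        /* A sketch of the test below: "more" is true only if the new mask
         * is a strict superset of the old one, i.e. bits were added and
         * none removed. Only then is it sufficient to OR the new bits into
         * the parent's members mask; if bits were dropped we cannot tell
         * whether some other sibling still needs them, so the parent's
         * mask has to be invalidated and recalculated instead. */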
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}
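
/* For example (made-up unit names): a unit foo.service in a-b.slice ends
 * up, roughly, at <cgroup_root>/a.slice/a-b.slice/foo.service, since
 * cg_slice_to_path() expands the dash-separated slice name into nested
 * directories; a unit living directly in the root slice ends up at
 * <cgroup_root>/foo.service. */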

int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        } else
                p = NULL;

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 1;
}

int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *populated = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified();
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to detect whether the unified hierarchy is used: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;
        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
}

static int unit_find_free_netclass_cgroup(Unit *u, uint32_t *ret) {

        uint32_t start, i;
        Manager *m;

        assert(u);

        m = u->manager;

        i = start = m->cgroup_netclass_registry_last;

        do {
                i++;

                if (!hashmap_get(m->cgroup_netclass_registry, UINT_TO_PTR(i))) {
                        m->cgroup_netclass_registry_last = i;
                        *ret = i;
                        return 0;
                }

                if (i == UINT32_MAX)
                        i = CGROUP_NETCLASS_FIXED_MAX;

        } while (i != start);

        return -ENOBUFS;
}
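
/* In other words: the allocator above scans forward from the most
 * recently handed-out class ID, wraps around from UINT32_MAX back to
 * CGROUP_NETCLASS_FIXED_MAX (so that dynamically allocated IDs never
 * collide with the statically configured range below it), and gives up
 * with -ENOBUFS once it has come full circle without finding a free
 * slot. */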

int unit_add_to_netclass_cgroup(Unit *u) {

        CGroupContext *cc;
        Unit *first;
        void *key;
        int r;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;

        switch (cc->netclass_type) {
        case CGROUP_NETCLASS_TYPE_NONE:
                return 0;

        case CGROUP_NETCLASS_TYPE_FIXED:
                u->cgroup_netclass_id = cc->netclass_id;
                break;

        case CGROUP_NETCLASS_TYPE_AUTO:
                /* Allocate a new ID in case it was requested and not done yet */
                if (u->cgroup_netclass_id == 0) {
                        r = unit_find_free_netclass_cgroup(u, &u->cgroup_netclass_id);
                        if (r < 0)
                                return r;

                        log_debug("Dynamically assigned netclass cgroup id %" PRIu32 " to %s", u->cgroup_netclass_id, u->id);
                }

                break;
        }

        r = hashmap_ensure_allocated(&u->manager->cgroup_netclass_registry, &trivial_hash_ops);
        if (r < 0)
                return r;

        key = UINT32_TO_PTR(u->cgroup_netclass_id);
        first = hashmap_get(u->manager->cgroup_netclass_registry, key);

        if (first) {
                LIST_PREPEND(cgroup_netclass, first, u);
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, u);
        }

        return hashmap_put(u->manager->cgroup_netclass_registry, key, u);
}

int unit_remove_from_netclass_cgroup(Unit *u) {

        Unit *head;
        void *key;

        assert(u);

        key = UINT32_TO_PTR(u->cgroup_netclass_id);

        LIST_FIND_HEAD(cgroup_netclass, u, head);
        LIST_REMOVE(cgroup_netclass, head, u);

        if (head)
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, head);

        hashmap_remove(u->manager->cgroup_netclass_registry, key);

        return 0;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        if (unit_has_mask_realized(u, target_mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        enable_mask = unit_get_enable_mask(u);
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, u->cgroup_netclass_id, state);

        return 0;
}

static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}

void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */

                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}

static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}

int unit_watch_all_pids(Unit *u) {
        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        if (cg_unified() > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

int unit_notify_cgroup_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r <= 0)
                return r;

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        CGroupController c;
        int r, unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && m->running_as == MANAGER_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        while ((e = endswith(m->cgroup_root, "/")))
                *e = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        unified = cg_unified();
        if (unified < 0)
                return log_error_errno(unified, "Couldn't determine if we are running in the unified hierarchy: %m");
        if (unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else
                log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        if (!m->test_run) {
                const char *scope_path;

                /* 3. Install agent */
                if (unified) {

                        /* In the unified hierarchy we can get
                         * cgroup empty notifications via inotify. */

                        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                        safe_close(m->cgroup_inotify_fd);

                        m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                        if (m->cgroup_inotify_fd < 0)
                                return log_error_errno(errno, "Failed to create control group inotify object: %m");

                        r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                        if (r < 0)
                                return log_error_errno(r, "Failed to watch control group inotify object: %m");

                        r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
                        if (r < 0)
                                return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                        (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

                } else if (m->running_as == MANAGER_SYSTEM) {

                        /* On the legacy hierarchy we only get
                         * notifications via cgroup agents. (Which
                         * isn't really reliable, since it does not
                         * generate events when control groups with
                         * children run empty.) */

                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else if (r == 0)
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
                scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to create %s control group: %m", scope_path);

                /* Also, move all other userspace processes remaining
                 * in the root cgroup into that scope. */
                r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
                if (r < 0)
                        log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);
                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                if (!unified)
                        (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
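
/* Worked example (made-up paths): for a cgroup
 * /system.slice/foo.service/sub, the lookup above first tries the full
 * path, then successively chops off the last component, i.e. tries
 * /system.slice/foo.service (which would match foo.service's own cgroup),
 * then /system.slice, and finally falls back to the root slice. */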

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        assert(m);

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        return unit_notify_cgroup_empty(u);
}

int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        if (cg_unified() <= 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0)
                return r;

        if (ns > u->cpuacct_usage_base)
                ns -= u->cpuacct_usage_base;
        else
                ns = 0;

        *ret = ns;
        return 0;
}

int unit_reset_cpu_usage(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpuacct_usage_base = 0;
                return r;
        }

        u->cpuacct_usage_base = ns;
        return 0;
}

bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        if ((u->cgroup_realized_mask & m) == 0)
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_queue(u);
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);