1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
25 #include "process-util.h"
26 #include "path-util.h"
28 #include "cgroup-util.h"
31 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
33 void cgroup_context_init(CGroupContext
*c
) {
36 /* Initialize everything to the kernel defaults, assuming the
37 * structure is preinitialized to 0 */
39 c
->cpu_shares
= (unsigned long) -1;
40 c
->startup_cpu_shares
= (unsigned long) -1;
41 c
->memory_limit
= (uint64_t) -1;
42 c
->blockio_weight
= (unsigned long) -1;
43 c
->startup_blockio_weight
= (unsigned long) -1;
45 c
->cpu_quota_per_sec_usec
= USEC_INFINITY
;
48 void cgroup_context_free_device_allow(CGroupContext
*c
, CGroupDeviceAllow
*a
) {
52 LIST_REMOVE(device_allow
, c
->device_allow
, a
);
57 void cgroup_context_free_blockio_device_weight(CGroupContext
*c
, CGroupBlockIODeviceWeight
*w
) {
61 LIST_REMOVE(device_weights
, c
->blockio_device_weights
, w
);
66 void cgroup_context_free_blockio_device_bandwidth(CGroupContext
*c
, CGroupBlockIODeviceBandwidth
*b
) {
70 LIST_REMOVE(device_bandwidths
, c
->blockio_device_bandwidths
, b
);
75 void cgroup_context_done(CGroupContext
*c
) {
78 while (c
->blockio_device_weights
)
79 cgroup_context_free_blockio_device_weight(c
, c
->blockio_device_weights
);
81 while (c
->blockio_device_bandwidths
)
82 cgroup_context_free_blockio_device_bandwidth(c
, c
->blockio_device_bandwidths
);
84 while (c
->device_allow
)
85 cgroup_context_free_device_allow(c
, c
->device_allow
);
88 void cgroup_context_dump(CGroupContext
*c
, FILE* f
, const char *prefix
) {
89 CGroupBlockIODeviceBandwidth
*b
;
90 CGroupBlockIODeviceWeight
*w
;
92 char u
[FORMAT_TIMESPAN_MAX
];
97 prefix
= strempty(prefix
);
100 "%sCPUAccounting=%s\n"
101 "%sBlockIOAccounting=%s\n"
102 "%sMemoryAccounting=%s\n"
104 "%sStartupCPUShares=%lu\n"
105 "%sCPUQuotaPerSecSec=%s\n"
106 "%sBlockIOWeight=%lu\n"
107 "%sStartupBlockIOWeight=%lu\n"
108 "%sMemoryLimit=%" PRIu64
"\n"
109 "%sDevicePolicy=%s\n"
111 prefix
, yes_no(c
->cpu_accounting
),
112 prefix
, yes_no(c
->blockio_accounting
),
113 prefix
, yes_no(c
->memory_accounting
),
114 prefix
, c
->cpu_shares
,
115 prefix
, c
->startup_cpu_shares
,
116 prefix
, format_timespan(u
, sizeof(u
), c
->cpu_quota_per_sec_usec
, 1),
117 prefix
, c
->blockio_weight
,
118 prefix
, c
->startup_blockio_weight
,
119 prefix
, c
->memory_limit
,
120 prefix
, cgroup_device_policy_to_string(c
->device_policy
),
121 prefix
, yes_no(c
->delegate
));
123 LIST_FOREACH(device_allow
, a
, c
->device_allow
)
125 "%sDeviceAllow=%s %s%s%s\n",
128 a
->r
? "r" : "", a
->w
? "w" : "", a
->m
? "m" : "");
130 LIST_FOREACH(device_weights
, w
, c
->blockio_device_weights
)
132 "%sBlockIODeviceWeight=%s %lu",
137 LIST_FOREACH(device_bandwidths
, b
, c
->blockio_device_bandwidths
) {
138 char buf
[FORMAT_BYTES_MAX
];
143 b
->read
? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
145 format_bytes(buf
, sizeof(buf
), b
->bandwidth
));
149 static int lookup_blkio_device(const char *p
, dev_t
*dev
) {
158 return log_warning_errno(errno
, "Couldn't stat device %s: %m", p
);
160 if (S_ISBLK(st
.st_mode
))
162 else if (major(st
.st_dev
) != 0) {
163 /* If this is not a device node then find the block
164 * device this file is stored on */
167 /* If this is a partition, try to get the originating
169 block_get_whole_disk(*dev
, dev
);
171 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p
);
178 static int whitelist_device(const char *path
, const char *node
, const char *acc
) {
179 char buf
[2+DECIMAL_STR_MAX(dev_t
)*2+2+4];
186 if (stat(node
, &st
) < 0) {
187 log_warning("Couldn't stat device %s", node
);
191 if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
192 log_warning("%s is not a device.", node
);
198 S_ISCHR(st
.st_mode
) ? 'c' : 'b',
199 major(st
.st_rdev
), minor(st
.st_rdev
),
202 r
= cg_set_attribute("devices", path
, "devices.allow", buf
);
204 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
, -EINVAL
) ? LOG_DEBUG
: LOG_WARNING
, r
,
205 "Failed to set devices.allow on %s: %m", path
);
210 static int whitelist_major(const char *path
, const char *name
, char type
, const char *acc
) {
211 _cleanup_fclose_
FILE *f
= NULL
;
218 assert(type
== 'b' || type
== 'c');
220 f
= fopen("/proc/devices", "re");
222 return log_warning_errno(errno
, "Cannot open /proc/devices to resolve %s (%c): %m", name
, type
);
224 FOREACH_LINE(line
, f
, goto fail
) {
225 char buf
[2+DECIMAL_STR_MAX(unsigned)+3+4], *p
, *w
;
230 if (type
== 'c' && streq(line
, "Character devices:")) {
235 if (type
== 'b' && streq(line
, "Block devices:")) {
250 w
= strpbrk(p
, WHITESPACE
);
255 r
= safe_atou(p
, &maj
);
262 w
+= strspn(w
, WHITESPACE
);
264 if (fnmatch(name
, w
, 0) != 0)
273 r
= cg_set_attribute("devices", path
, "devices.allow", buf
);
275 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
, -EINVAL
) ? LOG_DEBUG
: LOG_WARNING
, r
,
276 "Failed to set devices.allow on %s: %m", path
);
282 log_warning_errno(errno
, "Failed to read /proc/devices: %m");
286 void cgroup_context_apply(CGroupContext
*c
, CGroupMask mask
, const char *path
, ManagerState state
) {
296 /* Some cgroup attributes are not supported on the root cgroup,
297 * hence silently ignore */
298 is_root
= isempty(path
) || path_equal(path
, "/");
300 /* Make sure we don't try to display messages with an empty path. */
303 /* We generally ignore errors caused by read-only mounted
304 * cgroup trees (assuming we are running in a container then),
305 * and missing cgroups, i.e. EROFS and ENOENT. */
307 if ((mask
& CGROUP_MASK_CPU
) && !is_root
) {
308 char buf
[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t
)) + 1];
310 sprintf(buf
, "%lu\n",
311 IN_SET(state
, MANAGER_STARTING
, MANAGER_INITIALIZING
) && c
->startup_cpu_shares
!= (unsigned long) -1 ? c
->startup_cpu_shares
:
312 c
->cpu_shares
!= (unsigned long) -1 ? c
->cpu_shares
: 1024);
313 r
= cg_set_attribute("cpu", path
, "cpu.shares", buf
);
315 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
316 "Failed to set cpu.shares on %s: %m", path
);
318 sprintf(buf
, USEC_FMT
"\n", CGROUP_CPU_QUOTA_PERIOD_USEC
);
319 r
= cg_set_attribute("cpu", path
, "cpu.cfs_period_us", buf
);
321 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
322 "Failed to set cpu.cfs_period_us on %s: %m", path
);
324 if (c
->cpu_quota_per_sec_usec
!= USEC_INFINITY
) {
325 sprintf(buf
, USEC_FMT
"\n", c
->cpu_quota_per_sec_usec
* CGROUP_CPU_QUOTA_PERIOD_USEC
/ USEC_PER_SEC
);
326 r
= cg_set_attribute("cpu", path
, "cpu.cfs_quota_us", buf
);
328 r
= cg_set_attribute("cpu", path
, "cpu.cfs_quota_us", "-1");
330 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
331 "Failed to set cpu.cfs_quota_us on %s: %m", path
);
334 if (mask
& CGROUP_MASK_BLKIO
) {
335 char buf
[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
336 DECIMAL_STR_MAX(dev_t
)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
337 DECIMAL_STR_MAX(dev_t
)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
338 CGroupBlockIODeviceWeight
*w
;
339 CGroupBlockIODeviceBandwidth
*b
;
342 sprintf(buf
, "%lu\n", IN_SET(state
, MANAGER_STARTING
, MANAGER_INITIALIZING
) && c
->startup_blockio_weight
!= (unsigned long) -1 ? c
->startup_blockio_weight
:
343 c
->blockio_weight
!= (unsigned long) -1 ? c
->blockio_weight
: 1000);
344 r
= cg_set_attribute("blkio", path
, "blkio.weight", buf
);
346 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
347 "Failed to set blkio.weight on %s: %m", path
);
349 /* FIXME: no way to reset this list */
350 LIST_FOREACH(device_weights
, w
, c
->blockio_device_weights
) {
353 r
= lookup_blkio_device(w
->path
, &dev
);
357 sprintf(buf
, "%u:%u %lu", major(dev
), minor(dev
), w
->weight
);
358 r
= cg_set_attribute("blkio", path
, "blkio.weight_device", buf
);
360 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
361 "Failed to set blkio.weight_device on %s: %m", path
);
365 /* FIXME: no way to reset this list */
366 LIST_FOREACH(device_bandwidths
, b
, c
->blockio_device_bandwidths
) {
370 r
= lookup_blkio_device(b
->path
, &dev
);
374 a
= b
->read
? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
376 sprintf(buf
, "%u:%u %" PRIu64
"\n", major(dev
), minor(dev
), b
->bandwidth
);
377 r
= cg_set_attribute("blkio", path
, a
, buf
);
379 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
380 "Failed to set %s on %s: %m", a
, path
);
384 if ((mask
& CGROUP_MASK_MEMORY
) && !is_root
) {
385 if (c
->memory_limit
!= (uint64_t) -1) {
386 char buf
[DECIMAL_STR_MAX(uint64_t) + 1];
388 sprintf(buf
, "%" PRIu64
"\n", c
->memory_limit
);
390 if (cg_unified() <= 0)
391 r
= cg_set_attribute("memory", path
, "memory.limit_in_bytes", buf
);
393 r
= cg_set_attribute("memory", path
, "memory.max", buf
);
396 if (cg_unified() <= 0)
397 r
= cg_set_attribute("memory", path
, "memory.limit_in_bytes", "-1");
399 r
= cg_set_attribute("memory", path
, "memory.max", "max");
403 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
404 "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path
);
407 if ((mask
& CGROUP_MASK_DEVICE
) && !is_root
) {
408 CGroupDeviceAllow
*a
;
410 /* Changing the devices list of a populated cgroup
411 * might result in EINVAL, hence ignore EINVAL
414 if (c
->device_allow
|| c
->device_policy
!= CGROUP_AUTO
)
415 r
= cg_set_attribute("devices", path
, "devices.deny", "a");
417 r
= cg_set_attribute("devices", path
, "devices.allow", "a");
419 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
, -EINVAL
) ? LOG_DEBUG
: LOG_WARNING
, r
,
420 "Failed to reset devices.list on %s: %m", path
);
422 if (c
->device_policy
== CGROUP_CLOSED
||
423 (c
->device_policy
== CGROUP_AUTO
&& c
->device_allow
)) {
424 static const char auto_devices
[] =
425 "/dev/null\0" "rwm\0"
426 "/dev/zero\0" "rwm\0"
427 "/dev/full\0" "rwm\0"
428 "/dev/random\0" "rwm\0"
429 "/dev/urandom\0" "rwm\0"
431 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
435 NULSTR_FOREACH_PAIR(x
, y
, auto_devices
)
436 whitelist_device(path
, x
, y
);
438 whitelist_major(path
, "pts", 'c', "rw");
439 whitelist_major(path
, "kdbus", 'c', "rw");
440 whitelist_major(path
, "kdbus/*", 'c', "rw");
443 LIST_FOREACH(device_allow
, a
, c
->device_allow
) {
459 if (startswith(a
->path
, "/dev/"))
460 whitelist_device(path
, a
->path
, acc
);
461 else if (startswith(a
->path
, "block-"))
462 whitelist_major(path
, a
->path
+ 6, 'b', acc
);
463 else if (startswith(a
->path
, "char-"))
464 whitelist_major(path
, a
->path
+ 5, 'c', acc
);
466 log_debug("Ignoring device %s while writing cgroup attribute.", a
->path
);
471 CGroupMask
cgroup_context_get_mask(CGroupContext
*c
) {
474 /* Figure out which controllers we need */
476 if (c
->cpu_accounting
||
477 c
->cpu_shares
!= (unsigned long) -1 ||
478 c
->startup_cpu_shares
!= (unsigned long) -1 ||
479 c
->cpu_quota_per_sec_usec
!= USEC_INFINITY
)
480 mask
|= CGROUP_MASK_CPUACCT
| CGROUP_MASK_CPU
;
482 if (c
->blockio_accounting
||
483 c
->blockio_weight
!= (unsigned long) -1 ||
484 c
->startup_blockio_weight
!= (unsigned long) -1 ||
485 c
->blockio_device_weights
||
486 c
->blockio_device_bandwidths
)
487 mask
|= CGROUP_MASK_BLKIO
;
489 if (c
->memory_accounting
||
490 c
->memory_limit
!= (uint64_t) -1)
491 mask
|= CGROUP_MASK_MEMORY
;
493 if (c
->device_allow
||
494 c
->device_policy
!= CGROUP_AUTO
)
495 mask
|= CGROUP_MASK_DEVICE
;
500 CGroupMask
unit_get_own_mask(Unit
*u
) {
503 /* Returns the mask of controllers the unit needs for itself */
505 c
= unit_get_cgroup_context(u
);
509 /* If delegation is turned on, then turn on all cgroups,
510 * unless we are on the legacy hierarchy and the process we
511 * fork into it is known to drop privileges, and hence
512 * shouldn't get access to the controllers.
514 * Note that on the unified hierarchy it is safe to delegate
515 * controllers to unprivileged services. */
520 e
= unit_get_exec_context(u
);
522 exec_context_maintains_privileges(e
) ||
524 return _CGROUP_MASK_ALL
;
527 return cgroup_context_get_mask(c
);
530 CGroupMask
unit_get_members_mask(Unit
*u
) {
533 /* Returns the mask of controllers all of the unit's children
536 if (u
->cgroup_members_mask_valid
)
537 return u
->cgroup_members_mask
;
539 u
->cgroup_members_mask
= 0;
541 if (u
->type
== UNIT_SLICE
) {
545 SET_FOREACH(member
, u
->dependencies
[UNIT_BEFORE
], i
) {
550 if (UNIT_DEREF(member
->slice
) != u
)
553 u
->cgroup_members_mask
|=
554 unit_get_own_mask(member
) |
555 unit_get_members_mask(member
);
559 u
->cgroup_members_mask_valid
= true;
560 return u
->cgroup_members_mask
;
563 CGroupMask
unit_get_siblings_mask(Unit
*u
) {
566 /* Returns the mask of controllers all of the unit's siblings
567 * require, i.e. the members mask of the unit's parent slice
568 * if there is one. */
570 if (UNIT_ISSET(u
->slice
))
571 return unit_get_members_mask(UNIT_DEREF(u
->slice
));
573 return unit_get_own_mask(u
) | unit_get_members_mask(u
);
576 CGroupMask
unit_get_subtree_mask(Unit
*u
) {
578 /* Returns the mask of this subtree, meaning of the group
579 * itself and its children. */
581 return unit_get_own_mask(u
) | unit_get_members_mask(u
);
584 CGroupMask
unit_get_target_mask(Unit
*u
) {
587 /* This returns the cgroup mask of all controllers to enable
588 * for a specific cgroup, i.e. everything it needs itself,
589 * plus all that its children need, plus all that its siblings
590 * need. This is primarily useful on the legacy cgroup
591 * hierarchy, where we need to duplicate each cgroup in each
592 * hierarchy that shall be enabled for it. */
594 mask
= unit_get_own_mask(u
) | unit_get_members_mask(u
) | unit_get_siblings_mask(u
);
595 mask
&= u
->manager
->cgroup_supported
;
600 CGroupMask
unit_get_enable_mask(Unit
*u
) {
603 /* This returns the cgroup mask of all controllers to enable
604 * for the children of a specific cgroup. This is primarily
605 * useful for the unified cgroup hierarchy, where each cgroup
606 * controls which controllers are enabled for its children. */
608 mask
= unit_get_members_mask(u
);
609 mask
&= u
->manager
->cgroup_supported
;
614 /* Recurse from a unit up through its containing slices, propagating
615 * mask bits upward. A unit is also member of itself. */
616 void unit_update_cgroup_members_masks(Unit
*u
) {
622 /* Calculate subtree mask */
623 m
= unit_get_subtree_mask(u
);
625 /* See if anything changed from the previous invocation. If
626 * not, we're done. */
627 if (u
->cgroup_subtree_mask_valid
&& m
== u
->cgroup_subtree_mask
)
631 u
->cgroup_subtree_mask_valid
&&
632 ((m
& ~u
->cgroup_subtree_mask
) != 0) &&
633 ((~m
& u
->cgroup_subtree_mask
) == 0);
635 u
->cgroup_subtree_mask
= m
;
636 u
->cgroup_subtree_mask_valid
= true;
638 if (UNIT_ISSET(u
->slice
)) {
639 Unit
*s
= UNIT_DEREF(u
->slice
);
642 /* There's more set now than before. We
643 * propagate the new mask to the parent's mask
644 * (not caring if it actually was valid or
647 s
->cgroup_members_mask
|= m
;
650 /* There's less set now than before (or we
651 * don't know), we need to recalculate
652 * everything, so let's invalidate the
653 * parent's members mask */
655 s
->cgroup_members_mask_valid
= false;
657 /* And now make sure that this change also hits our
659 unit_update_cgroup_members_masks(s
);
663 static const char *migrate_callback(CGroupMask mask
, void *userdata
) {
670 if (u
->cgroup_path
&&
671 u
->cgroup_realized
&&
672 (u
->cgroup_realized_mask
& mask
) == mask
)
673 return u
->cgroup_path
;
675 u
= UNIT_DEREF(u
->slice
);
681 char *unit_default_cgroup_path(Unit
*u
) {
682 _cleanup_free_
char *escaped
= NULL
, *slice
= NULL
;
687 if (unit_has_name(u
, SPECIAL_ROOT_SLICE
))
688 return strdup(u
->manager
->cgroup_root
);
690 if (UNIT_ISSET(u
->slice
) && !unit_has_name(UNIT_DEREF(u
->slice
), SPECIAL_ROOT_SLICE
)) {
691 r
= cg_slice_to_path(UNIT_DEREF(u
->slice
)->id
, &slice
);
696 escaped
= cg_escape(u
->id
);
701 return strjoin(u
->manager
->cgroup_root
, "/", slice
, "/", escaped
, NULL
);
703 return strjoin(u
->manager
->cgroup_root
, "/", escaped
, NULL
);
706 int unit_set_cgroup_path(Unit
*u
, const char *path
) {
707 _cleanup_free_
char *p
= NULL
;
719 if (streq_ptr(u
->cgroup_path
, p
))
723 r
= hashmap_put(u
->manager
->cgroup_unit
, p
, u
);
728 unit_release_cgroup(u
);
736 int unit_watch_cgroup(Unit
*u
) {
737 _cleanup_free_
char *populated
= NULL
;
745 if (u
->cgroup_inotify_wd
>= 0)
748 /* Only applies to the unified hierarchy */
751 return log_unit_error_errno(u
, r
, "Failed detect wether the unified hierarchy is used: %m");
755 /* Don't watch the root slice, it's pointless. */
756 if (unit_has_name(u
, SPECIAL_ROOT_SLICE
))
759 r
= hashmap_ensure_allocated(&u
->manager
->cgroup_inotify_wd_unit
, &trivial_hash_ops
);
763 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, u
->cgroup_path
, "cgroup.populated", &populated
);
767 u
->cgroup_inotify_wd
= inotify_add_watch(u
->manager
->cgroup_inotify_fd
, populated
, IN_MODIFY
);
768 if (u
->cgroup_inotify_wd
< 0) {
770 if (errno
== ENOENT
) /* If the directory is already
771 * gone we don't need to track
772 * it, so this is not an error */
775 return log_unit_error_errno(u
, errno
, "Failed to add inotify watch descriptor for control group %s: %m", u
->cgroup_path
);
778 r
= hashmap_put(u
->manager
->cgroup_inotify_wd_unit
, INT_TO_PTR(u
->cgroup_inotify_wd
), u
);
780 return log_unit_error_errno(u
, r
, "Failed to add inotify watch descriptor to hash map: %m");
785 static int unit_create_cgroup(
787 CGroupMask target_mask
,
788 CGroupMask enable_mask
) {
795 c
= unit_get_cgroup_context(u
);
799 if (!u
->cgroup_path
) {
800 _cleanup_free_
char *path
= NULL
;
802 path
= unit_default_cgroup_path(u
);
806 r
= unit_set_cgroup_path(u
, path
);
808 return log_unit_error_errno(u
, r
, "Control group %s exists already.", path
);
810 return log_unit_error_errno(u
, r
, "Failed to set unit's control group path to %s: %m", path
);
813 /* First, create our own group */
814 r
= cg_create_everywhere(u
->manager
->cgroup_supported
, target_mask
, u
->cgroup_path
);
816 return log_unit_error_errno(u
, r
, "Failed to create cgroup %s: %m", u
->cgroup_path
);
818 /* Start watching it */
819 (void) unit_watch_cgroup(u
);
821 /* Enable all controllers we need */
822 r
= cg_enable_everywhere(u
->manager
->cgroup_supported
, enable_mask
, u
->cgroup_path
);
824 log_unit_warning_errno(u
, r
, "Failed to enable controllers on cgroup %s, ignoring: %m", u
->cgroup_path
);
826 /* Keep track that this is now realized */
827 u
->cgroup_realized
= true;
828 u
->cgroup_realized_mask
= target_mask
;
830 if (u
->type
!= UNIT_SLICE
&& !c
->delegate
) {
832 /* Then, possibly move things over, but not if
833 * subgroups may contain processes, which is the case
834 * for slice and delegation units. */
835 r
= cg_migrate_everywhere(u
->manager
->cgroup_supported
, u
->cgroup_path
, u
->cgroup_path
, migrate_callback
, u
);
837 log_unit_warning_errno(u
, r
, "Failed to migrate cgroup from to %s, ignoring: %m", u
->cgroup_path
);
843 int unit_attach_pids_to_cgroup(Unit
*u
) {
847 r
= unit_realize_cgroup(u
);
851 r
= cg_attach_many_everywhere(u
->manager
->cgroup_supported
, u
->cgroup_path
, u
->pids
, migrate_callback
, u
);
858 static bool unit_has_mask_realized(Unit
*u
, CGroupMask target_mask
) {
861 return u
->cgroup_realized
&& u
->cgroup_realized_mask
== target_mask
;
864 /* Check if necessary controllers and attributes for a unit are in place.
867 * If not, create paths, move processes over, and set attributes.
869 * Returns 0 on success and < 0 on failure. */
870 static int unit_realize_cgroup_now(Unit
*u
, ManagerState state
) {
871 CGroupMask target_mask
, enable_mask
;
876 if (u
->in_cgroup_queue
) {
877 LIST_REMOVE(cgroup_queue
, u
->manager
->cgroup_queue
, u
);
878 u
->in_cgroup_queue
= false;
881 target_mask
= unit_get_target_mask(u
);
882 if (unit_has_mask_realized(u
, target_mask
))
885 /* First, realize parents */
886 if (UNIT_ISSET(u
->slice
)) {
887 r
= unit_realize_cgroup_now(UNIT_DEREF(u
->slice
), state
);
892 /* And then do the real work */
893 enable_mask
= unit_get_enable_mask(u
);
894 r
= unit_create_cgroup(u
, target_mask
, enable_mask
);
898 /* Finally, apply the necessary attributes. */
899 cgroup_context_apply(unit_get_cgroup_context(u
), target_mask
, u
->cgroup_path
, state
);
904 static void unit_add_to_cgroup_queue(Unit
*u
) {
906 if (u
->in_cgroup_queue
)
909 LIST_PREPEND(cgroup_queue
, u
->manager
->cgroup_queue
, u
);
910 u
->in_cgroup_queue
= true;
913 unsigned manager_dispatch_cgroup_queue(Manager
*m
) {
919 state
= manager_state(m
);
921 while ((i
= m
->cgroup_queue
)) {
922 assert(i
->in_cgroup_queue
);
924 r
= unit_realize_cgroup_now(i
, state
);
926 log_warning_errno(r
, "Failed to realize cgroups for queued unit %s, ignoring: %m", i
->id
);
934 static void unit_queue_siblings(Unit
*u
) {
937 /* This adds the siblings of the specified unit and the
938 * siblings of all parent units to the cgroup queue. (But
939 * neither the specified unit itself nor the parents.) */
941 while ((slice
= UNIT_DEREF(u
->slice
))) {
945 SET_FOREACH(m
, slice
->dependencies
[UNIT_BEFORE
], i
) {
949 /* Skip units that have a dependency on the slice
950 * but aren't actually in it. */
951 if (UNIT_DEREF(m
->slice
) != slice
)
954 /* No point in doing cgroup application for units
955 * without active processes. */
956 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m
)))
959 /* If the unit doesn't need any new controllers
960 * and has current ones realized, it doesn't need
962 if (unit_has_mask_realized(m
, unit_get_target_mask(m
)))
965 unit_add_to_cgroup_queue(m
);
972 int unit_realize_cgroup(Unit
*u
) {
975 if (!UNIT_HAS_CGROUP_CONTEXT(u
))
978 /* So, here's the deal: when realizing the cgroups for this
979 * unit, we need to first create all parents, but there's more
980 * actually: for the weight-based controllers we also need to
981 * make sure that all our siblings (i.e. units that are in the
982 * same slice as we are) have cgroups, too. Otherwise, things
983 * would become very uneven as each of their processes would
984 * get as much resources as all our group together. This call
985 * will synchronously create the parent cgroups, but will
986 * defer work on the siblings to the next event loop
989 /* Add all sibling slices to the cgroup queue. */
990 unit_queue_siblings(u
);
992 /* And realize this one now (and apply the values) */
993 return unit_realize_cgroup_now(u
, manager_state(u
->manager
));
996 void unit_release_cgroup(Unit
*u
) {
999 /* Forgets all cgroup details for this cgroup */
1001 if (u
->cgroup_path
) {
1002 (void) hashmap_remove(u
->manager
->cgroup_unit
, u
->cgroup_path
);
1003 u
->cgroup_path
= mfree(u
->cgroup_path
);
1006 if (u
->cgroup_inotify_wd
>= 0) {
1007 if (inotify_rm_watch(u
->manager
->cgroup_inotify_fd
, u
->cgroup_inotify_wd
) < 0)
1008 log_unit_debug_errno(u
, errno
, "Failed to remove cgroup inotify watch %i for %s, ignoring", u
->cgroup_inotify_wd
, u
->id
);
1010 (void) hashmap_remove(u
->manager
->cgroup_inotify_wd_unit
, INT_TO_PTR(u
->cgroup_inotify_wd
));
1011 u
->cgroup_inotify_wd
= -1;
1015 void unit_prune_cgroup(Unit
*u
) {
1021 /* Removes the cgroup, if empty and possible, and stops watching it. */
1023 if (!u
->cgroup_path
)
1026 is_root_slice
= unit_has_name(u
, SPECIAL_ROOT_SLICE
);
1028 r
= cg_trim_everywhere(u
->manager
->cgroup_supported
, u
->cgroup_path
, !is_root_slice
);
1030 log_debug_errno(r
, "Failed to destroy cgroup %s, ignoring: %m", u
->cgroup_path
);
1037 unit_release_cgroup(u
);
1039 u
->cgroup_realized
= false;
1040 u
->cgroup_realized_mask
= 0;
1043 int unit_search_main_pid(Unit
*u
, pid_t
*ret
) {
1044 _cleanup_fclose_
FILE *f
= NULL
;
1045 pid_t pid
= 0, npid
, mypid
;
1051 if (!u
->cgroup_path
)
1054 r
= cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER
, u
->cgroup_path
, &f
);
1059 while (cg_read_pid(f
, &npid
) > 0) {
1065 /* Ignore processes that aren't our kids */
1066 if (get_parent_of_pid(npid
, &ppid
) >= 0 && ppid
!= mypid
)
1070 /* Dang, there's more than one daemonized PID
1071 in this group, so we don't know what process
1072 is the main process. */
1083 static int unit_watch_pids_in_path(Unit
*u
, const char *path
) {
1084 _cleanup_closedir_
DIR *d
= NULL
;
1085 _cleanup_fclose_
FILE *f
= NULL
;
1091 r
= cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER
, path
, &f
);
1097 while ((r
= cg_read_pid(f
, &pid
)) > 0) {
1098 r
= unit_watch_pid(u
, pid
);
1099 if (r
< 0 && ret
>= 0)
1103 if (r
< 0 && ret
>= 0)
1107 r
= cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER
, path
, &d
);
1114 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
1115 _cleanup_free_
char *p
= NULL
;
1117 p
= strjoin(path
, "/", fn
, NULL
);
1123 r
= unit_watch_pids_in_path(u
, p
);
1124 if (r
< 0 && ret
>= 0)
1128 if (r
< 0 && ret
>= 0)
1135 int unit_watch_all_pids(Unit
*u
) {
1138 /* Adds all PIDs from our cgroup to the set of PIDs we
1139 * watch. This is a fallback logic for cases where we do not
1140 * get reliable cgroup empty notifications: we try to use
1141 * SIGCHLD as replacement. */
1143 if (!u
->cgroup_path
)
1146 if (cg_unified() > 0) /* On unified we can use proper notifications */
1149 return unit_watch_pids_in_path(u
, u
->cgroup_path
);
1152 int unit_notify_cgroup_empty(Unit
*u
) {
1157 if (!u
->cgroup_path
)
1160 r
= cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER
, u
->cgroup_path
);
1164 unit_add_to_gc_queue(u
);
1166 if (UNIT_VTABLE(u
)->notify_cgroup_empty
)
1167 UNIT_VTABLE(u
)->notify_cgroup_empty(u
);
1172 static int on_cgroup_inotify_event(sd_event_source
*s
, int fd
, uint32_t revents
, void *userdata
) {
1173 Manager
*m
= userdata
;
1180 union inotify_event_buffer buffer
;
1181 struct inotify_event
*e
;
1184 l
= read(fd
, &buffer
, sizeof(buffer
));
1186 if (errno
== EINTR
|| errno
== EAGAIN
)
1189 return log_error_errno(errno
, "Failed to read control group inotify events: %m");
1192 FOREACH_INOTIFY_EVENT(e
, buffer
, l
) {
1196 /* Queue overflow has no watch descriptor */
1199 if (e
->mask
& IN_IGNORED
)
1200 /* The watch was just removed */
1203 u
= hashmap_get(m
->cgroup_inotify_wd_unit
, INT_TO_PTR(e
->wd
));
1204 if (!u
) /* Not that inotify might deliver
1205 * events for a watch even after it
1206 * was removed, because it was queued
1207 * before the removal. Let's ignore
1208 * this here safely. */
1211 (void) unit_notify_cgroup_empty(u
);
1216 int manager_setup_cgroup(Manager
*m
) {
1217 _cleanup_free_
char *path
= NULL
;
1224 /* 1. Determine hierarchy */
1225 m
->cgroup_root
= mfree(m
->cgroup_root
);
1226 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 0, &m
->cgroup_root
);
1228 return log_error_errno(r
, "Cannot determine cgroup we are running in: %m");
1230 /* Chop off the init scope, if we are already located in it */
1231 e
= endswith(m
->cgroup_root
, "/" SPECIAL_INIT_SCOPE
);
1233 /* LEGACY: Also chop off the system slice if we are in
1234 * it. This is to support live upgrades from older systemd
1235 * versions where PID 1 was moved there. Also see
1236 * cg_get_root_path(). */
1237 if (!e
&& m
->running_as
== MANAGER_SYSTEM
) {
1238 e
= endswith(m
->cgroup_root
, "/" SPECIAL_SYSTEM_SLICE
);
1240 e
= endswith(m
->cgroup_root
, "/system"); /* even more legacy */
1245 /* And make sure to store away the root value without trailing
1246 * slash, even for the root dir, so that we can easily prepend
1248 while ((e
= endswith(m
->cgroup_root
, "/")))
1252 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, m
->cgroup_root
, NULL
, &path
);
1254 return log_error_errno(r
, "Cannot find cgroup mount point: %m");
1256 unified
= cg_unified();
1258 return log_error_errno(r
, "Couldn't determine if we are running in the unified hierarchy: %m");
1260 log_debug("Unified cgroup hierarchy is located at %s.", path
);
1262 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER
". File system hierarchy is at %s.", path
);
1265 const char *scope_path
;
1267 /* 3. Install agent */
1270 /* In the unified hierarchy we can can get
1271 * cgroup empty notifications via inotify. */
1273 m
->cgroup_inotify_event_source
= sd_event_source_unref(m
->cgroup_inotify_event_source
);
1274 safe_close(m
->cgroup_inotify_fd
);
1276 m
->cgroup_inotify_fd
= inotify_init1(IN_NONBLOCK
|IN_CLOEXEC
);
1277 if (m
->cgroup_inotify_fd
< 0)
1278 return log_error_errno(errno
, "Failed to create control group inotify object: %m");
1280 r
= sd_event_add_io(m
->event
, &m
->cgroup_inotify_event_source
, m
->cgroup_inotify_fd
, EPOLLIN
, on_cgroup_inotify_event
, m
);
1282 return log_error_errno(r
, "Failed to watch control group inotify object: %m");
1284 r
= sd_event_source_set_priority(m
->cgroup_inotify_event_source
, SD_EVENT_PRIORITY_IDLE
- 5);
1286 return log_error_errno(r
, "Failed to set priority of inotify event source: %m");
1288 (void) sd_event_source_set_description(m
->cgroup_inotify_event_source
, "cgroup-inotify");
1290 } else if (m
->running_as
== MANAGER_SYSTEM
) {
1292 /* On the legacy hierarchy we only get
1293 * notifications via cgroup agents. (Which
1294 * isn't really reliable, since it does not
1295 * generate events when control groups with
1296 * children run empty. */
1298 r
= cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER
, SYSTEMD_CGROUP_AGENT_PATH
);
1300 log_warning_errno(r
, "Failed to install release agent, ignoring: %m");
1302 log_debug("Installed release agent.");
1304 log_debug("Release agent already installed.");
1307 /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
1308 scope_path
= strjoina(m
->cgroup_root
, "/" SPECIAL_INIT_SCOPE
);
1309 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, scope_path
, 0);
1311 return log_error_errno(r
, "Failed to create %s control group: %m", scope_path
);
1313 /* also, move all other userspace processes remaining
1314 * in the root cgroup into that scope. */
1315 r
= cg_migrate(SYSTEMD_CGROUP_CONTROLLER
, m
->cgroup_root
, SYSTEMD_CGROUP_CONTROLLER
, scope_path
, false);
1317 log_warning_errno(r
, "Couldn't move remaining userspace processes, ignoring: %m");
1319 /* 5. And pin it, so that it cannot be unmounted */
1320 safe_close(m
->pin_cgroupfs_fd
);
1321 m
->pin_cgroupfs_fd
= open(path
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
|O_NOCTTY
|O_NONBLOCK
);
1322 if (m
->pin_cgroupfs_fd
< 0)
1323 return log_error_errno(errno
, "Failed to open pin file: %m");
1325 /* 6. Always enable hierarchical support if it exists... */
1327 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
1330 /* 7. Figure out which controllers are supported */
1331 r
= cg_mask_supported(&m
->cgroup_supported
);
1333 return log_error_errno(r
, "Failed to determine supported controllers: %m");
1335 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++)
1336 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c
), yes_no(m
->cgroup_supported
& c
));
1341 void manager_shutdown_cgroup(Manager
*m
, bool delete) {
1344 /* We can't really delete the group, since we are in it. But
1346 if (delete && m
->cgroup_root
)
1347 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER
, m
->cgroup_root
, false);
1349 m
->cgroup_inotify_wd_unit
= hashmap_free(m
->cgroup_inotify_wd_unit
);
1351 m
->cgroup_inotify_event_source
= sd_event_source_unref(m
->cgroup_inotify_event_source
);
1352 m
->cgroup_inotify_fd
= safe_close(m
->cgroup_inotify_fd
);
1354 m
->pin_cgroupfs_fd
= safe_close(m
->pin_cgroupfs_fd
);
1356 m
->cgroup_root
= mfree(m
->cgroup_root
);
1359 Unit
* manager_get_unit_by_cgroup(Manager
*m
, const char *cgroup
) {
1366 u
= hashmap_get(m
->cgroup_unit
, cgroup
);
1370 p
= strdupa(cgroup
);
1374 e
= strrchr(p
, '/');
1376 return hashmap_get(m
->cgroup_unit
, SPECIAL_ROOT_SLICE
);
1380 u
= hashmap_get(m
->cgroup_unit
, p
);
1386 Unit
*manager_get_unit_by_pid_cgroup(Manager
*m
, pid_t pid
) {
1387 _cleanup_free_
char *cgroup
= NULL
;
1395 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &cgroup
);
1399 return manager_get_unit_by_cgroup(m
, cgroup
);
1402 Unit
*manager_get_unit_by_pid(Manager
*m
, pid_t pid
) {
1411 return hashmap_get(m
->units
, SPECIAL_INIT_SCOPE
);
1413 u
= hashmap_get(m
->watch_pids1
, PID_TO_PTR(pid
));
1417 u
= hashmap_get(m
->watch_pids2
, PID_TO_PTR(pid
));
1421 return manager_get_unit_by_pid_cgroup(m
, pid
);
1424 int manager_notify_cgroup_empty(Manager
*m
, const char *cgroup
) {
1430 u
= manager_get_unit_by_cgroup(m
, cgroup
);
1434 return unit_notify_cgroup_empty(u
);
1437 int unit_get_memory_current(Unit
*u
, uint64_t *ret
) {
1438 _cleanup_free_
char *v
= NULL
;
1444 if (!u
->cgroup_path
)
1447 if ((u
->cgroup_realized_mask
& CGROUP_MASK_MEMORY
) == 0)
1450 if (cg_unified() <= 0)
1451 r
= cg_get_attribute("memory", u
->cgroup_path
, "memory.usage_in_bytes", &v
);
1453 r
= cg_get_attribute("memory", u
->cgroup_path
, "memory.current", &v
);
1459 return safe_atou64(v
, ret
);
1462 static int unit_get_cpu_usage_raw(Unit
*u
, nsec_t
*ret
) {
1463 _cleanup_free_
char *v
= NULL
;
1470 if (!u
->cgroup_path
)
1473 if ((u
->cgroup_realized_mask
& CGROUP_MASK_CPUACCT
) == 0)
1476 r
= cg_get_attribute("cpuacct", u
->cgroup_path
, "cpuacct.usage", &v
);
1482 r
= safe_atou64(v
, &ns
);
1490 int unit_get_cpu_usage(Unit
*u
, nsec_t
*ret
) {
1494 r
= unit_get_cpu_usage_raw(u
, &ns
);
1498 if (ns
> u
->cpuacct_usage_base
)
1499 ns
-= u
->cpuacct_usage_base
;
1507 int unit_reset_cpu_usage(Unit
*u
) {
1513 r
= unit_get_cpu_usage_raw(u
, &ns
);
1515 u
->cpuacct_usage_base
= 0;
1519 u
->cpuacct_usage_base
= ns
;
1523 bool unit_cgroup_delegate(Unit
*u
) {
1528 c
= unit_get_cgroup_context(u
);
1535 static const char* const cgroup_device_policy_table
[_CGROUP_DEVICE_POLICY_MAX
] = {
1536 [CGROUP_AUTO
] = "auto",
1537 [CGROUP_CLOSED
] = "closed",
1538 [CGROUP_STRICT
] = "strict",
1541 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy
, CGroupDevicePolicy
);