/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include "alloc-util.h"
#include "blockdev-util.h"
#include "bpf-firewall.h"
#include "bus-error.h"
#include "cgroup-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "procfs-util.h"
#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
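/* Illustration (example values, not from the original source): CPUQuota=20% is stored as
 * cpu_quota_per_sec_usec == 200 * USEC_PER_MSEC and, scaled against the fixed 100ms period above, ends up as
 * 20ms of CPU time per 100ms wall-clock window. */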
bool manager_owns_root_cgroup(Manager *m) {
        assert(m);

        /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
         * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there
         * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if
         * we run in any kind of container virtualization. */

        if (detect_container() > 0)
                return false;

        return isempty(m->cgroup_root) || path_equal(m->cgroup_root, "/");
}
bool unit_has_root_cgroup(Unit *u) {
        assert(u);

        /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
         * the manager manages the root cgroup. */

        if (!manager_owns_root_cgroup(u->manager))
                return false;

        return unit_has_name(u, SPECIAL_ROOT_SLICE);
}
static void cgroup_compat_warn(void) {
        static bool cgroup_compat_warned = false;

        if (cgroup_compat_warned)
                return;

        log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
                    "See cgroup-compat debug messages for details.");

        cgroup_compat_warned = true;
}

#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)
void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_weight = CGROUP_WEIGHT_INVALID;
        c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;

        c->memory_high = CGROUP_LIMIT_MAX;
        c->memory_max = CGROUP_LIMIT_MAX;
        c->memory_swap_max = CGROUP_LIMIT_MAX;

        c->memory_limit = CGROUP_LIMIT_MAX;

        c->io_weight = CGROUP_WEIGHT_INVALID;
        c->startup_io_weight = CGROUP_WEIGHT_INVALID;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;
}
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->io_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
        assert(c);
        assert(l);

        LIST_REMOVE(device_limits, c->io_device_limits, l);
        free(l->path);
        free(l);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}
void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->io_device_weights)
                cgroup_context_free_io_device_weight(c, c->io_device_weights);

        while (c->io_device_limits)
                cgroup_context_free_io_device_limit(c, c->io_device_limits);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);

        c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
        c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
}
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupIODeviceLimit *il;
        CGroupIODeviceWeight *iw;
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        IPAddressAccessItem *iaai;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sIOAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sIPAccounting=%s\n"
                "%sCPUWeight=%" PRIu64 "\n"
                "%sStartupCPUWeight=%" PRIu64 "\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sIOWeight=%" PRIu64 "\n"
                "%sStartupIOWeight=%" PRIu64 "\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLow=%" PRIu64 "\n"
                "%sMemoryHigh=%" PRIu64 "\n"
                "%sMemoryMax=%" PRIu64 "\n"
                "%sMemorySwapMax=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->io_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, yes_no(c->ip_accounting),
                prefix, c->cpu_weight,
                prefix, c->startup_cpu_weight,
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->io_weight,
                prefix, c->startup_io_weight,
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_low,
                prefix, c->memory_high,
                prefix, c->memory_max,
                prefix, c->memory_swap_max,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        if (c->delegate) {
                _cleanup_free_ char *t = NULL;

                (void) cg_mask_to_string(c->delegate_controllers, &t);

                fprintf(f, "%sDelegateControllers=%s\n",
                        prefix,
                        strempty(t));
        }

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, iw, c->io_device_weights)
                fprintf(f,
                        "%sIODeviceWeight=%s %" PRIu64,
                        prefix,
                        iw->path,
                        iw->weight);

        LIST_FOREACH(device_limits, il, c->io_device_limits) {
                char buf[FORMAT_BYTES_MAX];
                CGroupIOLimitType type;

                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                        if (il->limits[type] != cgroup_io_limit_defaults[type])
                                fprintf(f,
                                        "%s%s=%s %s\n",
                                        prefix,
                                        cgroup_io_limit_type_to_string(type),
                                        il->path,
                                        format_bytes(buf, sizeof(buf), il->limits[type]));
        }

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64,
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                if (b->rbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOReadBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->rbps));
                if (b->wbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOWriteBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->wbps));
        }

        LIST_FOREACH(items, iaai, c->ip_address_allow) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }

        LIST_FOREACH(items, iaai, c->ip_address_deny) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }
}
static int lookup_block_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                (void) block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        bool ignore_notfound;
        int r;

        assert(path);
        assert(acc);

        if (node[0] == '-') {
                /* Non-existent paths starting with "-" must be silently ignored */
                node++;
                ignore_notfound = true;
        } else
                ignore_notfound = false;

        if (stat(node, &st) < 0) {
                if (errno == ENOENT && ignore_notfound)
                        return 0;

                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(IN_SET(type, 'b', 'c'));

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        return log_warning_errno(errno, "Failed to read /proc/devices: %m");
}
static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
        return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
}

static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
        return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
                c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
}

static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_cpu_weight;
        else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->cpu_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->startup_cpu_shares;
        else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->cpu_shares;
        else
                return CGROUP_CPU_SHARES_DEFAULT;
}
static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", weight);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.weight: %m");

        if (quota != USEC_INFINITY)
                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
                         quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
        else
                xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);

        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.max: %m");
}
static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", shares);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.shares: %m");

        xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_period_us: %m");

        if (quota != USEC_INFINITY) {
                xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
        } else
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_quota_us: %m");
}
static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
        return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
        return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
}
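/* Illustration (example values): the legacy default of 1024 shares maps onto the unified default weight of 100,
 * i.e. cgroup_cpu_shares_to_weight(1024) == 100 and cgroup_cpu_shares_to_weight(2048) == 200. Results are
 * clamped to the valid ranges, i.e. weights to [CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX] and shares to
 * [CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX]. */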
static bool cgroup_context_has_io_config(CGroupContext *c) {
        return c->io_accounting ||
                c->io_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
                c->io_device_weights ||
                c->io_device_limits;
}

static bool cgroup_context_has_blockio_config(CGroupContext *c) {
        return c->blockio_accounting ||
                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->blockio_device_weights ||
                c->blockio_device_bandwidths;
}
static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_io_weight;
        else if (c->io_weight != CGROUP_WEIGHT_INVALID)
                return c->io_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->startup_blockio_weight;
        else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->blockio_weight;
        else
                return CGROUP_BLKIO_WEIGHT_DEFAULT;
}
static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
        return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
        return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
}
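/* Illustration (example values): the legacy blkio default weight of 500 corresponds to the unified default of
 * 100, so cgroup_weight_blkio_to_io(1000) == 200, clamped to the io range; the reverse conversion clamps to
 * the blkio range [CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX]. */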
static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
        r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.weight: %m");
}

static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.weight_device: %m");
}
static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        CGroupIOLimitType type;
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
                if (limits[type] != cgroup_io_limit_defaults[type]) {
                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
                        n++;
                } else
                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
        }

        xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
        r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.max: %m");

        return n;
}
static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        if (rbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.read_bps_device: %m");

        if (wbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.write_bps_device: %m");

        return n;
}
static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
        return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
}

static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
        int r;

        if (v != CGROUP_LIMIT_MAX)
                xsprintf(buf, "%" PRIu64 "\n", v);

        r = cg_set_attribute("memory", u->cgroup_path, file, buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set %s: %m", file);
}
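/* Illustration (example values): cgroup_apply_unified_memory_limit(u, "memory.max", UINT64_C(536870912)) writes
 * "536870912\n" to memory.max, while v == CGROUP_LIMIT_MAX keeps the preinitialized "max" string, which is how
 * the unified hierarchy spells "no limit". */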
static void cgroup_apply_firewall(Unit *u) {
        assert(u);

        /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */

        if (bpf_firewall_compile(u) < 0)
                return;

        (void) bpf_firewall_install(u);
}
static void cgroup_context_apply(
                Unit *u,
                CGroupMask apply_mask,
                bool apply_bpf,
                ManagerState state) {

        const char *path;
        CGroupContext *c;
        bool is_root;
        int r;

        assert(u);

        /* Nothing to do? Exit early! */
        if (apply_mask == 0 && !apply_bpf)
                return;

        /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
        is_root = unit_has_root_cgroup(u);

        assert_se(c = unit_get_cgroup_context(u));
        assert_se(path = u->cgroup_path);

        if (is_root) /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
                bool has_weight, has_shares;

                has_weight = cgroup_context_has_cpu_weight(c);
                has_shares = cgroup_context_has_cpu_shares(c);

                if (cg_all_unified() > 0) {
                        uint64_t weight;

                        if (has_weight)
                                weight = cgroup_context_cpu_weight(c, state);
                        else if (has_shares) {
                                uint64_t shares = cgroup_context_cpu_shares(c, state);

                                weight = cgroup_cpu_shares_to_weight(shares);

                                log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
                                                  shares, weight, path);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
                } else {
                        uint64_t shares;

                        if (has_weight) {
                                uint64_t weight = cgroup_context_cpu_weight(c, state);

                                shares = cgroup_cpu_weight_to_shares(weight);

                                log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
                                                  weight, shares, path);
                        } else if (has_shares)
                                shares = cgroup_context_cpu_shares(c, state);
                        else
                                shares = CGROUP_CPU_SHARES_DEFAULT;

                        cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
                }
        }

        if (apply_mask & CGROUP_MASK_IO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io)
                                weight = cgroup_context_io_weight(c, state);
                        else if (has_blockio) {
                                uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);

                                weight = cgroup_weight_blkio_to_io(blkio_weight);

                                log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
                                                  blkio_weight, weight);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        xsprintf(buf, "default %" PRIu64 "\n", weight);
                        r = cg_set_attribute("io", path, "io.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set io.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights)
                                        cgroup_apply_io_device_weight(u, w->path, w->weight);
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                        weight = cgroup_weight_blkio_to_io(w->weight);

                                        log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_io_device_weight(u, w->path, weight);
                                }
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
                                uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
                                CGroupIOLimitType type;

                                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                                        limits[type] = cgroup_io_limit_defaults[type];

                                limits[CGROUP_IO_RBPS_MAX] = b->rbps;
                                limits[CGROUP_IO_WBPS_MAX] = b->wbps;

                                log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
                                                  b->rbps, b->wbps, b->path);

                                if (!cgroup_apply_io_device_limit(u, b->path, limits))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                        }
                }
        }

        if (apply_mask & CGROUP_MASK_BLKIO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io) {
                                uint64_t io_weight = cgroup_context_io_weight(c, state);

                                weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));

                                log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
                                                  io_weight, weight);
                        } else if (has_blockio)
                                weight = cgroup_context_blkio_weight(c, state);
                        else
                                weight = CGROUP_BLKIO_WEIGHT_DEFAULT;

                        xsprintf(buf, "%" PRIu64 "\n", weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set blkio.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights) {
                                        weight = cgroup_weight_io_to_blkio(w->weight);

                                        log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_blkio_device_weight(u, w->path, weight);
                                }
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                                        cgroup_apply_blkio_device_weight(u, w->path, w->weight);
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
                                                  l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);

                                if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
                                if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                }
        }

        if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (cg_all_unified() > 0) {
                        uint64_t max, swap_max = CGROUP_LIMIT_MAX;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                max = c->memory_max;
                                swap_max = c->memory_swap_max;
                        } else {
                                max = c->memory_limit;

                                if (max != CGROUP_LIMIT_MAX)
                                        log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
                        }

                        cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
                        cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
                        cgroup_apply_unified_memory_limit(u, "memory.max", max);
                        cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
                } else {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];
                        uint64_t val;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                val = c->memory_max;
                                log_cgroup_compat(u, "Applying MemoryMax %" PRIu64 " as MemoryLimit", val);
                        } else
                                val = c->memory_limit;

                        if (val == CGROUP_LIMIT_MAX)
                                strncpy(buf, "-1\n", sizeof(buf));
                        else
                                xsprintf(buf, "%" PRIu64 "\n", val);

                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set memory.limit_in_bytes: %m");
                }
        }

        if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to reset devices.list: %m");

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/ptmx\0" "rwm\0"
                                /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
                                "-/run/systemd/inaccessible/chr\0" "rwm\0"
                                "-/run/systemd/inaccessible/blk\0" "rwm\0";

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        /* PTS (/dev/pts) devices may not be duplicated, but accessed */
                        whitelist_major(path, "pts", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4], *val;
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (path_startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if ((val = startswith(a->path, "block-")))
                                whitelist_major(path, val, 'b', acc);
                        else if ((val = startswith(a->path, "char-")))
                                whitelist_major(path, val, 'c', acc);
                        else
                                log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if (apply_mask & CGROUP_MASK_PIDS) {

                if (is_root) {
                        /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
                         * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
                         * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
                         * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
                         * exclusive ownership of the sysctls, but we still want to honour things if the user sets
                         * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
                         * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
                         * it also counts. But if the user never set a limit through us (i.e. we are the default of
                         * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
                         * the first time we set a limit. Note that this boolean is flushed out on manager reload,
                         * which is desirable so that there's an official way to release control of the sysctl from
                         * systemd: set the limit to unbounded and reload. */

                        if (c->tasks_max != CGROUP_LIMIT_MAX) {
                                u->manager->sysctl_pid_max_changed = true;
                                r = procfs_tasks_set_limit(c->tasks_max);
                        } else if (u->manager->sysctl_pid_max_changed)
                                r = procfs_tasks_set_limit(TASKS_MAX);
                        else
                                r = 0;

                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to write to tasks limit sysctls: %m");

                } else {
                        if (c->tasks_max != CGROUP_LIMIT_MAX) {
                                char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                                sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                                r = cg_set_attribute("pids", path, "pids.max", buf);
                        } else
                                r = cg_set_attribute("pids", path, "pids.max", "max");
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set pids.max: %m");
                }
        }

        if (apply_bpf)
                cgroup_apply_firewall(u);
}
CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            cgroup_context_has_cpu_weight(c) ||
            cgroup_context_has_cpu_shares(c) ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
                mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != CGROUP_LIMIT_MAX ||
            cgroup_context_has_unified_memory_config(c))
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != CGROUP_LIMIT_MAX)
                mask |= CGROUP_MASK_PIDS;

        return mask;
}
CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
}

CGroupMask unit_get_delegate_mask(Unit *u) {
        CGroupContext *c;

        /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
         * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */

        if (!unit_cgroup_delegate(u))
                return 0;

        if (cg_all_unified() <= 0) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (e && !exec_context_maintains_privileges(e))
                        return 0;
        }

        assert_se(c = unit_get_cgroup_context(u));
        return c->delegate_controllers;
}
CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                void *v;
                Unit *member;
                Iterator i;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_subtree_mask(u); /* we are the top-level slice */
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}
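/* Illustration (hypothetical units): for a.service placed in b.slice, the own mask reflects a.service's own
 * settings, the siblings mask is b.slice's members mask (thus covering e.g. a sibling c.service), and the
 * target mask is the union of the own, members and siblings masks, intersected with what the manager actually
 * supports, so that on the legacy hierarchy sibling cgroups exist in every hierarchy that gets enabled. */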
CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}
bool unit_get_needs_bpf(Unit *u) {
        CGroupContext *c;
        Unit *p;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        if (c->ip_accounting ||
            c->ip_address_allow ||
            c->ip_address_deny)
                return true;

        /* If any parent slice has an IP access list defined, it applies too */
        for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
                c = unit_get_cgroup_context(p);
                if (!c)
                        return false;

                if (c->ip_address_allow ||
                    c->ip_address_deny)
                        return true;
        }

        return false;
}
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */
                        s->cgroup_members_mask |= m;
                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */
                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {

        /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

static const char *migrate_callback(CGroupMask mask, void *userdata) {
        return unit_get_realized_cgroup_path(userdata, mask);
}
char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/",
                               escaped);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped);
}
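/* Illustration (hypothetical unit names): with the system manager's cgroup root, a.service residing in b.slice
 * yields "/b.slice/a.service", and cg_slice_to_path() expands nested slice names, e.g. b-c.slice into
 * "b.slice/b-c.slice". */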
int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        }

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = TAKE_PTR(p);

        return 1;
}
int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *events = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}
int unit_pick_cgroup_path(Unit *u) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(u);

        if (u->cgroup_path)
                return 0;

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        path = unit_default_cgroup_path(u);
        if (!path)
                return log_oom();

        r = unit_set_cgroup_path(u, path);
        if (r == -EEXIST)
                return log_unit_error_errno(u, r, "Control group %s exists already.", path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);

        return 0;
}
static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* Figure out our cgroup path */
        r = unit_pick_cgroup_path(u);
        if (r < 0)
                return r;

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;
        u->cgroup_enabled_mask = enable_mask;
        u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;

        if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}
static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
        char *pp;
        int r;

        assert(u);

        if (MANAGER_IS_SYSTEM(u->manager))
                return -EINVAL;

        if (!u->manager->system_bus)
                return -EIO;

        if (!u->cgroup_path)
                return -EINVAL;

        /* Determine this unit's cgroup path relative to our cgroup root */
        pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
        if (!pp)
                return -EINVAL;

        pp = strjoina("/", pp, suffix_path);
        path_kill_slashes(pp);

        r = sd_bus_call_method(u->manager->system_bus,
                               "org.freedesktop.systemd1",
                               "/org/freedesktop/systemd1",
                               "org.freedesktop.systemd1.Manager",
                               "AttachProcessesToUnit",
                               &error, NULL,
                               "ssau",
                               NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
        if (r < 0)
                return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));

        return 0;
}
int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
        CGroupMask delegated_mask;
        const char *p;
        Iterator i;
        void *pidp;
        int r, q;

        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        if (set_isempty(pids))
                return 0;

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        if (isempty(suffix_path))
                p = u->cgroup_path;
        else
                p = strjoina(u->cgroup_path, "/", suffix_path);

        delegated_mask = unit_get_delegate_mask(u);

        r = 0;
        SET_FOREACH(pidp, pids, i) {
                pid_t pid = PTR_TO_PID(pidp);
                CGroupController c;

                /* First, attach the PID to the main cgroup hierarchy */
                q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
                if (q < 0) {
                        log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p);

                        if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) {
                                int z;

                                /* If we are in a user instance, and we can't move the process ourselves due to
                                 * permission problems, let's ask the system instance about it instead. Since it's more
                                 * privileged it might be able to move the process across the leaves of a subtree whose
                                 * top node is not owned by us. */

                                z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
                                if (z < 0)
                                        log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p);
                                else
                                        continue; /* When the bus thing worked via the bus we are fully done for this PID. */
                        }

                        if (r >= 0)
                                r = q; /* Remember first error */

                        continue;
                }

                q = cg_all_unified();
                if (q < 0)
                        return q;
                if (q > 0)
                        continue;

                /* In the legacy hierarchy, attach the process to the request cgroup if possible, and if not to the
                 * innermost realized one */

                for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                        CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                        const char *realized;

                        if (!(u->manager->cgroup_supported & bit))
                                continue;

                        /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
                        if (delegated_mask & u->cgroup_realized_mask & bit) {
                                q = cg_attach(cgroup_controller_to_string(c), p, pid);
                                if (q >= 0)
                                        continue; /* Success! */

                                log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
                                                     pid, p, cgroup_controller_to_string(c));
                        }

                        /* So this controller is either not delegated or not realized, or something else weird happened. In
                         * that case let's attach the PID at least to the closest cgroup up the tree that is
                         * realized. */
                        realized = unit_get_realized_cgroup_path(u, bit);
                        if (!realized)
                                continue; /* Not even realized in the root slice? Then let's not bother */

                        q = cg_attach(cgroup_controller_to_string(c), realized, pid);
                        if (q < 0)
                                log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
                                                     pid, realized, cgroup_controller_to_string(c));
                }
        }

        return r;
}
static void cgroup_xattr_apply(Unit *u) {
        char ids[SD_ID128_STRING_MAX];
        int r;

        assert(u);

        if (!MANAGER_IS_SYSTEM(u->manager))
                return;

        if (sd_id128_is_null(u->invocation_id))
                return;

        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
                         "trusted.invocation_id",
                         sd_id128_to_string(u->invocation_id, ids), 32,
                         0);
        if (r < 0)
                log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
}
static bool unit_has_mask_realized(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        assert(u);

        return u->cgroup_realized &&
                u->cgroup_realized_mask == target_mask &&
                u->cgroup_enabled_mask == enable_mask &&
                ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
                 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
}
static void unit_add_to_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (u->in_cgroup_realize_queue)
                return;

        LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = true;
}

static void unit_remove_from_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (!u->in_cgroup_realize_queue)
                return;

        LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = false;
}
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        bool needs_bpf, apply_bpf;
        int r;

        assert(u);

        unit_remove_from_cgroup_realize_queue(u);

        target_mask = unit_get_target_mask(u);
        enable_mask = unit_get_enable_mask(u);
        needs_bpf = unit_get_needs_bpf(u);

        if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
                return 0;

        /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
         * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
         * this will trickle down properly to cgroupfs. */
        apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(u, target_mask, apply_bpf, state);
        cgroup_xattr_apply(u);

        return 0;
}
unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        assert(m);

        state = manager_state(m);

        while ((i = m->cgroup_realize_queue)) {
                assert(i->in_cgroup_realize_queue);

                if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
                        /* Maybe things changed, and the unit is not actually active anymore? */
                        unit_remove_from_cgroup_realize_queue(i);
                        continue;
                }

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}
static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;
                void *v;

                HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m,
                                                   unit_get_target_mask(m),
                                                   unit_get_enable_mask(m),
                                                   unit_get_needs_bpf(m)))
                                continue;

                        unit_add_to_cgroup_realize_queue(m);
                }

                u = slice;
        }
}
int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_add_siblings_to_cgroup_realize_queue(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}
void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
        u->cgroup_enabled_mask = 0;
}
int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid_cached();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}
static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}
*u
) {
1997 /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
1998 * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can
1999 * get as notification source as soon as we stopped having any useful PIDs to watch for. */
2001 if (!u
->cgroup_path
)
2004 r
= cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
);
2007 if (r
> 0) /* On unified we have reliable notifications, and don't need this */
2010 if (!set_isempty(u
->pids
))
2013 unit_add_to_cgroup_empty_queue(u
);
int unit_watch_all_pids(Unit *u) {
        int r;

        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}
static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
        Manager *m = userdata;
        Unit *u;
        int r;

        assert(s);
        assert(m);

        u = m->cgroup_empty_queue;
        if (!u)
                return 0;

        assert(u->in_cgroup_empty_queue);
        u->in_cgroup_empty_queue = false;
        LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);

        if (m->cgroup_empty_queue) {
                /* More stuff queued, let's make sure we remain enabled */
                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
                if (r < 0)
                        log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
        }

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}
void unit_add_to_cgroup_empty_queue(Unit *u) {
        int r;

        assert(u);

        /* Note that there are four different ways how cgroup empty events reach us:
         *
         * 1. On the unified hierarchy we get an inotify event on the cgroup
         *
         * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
         *
         * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
         *
         * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
         *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
         *
         * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
         * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
         * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
         * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
         * case for scope units). */

        if (u->in_cgroup_empty_queue)
                return;

        /* Let's verify that the cgroup is really empty */
        if (!u->cgroup_path)
                return;
        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
                return;
        }
        if (r == 0)
                return;

        LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
        u->in_cgroup_empty_queue = true;

        /* Trigger the defer event */
        r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
        if (r < 0)
                log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
}
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (IN_SET(errno, EINTR, EAGAIN))
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        unit_add_to_cgroup_empty_queue(u);
                }
        }
}
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        const char *scope_path;
        CGroupController c;
        char *e;
        int r, all_unified;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && MANAGER_IS_SYSTEM(m)) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;

        /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
         * easily prepend it everywhere. */
        delete_trailing_chars(m->cgroup_root, "/");

        /* 2. Locate the cgroup mount point and determine the hierarchy type */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        r = cg_unified_flush();
        if (r < 0)
                return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");

        all_unified = cg_all_unified();
        if (all_unified < 0)
                return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
        if (all_unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else {
                r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
                if (r < 0)
                        return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
                if (r > 0)
                        log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
                else
                        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
        }

        /* 3. Allocate cgroup empty defer event source */
        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
        r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
        if (r < 0)
                return log_error_errno(r, "Failed to create cgroup empty event source: %m");

        r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
        if (r < 0)
                return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");

        r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
        if (r < 0)
                return log_error_errno(r, "Failed to disable cgroup empty event source: %m");

        (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");

        /* 4. Install notifier inotify object, or agent */
        if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {

                /* In the unified hierarchy we can get cgroup empty notifications via inotify. */

                m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                safe_close(m->cgroup_inotify_fd);

                m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                if (m->cgroup_inotify_fd < 0)
                        return log_error_errno(errno, "Failed to create control group inotify object: %m");

                r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                if (r < 0)
                        return log_error_errno(r, "Failed to watch control group inotify object: %m");

                /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
                 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

        } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {

                /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
                 * since it does not generate events when control groups with children run empty.) */

                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                else if (r > 0)
                        log_debug("Installed release agent.");
                else
                        log_debug("Release agent already installed.");
        }

        /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
        scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
        if (r >= 0) {
                /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
                r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
                if (r < 0)
                        log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");

                /* 6. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);
                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

        } else if (r < 0 && !m->test_run_flags)
                return log_error_errno(r, "Failed to create %s control group: %m", scope_path);

        /* 7. Always enable hierarchical support if it exists... */
        if (!all_unified && m->test_run_flags == 0)
                (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        /* 8. Figure out which controllers are supported, and log about it */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");
        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the root cgroup, since we are in it. But let's trim it at least. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

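/* Maps a cgroup path to the innermost unit that owns it: first tries an exact
 * match in m->cgroup_unit, then walks up the path one component at a time. For
 * example (illustrative path), "/system.slice/foo.service/sub" that isn't
 * mapped directly would resolve via "/system.slice/foo.service"; if nothing
 * matches at all, the root slice (if any) is returned. */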
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

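/* Resolves a PID to a unit by looking up the PID's cgroup path and mapping it
 * back with manager_get_unit_by_cgroup(). Returns NULL if the PID is invalid
 * or its cgroup cannot be determined. */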
Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;

        assert(m);

        if (!pid_is_valid(pid))
                return NULL;

        if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u, **array;

        assert(m);

        /* Note that a process might be owned by multiple units; we return only one here, which is good enough for
         * most cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the
         * most relevant one, since children of the process will be assigned to that one, too, before all else. */

        if (!pid_is_valid(pid))
                return NULL;

        if (pid == getpid_cached())
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = manager_get_unit_by_pid_cgroup(m, pid);
        if (u)
                return u;

        u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
        if (u)
                return u;

        array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
        if (array)
                return array[0];

        return NULL;
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent
         * process or from the --system instance */

        log_debug("Got cgroup empty notification for: %s", cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        unit_add_to_cgroup_empty_queue(u);
        return 1;
}

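/* Reads the unit's current memory usage from the kernel: "memory.current" on
 * the unified hierarchy, "memory.usage_in_bytes" on the legacy one. Returns
 * -ENODATA if memory accounting is off or the memory controller isn't realized
 * for this unit. */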
int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, memory_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
        if (unit_has_root_cgroup(u))
                return procfs_memory_get_current(ret);

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
        if (unit_has_root_cgroup(u))
                return procfs_tasks_get_current(ret);

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

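/* Reads the raw, unadjusted CPU time consumed by the unit, in nanoseconds. On
 * the unified hierarchy this comes from the "usage_usec" field of "cpu.stat"
 * (converted from µs to ns); on the legacy hierarchy it comes from
 * "cpuacct.usage", which is in ns already. */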
static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
        if (unit_has_root_cgroup(u))
                return procfs_cpu_get_usage(ret);

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                _cleanup_free_ char *val = NULL;
                uint64_t us;

                if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
                        return -ENODATA;

                r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
                if (IN_SET(r, -ENOENT, -ENXIO))
                        return -ENODATA;
                if (r < 0)
                        return r;

                r = safe_atou64(val, &us);
                if (r < 0)
                        return r;

                ns = us * NSEC_PER_USEC;
        } else {
                if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                        return -ENODATA;

                r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
                if (r == -ENOENT)
                        return -ENODATA;
                if (r < 0)
                        return r;

                r = safe_atou64(v, &ns);
                if (r < 0)
                        return r;
        }

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        assert(u);

        /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
         * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
         * call this function with a NULL return parameter. */

        if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
                return -ENODATA;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
                /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use
                 * our cached value. */

                if (ret)
                        *ret = u->cpu_usage_last;
                return 0;
        }
        if (r < 0)
                return r;

        if (ns > u->cpu_usage_base)
                ns -= u->cpu_usage_base;
        else
                ns = 0;

        u->cpu_usage_last = ns;
        if (ret)
                *ret = ns;

        return 0;
}

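/* Typical usage of the above (illustrative sketch only; this caller is not
 * part of this file):
 *
 *         nsec_t ns;
 *         if (unit_get_cpu_usage(u, &ns) >= 0)
 *                 log_unit_debug(u, "CPU time consumed: %" PRIu64 " ns", (uint64_t) ns);
 *
 * The next function returns one of the four IP accounting counters
 * (ingress/egress × bytes/packets) maintained by the per-unit BPF firewall
 * programs: the metric selects which eBPF map fd to read (ingress vs. egress)
 * and whether the byte or the packet counter of that map is wanted. */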
int unit_get_ip_accounting(
                Unit *u,
                CGroupIPAccountingMetric metric,
                uint64_t *ret) {

        uint64_t value;
        int fd, r;

        assert(u);
        assert(metric >= 0);
        assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, ip_accounting))
                return -ENODATA;

        fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;
        if (fd < 0)
                return -ENODATA;

        if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
                r = bpf_firewall_read_accounting(fd, &value, NULL);
        else
                r = bpf_firewall_read_accounting(fd, NULL, &value);
        if (r < 0)
                return r;

        /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
         * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
         * ip_accounting_extra[] field, and add them in here transparently. */

        *ret = value + u->ip_accounting_extra[metric];

        return r;
}

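/* Resets CPU accounting for the unit: invalidates the cached usage value and
 * re-reads the raw counter, so that subsequent unit_get_cpu_usage() calls are
 * relative to this point in time. If the raw counter can't be read, the base
 * is reset to 0. */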
int unit_reset_cpu_accounting(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        u->cpu_usage_last = NSEC_INFINITY;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpu_usage_base = 0;
                return r;
        }

        u->cpu_usage_base = ns;
        return 0;
}

int unit_reset_ip_accounting(Unit *u) {
        int r = 0, q = 0;

        assert(u);

        if (u->ip_accounting_ingress_map_fd >= 0)
                r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);

        if (u->ip_accounting_egress_map_fd >= 0)
                q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);

        zero(u->ip_accounting_extra);

        return r < 0 ? r : q;
}

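/* Marks the given controllers as out-of-date for this unit, by dropping them
 * from cgroup_realized_mask and queueing the unit for re-realization. The
 * legacy compat pairs (io/blkio, cpu/cpuacct) are always invalidated together,
 * since settings of one are translated into the other. */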
void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        /* Always invalidate compat pairs together */
        if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
                m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
                m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;

        if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_realize_queue(u);
}

void unit_invalidate_cgroup_bpf(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
                return;

        u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
        unit_add_to_cgroup_realize_queue(u);

        /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
         * list of our children includes our own. */
        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;
                void *v;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        unit_invalidate_cgroup_bpf(member);
                }
        }
}

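/* Returns true if cgroup delegation is both requested in the unit's cgroup
 * context and supported by the unit type (see the can_delegate flag in the
 * unit vtable). */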
bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        assert(u);

        if (!UNIT_VTABLE(u)->can_delegate)
                return false;

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
}

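/* String ⇔ enum mapping for DevicePolicy=; the DEFINE_STRING_TABLE_LOOKUP
 * macro below generates cgroup_device_policy_to_string() and
 * cgroup_device_policy_from_string() from this table. */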
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);