/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include "alloc-util.h"
#include "blockdev-util.h"
#include "bpf-firewall.h"
#include "cgroup-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"
/* CFS period used for CPUQuota= translation: 100ms, in microseconds. */
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
/* Emit the legacy/unified compatibility-translation warning at most once per
 * manager lifetime; subsequent calls are no-ops. */
static void cgroup_compat_warn(void) {
        static bool cgroup_compat_warned = false;

        if (cgroup_compat_warned)
                return;

        log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
        cgroup_compat_warned = true;
}
/* Log a cgroup-compat debug message for a unit, after making sure the one-time
 * compatibility warning above has been emitted. */
#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)
57 void cgroup_context_init(CGroupContext
*c
) {
60 /* Initialize everything to the kernel defaults, assuming the
61 * structure is preinitialized to 0 */
63 c
->cpu_weight
= CGROUP_WEIGHT_INVALID
;
64 c
->startup_cpu_weight
= CGROUP_WEIGHT_INVALID
;
65 c
->cpu_quota_per_sec_usec
= USEC_INFINITY
;
67 c
->cpu_shares
= CGROUP_CPU_SHARES_INVALID
;
68 c
->startup_cpu_shares
= CGROUP_CPU_SHARES_INVALID
;
70 c
->memory_high
= CGROUP_LIMIT_MAX
;
71 c
->memory_max
= CGROUP_LIMIT_MAX
;
72 c
->memory_swap_max
= CGROUP_LIMIT_MAX
;
74 c
->memory_limit
= CGROUP_LIMIT_MAX
;
76 c
->io_weight
= CGROUP_WEIGHT_INVALID
;
77 c
->startup_io_weight
= CGROUP_WEIGHT_INVALID
;
79 c
->blockio_weight
= CGROUP_BLKIO_WEIGHT_INVALID
;
80 c
->startup_blockio_weight
= CGROUP_BLKIO_WEIGHT_INVALID
;
82 c
->tasks_max
= (uint64_t) -1;
85 void cgroup_context_free_device_allow(CGroupContext
*c
, CGroupDeviceAllow
*a
) {
89 LIST_REMOVE(device_allow
, c
->device_allow
, a
);
94 void cgroup_context_free_io_device_weight(CGroupContext
*c
, CGroupIODeviceWeight
*w
) {
98 LIST_REMOVE(device_weights
, c
->io_device_weights
, w
);
103 void cgroup_context_free_io_device_limit(CGroupContext
*c
, CGroupIODeviceLimit
*l
) {
107 LIST_REMOVE(device_limits
, c
->io_device_limits
, l
);
112 void cgroup_context_free_blockio_device_weight(CGroupContext
*c
, CGroupBlockIODeviceWeight
*w
) {
116 LIST_REMOVE(device_weights
, c
->blockio_device_weights
, w
);
121 void cgroup_context_free_blockio_device_bandwidth(CGroupContext
*c
, CGroupBlockIODeviceBandwidth
*b
) {
125 LIST_REMOVE(device_bandwidths
, c
->blockio_device_bandwidths
, b
);
130 void cgroup_context_done(CGroupContext
*c
) {
133 while (c
->io_device_weights
)
134 cgroup_context_free_io_device_weight(c
, c
->io_device_weights
);
136 while (c
->io_device_limits
)
137 cgroup_context_free_io_device_limit(c
, c
->io_device_limits
);
139 while (c
->blockio_device_weights
)
140 cgroup_context_free_blockio_device_weight(c
, c
->blockio_device_weights
);
142 while (c
->blockio_device_bandwidths
)
143 cgroup_context_free_blockio_device_bandwidth(c
, c
->blockio_device_bandwidths
);
145 while (c
->device_allow
)
146 cgroup_context_free_device_allow(c
, c
->device_allow
);
148 c
->ip_address_allow
= ip_address_access_free_all(c
->ip_address_allow
);
149 c
->ip_address_deny
= ip_address_access_free_all(c
->ip_address_deny
);
152 void cgroup_context_dump(CGroupContext
*c
, FILE* f
, const char *prefix
) {
153 CGroupIODeviceLimit
*il
;
154 CGroupIODeviceWeight
*iw
;
155 CGroupBlockIODeviceBandwidth
*b
;
156 CGroupBlockIODeviceWeight
*w
;
157 CGroupDeviceAllow
*a
;
158 IPAddressAccessItem
*iaai
;
159 char u
[FORMAT_TIMESPAN_MAX
];
164 prefix
= strempty(prefix
);
167 "%sCPUAccounting=%s\n"
168 "%sIOAccounting=%s\n"
169 "%sBlockIOAccounting=%s\n"
170 "%sMemoryAccounting=%s\n"
171 "%sTasksAccounting=%s\n"
172 "%sIPAccounting=%s\n"
173 "%sCPUWeight=%" PRIu64
"\n"
174 "%sStartupCPUWeight=%" PRIu64
"\n"
175 "%sCPUShares=%" PRIu64
"\n"
176 "%sStartupCPUShares=%" PRIu64
"\n"
177 "%sCPUQuotaPerSecSec=%s\n"
178 "%sIOWeight=%" PRIu64
"\n"
179 "%sStartupIOWeight=%" PRIu64
"\n"
180 "%sBlockIOWeight=%" PRIu64
"\n"
181 "%sStartupBlockIOWeight=%" PRIu64
"\n"
182 "%sMemoryLow=%" PRIu64
"\n"
183 "%sMemoryHigh=%" PRIu64
"\n"
184 "%sMemoryMax=%" PRIu64
"\n"
185 "%sMemorySwapMax=%" PRIu64
"\n"
186 "%sMemoryLimit=%" PRIu64
"\n"
187 "%sTasksMax=%" PRIu64
"\n"
188 "%sDevicePolicy=%s\n"
190 prefix
, yes_no(c
->cpu_accounting
),
191 prefix
, yes_no(c
->io_accounting
),
192 prefix
, yes_no(c
->blockio_accounting
),
193 prefix
, yes_no(c
->memory_accounting
),
194 prefix
, yes_no(c
->tasks_accounting
),
195 prefix
, yes_no(c
->ip_accounting
),
196 prefix
, c
->cpu_weight
,
197 prefix
, c
->startup_cpu_weight
,
198 prefix
, c
->cpu_shares
,
199 prefix
, c
->startup_cpu_shares
,
200 prefix
, format_timespan(u
, sizeof(u
), c
->cpu_quota_per_sec_usec
, 1),
201 prefix
, c
->io_weight
,
202 prefix
, c
->startup_io_weight
,
203 prefix
, c
->blockio_weight
,
204 prefix
, c
->startup_blockio_weight
,
205 prefix
, c
->memory_low
,
206 prefix
, c
->memory_high
,
207 prefix
, c
->memory_max
,
208 prefix
, c
->memory_swap_max
,
209 prefix
, c
->memory_limit
,
210 prefix
, c
->tasks_max
,
211 prefix
, cgroup_device_policy_to_string(c
->device_policy
),
212 prefix
, yes_no(c
->delegate
));
215 _cleanup_free_
char *t
= NULL
;
217 (void) cg_mask_to_string(c
->delegate_controllers
, &t
);
219 fprintf(f
, "%sDelegateControllers=%s\n",
224 LIST_FOREACH(device_allow
, a
, c
->device_allow
)
226 "%sDeviceAllow=%s %s%s%s\n",
229 a
->r
? "r" : "", a
->w
? "w" : "", a
->m
? "m" : "");
231 LIST_FOREACH(device_weights
, iw
, c
->io_device_weights
)
233 "%sIODeviceWeight=%s %" PRIu64
,
238 LIST_FOREACH(device_limits
, il
, c
->io_device_limits
) {
239 char buf
[FORMAT_BYTES_MAX
];
240 CGroupIOLimitType type
;
242 for (type
= 0; type
< _CGROUP_IO_LIMIT_TYPE_MAX
; type
++)
243 if (il
->limits
[type
] != cgroup_io_limit_defaults
[type
])
247 cgroup_io_limit_type_to_string(type
),
249 format_bytes(buf
, sizeof(buf
), il
->limits
[type
]));
252 LIST_FOREACH(device_weights
, w
, c
->blockio_device_weights
)
254 "%sBlockIODeviceWeight=%s %" PRIu64
,
259 LIST_FOREACH(device_bandwidths
, b
, c
->blockio_device_bandwidths
) {
260 char buf
[FORMAT_BYTES_MAX
];
262 if (b
->rbps
!= CGROUP_LIMIT_MAX
)
264 "%sBlockIOReadBandwidth=%s %s\n",
267 format_bytes(buf
, sizeof(buf
), b
->rbps
));
268 if (b
->wbps
!= CGROUP_LIMIT_MAX
)
270 "%sBlockIOWriteBandwidth=%s %s\n",
273 format_bytes(buf
, sizeof(buf
), b
->wbps
));
276 LIST_FOREACH(items
, iaai
, c
->ip_address_allow
) {
277 _cleanup_free_
char *k
= NULL
;
279 (void) in_addr_to_string(iaai
->family
, &iaai
->address
, &k
);
280 fprintf(f
, "%sIPAddressAllow=%s/%u\n", prefix
, strnull(k
), iaai
->prefixlen
);
283 LIST_FOREACH(items
, iaai
, c
->ip_address_deny
) {
284 _cleanup_free_
char *k
= NULL
;
286 (void) in_addr_to_string(iaai
->family
, &iaai
->address
, &k
);
287 fprintf(f
, "%sIPAddressDeny=%s/%u\n", prefix
, strnull(k
), iaai
->prefixlen
);
/* Resolve a path to the dev_t of the block device backing it: either the
 * device node itself, or the (whole-disk) device of the file system it lives
 * on. Returns 0 on success, negative errno-style error otherwise. */
static int lookup_block_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                (void) block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
320 static int whitelist_device(const char *path
, const char *node
, const char *acc
) {
321 char buf
[2+DECIMAL_STR_MAX(dev_t
)*2+2+4];
323 bool ignore_notfound
;
329 if (node
[0] == '-') {
330 /* Non-existent paths starting with "-" must be silently ignored */
332 ignore_notfound
= true;
334 ignore_notfound
= false;
336 if (stat(node
, &st
) < 0) {
337 if (errno
== ENOENT
&& ignore_notfound
)
340 return log_warning_errno(errno
, "Couldn't stat device %s: %m", node
);
343 if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
344 log_warning("%s is not a device.", node
);
350 S_ISCHR(st
.st_mode
) ? 'c' : 'b',
351 major(st
.st_rdev
), minor(st
.st_rdev
),
354 r
= cg_set_attribute("devices", path
, "devices.allow", buf
);
356 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
, -EINVAL
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
357 "Failed to set devices.allow on %s: %m", path
);
362 static int whitelist_major(const char *path
, const char *name
, char type
, const char *acc
) {
363 _cleanup_fclose_
FILE *f
= NULL
;
370 assert(IN_SET(type
, 'b', 'c'));
372 f
= fopen("/proc/devices", "re");
374 return log_warning_errno(errno
, "Cannot open /proc/devices to resolve %s (%c): %m", name
, type
);
376 FOREACH_LINE(line
, f
, goto fail
) {
377 char buf
[2+DECIMAL_STR_MAX(unsigned)+3+4], *p
, *w
;
382 if (type
== 'c' && streq(line
, "Character devices:")) {
387 if (type
== 'b' && streq(line
, "Block devices:")) {
402 w
= strpbrk(p
, WHITESPACE
);
407 r
= safe_atou(p
, &maj
);
414 w
+= strspn(w
, WHITESPACE
);
416 if (fnmatch(name
, w
, 0) != 0)
425 r
= cg_set_attribute("devices", path
, "devices.allow", buf
);
427 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
, -EINVAL
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
428 "Failed to set devices.allow on %s: %m", path
);
434 return log_warning_errno(errno
, "Failed to read /proc/devices: %m");
437 static bool cgroup_context_has_cpu_weight(CGroupContext
*c
) {
438 return c
->cpu_weight
!= CGROUP_WEIGHT_INVALID
||
439 c
->startup_cpu_weight
!= CGROUP_WEIGHT_INVALID
;
442 static bool cgroup_context_has_cpu_shares(CGroupContext
*c
) {
443 return c
->cpu_shares
!= CGROUP_CPU_SHARES_INVALID
||
444 c
->startup_cpu_shares
!= CGROUP_CPU_SHARES_INVALID
;
447 static uint64_t cgroup_context_cpu_weight(CGroupContext
*c
, ManagerState state
) {
448 if (IN_SET(state
, MANAGER_STARTING
, MANAGER_INITIALIZING
) &&
449 c
->startup_cpu_weight
!= CGROUP_WEIGHT_INVALID
)
450 return c
->startup_cpu_weight
;
451 else if (c
->cpu_weight
!= CGROUP_WEIGHT_INVALID
)
452 return c
->cpu_weight
;
454 return CGROUP_WEIGHT_DEFAULT
;
457 static uint64_t cgroup_context_cpu_shares(CGroupContext
*c
, ManagerState state
) {
458 if (IN_SET(state
, MANAGER_STARTING
, MANAGER_INITIALIZING
) &&
459 c
->startup_cpu_shares
!= CGROUP_CPU_SHARES_INVALID
)
460 return c
->startup_cpu_shares
;
461 else if (c
->cpu_shares
!= CGROUP_CPU_SHARES_INVALID
)
462 return c
->cpu_shares
;
464 return CGROUP_CPU_SHARES_DEFAULT
;
467 static void cgroup_apply_unified_cpu_config(Unit
*u
, uint64_t weight
, uint64_t quota
) {
468 char buf
[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t
) + 1) * 2)];
471 xsprintf(buf
, "%" PRIu64
"\n", weight
);
472 r
= cg_set_attribute("cpu", u
->cgroup_path
, "cpu.weight", buf
);
474 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
475 "Failed to set cpu.weight: %m");
477 if (quota
!= USEC_INFINITY
)
478 xsprintf(buf
, USEC_FMT
" " USEC_FMT
"\n",
479 quota
* CGROUP_CPU_QUOTA_PERIOD_USEC
/ USEC_PER_SEC
, CGROUP_CPU_QUOTA_PERIOD_USEC
);
481 xsprintf(buf
, "max " USEC_FMT
"\n", CGROUP_CPU_QUOTA_PERIOD_USEC
);
483 r
= cg_set_attribute("cpu", u
->cgroup_path
, "cpu.max", buf
);
486 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
487 "Failed to set cpu.max: %m");
490 static void cgroup_apply_legacy_cpu_config(Unit
*u
, uint64_t shares
, uint64_t quota
) {
491 char buf
[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t
)) + 1];
494 xsprintf(buf
, "%" PRIu64
"\n", shares
);
495 r
= cg_set_attribute("cpu", u
->cgroup_path
, "cpu.shares", buf
);
497 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
498 "Failed to set cpu.shares: %m");
500 xsprintf(buf
, USEC_FMT
"\n", CGROUP_CPU_QUOTA_PERIOD_USEC
);
501 r
= cg_set_attribute("cpu", u
->cgroup_path
, "cpu.cfs_period_us", buf
);
503 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
504 "Failed to set cpu.cfs_period_us: %m");
506 if (quota
!= USEC_INFINITY
) {
507 xsprintf(buf
, USEC_FMT
"\n", quota
* CGROUP_CPU_QUOTA_PERIOD_USEC
/ USEC_PER_SEC
);
508 r
= cg_set_attribute("cpu", u
->cgroup_path
, "cpu.cfs_quota_us", buf
);
510 r
= cg_set_attribute("cpu", u
->cgroup_path
, "cpu.cfs_quota_us", "-1");
512 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
513 "Failed to set cpu.cfs_quota_us: %m");
516 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares
) {
517 return CLAMP(shares
* CGROUP_WEIGHT_DEFAULT
/ CGROUP_CPU_SHARES_DEFAULT
,
518 CGROUP_WEIGHT_MIN
, CGROUP_WEIGHT_MAX
);
521 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight
) {
522 return CLAMP(weight
* CGROUP_CPU_SHARES_DEFAULT
/ CGROUP_WEIGHT_DEFAULT
,
523 CGROUP_CPU_SHARES_MIN
, CGROUP_CPU_SHARES_MAX
);
526 static bool cgroup_context_has_io_config(CGroupContext
*c
) {
527 return c
->io_accounting
||
528 c
->io_weight
!= CGROUP_WEIGHT_INVALID
||
529 c
->startup_io_weight
!= CGROUP_WEIGHT_INVALID
||
530 c
->io_device_weights
||
534 static bool cgroup_context_has_blockio_config(CGroupContext
*c
) {
535 return c
->blockio_accounting
||
536 c
->blockio_weight
!= CGROUP_BLKIO_WEIGHT_INVALID
||
537 c
->startup_blockio_weight
!= CGROUP_BLKIO_WEIGHT_INVALID
||
538 c
->blockio_device_weights
||
539 c
->blockio_device_bandwidths
;
542 static uint64_t cgroup_context_io_weight(CGroupContext
*c
, ManagerState state
) {
543 if (IN_SET(state
, MANAGER_STARTING
, MANAGER_INITIALIZING
) &&
544 c
->startup_io_weight
!= CGROUP_WEIGHT_INVALID
)
545 return c
->startup_io_weight
;
546 else if (c
->io_weight
!= CGROUP_WEIGHT_INVALID
)
549 return CGROUP_WEIGHT_DEFAULT
;
552 static uint64_t cgroup_context_blkio_weight(CGroupContext
*c
, ManagerState state
) {
553 if (IN_SET(state
, MANAGER_STARTING
, MANAGER_INITIALIZING
) &&
554 c
->startup_blockio_weight
!= CGROUP_BLKIO_WEIGHT_INVALID
)
555 return c
->startup_blockio_weight
;
556 else if (c
->blockio_weight
!= CGROUP_BLKIO_WEIGHT_INVALID
)
557 return c
->blockio_weight
;
559 return CGROUP_BLKIO_WEIGHT_DEFAULT
;
562 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight
) {
563 return CLAMP(blkio_weight
* CGROUP_WEIGHT_DEFAULT
/ CGROUP_BLKIO_WEIGHT_DEFAULT
,
564 CGROUP_WEIGHT_MIN
, CGROUP_WEIGHT_MAX
);
567 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight
) {
568 return CLAMP(io_weight
* CGROUP_BLKIO_WEIGHT_DEFAULT
/ CGROUP_WEIGHT_DEFAULT
,
569 CGROUP_BLKIO_WEIGHT_MIN
, CGROUP_BLKIO_WEIGHT_MAX
);
572 static void cgroup_apply_io_device_weight(Unit
*u
, const char *dev_path
, uint64_t io_weight
) {
573 char buf
[DECIMAL_STR_MAX(dev_t
)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
577 r
= lookup_block_device(dev_path
, &dev
);
581 xsprintf(buf
, "%u:%u %" PRIu64
"\n", major(dev
), minor(dev
), io_weight
);
582 r
= cg_set_attribute("io", u
->cgroup_path
, "io.weight", buf
);
584 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
585 "Failed to set io.weight: %m");
588 static void cgroup_apply_blkio_device_weight(Unit
*u
, const char *dev_path
, uint64_t blkio_weight
) {
589 char buf
[DECIMAL_STR_MAX(dev_t
)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
593 r
= lookup_block_device(dev_path
, &dev
);
597 xsprintf(buf
, "%u:%u %" PRIu64
"\n", major(dev
), minor(dev
), blkio_weight
);
598 r
= cg_set_attribute("blkio", u
->cgroup_path
, "blkio.weight_device", buf
);
600 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
601 "Failed to set blkio.weight_device: %m");
604 static unsigned cgroup_apply_io_device_limit(Unit
*u
, const char *dev_path
, uint64_t *limits
) {
605 char limit_bufs
[_CGROUP_IO_LIMIT_TYPE_MAX
][DECIMAL_STR_MAX(uint64_t)];
606 char buf
[DECIMAL_STR_MAX(dev_t
)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
607 CGroupIOLimitType type
;
612 r
= lookup_block_device(dev_path
, &dev
);
616 for (type
= 0; type
< _CGROUP_IO_LIMIT_TYPE_MAX
; type
++) {
617 if (limits
[type
] != cgroup_io_limit_defaults
[type
]) {
618 xsprintf(limit_bufs
[type
], "%" PRIu64
, limits
[type
]);
621 xsprintf(limit_bufs
[type
], "%s", limits
[type
] == CGROUP_LIMIT_MAX
? "max" : "0");
625 xsprintf(buf
, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev
), minor(dev
),
626 limit_bufs
[CGROUP_IO_RBPS_MAX
], limit_bufs
[CGROUP_IO_WBPS_MAX
],
627 limit_bufs
[CGROUP_IO_RIOPS_MAX
], limit_bufs
[CGROUP_IO_WIOPS_MAX
]);
628 r
= cg_set_attribute("io", u
->cgroup_path
, "io.max", buf
);
630 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
631 "Failed to set io.max: %m");
635 static unsigned cgroup_apply_blkio_device_limit(Unit
*u
, const char *dev_path
, uint64_t rbps
, uint64_t wbps
) {
636 char buf
[DECIMAL_STR_MAX(dev_t
)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
641 r
= lookup_block_device(dev_path
, &dev
);
645 if (rbps
!= CGROUP_LIMIT_MAX
)
647 sprintf(buf
, "%u:%u %" PRIu64
"\n", major(dev
), minor(dev
), rbps
);
648 r
= cg_set_attribute("blkio", u
->cgroup_path
, "blkio.throttle.read_bps_device", buf
);
650 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
651 "Failed to set blkio.throttle.read_bps_device: %m");
653 if (wbps
!= CGROUP_LIMIT_MAX
)
655 sprintf(buf
, "%u:%u %" PRIu64
"\n", major(dev
), minor(dev
), wbps
);
656 r
= cg_set_attribute("blkio", u
->cgroup_path
, "blkio.throttle.write_bps_device", buf
);
658 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
659 "Failed to set blkio.throttle.write_bps_device: %m");
664 static bool cgroup_context_has_unified_memory_config(CGroupContext
*c
) {
665 return c
->memory_low
> 0 || c
->memory_high
!= CGROUP_LIMIT_MAX
|| c
->memory_max
!= CGROUP_LIMIT_MAX
|| c
->memory_swap_max
!= CGROUP_LIMIT_MAX
;
668 static void cgroup_apply_unified_memory_limit(Unit
*u
, const char *file
, uint64_t v
) {
669 char buf
[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
672 if (v
!= CGROUP_LIMIT_MAX
)
673 xsprintf(buf
, "%" PRIu64
"\n", v
);
675 r
= cg_set_attribute("memory", u
->cgroup_path
, file
, buf
);
677 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
678 "Failed to set %s: %m", file
);
681 static void cgroup_apply_firewall(Unit
*u
) {
686 if (u
->type
== UNIT_SLICE
) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
687 * not recursive we don't ever touch the bpf on them */
690 r
= bpf_firewall_compile(u
);
694 (void) bpf_firewall_install(u
);
698 static void cgroup_context_apply(
700 CGroupMask apply_mask
,
702 ManagerState state
) {
711 c
= unit_get_cgroup_context(u
);
712 path
= u
->cgroup_path
;
717 /* Nothing to do? Exit early! */
718 if (apply_mask
== 0 && !apply_bpf
)
721 /* Some cgroup attributes are not supported on the root cgroup,
722 * hence silently ignore */
723 is_root
= isempty(path
) || path_equal(path
, "/");
725 /* Make sure we don't try to display messages with an empty path. */
728 /* We generally ignore errors caused by read-only mounted
729 * cgroup trees (assuming we are running in a container then),
730 * and missing cgroups, i.e. EROFS and ENOENT. */
732 if ((apply_mask
& CGROUP_MASK_CPU
) && !is_root
) {
733 bool has_weight
, has_shares
;
735 has_weight
= cgroup_context_has_cpu_weight(c
);
736 has_shares
= cgroup_context_has_cpu_shares(c
);
738 if (cg_all_unified() > 0) {
742 weight
= cgroup_context_cpu_weight(c
, state
);
743 else if (has_shares
) {
744 uint64_t shares
= cgroup_context_cpu_shares(c
, state
);
746 weight
= cgroup_cpu_shares_to_weight(shares
);
748 log_cgroup_compat(u
, "Applying [Startup]CpuShares %" PRIu64
" as [Startup]CpuWeight %" PRIu64
" on %s",
749 shares
, weight
, path
);
751 weight
= CGROUP_WEIGHT_DEFAULT
;
753 cgroup_apply_unified_cpu_config(u
, weight
, c
->cpu_quota_per_sec_usec
);
758 uint64_t weight
= cgroup_context_cpu_weight(c
, state
);
760 shares
= cgroup_cpu_weight_to_shares(weight
);
762 log_cgroup_compat(u
, "Applying [Startup]CpuWeight %" PRIu64
" as [Startup]CpuShares %" PRIu64
" on %s",
763 weight
, shares
, path
);
764 } else if (has_shares
)
765 shares
= cgroup_context_cpu_shares(c
, state
);
767 shares
= CGROUP_CPU_SHARES_DEFAULT
;
769 cgroup_apply_legacy_cpu_config(u
, shares
, c
->cpu_quota_per_sec_usec
);
773 if (apply_mask
& CGROUP_MASK_IO
) {
774 bool has_io
= cgroup_context_has_io_config(c
);
775 bool has_blockio
= cgroup_context_has_blockio_config(c
);
778 char buf
[8+DECIMAL_STR_MAX(uint64_t)+1];
782 weight
= cgroup_context_io_weight(c
, state
);
783 else if (has_blockio
) {
784 uint64_t blkio_weight
= cgroup_context_blkio_weight(c
, state
);
786 weight
= cgroup_weight_blkio_to_io(blkio_weight
);
788 log_cgroup_compat(u
, "Applying [Startup]BlockIOWeight %" PRIu64
" as [Startup]IOWeight %" PRIu64
,
789 blkio_weight
, weight
);
791 weight
= CGROUP_WEIGHT_DEFAULT
;
793 xsprintf(buf
, "default %" PRIu64
"\n", weight
);
794 r
= cg_set_attribute("io", path
, "io.weight", buf
);
796 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
797 "Failed to set io.weight: %m");
800 CGroupIODeviceWeight
*w
;
802 /* FIXME: no way to reset this list */
803 LIST_FOREACH(device_weights
, w
, c
->io_device_weights
)
804 cgroup_apply_io_device_weight(u
, w
->path
, w
->weight
);
805 } else if (has_blockio
) {
806 CGroupBlockIODeviceWeight
*w
;
808 /* FIXME: no way to reset this list */
809 LIST_FOREACH(device_weights
, w
, c
->blockio_device_weights
) {
810 weight
= cgroup_weight_blkio_to_io(w
->weight
);
812 log_cgroup_compat(u
, "Applying BlockIODeviceWeight %" PRIu64
" as IODeviceWeight %" PRIu64
" for %s",
813 w
->weight
, weight
, w
->path
);
815 cgroup_apply_io_device_weight(u
, w
->path
, weight
);
820 /* Apply limits and free ones without config. */
822 CGroupIODeviceLimit
*l
, *next
;
824 LIST_FOREACH_SAFE(device_limits
, l
, next
, c
->io_device_limits
) {
825 if (!cgroup_apply_io_device_limit(u
, l
->path
, l
->limits
))
826 cgroup_context_free_io_device_limit(c
, l
);
828 } else if (has_blockio
) {
829 CGroupBlockIODeviceBandwidth
*b
, *next
;
831 LIST_FOREACH_SAFE(device_bandwidths
, b
, next
, c
->blockio_device_bandwidths
) {
832 uint64_t limits
[_CGROUP_IO_LIMIT_TYPE_MAX
];
833 CGroupIOLimitType type
;
835 for (type
= 0; type
< _CGROUP_IO_LIMIT_TYPE_MAX
; type
++)
836 limits
[type
] = cgroup_io_limit_defaults
[type
];
838 limits
[CGROUP_IO_RBPS_MAX
] = b
->rbps
;
839 limits
[CGROUP_IO_WBPS_MAX
] = b
->wbps
;
841 log_cgroup_compat(u
, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64
" %" PRIu64
" as IO{Read|Write}BandwidthMax for %s",
842 b
->rbps
, b
->wbps
, b
->path
);
844 if (!cgroup_apply_io_device_limit(u
, b
->path
, limits
))
845 cgroup_context_free_blockio_device_bandwidth(c
, b
);
850 if (apply_mask
& CGROUP_MASK_BLKIO
) {
851 bool has_io
= cgroup_context_has_io_config(c
);
852 bool has_blockio
= cgroup_context_has_blockio_config(c
);
855 char buf
[DECIMAL_STR_MAX(uint64_t)+1];
859 uint64_t io_weight
= cgroup_context_io_weight(c
, state
);
861 weight
= cgroup_weight_io_to_blkio(cgroup_context_io_weight(c
, state
));
863 log_cgroup_compat(u
, "Applying [Startup]IOWeight %" PRIu64
" as [Startup]BlockIOWeight %" PRIu64
,
865 } else if (has_blockio
)
866 weight
= cgroup_context_blkio_weight(c
, state
);
868 weight
= CGROUP_BLKIO_WEIGHT_DEFAULT
;
870 xsprintf(buf
, "%" PRIu64
"\n", weight
);
871 r
= cg_set_attribute("blkio", path
, "blkio.weight", buf
);
873 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
874 "Failed to set blkio.weight: %m");
877 CGroupIODeviceWeight
*w
;
879 /* FIXME: no way to reset this list */
880 LIST_FOREACH(device_weights
, w
, c
->io_device_weights
) {
881 weight
= cgroup_weight_io_to_blkio(w
->weight
);
883 log_cgroup_compat(u
, "Applying IODeviceWeight %" PRIu64
" as BlockIODeviceWeight %" PRIu64
" for %s",
884 w
->weight
, weight
, w
->path
);
886 cgroup_apply_blkio_device_weight(u
, w
->path
, weight
);
888 } else if (has_blockio
) {
889 CGroupBlockIODeviceWeight
*w
;
891 /* FIXME: no way to reset this list */
892 LIST_FOREACH(device_weights
, w
, c
->blockio_device_weights
)
893 cgroup_apply_blkio_device_weight(u
, w
->path
, w
->weight
);
897 /* Apply limits and free ones without config. */
899 CGroupIODeviceLimit
*l
, *next
;
901 LIST_FOREACH_SAFE(device_limits
, l
, next
, c
->io_device_limits
) {
902 log_cgroup_compat(u
, "Applying IO{Read|Write}Bandwidth %" PRIu64
" %" PRIu64
" as BlockIO{Read|Write}BandwidthMax for %s",
903 l
->limits
[CGROUP_IO_RBPS_MAX
], l
->limits
[CGROUP_IO_WBPS_MAX
], l
->path
);
905 if (!cgroup_apply_blkio_device_limit(u
, l
->path
, l
->limits
[CGROUP_IO_RBPS_MAX
], l
->limits
[CGROUP_IO_WBPS_MAX
]))
906 cgroup_context_free_io_device_limit(c
, l
);
908 } else if (has_blockio
) {
909 CGroupBlockIODeviceBandwidth
*b
, *next
;
911 LIST_FOREACH_SAFE(device_bandwidths
, b
, next
, c
->blockio_device_bandwidths
)
912 if (!cgroup_apply_blkio_device_limit(u
, b
->path
, b
->rbps
, b
->wbps
))
913 cgroup_context_free_blockio_device_bandwidth(c
, b
);
917 if ((apply_mask
& CGROUP_MASK_MEMORY
) && !is_root
) {
918 if (cg_all_unified() > 0) {
919 uint64_t max
, swap_max
= CGROUP_LIMIT_MAX
;
921 if (cgroup_context_has_unified_memory_config(c
)) {
923 swap_max
= c
->memory_swap_max
;
925 max
= c
->memory_limit
;
927 if (max
!= CGROUP_LIMIT_MAX
)
928 log_cgroup_compat(u
, "Applying MemoryLimit %" PRIu64
" as MemoryMax", max
);
931 cgroup_apply_unified_memory_limit(u
, "memory.low", c
->memory_low
);
932 cgroup_apply_unified_memory_limit(u
, "memory.high", c
->memory_high
);
933 cgroup_apply_unified_memory_limit(u
, "memory.max", max
);
934 cgroup_apply_unified_memory_limit(u
, "memory.swap.max", swap_max
);
936 char buf
[DECIMAL_STR_MAX(uint64_t) + 1];
939 if (cgroup_context_has_unified_memory_config(c
)) {
941 log_cgroup_compat(u
, "Applying MemoryMax %" PRIi64
" as MemoryLimit", val
);
943 val
= c
->memory_limit
;
945 if (val
== CGROUP_LIMIT_MAX
)
946 strncpy(buf
, "-1\n", sizeof(buf
));
948 xsprintf(buf
, "%" PRIu64
"\n", val
);
950 r
= cg_set_attribute("memory", path
, "memory.limit_in_bytes", buf
);
952 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
953 "Failed to set memory.limit_in_bytes: %m");
957 if ((apply_mask
& CGROUP_MASK_DEVICES
) && !is_root
) {
958 CGroupDeviceAllow
*a
;
960 /* Changing the devices list of a populated cgroup
961 * might result in EINVAL, hence ignore EINVAL
964 if (c
->device_allow
|| c
->device_policy
!= CGROUP_AUTO
)
965 r
= cg_set_attribute("devices", path
, "devices.deny", "a");
967 r
= cg_set_attribute("devices", path
, "devices.allow", "a");
969 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EINVAL
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
970 "Failed to reset devices.list: %m");
972 if (c
->device_policy
== CGROUP_CLOSED
||
973 (c
->device_policy
== CGROUP_AUTO
&& c
->device_allow
)) {
974 static const char auto_devices
[] =
975 "/dev/null\0" "rwm\0"
976 "/dev/zero\0" "rwm\0"
977 "/dev/full\0" "rwm\0"
978 "/dev/random\0" "rwm\0"
979 "/dev/urandom\0" "rwm\0"
981 "/dev/ptmx\0" "rwm\0"
982 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
983 "-/run/systemd/inaccessible/chr\0" "rwm\0"
984 "-/run/systemd/inaccessible/blk\0" "rwm\0";
988 NULSTR_FOREACH_PAIR(x
, y
, auto_devices
)
989 whitelist_device(path
, x
, y
);
991 /* PTS (/dev/pts) devices may not be duplicated, but accessed */
992 whitelist_major(path
, "pts", 'c', "rw");
995 LIST_FOREACH(device_allow
, a
, c
->device_allow
) {
1011 if (path_startswith(a
->path
, "/dev/"))
1012 whitelist_device(path
, a
->path
, acc
);
1013 else if ((val
= startswith(a
->path
, "block-")))
1014 whitelist_major(path
, val
, 'b', acc
);
1015 else if ((val
= startswith(a
->path
, "char-")))
1016 whitelist_major(path
, val
, 'c', acc
);
1018 log_unit_debug(u
, "Ignoring device %s while writing cgroup attribute.", a
->path
);
1022 if ((apply_mask
& CGROUP_MASK_PIDS
) && !is_root
) {
1024 if (c
->tasks_max
!= CGROUP_LIMIT_MAX
) {
1025 char buf
[DECIMAL_STR_MAX(uint64_t) + 2];
1027 sprintf(buf
, "%" PRIu64
"\n", c
->tasks_max
);
1028 r
= cg_set_attribute("pids", path
, "pids.max", buf
);
1030 r
= cg_set_attribute("pids", path
, "pids.max", "max");
1033 log_unit_full(u
, IN_SET(r
, -ENOENT
, -EROFS
, -EACCES
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1034 "Failed to set pids.max: %m");
1038 cgroup_apply_firewall(u
);
1041 CGroupMask
cgroup_context_get_mask(CGroupContext
*c
) {
1042 CGroupMask mask
= 0;
1044 /* Figure out which controllers we need */
1046 if (c
->cpu_accounting
||
1047 cgroup_context_has_cpu_weight(c
) ||
1048 cgroup_context_has_cpu_shares(c
) ||
1049 c
->cpu_quota_per_sec_usec
!= USEC_INFINITY
)
1050 mask
|= CGROUP_MASK_CPUACCT
| CGROUP_MASK_CPU
;
1052 if (cgroup_context_has_io_config(c
) || cgroup_context_has_blockio_config(c
))
1053 mask
|= CGROUP_MASK_IO
| CGROUP_MASK_BLKIO
;
1055 if (c
->memory_accounting
||
1056 c
->memory_limit
!= CGROUP_LIMIT_MAX
||
1057 cgroup_context_has_unified_memory_config(c
))
1058 mask
|= CGROUP_MASK_MEMORY
;
1060 if (c
->device_allow
||
1061 c
->device_policy
!= CGROUP_AUTO
)
1062 mask
|= CGROUP_MASK_DEVICES
;
1064 if (c
->tasks_accounting
||
1065 c
->tasks_max
!= (uint64_t) -1)
1066 mask
|= CGROUP_MASK_PIDS
;
1071 CGroupMask
unit_get_own_mask(Unit
*u
) {
1074 /* Returns the mask of controllers the unit needs for itself */
1076 c
= unit_get_cgroup_context(u
);
1080 return cgroup_context_get_mask(c
) | unit_get_delegate_mask(u
);
1083 CGroupMask
unit_get_delegate_mask(Unit
*u
) {
1086 /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1087 * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1089 * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1091 if (u
->type
== UNIT_SLICE
)
1094 c
= unit_get_cgroup_context(u
);
1101 if (cg_all_unified() <= 0) {
1104 e
= unit_get_exec_context(u
);
1105 if (e
&& !exec_context_maintains_privileges(e
))
1109 return c
->delegate_controllers
;
1112 CGroupMask
unit_get_members_mask(Unit
*u
) {
1115 /* Returns the mask of controllers all of the unit's children require, merged */
1117 if (u
->cgroup_members_mask_valid
)
1118 return u
->cgroup_members_mask
;
1120 u
->cgroup_members_mask
= 0;
1122 if (u
->type
== UNIT_SLICE
) {
1127 HASHMAP_FOREACH_KEY(v
, member
, u
->dependencies
[UNIT_BEFORE
], i
) {
1132 if (UNIT_DEREF(member
->slice
) != u
)
1135 u
->cgroup_members_mask
|= unit_get_subtree_mask(member
); /* note that this calls ourselves again, for the children */
1139 u
->cgroup_members_mask_valid
= true;
1140 return u
->cgroup_members_mask
;
1143 CGroupMask
unit_get_siblings_mask(Unit
*u
) {
1146 /* Returns the mask of controllers all of the unit's siblings
1147 * require, i.e. the members mask of the unit's parent slice
1148 * if there is one. */
1150 if (UNIT_ISSET(u
->slice
))
1151 return unit_get_members_mask(UNIT_DEREF(u
->slice
));
1153 return unit_get_subtree_mask(u
); /* we are the top-level slice */
1156 CGroupMask
unit_get_subtree_mask(Unit
*u
) {
1158 /* Returns the mask of this subtree, meaning of the group
1159 * itself and its children. */
1161 return unit_get_own_mask(u
) | unit_get_members_mask(u
);
1164 CGroupMask
unit_get_target_mask(Unit
*u
) {
1167 /* This returns the cgroup mask of all controllers to enable
1168 * for a specific cgroup, i.e. everything it needs itself,
1169 * plus all that its children need, plus all that its siblings
1170 * need. This is primarily useful on the legacy cgroup
1171 * hierarchy, where we need to duplicate each cgroup in each
1172 * hierarchy that shall be enabled for it. */
1174 mask
= unit_get_own_mask(u
) | unit_get_members_mask(u
) | unit_get_siblings_mask(u
);
1175 mask
&= u
->manager
->cgroup_supported
;
1180 CGroupMask
unit_get_enable_mask(Unit
*u
) {
1183 /* This returns the cgroup mask of all controllers to enable
1184 * for the children of a specific cgroup. This is primarily
1185 * useful for the unified cgroup hierarchy, where each cgroup
1186 * controls which controllers are enabled for its children. */
1188 mask
= unit_get_members_mask(u
);
1189 mask
&= u
->manager
->cgroup_supported
;
1194 bool unit_get_needs_bpf(Unit
*u
) {
1199 /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
1201 if (u
->type
== UNIT_SLICE
)
1204 c
= unit_get_cgroup_context(u
);
1208 if (c
->ip_accounting
||
1209 c
->ip_address_allow
||
1213 /* If any parent slice has an IP access list defined, it applies too */
1214 for (p
= UNIT_DEREF(u
->slice
); p
; p
= UNIT_DEREF(p
->slice
)) {
1215 c
= unit_get_cgroup_context(p
);
1219 if (c
->ip_address_allow
||
1227 /* Recurse from a unit up through its containing slices, propagating
1228 * mask bits upward. A unit is also member of itself. */
1229 void unit_update_cgroup_members_masks(Unit
*u
) {
1235 /* Calculate subtree mask */
1236 m
= unit_get_subtree_mask(u
);
1238 /* See if anything changed from the previous invocation. If
1239 * not, we're done. */
1240 if (u
->cgroup_subtree_mask_valid
&& m
== u
->cgroup_subtree_mask
)
1244 u
->cgroup_subtree_mask_valid
&&
1245 ((m
& ~u
->cgroup_subtree_mask
) != 0) &&
1246 ((~m
& u
->cgroup_subtree_mask
) == 0);
1248 u
->cgroup_subtree_mask
= m
;
1249 u
->cgroup_subtree_mask_valid
= true;
1251 if (UNIT_ISSET(u
->slice
)) {
1252 Unit
*s
= UNIT_DEREF(u
->slice
);
1255 /* There's more set now than before. We
1256 * propagate the new mask to the parent's mask
1257 * (not caring if it actually was valid or
1260 s
->cgroup_members_mask
|= m
;
1263 /* There's less set now than before (or we
1264 * don't know), we need to recalculate
1265 * everything, so let's invalidate the
1266 * parent's members mask */
1268 s
->cgroup_members_mask_valid
= false;
1270 /* And now make sure that this change also hits our
1272 unit_update_cgroup_members_masks(s
);
1276 static const char *migrate_callback(CGroupMask mask
, void *userdata
) {
1283 if (u
->cgroup_path
&&
1284 u
->cgroup_realized
&&
1285 (u
->cgroup_realized_mask
& mask
) == mask
)
1286 return u
->cgroup_path
;
1288 u
= UNIT_DEREF(u
->slice
);
1294 char *unit_default_cgroup_path(Unit
*u
) {
1295 _cleanup_free_
char *escaped
= NULL
, *slice
= NULL
;
1300 if (unit_has_name(u
, SPECIAL_ROOT_SLICE
))
1301 return strdup(u
->manager
->cgroup_root
);
1303 if (UNIT_ISSET(u
->slice
) && !unit_has_name(UNIT_DEREF(u
->slice
), SPECIAL_ROOT_SLICE
)) {
1304 r
= cg_slice_to_path(UNIT_DEREF(u
->slice
)->id
, &slice
);
1309 escaped
= cg_escape(u
->id
);
1314 return strjoin(u
->manager
->cgroup_root
, "/", slice
, "/",
1317 return strjoin(u
->manager
->cgroup_root
, "/", escaped
);
1320 int unit_set_cgroup_path(Unit
*u
, const char *path
) {
1321 _cleanup_free_
char *p
= NULL
;
1333 if (streq_ptr(u
->cgroup_path
, p
))
1337 r
= hashmap_put(u
->manager
->cgroup_unit
, p
, u
);
1342 unit_release_cgroup(u
);
1350 int unit_watch_cgroup(Unit
*u
) {
1351 _cleanup_free_
char *events
= NULL
;
1356 if (!u
->cgroup_path
)
1359 if (u
->cgroup_inotify_wd
>= 0)
1362 /* Only applies to the unified hierarchy */
1363 r
= cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
);
1365 return log_error_errno(r
, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1369 /* Don't watch the root slice, it's pointless. */
1370 if (unit_has_name(u
, SPECIAL_ROOT_SLICE
))
1373 r
= hashmap_ensure_allocated(&u
->manager
->cgroup_inotify_wd_unit
, &trivial_hash_ops
);
1377 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, u
->cgroup_path
, "cgroup.events", &events
);
1381 u
->cgroup_inotify_wd
= inotify_add_watch(u
->manager
->cgroup_inotify_fd
, events
, IN_MODIFY
);
1382 if (u
->cgroup_inotify_wd
< 0) {
1384 if (errno
== ENOENT
) /* If the directory is already
1385 * gone we don't need to track
1386 * it, so this is not an error */
1389 return log_unit_error_errno(u
, errno
, "Failed to add inotify watch descriptor for control group %s: %m", u
->cgroup_path
);
1392 r
= hashmap_put(u
->manager
->cgroup_inotify_wd_unit
, INT_TO_PTR(u
->cgroup_inotify_wd
), u
);
1394 return log_unit_error_errno(u
, r
, "Failed to add inotify watch descriptor to hash map: %m");
1399 int unit_pick_cgroup_path(Unit
*u
) {
1400 _cleanup_free_
char *path
= NULL
;
1408 if (!UNIT_HAS_CGROUP_CONTEXT(u
))
1411 path
= unit_default_cgroup_path(u
);
1415 r
= unit_set_cgroup_path(u
, path
);
1417 return log_unit_error_errno(u
, r
, "Control group %s exists already.", path
);
1419 return log_unit_error_errno(u
, r
, "Failed to set unit's control group path to %s: %m", path
);
1424 static int unit_create_cgroup(
1426 CGroupMask target_mask
,
1427 CGroupMask enable_mask
,
1435 c
= unit_get_cgroup_context(u
);
1439 /* Figure out our cgroup path */
1440 r
= unit_pick_cgroup_path(u
);
1444 /* First, create our own group */
1445 r
= cg_create_everywhere(u
->manager
->cgroup_supported
, target_mask
, u
->cgroup_path
);
1447 return log_unit_error_errno(u
, r
, "Failed to create cgroup %s: %m", u
->cgroup_path
);
1449 /* Start watching it */
1450 (void) unit_watch_cgroup(u
);
1452 /* Enable all controllers we need */
1453 r
= cg_enable_everywhere(u
->manager
->cgroup_supported
, enable_mask
, u
->cgroup_path
);
1455 log_unit_warning_errno(u
, r
, "Failed to enable controllers on cgroup %s, ignoring: %m", u
->cgroup_path
);
1457 /* Keep track that this is now realized */
1458 u
->cgroup_realized
= true;
1459 u
->cgroup_realized_mask
= target_mask
;
1460 u
->cgroup_enabled_mask
= enable_mask
;
1461 u
->cgroup_bpf_state
= needs_bpf
? UNIT_CGROUP_BPF_ON
: UNIT_CGROUP_BPF_OFF
;
1463 if (u
->type
!= UNIT_SLICE
&& !c
->delegate
) {
1465 /* Then, possibly move things over, but not if
1466 * subgroups may contain processes, which is the case
1467 * for slice and delegation units. */
1468 r
= cg_migrate_everywhere(u
->manager
->cgroup_supported
, u
->cgroup_path
, u
->cgroup_path
, migrate_callback
, u
);
1470 log_unit_warning_errno(u
, r
, "Failed to migrate cgroup from to %s, ignoring: %m", u
->cgroup_path
);
1476 int unit_attach_pids_to_cgroup(Unit
*u
) {
1480 r
= unit_realize_cgroup(u
);
1484 r
= cg_attach_many_everywhere(u
->manager
->cgroup_supported
, u
->cgroup_path
, u
->pids
, migrate_callback
, u
);
1491 static void cgroup_xattr_apply(Unit
*u
) {
1492 char ids
[SD_ID128_STRING_MAX
];
1497 if (!MANAGER_IS_SYSTEM(u
->manager
))
1500 if (sd_id128_is_null(u
->invocation_id
))
1503 r
= cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER
, u
->cgroup_path
,
1504 "trusted.invocation_id",
1505 sd_id128_to_string(u
->invocation_id
, ids
), 32,
1508 log_unit_debug_errno(u
, r
, "Failed to set invocation ID on control group %s, ignoring: %m", u
->cgroup_path
);
1511 static bool unit_has_mask_realized(
1513 CGroupMask target_mask
,
1514 CGroupMask enable_mask
,
1519 return u
->cgroup_realized
&&
1520 u
->cgroup_realized_mask
== target_mask
&&
1521 u
->cgroup_enabled_mask
== enable_mask
&&
1522 ((needs_bpf
&& u
->cgroup_bpf_state
== UNIT_CGROUP_BPF_ON
) ||
1523 (!needs_bpf
&& u
->cgroup_bpf_state
== UNIT_CGROUP_BPF_OFF
));
1526 static void unit_add_to_cgroup_realize_queue(Unit
*u
) {
1529 if (u
->in_cgroup_realize_queue
)
1532 LIST_PREPEND(cgroup_realize_queue
, u
->manager
->cgroup_realize_queue
, u
);
1533 u
->in_cgroup_realize_queue
= true;
1536 static void unit_remove_from_cgroup_realize_queue(Unit
*u
) {
1539 if (!u
->in_cgroup_realize_queue
)
1542 LIST_REMOVE(cgroup_realize_queue
, u
->manager
->cgroup_realize_queue
, u
);
1543 u
->in_cgroup_realize_queue
= false;
1547 /* Check if necessary controllers and attributes for a unit are in place.
1549 * If so, do nothing.
1550 * If not, create paths, move processes over, and set attributes.
1552 * Returns 0 on success and < 0 on failure. */
1553 static int unit_realize_cgroup_now(Unit
*u
, ManagerState state
) {
1554 CGroupMask target_mask
, enable_mask
;
1555 bool needs_bpf
, apply_bpf
;
1560 unit_remove_from_cgroup_realize_queue(u
);
1562 target_mask
= unit_get_target_mask(u
);
1563 enable_mask
= unit_get_enable_mask(u
);
1564 needs_bpf
= unit_get_needs_bpf(u
);
1566 if (unit_has_mask_realized(u
, target_mask
, enable_mask
, needs_bpf
))
1569 /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1570 * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1571 * this will trickle down properly to cgroupfs. */
1572 apply_bpf
= needs_bpf
|| u
->cgroup_bpf_state
!= UNIT_CGROUP_BPF_OFF
;
1574 /* First, realize parents */
1575 if (UNIT_ISSET(u
->slice
)) {
1576 r
= unit_realize_cgroup_now(UNIT_DEREF(u
->slice
), state
);
1581 /* And then do the real work */
1582 r
= unit_create_cgroup(u
, target_mask
, enable_mask
, needs_bpf
);
1586 /* Finally, apply the necessary attributes. */
1587 cgroup_context_apply(u
, target_mask
, apply_bpf
, state
);
1588 cgroup_xattr_apply(u
);
1593 unsigned manager_dispatch_cgroup_realize_queue(Manager
*m
) {
1601 state
= manager_state(m
);
1603 while ((i
= m
->cgroup_realize_queue
)) {
1604 assert(i
->in_cgroup_realize_queue
);
1606 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i
))) {
1607 /* Maybe things changed, and the unit is not actually active anymore? */
1608 unit_remove_from_cgroup_realize_queue(i
);
1612 r
= unit_realize_cgroup_now(i
, state
);
1614 log_warning_errno(r
, "Failed to realize cgroups for queued unit %s, ignoring: %m", i
->id
);
1622 static void unit_add_siblings_to_cgroup_realize_queue(Unit
*u
) {
1625 /* This adds the siblings of the specified unit and the
1626 * siblings of all parent units to the cgroup queue. (But
1627 * neither the specified unit itself nor the parents.) */
1629 while ((slice
= UNIT_DEREF(u
->slice
))) {
1634 HASHMAP_FOREACH_KEY(v
, m
, u
->dependencies
[UNIT_BEFORE
], i
) {
1638 /* Skip units that have a dependency on the slice
1639 * but aren't actually in it. */
1640 if (UNIT_DEREF(m
->slice
) != slice
)
1643 /* No point in doing cgroup application for units
1644 * without active processes. */
1645 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m
)))
1648 /* If the unit doesn't need any new controllers
1649 * and has current ones realized, it doesn't need
1651 if (unit_has_mask_realized(m
,
1652 unit_get_target_mask(m
),
1653 unit_get_enable_mask(m
),
1654 unit_get_needs_bpf(m
)))
1657 unit_add_to_cgroup_realize_queue(m
);
1664 int unit_realize_cgroup(Unit
*u
) {
1667 if (!UNIT_HAS_CGROUP_CONTEXT(u
))
1670 /* So, here's the deal: when realizing the cgroups for this
1671 * unit, we need to first create all parents, but there's more
1672 * actually: for the weight-based controllers we also need to
1673 * make sure that all our siblings (i.e. units that are in the
1674 * same slice as we are) have cgroups, too. Otherwise, things
1675 * would become very uneven as each of their processes would
1676 * get as much resources as all our group together. This call
1677 * will synchronously create the parent cgroups, but will
1678 * defer work on the siblings to the next event loop
1681 /* Add all sibling slices to the cgroup queue. */
1682 unit_add_siblings_to_cgroup_realize_queue(u
);
1684 /* And realize this one now (and apply the values) */
1685 return unit_realize_cgroup_now(u
, manager_state(u
->manager
));
1688 void unit_release_cgroup(Unit
*u
) {
1691 /* Forgets all cgroup details for this cgroup */
1693 if (u
->cgroup_path
) {
1694 (void) hashmap_remove(u
->manager
->cgroup_unit
, u
->cgroup_path
);
1695 u
->cgroup_path
= mfree(u
->cgroup_path
);
1698 if (u
->cgroup_inotify_wd
>= 0) {
1699 if (inotify_rm_watch(u
->manager
->cgroup_inotify_fd
, u
->cgroup_inotify_wd
) < 0)
1700 log_unit_debug_errno(u
, errno
, "Failed to remove cgroup inotify watch %i for %s, ignoring", u
->cgroup_inotify_wd
, u
->id
);
1702 (void) hashmap_remove(u
->manager
->cgroup_inotify_wd_unit
, INT_TO_PTR(u
->cgroup_inotify_wd
));
1703 u
->cgroup_inotify_wd
= -1;
1707 void unit_prune_cgroup(Unit
*u
) {
1713 /* Removes the cgroup, if empty and possible, and stops watching it. */
1715 if (!u
->cgroup_path
)
1718 (void) unit_get_cpu_usage(u
, NULL
); /* Cache the last CPU usage value before we destroy the cgroup */
1720 is_root_slice
= unit_has_name(u
, SPECIAL_ROOT_SLICE
);
1722 r
= cg_trim_everywhere(u
->manager
->cgroup_supported
, u
->cgroup_path
, !is_root_slice
);
1724 log_unit_debug_errno(u
, r
, "Failed to destroy cgroup %s, ignoring: %m", u
->cgroup_path
);
1731 unit_release_cgroup(u
);
1733 u
->cgroup_realized
= false;
1734 u
->cgroup_realized_mask
= 0;
1735 u
->cgroup_enabled_mask
= 0;
1738 int unit_search_main_pid(Unit
*u
, pid_t
*ret
) {
1739 _cleanup_fclose_
FILE *f
= NULL
;
1740 pid_t pid
= 0, npid
, mypid
;
1746 if (!u
->cgroup_path
)
1749 r
= cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER
, u
->cgroup_path
, &f
);
1753 mypid
= getpid_cached();
1754 while (cg_read_pid(f
, &npid
) > 0) {
1760 /* Ignore processes that aren't our kids */
1761 if (get_process_ppid(npid
, &ppid
) >= 0 && ppid
!= mypid
)
1765 /* Dang, there's more than one daemonized PID
1766 in this group, so we don't know what process
1767 is the main process. */
1778 static int unit_watch_pids_in_path(Unit
*u
, const char *path
) {
1779 _cleanup_closedir_
DIR *d
= NULL
;
1780 _cleanup_fclose_
FILE *f
= NULL
;
1786 r
= cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER
, path
, &f
);
1792 while ((r
= cg_read_pid(f
, &pid
)) > 0) {
1793 r
= unit_watch_pid(u
, pid
);
1794 if (r
< 0 && ret
>= 0)
1798 if (r
< 0 && ret
>= 0)
1802 r
= cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER
, path
, &d
);
1809 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
1810 _cleanup_free_
char *p
= NULL
;
1812 p
= strjoin(path
, "/", fn
);
1818 r
= unit_watch_pids_in_path(u
, p
);
1819 if (r
< 0 && ret
>= 0)
1823 if (r
< 0 && ret
>= 0)
1830 int unit_watch_all_pids(Unit
*u
) {
1835 /* Adds all PIDs from our cgroup to the set of PIDs we
1836 * watch. This is a fallback logic for cases where we do not
1837 * get reliable cgroup empty notifications: we try to use
1838 * SIGCHLD as replacement. */
1840 if (!u
->cgroup_path
)
1843 r
= cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
);
1846 if (r
> 0) /* On unified we can use proper notifications */
1849 return unit_watch_pids_in_path(u
, u
->cgroup_path
);
1852 static int on_cgroup_empty_event(sd_event_source
*s
, void *userdata
) {
1853 Manager
*m
= userdata
;
1860 u
= m
->cgroup_empty_queue
;
1864 assert(u
->in_cgroup_empty_queue
);
1865 u
->in_cgroup_empty_queue
= false;
1866 LIST_REMOVE(cgroup_empty_queue
, m
->cgroup_empty_queue
, u
);
1868 if (m
->cgroup_empty_queue
) {
1869 /* More stuff queued, let's make sure we remain enabled */
1870 r
= sd_event_source_set_enabled(s
, SD_EVENT_ONESHOT
);
1872 log_debug_errno(r
, "Failed to reenable cgroup empty event source: %m");
1875 unit_add_to_gc_queue(u
);
1877 if (UNIT_VTABLE(u
)->notify_cgroup_empty
)
1878 UNIT_VTABLE(u
)->notify_cgroup_empty(u
);
1883 void unit_add_to_cgroup_empty_queue(Unit
*u
) {
1888 /* Note that there are four different ways how cgroup empty events reach us:
1890 * 1. On the unified hierarchy we get an inotify event on the cgroup
1892 * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
1894 * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
1896 * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
1897 * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
1899 * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
1900 * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
1901 * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
1902 * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
1903 * case for scope units). */
1905 if (u
->in_cgroup_empty_queue
)
1908 /* Let's verify that the cgroup is really empty */
1909 if (!u
->cgroup_path
)
1911 r
= cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER
, u
->cgroup_path
);
1913 log_unit_debug_errno(u
, r
, "Failed to determine whether cgroup %s is empty: %m", u
->cgroup_path
);
1919 LIST_PREPEND(cgroup_empty_queue
, u
->manager
->cgroup_empty_queue
, u
);
1920 u
->in_cgroup_empty_queue
= true;
1922 /* Trigger the defer event */
1923 r
= sd_event_source_set_enabled(u
->manager
->cgroup_empty_event_source
, SD_EVENT_ONESHOT
);
1925 log_debug_errno(r
, "Failed to enable cgroup empty event source: %m");
1928 static int on_cgroup_inotify_event(sd_event_source
*s
, int fd
, uint32_t revents
, void *userdata
) {
1929 Manager
*m
= userdata
;
1936 union inotify_event_buffer buffer
;
1937 struct inotify_event
*e
;
1940 l
= read(fd
, &buffer
, sizeof(buffer
));
1942 if (IN_SET(errno
, EINTR
, EAGAIN
))
1945 return log_error_errno(errno
, "Failed to read control group inotify events: %m");
1948 FOREACH_INOTIFY_EVENT(e
, buffer
, l
) {
1952 /* Queue overflow has no watch descriptor */
1955 if (e
->mask
& IN_IGNORED
)
1956 /* The watch was just removed */
1959 u
= hashmap_get(m
->cgroup_inotify_wd_unit
, INT_TO_PTR(e
->wd
));
1960 if (!u
) /* Not that inotify might deliver
1961 * events for a watch even after it
1962 * was removed, because it was queued
1963 * before the removal. Let's ignore
1964 * this here safely. */
1967 unit_add_to_cgroup_empty_queue(u
);
1972 int manager_setup_cgroup(Manager
*m
) {
1973 _cleanup_free_
char *path
= NULL
;
1974 const char *scope_path
;
1981 /* 1. Determine hierarchy */
1982 m
->cgroup_root
= mfree(m
->cgroup_root
);
1983 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 0, &m
->cgroup_root
);
1985 return log_error_errno(r
, "Cannot determine cgroup we are running in: %m");
1987 /* Chop off the init scope, if we are already located in it */
1988 e
= endswith(m
->cgroup_root
, "/" SPECIAL_INIT_SCOPE
);
1990 /* LEGACY: Also chop off the system slice if we are in
1991 * it. This is to support live upgrades from older systemd
1992 * versions where PID 1 was moved there. Also see
1993 * cg_get_root_path(). */
1994 if (!e
&& MANAGER_IS_SYSTEM(m
)) {
1995 e
= endswith(m
->cgroup_root
, "/" SPECIAL_SYSTEM_SLICE
);
1997 e
= endswith(m
->cgroup_root
, "/system"); /* even more legacy */
2002 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2003 * easily prepend it everywhere. */
2004 delete_trailing_chars(m
->cgroup_root
, "/");
2007 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, m
->cgroup_root
, NULL
, &path
);
2009 return log_error_errno(r
, "Cannot find cgroup mount point: %m");
2011 r
= cg_unified_flush();
2013 return log_error_errno(r
, "Couldn't determine if we are running in the unified hierarchy: %m");
2015 all_unified
= cg_all_unified();
2016 if (all_unified
< 0)
2017 return log_error_errno(all_unified
, "Couldn't determine whether we are in all unified mode: %m");
2018 if (all_unified
> 0)
2019 log_debug("Unified cgroup hierarchy is located at %s.", path
);
2021 r
= cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
);
2023 return log_error_errno(r
, "Failed to determine whether systemd's own controller is in unified mode: %m");
2025 log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path
);
2027 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY
". File system hierarchy is at %s.", path
);
2030 /* 3. Allocate cgroup empty defer event source */
2031 m
->cgroup_empty_event_source
= sd_event_source_unref(m
->cgroup_empty_event_source
);
2032 r
= sd_event_add_defer(m
->event
, &m
->cgroup_empty_event_source
, on_cgroup_empty_event
, m
);
2034 return log_error_errno(r
, "Failed to create cgroup empty event source: %m");
2036 r
= sd_event_source_set_priority(m
->cgroup_empty_event_source
, SD_EVENT_PRIORITY_NORMAL
-5);
2038 return log_error_errno(r
, "Failed to set priority of cgroup empty event source: %m");
2040 r
= sd_event_source_set_enabled(m
->cgroup_empty_event_source
, SD_EVENT_OFF
);
2042 return log_error_errno(r
, "Failed to disable cgroup empty event source: %m");
2044 (void) sd_event_source_set_description(m
->cgroup_empty_event_source
, "cgroup-empty");
2046 /* 4. Install notifier inotify object, or agent */
2047 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
) > 0) {
2049 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2051 m
->cgroup_inotify_event_source
= sd_event_source_unref(m
->cgroup_inotify_event_source
);
2052 safe_close(m
->cgroup_inotify_fd
);
2054 m
->cgroup_inotify_fd
= inotify_init1(IN_NONBLOCK
|IN_CLOEXEC
);
2055 if (m
->cgroup_inotify_fd
< 0)
2056 return log_error_errno(errno
, "Failed to create control group inotify object: %m");
2058 r
= sd_event_add_io(m
->event
, &m
->cgroup_inotify_event_source
, m
->cgroup_inotify_fd
, EPOLLIN
, on_cgroup_inotify_event
, m
);
2060 return log_error_errno(r
, "Failed to watch control group inotify object: %m");
2062 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2063 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2064 r
= sd_event_source_set_priority(m
->cgroup_inotify_event_source
, SD_EVENT_PRIORITY_NORMAL
-4);
2066 return log_error_errno(r
, "Failed to set priority of inotify event source: %m");
2068 (void) sd_event_source_set_description(m
->cgroup_inotify_event_source
, "cgroup-inotify");
2070 } else if (MANAGER_IS_SYSTEM(m
) && m
->test_run_flags
== 0) {
2072 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2073 * since it does not generate events when control groups with children run empty. */
2075 r
= cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER
, SYSTEMD_CGROUP_AGENT_PATH
);
2077 log_warning_errno(r
, "Failed to install release agent, ignoring: %m");
2079 log_debug("Installed release agent.");
2081 log_debug("Release agent already installed.");
2084 /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2085 scope_path
= strjoina(m
->cgroup_root
, "/" SPECIAL_INIT_SCOPE
);
2086 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, scope_path
, 0);
2088 return log_error_errno(r
, "Failed to create %s control group: %m", scope_path
);
2090 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2091 r
= cg_migrate(SYSTEMD_CGROUP_CONTROLLER
, m
->cgroup_root
, SYSTEMD_CGROUP_CONTROLLER
, scope_path
, 0);
2093 log_warning_errno(r
, "Couldn't move remaining userspace processes, ignoring: %m");
2095 /* 6. And pin it, so that it cannot be unmounted */
2096 safe_close(m
->pin_cgroupfs_fd
);
2097 m
->pin_cgroupfs_fd
= open(path
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
|O_NOCTTY
|O_NONBLOCK
);
2098 if (m
->pin_cgroupfs_fd
< 0)
2099 return log_error_errno(errno
, "Failed to open pin file: %m");
2101 /* 7. Always enable hierarchical support if it exists... */
2102 if (!all_unified
&& m
->test_run_flags
== 0)
2103 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2105 /* 8. Figure out which controllers are supported, and log about it */
2106 r
= cg_mask_supported(&m
->cgroup_supported
);
2108 return log_error_errno(r
, "Failed to determine supported controllers: %m");
2109 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++)
2110 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c
), yes_no(m
->cgroup_supported
& CGROUP_CONTROLLER_TO_MASK(c
)));
2115 void manager_shutdown_cgroup(Manager
*m
, bool delete) {
2118 /* We can't really delete the group, since we are in it. But
2120 if (delete && m
->cgroup_root
)
2121 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER
, m
->cgroup_root
, false);
2123 m
->cgroup_empty_event_source
= sd_event_source_unref(m
->cgroup_empty_event_source
);
2125 m
->cgroup_inotify_wd_unit
= hashmap_free(m
->cgroup_inotify_wd_unit
);
2127 m
->cgroup_inotify_event_source
= sd_event_source_unref(m
->cgroup_inotify_event_source
);
2128 m
->cgroup_inotify_fd
= safe_close(m
->cgroup_inotify_fd
);
2130 m
->pin_cgroupfs_fd
= safe_close(m
->pin_cgroupfs_fd
);
2132 m
->cgroup_root
= mfree(m
->cgroup_root
);
2135 Unit
* manager_get_unit_by_cgroup(Manager
*m
, const char *cgroup
) {
2142 u
= hashmap_get(m
->cgroup_unit
, cgroup
);
2146 p
= strdupa(cgroup
);
2150 e
= strrchr(p
, '/');
2152 return hashmap_get(m
->cgroup_unit
, SPECIAL_ROOT_SLICE
);
2156 u
= hashmap_get(m
->cgroup_unit
, p
);
2162 Unit
*manager_get_unit_by_pid_cgroup(Manager
*m
, pid_t pid
) {
2163 _cleanup_free_
char *cgroup
= NULL
;
2171 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &cgroup
);
2175 return manager_get_unit_by_cgroup(m
, cgroup
);
2178 Unit
*manager_get_unit_by_pid(Manager
*m
, pid_t pid
) {
2186 if (pid
== getpid_cached())
2187 return hashmap_get(m
->units
, SPECIAL_INIT_SCOPE
);
2189 u
= hashmap_get(m
->watch_pids1
, PID_TO_PTR(pid
));
2193 u
= hashmap_get(m
->watch_pids2
, PID_TO_PTR(pid
));
2197 return manager_get_unit_by_pid_cgroup(m
, pid
);
2200 int manager_notify_cgroup_empty(Manager
*m
, const char *cgroup
) {
2206 /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2207 * or from the --system instance */
2209 log_debug("Got cgroup empty notification for: %s", cgroup
);
2211 u
= manager_get_unit_by_cgroup(m
, cgroup
);
2215 unit_add_to_cgroup_empty_queue(u
);
2219 int unit_get_memory_current(Unit
*u
, uint64_t *ret
) {
2220 _cleanup_free_
char *v
= NULL
;
2226 if (!UNIT_CGROUP_BOOL(u
, memory_accounting
))
2229 if (!u
->cgroup_path
)
2232 if ((u
->cgroup_realized_mask
& CGROUP_MASK_MEMORY
) == 0)
2235 r
= cg_all_unified();
2239 r
= cg_get_attribute("memory", u
->cgroup_path
, "memory.current", &v
);
2241 r
= cg_get_attribute("memory", u
->cgroup_path
, "memory.usage_in_bytes", &v
);
2247 return safe_atou64(v
, ret
);
2250 int unit_get_tasks_current(Unit
*u
, uint64_t *ret
) {
2251 _cleanup_free_
char *v
= NULL
;
2257 if (!UNIT_CGROUP_BOOL(u
, tasks_accounting
))
2260 if (!u
->cgroup_path
)
2263 if ((u
->cgroup_realized_mask
& CGROUP_MASK_PIDS
) == 0)
2266 r
= cg_get_attribute("pids", u
->cgroup_path
, "pids.current", &v
);
2272 return safe_atou64(v
, ret
);
2275 static int unit_get_cpu_usage_raw(Unit
*u
, nsec_t
*ret
) {
2276 _cleanup_free_
char *v
= NULL
;
2283 if (!u
->cgroup_path
)
2286 r
= cg_all_unified();
2290 const char *keys
[] = { "usage_usec", NULL
};
2291 _cleanup_free_
char *val
= NULL
;
2294 if ((u
->cgroup_realized_mask
& CGROUP_MASK_CPU
) == 0)
2297 r
= cg_get_keyed_attribute("cpu", u
->cgroup_path
, "cpu.stat", keys
, &val
);
2301 r
= safe_atou64(val
, &us
);
2305 ns
= us
* NSEC_PER_USEC
;
2307 if ((u
->cgroup_realized_mask
& CGROUP_MASK_CPUACCT
) == 0)
2310 r
= cg_get_attribute("cpuacct", u
->cgroup_path
, "cpuacct.usage", &v
);
2316 r
= safe_atou64(v
, &ns
);
2325 int unit_get_cpu_usage(Unit
*u
, nsec_t
*ret
) {
2331 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2332 * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2333 * call this function with a NULL return value. */
2335 if (!UNIT_CGROUP_BOOL(u
, cpu_accounting
))
2338 r
= unit_get_cpu_usage_raw(u
, &ns
);
2339 if (r
== -ENODATA
&& u
->cpu_usage_last
!= NSEC_INFINITY
) {
2340 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2344 *ret
= u
->cpu_usage_last
;
2350 if (ns
> u
->cpu_usage_base
)
2351 ns
-= u
->cpu_usage_base
;
2355 u
->cpu_usage_last
= ns
;
2362 int unit_get_ip_accounting(
2364 CGroupIPAccountingMetric metric
,
2371 assert(metric
>= 0);
2372 assert(metric
< _CGROUP_IP_ACCOUNTING_METRIC_MAX
);
2375 /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
2376 * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero
2377 * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup
2379 if (u
->type
== UNIT_SLICE
)
2382 if (!UNIT_CGROUP_BOOL(u
, ip_accounting
))
2385 fd
= IN_SET(metric
, CGROUP_IP_INGRESS_BYTES
, CGROUP_IP_INGRESS_PACKETS
) ?
2386 u
->ip_accounting_ingress_map_fd
:
2387 u
->ip_accounting_egress_map_fd
;
2391 if (IN_SET(metric
, CGROUP_IP_INGRESS_BYTES
, CGROUP_IP_EGRESS_BYTES
))
2392 r
= bpf_firewall_read_accounting(fd
, &value
, NULL
);
2394 r
= bpf_firewall_read_accounting(fd
, NULL
, &value
);
2398 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2399 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2400 * ip_accounting_extra[] field, and add them in here transparently. */
2402 *ret
= value
+ u
->ip_accounting_extra
[metric
];
2407 int unit_reset_cpu_accounting(Unit
*u
) {
2413 u
->cpu_usage_last
= NSEC_INFINITY
;
2415 r
= unit_get_cpu_usage_raw(u
, &ns
);
2417 u
->cpu_usage_base
= 0;
2421 u
->cpu_usage_base
= ns
;
2425 int unit_reset_ip_accounting(Unit
*u
) {
2430 if (u
->ip_accounting_ingress_map_fd
>= 0)
2431 r
= bpf_firewall_reset_accounting(u
->ip_accounting_ingress_map_fd
);
2433 if (u
->ip_accounting_egress_map_fd
>= 0)
2434 q
= bpf_firewall_reset_accounting(u
->ip_accounting_egress_map_fd
);
2436 zero(u
->ip_accounting_extra
);
2438 return r
< 0 ? r
: q
;
2441 void unit_invalidate_cgroup(Unit
*u
, CGroupMask m
) {
2444 if (!UNIT_HAS_CGROUP_CONTEXT(u
))
2450 /* always invalidate compat pairs together */
2451 if (m
& (CGROUP_MASK_IO
| CGROUP_MASK_BLKIO
))
2452 m
|= CGROUP_MASK_IO
| CGROUP_MASK_BLKIO
;
2454 if (m
& (CGROUP_MASK_CPU
| CGROUP_MASK_CPUACCT
))
2455 m
|= CGROUP_MASK_CPU
| CGROUP_MASK_CPUACCT
;
2457 if ((u
->cgroup_realized_mask
& m
) == 0) /* NOP? */
2460 u
->cgroup_realized_mask
&= ~m
;
2461 unit_add_to_cgroup_realize_queue(u
);
2464 void unit_invalidate_cgroup_bpf(Unit
*u
) {
2467 if (!UNIT_HAS_CGROUP_CONTEXT(u
))
2470 if (u
->cgroup_bpf_state
== UNIT_CGROUP_BPF_INVALIDATED
) /* NOP? */
2473 u
->cgroup_bpf_state
= UNIT_CGROUP_BPF_INVALIDATED
;
2474 unit_add_to_cgroup_realize_queue(u
);
2476 /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
2477 * list of our children includes our own. */
2478 if (u
->type
== UNIT_SLICE
) {
2483 HASHMAP_FOREACH_KEY(v
, member
, u
->dependencies
[UNIT_BEFORE
], i
) {
2487 if (UNIT_DEREF(member
->slice
) != u
)
2490 unit_invalidate_cgroup_bpf(member
);
2495 void manager_invalidate_startup_units(Manager
*m
) {
2501 SET_FOREACH(u
, m
->startup_units
, i
)
2502 unit_invalidate_cgroup(u
, CGROUP_MASK_CPU
|CGROUP_MASK_IO
|CGROUP_MASK_BLKIO
);
2505 static const char* const cgroup_device_policy_table
[_CGROUP_DEVICE_POLICY_MAX
] = {
2506 [CGROUP_AUTO
] = "auto",
2507 [CGROUP_CLOSED
] = "closed",
2508 [CGROUP_STRICT
] = "strict",
2511 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy
, CGroupDevicePolicy
);