1 /* SPDX-License-Identifier: LGPL-2.1+ */
5 #include "alloc-util.h"
8 #include "format-util.h"
11 #include "mount-util.h"
12 #include "mountpoint-util.h"
13 #include "nspawn-cgroup.h"
14 #include "nspawn-mount.h"
15 #include "path-util.h"
17 #include "string-util.h"
19 #include "user-util.h"
/* Changes ownership of entries below the cgroup directory `path`. The directory is opened with
 * O_RDONLY|O_CLOEXEC|O_DIRECTORY and entries are addressed relative to that descriptor via fchownat().
 * uid_shift is passed as both the owner and the group argument. A failing fchownat() is only logged
 * ("ignoring"): at LOG_DEBUG when errno is ENOENT, at LOG_WARNING otherwise.
 *
 * NOTE(review): several lines of this function are not visible in this excerpt (the check of the open()
 * result, the complete list of file names, the iteration that yields `fn`, and the return statement) —
 * confirm against the complete file. */
22 static int chown_cgroup_path(const char *path
, uid_t uid_shift
) {
23 _cleanup_close_
int fd
= -1;
26 fd
= open(path
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
32 "cgroup.clone_children",
37 "cgroup.subtree_control",
/* Same value for UID and GID; flags argument is 0. */
41 if (fchownat(fd
, fn
, uid_shift
, uid_shift
, 0) < 0)
42 log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
,
43 "Failed to chown \"%s/%s\", ignoring: %m", path
, fn
);
/* Hands the cgroup of process `pid` over to uid_shift. The cgroup path of the process is looked up with
 * cg_pid_get_path(), resolved to a file system path for SYSTEMD_CGROUP_CONTROLLER with cg_get_path(), and
 * passed to chown_cgroup_path(). If unified_requested is CGROUP_UNIFIED_SYSTEMD, or if it is
 * CGROUP_UNIFIED_NONE while cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) reports > 0, the same two
 * steps are repeated for SYSTEMD_CGROUP_CONTROLLER_LEGACY. Failures are reported through
 * log_error_errno(), whose result is returned.
 *
 * NOTE(review): the conditions guarding the error returns, the declaration of `r` and the final return
 * are not visible in this excerpt. */
48 int chown_cgroup(pid_t pid
, CGroupUnified unified_requested
, uid_t uid_shift
) {
49 _cleanup_free_
char *path
= NULL
, *fs
= NULL
;
52 r
= cg_pid_get_path(NULL
, pid
, &path
);
54 return log_error_errno(r
, "Failed to get container cgroup path: %m");
56 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
58 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
60 r
= chown_cgroup_path(fs
, uid_shift
);
62 return log_error_errno(r
, "Failed to chown() cgroup %s: %m", fs
);
64 if (unified_requested
== CGROUP_UNIFIED_SYSTEMD
|| (unified_requested
== CGROUP_UNIFIED_NONE
&& cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
) > 0)) {
65 _cleanup_free_
char *lfs
= NULL
;
66 /* Always propagate access rights from unified to legacy controller */
68 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, NULL
, &lfs
);
70 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
72 r
= chown_cgroup_path(lfs
, uid_shift
);
74 return log_error_errno(r
, "Failed to chown() cgroup %s: %m", lfs
);
/* Replicates the cgroup of `pid` in the "other" systemd hierarchy when the host and the container
 * disagree on whether that hierarchy is unified. The comparison
 * (unified_controller > 0) == (unified_requested >= CGROUP_UNIFIED_SYSTEMD) detects the case where both
 * agree (NOTE(review): the statement executed in that case is not visible here — presumably an early
 * return).
 *
 * Visible sequence otherwise: query the cgroup path of `pid`; mount a hierarchy on a temporary directory
 * (template "/tmp/unifiedXXXXXX") — a "cgroup" mount with "none,name=systemd,xattr" when
 * unified_controller > 0, and a "cgroup2" mount (NOTE(review): presumably the `else` branch, the keyword
 * is not visible); remove <tree><cgroup> with rm_rf() ignoring the result; write the PID into
 * <tree><cgroup>/cgroup.procs; run chown_cgroup_path() on <tree><cgroup>; unmount the temporary mount.
 *
 * NOTE(review): the creation and removal of the temporary directory, the error-check conditions and the
 * return statement are not visible in this excerpt. */
80 int sync_cgroup(pid_t pid
, CGroupUnified unified_requested
, uid_t uid_shift
) {
81 _cleanup_free_
char *cgroup
= NULL
;
82 char tree
[] = "/tmp/unifiedXXXXXX", pid_string
[DECIMAL_STR_MAX(pid
) + 1];
83 bool undo_mount
= false;
85 int r
, unified_controller
;
87 unified_controller
= cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
);
88 if (unified_controller
< 0)
89 return log_error_errno(unified_controller
, "Failed to determine whether the systemd hierarchy is unified: %m");
90 if ((unified_controller
> 0) == (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
))
93 /* When the host uses the legacy cgroup setup, but the
94 * container shall use the unified hierarchy, let's make sure
95 * we copy the path from the name=systemd hierarchy into the
96 * unified hierarchy. Similar for the reverse situation. */
98 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &cgroup
);
100 return log_error_errno(r
, "Failed to get control group of " PID_FMT
": %m", pid
);
102 /* In order to access the unified hierarchy we need to mount it */
104 return log_error_errno(errno
, "Failed to generate temporary mount point for unified hierarchy: %m");
106 if (unified_controller
> 0)
107 r
= mount_verbose(LOG_ERR
, "cgroup", tree
, "cgroup",
108 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "none,name=systemd,xattr");
110 r
= mount_verbose(LOG_ERR
, "cgroup", tree
, "cgroup2",
111 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
117 /* If nspawn dies abruptly the cgroup hierarchy created below
118 * its unit isn't cleaned up. So, let's remove it
119 * https://github.com/systemd/systemd/pull/4223#issuecomment-252519810 */
120 fn
= strjoina(tree
, cgroup
);
121 (void) rm_rf(fn
, REMOVE_ROOT
|REMOVE_ONLY_DIRECTORIES
);
123 fn
= strjoina(tree
, cgroup
, "/cgroup.procs");
125 sprintf(pid_string
, PID_FMT
, pid
);
126 r
= write_string_file(fn
, pid_string
, WRITE_STRING_FILE_DISABLE_BUFFER
|WRITE_STRING_FILE_MKDIR_0755
);
128 log_error_errno(r
, "Failed to move process: %m");
132 fn
= strjoina(tree
, cgroup
);
133 r
= chown_cgroup_path(fn
, uid_shift
);
135 log_error_errno(r
, "Failed to chown() cgroup %s: %m", fn
);
138 (void) umount_verbose(tree
);
/* Splits the cgroup in use into sub-cgroups. A "<cgroup>/payload" group is created with
 * cg_create_and_attach() for `pid`, and a "<cgroup>/supervisor" group is created with PID 0 passed to
 * cg_create_and_attach() (NOTE(review): presumably meaning the calling process — confirm). Afterwards
 * cg_enable_everywhere() is invoked on <cgroup> with the mask obtained from cg_mask_supported(); its
 * result is deliberately ignored. The long comment inside the function gives the rationale.
 *
 * NOTE(review): the conditions selecting between the two cg_pid_get_path() calls (PID 0 vs. `pid`), the
 * use of keep_unit and unified_requested, the guard around the supervisor part and the error-check
 * conditions are not visible in this excerpt. */
144 int create_subcgroup(pid_t pid
, bool keep_unit
, CGroupUnified unified_requested
) {
145 _cleanup_free_
char *cgroup
= NULL
;
146 CGroupMask supported
;
152 /* In the unified hierarchy inner nodes may only contain subgroups, but not processes. Hence, if we are running in
153 * the unified hierarchy and the container does the same, and we did not create a scope unit for the container
154 * move us and the container into two separate subcgroups.
156 * Moreover, container payloads such as systemd try to manage the cgroup they run in in full (i.e. including
157 * its attributes), while the host systemd will only delegate cgroups for children of the cgroup created for a
158 * delegation unit, instead of the cgroup itself. This means, if we'd pass on the cgroup allocated from the
159 * host systemd directly to the payload, the host and payload systemd might fight for the cgroup
160 * attributes. Hence, let's insert an intermediary cgroup to cover that case too.
162 * Note that we only bother with the main hierarchy here, not with any secondary ones. On the unified setup
163 * that's fine because there's only one hierarchy anyway and controllers are enabled directly on it. On the
164 * legacy setup, this is fine too, since delegation of controllers is generally not safe there, hence we won't
167 r
= cg_mask_supported(&supported
);
169 return log_error_errno(r
, "Failed to determine supported controllers: %m");
172 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 0, &cgroup
);
174 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &cgroup
);
176 return log_error_errno(r
, "Failed to get our control group: %m");
178 payload
= strjoina(cgroup
, "/payload");
179 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, payload
, pid
);
181 return log_error_errno(r
, "Failed to create %s subcgroup: %m", payload
);
184 const char *supervisor
;
186 supervisor
= strjoina(cgroup
, "/supervisor");
187 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, supervisor
, 0);
189 return log_error_errno(r
, "Failed to create %s subcgroup: %m", supervisor
);
192 /* Try to enable as many controllers as possible for the new payload. */
193 (void) cg_enable_everywhere(supported
, supported
, cgroup
, NULL
);
197 /* Retrieve existing subsystems. This function is called in a new cgroup
/* Builds a string Set from the contents of /proc/self/cgroup and hands it to the caller through *ret
 * (the local pointer is given up with TAKE_PTR(), so the _cleanup_ handler no longer frees it). Lines are
 * read with read_line() limited to LONG_LINE_MAX, searched for ':' and the resulting field is added with
 * set_put_strdup(). The fields "", "name=systemd" and "name=unified" are tested for with STR_IN_SET()
 * (NOTE(review): presumably they are skipped — the statement following the test is not visible here).
 * The return that follows fopen() maps ENOENT to -ESRCH and any other errno to -errno.
 *
 * NOTE(review): the loop header, the remaining field parsing, all error-check conditions and the final
 * return are not visible in this excerpt. */
200 static int get_process_controllers(Set
**ret
) {
201 _cleanup_set_free_free_ Set
*controllers
= NULL
;
202 _cleanup_fclose_
FILE *f
= NULL
;
207 controllers
= set_new(&string_hash_ops
);
211 f
= fopen("/proc/self/cgroup", "re");
213 return errno
== ENOENT
? -ESRCH
: -errno
;
216 _cleanup_free_
char *line
= NULL
;
219 r
= read_line(f
, LONG_LINE_MAX
, &line
);
225 l
= strchr(line
, ':');
236 if (STR_IN_SET(l
, "", "name=systemd", "name=unified"))
239 r
= set_put_strdup(controllers
, l
);
244 *ret
= TAKE_PTR(controllers
);
/* Mounts a single cgroup hierarchy at <dest>/sys/fs/cgroup/<hierarchy> (a NULL dest is treated as an
 * empty prefix via strempty()). The target is probed with path_is_mount_point(), where -ENOENT is not
 * treated as an error, then created with mkdir_p() (result ignored) and mounted with a file system type
 * and option string selected by `controller`; for SYSTEMD_CGROUP_CONTROLLER_LEGACY the options are
 * "none,name=systemd,xattr". A second mount_verbose() call with MS_BIND|MS_REMOUNT|...|MS_RDONLY makes
 * the bind mount read-only while leaving the superblock writable.
 *
 * NOTE(review): part of the parameter list, the handling of an already-mounted target, the remaining
 * controller cases and the condition guarding the read-only remount are not visible in this excerpt. */
249 static int mount_legacy_cgroup_hierarchy(
251 const char *controller
,
252 const char *hierarchy
,
255 const char *to
, *fstype
, *opts
;
258 to
= strjoina(strempty(dest
), "/sys/fs/cgroup/", hierarchy
);
260 r
= path_is_mount_point(to
, dest
, 0);
261 if (r
< 0 && r
!= -ENOENT
)
262 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
266 (void) mkdir_p(to
, 0755);
268 /* The superblock mount options of the mount point need to be
269 * identical to the host's, and hence writable... */
270 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
)) {
273 } else if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
)) {
275 opts
= "none,name=systemd,xattr";
281 r
= mount_verbose(LOG_ERR
, "cgroup", to
, fstype
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, opts
);
285 /* ... hence let's only make the bind mount read-only, not the superblock. */
287 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
288 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
296 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported.
 *
 * This variant works on the fixed path "/sys/fs/cgroup" and passes "" as destination prefix to
 * mount_legacy_cgroup_hierarchy(). Sequence visible here:
 *   - create the cgroup root and mount a tmpfs on it, with options built by tmpfs_patch_options() using
 *     UID 0 (see the comment in the body for why uid_shift is not used);
 *   - if cg_all_unified() says so, jump to the skip_controllers label;
 *   - otherwise mount each controller returned by get_process_controllers(), passing !userns as the last
 *     argument, and create symlinks (symlink_idempotent()) for the individual names of co-mounted
 *     controllers;
 *   - mount the "unified" hierarchy when unified_requested >= CGROUP_UNIFIED_SYSTEMD, and the "systemd"
 *     hierarchy;
 *   - remount the cgroup root with MS_RDONLY and "mode=755".
 *
 * NOTE(review): part of the parameter list, the loop headers, the skip_controllers label itself and most
 * guarding conditions are not visible in this excerpt — confirm against the complete file. */
297 static int mount_legacy_cgns_supported(
299 CGroupUnified unified_requested
,
303 const char *selinux_apifs_context
) {
305 _cleanup_set_free_free_ Set
*controllers
= NULL
;
306 const char *cgroup_root
= "/sys/fs/cgroup", *c
;
309 (void) mkdir_p(cgroup_root
, 0755);
311 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
312 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
314 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
316 _cleanup_free_
char *options
= NULL
;
318 /* When cgroup namespaces are enabled and user namespaces are
319 * used then the mount of the cgroupfs is done *inside* the new
320 * user namespace. We're root in the new user namespace and the
321 * kernel will happily translate our uid/gid to the correct
322 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
323 * pass uid 0 and not uid_shift to tmpfs_patch_options().
325 r
= tmpfs_patch_options("mode=755", 0, selinux_apifs_context
, &options
);
329 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
330 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
335 r
= cg_all_unified();
339 goto skip_controllers
;
341 r
= get_process_controllers(&controllers
);
343 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
346 _cleanup_free_
const char *controller
= NULL
;
348 controller
= set_steal_first(controllers
);
352 r
= mount_legacy_cgroup_hierarchy("", controller
, controller
, !userns
);
356 /* When multiple hierarchies are co-mounted, make their
357 * constituting individual hierarchies a symlink to the
362 _cleanup_free_
char *target
= NULL
, *tok
= NULL
;
364 r
= extract_first_word(&c
, &tok
, ",", 0);
366 return log_error_errno(r
, "Failed to extract co-mounted cgroup controller: %m");
370 if (streq(controller
, tok
))
373 target
= prefix_root("/sys/fs/cgroup/", tok
);
377 r
= symlink_idempotent(controller
, target
, false);
379 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
381 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
386 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
387 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
392 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
397 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
398 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
403 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported.
 *
 * This variant prefixes all targets with `dest` (cgroup_root is <dest>/sys/fs/cgroup). Sequence visible
 * here:
 *   - create the cgroup root and mount a tmpfs on it, with options built by tmpfs_patch_options() using
 *     uid_shift, or UID_INVALID when uid_shift is 0;
 *   - if cg_all_unified() says so, jump to the skip_controllers label;
 *   - otherwise take each controller from cg_kernel_controllers() and inspect "/sys/fs/cgroup/<controller>"
 *     (not prefixed with dest) with readlink_malloc(): if it is not a symbolic link the controller is
 *     mounted directly; if it is, the link target names a combined hierarchy, which is checked with
 *     filename_is_valid() (invalid names are logged with a warning), mounted, and linked to from
 *     <dest>/sys/fs/cgroup/<controller> via symlink_idempotent();
 *   - mount the "unified" hierarchy when unified_requested >= CGROUP_UNIFIED_SYSTEMD, and the "systemd"
 *     hierarchy;
 *   - remount the cgroup root with MS_RDONLY and "mode=755".
 *
 * NOTE(review): part of the parameter list, the loop header, the skip_controllers label itself and most
 * guarding conditions are not visible in this excerpt — confirm against the complete file. */
404 static int mount_legacy_cgns_unsupported(
406 CGroupUnified unified_requested
,
410 const char *selinux_apifs_context
) {
412 _cleanup_set_free_free_ Set
*controllers
= NULL
;
413 const char *cgroup_root
;
416 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
418 (void) mkdir_p(cgroup_root
, 0755);
420 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
421 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
423 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
425 _cleanup_free_
char *options
= NULL
;
427 r
= tmpfs_patch_options("mode=755", uid_shift
== 0 ? UID_INVALID
: uid_shift
, selinux_apifs_context
, &options
);
431 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
432 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
437 r
= cg_all_unified();
441 goto skip_controllers
;
443 r
= cg_kernel_controllers(&controllers
);
445 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
448 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
450 controller
= set_steal_first(controllers
);
454 origin
= prefix_root("/sys/fs/cgroup/", controller
);
458 r
= readlink_malloc(origin
, &combined
);
460 /* Not a symbolic link, but directly a single cgroup hierarchy */
462 r
= mount_legacy_cgroup_hierarchy(dest
, controller
, controller
, true);
467 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
469 _cleanup_free_
char *target
= NULL
;
471 target
= prefix_root(dest
, origin
);
475 /* A symbolic link, a combination of controllers in one hierarchy */
477 if (!filename_is_valid(combined
)) {
478 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
482 r
= mount_legacy_cgroup_hierarchy(dest
, combined
, combined
, true);
486 r
= symlink_idempotent(combined
, target
, false);
488 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
490 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
495 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
496 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
501 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
505 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
506 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
/* Ensures that <dest>/sys/fs/cgroup carries the unified cgroup hierarchy. The directory is created
 * (mkdir_p(), result ignored) and probed with path_is_mount_point(). The existence of
 * <dest>/sys/fs/cgroup/cgroup.procs, tested with access(F_OK), is used to decide whether what is found
 * there is the unified hierarchy; if it is mounted but is not unified, the function refuses with a
 * synthetic EINVAL. The last statement mounts a "cgroup2" file system on <dest>/sys/fs/cgroup.
 *
 * NOTE(review): the declarations, the branch conditions connecting these steps and the statement taken
 * when cgroup.procs exists are not visible in this excerpt. Note also that `p` is reassigned to the
 * cgroup.procs path before the two log messages and the final mount call visible here; whether it is
 * reset in between cannot be told from this excerpt. */
509 static int mount_unified_cgroups(const char *dest
) {
515 p
= prefix_roota(dest
, "/sys/fs/cgroup");
517 (void) mkdir_p(p
, 0755);
519 r
= path_is_mount_point(p
, dest
, AT_SYMLINK_FOLLOW
);
521 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", p
);
523 p
= prefix_roota(dest
, "/sys/fs/cgroup/cgroup.procs");
524 if (access(p
, F_OK
) >= 0)
527 return log_error_errno(errno
, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p
);
529 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
530 "%s is already mounted but not a unified cgroup hierarchy. Refusing.", p
);
533 return mount_verbose(LOG_ERR
, "cgroup", p
, "cgroup2", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
538 CGroupUnified unified_requested
,
542 const char *selinux_apifs_context
,
/* NOTE(review): the head of this definition (name and remaining parameters) is not visible in this
 * excerpt. The visible body is a dispatcher: unified_requested >= CGROUP_UNIFIED_ALL is handled by
 * mount_unified_cgroups(dest); otherwise either mount_legacy_cgns_supported() or
 * mount_legacy_cgns_unsupported() is called with the same argument list. The condition selecting
 * between those two is not visible either — presumably it reflects cgroup namespace support; confirm
 * against the complete file. */
545 if (unified_requested
>= CGROUP_UNIFIED_ALL
)
546 return mount_unified_cgroups(dest
);
548 return mount_legacy_cgns_supported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
550 return mount_legacy_cgns_unsupported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
/* Leaves the sub-tree `own` writable while turning the hierarchy mounted at `root` read-only: `own` is
 * first bind-mounted onto itself (MS_BIND), then `root` is remounted with
 * MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY. The result of the second mount_verbose()
 * call is returned.
 *
 * NOTE(review): the handling of the first call's result and any preceding checks are not visible in
 * this excerpt. */
553 static int mount_systemd_cgroup_writable_one(const char *root
, const char *own
) {
559 /* Make our own cgroup a (writable) bind mount */
560 r
= mount_verbose(LOG_ERR
, own
, own
, NULL
, MS_BIND
, NULL
);
564 /* And then remount the systemd cgroup root read-only */
565 return mount_verbose(LOG_ERR
, NULL
, root
, NULL
,
566 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
/* Applies mount_systemd_cgroup_writable_one() to the hierarchy (or hierarchies) matching
 * unified_requested, using the caller's own cgroup path as obtained from cg_pid_get_path(NULL, 0, ...).
 * The root/own pairs computed here are:
 *   - unified_requested >= CGROUP_UNIFIED_ALL:     <dest>/sys/fs/cgroup
 *   - unified_requested >= CGROUP_UNIFIED_SYSTEMD: <dest>/sys/fs/cgroup/unified
 *   - and <dest>/sys/fs/cgroup/systemd, whose result is returned by the last statement,
 * with `own` always being the respective root followed by the own cgroup path. When the own cgroup path
 * equals "/", there is nothing to do.
 *
 * NOTE(review): part of the parameter list, the statement taken for the top-level case, the `else`
 * structure between the three cases and the error checks are not visible in this excerpt. */
569 int mount_systemd_cgroup_writable(
571 CGroupUnified unified_requested
) {
573 _cleanup_free_
char *own_cgroup_path
= NULL
;
574 const char *root
, *own
;
579 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
581 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
583 /* If we are living in the top-level, then there's nothing to do... */
584 if (path_equal(own_cgroup_path
, "/"))
587 if (unified_requested
>= CGROUP_UNIFIED_ALL
) {
589 root
= prefix_roota(dest
, "/sys/fs/cgroup");
590 own
= strjoina(root
, own_cgroup_path
);
594 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
595 root
= prefix_roota(dest
, "/sys/fs/cgroup/unified");
596 own
= strjoina(root
, own_cgroup_path
);
598 r
= mount_systemd_cgroup_writable_one(root
, own
);
603 root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
604 own
= strjoina(root
, own_cgroup_path
);
607 return mount_systemd_cgroup_writable_one(root
, own
);