1 /* SPDX-License-Identifier: LGPL-2.1+ */
5 #include "alloc-util.h"
10 #include "mount-util.h"
11 #include "nspawn-cgroup.h"
12 #include "nspawn-mount.h"
13 #include "path-util.h"
15 #include "string-util.h"
17 #include "user-util.h"
20 static int chown_cgroup_path(const char *path
, uid_t uid_shift
) {
21 _cleanup_close_
int fd
= -1;
24 fd
= open(path
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
30 "cgroup.clone_children",
35 "cgroup.subtree_control",
39 if (fchownat(fd
, fn
, uid_shift
, uid_shift
, 0) < 0)
40 log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
,
41 "Failed to chown \"%s/%s\", ignoring: %m", path
, fn
);
46 int chown_cgroup(pid_t pid
, CGroupUnified unified_requested
, uid_t uid_shift
) {
47 _cleanup_free_
char *path
= NULL
, *fs
= NULL
;
50 r
= cg_pid_get_path(NULL
, pid
, &path
);
52 return log_error_errno(r
, "Failed to get container cgroup path: %m");
54 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
56 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
58 r
= chown_cgroup_path(fs
, uid_shift
);
60 return log_error_errno(r
, "Failed to chown() cgroup %s: %m", fs
);
62 if (unified_requested
== CGROUP_UNIFIED_SYSTEMD
|| (unified_requested
== CGROUP_UNIFIED_NONE
&& cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
) > 0)) {
63 _cleanup_free_
char *lfs
= NULL
;
64 /* Always propagate access rights from unified to legacy controller */
66 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, NULL
, &lfs
);
68 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
70 r
= chown_cgroup_path(lfs
, uid_shift
);
72 return log_error_errno(r
, "Failed to chown() cgroup %s: %m", lfs
);
78 int sync_cgroup(pid_t pid
, CGroupUnified unified_requested
, uid_t uid_shift
) {
79 _cleanup_free_
char *cgroup
= NULL
;
80 char tree
[] = "/tmp/unifiedXXXXXX", pid_string
[DECIMAL_STR_MAX(pid
) + 1];
81 bool undo_mount
= false;
83 int r
, unified_controller
;
85 unified_controller
= cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
);
86 if (unified_controller
< 0)
87 return log_error_errno(unified_controller
, "Failed to determine whether the systemd hierarchy is unified: %m");
88 if ((unified_controller
> 0) == (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
))
91 /* When the host uses the legacy cgroup setup, but the
92 * container shall use the unified hierarchy, let's make sure
93 * we copy the path from the name=systemd hierarchy into the
94 * unified hierarchy. Similar for the reverse situation. */
96 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &cgroup
);
98 return log_error_errno(r
, "Failed to get control group of " PID_FMT
": %m", pid
);
100 /* In order to access the unified hierarchy we need to mount it */
102 return log_error_errno(errno
, "Failed to generate temporary mount point for unified hierarchy: %m");
104 if (unified_controller
> 0)
105 r
= mount_verbose(LOG_ERR
, "cgroup", tree
, "cgroup",
106 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "none,name=systemd,xattr");
108 r
= mount_verbose(LOG_ERR
, "cgroup", tree
, "cgroup2",
109 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
115 /* If nspawn dies abruptly the cgroup hierarchy created below
116 * its unit isn't cleaned up. So, let's remove it
117 * https://github.com/systemd/systemd/pull/4223#issuecomment-252519810 */
118 fn
= strjoina(tree
, cgroup
);
119 (void) rm_rf(fn
, REMOVE_ROOT
|REMOVE_ONLY_DIRECTORIES
);
121 fn
= strjoina(tree
, cgroup
, "/cgroup.procs");
122 (void) mkdir_parents(fn
, 0755);
124 sprintf(pid_string
, PID_FMT
, pid
);
125 r
= write_string_file(fn
, pid_string
, WRITE_STRING_FILE_DISABLE_BUFFER
);
127 log_error_errno(r
, "Failed to move process: %m");
131 fn
= strjoina(tree
, cgroup
);
132 r
= chown_cgroup_path(fn
, uid_shift
);
134 log_error_errno(r
, "Failed to chown() cgroup %s: %m", fn
);
137 (void) umount_verbose(tree
);
143 int create_subcgroup(pid_t pid
, bool keep_unit
, CGroupUnified unified_requested
) {
144 _cleanup_free_
char *cgroup
= NULL
;
145 CGroupMask supported
;
151 /* In the unified hierarchy inner nodes may only contain subgroups, but not processes. Hence, if we are running in
152 * the unified hierarchy and the container does the same, and we did not create a scope unit for the container,
153 * move us and the container into two separate subcgroups.
155 * Moreover, container payloads such as systemd try to manage the cgroup they run in in full (i.e. including
156 * its attributes), while the host systemd will only delegate cgroups for children of the cgroup created for a
157 * delegation unit, instead of the cgroup itself. This means, if we'd pass on the cgroup allocated from the
158 * host systemd directly to the payload, the host and payload systemd might fight for the cgroup
159 * attributes. Hence, let's insert an intermediary cgroup to cover that case too.
161 * Note that we only bother with the main hierarchy here, not with any secondary ones. On the unified setup
162 * that's fine because there's only one hierarchy anyway and controllers are enabled directly on it. On the
163 * legacy setup, this is fine too, since delegation of controllers is generally not safe there, hence we won't
166 r
= cg_mask_supported(&supported
);
168 return log_error_errno(r
, "Failed to determine supported controllers: %m");
171 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 0, &cgroup
);
173 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &cgroup
);
175 return log_error_errno(r
, "Failed to get our control group: %m");
177 payload
= strjoina(cgroup
, "/payload");
178 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, payload
, pid
);
180 return log_error_errno(r
, "Failed to create %s subcgroup: %m", payload
);
183 const char *supervisor
;
185 supervisor
= strjoina(cgroup
, "/supervisor");
186 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, supervisor
, 0);
188 return log_error_errno(r
, "Failed to create %s subcgroup: %m", supervisor
);
191 /* Try to enable as many controllers as possible for the new payload. */
192 (void) cg_enable_everywhere(supported
, supported
, cgroup
);
196 /* Retrieve existing subsystems. This function is called in a new cgroup
199 static int get_process_controllers(Set
**ret
) {
200 _cleanup_set_free_free_ Set
*controllers
= NULL
;
201 _cleanup_fclose_
FILE *f
= NULL
;
206 controllers
= set_new(&string_hash_ops
);
210 f
= fopen("/proc/self/cgroup", "re");
212 return errno
== ENOENT
? -ESRCH
: -errno
;
215 _cleanup_free_
char *line
= NULL
;
218 r
= read_line(f
, LONG_LINE_MAX
, &line
);
224 l
= strchr(line
, ':');
235 if (STR_IN_SET(l
, "", "name=systemd", "name=unified"))
238 r
= set_put_strdup(controllers
, l
);
243 *ret
= TAKE_PTR(controllers
);
248 static int mount_legacy_cgroup_hierarchy(
250 const char *controller
,
251 const char *hierarchy
,
254 const char *to
, *fstype
, *opts
;
257 to
= strjoina(strempty(dest
), "/sys/fs/cgroup/", hierarchy
);
259 r
= path_is_mount_point(to
, dest
, 0);
260 if (r
< 0 && r
!= -ENOENT
)
261 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
267 /* The superblock mount options of the mount point need to be
268 * identical to the host's, and hence writable... */
269 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
)) {
272 } else if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
)) {
274 opts
= "none,name=systemd,xattr";
280 r
= mount_verbose(LOG_ERR
, "cgroup", to
, fstype
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, opts
);
284 /* ... hence let's only make the bind mount read-only, not the superblock. */
286 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
287 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
295 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
296 static int mount_legacy_cgns_supported(
298 CGroupUnified unified_requested
,
302 const char *selinux_apifs_context
) {
304 _cleanup_set_free_free_ Set
*controllers
= NULL
;
305 const char *cgroup_root
= "/sys/fs/cgroup", *c
;
308 (void) mkdir_p(cgroup_root
, 0755);
310 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
311 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
313 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
315 _cleanup_free_
char *options
= NULL
;
317 /* When cgroup namespaces are enabled and user namespaces are
318 * used then the mount of the cgroupfs is done *inside* the new
319 * user namespace. We're root in the new user namespace and the
320 * kernel will happily translate our uid/gid to the correct
321 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
322 * pass uid 0 and not uid_shift to tmpfs_patch_options().
324 r
= tmpfs_patch_options("mode=755", 0, selinux_apifs_context
, &options
);
328 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
329 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
334 r
= cg_all_unified();
338 goto skip_controllers
;
340 r
= get_process_controllers(&controllers
);
342 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
345 _cleanup_free_
const char *controller
= NULL
;
347 controller
= set_steal_first(controllers
);
351 r
= mount_legacy_cgroup_hierarchy("", controller
, controller
, !userns
);
355 /* When multiple hierarchies are co-mounted, make their
356 * constituting individual hierarchies a symlink to the
361 _cleanup_free_
char *target
= NULL
, *tok
= NULL
;
363 r
= extract_first_word(&c
, &tok
, ",", 0);
365 return log_error_errno(r
, "Failed to extract co-mounted cgroup controller: %m");
369 if (streq(controller
, tok
))
372 target
= prefix_root("/sys/fs/cgroup/", tok
);
376 r
= symlink_idempotent(controller
, target
, false);
378 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
380 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
385 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
386 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
391 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
396 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
397 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
402 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
403 static int mount_legacy_cgns_unsupported(
405 CGroupUnified unified_requested
,
409 const char *selinux_apifs_context
) {
411 _cleanup_set_free_free_ Set
*controllers
= NULL
;
412 const char *cgroup_root
;
415 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
417 (void) mkdir_p(cgroup_root
, 0755);
419 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
420 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
422 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
424 _cleanup_free_
char *options
= NULL
;
426 r
= tmpfs_patch_options("mode=755", uid_shift
== 0 ? UID_INVALID
: uid_shift
, selinux_apifs_context
, &options
);
430 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
431 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
436 r
= cg_all_unified();
440 goto skip_controllers
;
442 r
= cg_kernel_controllers(&controllers
);
444 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
447 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
449 controller
= set_steal_first(controllers
);
453 origin
= prefix_root("/sys/fs/cgroup/", controller
);
457 r
= readlink_malloc(origin
, &combined
);
459 /* Not a symbolic link, but directly a single cgroup hierarchy */
461 r
= mount_legacy_cgroup_hierarchy(dest
, controller
, controller
, true);
466 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
468 _cleanup_free_
char *target
= NULL
;
470 target
= prefix_root(dest
, origin
);
474 /* A symbolic link, a combination of controllers in one hierarchy */
476 if (!filename_is_valid(combined
)) {
477 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
481 r
= mount_legacy_cgroup_hierarchy(dest
, combined
, combined
, true);
485 r
= symlink_idempotent(combined
, target
, false);
487 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
489 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
494 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
495 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
500 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
504 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
505 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
508 static int mount_unified_cgroups(const char *dest
) {
514 p
= prefix_roota(dest
, "/sys/fs/cgroup");
516 (void) mkdir_p(p
, 0755);
518 r
= path_is_mount_point(p
, dest
, AT_SYMLINK_FOLLOW
);
520 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", p
);
522 p
= prefix_roota(dest
, "/sys/fs/cgroup/cgroup.procs");
523 if (access(p
, F_OK
) >= 0)
526 return log_error_errno(errno
, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p
);
528 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p
);
532 return mount_verbose(LOG_ERR
, "cgroup", p
, "cgroup2", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
537 CGroupUnified unified_requested
,
541 const char *selinux_apifs_context
,
544 if (unified_requested
>= CGROUP_UNIFIED_ALL
)
545 return mount_unified_cgroups(dest
);
547 return mount_legacy_cgns_supported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
549 return mount_legacy_cgns_unsupported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
552 static int mount_systemd_cgroup_writable_one(const char *root
, const char *own
) {
558 /* Make our own cgroup a (writable) bind mount */
559 r
= mount_verbose(LOG_ERR
, own
, own
, NULL
, MS_BIND
, NULL
);
563 /* And then remount the systemd cgroup root read-only */
564 return mount_verbose(LOG_ERR
, NULL
, root
, NULL
,
565 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
568 int mount_systemd_cgroup_writable(
570 CGroupUnified unified_requested
) {
572 _cleanup_free_
char *own_cgroup_path
= NULL
;
573 const char *root
, *own
;
578 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
580 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
582 /* If we are living in the top-level, then there's nothing to do... */
583 if (path_equal(own_cgroup_path
, "/"))
586 if (unified_requested
>= CGROUP_UNIFIED_ALL
) {
588 root
= prefix_roota(dest
, "/sys/fs/cgroup");
589 own
= strjoina(root
, own_cgroup_path
);
593 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
594 root
= prefix_roota(dest
, "/sys/fs/cgroup/unified");
595 own
= strjoina(root
, own_cgroup_path
);
597 r
= mount_systemd_cgroup_writable_one(root
, own
);
602 root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
603 own
= strjoina(root
, own_cgroup_path
);
606 return mount_systemd_cgroup_writable_one(root
, own
);