1 /* SPDX-License-Identifier: LGPL-2.1+ */
5 #include "alloc-util.h"
8 #include "format-util.h"
11 #include "mount-util.h"
12 #include "mountpoint-util.h"
13 #include "nspawn-cgroup.h"
14 #include "nspawn-mount.h"
15 #include "path-util.h"
17 #include "string-util.h"
19 #include "user-util.h"
/* Hands ownership of cgroup attribute files inside the directory 'path' to uid_shift.
 *
 * The directory is opened read-only (O_DIRECTORY, close-on-exec) and the attribute files are chown()ed
 * relative to that descriptor, with uid_shift used as both the new owner and the new group. The
 * attribute names visible in this excerpt include "cgroup.clone_children" and
 * "cgroup.subtree_control". A failing fchownat() is only logged ("..., ignoring"): at debug level when
 * the file does not exist (ENOENT), at warning level for any other error. */
22 static int chown_cgroup_path(const char *path
, uid_t uid_shift
) {
23 _cleanup_close_
int fd
= -1;
26 fd
= open(path
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
/* Names of the attribute files to chown, resolved relative to fd below. */
32 "cgroup.clone_children",
37 "cgroup.subtree_control",
/* uid_shift is passed twice: once as owner, once as group. */
41 if (fchownat(fd
, fn
, uid_shift
, uid_shift
, 0) < 0)
/* A missing attribute file is demoted to a debug message; anything else is a warning. */
42 log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
,
43 "Failed to chown \"%s/%s\", ignoring: %m", path
, fn
);
/* Chowns the cgroup of the process 'pid' to uid_shift.
 *
 * Looks up the cgroup path of 'pid' with cg_pid_get_path(), translates it into a file system path in
 * the SYSTEMD_CGROUP_CONTROLLER hierarchy with cg_get_path() and passes that to chown_cgroup_path().
 * If unified_requested is CGROUP_UNIFIED_SYSTEMD, or it is CGROUP_UNIFIED_NONE while
 * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) reports a positive value, the same two steps are
 * repeated for the SYSTEMD_CGROUP_CONTROLLER_LEGACY hierarchy.
 *
 * Every failure path visible here logs at error level and returns the error code of the failing call.
 * NOTE(review): the conditions guarding the error returns and the success return are not visible in
 * this excerpt. */
48 int chown_cgroup(pid_t pid
, CGroupUnified unified_requested
, uid_t uid_shift
) {
49 _cleanup_free_
char *path
= NULL
, *fs
= NULL
;
/* Cgroup path of the process, relative to the hierarchy root. */
52 r
= cg_pid_get_path(NULL
, pid
, &path
);
54 return log_error_errno(r
, "Failed to get container cgroup path: %m");
/* Turn the cgroup path into an absolute file system path. */
56 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
58 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
60 r
= chown_cgroup_path(fs
, uid_shift
);
62 return log_error_errno(r
, "Failed to chown() cgroup %s: %m", fs
);
64 if (unified_requested
== CGROUP_UNIFIED_SYSTEMD
|| (unified_requested
== CGROUP_UNIFIED_NONE
&& cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
) > 0)) {
65 _cleanup_free_
char *lfs
= NULL
;
66 /* Always propagate access rights from unified to legacy controller */
68 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, NULL
, &lfs
);
70 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
72 r
= chown_cgroup_path(lfs
, uid_shift
);
74 return log_error_errno(r
, "Failed to chown() cgroup %s: %m", lfs
);
/* Replicates the cgroup of 'pid' into the other flavour of the systemd cgroup hierarchy when the host
 * setup and the setup requested for the container differ.
 *
 * The function first compares whether the host's systemd hierarchy is unified
 * (cg_unified_controller() > 0) with whether the container asked for at least CGROUP_UNIFIED_SYSTEMD.
 * NOTE(review): the statement executed when both agree is not visible in this excerpt; presumably the
 * function returns early as there is nothing to sync.
 *
 * Otherwise the steps visible here are:
 *   - look up the cgroup path of 'pid' in the SYSTEMD_CGROUP_CONTROLLER hierarchy,
 *   - mount a hierarchy on a temporary directory derived from the "/tmp/unifiedXXXXXX" template: a
 *     cgroup v1 "name=systemd" hierarchy if the host is unified, a cgroup2 mount in the other call,
 *   - remove a possibly left-over cgroup directory of the same path, recreate its parents and write
 *     'pid' into its "cgroup.procs",
 *   - chown the cgroup to uid_shift via chown_cgroup_path(),
 *   - unmount the temporary mount again.
 *
 * Failures to move the process or to chown the cgroup are logged at error level. */
80 int sync_cgroup(pid_t pid
, CGroupUnified unified_requested
, uid_t uid_shift
) {
81 _cleanup_free_
char *cgroup
= NULL
;
/* 'tree' is a mkdtemp()-style template; pid_string is sized to hold the decimal form of a pid. */
82 char tree
[] = "/tmp/unifiedXXXXXX", pid_string
[DECIMAL_STR_MAX(pid
) + 1];
83 bool undo_mount
= false;
85 int r
, unified_controller
;
87 unified_controller
= cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
);
88 if (unified_controller
< 0)
89 return log_error_errno(unified_controller
, "Failed to determine whether the systemd hierarchy is unified: %m");
/* True when host and container agree on unified vs. legacy for the systemd hierarchy. */
90 if ((unified_controller
> 0) == (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
))
93 /* When the host uses the legacy cgroup setup, but the
94 * container shall use the unified hierarchy, let's make sure
95 * we copy the path from the name=systemd hierarchy into the
96 * unified hierarchy. Similar for the reverse situation. */
98 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &cgroup
);
100 return log_error_errno(r
, "Failed to get control group of " PID_FMT
": %m", pid
);
102 /* In order to access the unified hierarchy we need to mount it */
104 return log_error_errno(errno
, "Failed to generate temporary mount point for unified hierarchy: %m");
/* Host is unified: mount the legacy name=systemd hierarchy so the cgroup can be mirrored into it. */
106 if (unified_controller
> 0)
107 r
= mount_verbose(LOG_ERR
, "cgroup", tree
, "cgroup",
108 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "none,name=systemd,xattr");
/* The other case: mount the unified (cgroup2) hierarchy instead. */
110 r
= mount_verbose(LOG_ERR
, "cgroup", tree
, "cgroup2",
111 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
117 /* If nspawn dies abruptly the cgroup hierarchy created below
118 * its unit isn't cleaned up. So, let's remove it
119 * https://github.com/systemd/systemd/pull/4223#issuecomment-252519810 */
120 fn
= strjoina(tree
, cgroup
);
121 (void) rm_rf(fn
, REMOVE_ROOT
|REMOVE_ONLY_DIRECTORIES
);
/* Recreate the cgroup directory (as parent of its cgroup.procs file) and move the process into it. */
123 fn
= strjoina(tree
, cgroup
, "/cgroup.procs");
124 (void) mkdir_parents(fn
, 0755);
126 sprintf(pid_string
, PID_FMT
, pid
);
127 r
= write_string_file(fn
, pid_string
, WRITE_STRING_FILE_DISABLE_BUFFER
);
129 log_error_errno(r
, "Failed to move process: %m");
133 fn
= strjoina(tree
, cgroup
);
134 r
= chown_cgroup_path(fn
, uid_shift
);
136 log_error_errno(r
, "Failed to chown() cgroup %s: %m", fn
);
/* The temporary mount was only needed for the operations above. */
139 (void) umount_verbose(tree
);
/* Inserts intermediary sub-cgroups below the container's cgroup; the long comment inside the function
 * explains why this is needed.
 *
 * Steps visible in this excerpt: determine the mask of supported controllers, look up a cgroup path in
 * the SYSTEMD_CGROUP_CONTROLLER hierarchy (one call uses pid 0, the other uses 'pid'), create
 * "<cgroup>/payload" and attach 'pid' to it, create "<cgroup>/supervisor" and attach pid 0 to it, and
 * finally try to enable all supported controllers on the cgroup (result deliberately ignored).
 *
 * NOTE(review): the conditions that select between the two cg_pid_get_path() calls and that guard the
 * supervisor part are not visible here; presumably they depend on keep_unit. Errors are logged at error
 * level and returned. */
145 int create_subcgroup(pid_t pid
, bool keep_unit
, CGroupUnified unified_requested
) {
146 _cleanup_free_
char *cgroup
= NULL
;
147 CGroupMask supported
;
153 /* In the unified hierarchy inner nodes may only contain subgroups, but not processes. Hence, if we are running in
154 * the unified hierarchy and the container does the same, and we did not create a scope unit for the container
155 * move us and the container into two separate subcgroups.
157 * Moreover, container payloads such as systemd try to manage the cgroup they run in in full (i.e. including
158 * its attributes), while the host systemd will only delegate cgroups for children of the cgroup created for a
159 * delegation unit, instead of the cgroup itself. This means, if we'd pass on the cgroup allocated from the
160 * host systemd directly to the payload, the host and payload systemd might fight for the cgroup
161 * attributes. Hence, let's insert an intermediary cgroup to cover that case too.
163 * Note that we only bother with the main hierarchy here, not with any secondary ones. On the unified setup
164 * that's fine because there's only one hierarchy anyway and controllers are enabled directly on it. On the
165 * legacy setup, this is fine too, since delegation of controllers is generally not safe there, hence we won't
168 r
= cg_mask_supported(&supported
);
170 return log_error_errno(r
, "Failed to determine supported controllers: %m");
173 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 0, &cgroup
);
175 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &cgroup
);
177 return log_error_errno(r
, "Failed to get our control group: %m");
179 payload
= strjoina(cgroup
, "/payload");
180 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, payload
, pid
);
182 return log_error_errno(r
, "Failed to create %s subcgroup: %m", payload
);
185 const char *supervisor
;
187 supervisor
= strjoina(cgroup
, "/supervisor");
188 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, supervisor
, 0);
190 return log_error_errno(r
, "Failed to create %s subcgroup: %m", supervisor
);
193 /* Try to enable as many controllers as possible for the new payload. */
194 (void) cg_enable_everywhere(supported
, supported
, cgroup
, NULL
);
/* get_process_controllers(): builds a set of controller strings from "/proc/self/cgroup".
 *
 * As far as visible in this excerpt: a string set is allocated, "/proc/self/cgroup" is opened for
 * reading (a missing file is reported as -ESRCH, any other open failure as -errno), the file is read
 * line by line with read_line(), each line is searched for a ':' separator, and the extracted field is
 * duplicated into the set with set_put_strdup(). The fields "", "name=systemd" and "name=unified" are
 * matched by a STR_IN_SET() check; NOTE(review): the statement it guards is not visible here,
 * presumably those entries are skipped. On success ownership of the set is handed to the caller
 * through *ret via TAKE_PTR().
 *
 * The original leading comment that follows is truncated in this excerpt and is reproduced as found. */
198 /* Retrieve existing subsystems. This function is called in a new cgroup
201 static int get_process_controllers(Set
**ret
) {
202 _cleanup_set_free_free_ Set
*controllers
= NULL
;
203 _cleanup_fclose_
FILE *f
= NULL
;
208 controllers
= set_new(&string_hash_ops
);
212 f
= fopen("/proc/self/cgroup", "re");
214 return errno
== ENOENT
? -ESRCH
: -errno
;
217 _cleanup_free_
char *line
= NULL
;
220 r
= read_line(f
, LONG_LINE_MAX
, &line
);
226 l
= strchr(line
, ':');
237 if (STR_IN_SET(l
, "", "name=systemd", "name=unified"))
240 r
= set_put_strdup(controllers
, l
);
245 *ret
= TAKE_PTR(controllers
);
/* Mounts a single cgroup hierarchy at "<dest>/sys/fs/cgroup/<hierarchy>".
 *
 * 'dest' may be NULL or empty (it goes through strempty()). The target is first checked with
 * path_is_mount_point(); an error other than -ENOENT from that check is logged and returned. The
 * directory is created (best effort) and then mounted with mount_verbose() using a file system type
 * and options chosen from 'controller': SYSTEMD_CGROUP_CONTROLLER_HYBRID and
 * SYSTEMD_CGROUP_CONTROLLER_LEGACY get dedicated branches (the latter uses the options
 * "none,name=systemd,xattr"). Afterwards a bind remount with MS_RDONLY can make the mount point
 * read-only without touching the superblock options.
 *
 * NOTE(review): parts of the parameter list, the assignments in the other branches and the condition
 * guarding the read-only remount are not visible in this excerpt. */
250 static int mount_legacy_cgroup_hierarchy(
252 const char *controller
,
253 const char *hierarchy
,
256 const char *to
, *fstype
, *opts
;
259 to
= strjoina(strempty(dest
), "/sys/fs/cgroup/", hierarchy
);
261 r
= path_is_mount_point(to
, dest
, 0);
/* A target that does not exist yet (-ENOENT) is not an error, it is created right below. */
262 if (r
< 0 && r
!= -ENOENT
)
263 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
267 (void) mkdir_p(to
, 0755);
269 /* The superblock mount options of the mount point need to be
270 * identical to the host's, and hence writable... */
271 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
)) {
274 } else if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
)) {
276 opts
= "none,name=systemd,xattr";
282 r
= mount_verbose(LOG_ERR
, "cgroup", to
, fstype
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, opts
);
286 /* ... hence let's only make the bind mount read-only, not the superblock. */
288 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
289 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
297 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported.
 *
 * This variant works on the literal path "/sys/fs/cgroup" and passes "" as destination to
 * mount_legacy_cgroup_hierarchy(), i.e. paths are not prefixed with 'dest' here.
 *
 * Steps visible in this excerpt:
 *   - create "/sys/fs/cgroup" (best effort), check whether it is a mount point, and mount a tmpfs
 *     there with options derived from "mode=755" via tmpfs_patch_options() using uid 0,
 *   - query cg_all_unified(); one outcome jumps to skip_controllers (NOTE(review): the condition
 *     is not visible, presumably the fully unified case),
 *   - otherwise obtain the controllers with get_process_controllers() and, for each one taken out
 *     of the set, mount its hierarchy (fourth argument is !userns) and create symlinks for the
 *     individual names of co-mounted (comma-separated) controllers,
 *   - mount either the "unified" (SYSTEMD_CGROUP_CONTROLLER_HYBRID) hierarchy when unified_requested
 *     is at least CGROUP_UNIFIED_SYSTEMD, and the "systemd" (SYSTEMD_CGROUP_CONTROLLER_LEGACY) one,
 *   - finally remount "/sys/fs/cgroup" read-only.
 *
 * Failures are logged at error level and returned. */
298 static int mount_legacy_cgns_supported(
300 CGroupUnified unified_requested
,
304 const char *selinux_apifs_context
) {
306 _cleanup_set_free_free_ Set
*controllers
= NULL
;
307 const char *cgroup_root
= "/sys/fs/cgroup", *c
;
310 (void) mkdir_p(cgroup_root
, 0755);
312 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
313 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
315 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
317 _cleanup_free_
char *options
= NULL
;
319 /* When cgroup namespaces are enabled and user namespaces are
320 * used then the mount of the cgroupfs is done *inside* the new
321 * user namespace. We're root in the new user namespace and the
322 * kernel will happily translate our uid/gid to the correct
323 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
324 * pass uid 0 and not uid_shift to tmpfs_patch_options().
326 r
= tmpfs_patch_options("mode=755", 0, selinux_apifs_context
, &options
);
330 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
331 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
336 r
= cg_all_unified();
340 goto skip_controllers
;
342 r
= get_process_controllers(&controllers
);
344 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
347 _cleanup_free_
const char *controller
= NULL
;
349 controller
= set_steal_first(controllers
);
353 r
= mount_legacy_cgroup_hierarchy("", controller
, controller
, !userns
);
357 /* When multiple hierarchies are co-mounted, make their
358 * constituting individual hierarchies a symlink to the
363 _cleanup_free_
char *target
= NULL
, *tok
= NULL
;
365 r
= extract_first_word(&c
, &tok
, ",", 0);
367 return log_error_errno(r
, "Failed to extract co-mounted cgroup controller: %m");
371 if (streq(controller
, tok
))
374 target
= prefix_root("/sys/fs/cgroup/", tok
);
378 r
= symlink_idempotent(controller
, target
, false);
380 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
382 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
387 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
388 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
393 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
398 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
399 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
404 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported.
 *
 * In contrast to the cgroup-namespace variant, all target paths here are prefixed with 'dest', and the
 * tmpfs is set up for uid_shift (UID_INVALID is passed when uid_shift is 0).
 *
 * Steps visible in this excerpt:
 *   - create "<dest>/sys/fs/cgroup" (best effort), check whether it is a mount point, and mount a
 *     tmpfs there,
 *   - query cg_all_unified(); one outcome jumps to skip_controllers (NOTE(review): the condition is
 *     not visible, presumably the fully unified case),
 *   - otherwise obtain the controllers with cg_kernel_controllers() and, for each one, inspect
 *     "/sys/fs/cgroup/<controller>" with readlink_malloc(): if it is not a symlink the hierarchy
 *     is mounted directly; if it is one, the link target names the combined hierarchy, which is mounted
 *     (after a filename_is_valid() check) and a matching symlink is created below 'dest',
 *   - mount the "unified" hierarchy when unified_requested is at least CGROUP_UNIFIED_SYSTEMD, and the
 *     "systemd" hierarchy,
 *   - finally remount the tmpfs read-only.
 *
 * Failures are logged at error level and returned; an invalid combined name only produces a warning. */
405 static int mount_legacy_cgns_unsupported(
407 CGroupUnified unified_requested
,
411 const char *selinux_apifs_context
) {
413 _cleanup_set_free_free_ Set
*controllers
= NULL
;
414 const char *cgroup_root
;
417 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
419 (void) mkdir_p(cgroup_root
, 0755);
421 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
422 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
424 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
426 _cleanup_free_
char *options
= NULL
;
/* A uid_shift of 0 is mapped to UID_INVALID before being handed to tmpfs_patch_options(). */
428 r
= tmpfs_patch_options("mode=755", uid_shift
== 0 ? UID_INVALID
: uid_shift
, selinux_apifs_context
, &options
);
432 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
433 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
438 r
= cg_all_unified();
442 goto skip_controllers
;
444 r
= cg_kernel_controllers(&controllers
);
446 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
449 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
451 controller
= set_steal_first(controllers
);
/* 'origin' is built without the 'dest' prefix; it is what readlink_malloc() inspects below. */
455 origin
= prefix_root("/sys/fs/cgroup/", controller
);
459 r
= readlink_malloc(origin
, &combined
);
461 /* Not a symbolic link, but directly a single cgroup hierarchy */
463 r
= mount_legacy_cgroup_hierarchy(dest
, controller
, controller
, true);
468 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
470 _cleanup_free_
char *target
= NULL
;
/* 'target' is 'origin' prefixed with 'dest'; it is the path passed to symlink_idempotent() below. */
472 target
= prefix_root(dest
, origin
);
476 /* A symbolic link, a combination of controllers in one hierarchy */
478 if (!filename_is_valid(combined
)) {
479 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
483 r
= mount_legacy_cgroup_hierarchy(dest
, combined
, combined
, true);
487 r
= symlink_idempotent(combined
, target
, false);
489 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
491 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
496 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
497 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
502 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
/* With all hierarchies in place, the tmpfs itself is remounted read-only. */
506 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
507 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
/* Mounts the unified (cgroup2) hierarchy at "<dest>/sys/fs/cgroup".
 *
 * The directory is created (best effort) and checked with path_is_mount_point(). When something is
 * already mounted there, the presence of "<dest>/sys/fs/cgroup/cgroup.procs" is probed with access()
 * to tell whether it is a unified hierarchy; if it is not, the function refuses with EINVAL, and a
 * failing probe is reported with its errno. Otherwise a "cgroup2" file system is mounted there
 * with MS_NOSUID|MS_NOEXEC|MS_NODEV.
 *
 * NOTE(review): the branch structure around the probe, including what is returned when the probe
 * succeeds, is not visible in this excerpt. */
510 static int mount_unified_cgroups(const char *dest
) {
516 p
= prefix_roota(dest
, "/sys/fs/cgroup");
518 (void) mkdir_p(p
, 0755);
520 r
= path_is_mount_point(p
, dest
, AT_SYMLINK_FOLLOW
);
522 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", p
);
/* cgroup.procs at the top level is used as the marker for a unified hierarchy. */
524 p
= prefix_roota(dest
, "/sys/fs/cgroup/cgroup.procs");
525 if (access(p
, F_OK
) >= 0)
528 return log_error_errno(errno
, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p
);
530 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
531 "%s is already mounted but not a unified cgroup hierarchy. Refusing.", p
);
534 return mount_verbose(LOG_ERR
, "cgroup", p
, "cgroup2", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
/* NOTE(review): the opening line of this function's signature (and some of its parameters) is not
 * visible in this excerpt; the description below covers only what can be seen.
 *
 * Dispatches to one of the three cgroup mount strategies: mount_unified_cgroups() when
 * unified_requested is at least CGROUP_UNIFIED_ALL, otherwise mount_legacy_cgns_supported() or
 * mount_legacy_cgns_unsupported(), both of which receive the same argument list. The condition
 * that chooses between the two legacy variants is not visible here; presumably it reflects
 * whether cgroup namespaces are in use. The selected helper's return value is returned as is. */
539 CGroupUnified unified_requested
,
543 const char *selinux_apifs_context
,
546 if (unified_requested
>= CGROUP_UNIFIED_ALL
)
547 return mount_unified_cgroups(dest
);
549 return mount_legacy_cgns_supported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
551 return mount_legacy_cgns_unsupported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
/* Leaves the cgroup directory 'own' writable while making the hierarchy mounted at 'root' read-only.
 *
 * 'own' is first bind mounted onto itself so that it becomes a mount point of its own; afterwards the
 * mount at 'root' is bind-remounted with MS_RDONLY (plus MS_NOSUID|MS_NOEXEC|MS_NODEV). The result
 * of that final remount is returned. NOTE(review): the error check after the first mount is not visible
 * in this excerpt. */
554 static int mount_systemd_cgroup_writable_one(const char *root
, const char *own
) {
560 /* Make our own cgroup a (writable) bind mount */
561 r
= mount_verbose(LOG_ERR
, own
, own
, NULL
, MS_BIND
, NULL
);
565 /* And then remount the systemd cgroup root read-only */
566 return mount_verbose(LOG_ERR
, NULL
, root
, NULL
,
567 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
/* Applies mount_systemd_cgroup_writable_one() to the hierarchy (or hierarchies) below 'dest' that
 * match the requested cgroup setup, using our own cgroup path as the writable subtree.
 *
 * Our own cgroup path is obtained with cg_pid_get_path(NULL, 0, ...); a path equal to "/" is
 * special-cased as "nothing to do". The hierarchy roots visible in this excerpt are:
 *   - "<dest>/sys/fs/cgroup" when unified_requested is at least CGROUP_UNIFIED_ALL,
 *   - "<dest>/sys/fs/cgroup/unified" when unified_requested is at least CGROUP_UNIFIED_SYSTEMD; this
 *     one is processed right away,
 *   - "<dest>/sys/fs/cgroup/systemd".
 * The last selected root is processed by the final return statement.
 *
 * NOTE(review): the else/brace structure connecting these branches is not visible in this
 * excerpt. */
570 int mount_systemd_cgroup_writable(
572 CGroupUnified unified_requested
) {
574 _cleanup_free_
char *own_cgroup_path
= NULL
;
575 const char *root
, *own
;
580 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
582 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
584 /* If we are living in the top-level, then there's nothing to do... */
585 if (path_equal(own_cgroup_path
, "/"))
588 if (unified_requested
>= CGROUP_UNIFIED_ALL
) {
590 root
= prefix_roota(dest
, "/sys/fs/cgroup");
591 own
= strjoina(root
, own_cgroup_path
);
595 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
596 root
= prefix_roota(dest
, "/sys/fs/cgroup/unified");
597 own
= strjoina(root
, own_cgroup_path
);
599 r
= mount_systemd_cgroup_writable_one(root
, own
);
604 root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
605 own
= strjoina(root
, own_cgroup_path
);
608 return mount_systemd_cgroup_writable_one(root
, own
);