2 This file is part of systemd.
4 Copyright 2015 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 #include <sys/mount.h>
21 #include <linux/magic.h>
23 #include "alloc-util.h"
30 #include "mount-util.h"
31 #include "nspawn-mount.h"
32 #include "parse-util.h"
33 #include "path-util.h"
36 #include "stat-util.h"
37 #include "string-util.h"
39 #include "user-util.h"
42 CustomMount
* custom_mount_add(CustomMount
**l
, unsigned *n
, CustomMountType t
) {
48 assert(t
< _CUSTOM_MOUNT_TYPE_MAX
);
50 c
= realloc_multiply(*l
, (*n
+ 1), sizeof(CustomMount
));
58 *ret
= (CustomMount
) { .type
= t
};
63 void custom_mount_free_all(CustomMount
*l
, unsigned n
) {
66 for (i
= 0; i
< n
; i
++) {
67 CustomMount
*m
= l
+ i
;
74 (void) rm_rf(m
->work_dir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
84 int custom_mount_compare(const void *a
, const void *b
) {
85 const CustomMount
*x
= a
, *y
= b
;
88 r
= path_compare(x
->destination
, y
->destination
);
92 if (x
->type
< y
->type
)
94 if (x
->type
> y
->type
)
100 int bind_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
, bool read_only
) {
101 _cleanup_free_
char *source
= NULL
, *destination
= NULL
, *opts
= NULL
;
109 r
= extract_many_words(&p
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
, &source
, &destination
, NULL
);
116 destination
= strdup(source
);
121 if (r
== 2 && !isempty(p
)) {
127 if (!path_is_absolute(source
))
130 if (!path_is_absolute(destination
))
133 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_BIND
);
138 m
->destination
= destination
;
139 m
->read_only
= read_only
;
142 source
= destination
= opts
= NULL
;
146 int tmpfs_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
) {
147 _cleanup_free_
char *path
= NULL
, *opts
= NULL
;
156 r
= extract_first_word(&p
, &path
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
163 opts
= strdup("mode=0755");
169 if (!path_is_absolute(path
))
172 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_TMPFS
);
176 m
->destination
= path
;
183 static int tmpfs_patch_options(
186 uid_t uid_shift
, uid_t uid_range
,
188 const char *selinux_apifs_context
,
193 if ((userns
&& uid_shift
!= 0) || patch_ids
) {
194 assert(uid_shift
!= UID_INVALID
);
196 if (asprintf(&buf
, "%s%suid=" UID_FMT
",gid=" UID_FMT
,
197 options
?: "", options
? "," : "",
198 uid_shift
, uid_shift
) < 0)
205 if (selinux_apifs_context
) {
208 t
= strjoin(options
?: "", options
? "," : "",
209 "context=\"", selinux_apifs_context
, "\"");
218 if (!buf
&& options
) {
219 buf
= strdup(options
);
228 int mount_sysfs(const char *dest
, MountSettingsMask mount_settings
) {
229 const char *full
, *top
, *x
;
231 unsigned long extra_flags
= 0;
233 top
= prefix_roota(dest
, "/sys");
234 r
= path_check_fstype(top
, SYSFS_MAGIC
);
236 return log_error_errno(r
, "Failed to determine filesystem type of %s: %m", top
);
237 /* /sys might already be mounted as sysfs by the outer child in the
238 * !netns case. In this case, it's all good. Don't touch it because we
239 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
244 full
= prefix_roota(top
, "/full");
246 (void) mkdir(full
, 0755);
248 if (mount_settings
& MOUNT_APPLY_APIVFS_RO
)
249 extra_flags
|= MS_RDONLY
;
251 r
= mount_verbose(LOG_ERR
, "sysfs", full
, "sysfs",
252 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|extra_flags
, NULL
);
256 FOREACH_STRING(x
, "block", "bus", "class", "dev", "devices", "kernel") {
257 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
259 from
= prefix_root(full
, x
);
263 to
= prefix_root(top
, x
);
267 (void) mkdir(to
, 0755);
269 r
= mount_verbose(LOG_ERR
, from
, to
, NULL
, MS_BIND
, NULL
);
273 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
274 MS_BIND
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
|extra_flags
, NULL
);
279 r
= umount_verbose(full
);
284 return log_error_errno(errno
, "Failed to remove %s: %m", full
);
286 x
= prefix_roota(top
, "/fs/kdbus");
287 (void) mkdir_p(x
, 0755);
289 /* Create mountpoint for cgroups. Otherwise we are not allowed since we
290 * remount /sys read-only.
292 if (cg_ns_supported()) {
293 x
= prefix_roota(top
, "/fs/cgroup");
294 (void) mkdir_p(x
, 0755);
297 return mount_verbose(LOG_ERR
, NULL
, top
, NULL
,
298 MS_BIND
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
|extra_flags
, NULL
);
301 static int mkdir_userns(const char *path
, mode_t mode
, MountSettingsMask mask
, uid_t uid_shift
) {
306 r
= mkdir(path
, mode
);
307 if (r
< 0 && errno
!= EEXIST
)
310 if ((mask
& MOUNT_USE_USERNS
) == 0)
313 if (mask
& MOUNT_IN_USERNS
)
316 r
= lchown(path
, uid_shift
, uid_shift
);
323 static int mkdir_userns_p(const char *prefix
, const char *path
, mode_t mode
, MountSettingsMask mask
, uid_t uid_shift
) {
329 if (prefix
&& !path_startswith(path
, prefix
))
332 /* create every parent directory in the path, except the last component */
333 p
= path
+ strspn(path
, "/");
335 char t
[strlen(path
) + 1];
337 e
= p
+ strcspn(p
, "/");
338 p
= e
+ strspn(e
, "/");
340 /* Is this the last component? If so, then we're done */
344 memcpy(t
, path
, e
- path
);
347 if (prefix
&& path_startswith(prefix
, t
))
350 r
= mkdir_userns(t
, mode
, mask
, uid_shift
);
355 return mkdir_userns(path
, mode
, mask
, uid_shift
);
358 int mount_all(const char *dest
,
359 MountSettingsMask mount_settings
,
360 uid_t uid_shift
, uid_t uid_range
,
361 const char *selinux_apifs_context
) {
363 typedef struct MountPoint
{
369 MountSettingsMask mount_settings
;
372 static const MountPoint mount_table
[] = {
373 /* inner child mounts */
374 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
|MOUNT_IN_USERNS
},
375 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
, MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* Bind mount first ...*/
376 { "/proc/sys/net", "/proc/sys/net", NULL
, NULL
, MS_BIND
, MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
|MOUNT_APPLY_APIVFS_NETNS
}, /* (except for this) */
377 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* ... then, make it r/o */
378 { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL
, NULL
, MS_BIND
, MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* Bind mount first ...*/
379 { NULL
, "/proc/sysrq-trigger", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* ... then, make it r/o */
380 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME
, MOUNT_FATAL
|MOUNT_IN_USERNS
},
382 /* outer child mounts */
383 { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
|MOUNT_APPLY_APIVFS_NETNS
},
384 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
|MOUNT_APPLY_APIVFS_RO
}, /* skipped if above was mounted */
385 { "sysfs", "/sys", "sysfs", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
}, /* skipped if above was mounted */
387 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, MOUNT_FATAL
},
388 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, MOUNT_FATAL
},
389 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, MOUNT_FATAL
},
391 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
, 0 }, /* Bind mount first */
392 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, 0 }, /* Then, make it r/o */
398 bool use_userns
= (mount_settings
& MOUNT_USE_USERNS
);
399 bool netns
= (mount_settings
& MOUNT_APPLY_APIVFS_NETNS
);
400 bool ro
= (mount_settings
& MOUNT_APPLY_APIVFS_RO
);
401 bool in_userns
= (mount_settings
& MOUNT_IN_USERNS
);
403 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
404 _cleanup_free_
char *where
= NULL
, *options
= NULL
;
406 bool fatal
= (mount_table
[k
].mount_settings
& MOUNT_FATAL
);
408 if (in_userns
!= (bool)(mount_table
[k
].mount_settings
& MOUNT_IN_USERNS
))
411 if (!netns
&& (bool)(mount_table
[k
].mount_settings
& MOUNT_APPLY_APIVFS_NETNS
))
414 if (!ro
&& (bool)(mount_table
[k
].mount_settings
& MOUNT_APPLY_APIVFS_RO
))
417 where
= prefix_root(dest
, mount_table
[k
].where
);
421 r
= path_is_mount_point(where
, dest
, AT_SYMLINK_FOLLOW
);
422 if (r
< 0 && r
!= -ENOENT
)
423 return log_error_errno(r
, "Failed to detect whether %s is a mount point: %m", where
);
425 /* Skip this entry if it is not a remount. */
426 if (mount_table
[k
].what
&& r
> 0)
429 r
= mkdir_userns_p(dest
, where
, 0755, mount_settings
, uid_shift
);
430 if (r
< 0 && r
!= -EEXIST
) {
432 return log_error_errno(r
, "Failed to create directory %s: %m", where
);
434 log_debug_errno(r
, "Failed to create directory %s: %m", where
);
438 o
= mount_table
[k
].options
;
439 if (streq_ptr(mount_table
[k
].type
, "tmpfs")) {
441 r
= tmpfs_patch_options(o
, use_userns
, 0, uid_range
, true, selinux_apifs_context
, &options
);
443 r
= tmpfs_patch_options(o
, use_userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &options
);
450 r
= mount_verbose(fatal
? LOG_ERR
: LOG_DEBUG
,
454 mount_table
[k
].flags
,
463 static int parse_mount_bind_options(const char *options
, unsigned long *mount_flags
, char **mount_opts
) {
464 const char *p
= options
;
465 unsigned long flags
= *mount_flags
;
472 _cleanup_free_
char *word
= NULL
;
474 r
= extract_first_word(&p
, &word
, ",", 0);
476 return log_error_errno(r
, "Failed to extract mount option: %m");
480 if (streq(word
, "rbind"))
482 else if (streq(word
, "norbind"))
485 log_error("Invalid bind mount option: %s", word
);
490 *mount_flags
= flags
;
491 /* in the future mount_opts will hold string options for mount(2) */
497 static int mount_bind(const char *dest
, CustomMount
*m
) {
498 struct stat source_st
, dest_st
;
500 unsigned long mount_flags
= MS_BIND
| MS_REC
;
501 _cleanup_free_
char *mount_opts
= NULL
;
507 r
= parse_mount_bind_options(m
->options
, &mount_flags
, &mount_opts
);
512 if (stat(m
->source
, &source_st
) < 0)
513 return log_error_errno(errno
, "Failed to stat %s: %m", m
->source
);
515 where
= prefix_roota(dest
, m
->destination
);
517 if (stat(where
, &dest_st
) >= 0) {
518 if (S_ISDIR(source_st
.st_mode
) && !S_ISDIR(dest_st
.st_mode
)) {
519 log_error("Cannot bind mount directory %s on file %s.", m
->source
, where
);
523 if (!S_ISDIR(source_st
.st_mode
) && S_ISDIR(dest_st
.st_mode
)) {
524 log_error("Cannot bind mount file %s on directory %s.", m
->source
, where
);
528 } else if (errno
== ENOENT
) {
529 r
= mkdir_parents_label(where
, 0755);
531 return log_error_errno(r
, "Failed to make parents of %s: %m", where
);
533 /* Create the mount point. Any non-directory file can be
534 * mounted on any non-directory file (regular, fifo, socket,
537 if (S_ISDIR(source_st
.st_mode
))
538 r
= mkdir_label(where
, 0755);
542 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
545 return log_error_errno(errno
, "Failed to stat %s: %m", where
);
547 r
= mount_verbose(LOG_ERR
, m
->source
, where
, NULL
, mount_flags
, mount_opts
);
552 r
= bind_remount_recursive(where
, true, NULL
);
554 return log_error_errno(r
, "Read-only bind mount failed: %m");
560 static int mount_tmpfs(
563 bool userns
, uid_t uid_shift
, uid_t uid_range
,
564 const char *selinux_apifs_context
) {
566 const char *where
, *options
;
567 _cleanup_free_
char *buf
= NULL
;
573 where
= prefix_roota(dest
, m
->destination
);
575 r
= mkdir_p_label(where
, 0755);
576 if (r
< 0 && r
!= -EEXIST
)
577 return log_error_errno(r
, "Creating mount point for tmpfs %s failed: %m", where
);
579 r
= tmpfs_patch_options(m
->options
, userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &buf
);
582 options
= r
> 0 ? buf
: m
->options
;
584 return mount_verbose(LOG_ERR
, "tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, options
);
587 static char *joined_and_escaped_lower_dirs(char * const *lower
) {
588 _cleanup_strv_free_
char **sv
= NULL
;
590 sv
= strv_copy(lower
);
596 if (!strv_shell_escape(sv
, ",:"))
599 return strv_join(sv
, ":");
602 static int mount_overlay(const char *dest
, CustomMount
*m
) {
603 _cleanup_free_
char *lower
= NULL
;
604 const char *where
, *options
;
610 where
= prefix_roota(dest
, m
->destination
);
612 r
= mkdir_label(where
, 0755);
613 if (r
< 0 && r
!= -EEXIST
)
614 return log_error_errno(r
, "Creating mount point for overlay %s failed: %m", where
);
616 (void) mkdir_p_label(m
->source
, 0755);
618 lower
= joined_and_escaped_lower_dirs(m
->lower
);
623 _cleanup_free_
char *escaped_source
= NULL
;
625 escaped_source
= shell_escape(m
->source
, ",:");
629 options
= strjoina("lowerdir=", escaped_source
, ":", lower
);
631 _cleanup_free_
char *escaped_source
= NULL
, *escaped_work_dir
= NULL
;
634 (void) mkdir_label(m
->work_dir
, 0700);
636 escaped_source
= shell_escape(m
->source
, ",:");
639 escaped_work_dir
= shell_escape(m
->work_dir
, ",:");
640 if (!escaped_work_dir
)
643 options
= strjoina("lowerdir=", lower
, ",upperdir=", escaped_source
, ",workdir=", escaped_work_dir
);
646 return mount_verbose(LOG_ERR
, "overlay", where
, "overlay", m
->read_only
? MS_RDONLY
: 0, options
);
651 CustomMount
*mounts
, unsigned n
,
652 bool userns
, uid_t uid_shift
, uid_t uid_range
,
653 const char *selinux_apifs_context
) {
660 for (i
= 0; i
< n
; i
++) {
661 CustomMount
*m
= mounts
+ i
;
665 case CUSTOM_MOUNT_BIND
:
666 r
= mount_bind(dest
, m
);
669 case CUSTOM_MOUNT_TMPFS
:
670 r
= mount_tmpfs(dest
, m
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
673 case CUSTOM_MOUNT_OVERLAY
:
674 r
= mount_overlay(dest
, m
);
678 assert_not_reached("Unknown custom mount type");
688 /* Retrieve existing subsystems. This function is called in a new cgroup
691 static int get_controllers(Set
*subsystems
) {
692 _cleanup_fclose_
FILE *f
= NULL
;
697 f
= fopen("/proc/self/cgroup", "re");
699 return errno
== ENOENT
? -ESRCH
: -errno
;
701 FOREACH_LINE(line
, f
, return -errno
) {
705 l
= strchr(line
, ':');
716 if (STR_IN_SET(l
, "", "name=systemd"))
723 r
= set_consume(subsystems
, p
);
731 static int mount_legacy_cgroup_hierarchy(
733 const char *controller
,
734 const char *hierarchy
,
735 CGroupUnified unified_requested
,
738 const char *to
, *fstype
, *opts
;
741 to
= strjoina(strempty(dest
), "/sys/fs/cgroup/", hierarchy
);
743 r
= path_is_mount_point(to
, dest
, 0);
744 if (r
< 0 && r
!= -ENOENT
)
745 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
751 /* The superblock mount options of the mount point need to be
752 * identical to the hosts', and hence writable... */
753 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
754 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
759 opts
= "none,name=systemd,xattr";
766 r
= mount_verbose(LOG_ERR
, "cgroup", to
, fstype
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, opts
);
770 /* ... hence let's only make the bind mount read-only, not the superblock. */
772 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
773 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
781 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
782 static int mount_legacy_cgns_supported(
784 CGroupUnified unified_requested
,
788 const char *selinux_apifs_context
) {
790 _cleanup_set_free_free_ Set
*controllers
= NULL
;
791 const char *cgroup_root
= "/sys/fs/cgroup", *c
;
794 (void) mkdir_p(cgroup_root
, 0755);
796 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
797 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
799 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
801 _cleanup_free_
char *options
= NULL
;
803 /* When cgroup namespaces are enabled and user namespaces are
804 * used then the mount of the cgroupfs is done *inside* the new
805 * user namespace. We're root in the new user namespace and the
806 * kernel will happily translate our uid/gid to the correct
807 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
808 * pass uid 0 and not uid_shift to tmpfs_patch_options().
810 r
= tmpfs_patch_options("mode=755", userns
, 0, uid_range
, true, selinux_apifs_context
, &options
);
814 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
815 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
820 if (cg_all_unified() > 0)
821 goto skip_controllers
;
823 controllers
= set_new(&string_hash_ops
);
827 r
= get_controllers(controllers
);
829 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
832 _cleanup_free_
const char *controller
= NULL
;
834 controller
= set_steal_first(controllers
);
838 r
= mount_legacy_cgroup_hierarchy("", controller
, controller
, unified_requested
, !userns
);
842 /* When multiple hierarchies are co-mounted, make their
843 * constituting individual hierarchies a symlink to the
848 _cleanup_free_
char *target
= NULL
, *tok
= NULL
;
850 r
= extract_first_word(&c
, &tok
, ",", 0);
852 return log_error_errno(r
, "Failed to extract co-mounted cgroup controller: %m");
856 target
= prefix_root("/sys/fs/cgroup", tok
);
860 if (streq(controller
, tok
))
863 r
= symlink_idempotent(controller
, target
);
865 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
867 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
872 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER
, "systemd", unified_requested
, false);
877 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
878 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
883 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
884 static int mount_legacy_cgns_unsupported(
886 CGroupUnified unified_requested
,
890 const char *selinux_apifs_context
) {
892 _cleanup_set_free_free_ Set
*controllers
= NULL
;
893 const char *cgroup_root
;
896 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
898 (void) mkdir_p(cgroup_root
, 0755);
900 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
901 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
903 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
905 _cleanup_free_
char *options
= NULL
;
907 r
= tmpfs_patch_options("mode=755", userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &options
);
911 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
912 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
917 if (cg_all_unified() > 0)
918 goto skip_controllers
;
920 controllers
= set_new(&string_hash_ops
);
924 r
= cg_kernel_controllers(controllers
);
926 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
929 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
931 controller
= set_steal_first(controllers
);
935 origin
= prefix_root("/sys/fs/cgroup/", controller
);
939 r
= readlink_malloc(origin
, &combined
);
941 /* Not a symbolic link, but directly a single cgroup hierarchy */
943 r
= mount_legacy_cgroup_hierarchy(dest
, controller
, controller
, unified_requested
, true);
948 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
950 _cleanup_free_
char *target
= NULL
;
952 target
= prefix_root(dest
, origin
);
956 /* A symbolic link, a combination of controllers in one hierarchy */
958 if (!filename_is_valid(combined
)) {
959 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
963 r
= mount_legacy_cgroup_hierarchy(dest
, combined
, combined
, unified_requested
, true);
967 r
= symlink_idempotent(combined
, target
);
969 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
971 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
976 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER
, "systemd", unified_requested
, false);
980 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
981 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
984 static int mount_unified_cgroups(const char *dest
) {
990 p
= prefix_roota(dest
, "/sys/fs/cgroup");
992 (void) mkdir_p(p
, 0755);
994 r
= path_is_mount_point(p
, dest
, AT_SYMLINK_FOLLOW
);
996 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", p
);
998 p
= prefix_roota(dest
, "/sys/fs/cgroup/cgroup.procs");
999 if (access(p
, F_OK
) >= 0)
1001 if (errno
!= ENOENT
)
1002 return log_error_errno(errno
, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p
);
1004 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p
);
1008 return mount_verbose(LOG_ERR
, "cgroup", p
, "cgroup2", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
1013 CGroupUnified unified_requested
,
1017 const char *selinux_apifs_context
,
1020 if (unified_requested
>= CGROUP_UNIFIED_ALL
)
1021 return mount_unified_cgroups(dest
);
1023 return mount_legacy_cgns_supported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
1025 return mount_legacy_cgns_unsupported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
1028 int mount_systemd_cgroup_writable(
1030 CGroupUnified unified_requested
) {
1032 _cleanup_free_
char *own_cgroup_path
= NULL
;
1033 const char *systemd_root
, *systemd_own
;
1038 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
1040 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
1042 /* If we are living in the top-level, then there's nothing to do... */
1043 if (path_equal(own_cgroup_path
, "/"))
1046 if (unified_requested
>= CGROUP_UNIFIED_ALL
) {
1047 systemd_own
= strjoina(dest
, "/sys/fs/cgroup", own_cgroup_path
);
1048 systemd_root
= prefix_roota(dest
, "/sys/fs/cgroup");
1050 systemd_own
= strjoina(dest
, "/sys/fs/cgroup/systemd", own_cgroup_path
);
1051 systemd_root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
1054 /* Make our own cgroup a (writable) bind mount */
1055 r
= mount_verbose(LOG_ERR
, systemd_own
, systemd_own
, NULL
, MS_BIND
, NULL
);
1059 /* And then remount the systemd cgroup root read-only */
1060 return mount_verbose(LOG_ERR
, NULL
, systemd_root
, NULL
,
1061 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
1064 int setup_volatile_state(
1065 const char *directory
,
1067 bool userns
, uid_t uid_shift
, uid_t uid_range
,
1068 const char *selinux_apifs_context
) {
1070 _cleanup_free_
char *buf
= NULL
;
1071 const char *p
, *options
;
1076 if (mode
!= VOLATILE_STATE
)
1079 /* --volatile=state means we simply overmount /var
1080 with a tmpfs, and the rest read-only. */
1082 r
= bind_remount_recursive(directory
, true, NULL
);
1084 return log_error_errno(r
, "Failed to remount %s read-only: %m", directory
);
1086 p
= prefix_roota(directory
, "/var");
1088 if (r
< 0 && errno
!= EEXIST
)
1089 return log_error_errno(errno
, "Failed to create %s: %m", directory
);
1091 options
= "mode=755";
1092 r
= tmpfs_patch_options(options
, userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &buf
);
1098 return mount_verbose(LOG_ERR
, "tmpfs", p
, "tmpfs", MS_STRICTATIME
, options
);
1102 const char *directory
,
1104 bool userns
, uid_t uid_shift
, uid_t uid_range
,
1105 const char *selinux_apifs_context
) {
1107 bool tmpfs_mounted
= false, bind_mounted
= false;
1108 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1109 _cleanup_free_
char *buf
= NULL
;
1110 const char *f
, *t
, *options
;
1115 if (mode
!= VOLATILE_YES
)
1118 /* --volatile=yes means we mount a tmpfs to the root dir, and
1119 the original /usr to use inside it, and that read-only. */
1121 if (!mkdtemp(template))
1122 return log_error_errno(errno
, "Failed to create temporary directory: %m");
1124 options
= "mode=755";
1125 r
= tmpfs_patch_options(options
, userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &buf
);
1131 r
= mount_verbose(LOG_ERR
, "tmpfs", template, "tmpfs", MS_STRICTATIME
, options
);
1135 tmpfs_mounted
= true;
1137 f
= prefix_roota(directory
, "/usr");
1138 t
= prefix_roota(template, "/usr");
1141 if (r
< 0 && errno
!= EEXIST
) {
1142 r
= log_error_errno(errno
, "Failed to create %s: %m", t
);
1146 r
= mount_verbose(LOG_ERR
, f
, t
, NULL
, MS_BIND
|MS_REC
, NULL
);
1150 bind_mounted
= true;
1152 r
= bind_remount_recursive(t
, true, NULL
);
1154 log_error_errno(r
, "Failed to remount %s read-only: %m", t
);
1158 r
= mount_verbose(LOG_ERR
, template, directory
, NULL
, MS_MOVE
, NULL
);
1162 (void) rmdir(template);
1168 (void) umount_verbose(t
);
1171 (void) umount_verbose(template);
1172 (void) rmdir(template);
1176 VolatileMode
volatile_mode_from_string(const char *s
) {
1180 return _VOLATILE_MODE_INVALID
;
1182 b
= parse_boolean(s
);
1184 return VOLATILE_YES
;
1188 if (streq(s
, "state"))
1189 return VOLATILE_STATE
;
1191 return _VOLATILE_MODE_INVALID
;