1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2015 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21 #include <sys/mount.h>
22 #include <linux/magic.h>
24 #include "alloc-util.h"
31 #include "mount-util.h"
32 #include "nspawn-mount.h"
33 #include "parse-util.h"
34 #include "path-util.h"
37 #include "stat-util.h"
38 #include "string-util.h"
40 #include "user-util.h"
43 CustomMount
* custom_mount_add(CustomMount
**l
, unsigned *n
, CustomMountType t
) {
49 assert(t
< _CUSTOM_MOUNT_TYPE_MAX
);
51 c
= reallocarray(*l
, *n
+ 1, sizeof(CustomMount
));
59 *ret
= (CustomMount
) { .type
= t
};
64 void custom_mount_free_all(CustomMount
*l
, unsigned n
) {
67 for (i
= 0; i
< n
; i
++) {
68 CustomMount
*m
= l
+ i
;
75 (void) rm_rf(m
->work_dir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
79 if (m
->rm_rf_tmpdir
) {
80 (void) rm_rf(m
->rm_rf_tmpdir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
81 free(m
->rm_rf_tmpdir
);
90 static int custom_mount_compare(const void *a
, const void *b
) {
91 const CustomMount
*x
= a
, *y
= b
;
94 r
= path_compare(x
->destination
, y
->destination
);
98 if (x
->type
< y
->type
)
100 if (x
->type
> y
->type
)
106 static bool source_path_is_valid(const char *p
) {
112 return path_is_absolute(p
);
115 static char *resolve_source_path(const char *dest
, const char *source
) {
120 if (source
[0] == '+')
121 return prefix_root(dest
, source
+ 1);
123 return strdup(source
);
126 int custom_mount_prepare_all(const char *dest
, CustomMount
*l
, unsigned n
) {
130 /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
131 * parent process, so that we know the temporary directories to remove on exit before we fork off the
136 /* Order the custom mounts, and make sure we have a working directory */
137 qsort_safe(l
, n
, sizeof(CustomMount
), custom_mount_compare
);
139 for (i
= 0; i
< n
; i
++) {
140 CustomMount
*m
= l
+ i
;
145 s
= resolve_source_path(dest
, m
->source
);
152 /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
154 m
->rm_rf_tmpdir
= strdup("/var/tmp/nspawn-temp-XXXXXX");
155 if (!m
->rm_rf_tmpdir
)
158 if (!mkdtemp(m
->rm_rf_tmpdir
)) {
159 m
->rm_rf_tmpdir
= mfree(m
->rm_rf_tmpdir
);
160 return log_error_errno(errno
, "Failed to acquire temporary directory: %m");
163 m
->source
= strjoin(m
->rm_rf_tmpdir
, "/src");
167 if (mkdir(m
->source
, 0755) < 0)
168 return log_error_errno(errno
, "Failed to create %s: %m", m
->source
);
171 if (m
->type
== CUSTOM_MOUNT_OVERLAY
) {
174 STRV_FOREACH(j
, m
->lower
) {
177 s
= resolve_source_path(dest
, *j
);
188 s
= resolve_source_path(dest
, m
->work_dir
);
197 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
199 return log_error_errno(r
, "Failed to acquire working directory: %m");
202 (void) mkdir_label(m
->work_dir
, 0700);
209 int bind_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
, bool read_only
) {
210 _cleanup_free_
char *source
= NULL
, *destination
= NULL
, *opts
= NULL
;
218 r
= extract_many_words(&p
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
, &source
, &destination
, NULL
);
224 destination
= strdup(source
[0] == '+' ? source
+1 : source
);
228 if (r
== 2 && !isempty(p
)) {
236 else if (!source_path_is_valid(source
))
239 if (!path_is_absolute(destination
))
242 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_BIND
);
247 m
->destination
= destination
;
248 m
->read_only
= read_only
;
251 source
= destination
= opts
= NULL
;
255 int tmpfs_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
) {
256 _cleanup_free_
char *path
= NULL
, *opts
= NULL
;
265 r
= extract_first_word(&p
, &path
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
272 opts
= strdup("mode=0755");
278 if (!path_is_absolute(path
))
281 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_TMPFS
);
285 m
->destination
= path
;
292 int overlay_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
, bool read_only
) {
293 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
294 _cleanup_strv_free_
char **lower
= NULL
;
298 k
= strv_split_extract(&lower
, s
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
302 return -EADDRNOTAVAIL
;
304 /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
305 * we'll also define the destination mount point the same as the upper. */
307 if (!source_path_is_valid(lower
[0]) ||
308 !source_path_is_valid(lower
[1]))
311 upper
= TAKE_PTR(lower
[1]);
313 destination
= strdup(upper
[0] == '+' ? upper
+1 : upper
); /* take the destination without "+" prefix */
319 /* If more than two parameters are specified, the last one is the destination, the second to last one
320 * the "upper", and all before that the "lower" directories. */
322 destination
= lower
[k
- 1];
323 upper
= TAKE_PTR(lower
[k
- 2]);
325 STRV_FOREACH(i
, lower
)
326 if (!source_path_is_valid(*i
))
329 /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
333 else if (!source_path_is_valid(upper
))
336 if (!path_is_absolute(destination
))
340 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_OVERLAY
);
344 m
->destination
= destination
;
347 m
->read_only
= read_only
;
349 upper
= destination
= NULL
;
355 static int tmpfs_patch_options(
358 uid_t uid_shift
, uid_t uid_range
,
360 const char *selinux_apifs_context
,
365 if ((userns
&& uid_shift
!= 0) || patch_ids
) {
366 assert(uid_shift
!= UID_INVALID
);
368 if (asprintf(&buf
, "%s%suid=" UID_FMT
",gid=" UID_FMT
,
369 strempty(options
), options
? "," : "",
370 uid_shift
, uid_shift
) < 0)
377 if (selinux_apifs_context
) {
380 t
= strjoin(strempty(options
), options
? "," : "",
381 "context=\"", selinux_apifs_context
, "\"");
390 if (!buf
&& options
) {
391 buf
= strdup(options
);
400 int mount_sysfs(const char *dest
, MountSettingsMask mount_settings
) {
401 const char *full
, *top
, *x
;
403 unsigned long extra_flags
= 0;
405 top
= prefix_roota(dest
, "/sys");
406 r
= path_is_fs_type(top
, SYSFS_MAGIC
);
408 return log_error_errno(r
, "Failed to determine filesystem type of %s: %m", top
);
409 /* /sys might already be mounted as sysfs by the outer child in the
410 * !netns case. In this case, it's all good. Don't touch it because we
411 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
416 full
= prefix_roota(top
, "/full");
418 (void) mkdir(full
, 0755);
420 if (mount_settings
& MOUNT_APPLY_APIVFS_RO
)
421 extra_flags
|= MS_RDONLY
;
423 r
= mount_verbose(LOG_ERR
, "sysfs", full
, "sysfs",
424 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|extra_flags
, NULL
);
428 FOREACH_STRING(x
, "block", "bus", "class", "dev", "devices", "kernel") {
429 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
431 from
= prefix_root(full
, x
);
435 to
= prefix_root(top
, x
);
439 (void) mkdir(to
, 0755);
441 r
= mount_verbose(LOG_ERR
, from
, to
, NULL
, MS_BIND
, NULL
);
445 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
446 MS_BIND
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
|extra_flags
, NULL
);
451 r
= umount_verbose(full
);
456 return log_error_errno(errno
, "Failed to remove %s: %m", full
);
458 /* Create mountpoint for cgroups. Otherwise we are not allowed since we
459 * remount /sys read-only.
461 if (cg_ns_supported()) {
462 x
= prefix_roota(top
, "/fs/cgroup");
463 (void) mkdir_p(x
, 0755);
466 return mount_verbose(LOG_ERR
, NULL
, top
, NULL
,
467 MS_BIND
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
|extra_flags
, NULL
);
470 static int mkdir_userns(const char *path
, mode_t mode
, MountSettingsMask mask
, uid_t uid_shift
) {
475 r
= mkdir_errno_wrapper(path
, mode
);
476 if (r
< 0 && r
!= -EEXIST
)
479 if ((mask
& MOUNT_USE_USERNS
) == 0)
482 if (mask
& MOUNT_IN_USERNS
)
485 if (lchown(path
, uid_shift
, uid_shift
) < 0)
491 static int mkdir_userns_p(const char *prefix
, const char *path
, mode_t mode
, MountSettingsMask mask
, uid_t uid_shift
) {
497 if (prefix
&& !path_startswith(path
, prefix
))
500 /* create every parent directory in the path, except the last component */
501 p
= path
+ strspn(path
, "/");
503 char t
[strlen(path
) + 1];
505 e
= p
+ strcspn(p
, "/");
506 p
= e
+ strspn(e
, "/");
508 /* Is this the last component? If so, then we're done */
512 memcpy(t
, path
, e
- path
);
515 if (prefix
&& path_startswith(prefix
, t
))
518 r
= mkdir_userns(t
, mode
, mask
, uid_shift
);
523 return mkdir_userns(path
, mode
, mask
, uid_shift
);
526 int mount_all(const char *dest
,
527 MountSettingsMask mount_settings
,
528 uid_t uid_shift
, uid_t uid_range
,
529 const char *selinux_apifs_context
) {
531 typedef struct MountPoint
{
537 MountSettingsMask mount_settings
;
540 static const MountPoint mount_table
[] = {
541 /* inner child mounts */
542 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
|MOUNT_IN_USERNS
},
543 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
, MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* Bind mount first ... */
544 { "/proc/sys/net", "/proc/sys/net", NULL
, NULL
, MS_BIND
, MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
|MOUNT_APPLY_APIVFS_NETNS
}, /* (except for this) */
545 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* ... then, make it r/o */
546 { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL
, NULL
, MS_BIND
, MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* Bind mount first ... */
547 { NULL
, "/proc/sysrq-trigger", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* ... then, make it r/o */
549 /* outer child mounts */
550 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, MOUNT_FATAL
},
551 { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
|MOUNT_APPLY_APIVFS_NETNS
},
552 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
|MOUNT_APPLY_APIVFS_RO
}, /* skipped if above was mounted */
553 { "sysfs", "/sys", "sysfs", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
}, /* skipped if above was mounted */
555 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, MOUNT_FATAL
},
556 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, MOUNT_FATAL
},
557 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, MOUNT_FATAL
},
559 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
, 0 }, /* Bind mount first */
560 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, 0 }, /* Then, make it r/o */
566 bool use_userns
= (mount_settings
& MOUNT_USE_USERNS
);
567 bool netns
= (mount_settings
& MOUNT_APPLY_APIVFS_NETNS
);
568 bool ro
= (mount_settings
& MOUNT_APPLY_APIVFS_RO
);
569 bool in_userns
= (mount_settings
& MOUNT_IN_USERNS
);
571 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
572 _cleanup_free_
char *where
= NULL
, *options
= NULL
;
574 bool fatal
= (mount_table
[k
].mount_settings
& MOUNT_FATAL
);
576 if (in_userns
!= (bool)(mount_table
[k
].mount_settings
& MOUNT_IN_USERNS
))
579 if (!netns
&& (bool)(mount_table
[k
].mount_settings
& MOUNT_APPLY_APIVFS_NETNS
))
582 if (!ro
&& (bool)(mount_table
[k
].mount_settings
& MOUNT_APPLY_APIVFS_RO
))
585 r
= chase_symlinks(mount_table
[k
].where
, dest
, CHASE_NONEXISTENT
|CHASE_PREFIX_ROOT
, &where
);
587 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, mount_table
[k
].where
);
589 r
= path_is_mount_point(where
, NULL
, 0);
590 if (r
< 0 && r
!= -ENOENT
)
591 return log_error_errno(r
, "Failed to detect whether %s is a mount point: %m", where
);
593 /* Skip this entry if it is not a remount. */
594 if (mount_table
[k
].what
&& r
> 0)
597 r
= mkdir_userns_p(dest
, where
, 0755, mount_settings
, uid_shift
);
598 if (r
< 0 && r
!= -EEXIST
) {
599 if (fatal
&& r
!= -EROFS
)
600 return log_error_errno(r
, "Failed to create directory %s: %m", where
);
602 log_debug_errno(r
, "Failed to create directory %s: %m", where
);
603 /* If we failed mkdir() or chown() due to the root
604 * directory being read only, attempt to mount this fs
605 * anyway and let mount_verbose log any errors */
610 o
= mount_table
[k
].options
;
611 if (streq_ptr(mount_table
[k
].type
, "tmpfs")) {
613 r
= tmpfs_patch_options(o
, use_userns
, 0, uid_range
, true, selinux_apifs_context
, &options
);
615 r
= tmpfs_patch_options(o
, use_userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &options
);
622 r
= mount_verbose(fatal
? LOG_ERR
: LOG_DEBUG
,
626 mount_table
[k
].flags
,
635 static int mount_bind(const char *dest
, CustomMount
*m
) {
637 _cleanup_free_
char *where
= NULL
;
638 struct stat source_st
, dest_st
;
644 if (stat(m
->source
, &source_st
) < 0)
645 return log_error_errno(errno
, "Failed to stat %s: %m", m
->source
);
647 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
);
649 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
650 if (r
> 0) { /* Path exists already? */
652 if (stat(where
, &dest_st
) < 0)
653 return log_error_errno(errno
, "Failed to stat %s: %m", where
);
655 if (S_ISDIR(source_st
.st_mode
) && !S_ISDIR(dest_st
.st_mode
)) {
656 log_error("Cannot bind mount directory %s on file %s.", m
->source
, where
);
660 if (!S_ISDIR(source_st
.st_mode
) && S_ISDIR(dest_st
.st_mode
)) {
661 log_error("Cannot bind mount file %s on directory %s.", m
->source
, where
);
665 } else { /* Path doesn't exist yet? */
666 r
= mkdir_parents_label(where
, 0755);
668 return log_error_errno(r
, "Failed to make parents of %s: %m", where
);
670 /* Create the mount point. Any non-directory file can be
671 * mounted on any non-directory file (regular, fifo, socket,
674 if (S_ISDIR(source_st
.st_mode
))
675 r
= mkdir_label(where
, 0755);
679 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
683 r
= mount_verbose(LOG_ERR
, m
->source
, where
, NULL
, MS_BIND
| MS_REC
, m
->options
);
688 r
= bind_remount_recursive(where
, true, NULL
);
690 return log_error_errno(r
, "Read-only bind mount failed: %m");
696 static int mount_tmpfs(
699 bool userns
, uid_t uid_shift
, uid_t uid_range
,
700 const char *selinux_apifs_context
) {
703 _cleanup_free_
char *buf
= NULL
, *where
= NULL
;
709 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
);
711 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
712 if (r
== 0) { /* Doesn't exist yet? */
713 r
= mkdir_p_label(where
, 0755);
715 return log_error_errno(r
, "Creating mount point for tmpfs %s failed: %m", where
);
718 r
= tmpfs_patch_options(m
->options
, userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &buf
);
721 options
= r
> 0 ? buf
: m
->options
;
723 return mount_verbose(LOG_ERR
, "tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, options
);
726 static char *joined_and_escaped_lower_dirs(char **lower
) {
727 _cleanup_strv_free_
char **sv
= NULL
;
729 sv
= strv_copy(lower
);
735 if (!strv_shell_escape(sv
, ",:"))
738 return strv_join(sv
, ":");
741 static int mount_overlay(const char *dest
, CustomMount
*m
) {
743 _cleanup_free_
char *lower
= NULL
, *where
= NULL
, *escaped_source
= NULL
;
750 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
);
752 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
753 if (r
== 0) { /* Doesn't exist yet? */
754 r
= mkdir_label(where
, 0755);
756 return log_error_errno(r
, "Creating mount point for overlay %s failed: %m", where
);
759 (void) mkdir_p_label(m
->source
, 0755);
761 lower
= joined_and_escaped_lower_dirs(m
->lower
);
765 escaped_source
= shell_escape(m
->source
, ",:");
770 options
= strjoina("lowerdir=", escaped_source
, ":", lower
);
772 _cleanup_free_
char *escaped_work_dir
= NULL
;
774 escaped_work_dir
= shell_escape(m
->work_dir
, ",:");
775 if (!escaped_work_dir
)
778 options
= strjoina("lowerdir=", lower
, ",upperdir=", escaped_source
, ",workdir=", escaped_work_dir
);
781 return mount_verbose(LOG_ERR
, "overlay", where
, "overlay", m
->read_only
? MS_RDONLY
: 0, options
);
786 CustomMount
*mounts
, unsigned n
,
787 bool userns
, uid_t uid_shift
, uid_t uid_range
,
788 const char *selinux_apifs_context
) {
795 for (i
= 0; i
< n
; i
++) {
796 CustomMount
*m
= mounts
+ i
;
800 case CUSTOM_MOUNT_BIND
:
801 r
= mount_bind(dest
, m
);
804 case CUSTOM_MOUNT_TMPFS
:
805 r
= mount_tmpfs(dest
, m
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
808 case CUSTOM_MOUNT_OVERLAY
:
809 r
= mount_overlay(dest
, m
);
813 assert_not_reached("Unknown custom mount type");
823 /* Retrieve existing subsystems. This function is called in a new cgroup
826 static int get_process_controllers(Set
**ret
) {
827 _cleanup_set_free_free_ Set
*controllers
= NULL
;
828 _cleanup_fclose_
FILE *f
= NULL
;
833 controllers
= set_new(&string_hash_ops
);
837 f
= fopen("/proc/self/cgroup", "re");
839 return errno
== ENOENT
? -ESRCH
: -errno
;
842 _cleanup_free_
char *line
= NULL
;
845 r
= read_line(f
, LONG_LINE_MAX
, &line
);
851 l
= strchr(line
, ':');
862 if (STR_IN_SET(l
, "", "name=systemd", "name=unified"))
865 r
= set_put_strdup(controllers
, l
);
876 static int mount_legacy_cgroup_hierarchy(
878 const char *controller
,
879 const char *hierarchy
,
882 const char *to
, *fstype
, *opts
;
885 to
= strjoina(strempty(dest
), "/sys/fs/cgroup/", hierarchy
);
887 r
= path_is_mount_point(to
, dest
, 0);
888 if (r
< 0 && r
!= -ENOENT
)
889 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
895 /* The superblock mount options of the mount point need to be
896 * identical to the hosts', and hence writable... */
897 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
)) {
900 } else if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
)) {
902 opts
= "none,name=systemd,xattr";
908 r
= mount_verbose(LOG_ERR
, "cgroup", to
, fstype
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, opts
);
912 /* ... hence let's only make the bind mount read-only, not the superblock. */
914 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
915 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
923 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
924 static int mount_legacy_cgns_supported(
926 CGroupUnified unified_requested
,
930 const char *selinux_apifs_context
) {
932 _cleanup_set_free_free_ Set
*controllers
= NULL
;
933 const char *cgroup_root
= "/sys/fs/cgroup", *c
;
936 (void) mkdir_p(cgroup_root
, 0755);
938 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
939 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
941 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
943 _cleanup_free_
char *options
= NULL
;
945 /* When cgroup namespaces are enabled and user namespaces are
946 * used then the mount of the cgroupfs is done *inside* the new
947 * user namespace. We're root in the new user namespace and the
948 * kernel will happily translate our uid/gid to the correct
949 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
950 * pass uid 0 and not uid_shift to tmpfs_patch_options().
952 r
= tmpfs_patch_options("mode=755", userns
, 0, uid_range
, true, selinux_apifs_context
, &options
);
956 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
957 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
962 r
= cg_all_unified();
966 goto skip_controllers
;
968 r
= get_process_controllers(&controllers
);
970 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
973 _cleanup_free_
const char *controller
= NULL
;
975 controller
= set_steal_first(controllers
);
979 r
= mount_legacy_cgroup_hierarchy("", controller
, controller
, !userns
);
983 /* When multiple hierarchies are co-mounted, make their
984 * constituting individual hierarchies a symlink to the
989 _cleanup_free_
char *target
= NULL
, *tok
= NULL
;
991 r
= extract_first_word(&c
, &tok
, ",", 0);
993 return log_error_errno(r
, "Failed to extract co-mounted cgroup controller: %m");
997 if (streq(controller
, tok
))
1000 target
= prefix_root("/sys/fs/cgroup/", tok
);
1004 r
= symlink_idempotent(controller
, target
);
1006 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
1008 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
1013 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
1014 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
1019 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
1024 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
1025 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
1030 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
1031 static int mount_legacy_cgns_unsupported(
1033 CGroupUnified unified_requested
,
1037 const char *selinux_apifs_context
) {
1039 _cleanup_set_free_free_ Set
*controllers
= NULL
;
1040 const char *cgroup_root
;
1043 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
1045 (void) mkdir_p(cgroup_root
, 0755);
1047 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
1048 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
1050 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
1052 _cleanup_free_
char *options
= NULL
;
1054 r
= tmpfs_patch_options("mode=755", userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &options
);
1058 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
1059 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
1064 r
= cg_all_unified();
1068 goto skip_controllers
;
1070 r
= cg_kernel_controllers(&controllers
);
1072 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
1075 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
1077 controller
= set_steal_first(controllers
);
1081 origin
= prefix_root("/sys/fs/cgroup/", controller
);
1085 r
= readlink_malloc(origin
, &combined
);
1087 /* Not a symbolic link, but directly a single cgroup hierarchy */
1089 r
= mount_legacy_cgroup_hierarchy(dest
, controller
, controller
, true);
1094 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
1096 _cleanup_free_
char *target
= NULL
;
1098 target
= prefix_root(dest
, origin
);
1102 /* A symbolic link, a combination of controllers in one hierarchy */
1104 if (!filename_is_valid(combined
)) {
1105 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
1109 r
= mount_legacy_cgroup_hierarchy(dest
, combined
, combined
, true);
1113 r
= symlink_idempotent(combined
, target
);
1115 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
1117 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
1122 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
1123 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
1128 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
1132 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
1133 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
1136 static int mount_unified_cgroups(const char *dest
) {
1142 p
= prefix_roota(dest
, "/sys/fs/cgroup");
1144 (void) mkdir_p(p
, 0755);
1146 r
= path_is_mount_point(p
, dest
, AT_SYMLINK_FOLLOW
);
1148 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", p
);
1150 p
= prefix_roota(dest
, "/sys/fs/cgroup/cgroup.procs");
1151 if (access(p
, F_OK
) >= 0)
1153 if (errno
!= ENOENT
)
1154 return log_error_errno(errno
, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p
);
1156 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p
);
1160 return mount_verbose(LOG_ERR
, "cgroup", p
, "cgroup2", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
1165 CGroupUnified unified_requested
,
1169 const char *selinux_apifs_context
,
1172 if (unified_requested
>= CGROUP_UNIFIED_ALL
)
1173 return mount_unified_cgroups(dest
);
1175 return mount_legacy_cgns_supported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
1177 return mount_legacy_cgns_unsupported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
1180 static int mount_systemd_cgroup_writable_one(const char *root
, const char *own
) {
1186 /* Make our own cgroup a (writable) bind mount */
1187 r
= mount_verbose(LOG_ERR
, own
, own
, NULL
, MS_BIND
, NULL
);
1191 /* And then remount the systemd cgroup root read-only */
1192 return mount_verbose(LOG_ERR
, NULL
, root
, NULL
,
1193 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
1196 int mount_systemd_cgroup_writable(
1198 CGroupUnified unified_requested
) {
1200 _cleanup_free_
char *own_cgroup_path
= NULL
;
1201 const char *root
, *own
;
1206 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
1208 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
1210 /* If we are living in the top-level, then there's nothing to do... */
1211 if (path_equal(own_cgroup_path
, "/"))
1214 if (unified_requested
>= CGROUP_UNIFIED_ALL
) {
1216 root
= prefix_roota(dest
, "/sys/fs/cgroup");
1217 own
= strjoina(root
, own_cgroup_path
);
1221 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
1222 root
= prefix_roota(dest
, "/sys/fs/cgroup/unified");
1223 own
= strjoina(root
, own_cgroup_path
);
1225 r
= mount_systemd_cgroup_writable_one(root
, own
);
1230 root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
1231 own
= strjoina(root
, own_cgroup_path
);
1234 return mount_systemd_cgroup_writable_one(root
, own
);
1237 int setup_volatile_state(
1238 const char *directory
,
1240 bool userns
, uid_t uid_shift
, uid_t uid_range
,
1241 const char *selinux_apifs_context
) {
1243 _cleanup_free_
char *buf
= NULL
;
1244 const char *p
, *options
;
1249 if (mode
!= VOLATILE_STATE
)
1252 /* --volatile=state means we simply overmount /var
1253 with a tmpfs, and the rest read-only. */
1255 r
= bind_remount_recursive(directory
, true, NULL
);
1257 return log_error_errno(r
, "Failed to remount %s read-only: %m", directory
);
1259 p
= prefix_roota(directory
, "/var");
1261 if (r
< 0 && errno
!= EEXIST
)
1262 return log_error_errno(errno
, "Failed to create %s: %m", directory
);
1264 options
= "mode=755";
1265 r
= tmpfs_patch_options(options
, userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &buf
);
1271 return mount_verbose(LOG_ERR
, "tmpfs", p
, "tmpfs", MS_STRICTATIME
, options
);
1275 const char *directory
,
1277 bool userns
, uid_t uid_shift
, uid_t uid_range
,
1278 const char *selinux_apifs_context
) {
1280 bool tmpfs_mounted
= false, bind_mounted
= false;
1281 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1282 _cleanup_free_
char *buf
= NULL
;
1283 const char *f
, *t
, *options
;
1288 if (mode
!= VOLATILE_YES
)
1291 /* --volatile=yes means we mount a tmpfs to the root dir, and
1292 the original /usr to use inside it, and that read-only. */
1294 if (!mkdtemp(template))
1295 return log_error_errno(errno
, "Failed to create temporary directory: %m");
1297 options
= "mode=755";
1298 r
= tmpfs_patch_options(options
, userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &buf
);
1304 r
= mount_verbose(LOG_ERR
, "tmpfs", template, "tmpfs", MS_STRICTATIME
, options
);
1308 tmpfs_mounted
= true;
1310 f
= prefix_roota(directory
, "/usr");
1311 t
= prefix_roota(template, "/usr");
1314 if (r
< 0 && errno
!= EEXIST
) {
1315 r
= log_error_errno(errno
, "Failed to create %s: %m", t
);
1319 r
= mount_verbose(LOG_ERR
, f
, t
, NULL
, MS_BIND
|MS_REC
, NULL
);
1323 bind_mounted
= true;
1325 r
= bind_remount_recursive(t
, true, NULL
);
1327 log_error_errno(r
, "Failed to remount %s read-only: %m", t
);
1331 r
= mount_verbose(LOG_ERR
, template, directory
, NULL
, MS_MOVE
, NULL
);
1335 (void) rmdir(template);
1341 (void) umount_verbose(t
);
1344 (void) umount_verbose(template);
1345 (void) rmdir(template);
1349 /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
1350 int pivot_root_parse(char **pivot_root_new
, char **pivot_root_old
, const char *s
) {
1351 _cleanup_free_
char *root_new
= NULL
, *root_old
= NULL
;
1355 assert(pivot_root_new
);
1356 assert(pivot_root_old
);
1358 r
= extract_first_word(&p
, &root_new
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
1367 root_old
= strdup(p
);
1372 if (!path_is_absolute(root_new
))
1374 if (root_old
&& !path_is_absolute(root_old
))
1377 free_and_replace(*pivot_root_new
, root_new
);
1378 free_and_replace(*pivot_root_old
, root_old
);
1383 int setup_pivot_root(const char *directory
, const char *pivot_root_new
, const char *pivot_root_old
) {
1384 _cleanup_free_
char *directory_pivot_root_new
= NULL
;
1385 _cleanup_free_
char *pivot_tmp_pivot_root_old
= NULL
;
1386 char pivot_tmp
[] = "/tmp/nspawn-pivot-XXXXXX";
1387 bool remove_pivot_tmp
= false;
1392 if (!pivot_root_new
)
1395 /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1396 * If pivot_root_old is NULL, the existing / disappears.
1397 * This requires a temporary directory, pivot_tmp, which is
1398 * not a child of either.
1400 * This is typically used for OSTree-style containers, where
1401 * the root partition contains several sysroots which could be
1402 * run. Normally, one would be chosen by the bootloader and
1403 * pivoted to / by initramfs.
1405 * For example, for an OSTree deployment, pivot_root_new
1406 * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1407 * code doesn’t do the /var mount which OSTree expects: use
1408 * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1410 * So in the OSTree case, we’ll end up with something like:
1411 * - directory = /tmp/nspawn-root-123456
1412 * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1413 * - pivot_root_old = /sysroot
1414 * - directory_pivot_root_new =
1415 * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1416 * - pivot_tmp = /tmp/nspawn-pivot-123456
1417 * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1419 * Requires all file systems at directory and below to be mounted
1420 * MS_PRIVATE or MS_SLAVE so they can be moved.
1422 directory_pivot_root_new
= prefix_root(directory
, pivot_root_new
);
1424 /* Remount directory_pivot_root_new to make it movable. */
1425 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, directory_pivot_root_new
, NULL
, MS_BIND
, NULL
);
1429 if (pivot_root_old
) {
1430 if (!mkdtemp(pivot_tmp
)) {
1431 r
= log_error_errno(errno
, "Failed to create temporary directory: %m");
1435 remove_pivot_tmp
= true;
1436 pivot_tmp_pivot_root_old
= prefix_root(pivot_tmp
, pivot_root_old
);
1438 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, pivot_tmp
, NULL
, MS_MOVE
, NULL
);
1442 r
= mount_verbose(LOG_ERR
, directory
, pivot_tmp_pivot_root_old
, NULL
, MS_MOVE
, NULL
);
1446 r
= mount_verbose(LOG_ERR
, pivot_tmp
, directory
, NULL
, MS_MOVE
, NULL
);
1450 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, directory
, NULL
, MS_MOVE
, NULL
);
1456 if (remove_pivot_tmp
)
1457 (void) rmdir(pivot_tmp
);