1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2015 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21 #include <sys/mount.h>
22 #include <linux/magic.h>
24 #include "alloc-util.h"
31 #include "mount-util.h"
32 #include "nspawn-mount.h"
33 #include "parse-util.h"
34 #include "path-util.h"
37 #include "stat-util.h"
38 #include "string-util.h"
40 #include "user-util.h"
43 CustomMount
* custom_mount_add(CustomMount
**l
, unsigned *n
, CustomMountType t
) {
49 assert(t
< _CUSTOM_MOUNT_TYPE_MAX
);
51 c
= realloc_multiply(*l
, (*n
+ 1), sizeof(CustomMount
));
59 *ret
= (CustomMount
) { .type
= t
};
64 void custom_mount_free_all(CustomMount
*l
, unsigned n
) {
67 for (i
= 0; i
< n
; i
++) {
68 CustomMount
*m
= l
+ i
;
75 (void) rm_rf(m
->work_dir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
79 if (m
->rm_rf_tmpdir
) {
80 (void) rm_rf(m
->rm_rf_tmpdir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
81 free(m
->rm_rf_tmpdir
);
90 static int custom_mount_compare(const void *a
, const void *b
) {
91 const CustomMount
*x
= a
, *y
= b
;
94 r
= path_compare(x
->destination
, y
->destination
);
98 if (x
->type
< y
->type
)
100 if (x
->type
> y
->type
)
106 static bool source_path_is_valid(const char *p
) {
112 return path_is_absolute(p
);
115 static char *resolve_source_path(const char *dest
, const char *source
) {
120 if (source
[0] == '+')
121 return prefix_root(dest
, source
+ 1);
123 return strdup(source
);
126 int custom_mount_prepare_all(const char *dest
, CustomMount
*l
, unsigned n
) {
130 /* Prepare all custom mounts. This will make sure we know all temporary directories. This is called in the
131 * parent process, so that we know the temporary directories to remove on exit before we fork off the
136 /* Order the custom mounts, and make sure we have a working directory */
137 qsort_safe(l
, n
, sizeof(CustomMount
), custom_mount_compare
);
139 for (i
= 0; i
< n
; i
++) {
140 CustomMount
*m
= l
+ i
;
145 s
= resolve_source_path(dest
, m
->source
);
152 /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
154 m
->rm_rf_tmpdir
= strdup("/var/tmp/nspawn-temp-XXXXXX");
155 if (!m
->rm_rf_tmpdir
)
158 if (!mkdtemp(m
->rm_rf_tmpdir
)) {
159 m
->rm_rf_tmpdir
= mfree(m
->rm_rf_tmpdir
);
160 return log_error_errno(errno
, "Failed to acquire temporary directory: %m");
163 m
->source
= strjoin(m
->rm_rf_tmpdir
, "/src");
167 if (mkdir(m
->source
, 0755) < 0)
168 return log_error_errno(errno
, "Failed to create %s: %m", m
->source
);
171 if (m
->type
== CUSTOM_MOUNT_OVERLAY
) {
174 STRV_FOREACH(j
, m
->lower
) {
177 s
= resolve_source_path(dest
, *j
);
188 s
= resolve_source_path(dest
, m
->work_dir
);
197 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
199 return log_error_errno(r
, "Failed to acquire working directory: %m");
202 (void) mkdir_label(m
->work_dir
, 0700);
209 int bind_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
, bool read_only
) {
210 _cleanup_free_
char *source
= NULL
, *destination
= NULL
, *opts
= NULL
;
218 r
= extract_many_words(&p
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
, &source
, &destination
, NULL
);
224 destination
= strdup(source
[0] == '+' ? source
+1 : source
);
228 if (r
== 2 && !isempty(p
)) {
236 else if (!source_path_is_valid(source
))
239 if (!path_is_absolute(destination
))
242 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_BIND
);
247 m
->destination
= destination
;
248 m
->read_only
= read_only
;
251 source
= destination
= opts
= NULL
;
255 int tmpfs_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
) {
256 _cleanup_free_
char *path
= NULL
, *opts
= NULL
;
265 r
= extract_first_word(&p
, &path
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
272 opts
= strdup("mode=0755");
278 if (!path_is_absolute(path
))
281 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_TMPFS
);
285 m
->destination
= path
;
292 int overlay_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
, bool read_only
) {
293 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
294 _cleanup_strv_free_
char **lower
= NULL
;
298 k
= strv_split_extract(&lower
, s
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
302 return -EADDRNOTAVAIL
;
304 /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
305 * we'll also define the destination mount point the same as the upper. */
307 if (!source_path_is_valid(lower
[0]) ||
308 !source_path_is_valid(lower
[1]))
314 destination
= strdup(upper
[0] == '+' ? upper
+1 : upper
); /* take the destination without "+" prefix */
320 /* If more than two parameters are specified, the last one is the destination, the second to last one
321 * the "upper", and all before that the "lower" directories. */
323 destination
= lower
[k
- 1];
324 upper
= lower
[k
- 2];
327 STRV_FOREACH(i
, lower
)
328 if (!source_path_is_valid(*i
))
331 /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
335 else if (!source_path_is_valid(upper
))
338 if (!path_is_absolute(destination
))
342 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_OVERLAY
);
346 m
->destination
= destination
;
349 m
->read_only
= read_only
;
351 upper
= destination
= NULL
;
357 static int tmpfs_patch_options(
360 uid_t uid_shift
, uid_t uid_range
,
362 const char *selinux_apifs_context
,
367 if ((userns
&& uid_shift
!= 0) || patch_ids
) {
368 assert(uid_shift
!= UID_INVALID
);
370 if (asprintf(&buf
, "%s%suid=" UID_FMT
",gid=" UID_FMT
,
371 options
?: "", options
? "," : "",
372 uid_shift
, uid_shift
) < 0)
379 if (selinux_apifs_context
) {
382 t
= strjoin(options
?: "", options
? "," : "",
383 "context=\"", selinux_apifs_context
, "\"");
392 if (!buf
&& options
) {
393 buf
= strdup(options
);
402 int mount_sysfs(const char *dest
, MountSettingsMask mount_settings
) {
403 const char *full
, *top
, *x
;
405 unsigned long extra_flags
= 0;
407 top
= prefix_roota(dest
, "/sys");
408 r
= path_check_fstype(top
, SYSFS_MAGIC
);
410 return log_error_errno(r
, "Failed to determine filesystem type of %s: %m", top
);
411 /* /sys might already be mounted as sysfs by the outer child in the
412 * !netns case. In this case, it's all good. Don't touch it because we
413 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
418 full
= prefix_roota(top
, "/full");
420 (void) mkdir(full
, 0755);
422 if (mount_settings
& MOUNT_APPLY_APIVFS_RO
)
423 extra_flags
|= MS_RDONLY
;
425 r
= mount_verbose(LOG_ERR
, "sysfs", full
, "sysfs",
426 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|extra_flags
, NULL
);
430 FOREACH_STRING(x
, "block", "bus", "class", "dev", "devices", "kernel") {
431 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
433 from
= prefix_root(full
, x
);
437 to
= prefix_root(top
, x
);
441 (void) mkdir(to
, 0755);
443 r
= mount_verbose(LOG_ERR
, from
, to
, NULL
, MS_BIND
, NULL
);
447 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
448 MS_BIND
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
|extra_flags
, NULL
);
453 r
= umount_verbose(full
);
458 return log_error_errno(errno
, "Failed to remove %s: %m", full
);
460 /* Create mountpoint for cgroups. Otherwise we are not allowed since we
461 * remount /sys read-only.
463 if (cg_ns_supported()) {
464 x
= prefix_roota(top
, "/fs/cgroup");
465 (void) mkdir_p(x
, 0755);
468 return mount_verbose(LOG_ERR
, NULL
, top
, NULL
,
469 MS_BIND
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
|extra_flags
, NULL
);
472 static int mkdir_userns(const char *path
, mode_t mode
, MountSettingsMask mask
, uid_t uid_shift
) {
477 r
= mkdir(path
, mode
);
478 if (r
< 0 && errno
!= EEXIST
)
481 if ((mask
& MOUNT_USE_USERNS
) == 0)
484 if (mask
& MOUNT_IN_USERNS
)
487 r
= lchown(path
, uid_shift
, uid_shift
);
494 static int mkdir_userns_p(const char *prefix
, const char *path
, mode_t mode
, MountSettingsMask mask
, uid_t uid_shift
) {
500 if (prefix
&& !path_startswith(path
, prefix
))
503 /* create every parent directory in the path, except the last component */
504 p
= path
+ strspn(path
, "/");
506 char t
[strlen(path
) + 1];
508 e
= p
+ strcspn(p
, "/");
509 p
= e
+ strspn(e
, "/");
511 /* Is this the last component? If so, then we're done */
515 memcpy(t
, path
, e
- path
);
518 if (prefix
&& path_startswith(prefix
, t
))
521 r
= mkdir_userns(t
, mode
, mask
, uid_shift
);
526 return mkdir_userns(path
, mode
, mask
, uid_shift
);
529 int mount_all(const char *dest
,
530 MountSettingsMask mount_settings
,
531 uid_t uid_shift
, uid_t uid_range
,
532 const char *selinux_apifs_context
) {
534 typedef struct MountPoint
{
540 MountSettingsMask mount_settings
;
543 static const MountPoint mount_table
[] = {
544 /* inner child mounts */
545 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
|MOUNT_IN_USERNS
},
546 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
, MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* Bind mount first ... */
547 { "/proc/sys/net", "/proc/sys/net", NULL
, NULL
, MS_BIND
, MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
|MOUNT_APPLY_APIVFS_NETNS
}, /* (except for this) */
548 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* ... then, make it r/o */
549 { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL
, NULL
, MS_BIND
, MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* Bind mount first ... */
550 { NULL
, "/proc/sysrq-trigger", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* ... then, make it r/o */
552 /* outer child mounts */
553 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, MOUNT_FATAL
},
554 { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
|MOUNT_APPLY_APIVFS_NETNS
},
555 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
|MOUNT_APPLY_APIVFS_RO
}, /* skipped if above was mounted */
556 { "sysfs", "/sys", "sysfs", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
}, /* skipped if above was mounted */
558 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, MOUNT_FATAL
},
559 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, MOUNT_FATAL
},
560 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, MOUNT_FATAL
},
562 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
, 0 }, /* Bind mount first */
563 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, 0 }, /* Then, make it r/o */
569 bool use_userns
= (mount_settings
& MOUNT_USE_USERNS
);
570 bool netns
= (mount_settings
& MOUNT_APPLY_APIVFS_NETNS
);
571 bool ro
= (mount_settings
& MOUNT_APPLY_APIVFS_RO
);
572 bool in_userns
= (mount_settings
& MOUNT_IN_USERNS
);
574 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
575 _cleanup_free_
char *where
= NULL
, *options
= NULL
;
577 bool fatal
= (mount_table
[k
].mount_settings
& MOUNT_FATAL
);
579 if (in_userns
!= (bool)(mount_table
[k
].mount_settings
& MOUNT_IN_USERNS
))
582 if (!netns
&& (bool)(mount_table
[k
].mount_settings
& MOUNT_APPLY_APIVFS_NETNS
))
585 if (!ro
&& (bool)(mount_table
[k
].mount_settings
& MOUNT_APPLY_APIVFS_RO
))
588 r
= chase_symlinks(mount_table
[k
].where
, dest
, CHASE_NONEXISTENT
|CHASE_PREFIX_ROOT
, &where
);
590 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, mount_table
[k
].where
);
592 r
= path_is_mount_point(where
, NULL
, 0);
593 if (r
< 0 && r
!= -ENOENT
)
594 return log_error_errno(r
, "Failed to detect whether %s is a mount point: %m", where
);
596 /* Skip this entry if it is not a remount. */
597 if (mount_table
[k
].what
&& r
> 0)
600 r
= mkdir_userns_p(dest
, where
, 0755, mount_settings
, uid_shift
);
601 if (r
< 0 && r
!= -EEXIST
) {
602 if (fatal
&& r
!= -EROFS
)
603 return log_error_errno(r
, "Failed to create directory %s: %m", where
);
605 log_debug_errno(r
, "Failed to create directory %s: %m", where
);
606 /* If we failed mkdir() or chown() due to the root
607 * directory being read only, attempt to mount this fs
608 * anyway and let mount_verbose log any errors */
613 o
= mount_table
[k
].options
;
614 if (streq_ptr(mount_table
[k
].type
, "tmpfs")) {
616 r
= tmpfs_patch_options(o
, use_userns
, 0, uid_range
, true, selinux_apifs_context
, &options
);
618 r
= tmpfs_patch_options(o
, use_userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &options
);
625 r
= mount_verbose(fatal
? LOG_ERR
: LOG_DEBUG
,
629 mount_table
[k
].flags
,
638 static int parse_mount_bind_options(const char *options
, unsigned long *mount_flags
, char **mount_opts
) {
639 const char *p
= options
;
640 unsigned long flags
= *mount_flags
;
647 _cleanup_free_
char *word
= NULL
;
649 r
= extract_first_word(&p
, &word
, ",", 0);
651 return log_error_errno(r
, "Failed to extract mount option: %m");
655 if (streq(word
, "rbind"))
657 else if (streq(word
, "norbind"))
660 log_error("Invalid bind mount option: %s", word
);
665 *mount_flags
= flags
;
666 /* in the future mount_opts will hold string options for mount(2) */
672 static int mount_bind(const char *dest
, CustomMount
*m
) {
674 _cleanup_free_
char *mount_opts
= NULL
, *where
= NULL
;
675 unsigned long mount_flags
= MS_BIND
| MS_REC
;
676 struct stat source_st
, dest_st
;
683 r
= parse_mount_bind_options(m
->options
, &mount_flags
, &mount_opts
);
688 if (stat(m
->source
, &source_st
) < 0)
689 return log_error_errno(errno
, "Failed to stat %s: %m", m
->source
);
691 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
);
693 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
694 if (r
> 0) { /* Path exists already? */
696 if (stat(where
, &dest_st
) < 0)
697 return log_error_errno(errno
, "Failed to stat %s: %m", where
);
699 if (S_ISDIR(source_st
.st_mode
) && !S_ISDIR(dest_st
.st_mode
)) {
700 log_error("Cannot bind mount directory %s on file %s.", m
->source
, where
);
704 if (!S_ISDIR(source_st
.st_mode
) && S_ISDIR(dest_st
.st_mode
)) {
705 log_error("Cannot bind mount file %s on directory %s.", m
->source
, where
);
709 } else { /* Path doesn't exist yet? */
710 r
= mkdir_parents_label(where
, 0755);
712 return log_error_errno(r
, "Failed to make parents of %s: %m", where
);
714 /* Create the mount point. Any non-directory file can be
715 * mounted on any non-directory file (regular, fifo, socket,
718 if (S_ISDIR(source_st
.st_mode
))
719 r
= mkdir_label(where
, 0755);
723 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
727 r
= mount_verbose(LOG_ERR
, m
->source
, where
, NULL
, mount_flags
, mount_opts
);
732 r
= bind_remount_recursive(where
, true, NULL
);
734 return log_error_errno(r
, "Read-only bind mount failed: %m");
740 static int mount_tmpfs(
743 bool userns
, uid_t uid_shift
, uid_t uid_range
,
744 const char *selinux_apifs_context
) {
747 _cleanup_free_
char *buf
= NULL
, *where
= NULL
;
753 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
);
755 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
756 if (r
== 0) { /* Doesn't exist yet? */
757 r
= mkdir_p_label(where
, 0755);
759 return log_error_errno(r
, "Creating mount point for tmpfs %s failed: %m", where
);
762 r
= tmpfs_patch_options(m
->options
, userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &buf
);
765 options
= r
> 0 ? buf
: m
->options
;
767 return mount_verbose(LOG_ERR
, "tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, options
);
770 static char *joined_and_escaped_lower_dirs(char **lower
) {
771 _cleanup_strv_free_
char **sv
= NULL
;
773 sv
= strv_copy(lower
);
779 if (!strv_shell_escape(sv
, ",:"))
782 return strv_join(sv
, ":");
785 static int mount_overlay(const char *dest
, CustomMount
*m
) {
787 _cleanup_free_
char *lower
= NULL
, *where
= NULL
, *escaped_source
= NULL
;
794 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
);
796 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
797 if (r
== 0) { /* Doesn't exist yet? */
798 r
= mkdir_label(where
, 0755);
800 return log_error_errno(r
, "Creating mount point for overlay %s failed: %m", where
);
803 (void) mkdir_p_label(m
->source
, 0755);
805 lower
= joined_and_escaped_lower_dirs(m
->lower
);
809 escaped_source
= shell_escape(m
->source
, ",:");
814 options
= strjoina("lowerdir=", escaped_source
, ":", lower
);
816 _cleanup_free_
char *escaped_work_dir
= NULL
;
818 escaped_work_dir
= shell_escape(m
->work_dir
, ",:");
819 if (!escaped_work_dir
)
822 options
= strjoina("lowerdir=", lower
, ",upperdir=", escaped_source
, ",workdir=", escaped_work_dir
);
825 return mount_verbose(LOG_ERR
, "overlay", where
, "overlay", m
->read_only
? MS_RDONLY
: 0, options
);
830 CustomMount
*mounts
, unsigned n
,
831 bool userns
, uid_t uid_shift
, uid_t uid_range
,
832 const char *selinux_apifs_context
) {
839 for (i
= 0; i
< n
; i
++) {
840 CustomMount
*m
= mounts
+ i
;
844 case CUSTOM_MOUNT_BIND
:
845 r
= mount_bind(dest
, m
);
848 case CUSTOM_MOUNT_TMPFS
:
849 r
= mount_tmpfs(dest
, m
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
852 case CUSTOM_MOUNT_OVERLAY
:
853 r
= mount_overlay(dest
, m
);
857 assert_not_reached("Unknown custom mount type");
867 /* Retrieve existing subsystems. This function is called in a new cgroup
870 static int get_controllers(Set
*subsystems
) {
871 _cleanup_fclose_
FILE *f
= NULL
;
876 f
= fopen("/proc/self/cgroup", "re");
878 return errno
== ENOENT
? -ESRCH
: -errno
;
880 FOREACH_LINE(line
, f
, return -errno
) {
884 l
= strchr(line
, ':');
895 if (STR_IN_SET(l
, "", "name=systemd", "name=unified"))
902 r
= set_consume(subsystems
, p
);
910 static int mount_legacy_cgroup_hierarchy(
912 const char *controller
,
913 const char *hierarchy
,
916 const char *to
, *fstype
, *opts
;
919 to
= strjoina(strempty(dest
), "/sys/fs/cgroup/", hierarchy
);
921 r
= path_is_mount_point(to
, dest
, 0);
922 if (r
< 0 && r
!= -ENOENT
)
923 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
929 /* The superblock mount options of the mount point need to be
930 * identical to the host's, and hence writable... */
931 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
)) {
934 } else if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
)) {
936 opts
= "none,name=systemd,xattr";
942 r
= mount_verbose(LOG_ERR
, "cgroup", to
, fstype
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, opts
);
946 /* ... hence let's only make the bind mount read-only, not the superblock. */
948 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
949 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
957 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
958 static int mount_legacy_cgns_supported(
960 CGroupUnified unified_requested
,
964 const char *selinux_apifs_context
) {
966 _cleanup_set_free_free_ Set
*controllers
= NULL
;
967 const char *cgroup_root
= "/sys/fs/cgroup", *c
;
970 (void) mkdir_p(cgroup_root
, 0755);
972 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
973 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
975 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
977 _cleanup_free_
char *options
= NULL
;
979 /* When cgroup namespaces are enabled and user namespaces are
980 * used then the mount of the cgroupfs is done *inside* the new
981 * user namespace. We're root in the new user namespace and the
982 * kernel will happily translate our uid/gid to the correct
983 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
984 * pass uid 0 and not uid_shift to tmpfs_patch_options().
986 r
= tmpfs_patch_options("mode=755", userns
, 0, uid_range
, true, selinux_apifs_context
, &options
);
990 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
991 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
996 r
= cg_all_unified();
1000 goto skip_controllers
;
1002 controllers
= set_new(&string_hash_ops
);
1006 r
= get_controllers(controllers
);
1008 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
1011 _cleanup_free_
const char *controller
= NULL
;
1013 controller
= set_steal_first(controllers
);
1017 r
= mount_legacy_cgroup_hierarchy("", controller
, controller
, !userns
);
1021 /* When multiple hierarchies are co-mounted, make their
1022 * constituting individual hierarchies a symlink to the
1027 _cleanup_free_
char *target
= NULL
, *tok
= NULL
;
1029 r
= extract_first_word(&c
, &tok
, ",", 0);
1031 return log_error_errno(r
, "Failed to extract co-mounted cgroup controller: %m");
1035 target
= prefix_root("/sys/fs/cgroup", tok
);
1039 if (streq(controller
, tok
))
1042 r
= symlink_idempotent(controller
, target
);
1044 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
1046 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
1051 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
1052 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
1057 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
1062 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
1063 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
1068 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
1069 static int mount_legacy_cgns_unsupported(
1071 CGroupUnified unified_requested
,
1075 const char *selinux_apifs_context
) {
1077 _cleanup_set_free_free_ Set
*controllers
= NULL
;
1078 const char *cgroup_root
;
1081 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
1083 (void) mkdir_p(cgroup_root
, 0755);
1085 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
1086 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
1088 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
1090 _cleanup_free_
char *options
= NULL
;
1092 r
= tmpfs_patch_options("mode=755", userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &options
);
1096 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
1097 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
1102 r
= cg_all_unified();
1106 goto skip_controllers
;
1108 controllers
= set_new(&string_hash_ops
);
1112 r
= cg_kernel_controllers(controllers
);
1114 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
1117 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
1119 controller
= set_steal_first(controllers
);
1123 origin
= prefix_root("/sys/fs/cgroup/", controller
);
1127 r
= readlink_malloc(origin
, &combined
);
1129 /* Not a symbolic link, but directly a single cgroup hierarchy */
1131 r
= mount_legacy_cgroup_hierarchy(dest
, controller
, controller
, true);
1136 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
1138 _cleanup_free_
char *target
= NULL
;
1140 target
= prefix_root(dest
, origin
);
1144 /* A symbolic link, a combination of controllers in one hierarchy */
1146 if (!filename_is_valid(combined
)) {
1147 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
1151 r
= mount_legacy_cgroup_hierarchy(dest
, combined
, combined
, true);
1155 r
= symlink_idempotent(combined
, target
);
1157 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
1159 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
1164 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
1165 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
1170 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
1174 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
1175 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
1178 static int mount_unified_cgroups(const char *dest
) {
1184 p
= prefix_roota(dest
, "/sys/fs/cgroup");
1186 (void) mkdir_p(p
, 0755);
1188 r
= path_is_mount_point(p
, dest
, AT_SYMLINK_FOLLOW
);
1190 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", p
);
1192 p
= prefix_roota(dest
, "/sys/fs/cgroup/cgroup.procs");
1193 if (access(p
, F_OK
) >= 0)
1195 if (errno
!= ENOENT
)
1196 return log_error_errno(errno
, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p
);
1198 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p
);
1202 return mount_verbose(LOG_ERR
, "cgroup", p
, "cgroup2", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
1207 CGroupUnified unified_requested
,
1211 const char *selinux_apifs_context
,
1214 if (unified_requested
>= CGROUP_UNIFIED_ALL
)
1215 return mount_unified_cgroups(dest
);
1217 return mount_legacy_cgns_supported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
1219 return mount_legacy_cgns_unsupported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
1222 static int mount_systemd_cgroup_writable_one(const char *systemd_own
, const char *systemd_root
)
1226 /* Make our own cgroup a (writable) bind mount */
1227 r
= mount_verbose(LOG_ERR
, systemd_own
, systemd_own
, NULL
, MS_BIND
, NULL
);
1231 /* And then remount the systemd cgroup root read-only */
1232 return mount_verbose(LOG_ERR
, NULL
, systemd_root
, NULL
,
1233 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
1236 int mount_systemd_cgroup_writable(
1238 CGroupUnified unified_requested
) {
1240 _cleanup_free_
char *own_cgroup_path
= NULL
;
1245 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
1247 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
1249 /* If we are living in the top-level, then there's nothing to do... */
1250 if (path_equal(own_cgroup_path
, "/"))
1253 if (unified_requested
>= CGROUP_UNIFIED_ALL
)
1254 return mount_systemd_cgroup_writable_one(strjoina(dest
, "/sys/fs/cgroup", own_cgroup_path
),
1255 prefix_roota(dest
, "/sys/fs/cgroup"));
1257 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
1258 r
= mount_systemd_cgroup_writable_one(strjoina(dest
, "/sys/fs/cgroup/unified", own_cgroup_path
),
1259 prefix_roota(dest
, "/sys/fs/cgroup/unified"));
1264 return mount_systemd_cgroup_writable_one(strjoina(dest
, "/sys/fs/cgroup/systemd", own_cgroup_path
),
1265 prefix_roota(dest
, "/sys/fs/cgroup/systemd"));
1268 int setup_volatile_state(
1269 const char *directory
,
1271 bool userns
, uid_t uid_shift
, uid_t uid_range
,
1272 const char *selinux_apifs_context
) {
1274 _cleanup_free_
char *buf
= NULL
;
1275 const char *p
, *options
;
1280 if (mode
!= VOLATILE_STATE
)
1283 /* --volatile=state means we simply overmount /var
1284 with a tmpfs, and the rest read-only. */
1286 r
= bind_remount_recursive(directory
, true, NULL
);
1288 return log_error_errno(r
, "Failed to remount %s read-only: %m", directory
);
1290 p
= prefix_roota(directory
, "/var");
1292 if (r
< 0 && errno
!= EEXIST
)
1293 return log_error_errno(errno
, "Failed to create %s: %m", directory
);
1295 options
= "mode=755";
1296 r
= tmpfs_patch_options(options
, userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &buf
);
1302 return mount_verbose(LOG_ERR
, "tmpfs", p
, "tmpfs", MS_STRICTATIME
, options
);
1306 const char *directory
,
1308 bool userns
, uid_t uid_shift
, uid_t uid_range
,
1309 const char *selinux_apifs_context
) {
1311 bool tmpfs_mounted
= false, bind_mounted
= false;
1312 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1313 _cleanup_free_
char *buf
= NULL
;
1314 const char *f
, *t
, *options
;
1319 if (mode
!= VOLATILE_YES
)
1322 /* --volatile=yes means we mount a tmpfs to the root dir, and
1323 the original /usr to use inside it, and that read-only. */
1325 if (!mkdtemp(template))
1326 return log_error_errno(errno
, "Failed to create temporary directory: %m");
1328 options
= "mode=755";
1329 r
= tmpfs_patch_options(options
, userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &buf
);
1335 r
= mount_verbose(LOG_ERR
, "tmpfs", template, "tmpfs", MS_STRICTATIME
, options
);
1339 tmpfs_mounted
= true;
1341 f
= prefix_roota(directory
, "/usr");
1342 t
= prefix_roota(template, "/usr");
1345 if (r
< 0 && errno
!= EEXIST
) {
1346 r
= log_error_errno(errno
, "Failed to create %s: %m", t
);
1350 r
= mount_verbose(LOG_ERR
, f
, t
, NULL
, MS_BIND
|MS_REC
, NULL
);
1354 bind_mounted
= true;
1356 r
= bind_remount_recursive(t
, true, NULL
);
1358 log_error_errno(r
, "Failed to remount %s read-only: %m", t
);
1362 r
= mount_verbose(LOG_ERR
, template, directory
, NULL
, MS_MOVE
, NULL
);
1366 (void) rmdir(template);
1372 (void) umount_verbose(t
);
1375 (void) umount_verbose(template);
1376 (void) rmdir(template);
1380 /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
1381 int pivot_root_parse(char **pivot_root_new
, char **pivot_root_old
, const char *s
) {
1382 _cleanup_free_
char *root_new
= NULL
, *root_old
= NULL
;
1386 assert(pivot_root_new
);
1387 assert(pivot_root_old
);
1389 r
= extract_first_word(&p
, &root_new
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
1398 root_old
= strdup(p
);
1403 if (!path_is_absolute(root_new
))
1405 if (root_old
&& !path_is_absolute(root_old
))
1408 free_and_replace(*pivot_root_new
, root_new
);
1409 free_and_replace(*pivot_root_old
, root_old
);
1414 int setup_pivot_root(const char *directory
, const char *pivot_root_new
, const char *pivot_root_old
) {
1415 _cleanup_free_
char *directory_pivot_root_new
= NULL
;
1416 _cleanup_free_
char *pivot_tmp_pivot_root_old
= NULL
;
1417 char pivot_tmp
[] = "/tmp/nspawn-pivot-XXXXXX";
1418 bool remove_pivot_tmp
= false;
1423 if (!pivot_root_new
)
1426 /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1427 * If pivot_root_old is NULL, the existing / disappears.
1428 * This requires a temporary directory, pivot_tmp, which is
1429 * not a child of either.
1431 * This is typically used for OSTree-style containers, where
1432 * the root partition contains several sysroots which could be
1433 * run. Normally, one would be chosen by the bootloader and
1434 * pivoted to / by initramfs.
1436 * For example, for an OSTree deployment, pivot_root_new
1437 * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1438 * code doesn’t do the /var mount which OSTree expects: use
1439 * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1441 * So in the OSTree case, we’ll end up with something like:
1442 * - directory = /tmp/nspawn-root-123456
1443 * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1444 * - pivot_root_old = /sysroot
1445 * - directory_pivot_root_new =
1446 * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1447 * - pivot_tmp = /tmp/nspawn-pivot-123456
1448 * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1450 * Requires all file systems at directory and below to be mounted
1451 * MS_PRIVATE or MS_SLAVE so they can be moved.
1453 directory_pivot_root_new
= prefix_root(directory
, pivot_root_new
);
1455 /* Remount directory_pivot_root_new to make it movable. */
1456 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, directory_pivot_root_new
, NULL
, MS_BIND
, NULL
);
1460 if (pivot_root_old
) {
1461 if (!mkdtemp(pivot_tmp
)) {
1462 r
= log_error_errno(errno
, "Failed to create temporary directory: %m");
1466 remove_pivot_tmp
= true;
1467 pivot_tmp_pivot_root_old
= prefix_root(pivot_tmp
, pivot_root_old
);
1469 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, pivot_tmp
, NULL
, MS_MOVE
, NULL
);
1473 r
= mount_verbose(LOG_ERR
, directory
, pivot_tmp_pivot_root_old
, NULL
, MS_MOVE
, NULL
);
1477 r
= mount_verbose(LOG_ERR
, pivot_tmp
, directory
, NULL
, MS_MOVE
, NULL
);
1481 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, directory
, NULL
, MS_MOVE
, NULL
);
1487 if (remove_pivot_tmp
)
1488 (void) rmdir(pivot_tmp
);