1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2015 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21 #include <sys/mount.h>
22 #include <linux/magic.h>
24 #include "alloc-util.h"
31 #include "mount-util.h"
32 #include "nspawn-mount.h"
33 #include "parse-util.h"
34 #include "path-util.h"
37 #include "stat-util.h"
38 #include "string-util.h"
40 #include "user-util.h"
43 CustomMount
* custom_mount_add(CustomMount
**l
, unsigned *n
, CustomMountType t
) {
49 assert(t
< _CUSTOM_MOUNT_TYPE_MAX
);
51 c
= realloc_multiply(*l
, (*n
+ 1), sizeof(CustomMount
));
59 *ret
= (CustomMount
) { .type
= t
};
64 void custom_mount_free_all(CustomMount
*l
, unsigned n
) {
67 for (i
= 0; i
< n
; i
++) {
68 CustomMount
*m
= l
+ i
;
75 (void) rm_rf(m
->work_dir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
79 if (m
->rm_rf_tmpdir
) {
80 (void) rm_rf(m
->rm_rf_tmpdir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
81 free(m
->rm_rf_tmpdir
);
90 static int custom_mount_compare(const void *a
, const void *b
) {
91 const CustomMount
*x
= a
, *y
= b
;
94 r
= path_compare(x
->destination
, y
->destination
);
98 if (x
->type
< y
->type
)
100 if (x
->type
> y
->type
)
106 static bool source_path_is_valid(const char *p
) {
112 return path_is_absolute(p
);
115 static char *resolve_source_path(const char *dest
, const char *source
) {
120 if (source
[0] == '+')
121 return prefix_root(dest
, source
+ 1);
123 return strdup(source
);
126 int custom_mount_prepare_all(const char *dest
, CustomMount
*l
, unsigned n
) {
130 /* Prepare all custom mounts. This will make sure we know all temporary directories. This is called in the
131 * parent process, so that we know the temporary directories to remove on exit before we fork off the
136 /* Order the custom mounts, and make sure we have a working directory */
137 qsort_safe(l
, n
, sizeof(CustomMount
), custom_mount_compare
);
139 for (i
= 0; i
< n
; i
++) {
140 CustomMount
*m
= l
+ i
;
145 s
= resolve_source_path(dest
, m
->source
);
152 /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
154 m
->rm_rf_tmpdir
= strdup("/var/tmp/nspawn-temp-XXXXXX");
155 if (!m
->rm_rf_tmpdir
)
158 if (!mkdtemp(m
->rm_rf_tmpdir
)) {
159 m
->rm_rf_tmpdir
= mfree(m
->rm_rf_tmpdir
);
160 return log_error_errno(errno
, "Failed to acquire temporary directory: %m");
163 m
->source
= strjoin(m
->rm_rf_tmpdir
, "/src");
167 if (mkdir(m
->source
, 0755) < 0)
168 return log_error_errno(errno
, "Failed to create %s: %m", m
->source
);
171 if (m
->type
== CUSTOM_MOUNT_OVERLAY
) {
174 STRV_FOREACH(j
, m
->lower
) {
177 s
= resolve_source_path(dest
, *j
);
188 s
= resolve_source_path(dest
, m
->work_dir
);
197 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
199 return log_error_errno(r
, "Failed to acquire working directory: %m");
202 (void) mkdir_label(m
->work_dir
, 0700);
209 int bind_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
, bool read_only
) {
210 _cleanup_free_
char *source
= NULL
, *destination
= NULL
, *opts
= NULL
;
218 r
= extract_many_words(&p
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
, &source
, &destination
, NULL
);
224 destination
= strdup(source
[0] == '+' ? source
+1 : source
);
228 if (r
== 2 && !isempty(p
)) {
236 else if (!source_path_is_valid(source
))
239 if (!path_is_absolute(destination
))
242 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_BIND
);
247 m
->destination
= destination
;
248 m
->read_only
= read_only
;
251 source
= destination
= opts
= NULL
;
255 int tmpfs_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
) {
256 _cleanup_free_
char *path
= NULL
, *opts
= NULL
;
265 r
= extract_first_word(&p
, &path
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
272 opts
= strdup("mode=0755");
278 if (!path_is_absolute(path
))
281 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_TMPFS
);
285 m
->destination
= path
;
292 int overlay_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
, bool read_only
) {
293 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
294 _cleanup_strv_free_
char **lower
= NULL
;
298 k
= strv_split_extract(&lower
, s
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
302 return -EADDRNOTAVAIL
;
304 /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
305 * we'll also define the destination mount point the same as the upper. */
307 if (!source_path_is_valid(lower
[0]) ||
308 !source_path_is_valid(lower
[1]))
314 destination
= strdup(upper
[0] == '+' ? upper
+1 : upper
); /* take the destination without "+" prefix */
320 /* If more than two parameters are specified, the last one is the destination, the second to last one
321 * the "upper", and all before that the "lower" directories. */
323 destination
= lower
[k
- 1];
324 upper
= lower
[k
- 2];
327 STRV_FOREACH(i
, lower
)
328 if (!source_path_is_valid(*i
))
331 /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
335 else if (!source_path_is_valid(upper
))
338 if (!path_is_absolute(destination
))
342 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_OVERLAY
);
346 m
->destination
= destination
;
349 m
->read_only
= read_only
;
351 upper
= destination
= NULL
;
357 static int tmpfs_patch_options(
360 uid_t uid_shift
, uid_t uid_range
,
362 const char *selinux_apifs_context
,
367 if ((userns
&& uid_shift
!= 0) || patch_ids
) {
368 assert(uid_shift
!= UID_INVALID
);
370 if (asprintf(&buf
, "%s%suid=" UID_FMT
",gid=" UID_FMT
,
371 strempty(options
), options
? "," : "",
372 uid_shift
, uid_shift
) < 0)
379 if (selinux_apifs_context
) {
382 t
= strjoin(strempty(options
), options
? "," : "",
383 "context=\"", selinux_apifs_context
, "\"");
392 if (!buf
&& options
) {
393 buf
= strdup(options
);
402 int mount_sysfs(const char *dest
, MountSettingsMask mount_settings
) {
403 const char *full
, *top
, *x
;
405 unsigned long extra_flags
= 0;
407 top
= prefix_roota(dest
, "/sys");
408 r
= path_check_fstype(top
, SYSFS_MAGIC
);
410 return log_error_errno(r
, "Failed to determine filesystem type of %s: %m", top
);
411 /* /sys might already be mounted as sysfs by the outer child in the
412 * !netns case. In this case, it's all good. Don't touch it because we
413 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
418 full
= prefix_roota(top
, "/full");
420 (void) mkdir(full
, 0755);
422 if (mount_settings
& MOUNT_APPLY_APIVFS_RO
)
423 extra_flags
|= MS_RDONLY
;
425 r
= mount_verbose(LOG_ERR
, "sysfs", full
, "sysfs",
426 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|extra_flags
, NULL
);
430 FOREACH_STRING(x
, "block", "bus", "class", "dev", "devices", "kernel") {
431 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
433 from
= prefix_root(full
, x
);
437 to
= prefix_root(top
, x
);
441 (void) mkdir(to
, 0755);
443 r
= mount_verbose(LOG_ERR
, from
, to
, NULL
, MS_BIND
, NULL
);
447 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
448 MS_BIND
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
|extra_flags
, NULL
);
453 r
= umount_verbose(full
);
458 return log_error_errno(errno
, "Failed to remove %s: %m", full
);
460 /* Create the mount point for cgroups now. Otherwise we would not be allowed to create it later, since we
461 * remount /sys read-only.
463 if (cg_ns_supported()) {
464 x
= prefix_roota(top
, "/fs/cgroup");
465 (void) mkdir_p(x
, 0755);
468 return mount_verbose(LOG_ERR
, NULL
, top
, NULL
,
469 MS_BIND
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
|extra_flags
, NULL
);
472 static int mkdir_userns(const char *path
, mode_t mode
, MountSettingsMask mask
, uid_t uid_shift
) {
477 r
= mkdir(path
, mode
);
478 if (r
< 0 && errno
!= EEXIST
)
481 if ((mask
& MOUNT_USE_USERNS
) == 0)
484 if (mask
& MOUNT_IN_USERNS
)
487 r
= lchown(path
, uid_shift
, uid_shift
);
494 static int mkdir_userns_p(const char *prefix
, const char *path
, mode_t mode
, MountSettingsMask mask
, uid_t uid_shift
) {
500 if (prefix
&& !path_startswith(path
, prefix
))
503 /* create every parent directory in the path, except the last component */
504 p
= path
+ strspn(path
, "/");
506 char t
[strlen(path
) + 1];
508 e
= p
+ strcspn(p
, "/");
509 p
= e
+ strspn(e
, "/");
511 /* Is this the last component? If so, then we're done */
515 memcpy(t
, path
, e
- path
);
518 if (prefix
&& path_startswith(prefix
, t
))
521 r
= mkdir_userns(t
, mode
, mask
, uid_shift
);
526 return mkdir_userns(path
, mode
, mask
, uid_shift
);
529 int mount_all(const char *dest
,
530 MountSettingsMask mount_settings
,
531 uid_t uid_shift
, uid_t uid_range
,
532 const char *selinux_apifs_context
) {
534 typedef struct MountPoint
{
540 MountSettingsMask mount_settings
;
543 static const MountPoint mount_table
[] = {
544 /* inner child mounts */
545 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
|MOUNT_IN_USERNS
},
546 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
, MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* Bind mount first ... */
547 { "/proc/sys/net", "/proc/sys/net", NULL
, NULL
, MS_BIND
, MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
|MOUNT_APPLY_APIVFS_NETNS
}, /* (except for this) */
548 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* ... then, make it r/o */
549 { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL
, NULL
, MS_BIND
, MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* Bind mount first ... */
550 { NULL
, "/proc/sysrq-trigger", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* ... then, make it r/o */
552 /* outer child mounts */
553 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, MOUNT_FATAL
},
554 { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
|MOUNT_APPLY_APIVFS_NETNS
},
555 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
|MOUNT_APPLY_APIVFS_RO
}, /* skipped if above was mounted */
556 { "sysfs", "/sys", "sysfs", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, MOUNT_FATAL
}, /* skipped if above was mounted */
558 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, MOUNT_FATAL
},
559 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, MOUNT_FATAL
},
560 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, MOUNT_FATAL
},
562 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
, 0 }, /* Bind mount first */
563 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, 0 }, /* Then, make it r/o */
569 bool use_userns
= (mount_settings
& MOUNT_USE_USERNS
);
570 bool netns
= (mount_settings
& MOUNT_APPLY_APIVFS_NETNS
);
571 bool ro
= (mount_settings
& MOUNT_APPLY_APIVFS_RO
);
572 bool in_userns
= (mount_settings
& MOUNT_IN_USERNS
);
574 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
575 _cleanup_free_
char *where
= NULL
, *options
= NULL
;
577 bool fatal
= (mount_table
[k
].mount_settings
& MOUNT_FATAL
);
579 if (in_userns
!= (bool)(mount_table
[k
].mount_settings
& MOUNT_IN_USERNS
))
582 if (!netns
&& (bool)(mount_table
[k
].mount_settings
& MOUNT_APPLY_APIVFS_NETNS
))
585 if (!ro
&& (bool)(mount_table
[k
].mount_settings
& MOUNT_APPLY_APIVFS_RO
))
588 r
= chase_symlinks(mount_table
[k
].where
, dest
, CHASE_NONEXISTENT
|CHASE_PREFIX_ROOT
, &where
);
590 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, mount_table
[k
].where
);
592 r
= path_is_mount_point(where
, NULL
, 0);
593 if (r
< 0 && r
!= -ENOENT
)
594 return log_error_errno(r
, "Failed to detect whether %s is a mount point: %m", where
);
596 /* Skip this entry if it is not a remount. */
597 if (mount_table
[k
].what
&& r
> 0)
600 r
= mkdir_userns_p(dest
, where
, 0755, mount_settings
, uid_shift
);
601 if (r
< 0 && r
!= -EEXIST
) {
602 if (fatal
&& r
!= -EROFS
)
603 return log_error_errno(r
, "Failed to create directory %s: %m", where
);
605 log_debug_errno(r
, "Failed to create directory %s: %m", where
);
606 /* If we failed mkdir() or chown() due to the root
607 * directory being read only, attempt to mount this fs
608 * anyway and let mount_verbose log any errors */
613 o
= mount_table
[k
].options
;
614 if (streq_ptr(mount_table
[k
].type
, "tmpfs")) {
616 r
= tmpfs_patch_options(o
, use_userns
, 0, uid_range
, true, selinux_apifs_context
, &options
);
618 r
= tmpfs_patch_options(o
, use_userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &options
);
625 r
= mount_verbose(fatal
? LOG_ERR
: LOG_DEBUG
,
629 mount_table
[k
].flags
,
638 static int parse_mount_bind_options(const char *options
, unsigned long *mount_flags
, char **mount_opts
) {
639 const char *p
= options
;
640 unsigned long flags
= *mount_flags
;
647 _cleanup_free_
char *word
= NULL
;
649 r
= extract_first_word(&p
, &word
, ",", 0);
651 return log_error_errno(r
, "Failed to extract mount option: %m");
655 if (streq(word
, "rbind"))
657 else if (streq(word
, "norbind"))
660 log_error("Invalid bind mount option: %s", word
);
665 *mount_flags
= flags
;
666 /* in the future mount_opts will hold string options for mount(2) */
672 static int mount_bind(const char *dest
, CustomMount
*m
) {
674 _cleanup_free_
char *mount_opts
= NULL
, *where
= NULL
;
675 unsigned long mount_flags
= MS_BIND
| MS_REC
;
676 struct stat source_st
, dest_st
;
683 r
= parse_mount_bind_options(m
->options
, &mount_flags
, &mount_opts
);
688 if (stat(m
->source
, &source_st
) < 0)
689 return log_error_errno(errno
, "Failed to stat %s: %m", m
->source
);
691 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
);
693 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
694 if (r
> 0) { /* Path exists already? */
696 if (stat(where
, &dest_st
) < 0)
697 return log_error_errno(errno
, "Failed to stat %s: %m", where
);
699 if (S_ISDIR(source_st
.st_mode
) && !S_ISDIR(dest_st
.st_mode
)) {
700 log_error("Cannot bind mount directory %s on file %s.", m
->source
, where
);
704 if (!S_ISDIR(source_st
.st_mode
) && S_ISDIR(dest_st
.st_mode
)) {
705 log_error("Cannot bind mount file %s on directory %s.", m
->source
, where
);
709 } else { /* Path doesn't exist yet? */
710 r
= mkdir_parents_label(where
, 0755);
712 return log_error_errno(r
, "Failed to make parents of %s: %m", where
);
714 /* Create the mount point. Any non-directory file can be
715 * mounted on any non-directory file (regular, fifo, socket,
718 if (S_ISDIR(source_st
.st_mode
))
719 r
= mkdir_label(where
, 0755);
723 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
727 r
= mount_verbose(LOG_ERR
, m
->source
, where
, NULL
, mount_flags
, mount_opts
);
732 r
= bind_remount_recursive(where
, true, NULL
);
734 return log_error_errno(r
, "Read-only bind mount failed: %m");
740 static int mount_tmpfs(
743 bool userns
, uid_t uid_shift
, uid_t uid_range
,
744 const char *selinux_apifs_context
) {
747 _cleanup_free_
char *buf
= NULL
, *where
= NULL
;
753 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
);
755 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
756 if (r
== 0) { /* Doesn't exist yet? */
757 r
= mkdir_p_label(where
, 0755);
759 return log_error_errno(r
, "Creating mount point for tmpfs %s failed: %m", where
);
762 r
= tmpfs_patch_options(m
->options
, userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &buf
);
765 options
= r
> 0 ? buf
: m
->options
;
767 return mount_verbose(LOG_ERR
, "tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, options
);
770 static char *joined_and_escaped_lower_dirs(char **lower
) {
771 _cleanup_strv_free_
char **sv
= NULL
;
773 sv
= strv_copy(lower
);
779 if (!strv_shell_escape(sv
, ",:"))
782 return strv_join(sv
, ":");
785 static int mount_overlay(const char *dest
, CustomMount
*m
) {
787 _cleanup_free_
char *lower
= NULL
, *where
= NULL
, *escaped_source
= NULL
;
794 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
);
796 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
797 if (r
== 0) { /* Doesn't exist yet? */
798 r
= mkdir_label(where
, 0755);
800 return log_error_errno(r
, "Creating mount point for overlay %s failed: %m", where
);
803 (void) mkdir_p_label(m
->source
, 0755);
805 lower
= joined_and_escaped_lower_dirs(m
->lower
);
809 escaped_source
= shell_escape(m
->source
, ",:");
814 options
= strjoina("lowerdir=", escaped_source
, ":", lower
);
816 _cleanup_free_
char *escaped_work_dir
= NULL
;
818 escaped_work_dir
= shell_escape(m
->work_dir
, ",:");
819 if (!escaped_work_dir
)
822 options
= strjoina("lowerdir=", lower
, ",upperdir=", escaped_source
, ",workdir=", escaped_work_dir
);
825 return mount_verbose(LOG_ERR
, "overlay", where
, "overlay", m
->read_only
? MS_RDONLY
: 0, options
);
830 CustomMount
*mounts
, unsigned n
,
831 bool userns
, uid_t uid_shift
, uid_t uid_range
,
832 const char *selinux_apifs_context
) {
839 for (i
= 0; i
< n
; i
++) {
840 CustomMount
*m
= mounts
+ i
;
844 case CUSTOM_MOUNT_BIND
:
845 r
= mount_bind(dest
, m
);
848 case CUSTOM_MOUNT_TMPFS
:
849 r
= mount_tmpfs(dest
, m
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
852 case CUSTOM_MOUNT_OVERLAY
:
853 r
= mount_overlay(dest
, m
);
857 assert_not_reached("Unknown custom mount type");
867 /* Retrieve existing subsystems. This function is called in a new cgroup
870 static int get_process_controllers(Set
**ret
) {
871 _cleanup_set_free_free_ Set
*controllers
= NULL
;
872 _cleanup_fclose_
FILE *f
= NULL
;
877 controllers
= set_new(&string_hash_ops
);
881 f
= fopen("/proc/self/cgroup", "re");
883 return errno
== ENOENT
? -ESRCH
: -errno
;
886 _cleanup_free_
char *line
= NULL
;
889 r
= read_line(f
, LONG_LINE_MAX
, &line
);
895 l
= strchr(line
, ':');
906 if (STR_IN_SET(l
, "", "name=systemd", "name=unified"))
909 r
= set_put_strdup(controllers
, l
);
920 static int mount_legacy_cgroup_hierarchy(
922 const char *controller
,
923 const char *hierarchy
,
926 const char *to
, *fstype
, *opts
;
929 to
= strjoina(strempty(dest
), "/sys/fs/cgroup/", hierarchy
);
931 r
= path_is_mount_point(to
, dest
, 0);
932 if (r
< 0 && r
!= -ENOENT
)
933 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
939 /* The superblock mount options of the mount point need to be
940 * identical to the host's, and hence writable... */
941 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
)) {
944 } else if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
)) {
946 opts
= "none,name=systemd,xattr";
952 r
= mount_verbose(LOG_ERR
, "cgroup", to
, fstype
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, opts
);
956 /* ... hence let's only make the bind mount read-only, not the superblock. */
958 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
959 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
967 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
968 static int mount_legacy_cgns_supported(
970 CGroupUnified unified_requested
,
974 const char *selinux_apifs_context
) {
976 _cleanup_set_free_free_ Set
*controllers
= NULL
;
977 const char *cgroup_root
= "/sys/fs/cgroup", *c
;
980 (void) mkdir_p(cgroup_root
, 0755);
982 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
983 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
985 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
987 _cleanup_free_
char *options
= NULL
;
989 /* When cgroup namespaces are enabled and user namespaces are
990 * used then the mount of the cgroupfs is done *inside* the new
991 * user namespace. We're root in the new user namespace and the
992 * kernel will happily translate our uid/gid to the correct
993 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
994 * pass uid 0 and not uid_shift to tmpfs_patch_options().
996 r
= tmpfs_patch_options("mode=755", userns
, 0, uid_range
, true, selinux_apifs_context
, &options
);
1000 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
1001 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
1006 r
= cg_all_unified();
1010 goto skip_controllers
;
1012 r
= get_process_controllers(&controllers
);
1014 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
1017 _cleanup_free_
const char *controller
= NULL
;
1019 controller
= set_steal_first(controllers
);
1023 r
= mount_legacy_cgroup_hierarchy("", controller
, controller
, !userns
);
1027 /* When multiple hierarchies are co-mounted, make their
1028 * constituting individual hierarchies a symlink to the
1033 _cleanup_free_
char *target
= NULL
, *tok
= NULL
;
1035 r
= extract_first_word(&c
, &tok
, ",", 0);
1037 return log_error_errno(r
, "Failed to extract co-mounted cgroup controller: %m");
1041 if (streq(controller
, tok
))
1044 target
= prefix_root("/sys/fs/cgroup/", tok
);
1048 r
= symlink_idempotent(controller
, target
);
1050 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
1052 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
1057 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
1058 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
1063 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
1068 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
1069 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
1074 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
1075 static int mount_legacy_cgns_unsupported(
1077 CGroupUnified unified_requested
,
1081 const char *selinux_apifs_context
) {
1083 _cleanup_set_free_free_ Set
*controllers
= NULL
;
1084 const char *cgroup_root
;
1087 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
1089 (void) mkdir_p(cgroup_root
, 0755);
1091 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
1092 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
1094 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
1096 _cleanup_free_
char *options
= NULL
;
1098 r
= tmpfs_patch_options("mode=755", userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &options
);
1102 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
1103 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
1108 r
= cg_all_unified();
1112 goto skip_controllers
;
1114 r
= cg_kernel_controllers(&controllers
);
1116 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
1119 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
1121 controller
= set_steal_first(controllers
);
1125 origin
= prefix_root("/sys/fs/cgroup/", controller
);
1129 r
= readlink_malloc(origin
, &combined
);
1131 /* Not a symbolic link, but directly a single cgroup hierarchy */
1133 r
= mount_legacy_cgroup_hierarchy(dest
, controller
, controller
, true);
1138 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
1140 _cleanup_free_
char *target
= NULL
;
1142 target
= prefix_root(dest
, origin
);
1146 /* A symbolic link, a combination of controllers in one hierarchy */
1148 if (!filename_is_valid(combined
)) {
1149 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
1153 r
= mount_legacy_cgroup_hierarchy(dest
, combined
, combined
, true);
1157 r
= symlink_idempotent(combined
, target
);
1159 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
1161 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
1166 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
1167 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
1172 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
1176 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
1177 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
1180 static int mount_unified_cgroups(const char *dest
) {
1186 p
= prefix_roota(dest
, "/sys/fs/cgroup");
1188 (void) mkdir_p(p
, 0755);
1190 r
= path_is_mount_point(p
, dest
, AT_SYMLINK_FOLLOW
);
1192 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", p
);
1194 p
= prefix_roota(dest
, "/sys/fs/cgroup/cgroup.procs");
1195 if (access(p
, F_OK
) >= 0)
1197 if (errno
!= ENOENT
)
1198 return log_error_errno(errno
, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p
);
1200 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p
);
1204 return mount_verbose(LOG_ERR
, "cgroup", p
, "cgroup2", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
1209 CGroupUnified unified_requested
,
1213 const char *selinux_apifs_context
,
1216 if (unified_requested
>= CGROUP_UNIFIED_ALL
)
1217 return mount_unified_cgroups(dest
);
1219 return mount_legacy_cgns_supported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
1221 return mount_legacy_cgns_unsupported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
1224 static int mount_systemd_cgroup_writable_one(const char *root
, const char *own
) {
1230 /* Make our own cgroup a (writable) bind mount */
1231 r
= mount_verbose(LOG_ERR
, own
, own
, NULL
, MS_BIND
, NULL
);
1235 /* And then remount the systemd cgroup root read-only */
1236 return mount_verbose(LOG_ERR
, NULL
, root
, NULL
,
1237 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
1240 int mount_systemd_cgroup_writable(
1242 CGroupUnified unified_requested
) {
1244 _cleanup_free_
char *own_cgroup_path
= NULL
;
1245 const char *root
, *own
;
1250 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
1252 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
1254 /* If we are living in the top-level, then there's nothing to do... */
1255 if (path_equal(own_cgroup_path
, "/"))
1258 if (unified_requested
>= CGROUP_UNIFIED_ALL
) {
1260 root
= prefix_roota(dest
, "/sys/fs/cgroup");
1261 own
= strjoina(root
, own_cgroup_path
);
1265 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
1266 root
= prefix_roota(dest
, "/sys/fs/cgroup/unified");
1267 own
= strjoina(root
, own_cgroup_path
);
1269 r
= mount_systemd_cgroup_writable_one(root
, own
);
1274 root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
1275 own
= strjoina(root
, own_cgroup_path
);
1278 return mount_systemd_cgroup_writable_one(root
, own
);
1281 int setup_volatile_state(
1282 const char *directory
,
1284 bool userns
, uid_t uid_shift
, uid_t uid_range
,
1285 const char *selinux_apifs_context
) {
1287 _cleanup_free_
char *buf
= NULL
;
1288 const char *p
, *options
;
1293 if (mode
!= VOLATILE_STATE
)
1296 /* --volatile=state means we simply overmount /var
1297 with a tmpfs, and the rest read-only. */
1299 r
= bind_remount_recursive(directory
, true, NULL
);
1301 return log_error_errno(r
, "Failed to remount %s read-only: %m", directory
);
1303 p
= prefix_roota(directory
, "/var");
1305 if (r
< 0 && errno
!= EEXIST
)
1306 return log_error_errno(errno
, "Failed to create %s: %m", directory
);
1308 options
= "mode=755";
1309 r
= tmpfs_patch_options(options
, userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &buf
);
1315 return mount_verbose(LOG_ERR
, "tmpfs", p
, "tmpfs", MS_STRICTATIME
, options
);
1319 const char *directory
,
1321 bool userns
, uid_t uid_shift
, uid_t uid_range
,
1322 const char *selinux_apifs_context
) {
1324 bool tmpfs_mounted
= false, bind_mounted
= false;
1325 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1326 _cleanup_free_
char *buf
= NULL
;
1327 const char *f
, *t
, *options
;
1332 if (mode
!= VOLATILE_YES
)
1335 /* --volatile=yes means we mount a tmpfs on the root dir, and
1336 bind mount the original /usr into it, read-only. */
1338 if (!mkdtemp(template))
1339 return log_error_errno(errno
, "Failed to create temporary directory: %m");
1341 options
= "mode=755";
1342 r
= tmpfs_patch_options(options
, userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &buf
);
1348 r
= mount_verbose(LOG_ERR
, "tmpfs", template, "tmpfs", MS_STRICTATIME
, options
);
1352 tmpfs_mounted
= true;
1354 f
= prefix_roota(directory
, "/usr");
1355 t
= prefix_roota(template, "/usr");
1358 if (r
< 0 && errno
!= EEXIST
) {
1359 r
= log_error_errno(errno
, "Failed to create %s: %m", t
);
1363 r
= mount_verbose(LOG_ERR
, f
, t
, NULL
, MS_BIND
|MS_REC
, NULL
);
1367 bind_mounted
= true;
1369 r
= bind_remount_recursive(t
, true, NULL
);
1371 log_error_errno(r
, "Failed to remount %s read-only: %m", t
);
1375 r
= mount_verbose(LOG_ERR
, template, directory
, NULL
, MS_MOVE
, NULL
);
1379 (void) rmdir(template);
1385 (void) umount_verbose(t
);
1388 (void) umount_verbose(template);
1389 (void) rmdir(template);
1393 /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
1394 int pivot_root_parse(char **pivot_root_new
, char **pivot_root_old
, const char *s
) {
1395 _cleanup_free_
char *root_new
= NULL
, *root_old
= NULL
;
1399 assert(pivot_root_new
);
1400 assert(pivot_root_old
);
1402 r
= extract_first_word(&p
, &root_new
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
1411 root_old
= strdup(p
);
1416 if (!path_is_absolute(root_new
))
1418 if (root_old
&& !path_is_absolute(root_old
))
1421 free_and_replace(*pivot_root_new
, root_new
);
1422 free_and_replace(*pivot_root_old
, root_old
);
1427 int setup_pivot_root(const char *directory
, const char *pivot_root_new
, const char *pivot_root_old
) {
1428 _cleanup_free_
char *directory_pivot_root_new
= NULL
;
1429 _cleanup_free_
char *pivot_tmp_pivot_root_old
= NULL
;
1430 char pivot_tmp
[] = "/tmp/nspawn-pivot-XXXXXX";
1431 bool remove_pivot_tmp
= false;
1436 if (!pivot_root_new
)
1439 /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1440 * If pivot_root_old is NULL, the existing / disappears.
1441 * This requires a temporary directory, pivot_tmp, which is
1442 * not a child of either.
1444 * This is typically used for OSTree-style containers, where
1445 * the root partition contains several sysroots which could be
1446 * run. Normally, one would be chosen by the bootloader and
1447 * pivoted to / by initramfs.
1449 * For example, for an OSTree deployment, pivot_root_new
1450 * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1451 * code doesn’t do the /var mount which OSTree expects: use
1452 * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1454 * So in the OSTree case, we’ll end up with something like:
1455 * - directory = /tmp/nspawn-root-123456
1456 * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1457 * - pivot_root_old = /sysroot
1458 * - directory_pivot_root_new =
1459 * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1460 * - pivot_tmp = /tmp/nspawn-pivot-123456
1461 * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1463 * Requires all file systems at directory and below to be mounted
1464 * MS_PRIVATE or MS_SLAVE so they can be moved.
1466 directory_pivot_root_new
= prefix_root(directory
, pivot_root_new
);
1468 /* Remount directory_pivot_root_new to make it movable. */
1469 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, directory_pivot_root_new
, NULL
, MS_BIND
, NULL
);
1473 if (pivot_root_old
) {
1474 if (!mkdtemp(pivot_tmp
)) {
1475 r
= log_error_errno(errno
, "Failed to create temporary directory: %m");
1479 remove_pivot_tmp
= true;
1480 pivot_tmp_pivot_root_old
= prefix_root(pivot_tmp
, pivot_root_old
);
1482 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, pivot_tmp
, NULL
, MS_MOVE
, NULL
);
1486 r
= mount_verbose(LOG_ERR
, directory
, pivot_tmp_pivot_root_old
, NULL
, MS_MOVE
, NULL
);
1490 r
= mount_verbose(LOG_ERR
, pivot_tmp
, directory
, NULL
, MS_MOVE
, NULL
);
1494 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, directory
, NULL
, MS_MOVE
, NULL
);
1500 if (remove_pivot_tmp
)
1501 (void) rmdir(pivot_tmp
);