1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2015 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
22 #include <sys/mount.h>
27 #include "path-util.h"
31 #include "cgroup-util.h"
33 #include "nspawn-mount.h"
35 CustomMount
* custom_mount_add(CustomMount
**l
, unsigned *n
, CustomMountType t
) {
41 assert(t
< _CUSTOM_MOUNT_TYPE_MAX
);
43 c
= realloc(*l
, (*n
+ 1) * sizeof(CustomMount
));
51 *ret
= (CustomMount
) { .type
= t
};
56 void custom_mount_free_all(CustomMount
*l
, unsigned n
) {
59 for (i
= 0; i
< n
; i
++) {
60 CustomMount
*m
= l
+ i
;
67 (void) rm_rf(m
->work_dir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
77 int custom_mount_compare(const void *a
, const void *b
) {
78 const CustomMount
*x
= a
, *y
= b
;
81 r
= path_compare(x
->destination
, y
->destination
);
85 if (x
->type
< y
->type
)
87 if (x
->type
> y
->type
)
93 int bind_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
, bool read_only
) {
94 _cleanup_free_
char *source
= NULL
, *destination
= NULL
, *opts
= NULL
;
102 r
= extract_many_words(&p
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
, &source
, &destination
, NULL
);
109 destination
= strdup(source
);
114 if (r
== 2 && !isempty(p
)) {
120 if (!path_is_absolute(source
))
123 if (!path_is_absolute(destination
))
126 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_BIND
);
131 m
->destination
= destination
;
132 m
->read_only
= read_only
;
135 source
= destination
= opts
= NULL
;
139 int tmpfs_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
) {
140 _cleanup_free_
char *path
= NULL
, *opts
= NULL
;
149 r
= extract_first_word(&p
, &path
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
156 opts
= strdup("mode=0755");
162 if (!path_is_absolute(path
))
165 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_TMPFS
);
169 m
->destination
= path
;
176 static int tmpfs_patch_options(
178 bool userns
, uid_t uid_shift
, uid_t uid_range
,
179 const char *selinux_apifs_context
,
184 if (userns
&& uid_shift
!= 0) {
185 assert(uid_shift
!= UID_INVALID
);
188 (void) asprintf(&buf
, "%s,uid=" UID_FMT
",gid=" UID_FMT
, options
, uid_shift
, uid_shift
);
190 (void) asprintf(&buf
, "uid=" UID_FMT
",gid=" UID_FMT
, uid_shift
, uid_shift
);
198 if (selinux_apifs_context
) {
202 t
= strjoin(options
, ",context=\"", selinux_apifs_context
, "\"", NULL
);
204 t
= strjoin("context=\"", selinux_apifs_context
, "\"", NULL
);
219 int mount_all(const char *dest
,
220 bool use_userns
, bool in_userns
,
221 uid_t uid_shift
, uid_t uid_range
,
222 const char *selinux_apifs_context
) {
224 typedef struct MountPoint
{
234 static const MountPoint mount_table
[] = {
235 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, true },
236 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
, true, true }, /* Bind mount first */
237 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, true, true }, /* Then, make it r/o */
238 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, false },
239 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, true, false },
240 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
241 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
242 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME
, true, false },
244 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
, false, false }, /* Bind mount first */
245 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, false, false }, /* Then, make it r/o */
252 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
253 _cleanup_free_
char *where
= NULL
, *options
= NULL
;
256 if (in_userns
!= mount_table
[k
].userns
)
259 where
= prefix_root(dest
, mount_table
[k
].where
);
263 r
= path_is_mount_point(where
, AT_SYMLINK_FOLLOW
);
264 if (r
< 0 && r
!= -ENOENT
)
265 return log_error_errno(r
, "Failed to detect whether %s is a mount point: %m", where
);
267 /* Skip this entry if it is not a remount. */
268 if (mount_table
[k
].what
&& r
> 0)
271 r
= mkdir_p(where
, 0755);
273 if (mount_table
[k
].fatal
)
274 return log_error_errno(r
, "Failed to create directory %s: %m", where
);
276 log_warning_errno(r
, "Failed to create directory %s: %m", where
);
280 o
= mount_table
[k
].options
;
281 if (streq_ptr(mount_table
[k
].type
, "tmpfs")) {
282 r
= tmpfs_patch_options(o
, use_userns
, uid_shift
, uid_range
, selinux_apifs_context
, &options
);
289 if (mount(mount_table
[k
].what
,
292 mount_table
[k
].flags
,
295 if (mount_table
[k
].fatal
)
296 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
298 log_warning_errno(errno
, "mount(%s) failed, ignoring: %m", where
);
305 static int parse_mount_bind_options(const char *options
, unsigned long *mount_flags
, char **mount_opts
) {
306 const char *p
= options
;
307 unsigned long flags
= *mount_flags
;
313 _cleanup_free_
char *word
= NULL
;
314 int r
= extract_first_word(&p
, &word
, ",", 0);
316 return log_error_errno(r
, "Failed to extract mount option: %m");
320 if (streq(word
, "rbind"))
322 else if (streq(word
, "norbind"))
325 log_error("Invalid bind mount option: %s", word
);
330 *mount_flags
= flags
;
331 /* in the future mount_opts will hold string options for mount(2) */
337 static int mount_bind(const char *dest
, CustomMount
*m
) {
338 struct stat source_st
, dest_st
;
340 unsigned long mount_flags
= MS_BIND
| MS_REC
;
341 _cleanup_free_
char *mount_opts
= NULL
;
347 r
= parse_mount_bind_options(m
->options
, &mount_flags
, &mount_opts
);
352 if (stat(m
->source
, &source_st
) < 0)
353 return log_error_errno(errno
, "Failed to stat %s: %m", m
->source
);
355 where
= prefix_roota(dest
, m
->destination
);
357 if (stat(where
, &dest_st
) >= 0) {
358 if (S_ISDIR(source_st
.st_mode
) && !S_ISDIR(dest_st
.st_mode
)) {
359 log_error("Cannot bind mount directory %s on file %s.", m
->source
, where
);
363 if (!S_ISDIR(source_st
.st_mode
) && S_ISDIR(dest_st
.st_mode
)) {
364 log_error("Cannot bind mount file %s on directory %s.", m
->source
, where
);
368 } else if (errno
== ENOENT
) {
369 r
= mkdir_parents_label(where
, 0755);
371 return log_error_errno(r
, "Failed to make parents of %s: %m", where
);
373 log_error_errno(errno
, "Failed to stat %s: %m", where
);
377 /* Create the mount point. Any non-directory file can be
378 * mounted on any non-directory file (regular, fifo, socket,
381 if (S_ISDIR(source_st
.st_mode
))
382 r
= mkdir_label(where
, 0755);
385 if (r
< 0 && r
!= -EEXIST
)
386 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
388 if (mount(m
->source
, where
, NULL
, mount_flags
, mount_opts
) < 0)
389 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
392 r
= bind_remount_recursive(where
, true);
394 return log_error_errno(r
, "Read-only bind mount failed: %m");
400 static int mount_tmpfs(
403 bool userns
, uid_t uid_shift
, uid_t uid_range
,
404 const char *selinux_apifs_context
) {
406 const char *where
, *options
;
407 _cleanup_free_
char *buf
= NULL
;
413 where
= prefix_roota(dest
, m
->destination
);
415 r
= mkdir_p_label(where
, 0755);
416 if (r
< 0 && r
!= -EEXIST
)
417 return log_error_errno(r
, "Creating mount point for tmpfs %s failed: %m", where
);
419 r
= tmpfs_patch_options(m
->options
, userns
, uid_shift
, uid_range
, selinux_apifs_context
, &buf
);
422 options
= r
> 0 ? buf
: m
->options
;
424 if (mount("tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, options
) < 0)
425 return log_error_errno(errno
, "tmpfs mount to %s failed: %m", where
);
430 static char *joined_and_escaped_lower_dirs(char * const *lower
) {
431 _cleanup_strv_free_
char **sv
= NULL
;
433 sv
= strv_copy(lower
);
439 if (!strv_shell_escape(sv
, ",:"))
442 return strv_join(sv
, ":");
445 static int mount_overlay(const char *dest
, CustomMount
*m
) {
446 _cleanup_free_
char *lower
= NULL
;
447 const char *where
, *options
;
453 where
= prefix_roota(dest
, m
->destination
);
455 r
= mkdir_label(where
, 0755);
456 if (r
< 0 && r
!= -EEXIST
)
457 return log_error_errno(r
, "Creating mount point for overlay %s failed: %m", where
);
459 (void) mkdir_p_label(m
->source
, 0755);
461 lower
= joined_and_escaped_lower_dirs(m
->lower
);
466 _cleanup_free_
char *escaped_source
= NULL
;
468 escaped_source
= shell_escape(m
->source
, ",:");
472 options
= strjoina("lowerdir=", escaped_source
, ":", lower
);
474 _cleanup_free_
char *escaped_source
= NULL
, *escaped_work_dir
= NULL
;
477 (void) mkdir_label(m
->work_dir
, 0700);
479 escaped_source
= shell_escape(m
->source
, ",:");
482 escaped_work_dir
= shell_escape(m
->work_dir
, ",:");
483 if (!escaped_work_dir
)
486 options
= strjoina("lowerdir=", lower
, ",upperdir=", escaped_source
, ",workdir=", escaped_work_dir
);
489 if (mount("overlay", where
, "overlay", m
->read_only
? MS_RDONLY
: 0, options
) < 0)
490 return log_error_errno(errno
, "overlay mount to %s failed: %m", where
);
497 CustomMount
*mounts
, unsigned n
,
498 bool userns
, uid_t uid_shift
, uid_t uid_range
,
499 const char *selinux_apifs_context
) {
506 for (i
= 0; i
< n
; i
++) {
507 CustomMount
*m
= mounts
+ i
;
511 case CUSTOM_MOUNT_BIND
:
512 r
= mount_bind(dest
, m
);
515 case CUSTOM_MOUNT_TMPFS
:
516 r
= mount_tmpfs(dest
, m
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
519 case CUSTOM_MOUNT_OVERLAY
:
520 r
= mount_overlay(dest
, m
);
524 assert_not_reached("Unknown custom mount type");
534 static int mount_legacy_cgroup_hierarchy(const char *dest
, const char *controller
, const char *hierarchy
, bool read_only
) {
538 to
= strjoina(strempty(dest
), "/sys/fs/cgroup/", hierarchy
);
540 r
= path_is_mount_point(to
, 0);
541 if (r
< 0 && r
!= -ENOENT
)
542 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
548 /* The superblock mount options of the mount point need to be
549 * identical to the hosts', and hence writable... */
550 if (mount("cgroup", to
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, controller
) < 0)
551 return log_error_errno(errno
, "Failed to mount to %s: %m", to
);
553 /* ... hence let's only make the bind mount read-only, not the
556 if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
557 return log_error_errno(errno
, "Failed to remount %s read-only: %m", to
);
562 static int mount_legacy_cgroups(
564 bool userns
, uid_t uid_shift
, uid_t uid_range
,
565 const char *selinux_apifs_context
) {
567 _cleanup_set_free_free_ Set
*controllers
= NULL
;
568 const char *cgroup_root
;
571 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
573 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
574 r
= path_is_mount_point(cgroup_root
, AT_SYMLINK_FOLLOW
);
576 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
578 _cleanup_free_
char *options
= NULL
;
580 r
= tmpfs_patch_options("mode=755", userns
, uid_shift
, uid_range
, selinux_apifs_context
, &options
);
584 if (mount("tmpfs", cgroup_root
, "tmpfs", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
) < 0)
585 return log_error_errno(errno
, "Failed to mount /sys/fs/cgroup: %m");
588 if (cg_unified() > 0)
589 goto skip_controllers
;
591 controllers
= set_new(&string_hash_ops
);
595 r
= cg_kernel_controllers(controllers
);
597 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
600 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
602 controller
= set_steal_first(controllers
);
606 origin
= prefix_root("/sys/fs/cgroup/", controller
);
610 r
= readlink_malloc(origin
, &combined
);
612 /* Not a symbolic link, but directly a single cgroup hierarchy */
614 r
= mount_legacy_cgroup_hierarchy(dest
, controller
, controller
, true);
619 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
621 _cleanup_free_
char *target
= NULL
;
623 target
= prefix_root(dest
, origin
);
627 /* A symbolic link, a combination of controllers in one hierarchy */
629 if (!filename_is_valid(combined
)) {
630 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
634 r
= mount_legacy_cgroup_hierarchy(dest
, combined
, combined
, true);
638 r
= symlink_idempotent(combined
, target
);
640 log_error("Invalid existing symlink for combined hierarchy");
644 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
649 r
= mount_legacy_cgroup_hierarchy(dest
, "none,name=systemd,xattr", "systemd", false);
653 if (mount(NULL
, cgroup_root
, NULL
, MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755") < 0)
654 return log_error_errno(errno
, "Failed to remount %s read-only: %m", cgroup_root
);
659 static int mount_unified_cgroups(const char *dest
) {
665 p
= strjoina(dest
, "/sys/fs/cgroup");
667 r
= path_is_mount_point(p
, AT_SYMLINK_FOLLOW
);
669 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", p
);
671 p
= strjoina(dest
, "/sys/fs/cgroup/cgroup.procs");
672 if (access(p
, F_OK
) >= 0)
675 return log_error_errno(errno
, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p
);
677 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p
);
681 if (mount("cgroup", p
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "__DEVEL__sane_behavior") < 0)
682 return log_error_errno(errno
, "Failed to mount unified cgroup hierarchy to %s: %m", p
);
689 bool unified_requested
,
690 bool userns
, uid_t uid_shift
, uid_t uid_range
,
691 const char *selinux_apifs_context
) {
693 if (unified_requested
)
694 return mount_unified_cgroups(dest
);
696 return mount_legacy_cgroups(dest
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
699 int mount_systemd_cgroup_writable(
701 bool unified_requested
) {
703 _cleanup_free_
char *own_cgroup_path
= NULL
;
704 const char *systemd_root
, *systemd_own
;
709 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
711 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
713 /* If we are living in the top-level, then there's nothing to do... */
714 if (path_equal(own_cgroup_path
, "/"))
717 if (unified_requested
) {
718 systemd_own
= strjoina(dest
, "/sys/fs/cgroup", own_cgroup_path
);
719 systemd_root
= prefix_roota(dest
, "/sys/fs/cgroup");
721 systemd_own
= strjoina(dest
, "/sys/fs/cgroup/systemd", own_cgroup_path
);
722 systemd_root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
725 /* Make our own cgroup a (writable) bind mount */
726 if (mount(systemd_own
, systemd_own
, NULL
, MS_BIND
, NULL
) < 0)
727 return log_error_errno(errno
, "Failed to turn %s into a bind mount: %m", own_cgroup_path
);
729 /* And then remount the systemd cgroup root read-only */
730 if (mount(NULL
, systemd_root
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
731 return log_error_errno(errno
, "Failed to mount cgroup root read-only: %m");
736 int setup_volatile_state(
737 const char *directory
,
739 bool userns
, uid_t uid_shift
, uid_t uid_range
,
740 const char *selinux_apifs_context
) {
742 _cleanup_free_
char *buf
= NULL
;
743 const char *p
, *options
;
748 if (mode
!= VOLATILE_STATE
)
751 /* --volatile=state means we simply overmount /var
752 with a tmpfs, and the rest read-only. */
754 r
= bind_remount_recursive(directory
, true);
756 return log_error_errno(r
, "Failed to remount %s read-only: %m", directory
);
758 p
= prefix_roota(directory
, "/var");
760 if (r
< 0 && errno
!= EEXIST
)
761 return log_error_errno(errno
, "Failed to create %s: %m", directory
);
763 options
= "mode=755";
764 r
= tmpfs_patch_options(options
, userns
, uid_shift
, uid_range
, selinux_apifs_context
, &buf
);
770 if (mount("tmpfs", p
, "tmpfs", MS_STRICTATIME
, options
) < 0)
771 return log_error_errno(errno
, "Failed to mount tmpfs to /var: %m");
777 const char *directory
,
779 bool userns
, uid_t uid_shift
, uid_t uid_range
,
780 const char *selinux_apifs_context
) {
782 bool tmpfs_mounted
= false, bind_mounted
= false;
783 char template[] = "/tmp/nspawn-volatile-XXXXXX";
784 _cleanup_free_
char *buf
= NULL
;
785 const char *f
, *t
, *options
;
790 if (mode
!= VOLATILE_YES
)
793 /* --volatile=yes means we mount a tmpfs to the root dir, and
794 the original /usr to use inside it, and that read-only. */
796 if (!mkdtemp(template))
797 return log_error_errno(errno
, "Failed to create temporary directory: %m");
799 options
= "mode=755";
800 r
= tmpfs_patch_options(options
, userns
, uid_shift
, uid_range
, selinux_apifs_context
, &buf
);
806 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME
, options
) < 0) {
807 r
= log_error_errno(errno
, "Failed to mount tmpfs for root directory: %m");
811 tmpfs_mounted
= true;
813 f
= prefix_roota(directory
, "/usr");
814 t
= prefix_roota(template, "/usr");
817 if (r
< 0 && errno
!= EEXIST
) {
818 r
= log_error_errno(errno
, "Failed to create %s: %m", t
);
822 if (mount(f
, t
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0) {
823 r
= log_error_errno(errno
, "Failed to create /usr bind mount: %m");
829 r
= bind_remount_recursive(t
, true);
831 log_error_errno(r
, "Failed to remount %s read-only: %m", t
);
835 if (mount(template, directory
, NULL
, MS_MOVE
, NULL
) < 0) {
836 r
= log_error_errno(errno
, "Failed to move root mount: %m");
840 (void) rmdir(template);
849 (void) umount(template);
850 (void) rmdir(template);
854 VolatileMode
volatile_mode_from_string(const char *s
) {
858 return _VOLATILE_MODE_INVALID
;
860 b
= parse_boolean(s
);
866 if (streq(s
, "state"))
867 return VOLATILE_STATE
;
869 return _VOLATILE_MODE_INVALID
;