1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2015 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
22 #include <sys/mount.h>
27 #include "path-util.h"
31 #include "cgroup-util.h"
33 #include "nspawn-mount.h"
/* Appends one entry to the CustomMount array *l, which holds *n entries.
 * The visible code asserts that t is below _CUSTOM_MOUNT_TYPE_MAX, asks
 * realloc() for room for (*n + 1) elements and stores a CustomMount
 * compound literal whose only explicitly set member is .type = t into
 * *ret.
 * NOTE(review): the local declarations, any handling of a failed
 * realloc(), the update of *l / *n and the return statement are not
 * present in this view -- confirm against the complete file. */
35 CustomMount
* custom_mount_add(CustomMount
**l
, unsigned *n
, CustomMountType t
) {
41 assert(t
< _CUSTOM_MOUNT_TYPE_MAX
);
43 c
= realloc(*l
, (*n
+ 1) * sizeof(CustomMount
));
51 *ret
= (CustomMount
) { .type
= t
};
/* Releases an array of n CustomMount entries starting at l.
 * The visible loop visits every element m = l + i and removes m->work_dir
 * from disk with rm_rf(REMOVE_ROOT|REMOVE_PHYSICAL); the result of rm_rf()
 * is deliberately discarded via the (void) cast.
 * NOTE(review): the statements that free the individual members, the
 * condition (if any) guarding the rm_rf() call and the release of the
 * array itself are not present in this view -- confirm. */
56 void custom_mount_free_all(CustomMount
*l
, unsigned n
) {
59 for (i
= 0; i
< n
; i
++) {
60 CustomMount
*m
= l
+ i
;
67 (void) rm_rf(m
->work_dir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
/* Comparison callback over two CustomMount objects handed in as
 * const void * (presumably for qsort(); verify against the caller).
 * The visible code first compares the two destinations with
 * path_compare() and afterwards compares the .type members with < and >.
 * NOTE(review): the values returned for each case are not present in this
 * view, so the exact ordering cannot be stated from here. */
77 int custom_mount_compare(const void *a
, const void *b
) {
78 const CustomMount
*x
= a
, *y
= b
;
81 r
= path_compare(x
->destination
, y
->destination
);
85 if (x
->type
< y
->type
)
87 if (x
->type
> y
->type
)
/* Parses a bind mount specification string s and appends the result to
 * the CustomMount array *l / *n as a CUSTOM_MOUNT_BIND entry.
 *
 * Visible steps:
 *  - the string is split on ":" with extract_many_words()
 *    (EXTRACT_DONT_COALESCE_SEPARATORS) into source and destination;
 *  - destination may be filled with a strdup() copy of source
 *    (presumably when only one field was given -- the guarding condition
 *    is not visible);
 *  - a case "two words extracted and input p not yet empty" is handled
 *    separately (presumably the option string; body not visible);
 *  - both source and destination are tested with path_is_absolute();
 *  - the new entry receives destination and the read_only argument.
 *
 * source, destination and opts are _cleanup_free_ locals; setting all
 * three to NULL at the end keeps the cleanup handler from freeing strings
 * that, as far as visible for destination, are now referenced by the
 * entry.
 * NOTE(review): error returns and the assignments of m->source and
 * m->options are not present in this view -- confirm. */
93 int bind_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
, bool read_only
) {
94 _cleanup_free_
char *source
= NULL
, *destination
= NULL
, *opts
= NULL
;
102 r
= extract_many_words(&p
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
, &source
, &destination
, NULL
);
109 destination
= strdup(source
);
114 if (r
== 2 && !isempty(p
)) {
120 if (!path_is_absolute(source
))
123 if (!path_is_absolute(destination
))
126 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_BIND
);
131 m
->destination
= destination
;
132 m
->read_only
= read_only
;
135 source
= destination
= opts
= NULL
;
/* Parses a tmpfs mount specification string s and appends the result to
 * the CustomMount array *l / *n as a CUSTOM_MOUNT_TMPFS entry.
 *
 * Visible steps: the first ":"-separated word is extracted into path
 * (EXTRACT_DONT_COALESCE_SEPARATORS); opts may be set to a copy of
 * "mode=0755" (presumably the default when no options follow -- the
 * guarding condition is not visible); path is tested with
 * path_is_absolute(); the new entry's destination is set to path.
 * NOTE(review): error returns, the assignment of the entry's options and
 * the disarming of the _cleanup_free_ locals are not present in this
 * view -- confirm. */
139 int tmpfs_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
) {
140 _cleanup_free_
char *path
= NULL
, *opts
= NULL
;
149 r
= extract_first_word(&p
, &path
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
156 opts
= strdup("mode=0755");
162 if (!path_is_absolute(path
))
165 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_TMPFS
);
169 m
->destination
= path
;
/* Builds an adjusted tmpfs mount option string.
 *
 * Visible behaviour:
 *  - when userns is set and uid_shift is non-zero (asserted to differ from
 *    UID_INVALID), "uid=<uid_shift>,gid=<uid_shift>" is formatted into buf
 *    with asprintf(), in one form appended to the existing options after a
 *    comma and in another form on its own; the asprintf() return value is
 *    discarded with (void), so success has to be judged from buf;
 *  - when selinux_apifs_context is set, a string carrying
 *    context="<selinux_apifs_context>" is built with strjoin(), again
 *    either appended to options or standing alone.
 *
 * Callers visible in this file pass an input option string first and a
 * char ** output last, and at least one of them treats a result > 0 as
 * "use the newly built string".
 * NOTE(review): the first and last parameter, the local declarations, the
 * conditions selecting between the paired forms and all return statements
 * are not present in this view -- confirm. */
176 static int tmpfs_patch_options(
178 bool userns
, uid_t uid_shift
, uid_t uid_range
,
179 const char *selinux_apifs_context
,
184 if (userns
&& uid_shift
!= 0) {
185 assert(uid_shift
!= UID_INVALID
);
188 (void) asprintf(&buf
, "%s,uid=" UID_FMT
",gid=" UID_FMT
, options
, uid_shift
, uid_shift
);
190 (void) asprintf(&buf
, "uid=" UID_FMT
",gid=" UID_FMT
, uid_shift
, uid_shift
);
198 if (selinux_apifs_context
) {
202 t
= strjoin(options
, ",context=\"", selinux_apifs_context
, "\"", NULL
);
204 t
= strjoin("context=\"", selinux_apifs_context
, "\"", NULL
);
/* Establishes the fixed set of API/tmpfs mounts listed in the static
 * mount_table below the directory dest.
 *
 * Only rows whose .userns member equals the userns argument are
 * considered. For each such row the visible code:
 *  - computes where = prefix_root(dest, row.where);
 *  - asks path_is_mount_point(where, AT_SYMLINK_FOLLOW), tolerating
 *    -ENOENT and returning any other error;
 *  - tests "row has a source (.what) and the target already is a mount
 *    point" (per the in-line comment this skips rows that are not
 *    remounts);
 *  - creates the target with mkdir_p(where, 0755);
 *  - for rows whose .type is "tmpfs" runs the row's options through
 *    tmpfs_patch_options();
 *  - calls mount().
 * A failing mkdir_p() or mount() is returned as an error when row.fatal is
 * set and otherwise only logged as a warning.
 *
 * The members read from a row are what, where, type, options, flags,
 * fatal and userns. NOTE(review): the struct member declarations are not
 * present in this view, so the mapping of initializer positions to members
 * is presumed to follow that order -- confirm. Several statements
 * (conditions, continue, the remaining mount() arguments, the final
 * return) are likewise missing from this view. */
219 int mount_all(const char *dest
,
220 bool userns
, uid_t uid_shift
, uid_t uid_range
,
221 const char *selinux_apifs_context
) {
223 typedef struct MountPoint
{
233 static const MountPoint mount_table
[] = {
234 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, true },
235 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
, true, true }, /* Bind mount first */
236 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, true, true }, /* Then, make it r/o */
237 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, false },
238 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, true, false },
239 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
240 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
241 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME
, true, false },
243 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
, false, false }, /* Bind mount first */
244 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, false, false }, /* Then, make it r/o */
251 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
252 _cleanup_free_
char *where
= NULL
, *options
= NULL
;
255 if (userns
!= mount_table
[k
].userns
)
258 where
= prefix_root(dest
, mount_table
[k
].where
);
262 r
= path_is_mount_point(where
, AT_SYMLINK_FOLLOW
);
263 if (r
< 0 && r
!= -ENOENT
)
264 return log_error_errno(r
, "Failed to detect whether %s is a mount point: %m", where
);
266 /* Skip this entry if it is not a remount. */
267 if (mount_table
[k
].what
&& r
> 0)
270 r
= mkdir_p(where
, 0755);
272 if (mount_table
[k
].fatal
)
273 return log_error_errno(r
, "Failed to create directory %s: %m", where
);
275 log_warning_errno(r
, "Failed to create directory %s: %m", where
);
279 o
= mount_table
[k
].options
;
280 if (streq_ptr(mount_table
[k
].type
, "tmpfs")) {
281 r
= tmpfs_patch_options(o
, userns
, uid_shift
, uid_range
, selinux_apifs_context
, &options
);
288 if (mount(mount_table
[k
].what
,
291 mount_table
[k
].flags
,
294 if (mount_table
[k
].fatal
)
295 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
297 log_warning_errno(errno
, "mount(%s) failed, ignoring: %m", where
);
/* Parses a comma-separated bind mount option string.
 *
 * The flag set starts as a copy of *mount_flags. Words are pulled from
 * options one at a time with extract_first_word(&p, &word, ",", 0); a
 * failure of that call is logged and returned. The words "rbind" and
 * "norbind" are recognised; any other word is logged as
 * "Invalid bind mount option". The resulting flag set is written back to
 * *mount_flags.
 * Per the in-line comment, mount_opts is reserved for future string
 * options and no write to it is visible here.
 * NOTE(review): the loop head, the flag changes made for "rbind" /
 * "norbind" and the return statements are not present in this view --
 * confirm. */
304 static int parse_mount_bind_options(const char *options
, unsigned long *mount_flags
, char **mount_opts
) {
305 const char *p
= options
;
306 unsigned long flags
= *mount_flags
;
312 _cleanup_free_
char *word
= NULL
;
313 int r
= extract_first_word(&p
, &word
, ",", 0);
315 return log_error_errno(r
, "Failed to extract mount option: %m");
319 if (streq(word
, "rbind"))
321 else if (streq(word
, "norbind"))
324 log_error("Invalid bind mount option: %s", word
);
329 *mount_flags
= flags
;
330 /* in the future mount_opts will hold string options for mount(2) */
/* Performs the bind mount described by the CustomMount m below dest.
 *
 * Visible steps:
 *  - mount flags start as MS_BIND|MS_REC and are adjusted by
 *    parse_mount_bind_options(m->options, ...);
 *  - m->source is stat()ed; failure is logged and returned;
 *  - where = prefix_roota(dest, m->destination);
 *  - if where can be stat()ed, a directory/non-directory mismatch between
 *    source and target is logged as an error; if stat() fails with ENOENT
 *    the parent directories are created with mkdir_parents_label(where,
 *    0755); another stat() failure is logged;
 *  - the mount point is created (mkdir_label() for directory sources; the
 *    branch for other sources is not visible), with -EEXIST tolerated;
 *  - mount(m->source, where, NULL, mount_flags, mount_opts);
 *  - bind_remount_recursive(where, true) makes the mount read-only
 *    (presumably only when m->read_only is set -- the guarding condition
 *    is not visible).
 * NOTE(review): several conditions and return statements, and the end of
 * the "Create the mount point" comment, are not present in this view --
 * confirm against the complete file. */
336 static int mount_bind(const char *dest
, CustomMount
*m
) {
337 struct stat source_st
, dest_st
;
339 unsigned long mount_flags
= MS_BIND
| MS_REC
;
340 _cleanup_free_
char *mount_opts
= NULL
;
346 r
= parse_mount_bind_options(m
->options
, &mount_flags
, &mount_opts
);
351 if (stat(m
->source
, &source_st
) < 0)
352 return log_error_errno(errno
, "Failed to stat %s: %m", m
->source
);
354 where
= prefix_roota(dest
, m
->destination
);
356 if (stat(where
, &dest_st
) >= 0) {
357 if (S_ISDIR(source_st
.st_mode
) && !S_ISDIR(dest_st
.st_mode
)) {
358 log_error("Cannot bind mount directory %s on file %s.", m
->source
, where
);
362 if (!S_ISDIR(source_st
.st_mode
) && S_ISDIR(dest_st
.st_mode
)) {
363 log_error("Cannot bind mount file %s on directory %s.", m
->source
, where
);
367 } else if (errno
== ENOENT
) {
368 r
= mkdir_parents_label(where
, 0755);
370 return log_error_errno(r
, "Failed to make parents of %s: %m", where
);
372 log_error_errno(errno
, "Failed to stat %s: %m", where
);
376 /* Create the mount point. Any non-directory file can be
377 * mounted on any non-directory file (regular, fifo, socket,
380 if (S_ISDIR(source_st
.st_mode
))
381 r
= mkdir_label(where
, 0755);
384 if (r
< 0 && r
!= -EEXIST
)
385 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
387 if (mount(m
->source
, where
, NULL
, mount_flags
, mount_opts
) < 0)
388 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
391 r
= bind_remount_recursive(where
, true);
393 return log_error_errno(r
, "Read-only bind mount failed: %m");
/* Mounts a tmpfs for the CustomMount m below dest.
 *
 * Visible steps:
 *  - where = prefix_roota(dest, m->destination);
 *  - the mount point is created with mkdir_p_label(where, 0755), with
 *    -EEXIST tolerated and any other error returned;
 *  - m->options is passed through tmpfs_patch_options(); when that returns
 *    a value > 0 the patched string in buf is used, otherwise m->options
 *    is used unchanged;
 *  - mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
 *    failure is logged and returned.
 * buf is a _cleanup_free_ local, so the patched string is released when
 * the function returns.
 * NOTE(review): the first two parameters (dest and m are used but their
 * declarations are not visible), the check of the tmpfs_patch_options()
 * error case and the final return are not present in this view. */
399 static int mount_tmpfs(
402 bool userns
, uid_t uid_shift
, uid_t uid_range
,
403 const char *selinux_apifs_context
) {
405 const char *where
, *options
;
406 _cleanup_free_
char *buf
= NULL
;
412 where
= prefix_roota(dest
, m
->destination
);
414 r
= mkdir_p_label(where
, 0755);
415 if (r
< 0 && r
!= -EEXIST
)
416 return log_error_errno(r
, "Creating mount point for tmpfs %s failed: %m", where
);
418 r
= tmpfs_patch_options(m
->options
, userns
, uid_shift
, uid_range
, selinux_apifs_context
, &buf
);
421 options
= r
> 0 ? buf
: m
->options
;
423 if (mount("tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, options
) < 0)
424 return log_error_errno(errno
, "tmpfs mount to %s failed: %m", where
);
/* Turns the string vector lower into a single string: the vector is
 * duplicated with strv_copy(), the copy is escaped via
 * strv_shell_escape() with the character set ",:" and the entries are
 * then joined with ":" by strv_join(), whose result is returned.
 * The working copy sv is a _cleanup_strv_free_ local and is released on
 * return.
 * NOTE(review): the statements executed when strv_copy() or
 * strv_shell_escape() fail are not present in this view -- confirm. */
429 static char *joined_and_escaped_lower_dirs(char * const *lower
) {
430 _cleanup_strv_free_
char **sv
= NULL
;
432 sv
= strv_copy(lower
);
438 if (!strv_shell_escape(sv
, ",:"))
441 return strv_join(sv
, ":");
/* Mounts the overlay file system described by the CustomMount m below
 * dest.
 *
 * Visible steps:
 *  - where = prefix_roota(dest, m->destination); the mount point is
 *    created with mkdir_label(where, 0755), -EEXIST tolerated;
 *  - m->source is created on a best-effort basis with mkdir_p_label()
 *    (result discarded);
 *  - lower = joined_and_escaped_lower_dirs(m->lower);
 *  - one of two option strings is assembled with strjoina():
 *      "lowerdir=<escaped source>:<lower>", or
 *      "lowerdir=<lower>,upperdir=<escaped source>,workdir=<escaped work_dir>"
 *    where the escaping uses shell_escape(..., ",:") and, in the second
 *    form, m->work_dir is created best-effort with mkdir_label(..., 0700);
 *  - mount("overlay", where, "overlay", MS_RDONLY if m->read_only else 0,
 *    options); failure is logged and returned.
 * NOTE(review): the condition choosing between the two option forms
 * (presumably read-only vs. writable) and the allocation-failure returns
 * are not present in this view -- confirm. */
444 static int mount_overlay(const char *dest
, CustomMount
*m
) {
445 _cleanup_free_
char *lower
= NULL
;
446 const char *where
, *options
;
452 where
= prefix_roota(dest
, m
->destination
);
454 r
= mkdir_label(where
, 0755);
455 if (r
< 0 && r
!= -EEXIST
)
456 return log_error_errno(r
, "Creating mount point for overlay %s failed: %m", where
);
458 (void) mkdir_p_label(m
->source
, 0755);
460 lower
= joined_and_escaped_lower_dirs(m
->lower
);
465 _cleanup_free_
char *escaped_source
= NULL
;
467 escaped_source
= shell_escape(m
->source
, ",:");
471 options
= strjoina("lowerdir=", escaped_source
, ":", lower
);
473 _cleanup_free_
char *escaped_source
= NULL
, *escaped_work_dir
= NULL
;
476 (void) mkdir_label(m
->work_dir
, 0700);
478 escaped_source
= shell_escape(m
->source
, ",:");
481 escaped_work_dir
= shell_escape(m
->work_dir
, ",:");
482 if (!escaped_work_dir
)
485 options
= strjoina("lowerdir=", lower
, ",upperdir=", escaped_source
, ",workdir=", escaped_work_dir
);
488 if (mount("overlay", where
, "overlay", m
->read_only
? MS_RDONLY
: 0, options
) < 0)
489 return log_error_errno(errno
, "overlay mount to %s failed: %m", where
);
496 CustomMount
*mounts
, unsigned n
,
497 bool userns
, uid_t uid_shift
, uid_t uid_range
,
498 const char *selinux_apifs_context
) {
505 for (i
= 0; i
< n
; i
++) {
506 CustomMount
*m
= mounts
+ i
;
510 case CUSTOM_MOUNT_BIND
:
511 r
= mount_bind(dest
, m
);
514 case CUSTOM_MOUNT_TMPFS
:
515 r
= mount_tmpfs(dest
, m
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
518 case CUSTOM_MOUNT_OVERLAY
:
519 r
= mount_overlay(dest
, m
);
523 assert_not_reached("Unknown custom mount type");
/* Mounts one legacy cgroup hierarchy below dest.
 *
 * The target is to = dest + "/sys/fs/cgroup/" + hierarchy. Visible steps:
 *  - path_is_mount_point(to, 0) is queried, -ENOENT tolerated, any other
 *    error logged and returned;
 *  - a "cgroup" file system is mounted on to with
 *    MS_NOSUID|MS_NOEXEC|MS_NODEV and controller as the data string;
 *  - the mount is then remounted as a read-only bind mount
 *    (MS_BIND|MS_REMOUNT|...|MS_RDONLY).
 * NOTE(review): the handling of "already mounted", the creation of the
 * mount point, the condition guarding the read-only remount (presumably
 * the read_only argument) and the final return are not present in this
 * view -- confirm. */
533 static int mount_legacy_cgroup_hierarchy(const char *dest
, const char *controller
, const char *hierarchy
, bool read_only
) {
537 to
= strjoina(dest
, "/sys/fs/cgroup/", hierarchy
);
539 r
= path_is_mount_point(to
, 0);
540 if (r
< 0 && r
!= -ENOENT
)
541 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
547 /* The superblock mount options of the mount point need to be
548 * identical to the host's, and hence writable... */
549 if (mount("cgroup", to
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, controller
) < 0)
550 return log_error_errno(errno
, "Failed to mount to %s: %m", to
);
552 /* ... hence let's only make the bind mount read-only, not the
555 if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
556 return log_error_errno(errno
, "Failed to remount %s read-only: %m", to
);
/* Sets up the legacy (per-controller) cgroup tree below dest.
 *
 * Visible steps:
 *  - cgroup_root = prefix_roota(dest, "/sys/fs/cgroup"); if it is not a
 *    mount point yet (per the in-line comment) a tmpfs is mounted there
 *    with options derived from "mode=755" via tmpfs_patch_options();
 *  - when cg_unified() > 0 the per-controller work is skipped via
 *    goto skip_controllers;
 *  - otherwise the kernel controllers are collected into a Set with
 *    cg_kernel_controllers() and taken out one by one with
 *    set_steal_first();
 *  - for each controller origin = "/sys/fs/cgroup/<controller>" on the
 *    host is examined with readlink_malloc(): if it is not a symlink the
 *    controller is mounted as its own hierarchy; if it is a symlink, its
 *    target names a combined hierarchy, which is validated with
 *    filename_is_valid(), mounted, and mirrored below dest with
 *    symlink_idempotent();
 *  - the "systemd" named hierarchy is mounted with
 *    "none,name=systemd,xattr" and read_only = false;
 *  - finally cgroup_root is remounted read-only.
 * The controller hierarchies are mounted with read_only = true.
 * NOTE(review): the first parameter (dest is used but its declaration is
 * not visible), the conditions around each error return, the loop head,
 * the skip_controllers label and the final return are not present in this
 * view -- confirm. */
561 static int mount_legacy_cgroups(
563 bool userns
, uid_t uid_shift
, uid_t uid_range
,
564 const char *selinux_apifs_context
) {
566 _cleanup_set_free_free_ Set
*controllers
= NULL
;
567 const char *cgroup_root
;
570 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
572 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
573 r
= path_is_mount_point(cgroup_root
, AT_SYMLINK_FOLLOW
);
575 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
577 _cleanup_free_
char *options
= NULL
;
579 r
= tmpfs_patch_options("mode=755", userns
, uid_shift
, uid_range
, selinux_apifs_context
, &options
);
583 if (mount("tmpfs", cgroup_root
, "tmpfs", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
) < 0)
584 return log_error_errno(errno
, "Failed to mount /sys/fs/cgroup: %m");
587 if (cg_unified() > 0)
588 goto skip_controllers
;
590 controllers
= set_new(&string_hash_ops
);
594 r
= cg_kernel_controllers(controllers
);
596 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
599 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
601 controller
= set_steal_first(controllers
);
605 origin
= prefix_root("/sys/fs/cgroup/", controller
);
609 r
= readlink_malloc(origin
, &combined
);
611 /* Not a symbolic link, but directly a single cgroup hierarchy */
613 r
= mount_legacy_cgroup_hierarchy(dest
, controller
, controller
, true);
618 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
620 _cleanup_free_
char *target
= NULL
;
622 target
= prefix_root(dest
, origin
);
626 /* A symbolic link, a combination of controllers in one hierarchy */
628 if (!filename_is_valid(combined
)) {
629 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
633 r
= mount_legacy_cgroup_hierarchy(dest
, combined
, combined
, true);
637 r
= symlink_idempotent(combined
, target
);
639 log_error("Invalid existing symlink for combined hierarchy");
643 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
648 r
= mount_legacy_cgroup_hierarchy(dest
, "none,name=systemd,xattr", "systemd", false);
652 if (mount(NULL
, cgroup_root
, NULL
, MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755") < 0)
653 return log_error_errno(errno
, "Failed to remount %s read-only: %m", cgroup_root
);
/* Mounts the unified cgroup hierarchy at <dest>/sys/fs/cgroup.
 *
 * Visible steps:
 *  - path_is_mount_point() is queried for <dest>/sys/fs/cgroup; an error
 *    is logged and returned;
 *  - <dest>/sys/fs/cgroup/cgroup.procs is probed with access(F_OK): a
 *    successful probe is treated as one outcome, an access() error is
 *    logged and returned, and the remaining case is rejected with
 *    "... is already mounted but not a unified cgroup hierarchy.
 *    Refusing.";
 *  - a "cgroup" file system is mounted with MS_NOSUID|MS_NOEXEC|MS_NODEV
 *    and the data string "__DEVEL__sane_behavior".
 * NOTE(review): the conditions separating these cases, the point at which
 * p is reset to the mount directory before mount(), and the return values
 * are not present in this view -- confirm. */
658 static int mount_unified_cgroups(const char *dest
) {
664 p
= strjoina(dest
, "/sys/fs/cgroup");
666 r
= path_is_mount_point(p
, AT_SYMLINK_FOLLOW
);
668 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", p
);
670 p
= strjoina(dest
, "/sys/fs/cgroup/cgroup.procs");
671 if (access(p
, F_OK
) >= 0)
674 return log_error_errno(errno
, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p
);
676 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p
);
680 if (mount("cgroup", p
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "__DEVEL__sane_behavior") < 0)
681 return log_error_errno(errno
, "Failed to mount unified cgroup hierarchy to %s: %m", p
);
688 bool unified_requested
,
689 bool userns
, uid_t uid_shift
, uid_t uid_range
,
690 const char *selinux_apifs_context
) {
692 if (unified_requested
)
693 return mount_unified_cgroups(dest
);
695 return mount_legacy_cgroups(dest
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
/* Makes the caller's own cgroup writable below dest while the surrounding
 * systemd cgroup root is remounted read-only.
 *
 * Visible steps:
 *  - the own cgroup path is obtained with cg_pid_get_path(NULL, 0, ...);
 *    failure is logged and returned;
 *  - if that path equals "/", nothing is done (per the in-line comment);
 *  - with unified_requested the paths are rooted at
 *    <dest>/sys/fs/cgroup, otherwise at <dest>/sys/fs/cgroup/systemd;
 *    systemd_own is that root followed by the own cgroup path;
 *  - systemd_own is bind-mounted onto itself (MS_BIND);
 *  - systemd_root is remounted with
 *    MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY.
 * NOTE(review): the "Failed to turn %s into a bind mount" message prints
 * own_cgroup_path, not the full systemd_own path that was passed to
 * mount(). The first parameter (dest) and the return statements are not
 * present in this view -- confirm. */
698 int mount_systemd_cgroup_writable(
700 bool unified_requested
) {
702 _cleanup_free_
char *own_cgroup_path
= NULL
;
703 const char *systemd_root
, *systemd_own
;
708 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
710 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
712 /* If we are living in the top-level, then there's nothing to do... */
713 if (path_equal(own_cgroup_path
, "/"))
716 if (unified_requested
) {
717 systemd_own
= strjoina(dest
, "/sys/fs/cgroup", own_cgroup_path
);
718 systemd_root
= prefix_roota(dest
, "/sys/fs/cgroup");
720 systemd_own
= strjoina(dest
, "/sys/fs/cgroup/systemd", own_cgroup_path
);
721 systemd_root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
724 /* Make our own cgroup a (writable) bind mount */
725 if (mount(systemd_own
, systemd_own
, NULL
, MS_BIND
, NULL
) < 0)
726 return log_error_errno(errno
, "Failed to turn %s into a bind mount: %m", own_cgroup_path
);
728 /* And then remount the systemd cgroup root read-only */
729 if (mount(NULL
, systemd_root
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
730 return log_error_errno(errno
, "Failed to mount cgroup root read-only: %m");
/* Implements the VOLATILE_STATE mode for the tree at directory: per the
 * in-line comment, /var is over-mounted with a tmpfs and everything else
 * is made read-only.
 *
 * Visible steps:
 *  - any mode other than VOLATILE_STATE is handled by an early exit (the
 *    statement itself is not visible);
 *  - bind_remount_recursive(directory, true) remounts the tree read-only;
 *  - p = prefix_roota(directory, "/var"); a creation step follows whose
 *    failure is tolerated when errno is EEXIST;
 *  - the option string starts as "mode=755" and is passed through
 *    tmpfs_patch_options() with buf as the output;
 *  - mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options).
 * NOTE(review): the "Failed to create %s" message prints directory; the
 * creation call is not visible here, but if it creates p (the /var path)
 * the message names the wrong path -- confirm. The mode parameter
 * declaration, the selection between options and buf and the final return
 * are also not present in this view. */
735 int setup_volatile_state(
736 const char *directory
,
738 bool userns
, uid_t uid_shift
, uid_t uid_range
,
739 const char *selinux_apifs_context
) {
741 _cleanup_free_
char *buf
= NULL
;
742 const char *p
, *options
;
747 if (mode
!= VOLATILE_STATE
)
750 /* --volatile=state means we simply overmount /var
751 with a tmpfs, and the rest read-only. */
753 r
= bind_remount_recursive(directory
, true);
755 return log_error_errno(r
, "Failed to remount %s read-only: %m", directory
);
757 p
= prefix_roota(directory
, "/var");
759 if (r
< 0 && errno
!= EEXIST
)
760 return log_error_errno(errno
, "Failed to create %s: %m", directory
);
762 options
= "mode=755";
763 r
= tmpfs_patch_options(options
, userns
, uid_shift
, uid_range
, selinux_apifs_context
, &buf
);
769 if (mount("tmpfs", p
, "tmpfs", MS_STRICTATIME
, options
) < 0)
770 return log_error_errno(errno
, "Failed to mount tmpfs to /var: %m");
776 const char *directory
,
778 bool userns
, uid_t uid_shift
, uid_t uid_range
,
779 const char *selinux_apifs_context
) {
781 bool tmpfs_mounted
= false, bind_mounted
= false;
782 char template[] = "/tmp/nspawn-volatile-XXXXXX";
783 _cleanup_free_
char *buf
= NULL
;
784 const char *f
, *t
, *options
;
789 if (mode
!= VOLATILE_YES
)
792 /* --volatile=yes means we mount a tmpfs to the root dir, and
793 the original /usr to use inside it, and that read-only. */
795 if (!mkdtemp(template))
796 return log_error_errno(errno
, "Failed to create temporary directory: %m");
798 options
= "mode=755";
799 r
= tmpfs_patch_options(options
, userns
, uid_shift
, uid_range
, selinux_apifs_context
, &buf
);
805 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME
, options
) < 0) {
806 r
= log_error_errno(errno
, "Failed to mount tmpfs for root directory: %m");
810 tmpfs_mounted
= true;
812 f
= prefix_roota(directory
, "/usr");
813 t
= prefix_roota(template, "/usr");
816 if (r
< 0 && errno
!= EEXIST
) {
817 r
= log_error_errno(errno
, "Failed to create %s: %m", t
);
821 if (mount(f
, t
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0) {
822 r
= log_error_errno(errno
, "Failed to create /usr bind mount: %m");
828 r
= bind_remount_recursive(t
, true);
830 log_error_errno(r
, "Failed to remount %s read-only: %m", t
);
834 if (mount(template, directory
, NULL
, MS_MOVE
, NULL
) < 0) {
835 r
= log_error_errno(errno
, "Failed to move root mount: %m");
839 (void) rmdir(template);
848 (void) umount(template);
849 (void) rmdir(template);
/* Converts the string s into a VolatileMode value.
 *
 * Visible behaviour: an early path returns _VOLATILE_MODE_INVALID
 * (presumably for empty input -- the condition is not visible); s is then
 * tried as a boolean with parse_boolean(); the literal "state" maps to
 * VOLATILE_STATE; anything else yields _VOLATILE_MODE_INVALID.
 * NOTE(review): the mapping of the parse_boolean() result to enum values
 * is not present in this view -- confirm. */
853 VolatileMode
volatile_mode_from_string(const char *s
) {
857 return _VOLATILE_MODE_INVALID
;
859 b
= parse_boolean(s
);
865 if (streq(s
, "state"))
866 return VOLATILE_STATE
;
868 return _VOLATILE_MODE_INVALID
;