1 /* SPDX-License-Identifier: LGPL-2.1+ */
4 #include <linux/magic.h>
6 #include "alloc-util.h"
13 #include "mount-util.h"
14 #include "nspawn-mount.h"
15 #include "parse-util.h"
16 #include "path-util.h"
19 #include "stat-util.h"
20 #include "string-util.h"
22 #include "user-util.h"
25 CustomMount
* custom_mount_add(CustomMount
**l
, size_t *n
, CustomMountType t
) {
31 assert(t
< _CUSTOM_MOUNT_TYPE_MAX
);
33 c
= reallocarray(*l
, *n
+ 1, sizeof(CustomMount
));
41 *ret
= (CustomMount
) { .type
= t
};
46 void custom_mount_free_all(CustomMount
*l
, size_t n
) {
49 for (i
= 0; i
< n
; i
++) {
50 CustomMount
*m
= l
+ i
;
57 (void) rm_rf(m
->work_dir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
61 if (m
->rm_rf_tmpdir
) {
62 (void) rm_rf(m
->rm_rf_tmpdir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
63 free(m
->rm_rf_tmpdir
);
72 static int custom_mount_compare(const void *a
, const void *b
) {
73 const CustomMount
*x
= a
, *y
= b
;
76 r
= path_compare(x
->destination
, y
->destination
);
80 if (x
->type
< y
->type
)
82 if (x
->type
> y
->type
)
88 static bool source_path_is_valid(const char *p
) {
94 return path_is_absolute(p
);
97 static char *resolve_source_path(const char *dest
, const char *source
) {
102 if (source
[0] == '+')
103 return prefix_root(dest
, source
+ 1);
105 return strdup(source
);
108 int custom_mount_prepare_all(const char *dest
, CustomMount
*l
, size_t n
) {
112 /* Prepare all custom mounts. This will make sure we know all temporary directories. This is called in the
113 * parent process, so that we know the temporary directories to remove on exit before we fork off the
118 /* Order the custom mounts, and make sure we have a working directory */
119 qsort_safe(l
, n
, sizeof(CustomMount
), custom_mount_compare
);
121 for (i
= 0; i
< n
; i
++) {
122 CustomMount
*m
= l
+ i
;
127 s
= resolve_source_path(dest
, m
->source
);
131 free_and_replace(m
->source
, s
);
133 /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
135 m
->rm_rf_tmpdir
= strdup("/var/tmp/nspawn-temp-XXXXXX");
136 if (!m
->rm_rf_tmpdir
)
139 if (!mkdtemp(m
->rm_rf_tmpdir
)) {
140 m
->rm_rf_tmpdir
= mfree(m
->rm_rf_tmpdir
);
141 return log_error_errno(errno
, "Failed to acquire temporary directory: %m");
144 m
->source
= strjoin(m
->rm_rf_tmpdir
, "/src");
148 if (mkdir(m
->source
, 0755) < 0)
149 return log_error_errno(errno
, "Failed to create %s: %m", m
->source
);
152 if (m
->type
== CUSTOM_MOUNT_OVERLAY
) {
155 STRV_FOREACH(j
, m
->lower
) {
158 s
= resolve_source_path(dest
, *j
);
162 free_and_replace(*j
, s
);
168 s
= resolve_source_path(dest
, m
->work_dir
);
172 free_and_replace(m
->work_dir
, s
);
176 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
178 return log_error_errno(r
, "Failed to acquire working directory: %m");
181 (void) mkdir_label(m
->work_dir
, 0700);
188 int bind_mount_parse(CustomMount
**l
, size_t *n
, const char *s
, bool read_only
) {
189 _cleanup_free_
char *source
= NULL
, *destination
= NULL
, *opts
= NULL
;
197 r
= extract_many_words(&p
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
, &source
, &destination
, NULL
);
203 destination
= strdup(source
[0] == '+' ? source
+1 : source
);
207 if (r
== 2 && !isempty(p
)) {
215 else if (!source_path_is_valid(source
))
218 if (!path_is_absolute(destination
))
221 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_BIND
);
226 m
->destination
= destination
;
227 m
->read_only
= read_only
;
230 source
= destination
= opts
= NULL
;
234 int tmpfs_mount_parse(CustomMount
**l
, size_t *n
, const char *s
) {
235 _cleanup_free_
char *path
= NULL
, *opts
= NULL
;
244 r
= extract_first_word(&p
, &path
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
251 opts
= strdup("mode=0755");
257 if (!path_is_absolute(path
))
260 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_TMPFS
);
264 m
->destination
= TAKE_PTR(path
);
265 m
->options
= TAKE_PTR(opts
);
270 int overlay_mount_parse(CustomMount
**l
, size_t *n
, const char *s
, bool read_only
) {
271 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
272 _cleanup_strv_free_
char **lower
= NULL
;
276 k
= strv_split_extract(&lower
, s
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
280 return -EADDRNOTAVAIL
;
282 /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
283 * we'll also define the destination mount point the same as the upper. */
285 if (!source_path_is_valid(lower
[0]) ||
286 !source_path_is_valid(lower
[1]))
289 upper
= TAKE_PTR(lower
[1]);
291 destination
= strdup(upper
[0] == '+' ? upper
+1 : upper
); /* take the destination without "+" prefix */
297 /* If more than two parameters are specified, the last one is the destination, the second to last one
298 * the "upper", and all before that the "lower" directories. */
300 destination
= lower
[k
- 1];
301 upper
= TAKE_PTR(lower
[k
- 2]);
303 STRV_FOREACH(i
, lower
)
304 if (!source_path_is_valid(*i
))
307 /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
311 else if (!source_path_is_valid(upper
))
314 if (!path_is_absolute(destination
))
318 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_OVERLAY
);
322 m
->destination
= TAKE_PTR(destination
);
323 m
->source
= TAKE_PTR(upper
);
324 m
->lower
= TAKE_PTR(lower
);
325 m
->read_only
= read_only
;
330 static int tmpfs_patch_options(
333 uid_t uid_shift
, uid_t uid_range
,
335 const char *selinux_apifs_context
,
340 if ((userns
&& uid_shift
!= 0) || patch_ids
) {
341 assert(uid_shift
!= UID_INVALID
);
343 if (asprintf(&buf
, "%s%suid=" UID_FMT
",gid=" UID_FMT
,
344 strempty(options
), options
? "," : "",
345 uid_shift
, uid_shift
) < 0)
352 if (selinux_apifs_context
) {
355 t
= strjoin(strempty(options
), options
? "," : "",
356 "context=\"", selinux_apifs_context
, "\"");
365 if (!buf
&& options
) {
366 buf
= strdup(options
);
375 int mount_sysfs(const char *dest
, MountSettingsMask mount_settings
) {
376 const char *full
, *top
, *x
;
378 unsigned long extra_flags
= 0;
380 top
= prefix_roota(dest
, "/sys");
381 r
= path_is_fs_type(top
, SYSFS_MAGIC
);
383 return log_error_errno(r
, "Failed to determine filesystem type of %s: %m", top
);
384 /* /sys might already be mounted as sysfs by the outer child in the
385 * !netns case. In this case, it's all good. Don't touch it because we
386 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
391 full
= prefix_roota(top
, "/full");
393 (void) mkdir(full
, 0755);
395 if (mount_settings
& MOUNT_APPLY_APIVFS_RO
)
396 extra_flags
|= MS_RDONLY
;
398 r
= mount_verbose(LOG_ERR
, "sysfs", full
, "sysfs",
399 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|extra_flags
, NULL
);
403 FOREACH_STRING(x
, "block", "bus", "class", "dev", "devices", "kernel") {
404 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
406 from
= prefix_root(full
, x
);
410 to
= prefix_root(top
, x
);
414 (void) mkdir(to
, 0755);
416 r
= mount_verbose(LOG_ERR
, from
, to
, NULL
, MS_BIND
, NULL
);
420 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
421 MS_BIND
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
|extra_flags
, NULL
);
426 r
= umount_verbose(full
);
431 return log_error_errno(errno
, "Failed to remove %s: %m", full
);
433 /* Create mountpoint for cgroups. Otherwise we are not allowed to create it later, since we
434 * remount /sys read-only.
436 if (cg_ns_supported()) {
437 x
= prefix_roota(top
, "/fs/cgroup");
438 (void) mkdir_p(x
, 0755);
441 return mount_verbose(LOG_ERR
, NULL
, top
, NULL
,
442 MS_BIND
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
|extra_flags
, NULL
);
445 static int mkdir_userns(const char *path
, mode_t mode
, MountSettingsMask mask
, uid_t uid_shift
) {
450 r
= mkdir_errno_wrapper(path
, mode
);
451 if (r
< 0 && r
!= -EEXIST
)
454 if ((mask
& MOUNT_USE_USERNS
) == 0)
457 if (mask
& MOUNT_IN_USERNS
)
460 if (lchown(path
, uid_shift
, uid_shift
) < 0)
466 static int mkdir_userns_p(const char *prefix
, const char *path
, mode_t mode
, MountSettingsMask mask
, uid_t uid_shift
) {
472 if (prefix
&& !path_startswith(path
, prefix
))
475 /* create every parent directory in the path, except the last component */
476 p
= path
+ strspn(path
, "/");
478 char t
[strlen(path
) + 1];
480 e
= p
+ strcspn(p
, "/");
481 p
= e
+ strspn(e
, "/");
483 /* Is this the last component? If so, then we're done */
487 memcpy(t
, path
, e
- path
);
490 if (prefix
&& path_startswith(prefix
, t
))
493 r
= mkdir_userns(t
, mode
, mask
, uid_shift
);
498 return mkdir_userns(path
, mode
, mask
, uid_shift
);
501 int mount_all(const char *dest
,
502 MountSettingsMask mount_settings
,
503 uid_t uid_shift
, uid_t uid_range
,
504 const char *selinux_apifs_context
) {
506 #define PROC_INACCESSIBLE(path) \
507 { NULL, (path), NULL, NULL, MS_BIND, \
508 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_INACCESSIBLE_REG }, /* Bind mount first ... */ \
509 { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
510 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
512 #define PROC_READ_ONLY(path) \
513 { (path), (path), NULL, NULL, MS_BIND, \
514 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
515 { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
516 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
518 typedef struct MountPoint
{
524 MountSettingsMask mount_settings
;
527 static const MountPoint mount_table
[] = {
528 /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
529 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
530 MOUNT_FATAL
|MOUNT_IN_USERNS
},
532 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
,
533 MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* Bind mount first ... */
535 { "/proc/sys/net", "/proc/sys/net", NULL
, NULL
, MS_BIND
,
536 MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
|MOUNT_APPLY_APIVFS_NETNS
}, /* (except for this) */
538 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
,
539 MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* ... then, make it r/o */
541 /* Make these files inaccessible to container payloads: they potentially leak information about kernel
542 * internals or the host's execution environment to the container */
543 PROC_INACCESSIBLE("/proc/kallsyms"),
544 PROC_INACCESSIBLE("/proc/kcore"),
545 PROC_INACCESSIBLE("/proc/keys"),
546 PROC_INACCESSIBLE("/proc/sysrq-trigger"),
547 PROC_INACCESSIBLE("/proc/timer_list"),
549 /* Make these directories read-only to container payloads: they show hardware information, and in some
550 * cases contain tunables the container really shouldn't have access to. */
551 PROC_READ_ONLY("/proc/acpi"),
552 PROC_READ_ONLY("/proc/apm"),
553 PROC_READ_ONLY("/proc/asound"),
554 PROC_READ_ONLY("/proc/bus"),
555 PROC_READ_ONLY("/proc/fs"),
556 PROC_READ_ONLY("/proc/irq"),
557 PROC_READ_ONLY("/proc/scsi"),
559 /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
560 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
,
562 { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
563 MOUNT_FATAL
|MOUNT_APPLY_APIVFS_NETNS
},
564 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
565 MOUNT_FATAL
|MOUNT_APPLY_APIVFS_RO
}, /* skipped if above was mounted */
566 { "sysfs", "/sys", "sysfs", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
567 MOUNT_FATAL
}, /* skipped if above was mounted */
568 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
,
570 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
,
572 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
,
576 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
,
577 0 }, /* Bind mount first */
578 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
,
579 0 }, /* Then, make it r/o */
583 _cleanup_(unlink_and_freep
) char *inaccessible
= NULL
;
584 bool use_userns
= (mount_settings
& MOUNT_USE_USERNS
);
585 bool netns
= (mount_settings
& MOUNT_APPLY_APIVFS_NETNS
);
586 bool ro
= (mount_settings
& MOUNT_APPLY_APIVFS_RO
);
587 bool in_userns
= (mount_settings
& MOUNT_IN_USERNS
);
591 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
592 _cleanup_free_
char *where
= NULL
, *options
= NULL
;
593 const char *o
, *what
;
594 bool fatal
= (mount_table
[k
].mount_settings
& MOUNT_FATAL
);
596 if (in_userns
!= (bool)(mount_table
[k
].mount_settings
& MOUNT_IN_USERNS
))
599 if (!netns
&& (bool)(mount_table
[k
].mount_settings
& MOUNT_APPLY_APIVFS_NETNS
))
602 if (!ro
&& (bool)(mount_table
[k
].mount_settings
& MOUNT_APPLY_APIVFS_RO
))
605 r
= chase_symlinks(mount_table
[k
].where
, dest
, CHASE_NONEXISTENT
|CHASE_PREFIX_ROOT
, &where
);
607 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, mount_table
[k
].where
);
609 if (mount_table
[k
].mount_settings
& MOUNT_INACCESSIBLE_REG
) {
612 _cleanup_free_
char *np
= NULL
;
614 r
= tempfn_random_child(NULL
, "inaccessible", &np
);
616 return log_error_errno(r
, "Failed to generate inaccessible file node path: %m");
618 r
= touch_file(np
, false, USEC_INFINITY
, UID_INVALID
, GID_INVALID
, 0000);
620 return log_error_errno(r
, "Failed to create inaccessible file node '%s': %m", np
);
622 inaccessible
= TAKE_PTR(np
);
627 what
= mount_table
[k
].what
;
629 r
= path_is_mount_point(where
, NULL
, 0);
630 if (r
< 0 && r
!= -ENOENT
)
631 return log_error_errno(r
, "Failed to detect whether %s is a mount point: %m", where
);
633 /* Skip this entry if it is not a remount. */
637 r
= mkdir_userns_p(dest
, where
, 0755, mount_settings
, uid_shift
);
638 if (r
< 0 && r
!= -EEXIST
) {
639 if (fatal
&& r
!= -EROFS
)
640 return log_error_errno(r
, "Failed to create directory %s: %m", where
);
642 log_debug_errno(r
, "Failed to create directory %s: %m", where
);
643 /* If we failed mkdir() or chown() due to the root
644 * directory being read-only, attempt to mount this fs
645 * anyway and let mount_verbose log any errors */
650 o
= mount_table
[k
].options
;
651 if (streq_ptr(mount_table
[k
].type
, "tmpfs")) {
653 r
= tmpfs_patch_options(o
, use_userns
, 0, uid_range
, true, selinux_apifs_context
, &options
);
655 r
= tmpfs_patch_options(o
, use_userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &options
);
662 r
= mount_verbose(fatal
? LOG_ERR
: LOG_DEBUG
,
666 mount_table
[k
].flags
,
675 static int mount_bind(const char *dest
, CustomMount
*m
) {
677 _cleanup_free_
char *where
= NULL
;
678 struct stat source_st
, dest_st
;
684 if (stat(m
->source
, &source_st
) < 0)
685 return log_error_errno(errno
, "Failed to stat %s: %m", m
->source
);
687 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
);
689 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
690 if (r
> 0) { /* Path exists already? */
692 if (stat(where
, &dest_st
) < 0)
693 return log_error_errno(errno
, "Failed to stat %s: %m", where
);
695 if (S_ISDIR(source_st
.st_mode
) && !S_ISDIR(dest_st
.st_mode
)) {
696 log_error("Cannot bind mount directory %s on file %s.", m
->source
, where
);
700 if (!S_ISDIR(source_st
.st_mode
) && S_ISDIR(dest_st
.st_mode
)) {
701 log_error("Cannot bind mount file %s on directory %s.", m
->source
, where
);
705 } else { /* Path doesn't exist yet? */
706 r
= mkdir_parents_label(where
, 0755);
708 return log_error_errno(r
, "Failed to make parents of %s: %m", where
);
710 /* Create the mount point. Any non-directory file can be
711 * mounted on any non-directory file (regular, fifo, socket,
714 if (S_ISDIR(source_st
.st_mode
))
715 r
= mkdir_label(where
, 0755);
719 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
723 r
= mount_verbose(LOG_ERR
, m
->source
, where
, NULL
, MS_BIND
| MS_REC
, m
->options
);
728 r
= bind_remount_recursive(where
, true, NULL
);
730 return log_error_errno(r
, "Read-only bind mount failed: %m");
736 static int mount_tmpfs(
739 bool userns
, uid_t uid_shift
, uid_t uid_range
,
740 const char *selinux_apifs_context
) {
743 _cleanup_free_
char *buf
= NULL
, *where
= NULL
;
749 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
);
751 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
752 if (r
== 0) { /* Doesn't exist yet? */
753 r
= mkdir_p_label(where
, 0755);
755 return log_error_errno(r
, "Creating mount point for tmpfs %s failed: %m", where
);
758 r
= tmpfs_patch_options(m
->options
, userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &buf
);
761 options
= r
> 0 ? buf
: m
->options
;
763 return mount_verbose(LOG_ERR
, "tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, options
);
766 static char *joined_and_escaped_lower_dirs(char **lower
) {
767 _cleanup_strv_free_
char **sv
= NULL
;
769 sv
= strv_copy(lower
);
775 if (!strv_shell_escape(sv
, ",:"))
778 return strv_join(sv
, ":");
781 static int mount_overlay(const char *dest
, CustomMount
*m
) {
783 _cleanup_free_
char *lower
= NULL
, *where
= NULL
, *escaped_source
= NULL
;
790 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
);
792 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
793 if (r
== 0) { /* Doesn't exist yet? */
794 r
= mkdir_label(where
, 0755);
796 return log_error_errno(r
, "Creating mount point for overlay %s failed: %m", where
);
799 (void) mkdir_p_label(m
->source
, 0755);
801 lower
= joined_and_escaped_lower_dirs(m
->lower
);
805 escaped_source
= shell_escape(m
->source
, ",:");
810 options
= strjoina("lowerdir=", escaped_source
, ":", lower
);
812 _cleanup_free_
char *escaped_work_dir
= NULL
;
814 escaped_work_dir
= shell_escape(m
->work_dir
, ",:");
815 if (!escaped_work_dir
)
818 options
= strjoina("lowerdir=", lower
, ",upperdir=", escaped_source
, ",workdir=", escaped_work_dir
);
821 return mount_verbose(LOG_ERR
, "overlay", where
, "overlay", m
->read_only
? MS_RDONLY
: 0, options
);
826 CustomMount
*mounts
, size_t n
,
827 bool userns
, uid_t uid_shift
, uid_t uid_range
,
828 const char *selinux_apifs_context
) {
835 for (i
= 0; i
< n
; i
++) {
836 CustomMount
*m
= mounts
+ i
;
840 case CUSTOM_MOUNT_BIND
:
841 r
= mount_bind(dest
, m
);
844 case CUSTOM_MOUNT_TMPFS
:
845 r
= mount_tmpfs(dest
, m
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
848 case CUSTOM_MOUNT_OVERLAY
:
849 r
= mount_overlay(dest
, m
);
853 assert_not_reached("Unknown custom mount type");
863 /* Retrieve existing subsystems. This function is called in a new cgroup
866 static int get_process_controllers(Set
**ret
) {
867 _cleanup_set_free_free_ Set
*controllers
= NULL
;
868 _cleanup_fclose_
FILE *f
= NULL
;
873 controllers
= set_new(&string_hash_ops
);
877 f
= fopen("/proc/self/cgroup", "re");
879 return errno
== ENOENT
? -ESRCH
: -errno
;
882 _cleanup_free_
char *line
= NULL
;
885 r
= read_line(f
, LONG_LINE_MAX
, &line
);
891 l
= strchr(line
, ':');
902 if (STR_IN_SET(l
, "", "name=systemd", "name=unified"))
905 r
= set_put_strdup(controllers
, l
);
910 *ret
= TAKE_PTR(controllers
);
915 static int mount_legacy_cgroup_hierarchy(
917 const char *controller
,
918 const char *hierarchy
,
921 const char *to
, *fstype
, *opts
;
924 to
= strjoina(strempty(dest
), "/sys/fs/cgroup/", hierarchy
);
926 r
= path_is_mount_point(to
, dest
, 0);
927 if (r
< 0 && r
!= -ENOENT
)
928 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
934 /* The superblock mount options of the mount point need to be
935 * identical to the host's, and hence writable... */
936 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
)) {
939 } else if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
)) {
941 opts
= "none,name=systemd,xattr";
947 r
= mount_verbose(LOG_ERR
, "cgroup", to
, fstype
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, opts
);
951 /* ... hence let's only make the bind mount read-only, not the superblock. */
953 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
954 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
962 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
963 static int mount_legacy_cgns_supported(
965 CGroupUnified unified_requested
,
969 const char *selinux_apifs_context
) {
971 _cleanup_set_free_free_ Set
*controllers
= NULL
;
972 const char *cgroup_root
= "/sys/fs/cgroup", *c
;
975 (void) mkdir_p(cgroup_root
, 0755);
977 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
978 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
980 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
982 _cleanup_free_
char *options
= NULL
;
984 /* When cgroup namespaces are enabled and user namespaces are
985 * used then the mount of the cgroupfs is done *inside* the new
986 * user namespace. We're root in the new user namespace and the
987 * kernel will happily translate our uid/gid to the correct
988 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
989 * pass uid 0 and not uid_shift to tmpfs_patch_options().
991 r
= tmpfs_patch_options("mode=755", userns
, 0, uid_range
, true, selinux_apifs_context
, &options
);
995 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
996 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
1001 r
= cg_all_unified();
1005 goto skip_controllers
;
1007 r
= get_process_controllers(&controllers
);
1009 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
1012 _cleanup_free_
const char *controller
= NULL
;
1014 controller
= set_steal_first(controllers
);
1018 r
= mount_legacy_cgroup_hierarchy("", controller
, controller
, !userns
);
1022 /* When multiple hierarchies are co-mounted, make their
1023 * constituent individual hierarchies a symlink to the
1028 _cleanup_free_
char *target
= NULL
, *tok
= NULL
;
1030 r
= extract_first_word(&c
, &tok
, ",", 0);
1032 return log_error_errno(r
, "Failed to extract co-mounted cgroup controller: %m");
1036 if (streq(controller
, tok
))
1039 target
= prefix_root("/sys/fs/cgroup/", tok
);
1043 r
= symlink_idempotent(controller
, target
);
1045 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
1047 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
1052 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
1053 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
1058 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
1063 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
1064 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
1069 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
1070 static int mount_legacy_cgns_unsupported(
1072 CGroupUnified unified_requested
,
1076 const char *selinux_apifs_context
) {
1078 _cleanup_set_free_free_ Set
*controllers
= NULL
;
1079 const char *cgroup_root
;
1082 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
1084 (void) mkdir_p(cgroup_root
, 0755);
1086 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
1087 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
1089 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
1091 _cleanup_free_
char *options
= NULL
;
1093 r
= tmpfs_patch_options("mode=755", userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &options
);
1097 r
= mount_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
1098 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
1103 r
= cg_all_unified();
1107 goto skip_controllers
;
1109 r
= cg_kernel_controllers(&controllers
);
1111 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
1114 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
1116 controller
= set_steal_first(controllers
);
1120 origin
= prefix_root("/sys/fs/cgroup/", controller
);
1124 r
= readlink_malloc(origin
, &combined
);
1126 /* Not a symbolic link, but directly a single cgroup hierarchy */
1128 r
= mount_legacy_cgroup_hierarchy(dest
, controller
, controller
, true);
1133 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
1135 _cleanup_free_
char *target
= NULL
;
1137 target
= prefix_root(dest
, origin
);
1141 /* A symbolic link, a combination of controllers in one hierarchy */
1143 if (!filename_is_valid(combined
)) {
1144 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
1148 r
= mount_legacy_cgroup_hierarchy(dest
, combined
, combined
, true);
1152 r
= symlink_idempotent(combined
, target
);
1154 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
1156 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
1161 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
1162 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
1167 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
1171 return mount_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
1172 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
1175 static int mount_unified_cgroups(const char *dest
) {
1181 p
= prefix_roota(dest
, "/sys/fs/cgroup");
1183 (void) mkdir_p(p
, 0755);
1185 r
= path_is_mount_point(p
, dest
, AT_SYMLINK_FOLLOW
);
1187 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", p
);
1189 p
= prefix_roota(dest
, "/sys/fs/cgroup/cgroup.procs");
1190 if (access(p
, F_OK
) >= 0)
1192 if (errno
!= ENOENT
)
1193 return log_error_errno(errno
, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p
);
1195 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p
);
1199 return mount_verbose(LOG_ERR
, "cgroup", p
, "cgroup2", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
1204 CGroupUnified unified_requested
,
1208 const char *selinux_apifs_context
,
1211 if (unified_requested
>= CGROUP_UNIFIED_ALL
)
1212 return mount_unified_cgroups(dest
);
1214 return mount_legacy_cgns_supported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
1216 return mount_legacy_cgns_unsupported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
1219 static int mount_systemd_cgroup_writable_one(const char *root
, const char *own
) {
1225 /* Make our own cgroup a (writable) bind mount */
1226 r
= mount_verbose(LOG_ERR
, own
, own
, NULL
, MS_BIND
, NULL
);
1230 /* And then remount the systemd cgroup root read-only */
1231 return mount_verbose(LOG_ERR
, NULL
, root
, NULL
,
1232 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
1235 int mount_systemd_cgroup_writable(
1237 CGroupUnified unified_requested
) {
1239 _cleanup_free_
char *own_cgroup_path
= NULL
;
1240 const char *root
, *own
;
1245 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
1247 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
1249 /* If we are living in the top-level, then there's nothing to do... */
1250 if (path_equal(own_cgroup_path
, "/"))
1253 if (unified_requested
>= CGROUP_UNIFIED_ALL
) {
1255 root
= prefix_roota(dest
, "/sys/fs/cgroup");
1256 own
= strjoina(root
, own_cgroup_path
);
1260 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
1261 root
= prefix_roota(dest
, "/sys/fs/cgroup/unified");
1262 own
= strjoina(root
, own_cgroup_path
);
1264 r
= mount_systemd_cgroup_writable_one(root
, own
);
1269 root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
1270 own
= strjoina(root
, own_cgroup_path
);
1273 return mount_systemd_cgroup_writable_one(root
, own
);
1276 int setup_volatile_state(
1277 const char *directory
,
1279 bool userns
, uid_t uid_shift
, uid_t uid_range
,
1280 const char *selinux_apifs_context
) {
1282 _cleanup_free_
char *buf
= NULL
;
1283 const char *p
, *options
;
1288 if (mode
!= VOLATILE_STATE
)
1291 /* --volatile=state means we simply overmount /var
1292 with a tmpfs, and make the rest read-only. */
1294 r
= bind_remount_recursive(directory
, true, NULL
);
1296 return log_error_errno(r
, "Failed to remount %s read-only: %m", directory
);
1298 p
= prefix_roota(directory
, "/var");
1300 if (r
< 0 && errno
!= EEXIST
)
1301 return log_error_errno(errno
, "Failed to create %s: %m", directory
);
1303 options
= "mode=755";
1304 r
= tmpfs_patch_options(options
, userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &buf
);
1310 return mount_verbose(LOG_ERR
, "tmpfs", p
, "tmpfs", MS_STRICTATIME
, options
);
1314 const char *directory
,
1316 bool userns
, uid_t uid_shift
, uid_t uid_range
,
1317 const char *selinux_apifs_context
) {
1319 bool tmpfs_mounted
= false, bind_mounted
= false;
1320 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1321 _cleanup_free_
char *buf
= NULL
;
1322 const char *f
, *t
, *options
;
1327 if (mode
!= VOLATILE_YES
)
1330 /* --volatile=yes means we mount a tmpfs to the root dir, and
1331 bind-mount the original /usr read-only inside it. */
1333 if (!mkdtemp(template))
1334 return log_error_errno(errno
, "Failed to create temporary directory: %m");
1336 options
= "mode=755";
1337 r
= tmpfs_patch_options(options
, userns
, uid_shift
, uid_range
, false, selinux_apifs_context
, &buf
);
1343 r
= mount_verbose(LOG_ERR
, "tmpfs", template, "tmpfs", MS_STRICTATIME
, options
);
1347 tmpfs_mounted
= true;
1349 f
= prefix_roota(directory
, "/usr");
1350 t
= prefix_roota(template, "/usr");
1353 if (r
< 0 && errno
!= EEXIST
) {
1354 r
= log_error_errno(errno
, "Failed to create %s: %m", t
);
1358 r
= mount_verbose(LOG_ERR
, f
, t
, NULL
, MS_BIND
|MS_REC
, NULL
);
1362 bind_mounted
= true;
1364 r
= bind_remount_recursive(t
, true, NULL
);
1366 log_error_errno(r
, "Failed to remount %s read-only: %m", t
);
1370 r
= mount_verbose(LOG_ERR
, template, directory
, NULL
, MS_MOVE
, NULL
);
1374 (void) rmdir(template);
1380 (void) umount_verbose(t
);
1383 (void) umount_verbose(template);
1384 (void) rmdir(template);
1388 /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
1389 int pivot_root_parse(char **pivot_root_new
, char **pivot_root_old
, const char *s
) {
1390 _cleanup_free_
char *root_new
= NULL
, *root_old
= NULL
;
1394 assert(pivot_root_new
);
1395 assert(pivot_root_old
);
1397 r
= extract_first_word(&p
, &root_new
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
1406 root_old
= strdup(p
);
1411 if (!path_is_absolute(root_new
))
1413 if (root_old
&& !path_is_absolute(root_old
))
1416 free_and_replace(*pivot_root_new
, root_new
);
1417 free_and_replace(*pivot_root_old
, root_old
);
1422 int setup_pivot_root(const char *directory
, const char *pivot_root_new
, const char *pivot_root_old
) {
1423 _cleanup_free_
char *directory_pivot_root_new
= NULL
;
1424 _cleanup_free_
char *pivot_tmp_pivot_root_old
= NULL
;
1425 char pivot_tmp
[] = "/tmp/nspawn-pivot-XXXXXX";
1426 bool remove_pivot_tmp
= false;
1431 if (!pivot_root_new
)
1434 /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1435 * If pivot_root_old is NULL, the existing / disappears.
1436 * This requires a temporary directory, pivot_tmp, which is
1437 * not a child of either.
1439 * This is typically used for OSTree-style containers, where
1440 * the root partition contains several sysroots which could be
1441 * run. Normally, one would be chosen by the bootloader and
1442 * pivoted to / by initramfs.
1444 * For example, for an OSTree deployment, pivot_root_new
1445 * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1446 * code doesn’t do the /var mount which OSTree expects: use
1447 * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1449 * So in the OSTree case, we’ll end up with something like:
1450 * - directory = /tmp/nspawn-root-123456
1451 * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1452 * - pivot_root_old = /sysroot
1453 * - directory_pivot_root_new =
1454 * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1455 * - pivot_tmp = /tmp/nspawn-pivot-123456
1456 * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1458 * Requires all file systems at directory and below to be mounted
1459 * MS_PRIVATE or MS_SLAVE so they can be moved.
1461 directory_pivot_root_new
= prefix_root(directory
, pivot_root_new
);
1463 /* Remount directory_pivot_root_new to make it movable. */
1464 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, directory_pivot_root_new
, NULL
, MS_BIND
, NULL
);
1468 if (pivot_root_old
) {
1469 if (!mkdtemp(pivot_tmp
)) {
1470 r
= log_error_errno(errno
, "Failed to create temporary directory: %m");
1474 remove_pivot_tmp
= true;
1475 pivot_tmp_pivot_root_old
= prefix_root(pivot_tmp
, pivot_root_old
);
1477 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, pivot_tmp
, NULL
, MS_MOVE
, NULL
);
1481 r
= mount_verbose(LOG_ERR
, directory
, pivot_tmp_pivot_root_old
, NULL
, MS_MOVE
, NULL
);
1485 r
= mount_verbose(LOG_ERR
, pivot_tmp
, directory
, NULL
, MS_MOVE
, NULL
);
1489 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, directory
, NULL
, MS_MOVE
, NULL
);
1495 if (remove_pivot_tmp
)
1496 (void) rmdir(pivot_tmp
);