+/* SPDX-License-Identifier: LGPL-2.1+ */
/***
This file is part of systemd.
assert(t >= 0);
assert(t < _CUSTOM_MOUNT_TYPE_MAX);
- c = realloc(*l, (*n + 1) * sizeof(CustomMount));
+ c = realloc_multiply(*l, (*n + 1), sizeof(CustomMount));
if (!c)
return NULL;
free(m->work_dir);
}
+ if (m->rm_rf_tmpdir) {
+ (void) rm_rf(m->rm_rf_tmpdir, REMOVE_ROOT|REMOVE_PHYSICAL);
+ free(m->rm_rf_tmpdir);
+ }
+
strv_free(m->lower);
}
free(l);
}
-int custom_mount_compare(const void *a, const void *b) {
+static int custom_mount_compare(const void *a, const void *b) {
const CustomMount *x = a, *y = b;
int r;
return 0;
}
+static bool source_path_is_valid(const char *p) {
+ assert(p);
+
+ if (*p == '+')
+ p++;
+
+ return path_is_absolute(p);
+}
+
+static char *resolve_source_path(const char *dest, const char *source) {
+
+ if (!source)
+ return NULL;
+
+ if (source[0] == '+')
+ return prefix_root(dest, source + 1);
+
+ return strdup(source);
+}
+
+int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
+ unsigned i;
+ int r;
+
+ /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
+ * parent process, so that we know the temporary directories to remove on exit before we fork off the
+ * children. */
+
+ assert(l || n == 0);
+
+ /* Order the custom mounts, and make sure we have a working directory */
+ qsort_safe(l, n, sizeof(CustomMount), custom_mount_compare);
+
+ for (i = 0; i < n; i++) {
+ CustomMount *m = l + i;
+
+ if (m->source) {
+ char *s;
+
+ s = resolve_source_path(dest, m->source);
+ if (!s)
+ return log_oom();
+
+ free(m->source);
+ m->source = s;
+ } else {
+ /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
+
+ m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX");
+ if (!m->rm_rf_tmpdir)
+ return log_oom();
+
+ if (!mkdtemp(m->rm_rf_tmpdir)) {
+ m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir);
+ return log_error_errno(errno, "Failed to acquire temporary directory: %m");
+ }
+
+ m->source = strjoin(m->rm_rf_tmpdir, "/src");
+ if (!m->source)
+ return log_oom();
+
+ if (mkdir(m->source, 0755) < 0)
+ return log_error_errno(errno, "Failed to create %s: %m", m->source);
+ }
+
+ if (m->type == CUSTOM_MOUNT_OVERLAY) {
+ char **j;
+
+ STRV_FOREACH(j, m->lower) {
+ char *s;
+
+ s = resolve_source_path(dest, *j);
+ if (!s)
+ return log_oom();
+
+ free(*j);
+ *j = s;
+ }
+
+ if (m->work_dir) {
+ char *s;
+
+ s = resolve_source_path(dest, m->work_dir);
+ if (!s)
+ return log_oom();
+
+ free(m->work_dir);
+ m->work_dir = s;
+ } else {
+ assert(m->source);
+
+ r = tempfn_random(m->source, NULL, &m->work_dir);
+ if (r < 0)
+ return log_error_errno(r, "Failed to acquire working directory: %m");
+ }
+
+ (void) mkdir_label(m->work_dir, 0700);
+ }
+ }
+
+ return 0;
+}
+
int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
_cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
const char *p = s;
return r;
if (r == 0)
return -EINVAL;
-
if (r == 1) {
- destination = strdup(source);
+ destination = strdup(source[0] == '+' ? source+1 : source);
if (!destination)
return -ENOMEM;
}
-
if (r == 2 && !isempty(p)) {
opts = strdup(p);
if (!opts)
return -ENOMEM;
}
- if (!path_is_absolute(source))
+ if (isempty(source))
+ source = NULL;
+ else if (!source_path_is_valid(source))
return -EINVAL;
if (!path_is_absolute(destination))
m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
if (!m)
- return log_oom();
+ return -ENOMEM;
m->source = source;
m->destination = destination;
return 0;
}
+int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
+ _cleanup_free_ char *upper = NULL, *destination = NULL;
+ _cleanup_strv_free_ char **lower = NULL;
+ CustomMount *m;
+ int k;
+
+ k = strv_split_extract(&lower, s, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (k < 0)
+ return k;
+ if (k < 2)
+ return -EADDRNOTAVAIL;
+ if (k == 2) {
+ /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
+ * we'll also define the destination mount point the same as the upper. */
+
+ if (!source_path_is_valid(lower[0]) ||
+ !source_path_is_valid(lower[1]))
+ return -EINVAL;
+
+ upper = lower[1];
+ lower[1] = NULL;
+
+ destination = strdup(upper[0] == '+' ? upper+1 : upper); /* take the destination without "+" prefix */
+ if (!destination)
+ return -ENOMEM;
+ } else {
+ char **i;
+
+ /* If more than two parameters are specified, the last one is the destination, the second to last one
+ * the "upper", and all before that the "lower" directories. */
+
+ destination = lower[k - 1];
+ upper = lower[k - 2];
+ lower[k - 2] = NULL;
+
+ STRV_FOREACH(i, lower)
+ if (!source_path_is_valid(*i))
+ return -EINVAL;
+
+ /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
+ * in /var/tmp */
+ if (isempty(upper))
+ upper = NULL;
+ else if (!source_path_is_valid(upper))
+ return -EINVAL;
+
+ if (!path_is_absolute(destination))
+ return -EINVAL;
+ }
+
+ m = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY);
+ if (!m)
+ return -ENOMEM;
+
+ m->destination = destination;
+ m->source = upper;
+ m->lower = lower;
+ m->read_only = read_only;
+
+ upper = destination = NULL;
+ lower = NULL;
+
+ return 0;
+}
+
static int tmpfs_patch_options(
const char *options,
bool userns,
options = buf;
}
-#ifdef HAVE_SELINUX
+#if HAVE_SELINUX
if (selinux_apifs_context) {
char *t;
return !!buf;
}
-int mount_sysfs(const char *dest) {
+int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
const char *full, *top, *x;
int r;
+ unsigned long extra_flags = 0;
top = prefix_roota(dest, "/sys");
r = path_check_fstype(top, SYSFS_MAGIC);
(void) mkdir(full, 0755);
+ if (mount_settings & MOUNT_APPLY_APIVFS_RO)
+ extra_flags |= MS_RDONLY;
+
r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
- MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+ MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL);
if (r < 0)
return r;
return r;
r = mount_verbose(LOG_ERR, NULL, to, NULL,
- MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL);
+ MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
if (r < 0)
return r;
}
if (rmdir(full) < 0)
return log_error_errno(errno, "Failed to remove %s: %m", full);
- x = prefix_roota(top, "/fs/kdbus");
- (void) mkdir_p(x, 0755);
-
/* Create mountpoint for cgroups. Otherwise we are not allowed since we
* remount /sys read-only.
*/
}
return mount_verbose(LOG_ERR, NULL, top, NULL,
- MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL);
+ MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
}
-static int mkdir_userns(const char *path, mode_t mode, bool in_userns, uid_t uid_shift) {
+static int mkdir_userns(const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) {
int r;
assert(path);
if (r < 0 && errno != EEXIST)
return -errno;
- if (!in_userns) {
- r = lchown(path, uid_shift, uid_shift);
- if (r < 0)
- return -errno;
- }
+ if ((mask & MOUNT_USE_USERNS) == 0)
+ return 0;
+
+ if (mask & MOUNT_IN_USERNS)
+ return 0;
+
+ r = lchown(path, uid_shift, uid_shift);
+ if (r < 0)
+ return -errno;
return 0;
}
-static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, bool in_userns, uid_t uid_shift) {
+static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) {
const char *p, *e;
int r;
if (prefix && path_startswith(prefix, t))
continue;
- r = mkdir_userns(t, mode, in_userns, uid_shift);
+ r = mkdir_userns(t, mode, mask, uid_shift);
if (r < 0)
return r;
}
- return mkdir_userns(path, mode, in_userns, uid_shift);
+ return mkdir_userns(path, mode, mask, uid_shift);
}
int mount_all(const char *dest,
- bool use_userns, bool in_userns,
- bool use_netns,
+ MountSettingsMask mount_settings,
uid_t uid_shift, uid_t uid_range,
const char *selinux_apifs_context) {
const char *type;
const char *options;
unsigned long flags;
- bool fatal;
- bool in_userns;
- bool use_netns;
+ MountSettingsMask mount_settings;
} MountPoint;
static const MountPoint mount_table[] = {
- { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false },
- { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first ...*/
- { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, true, true, true }, /* (except for this) */
- { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* ... then, make it r/o */
- { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, false, true, false }, /* Bind mount first ...*/
- { NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, true, false }, /* ... then, make it r/o */
- { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true },
- { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false },
- { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false },
- { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
- { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
- { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, true, false },
-#ifdef HAVE_SELINUX
- { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */
- { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */
+ /* inner child mounts */
+ { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_IN_USERNS },
+ { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
+ { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
+ { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
+ { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
+ { NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
+
+ /* outer child mounts */
+ { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
+ { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
+ { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
+ { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL }, /* skipped if above was mounted */
+
+ { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL },
+ { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
+ { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
+#if HAVE_SELINUX
+ { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, 0 }, /* Bind mount first */
+ { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 }, /* Then, make it r/o */
#endif
};
unsigned k;
int r;
+ bool use_userns = (mount_settings & MOUNT_USE_USERNS);
+ bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
+ bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
+ bool in_userns = (mount_settings & MOUNT_IN_USERNS);
for (k = 0; k < ELEMENTSOF(mount_table); k++) {
_cleanup_free_ char *where = NULL, *options = NULL;
const char *o;
+ bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
- if (in_userns != mount_table[k].in_userns)
+ if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
continue;
- if (!use_netns && mount_table[k].use_netns)
+ if (!netns && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_NETNS))
continue;
- where = prefix_root(dest, mount_table[k].where);
- if (!where)
- return log_oom();
+ if (!ro && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_RO))
+ continue;
- r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
+ r = chase_symlinks(mount_table[k].where, dest, CHASE_NONEXISTENT|CHASE_PREFIX_ROOT, &where);
+ if (r < 0)
+ return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
+
+ r = path_is_mount_point(where, NULL, 0);
if (r < 0 && r != -ENOENT)
return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
if (mount_table[k].what && r > 0)
continue;
- r = mkdir_userns_p(dest, where, 0755, in_userns, uid_shift);
+ r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift);
if (r < 0 && r != -EEXIST) {
- if (mount_table[k].fatal)
+ if (fatal && r != -EROFS)
return log_error_errno(r, "Failed to create directory %s: %m", where);
log_debug_errno(r, "Failed to create directory %s: %m", where);
- continue;
+ /* If we failed mkdir() or chown() due to the root
+ * directory being read only, attempt to mount this fs
+ * anyway and let mount_verbose log any errors */
+ if (r != -EROFS)
+ continue;
}
o = mount_table[k].options;
o = options;
}
- r = mount_verbose(mount_table[k].fatal ? LOG_ERR : LOG_WARNING,
+ r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
mount_table[k].what,
where,
mount_table[k].type,
mount_table[k].flags,
o);
- if (r < 0 && mount_table[k].fatal)
+ if (r < 0 && fatal)
return r;
}
const char *p = options;
unsigned long flags = *mount_flags;
char *opts = NULL;
+ int r;
assert(options);
for (;;) {
_cleanup_free_ char *word = NULL;
- int r = extract_first_word(&p, &word, ",", 0);
+
+ r = extract_first_word(&p, &word, ",", 0);
if (r < 0)
return log_error_errno(r, "Failed to extract mount option: %m");
if (r == 0)
}
static int mount_bind(const char *dest, CustomMount *m) {
- struct stat source_st, dest_st;
- const char *where;
+
+ _cleanup_free_ char *mount_opts = NULL, *where = NULL;
unsigned long mount_flags = MS_BIND | MS_REC;
- _cleanup_free_ char *mount_opts = NULL;
+ struct stat source_st, dest_st;
int r;
+ assert(dest);
assert(m);
if (m->options) {
if (stat(m->source, &source_st) < 0)
return log_error_errno(errno, "Failed to stat %s: %m", m->source);
- where = prefix_roota(dest, m->destination);
+ r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
+ if (r < 0)
+ return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
+ if (r > 0) { /* Path exists already? */
+
+ if (stat(where, &dest_st) < 0)
+ return log_error_errno(errno, "Failed to stat %s: %m", where);
- if (stat(where, &dest_st) >= 0) {
if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
log_error("Cannot bind mount directory %s on file %s.", m->source, where);
return -EINVAL;
return -EINVAL;
}
- } else if (errno == ENOENT) {
+ } else { /* Path doesn't exist yet? */
r = mkdir_parents_label(where, 0755);
if (r < 0)
return log_error_errno(r, "Failed to make parents of %s: %m", where);
if (r < 0)
return log_error_errno(r, "Failed to create mount point %s: %m", where);
- } else
- return log_error_errno(errno, "Failed to stat %s: %m", where);
+ }
r = mount_verbose(LOG_ERR, m->source, where, NULL, mount_flags, mount_opts);
if (r < 0)
bool userns, uid_t uid_shift, uid_t uid_range,
const char *selinux_apifs_context) {
- const char *where, *options;
- _cleanup_free_ char *buf = NULL;
+ const char *options;
+ _cleanup_free_ char *buf = NULL, *where = NULL;
int r;
assert(dest);
assert(m);
- where = prefix_roota(dest, m->destination);
-
- r = mkdir_p_label(where, 0755);
- if (r < 0 && r != -EEXIST)
- return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
+ r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
+ if (r < 0)
+ return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
+ if (r == 0) { /* Doesn't exist yet? */
+ r = mkdir_p_label(where, 0755);
+ if (r < 0)
+ return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
+ }
r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
if (r < 0)
return mount_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
}
-static char *joined_and_escaped_lower_dirs(char * const *lower) {
+static char *joined_and_escaped_lower_dirs(char **lower) {
_cleanup_strv_free_ char **sv = NULL;
sv = strv_copy(lower);
}
static int mount_overlay(const char *dest, CustomMount *m) {
- _cleanup_free_ char *lower = NULL;
- const char *where, *options;
+
+ _cleanup_free_ char *lower = NULL, *where = NULL, *escaped_source = NULL;
+ const char *options;
int r;
assert(dest);
assert(m);
- where = prefix_roota(dest, m->destination);
-
- r = mkdir_label(where, 0755);
- if (r < 0 && r != -EEXIST)
- return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
+ r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
+ if (r < 0)
+ return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
+ if (r == 0) { /* Doesn't exist yet? */
+ r = mkdir_label(where, 0755);
+ if (r < 0)
+ return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
+ }
(void) mkdir_p_label(m->source, 0755);
if (!lower)
return log_oom();
- if (m->read_only) {
- _cleanup_free_ char *escaped_source = NULL;
-
- escaped_source = shell_escape(m->source, ",:");
- if (!escaped_source)
- return log_oom();
+ escaped_source = shell_escape(m->source, ",:");
+ if (!escaped_source)
+ return log_oom();
+ if (m->read_only)
options = strjoina("lowerdir=", escaped_source, ":", lower);
- } else {
- _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
-
- assert(m->work_dir);
- (void) mkdir_label(m->work_dir, 0700);
+ else {
+ _cleanup_free_ char *escaped_work_dir = NULL;
- escaped_source = shell_escape(m->source, ",:");
- if (!escaped_source)
- return log_oom();
escaped_work_dir = shell_escape(m->work_dir, ",:");
if (!escaped_work_dir)
return log_oom();
*e = 0;
- if (STR_IN_SET(l, "", "name=systemd"))
+ if (STR_IN_SET(l, "", "name=systemd", "name=unified"))
continue;
p = strdup(l);
return 0;
}
-static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy,
- CGroupUnified unified_requested, bool read_only) {
+static int mount_legacy_cgroup_hierarchy(
+ const char *dest,
+ const char *controller,
+ const char *hierarchy,
+ bool read_only) {
+
const char *to, *fstype, *opts;
int r;
to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
- r = path_is_mount_point(to, 0);
+ r = path_is_mount_point(to, dest, 0);
if (r < 0 && r != -ENOENT)
return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
if (r > 0)
/* The superblock mount options of the mount point need to be
* identical to the hosts', and hence writable... */
- if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
- if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
- fstype = "cgroup2";
- opts = NULL;
- } else {
- fstype = "cgroup";
- opts = "none,name=systemd,xattr";
- }
+ if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_HYBRID)) {
+ fstype = "cgroup2";
+ opts = NULL;
+ } else if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_LEGACY)) {
+ fstype = "cgroup";
+ opts = "none,name=systemd,xattr";
} else {
fstype = "cgroup";
opts = controller;
/* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
static int mount_legacy_cgns_supported(
- CGroupUnified unified_requested, bool userns, uid_t uid_shift,
- uid_t uid_range, const char *selinux_apifs_context) {
+ const char *dest,
+ CGroupUnified unified_requested,
+ bool userns,
+ uid_t uid_shift,
+ uid_t uid_range,
+ const char *selinux_apifs_context) {
+
_cleanup_set_free_free_ Set *controllers = NULL;
const char *cgroup_root = "/sys/fs/cgroup", *c;
int r;
(void) mkdir_p(cgroup_root, 0755);
/* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
- r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
+ r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
if (r < 0)
return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
if (r == 0) {
return r;
}
- if (cg_all_unified() > 0)
+ r = cg_all_unified();
+ if (r < 0)
+ return r;
+ if (r > 0)
goto skip_controllers;
controllers = set_new(&string_hash_ops);
if (!controller)
break;
- r = mount_legacy_cgroup_hierarchy("", controller, controller, unified_requested, !userns);
+ r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns);
if (r < 0)
return r;
}
skip_controllers:
- r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER, "systemd", unified_requested, false);
+ if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
+ r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
+ if (r < 0)
+ return r;
+ }
+
+ r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
if (r < 0)
return r;
/* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
static int mount_legacy_cgns_unsupported(
const char *dest,
- CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range,
+ CGroupUnified unified_requested,
+ bool userns,
+ uid_t uid_shift,
+ uid_t uid_range,
const char *selinux_apifs_context) {
+
_cleanup_set_free_free_ Set *controllers = NULL;
const char *cgroup_root;
int r;
(void) mkdir_p(cgroup_root, 0755);
/* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
- r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
+ r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
if (r < 0)
return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
if (r == 0) {
return r;
}
- if (cg_all_unified() > 0)
+ r = cg_all_unified();
+ if (r < 0)
+ return r;
+ if (r > 0)
goto skip_controllers;
controllers = set_new(&string_hash_ops);
if (r == -EINVAL) {
/* Not a symbolic link, but directly a single cgroup hierarchy */
- r = mount_legacy_cgroup_hierarchy(dest, controller, controller, unified_requested, true);
+ r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
if (r < 0)
return r;
continue;
}
- r = mount_legacy_cgroup_hierarchy(dest, combined, combined, unified_requested, true);
+ r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
if (r < 0)
return r;
}
skip_controllers:
- r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER, "systemd", unified_requested, false);
+ if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
+ r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
+ if (r < 0)
+ return r;
+ }
+
+ r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
if (r < 0)
return r;
(void) mkdir_p(p, 0755);
- r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
+ r = path_is_mount_point(p, dest, AT_SYMLINK_FOLLOW);
if (r < 0)
return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
if (r > 0) {
int mount_cgroups(
const char *dest,
CGroupUnified unified_requested,
- bool userns, uid_t uid_shift, uid_t uid_range,
+ bool userns,
+ uid_t uid_shift,
+ uid_t uid_range,
const char *selinux_apifs_context,
bool use_cgns) {
if (unified_requested >= CGROUP_UNIFIED_ALL)
return mount_unified_cgroups(dest);
else if (use_cgns)
- return mount_legacy_cgns_supported(unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
+ return mount_legacy_cgns_supported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
}
+static int mount_systemd_cgroup_writable_one(const char *systemd_own, const char *systemd_root)
+{
+ int r;
+
+ /* Make our own cgroup a (writable) bind mount */
+ r = mount_verbose(LOG_ERR, systemd_own, systemd_own, NULL, MS_BIND, NULL);
+ if (r < 0)
+ return r;
+
+ /* And then remount the systemd cgroup root read-only */
+ return mount_verbose(LOG_ERR, NULL, systemd_root, NULL,
+ MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+}
+
int mount_systemd_cgroup_writable(
const char *dest,
CGroupUnified unified_requested) {
_cleanup_free_ char *own_cgroup_path = NULL;
- const char *systemd_root, *systemd_own;
int r;
assert(dest);
if (path_equal(own_cgroup_path, "/"))
return 0;
- if (unified_requested >= CGROUP_UNIFIED_ALL) {
- systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
- systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
- } else {
- systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
- systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
- }
+ if (unified_requested >= CGROUP_UNIFIED_ALL)
+ return mount_systemd_cgroup_writable_one(strjoina(dest, "/sys/fs/cgroup", own_cgroup_path),
+ prefix_roota(dest, "/sys/fs/cgroup"));
- /* Make our own cgroup a (writable) bind mount */
- r = mount_verbose(LOG_ERR, systemd_own, systemd_own, NULL, MS_BIND, NULL);
- if (r < 0)
- return r;
+ if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
+ r = mount_systemd_cgroup_writable_one(strjoina(dest, "/sys/fs/cgroup/unified", own_cgroup_path),
+ prefix_roota(dest, "/sys/fs/cgroup/unified"));
+ if (r < 0)
+ return r;
+ }
- /* And then remount the systemd cgroup root read-only */
- return mount_verbose(LOG_ERR, NULL, systemd_root, NULL,
- MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+ return mount_systemd_cgroup_writable_one(strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path),
+ prefix_roota(dest, "/sys/fs/cgroup/systemd"));
}
int setup_volatile_state(
return r;
}
-VolatileMode volatile_mode_from_string(const char *s) {
- int b;
+/* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
+int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) {
+ _cleanup_free_ char *root_new = NULL, *root_old = NULL;
+ const char *p = s;
+ int r;
+
+ assert(pivot_root_new);
+ assert(pivot_root_old);
+
+ r = extract_first_word(&p, &root_new, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EINVAL;
+
+ if (isempty(p))
+ root_old = NULL;
+ else {
+ root_old = strdup(p);
+ if (!root_old)
+ return -ENOMEM;
+ }
+
+ if (!path_is_absolute(root_new))
+ return -EINVAL;
+ if (root_old && !path_is_absolute(root_old))
+ return -EINVAL;
+
+ free_and_replace(*pivot_root_new, root_new);
+ free_and_replace(*pivot_root_old, root_old);
+
+ return 0;
+}
+
+int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old) {
+ _cleanup_free_ char *directory_pivot_root_new = NULL;
+ _cleanup_free_ char *pivot_tmp_pivot_root_old = NULL;
+ char pivot_tmp[] = "/tmp/nspawn-pivot-XXXXXX";
+ bool remove_pivot_tmp = false;
+ int r;
- if (isempty(s))
- return _VOLATILE_MODE_INVALID;
+ assert(directory);
- b = parse_boolean(s);
- if (b > 0)
- return VOLATILE_YES;
- if (b == 0)
- return VOLATILE_NO;
+ if (!pivot_root_new)
+ return 0;
- if (streq(s, "state"))
- return VOLATILE_STATE;
+ /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
+ * If pivot_root_old is NULL, the existing / disappears.
+ * This requires a temporary directory, pivot_tmp, which is
+ * not a child of either.
+ *
+ * This is typically used for OSTree-style containers, where
+ * the root partition contains several sysroots which could be
+ * run. Normally, one would be chosen by the bootloader and
+ * pivoted to / by initramfs.
+ *
+ * For example, for an OSTree deployment, pivot_root_new
+ * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
+ * code doesn’t do the /var mount which OSTree expects: use
+ * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
+ *
+ * So in the OSTree case, we’ll end up with something like:
+ * - directory = /tmp/nspawn-root-123456
+ * - pivot_root_new = /ostree/deploy/os/deploy/123abc
+ * - pivot_root_old = /sysroot
+ * - directory_pivot_root_new =
+ * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
+ * - pivot_tmp = /tmp/nspawn-pivot-123456
+ * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
+ *
+ * Requires all file systems at directory and below to be mounted
+ * MS_PRIVATE or MS_SLAVE so they can be moved.
+ */
+ directory_pivot_root_new = prefix_root(directory, pivot_root_new);
- return _VOLATILE_MODE_INVALID;
+ /* Remount directory_pivot_root_new to make it movable. */
+ r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory_pivot_root_new, NULL, MS_BIND, NULL);
+ if (r < 0)
+ goto done;
+
+ if (pivot_root_old) {
+ if (!mkdtemp(pivot_tmp)) {
+ r = log_error_errno(errno, "Failed to create temporary directory: %m");
+ goto done;
+ }
+
+ remove_pivot_tmp = true;
+ pivot_tmp_pivot_root_old = prefix_root(pivot_tmp, pivot_root_old);
+
+ r = mount_verbose(LOG_ERR, directory_pivot_root_new, pivot_tmp, NULL, MS_MOVE, NULL);
+ if (r < 0)
+ goto done;
+
+ r = mount_verbose(LOG_ERR, directory, pivot_tmp_pivot_root_old, NULL, MS_MOVE, NULL);
+ if (r < 0)
+ goto done;
+
+ r = mount_verbose(LOG_ERR, pivot_tmp, directory, NULL, MS_MOVE, NULL);
+ if (r < 0)
+ goto done;
+ } else {
+ r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory, NULL, MS_MOVE, NULL);
+ if (r < 0)
+ goto done;
+ }
+
+done:
+ if (remove_pivot_tmp)
+ (void) rmdir(pivot_tmp);
+
+ return r;
}