};
static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
- [INACCESSIBLE] = "inaccessible",
- [OVERLAY_MOUNT] = "overlay",
- [BIND_MOUNT] = "bind",
- [BIND_MOUNT_RECURSIVE] = "rbind",
- [PRIVATE_TMP] = "private-tmp",
- [PRIVATE_DEV] = "private-dev",
- [BIND_DEV] = "bind-dev",
- [EMPTY_DIR] = "empty",
- [PRIVATE_SYSFS] = "private-sysfs",
- [BIND_SYSFS] = "bind-sysfs",
- [PROCFS] = "procfs",
- [READONLY] = "read-only",
- [READWRITE] = "read-write",
- [TMPFS] = "tmpfs",
- [MOUNT_IMAGES] = "mount-images",
- [READWRITE_IMPLICIT] = "rw-implicit",
- [EXEC] = "exec",
- [NOEXEC] = "noexec",
- [MQUEUEFS] = "mqueuefs",
+ [INACCESSIBLE] = "inaccessible",
+ [OVERLAY_MOUNT] = "overlay",
+ [MOUNT_IMAGES] = "mount-images",
+ [BIND_MOUNT] = "bind",
+ [BIND_MOUNT_RECURSIVE] = "rbind",
+ [PRIVATE_TMP] = "private-tmp",
+ [PRIVATE_TMP_READONLY] = "private-tmp-read-only",
+ [PRIVATE_DEV] = "private-dev",
+ [BIND_DEV] = "bind-dev",
+ [EMPTY_DIR] = "empty",
+ [PRIVATE_SYSFS] = "private-sysfs",
+ [BIND_SYSFS] = "bind-sysfs",
+ [PROCFS] = "procfs",
+ [READONLY] = "read-only",
+ [READWRITE] = "read-write",
+ [NOEXEC] = "noexec",
+ [EXEC] = "exec",
+ [TMPFS] = "tmpfs",
+ [RUN] = "run",
+ [EXTENSION_DIRECTORIES] = "extension-directories",
+ [EXTENSION_IMAGES] = "extension-images",
+ [MQUEUEFS] = "mqueuefs",
+ [READWRITE_IMPLICIT] = "read-write-implicit",
};
/* Helper struct for naming simplicity and reusability */
return 0;
}
-static int mount_private_dev(MountEntry *m) {
+static char *settle_runtime_dir(RuntimeScope scope) {
+ char *runtime_dir;
+
+ if (scope != RUNTIME_SCOPE_USER)
+ return strdup("/run/");
+
+ if (asprintf(&runtime_dir, "/run/user/" UID_FMT, geteuid()) < 0)
+ return NULL;
+
+ return runtime_dir;
+}
+
+static int mount_private_dev(MountEntry *m, RuntimeScope scope) {
static const char devnodes[] =
"/dev/null\0"
"/dev/zero\0"
"/dev/urandom\0"
"/dev/tty\0";
- char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
+ _cleanup_free_ char *runtime_dir = NULL, *temporary_mount = NULL;
const char *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
bool can_mknod = true;
int r;
assert(m);
+ runtime_dir = settle_runtime_dir(scope);
+ if (!runtime_dir)
+ return log_oom_debug();
+
+ temporary_mount = path_join(runtime_dir, "systemd/namespace-dev-XXXXXX");
+ if (!temporary_mount)
+ return log_oom_debug();
+
if (!mkdtemp(temporary_mount))
return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);
if (r < 0)
log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount);
+ /* Make the bind mount read-only. */
+ r = mount_nofollow_verbose(LOG_DEBUG, NULL, dev, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
+ if (r < 0)
+ return r;
+
/* Create the /dev directory if missing. It is more likely to be missing when the service is started
* with RootDirectory. This is consistent with mount units creating the mount points when missing. */
(void) mkdir_p_label(mount_entry_path(m), 0755);
if (r > 0) /* make this a NOP if /dev is already a mount point */
return 0;
- r = mount_nofollow_verbose(LOG_DEBUG, "/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
- if (r < 0)
- return r;
-
- return 1;
-}
-
-static int mount_private_sysfs(const MountEntry *m) {
- const char *p = mount_entry_path(ASSERT_PTR(m));
- int r;
-
- (void) mkdir_p_label(p, 0755);
-
- r = remount_sysfs(p);
- if (r < 0 && (ERRNO_IS_PRIVILEGE(r) || ERRNO_IS_NOT_SUPPORTED(r))) {
- /* Running with an unprivileged user (PrivateUsers=yes), or the kernel seems old. Falling
- * back to bind mount the host's version so that we get all child mounts of it, too. */
-
- log_debug_errno(r, "Failed to remount sysfs on %s, falling back to bind mount: %m", p);
-
- (void) umount_recursive(p, 0);
-
- r = mount_nofollow_verbose(LOG_DEBUG, "/sys", p, NULL, MS_BIND|MS_REC, NULL);
- }
- if (r < 0)
- return log_debug_errno(r, "Failed to remount sysfs on %s: %m", p);
-
- return 1;
+ return mount_nofollow_verbose(LOG_DEBUG, "/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
}
static int mount_bind_sysfs(const MountEntry *m) {
return 0;
/* Bind mount the host's version so that we get all child mounts of it, too. */
- r = mount_nofollow_verbose(LOG_DEBUG, "/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
- if (r < 0)
+ return mount_nofollow_verbose(LOG_DEBUG, "/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
+}
+
+static int mount_private_sysfs(const MountEntry *m) {
+ const char *entry_path = mount_entry_path(ASSERT_PTR(m));
+ int r, n;
+
+ (void) mkdir_p_label(entry_path, 0755);
+
+ n = umount_recursive(entry_path, 0);
+
+ r = mount_nofollow_verbose(LOG_DEBUG, "sysfs", entry_path, "sysfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+ if (ERRNO_IS_NEG_PRIVILEGE(r)) {
+ /* When we do not have enough privileges to mount sysfs, fall back to use existing /sys. */
+
+ if (n > 0)
+ /* /sys or some of its sub-mounts were unmounted above. Refuse an incomplete tree.
+ * Propagate the original error code returned by the mount() call above. */
+ return r;
+
+ return mount_bind_sysfs(m);
+
+ } else if (r < 0)
return r;
- return 1;
+ /* We mounted a new instance now. Let's bind mount the children over now. */
+ (void) bind_mount_submounts("/sys", entry_path);
+ return 0;
}
static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
* means we really don't want to use it, since it would affect our host's /proc
* mount. Hence let's gracefully fallback to a classic, unrestricted version. */
r = mount_nofollow_verbose(LOG_DEBUG, "proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
- if (r == -EPERM) {
- /* When we do not have enough privileges to mount /proc, fallback to use existing /proc. */
+ if (ERRNO_IS_NEG_PRIVILEGE(r)) {
+ /* When we do not have enough privileges to mount /proc, fall back to use existing /proc. */
if (n > 0)
/* /proc or some of sub-mounts are umounted in the above. Refuse incomplete tree.
* Propagate the original error code returned by mount() in the above. */
- return -EPERM;
+ return r;
r = path_is_mount_point(entry_path, NULL, 0);
if (r < 0)
return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
- if (r == 0) {
- /* We lack permissions to mount a new instance of /proc, and it is not already
- * mounted. But we can access the host's, so as a final fallback bind-mount it to
- * the destination, as most likely we are inside a user manager in an unprivileged
- * user namespace. */
- r = mount_nofollow_verbose(LOG_DEBUG, "/proc", entry_path, NULL, MS_BIND|MS_REC, NULL);
- if (r < 0)
- return -EPERM;
- }
+ if (r > 0)
+ return 0;
+
+ /* We lack permissions to mount a new instance of /proc, and it is not already mounted. But
+ * we can access the host's, so as a final fallback bind-mount it to the destination, as most
+ * likely we are inside a user manager in an unprivileged user namespace. */
+ return mount_nofollow_verbose(LOG_DEBUG, "/proc", entry_path, NULL, MS_BIND|MS_REC, NULL);
+
} else if (r < 0)
return r;
- else
- /* We mounted a new instance now. Let's bind mount the children over now. This matters for
- * nspawn where a bunch of files are overmounted, in particular the boot id */
- (void) bind_mount_submounts("/proc", entry_path);
- return 1;
+ /* We mounted a new instance now. Let's bind mount the children over now. This matters for nspawn
+ * where a bunch of files are overmounted, in particular the boot id */
+ (void) bind_mount_submounts("/proc", entry_path);
+ return 0;
}
static int mount_tmpfs(const MountEntry *m) {
if (r < 0)
return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", entry_path, inner_path);
- return 1;
+ return 0;
}
static int mount_run(const MountEntry *m) {
if (r < 0)
return log_debug_errno(r, "Failed to mount image %s on %s: %m", mount_entry_source(m), mount_entry_path(m));
- return 1;
+ return 0;
}
static int mount_overlay(const MountEntry *m) {
r = mount_nofollow_verbose(LOG_DEBUG, "overlay", mount_entry_path(m), "overlay", MS_RDONLY, options);
if (r == -ENOENT && m->ignore)
return 0;
- if (r < 0)
- return r;
- return 1;
+ return r;
}
static int follow_symlink(
MountEntry *m,
const ImagePolicy *mount_image_policy,
const ImagePolicy *extension_image_policy,
- const NamespaceInfo *ns_info) {
+ const NamespaceInfo *ns_info,
+ RuntimeScope scope) {
_cleanup_free_ char *inaccessible = NULL;
bool rbind = true, make = false;
switch (m->mode) {
case INACCESSIBLE: {
- _cleanup_free_ char *tmp = NULL;
- const char *runtime_dir;
+ _cleanup_free_ char *runtime_dir = NULL;
struct stat target;
/* First, get rid of everything that is below if there
mount_entry_path(m));
}
- if (geteuid() == 0)
- runtime_dir = "/run";
- else {
- if (asprintf(&tmp, "/run/user/" UID_FMT, geteuid()) < 0)
- return -ENOMEM;
-
- runtime_dir = tmp;
- }
+ /* We don't pass the literal runtime scope through here but one based purely on our UID. This
+ * means that the root user's --user services will use the host's inaccessible inodes rather
+ * than root's private ones. This is preferable since it means device nodes that are
+ * overmounted to make them inaccessible will be overmounted with a device node, rather than
+ * an AF_UNIX socket inode. */
+ runtime_dir = settle_runtime_dir(geteuid() == 0 ? RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER);
+ if (!runtime_dir)
+ return log_oom_debug();
r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
if (r < 0)
break;
case PRIVATE_DEV:
- return mount_private_dev(m);
+ return mount_private_dev(m, scope);
case BIND_DEV:
return mount_bind_dev(m);
return 0;
}
+static void mount_entry_path_debug_string(const char *root, MountEntry *m, char **error_path) {
+ assert(m);
+
+ /* Create a string suitable for debugging logs, stripping for example the local working directory.
+ * For example, with a BindPaths=/var/bar that does not exist on the host:
+ *
+ * Before:
+ * foo.service: Failed to set up mount namespacing: /run/systemd/unit-root/var/bar: No such file or directory
+ * After:
+ * foo.service: Failed to set up mount namespacing: /var/bar: No such file or directory
+ *
+ * Note that this is an error path, so no OOM check is done on purpose. */
+
+ if (!error_path)
+ return;
+
+ if (!mount_entry_path(m)) {
+ *error_path = NULL;
+ return;
+ }
+
+ if (root) {
+ const char *e = startswith(mount_entry_path(m), root);
+ if (e) {
+ *error_path = strdup(e);
+ return;
+ }
+ }
+
+ *error_path = strdup(mount_entry_path(m));
+ return;
+}
+
static int apply_mounts(
const char *root,
const ImagePolicy *mount_image_policy,
const NamespaceInfo *ns_info,
MountEntry *mounts,
size_t *n_mounts,
+ RuntimeScope scope,
char **symlinks,
char **error_path) {
/* ExtensionImages/Directories are first opened in the propagate directory, not in the root_directory */
r = follow_symlink(!IN_SET(m->mode, EXTENSION_IMAGES, EXTENSION_DIRECTORIES) ? root : NULL, m);
if (r < 0) {
- if (error_path && mount_entry_path(m))
- *error_path = strdup(mount_entry_path(m));
+ mount_entry_path_debug_string(root, m, error_path);
return r;
}
if (r == 0) {
break;
}
- r = apply_one_mount(root, m, mount_image_policy, extension_image_policy, ns_info);
+ r = apply_one_mount(root, m, mount_image_policy, extension_image_policy, ns_info, scope);
if (r < 0) {
- if (error_path && mount_entry_path(m))
- *error_path = strdup(mount_entry_path(m));
+ mount_entry_path_debug_string(root, m, error_path);
return r;
}
for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
r = make_read_only(m, deny_list, proc_self_mountinfo);
if (r < 0) {
- if (error_path && mount_entry_path(m))
- *error_path = strdup(mount_entry_path(m));
+ mount_entry_path_debug_string(root, m, error_path);
return r;
}
}
for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
r = make_noexec(m, deny_list, proc_self_mountinfo);
if (r < 0) {
- if (error_path && mount_entry_path(m))
- *error_path = strdup(mount_entry_path(m));
+ mount_entry_path_debug_string(root, m, error_path);
return r;
}
}
for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
r = make_nosuid(m, proc_self_mountinfo);
if (r < 0) {
- if (error_path && mount_entry_path(m))
- *error_path = strdup(mount_entry_path(m));
+ mount_entry_path_debug_string(root, m, error_path);
return r;
}
}
const char *extension_dir,
const char *notify_socket,
const char *host_os_release_stage,
+ RuntimeScope scope,
char **error_path) {
_cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
(void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
/* Now make the magic happen */
- r = apply_mounts(root, mount_image_policy, extension_image_policy, ns_info, mounts, &n_mounts, symlinks, error_path);
+ r = apply_mounts(root,
+ mount_image_policy,
+ extension_image_policy,
+ ns_info,
+ mounts, &n_mounts,
+ scope,
+ symlinks,
+ error_path);
if (r < 0)
goto finish;