EXTENSION_DIRECTORIES, /* Bind-mounted outside the root directory, and used by subsequent mounts */
EXTENSION_IMAGES, /* Mounted outside the root directory, and used by subsequent mounts */
MQUEUEFS,
- READWRITE_IMPLICIT, /* Should have the 2nd lowest priority. */
- MKDIR, /* Should have the lowest priority. */
+ READWRITE_IMPLICIT, /* Should have the lowest priority. */
_MOUNT_MODE_MAX,
} MountMode;
[EXTENSION_IMAGES] = "extension-images",
[MQUEUEFS] = "mqueuefs",
[READWRITE_IMPLICIT] = "read-write-implicit",
- [MKDIR] = "mkdir",
};
/* Helper struct for naming simplicity and reusability */
return 0;
}
-static int mount_private_dev(MountEntry *m) {
+/* Returns a newly allocated runtime directory prefix for the given scope: "/run/user/<euid>" for the
+ * user scope, "/run/" for every other scope. Returns NULL if the allocation fails. The caller owns the
+ * returned string and is responsible for freeing it. */
+static char *settle_runtime_dir(RuntimeScope scope) {
+        char *user_dir = NULL;
+
+        if (scope == RUNTIME_SCOPE_USER) {
+                /* Per-user runtime directory, keyed by our effective UID. */
+                if (asprintf(&user_dir, "/run/user/" UID_FMT, geteuid()) < 0)
+                        return NULL;
+
+                return user_dir;
+        }
+
+        return strdup("/run/");
+}
+
+static int mount_private_dev(MountEntry *m, RuntimeScope scope) {
static const char devnodes[] =
"/dev/null\0"
"/dev/zero\0"
"/dev/urandom\0"
"/dev/tty\0";
- char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
+ _cleanup_free_ char *runtime_dir = NULL, *temporary_mount = NULL;
const char *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
bool can_mknod = true;
int r;
assert(m);
+ runtime_dir = settle_runtime_dir(scope);
+ if (!runtime_dir)
+ return log_oom_debug();
+
+ temporary_mount = path_join(runtime_dir, "systemd/namespace-dev-XXXXXX");
+ if (!temporary_mount)
+ return log_oom_debug();
+
if (!mkdtemp(temporary_mount))
return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);
if (r < 0)
log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount);
+ /* Make the bind mount read-only. */
+ r = mount_nofollow_verbose(LOG_DEBUG, NULL, dev, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
+ if (r < 0)
+ return r;
+
/* Create the /dev directory if missing. It is more likely to be missing when the service is started
* with RootDirectory. This is consistent with mount units creating the mount points when missing. */
(void) mkdir_p_label(mount_entry_path(m), 0755);
MountEntry *m,
const ImagePolicy *mount_image_policy,
const ImagePolicy *extension_image_policy,
- const NamespaceInfo *ns_info) {
+ const NamespaceInfo *ns_info,
+ RuntimeScope scope) {
_cleanup_free_ char *inaccessible = NULL;
bool rbind = true, make = false;
switch (m->mode) {
case INACCESSIBLE: {
- _cleanup_free_ char *tmp = NULL;
- const char *runtime_dir;
+ _cleanup_free_ char *runtime_dir = NULL;
struct stat target;
/* First, get rid of everything that is below if there
mount_entry_path(m));
}
- if (geteuid() == 0)
- runtime_dir = "/run";
- else {
- if (asprintf(&tmp, "/run/user/" UID_FMT, geteuid()) < 0)
- return -ENOMEM;
-
- runtime_dir = tmp;
- }
+ /* We don't pass the literal runtime scope through here but one based purely on our UID. This
+ * means that the root user's --user services will use the host's inaccessible inodes rather
+ * then root's private ones. This is preferable since it means device nodes that are
+ * overmounted to make them inaccessible will be overmounted with a device node, rather than
+ * an AF_UNIX socket inode. */
+ runtime_dir = settle_runtime_dir(geteuid() == 0 ? RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER);
+ if (!runtime_dir)
+ return log_oom_debug();
r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
if (r < 0)
break;
case PRIVATE_DEV:
- return mount_private_dev(m);
+ return mount_private_dev(m, scope);
case BIND_DEV:
return mount_bind_dev(m);
case OVERLAY_MOUNT:
return mount_overlay(m);
- case MKDIR:
- r = mkdir_p_label(mount_entry_path(m), 0755);
- if (r < 0)
- return r;
- return 1;
-
default:
assert_not_reached();
}
return 0;
}
+static void mount_entry_path_debug_string(const char *root, MountEntry *m, char **error_path) {
+        const char *path, *stripped = NULL;
+
+        assert(m);
+
+        /* Produce a copy of the mount entry's path that is suitable for debug logs, with the leading
+         * 'root' prefix removed when the path begins with it. For example, with a BindPaths=/var/bar
+         * that does not exist on the host:
+         *
+         * Before:
+         *     foo.service: Failed to set up mount namespacing: /run/systemd/unit-root/var/bar: No such file or directory
+         * After:
+         *     foo.service: Failed to set up mount namespacing: /var/bar: No such file or directory
+         *
+         * The result is stored in *error_path (nothing is done if error_path is NULL). We are already
+         * on an error path here, hence a failed strdup() is deliberately not checked: *error_path is
+         * simply left NULL in that case.
+         *
+         * NOTE(review): the prefix is matched as a plain string prefix, not per path component. */
+
+        if (!error_path)
+                return;
+
+        path = mount_entry_path(m);
+        if (!path) {
+                *error_path = NULL;
+                return;
+        }
+
+        if (root)
+                stripped = startswith(path, root);
+
+        /* Prefer the root-relative remainder; fall back to the full path if the prefix did not match. */
+        *error_path = strdup(stripped ? stripped : path);
+}
+
static int apply_mounts(
const char *root,
const ImagePolicy *mount_image_policy,
const NamespaceInfo *ns_info,
MountEntry *mounts,
size_t *n_mounts,
+ RuntimeScope scope,
char **symlinks,
char **error_path) {
/* ExtensionImages/Directories are first opened in the propagate directory, not in the root_directory */
r = follow_symlink(!IN_SET(m->mode, EXTENSION_IMAGES, EXTENSION_DIRECTORIES) ? root : NULL, m);
if (r < 0) {
- if (error_path && mount_entry_path(m))
- *error_path = strdup(mount_entry_path(m));
+ mount_entry_path_debug_string(root, m, error_path);
return r;
}
if (r == 0) {
break;
}
- r = apply_one_mount(root, m, mount_image_policy, extension_image_policy, ns_info);
+ r = apply_one_mount(root, m, mount_image_policy, extension_image_policy, ns_info, scope);
if (r < 0) {
- if (error_path && mount_entry_path(m))
- *error_path = strdup(mount_entry_path(m));
+ mount_entry_path_debug_string(root, m, error_path);
return r;
}
for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
r = make_read_only(m, deny_list, proc_self_mountinfo);
if (r < 0) {
- if (error_path && mount_entry_path(m))
- *error_path = strdup(mount_entry_path(m));
+ mount_entry_path_debug_string(root, m, error_path);
return r;
}
}
for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
r = make_noexec(m, deny_list, proc_self_mountinfo);
if (r < 0) {
- if (error_path && mount_entry_path(m))
- *error_path = strdup(mount_entry_path(m));
+ mount_entry_path_debug_string(root, m, error_path);
return r;
}
}
for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
r = make_nosuid(m, proc_self_mountinfo);
if (r < 0) {
- if (error_path && mount_entry_path(m))
- *error_path = strdup(mount_entry_path(m));
+ mount_entry_path_debug_string(root, m, error_path);
return r;
}
}
const char* tmp_dir,
const char* var_tmp_dir,
const char *creds_path,
- int creds_fd,
const char *log_namespace,
unsigned long mount_propagation_flag,
VeritySettings *verity,
const char *extension_dir,
const char *notify_socket,
const char *host_os_release_stage,
+ RuntimeScope scope,
char **error_path) {
_cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
.flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC,
};
- /* If we have mount fd for credentials directory, then it will be mounted after
- * namespace is set up. So, here we only create the mount point. */
-
- if (creds_fd < 0)
- *(m++) = (MountEntry) {
- .path_const = creds_path,
- .mode = BIND_MOUNT,
- .read_only = true,
- .source_const = creds_path,
- .ignore = true,
- };
- else
- *(m++) = (MountEntry) {
- .path_const = creds_path,
- .mode = MKDIR,
- };
+ *(m++) = (MountEntry) {
+ .path_const = creds_path,
+ .mode = BIND_MOUNT,
+ .read_only = true,
+ .source_const = creds_path,
+ .ignore = true,
+ };
} else {
/* If our service has no credentials store configured, then make the whole
* credentials tree inaccessible wholesale. */
(void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
/* Now make the magic happen */
- r = apply_mounts(root, mount_image_policy, extension_image_policy, ns_info, mounts, &n_mounts, symlinks, error_path);
+ r = apply_mounts(root,
+ mount_image_policy,
+ extension_image_policy,
+ ns_info,
+ mounts, &n_mounts,
+ scope,
+ symlinks,
+ error_path);
if (r < 0)
goto finish;