]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/core/namespace.c
Merge pull request #29343 from DaanDeMeyer/tmp
[thirdparty/systemd.git] / src / core / namespace.c
index 51b5aad9c906d592703ea6248ac95a9e177733cb..206453f30f9cf45c3f276a09fe14f04e1a4198ac 100644 (file)
@@ -208,25 +208,29 @@ static const MountEntry protect_system_strict_table[] = {
 };
 
 static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
-        [INACCESSIBLE]         = "inaccessible",
-        [OVERLAY_MOUNT]        = "overlay",
-        [BIND_MOUNT]           = "bind",
-        [BIND_MOUNT_RECURSIVE] = "rbind",
-        [PRIVATE_TMP]          = "private-tmp",
-        [PRIVATE_DEV]          = "private-dev",
-        [BIND_DEV]             = "bind-dev",
-        [EMPTY_DIR]            = "empty",
-        [PRIVATE_SYSFS]        = "private-sysfs",
-        [BIND_SYSFS]           = "bind-sysfs",
-        [PROCFS]               = "procfs",
-        [READONLY]             = "read-only",
-        [READWRITE]            = "read-write",
-        [TMPFS]                = "tmpfs",
-        [MOUNT_IMAGES]         = "mount-images",
-        [READWRITE_IMPLICIT]   = "rw-implicit",
-        [EXEC]                 = "exec",
-        [NOEXEC]               = "noexec",
-        [MQUEUEFS]             = "mqueuefs",
+        [INACCESSIBLE]          = "inaccessible",
+        [OVERLAY_MOUNT]         = "overlay",
+        [MOUNT_IMAGES]          = "mount-images",
+        [BIND_MOUNT]            = "bind",
+        [BIND_MOUNT_RECURSIVE]  = "rbind",
+        [PRIVATE_TMP]           = "private-tmp",
+        [PRIVATE_TMP_READONLY]  = "private-tmp-read-only",
+        [PRIVATE_DEV]           = "private-dev",
+        [BIND_DEV]              = "bind-dev",
+        [EMPTY_DIR]             = "empty",
+        [PRIVATE_SYSFS]         = "private-sysfs",
+        [BIND_SYSFS]            = "bind-sysfs",
+        [PROCFS]                = "procfs",
+        [READONLY]              = "read-only",
+        [READWRITE]             = "read-write",
+        [NOEXEC]                = "noexec",
+        [EXEC]                  = "exec",
+        [TMPFS]                 = "tmpfs",
+        [RUN]                   = "run",
+        [EXTENSION_DIRECTORIES] = "extension-directories",
+        [EXTENSION_IMAGES]      = "extension-images",
+        [MQUEUEFS]              = "mqueuefs",
+        [READWRITE_IMPLICIT]    = "read-write-implicit",
 };
 
 /* Helper struct for naming simplicity and reusability */
@@ -905,7 +909,19 @@ add_symlink:
         return 0;
 }
 
-static int mount_private_dev(MountEntry *m) {
+static char *settle_runtime_dir(RuntimeScope scope) {
+        char *runtime_dir;
+
+        if (scope != RUNTIME_SCOPE_USER)
+                return strdup("/run/");
+
+        if (asprintf(&runtime_dir, "/run/user/" UID_FMT, geteuid()) < 0)
+                return NULL;
+
+        return runtime_dir;
+}
+
+static int mount_private_dev(MountEntry *m, RuntimeScope scope) {
         static const char devnodes[] =
                 "/dev/null\0"
                 "/dev/zero\0"
@@ -914,13 +930,21 @@ static int mount_private_dev(MountEntry *m) {
                 "/dev/urandom\0"
                 "/dev/tty\0";
 
-        char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
+        _cleanup_free_ char *runtime_dir = NULL, *temporary_mount = NULL;
         const char *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
         bool can_mknod = true;
         int r;
 
         assert(m);
 
+        runtime_dir = settle_runtime_dir(scope);
+        if (!runtime_dir)
+                return log_oom_debug();
+
+        temporary_mount = path_join(runtime_dir, "systemd/namespace-dev-XXXXXX");
+        if (!temporary_mount)
+                return log_oom_debug();
+
         if (!mkdtemp(temporary_mount))
                 return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);
 
@@ -991,6 +1015,11 @@ static int mount_private_dev(MountEntry *m) {
         if (r < 0)
                 log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount);
 
+        /* Make the bind mount read-only. */
+        r = mount_nofollow_verbose(LOG_DEBUG, NULL, dev, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
+        if (r < 0)
+                return r;
+
         /* Create the /dev directory if missing. It is more likely to be missing when the service is started
          * with RootDirectory. This is consistent with mount units creating the mount points when missing. */
         (void) mkdir_p_label(mount_entry_path(m), 0755);
@@ -1045,34 +1074,7 @@ static int mount_bind_dev(const MountEntry *m) {
         if (r > 0) /* make this a NOP if /dev is already a mount point */
                 return 0;
 
-        r = mount_nofollow_verbose(LOG_DEBUG, "/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
-        if (r < 0)
-                return r;
-
-        return 1;
-}
-
-static int mount_private_sysfs(const MountEntry *m) {
-        const char *p = mount_entry_path(ASSERT_PTR(m));
-        int r;
-
-        (void) mkdir_p_label(p, 0755);
-
-        r = remount_sysfs(p);
-        if (r < 0 && (ERRNO_IS_PRIVILEGE(r) || ERRNO_IS_NOT_SUPPORTED(r))) {
-                /* Running with an unprivileged user (PrivateUsers=yes), or the kernel seems old. Falling
-                 * back to bind mount the host's version so that we get all child mounts of it, too. */
-
-                log_debug_errno(r, "Failed to remount sysfs on %s, falling back to bind mount: %m", p);
-
-                (void) umount_recursive(p, 0);
-
-                r = mount_nofollow_verbose(LOG_DEBUG, "/sys", p, NULL, MS_BIND|MS_REC, NULL);
-        }
-        if (r < 0)
-                return log_debug_errno(r, "Failed to remount sysfs on %s: %m", p);
-
-        return 1;
+        return mount_nofollow_verbose(LOG_DEBUG, "/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
 }
 
 static int mount_bind_sysfs(const MountEntry *m) {
@@ -1089,11 +1091,34 @@ static int mount_bind_sysfs(const MountEntry *m) {
                 return 0;
 
         /* Bind mount the host's version so that we get all child mounts of it, too. */
-        r = mount_nofollow_verbose(LOG_DEBUG, "/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
-        if (r < 0)
+        return mount_nofollow_verbose(LOG_DEBUG, "/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
+}
+
+static int mount_private_sysfs(const MountEntry *m) {
+        const char *entry_path = mount_entry_path(ASSERT_PTR(m));
+        int r, n;
+
+        (void) mkdir_p_label(entry_path, 0755);
+
+        n = umount_recursive(entry_path, 0);
+
+        r = mount_nofollow_verbose(LOG_DEBUG, "sysfs", entry_path, "sysfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+        if (ERRNO_IS_NEG_PRIVILEGE(r)) {
+                /* When we do not have enough privileges to mount sysfs, fall back to use existing /sys. */
+
+                if (n > 0)
+                        /* /sys or some of sub-mounts are umounted in the above. Refuse incomplete tree.
+                         * Propagate the original error code returned by mount() in the above. */
+                        return r;
+
+                return mount_bind_sysfs(m);
+
+        } else if (r < 0)
                 return r;
 
-        return 1;
+        /* We mounted a new instance now. Let's bind mount the children over now. */
+        (void) bind_mount_submounts("/sys", entry_path);
+        return 0;
 }
 
 static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
@@ -1152,34 +1177,32 @@ static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
                  * means we really don't want to use it, since it would affect our host's /proc
                  * mount. Hence let's gracefully fallback to a classic, unrestricted version. */
                 r = mount_nofollow_verbose(LOG_DEBUG, "proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
-        if (r == -EPERM) {
-                /* When we do not have enough privileges to mount /proc, fallback to use existing /proc. */
+        if (ERRNO_IS_NEG_PRIVILEGE(r)) {
+                /* When we do not have enough privileges to mount /proc, fall back to use existing /proc. */
 
                 if (n > 0)
                         /* /proc or some of sub-mounts are umounted in the above. Refuse incomplete tree.
                          * Propagate the original error code returned by mount() in the above. */
-                        return -EPERM;
+                        return r;
 
                 r = path_is_mount_point(entry_path, NULL, 0);
                 if (r < 0)
                         return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
-                if (r == 0) {
-                        /* We lack permissions to mount a new instance of /proc, and it is not already
-                         * mounted. But we can access the host's, so as a final fallback bind-mount it to
-                         * the destination, as most likely we are inside a user manager in an unprivileged
-                         * user namespace. */
-                        r = mount_nofollow_verbose(LOG_DEBUG, "/proc", entry_path, NULL, MS_BIND|MS_REC, NULL);
-                        if (r < 0)
-                                return -EPERM;
-                }
+                if (r > 0)
+                        return 0;
+
+                /* We lack permissions to mount a new instance of /proc, and it is not already mounted. But
+                 * we can access the host's, so as a final fallback bind-mount it to the destination, as most
+                 * likely we are inside a user manager in an unprivileged user namespace. */
+                return mount_nofollow_verbose(LOG_DEBUG, "/proc", entry_path, NULL, MS_BIND|MS_REC, NULL);
+
         } else if (r < 0)
                 return r;
-        else
-                /* We mounted a new instance now. Let's bind mount the children over now. This matters for
-                 * nspawn where a bunch of files are overmounted, in particular the boot id */
-                (void) bind_mount_submounts("/proc", entry_path);
 
-        return 1;
+        /* We mounted a new instance now. Let's bind mount the children over now. This matters for nspawn
+         * where a bunch of files are overmounted, in particular the boot id */
+        (void) bind_mount_submounts("/proc", entry_path);
+        return 0;
 }
 
 static int mount_tmpfs(const MountEntry *m) {
@@ -1205,7 +1228,7 @@ static int mount_tmpfs(const MountEntry *m) {
         if (r < 0)
                 return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", entry_path, inner_path);
 
-        return 1;
+        return 0;
 }
 
 static int mount_run(const MountEntry *m) {
@@ -1303,7 +1326,7 @@ static int mount_image(
         if (r < 0)
                 return log_debug_errno(r, "Failed to mount image %s on %s: %m", mount_entry_source(m), mount_entry_path(m));
 
-        return 1;
+        return 0;
 }
 
 static int mount_overlay(const MountEntry *m) {
@@ -1319,10 +1342,8 @@ static int mount_overlay(const MountEntry *m) {
         r = mount_nofollow_verbose(LOG_DEBUG, "overlay", mount_entry_path(m), "overlay", MS_RDONLY, options);
         if (r == -ENOENT && m->ignore)
                 return 0;
-        if (r < 0)
-                return r;
 
-        return 1;
+        return r;
 }
 
 static int follow_symlink(
@@ -1363,7 +1384,8 @@ static int apply_one_mount(
                 MountEntry *m,
                 const ImagePolicy *mount_image_policy,
                 const ImagePolicy *extension_image_policy,
-                const NamespaceInfo *ns_info) {
+                const NamespaceInfo *ns_info,
+                RuntimeScope scope) {
 
         _cleanup_free_ char *inaccessible = NULL;
         bool rbind = true, make = false;
@@ -1378,8 +1400,7 @@ static int apply_one_mount(
         switch (m->mode) {
 
         case INACCESSIBLE: {
-                _cleanup_free_ char *tmp = NULL;
-                const char *runtime_dir;
+                _cleanup_free_ char *runtime_dir = NULL;
                 struct stat target;
 
                 /* First, get rid of everything that is below if there
@@ -1395,14 +1416,14 @@ static int apply_one_mount(
                                                mount_entry_path(m));
                 }
 
-                if (geteuid() == 0)
-                        runtime_dir = "/run";
-                else {
-                        if (asprintf(&tmp, "/run/user/" UID_FMT, geteuid()) < 0)
-                                return -ENOMEM;
-
-                        runtime_dir = tmp;
-                }
+                /* We don't pass the literal runtime scope through here but one based purely on our UID. This
+                 * means that the root user's --user services will use the host's inaccessible inodes rather
+                 * then root's private ones. This is preferable since it means device nodes that are
+                 * overmounted to make them inaccessible will be overmounted with a device node, rather than
+                 * an AF_UNIX socket inode. */
+                runtime_dir = settle_runtime_dir(geteuid() == 0 ? RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER);
+                if (!runtime_dir)
+                        return log_oom_debug();
 
                 r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
                 if (r < 0)
@@ -1522,7 +1543,7 @@ static int apply_one_mount(
                 break;
 
         case PRIVATE_DEV:
-                return mount_private_dev(m);
+                return mount_private_dev(m, scope);
 
         case BIND_DEV:
                 return mount_bind_dev(m);
@@ -1816,6 +1837,39 @@ static int create_symlinks_from_tuples(const char *root, char **strv_symlinks) {
         return 0;
 }
 
+static void mount_entry_path_debug_string(const char *root, MountEntry *m, char **error_path) {
+        assert(m);
+
+        /* Create a string suitable for debugging logs, stripping for example the local working directory.
+         * For example, with a BindPaths=/var/bar that does not exist on the host:
+         *
+         * Before:
+         *  foo.service: Failed to set up mount namespacing: /run/systemd/unit-root/var/bar: No such file or directory
+         * After:
+         *  foo.service: Failed to set up mount namespacing: /var/bar: No such file or directory
+         *
+         * Note that this is an error path, so no OOM check is done on purpose. */
+
+        if (!error_path)
+                return;
+
+        if (!mount_entry_path(m)) {
+                *error_path = NULL;
+                return;
+        }
+
+        if (root) {
+                const char *e = startswith(mount_entry_path(m), root);
+                if (e) {
+                        *error_path = strdup(e);
+                        return;
+                }
+        }
+
+        *error_path = strdup(mount_entry_path(m));
+        return;
+}
+
 static int apply_mounts(
                 const char *root,
                 const ImagePolicy *mount_image_policy,
@@ -1823,6 +1877,7 @@ static int apply_mounts(
                 const NamespaceInfo *ns_info,
                 MountEntry *mounts,
                 size_t *n_mounts,
+                RuntimeScope scope,
                 char **symlinks,
                 char **error_path) {
 
@@ -1861,8 +1916,7 @@ static int apply_mounts(
                         /* ExtensionImages/Directories are first opened in the propagate directory, not in the root_directory */
                         r = follow_symlink(!IN_SET(m->mode, EXTENSION_IMAGES, EXTENSION_DIRECTORIES) ? root : NULL, m);
                         if (r < 0) {
-                                if (error_path && mount_entry_path(m))
-                                        *error_path = strdup(mount_entry_path(m));
+                                mount_entry_path_debug_string(root, m, error_path);
                                 return r;
                         }
                         if (r == 0) {
@@ -1874,10 +1928,9 @@ static int apply_mounts(
                                 break;
                         }
 
-                        r = apply_one_mount(root, m, mount_image_policy, extension_image_policy, ns_info);
+                        r = apply_one_mount(root, m, mount_image_policy, extension_image_policy, ns_info, scope);
                         if (r < 0) {
-                                if (error_path && mount_entry_path(m))
-                                        *error_path = strdup(mount_entry_path(m));
+                                mount_entry_path_debug_string(root, m, error_path);
                                 return r;
                         }
 
@@ -1910,8 +1963,7 @@ static int apply_mounts(
         for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
                 r = make_read_only(m, deny_list, proc_self_mountinfo);
                 if (r < 0) {
-                        if (error_path && mount_entry_path(m))
-                                *error_path = strdup(mount_entry_path(m));
+                        mount_entry_path_debug_string(root, m, error_path);
                         return r;
                 }
         }
@@ -1925,8 +1977,7 @@ static int apply_mounts(
         for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
                 r = make_noexec(m, deny_list, proc_self_mountinfo);
                 if (r < 0) {
-                        if (error_path && mount_entry_path(m))
-                                *error_path = strdup(mount_entry_path(m));
+                        mount_entry_path_debug_string(root, m, error_path);
                         return r;
                 }
         }
@@ -1936,8 +1987,7 @@ static int apply_mounts(
                 for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
                         r = make_nosuid(m, proc_self_mountinfo);
                         if (r < 0) {
-                                if (error_path && mount_entry_path(m))
-                                        *error_path = strdup(mount_entry_path(m));
+                                mount_entry_path_debug_string(root, m, error_path);
                                 return r;
                         }
                 }
@@ -2029,6 +2079,7 @@ int setup_namespace(
                 const char *extension_dir,
                 const char *notify_socket,
                 const char *host_os_release_stage,
+                RuntimeScope scope,
                 char **error_path) {
 
         _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
@@ -2489,7 +2540,14 @@ int setup_namespace(
                 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
 
         /* Now make the magic happen */
-        r = apply_mounts(root, mount_image_policy, extension_image_policy, ns_info, mounts, &n_mounts, symlinks, error_path);
+        r = apply_mounts(root,
+                         mount_image_policy,
+                         extension_image_policy,
+                         ns_info,
+                         mounts, &n_mounts,
+                         scope,
+                         symlinks,
+                         error_path);
         if (r < 0)
                 goto finish;