]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/core/namespace.c
Merge pull request #29343 from DaanDeMeyer/tmp
[thirdparty/systemd.git] / src / core / namespace.c
index 2197287fd08afa3824269b02018ce6e27ec6447b..206453f30f9cf45c3f276a09fe14f04e1a4198ac 100644 (file)
@@ -74,8 +74,7 @@ typedef enum MountMode {
         EXTENSION_DIRECTORIES, /* Bind-mounted outside the root directory, and used by subsequent mounts */
         EXTENSION_IMAGES, /* Mounted outside the root directory, and used by subsequent mounts */
         MQUEUEFS,
-        READWRITE_IMPLICIT, /* Should have the 2nd lowest priority. */
-        MKDIR,              /* Should have the lowest priority. */
+        READWRITE_IMPLICIT, /* Should have the lowest priority. */
         _MOUNT_MODE_MAX,
 } MountMode;
 
@@ -232,7 +231,6 @@ static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
         [EXTENSION_IMAGES]      = "extension-images",
         [MQUEUEFS]              = "mqueuefs",
         [READWRITE_IMPLICIT]    = "read-write-implicit",
-        [MKDIR]                 = "mkdir",
 };
 
 /* Helper struct for naming simplicity and reusability */
@@ -911,7 +909,19 @@ add_symlink:
         return 0;
 }
 
-static int mount_private_dev(MountEntry *m) {
+static char *settle_runtime_dir(RuntimeScope scope) { /* Pick the runtime directory for 'scope'; returns a malloc()ed string, or NULL on allocation failure. */
+        char *runtime_dir; /* Only assigned on the RUNTIME_SCOPE_USER path below. */
+
+        if (scope != RUNTIME_SCOPE_USER)
+                return strdup("/run/"); /* Every scope other than user gets the fixed "/run/" (note the trailing slash). */
+
+        if (asprintf(&runtime_dir, "/run/user/" UID_FMT, geteuid()) < 0) /* User scope: directory keyed on the effective UID, no trailing slash. */
+                return NULL;
+
+        return runtime_dir; /* Ownership passes to the caller; the callers visible here keep it in a _cleanup_free_ variable. */
+}
+
+static int mount_private_dev(MountEntry *m, RuntimeScope scope) {
         static const char devnodes[] =
                 "/dev/null\0"
                 "/dev/zero\0"
@@ -920,13 +930,21 @@ static int mount_private_dev(MountEntry *m) {
                 "/dev/urandom\0"
                 "/dev/tty\0";
 
-        char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
+        _cleanup_free_ char *runtime_dir = NULL, *temporary_mount = NULL;
         const char *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
         bool can_mknod = true;
         int r;
 
         assert(m);
 
+        runtime_dir = settle_runtime_dir(scope);
+        if (!runtime_dir)
+                return log_oom_debug();
+
+        temporary_mount = path_join(runtime_dir, "systemd/namespace-dev-XXXXXX");
+        if (!temporary_mount)
+                return log_oom_debug();
+
         if (!mkdtemp(temporary_mount))
                 return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);
 
@@ -997,6 +1015,11 @@ static int mount_private_dev(MountEntry *m) {
         if (r < 0)
                 log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount);
 
+        /* Make the bind mount read-only. */
+        r = mount_nofollow_verbose(LOG_DEBUG, NULL, dev, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
+        if (r < 0)
+                return r;
+
         /* Create the /dev directory if missing. It is more likely to be missing when the service is started
          * with RootDirectory. This is consistent with mount units creating the mount points when missing. */
         (void) mkdir_p_label(mount_entry_path(m), 0755);
@@ -1361,7 +1384,8 @@ static int apply_one_mount(
                 MountEntry *m,
                 const ImagePolicy *mount_image_policy,
                 const ImagePolicy *extension_image_policy,
-                const NamespaceInfo *ns_info) {
+                const NamespaceInfo *ns_info,
+                RuntimeScope scope) {
 
         _cleanup_free_ char *inaccessible = NULL;
         bool rbind = true, make = false;
@@ -1376,8 +1400,7 @@ static int apply_one_mount(
         switch (m->mode) {
 
         case INACCESSIBLE: {
-                _cleanup_free_ char *tmp = NULL;
-                const char *runtime_dir;
+                _cleanup_free_ char *runtime_dir = NULL;
                 struct stat target;
 
                 /* First, get rid of everything that is below if there
@@ -1393,14 +1416,14 @@ static int apply_one_mount(
                                                mount_entry_path(m));
                 }
 
-                if (geteuid() == 0)
-                        runtime_dir = "/run";
-                else {
-                        if (asprintf(&tmp, "/run/user/" UID_FMT, geteuid()) < 0)
-                                return -ENOMEM;
-
-                        runtime_dir = tmp;
-                }
+                /* We don't pass the literal runtime scope through here but one based purely on our UID. This
+                 * means that the root user's --user services will use the host's inaccessible inodes rather
+                 * than root's private ones. This is preferable since it means device nodes that are
+                 * overmounted to make them inaccessible will be overmounted with a device node, rather than
+                 * an AF_UNIX socket inode. */
+                runtime_dir = settle_runtime_dir(geteuid() == 0 ? RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER);
+                if (!runtime_dir)
+                        return log_oom_debug();
 
                 r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
                 if (r < 0)
@@ -1520,7 +1543,7 @@ static int apply_one_mount(
                 break;
 
         case PRIVATE_DEV:
-                return mount_private_dev(m);
+                return mount_private_dev(m, scope);
 
         case BIND_DEV:
                 return mount_bind_dev(m);
@@ -1549,12 +1572,6 @@ static int apply_one_mount(
         case OVERLAY_MOUNT:
                 return mount_overlay(m);
 
-        case MKDIR:
-                r = mkdir_p_label(mount_entry_path(m), 0755);
-                if (r < 0)
-                        return r;
-                return 1;
-
         default:
                 assert_not_reached();
         }
@@ -1820,6 +1837,39 @@ static int create_symlinks_from_tuples(const char *root, char **strv_symlinks) {
         return 0;
 }
 
+static void mount_entry_path_debug_string(const char *root, MountEntry *m, char **error_path) {
+        assert(m);
+
+        /* Create a string suitable for debugging logs, stripping for example the local working directory.
+         * For example, with a BindPaths=/var/bar that does not exist on the host:
+         *
+         * Before:
+         *  foo.service: Failed to set up mount namespacing: /run/systemd/unit-root/var/bar: No such file or directory
+         * After:
+         *  foo.service: Failed to set up mount namespacing: /var/bar: No such file or directory
+         *
+         * Note that this is an error path, so no OOM check is done on purpose: if strdup() fails,
+         * *error_path simply ends up NULL.
+         *
+         * Output: when error_path is NULL nothing is written. Otherwise *error_path receives either NULL
+         * (the entry has no path) or a newly allocated copy of the entry's path, with a leading 'root'
+         * prefix removed when 'root' is non-NULL and the path starts with it.
+         *
+         * NOTE(review): startswith() is presumably a plain string-prefix match, not a path-component-aware
+         * one, so the stripped result keeps its leading slash only if 'root' has no trailing slash - confirm
+         * against the callers. */
+
+        if (!error_path) /* Caller is not interested in the path. */
+                return;
+
+        if (!mount_entry_path(m)) {
+                *error_path = NULL;
+                return;
+        }
+
+        if (root) {
+                const char *e = startswith(mount_entry_path(m), root); /* Non-NULL remainder after 'root' on a prefix match. */
+                if (e) {
+                        *error_path = strdup(e);
+                        return;
+                }
+        }
+
+        *error_path = strdup(mount_entry_path(m)); /* No root given, or path is not below it: report the path unchanged. */
+        return;
+}
+
 static int apply_mounts(
                 const char *root,
                 const ImagePolicy *mount_image_policy,
@@ -1827,6 +1877,7 @@ static int apply_mounts(
                 const NamespaceInfo *ns_info,
                 MountEntry *mounts,
                 size_t *n_mounts,
+                RuntimeScope scope,
                 char **symlinks,
                 char **error_path) {
 
@@ -1865,8 +1916,7 @@ static int apply_mounts(
                         /* ExtensionImages/Directories are first opened in the propagate directory, not in the root_directory */
                         r = follow_symlink(!IN_SET(m->mode, EXTENSION_IMAGES, EXTENSION_DIRECTORIES) ? root : NULL, m);
                         if (r < 0) {
-                                if (error_path && mount_entry_path(m))
-                                        *error_path = strdup(mount_entry_path(m));
+                                mount_entry_path_debug_string(root, m, error_path);
                                 return r;
                         }
                         if (r == 0) {
@@ -1878,10 +1928,9 @@ static int apply_mounts(
                                 break;
                         }
 
-                        r = apply_one_mount(root, m, mount_image_policy, extension_image_policy, ns_info);
+                        r = apply_one_mount(root, m, mount_image_policy, extension_image_policy, ns_info, scope);
                         if (r < 0) {
-                                if (error_path && mount_entry_path(m))
-                                        *error_path = strdup(mount_entry_path(m));
+                                mount_entry_path_debug_string(root, m, error_path);
                                 return r;
                         }
 
@@ -1914,8 +1963,7 @@ static int apply_mounts(
         for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
                 r = make_read_only(m, deny_list, proc_self_mountinfo);
                 if (r < 0) {
-                        if (error_path && mount_entry_path(m))
-                                *error_path = strdup(mount_entry_path(m));
+                        mount_entry_path_debug_string(root, m, error_path);
                         return r;
                 }
         }
@@ -1929,8 +1977,7 @@ static int apply_mounts(
         for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
                 r = make_noexec(m, deny_list, proc_self_mountinfo);
                 if (r < 0) {
-                        if (error_path && mount_entry_path(m))
-                                *error_path = strdup(mount_entry_path(m));
+                        mount_entry_path_debug_string(root, m, error_path);
                         return r;
                 }
         }
@@ -1940,8 +1987,7 @@ static int apply_mounts(
                 for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
                         r = make_nosuid(m, proc_self_mountinfo);
                         if (r < 0) {
-                                if (error_path && mount_entry_path(m))
-                                        *error_path = strdup(mount_entry_path(m));
+                                mount_entry_path_debug_string(root, m, error_path);
                                 return r;
                         }
                 }
@@ -2021,7 +2067,6 @@ int setup_namespace(
                 const char* tmp_dir,
                 const char* var_tmp_dir,
                 const char *creds_path,
-                int creds_fd,
                 const char *log_namespace,
                 unsigned long mount_propagation_flag,
                 VeritySettings *verity,
@@ -2034,6 +2079,7 @@ int setup_namespace(
                 const char *extension_dir,
                 const char *notify_socket,
                 const char *host_os_release_stage,
+                RuntimeScope scope,
                 char **error_path) {
 
         _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
@@ -2344,22 +2390,13 @@ int setup_namespace(
                                 .flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC,
                         };
 
-                        /* If we have mount fd for credentials directory, then it will be mounted after
-                         * namespace is set up. So, here we only create the mount point. */
-
-                        if (creds_fd < 0)
-                                *(m++) = (MountEntry) {
-                                        .path_const = creds_path,
-                                        .mode = BIND_MOUNT,
-                                        .read_only = true,
-                                        .source_const = creds_path,
-                                        .ignore = true,
-                                };
-                        else
-                                *(m++) = (MountEntry) {
-                                        .path_const = creds_path,
-                                        .mode = MKDIR,
-                                };
+                        *(m++) = (MountEntry) {
+                                .path_const = creds_path,
+                                .mode = BIND_MOUNT,
+                                .read_only = true,
+                                .source_const = creds_path,
+                                .ignore = true,
+                        };
                 } else {
                         /* If our service has no credentials store configured, then make the whole
                          * credentials tree inaccessible wholesale. */
@@ -2503,7 +2540,14 @@ int setup_namespace(
                 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
 
         /* Now make the magic happen */
-        r = apply_mounts(root, mount_image_policy, extension_image_policy, ns_info, mounts, &n_mounts, symlinks, error_path);
+        r = apply_mounts(root,
+                         mount_image_policy,
+                         extension_image_policy,
+                         ns_info,
+                         mounts, &n_mounts,
+                         scope,
+                         symlinks,
+                         error_path);
         if (r < 0)
                 goto finish;