core/namespace: check if we have enough privilege to mount sysfs or procfs

author Yu Watanabe <watanabe.yu+github@gmail.com>

Wed, 11 Oct 2023 05:58:38 +0000 (14:58 +0900)

committer Yu Watanabe <watanabe.yu+github@gmail.com>

Thu, 26 Oct 2023 10:09:46 +0000 (19:09 +0900)
author Yu Watanabe <watanabe.yu+github@gmail.com>
Wed, 11 Oct 2023 05:58:38 +0000 (14:58 +0900)
committer Yu Watanabe <watanabe.yu+github@gmail.com>
Thu, 26 Oct 2023 10:09:46 +0000 (19:09 +0900)
diff --git a/src/core/namespace.c b/src/core/namespace.c

index 9202fbc1b73f6cb36ad8221c5e3297a49a120ab7..38f74346e1e54ff6a50ed566e0551f9c0a5dc47b 100644 (file)
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -1164,33 +1164,35 @@ static int mount_private_apivfs(
                  const char *fstype,
                  const char *entry_path,
                  const char *bind_source,
-                const char *opts) {
+                const char *opts,
+                RuntimeScope scope) {
  
-        int r, n;
+        _cleanup_(rmdir_and_freep) char *temporary_mount = NULL;
+        int r;
  
          assert(fstype);
          assert(entry_path);
          assert(bind_source);
  
          (void) mkdir_p_label(entry_path, 0755);
-        n = umount_recursive(entry_path, /* flags = */ 0);
  
-        r = mount_nofollow_verbose(LOG_DEBUG, fstype, entry_path, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+        /* First, check if we have enough privileges to mount a new instance. Note, a new sysfs instance
+         * cannot be mounted on an already existing mount. Let's use a temporary place. */
+        r = create_temporary_mount_point(scope, &temporary_mount);
+        if (r < 0)
+                return r;
+
+        r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
          if (r == -EINVAL && opts)
                  /* If this failed with EINVAL then this likely means the textual hidepid= stuff for procfs is
                   * not supported by the kernel, and thus the per-instance hidepid= neither, which means we
                   * really don't want to use it, since it would affect our host's /proc mount. Hence let's
                   * gracefully fallback to a classic, unrestricted version. */
-                r = mount_nofollow_verbose(LOG_DEBUG, fstype, entry_path, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL);
+                r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL);
          if (ERRNO_IS_NEG_PRIVILEGE(r)) {
                  /* When we do not have enough privileges to mount a new instance, fall back to use an
                   * existing mount. */
  
-                if (n > 0)
-                        /* The mount or some of sub-mounts are umounted in the above. Refuse incomplete tree.
-                         * Propagate the original error code returned by mount() in the above. */
-                        return r;
-
                  r = path_is_mount_point(entry_path, /* root = */ NULL, /* flags = */ 0);
                  if (r < 0)
                          return log_debug_errno(r, "Unable to determine whether '%s' is already mounted: %m", entry_path);
@@ -1205,15 +1207,26 @@ static int mount_private_apivfs(
          } else if (r < 0)
                  return r;
  
+        /* OK. We have a new mount instance. Let's clear an existing mount and its submounts. */
+        r = umount_recursive(entry_path, /* flags = */ 0);
+        if (r < 0)
+                log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", entry_path);
+
+        /* Then, move the new mount instance. */
+        r = mount_nofollow_verbose(LOG_DEBUG, temporary_mount, entry_path, /* fstype = */ NULL, MS_MOVE, /* opts = */ NULL);
+        if (r < 0)
+                return r;
+
          /* We mounted a new instance now. Let's bind mount the children over now. This matters for nspawn
           * where a bunch of files are overmounted, in particular the boot id. */
          (void) bind_mount_submounts(bind_source, entry_path);
          return 0;
  }
  
-static int mount_private_sysfs(const MountEntry *m) {
+static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p) {
          assert(m);
-        return mount_private_apivfs("sysfs", mount_entry_path(m), "/sys", /* opts = */ NULL);
+        assert(p);
+        return mount_private_apivfs("sysfs", mount_entry_path(m), "/sys", /* opts = */ NULL, p->runtime_scope);
  }
  
  static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
@@ -1257,7 +1270,7 @@ static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
           * one. i.e we don't reuse existing mounts here under any condition, we want a new instance owned by
           * our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything
           * mounted on /proc/ first. */
-        return mount_private_apivfs("proc", mount_entry_path(m), "/proc", opts);
+        return mount_private_apivfs("proc", mount_entry_path(m), "/proc", opts, p->runtime_scope);
  }
  
  static int mount_tmpfs(const MountEntry *m) {
@@ -1596,7 +1609,7 @@ static int apply_one_mount(
                  return mount_bind_dev(m);
  
          case PRIVATE_SYSFS:
-                return mount_private_sysfs(m);
+                return mount_private_sysfs(m, p);
  
          case BIND_SYSFS:
                  return mount_bind_sysfs(m);
author	Yu Watanabe <watanabe.yu+github@gmail.com>
	Wed, 11 Oct 2023 05:58:38 +0000 (14:58 +0900)
committer	Yu Watanabe <watanabe.yu+github@gmail.com>
	Thu, 26 Oct 2023 10:09:46 +0000 (19:09 +0900)