mounts: keep list of mounts in an rbtree

author Miklos Szeredi <mszeredi@redhat.com>

Wed, 25 Oct 2023 14:02:00 +0000 (16:02 +0200)

committer Christian Brauner <brauner@kernel.org>

Sat, 18 Nov 2023 13:56:16 +0000 (14:56 +0100)
author Miklos Szeredi <mszeredi@redhat.com>
Wed, 25 Oct 2023 14:02:00 +0000 (16:02 +0200)
committer Christian Brauner <brauner@kernel.org>
Sat, 18 Nov 2023 13:56:16 +0000 (14:56 +0100)
diff --git a/fs/mount.h b/fs/mount.h

index a14f762b3f290bf0e74efe6ea9c92c4b8685af66..4a42fc68f4cca94770fe5131cb0fcd264b9a820d 100644 (file)
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -8,19 +8,13 @@
  struct mnt_namespace {
         struct ns_common        ns;
         struct mount *  root;
-       /*
-        * Traversal and modification of .list is protected by either
-        * - taking namespace_sem for write, OR
-        * - taking namespace_sem for read AND taking .ns_lock.
-        */
-       struct list_head        list;
-       spinlock_t              ns_lock;
+       struct rb_root          mounts; /* Protected by namespace_sem */
         struct user_namespace   *user_ns;
         struct ucounts          *ucounts;
         u64                     seq;    /* Sequence number to prevent loops */
         wait_queue_head_t poll;
         u64 event;
-       unsigned int            mounts; /* # of mounts in the namespace */
+       unsigned int            nr_mounts; /* # of mounts in the namespace */
         unsigned int            pending_mounts;
  } __randomize_layout;
  
@@ -55,7 +49,10 @@ struct mount {
         struct list_head mnt_child;     /* and going through their mnt_child */
         struct list_head mnt_instance;  /* mount instance on sb->s_mounts */
         const char *mnt_devname;        /* Name of device e.g. /dev/dsk/hda1 */
-       struct list_head mnt_list;
+       union {
+               struct rb_node mnt_node;        /* Under ns->mounts */
+               struct list_head mnt_list;
+       };
         struct list_head mnt_expire;    /* link in fs-specific expiry list */
         struct list_head mnt_share;     /* circular list of shared mounts */
         struct list_head mnt_slave_list;/* list of slave mounts */
@@ -128,7 +125,6 @@ struct proc_mounts {
         struct mnt_namespace *ns;
         struct path root;
         int (*show)(struct seq_file *, struct vfsmount *);
-       struct mount cursor;
  };
  
  extern const struct seq_operations mounts_op;
@@ -147,4 +143,12 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
         return ns->seq == 0;
  }
  
+static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
+{
+       WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB));
+       mnt->mnt.mnt_flags &= ~MNT_ONRB;
+       rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts);
+       list_add_tail(&mnt->mnt_list, dt_list);
+}
+
  extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
diff --git a/fs/namespace.c b/fs/namespace.c

index 0bcba81402b56e9765939443331059dd135c8bb3..bbe94096e26216116e918759702818163949a16a 100644 (file)
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -734,21 +734,6 @@ struct vfsmount *lookup_mnt(const struct path *path)
         return m;
  }
  
-static inline void lock_ns_list(struct mnt_namespace *ns)
-{
-       spin_lock(&ns->ns_lock);
-}
-
-static inline void unlock_ns_list(struct mnt_namespace *ns)
-{
-       spin_unlock(&ns->ns_lock);
-}
-
-static inline bool mnt_is_cursor(struct mount *mnt)
-{
-       return mnt->mnt.mnt_flags & MNT_CURSOR;
-}
-
  /*
   * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
   *                         current mount namespace.
@@ -767,19 +752,15 @@ static inline bool mnt_is_cursor(struct mount *mnt)
  bool __is_local_mountpoint(struct dentry *dentry)
  {
         struct mnt_namespace *ns = current->nsproxy->mnt_ns;
-       struct mount *mnt;
+       struct mount *mnt, *n;
         bool is_covered = false;
  
         down_read(&namespace_sem);
-       lock_ns_list(ns);
-       list_for_each_entry(mnt, &ns->list, mnt_list) {
-               if (mnt_is_cursor(mnt))
-                       continue;
+       rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
                 is_covered = (mnt->mnt_mountpoint == dentry);
                 if (is_covered)
                         break;
         }
-       unlock_ns_list(ns);
         up_read(&namespace_sem);
  
         return is_covered;
@@ -1026,6 +1007,30 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
         mnt_add_count(old_parent, -1);
  }
  
+static inline struct mount *node_to_mount(struct rb_node *node)
+{
+       return rb_entry(node, struct mount, mnt_node);
+}
+
+static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
+{
+       struct rb_node **link = &ns->mounts.rb_node;
+       struct rb_node *parent = NULL;
+
+       WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB);
+       mnt->mnt_ns = ns;
+       while (*link) {
+               parent = *link;
+               if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique)
+                       link = &parent->rb_left;
+               else
+                       link = &parent->rb_right;
+       }
+       rb_link_node(&mnt->mnt_node, parent, link);
+       rb_insert_color(&mnt->mnt_node, &ns->mounts);
+       mnt->mnt.mnt_flags |= MNT_ONRB;
+}
+
  /*
   * vfsmount lock must be held for write
   */
@@ -1039,12 +1044,13 @@ static void commit_tree(struct mount *mnt)
         BUG_ON(parent == mnt);
  
         list_add_tail(&head, &mnt->mnt_list);
-       list_for_each_entry(m, &head, mnt_list)
-               m->mnt_ns = n;
+       while (!list_empty(&head)) {
+               m = list_first_entry(&head, typeof(*m), mnt_list);
+               list_del(&m->mnt_list);
  
-       list_splice(&head, n->list.prev);
-
-       n->mounts += n->pending_mounts;
+               mnt_add_to_ns(n, m);
+       }
+       n->nr_mounts += n->pending_mounts;
         n->pending_mounts = 0;
  
         __attach_mnt(mnt, parent);
@@ -1192,7 +1198,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
         }
  
         mnt->mnt.mnt_flags = old->mnt.mnt_flags;
-       mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
+       mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB);
  
         atomic_inc(&sb->s_active);
         mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
@@ -1417,65 +1423,57 @@ struct vfsmount *mnt_clone_internal(const struct path *path)
         return &p->mnt;
  }
  
-#ifdef CONFIG_PROC_FS
-static struct mount *mnt_list_next(struct mnt_namespace *ns,
-                                  struct list_head *p)
+/*
+ * Returns the mount which either has the specified mnt_id, or has the next
+ * smallest id after the specified one.
+ */
+static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
  {
-       struct mount *mnt, *ret = NULL;
+       struct rb_node *node = ns->mounts.rb_node;
+       struct mount *ret = NULL;
  
-       lock_ns_list(ns);
-       list_for_each_continue(p, &ns->list) {
-               mnt = list_entry(p, typeof(*mnt), mnt_list);
-               if (!mnt_is_cursor(mnt)) {
-                       ret = mnt;
-                       break;
+       while (node) {
+               struct mount *m = node_to_mount(node);
+
+               if (mnt_id <= m->mnt_id_unique) {
+                       ret = node_to_mount(node);
+                       if (mnt_id == m->mnt_id_unique)
+                               break;
+                       node = node->rb_left;
+               } else {
+                       node = node->rb_right;
                 }
         }
-       unlock_ns_list(ns);
-
         return ret;
  }
  
+#ifdef CONFIG_PROC_FS
+
  /* iterator; we want it to have access to namespace_sem, thus here... */
  static void *m_start(struct seq_file *m, loff_t *pos)
  {
         struct proc_mounts *p = m->private;
-       struct list_head *prev;
  
         down_read(&namespace_sem);
-       if (!*pos) {
-               prev = &p->ns->list;
-       } else {
-               prev = &p->cursor.mnt_list;
  
-               /* Read after we'd reached the end? */
-               if (list_empty(prev))
-                       return NULL;
-       }
-
-       return mnt_list_next(p->ns, prev);
+       return mnt_find_id_at(p->ns, *pos);
  }
  
  static void *m_next(struct seq_file *m, void *v, loff_t *pos)
  {
-       struct proc_mounts *p = m->private;
-       struct mount *mnt = v;
+       struct mount *next = NULL, *mnt = v;
+       struct rb_node *node = rb_next(&mnt->mnt_node);
  
         ++*pos;
-       return mnt_list_next(p->ns, &mnt->mnt_list);
+       if (node) {
+               next = node_to_mount(node);
+               *pos = next->mnt_id_unique;
+       }
+       return next;
  }
  
  static void m_stop(struct seq_file *m, void *v)
  {
-       struct proc_mounts *p = m->private;
-       struct mount *mnt = v;
-
-       lock_ns_list(p->ns);
-       if (mnt)
-               list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
-       else
-               list_del_init(&p->cursor.mnt_list);
-       unlock_ns_list(p->ns);
         up_read(&namespace_sem);
  }
  
@@ -1493,14 +1491,6 @@ const struct seq_operations mounts_op = {
         .show   = m_show,
  };
  
-void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
-{
-       down_read(&namespace_sem);
-       lock_ns_list(ns);
-       list_del(&cursor->mnt_list);
-       unlock_ns_list(ns);
-       up_read(&namespace_sem);
-}
  #endif  /* CONFIG_PROC_FS */
  
  /**
@@ -1642,7 +1632,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
         /* Gather the mounts to umount */
         for (p = mnt; p; p = next_mnt(p, mnt)) {
                 p->mnt.mnt_flags |= MNT_UMOUNT;
-               list_move(&p->mnt_list, &tmp_list);
+               if (p->mnt.mnt_flags & MNT_ONRB)
+                       move_from_ns(p, &tmp_list);
+               else
+                       list_move(&p->mnt_list, &tmp_list);
         }
  
         /* Hide the mounts from mnt_mounts */
@@ -1662,7 +1655,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
                 list_del_init(&p->mnt_list);
                 ns = p->mnt_ns;
                 if (ns) {
-                       ns->mounts--;
+                       ns->nr_mounts--;
                         __touch_mnt_namespace(ns);
                 }
                 p->mnt_ns = NULL;
@@ -1788,14 +1781,16 @@ static int do_umount(struct mount *mnt, int flags)
  
         event++;
         if (flags & MNT_DETACH) {
-               if (!list_empty(&mnt->mnt_list))
+               if (mnt->mnt.mnt_flags & MNT_ONRB ||
+                   !list_empty(&mnt->mnt_list))
                         umount_tree(mnt, UMOUNT_PROPAGATE);
                 retval = 0;
         } else {
                 shrink_submounts(mnt);
                 retval = -EBUSY;
                 if (!propagate_mount_busy(mnt, 2)) {
-                       if (!list_empty(&mnt->mnt_list))
+                       if (mnt->mnt.mnt_flags & MNT_ONRB ||
+                           !list_empty(&mnt->mnt_list))
                                 umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
                         retval = 0;
                 }
@@ -2213,9 +2208,9 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
         unsigned int mounts = 0;
         struct mount *p;
  
-       if (ns->mounts >= max)
+       if (ns->nr_mounts >= max)
                 return -ENOSPC;
-       max -= ns->mounts;
+       max -= ns->nr_mounts;
         if (ns->pending_mounts >= max)
                 return -ENOSPC;
         max -= ns->pending_mounts;
@@ -2359,8 +2354,12 @@ static int attach_recursive_mnt(struct mount *source_mnt,
                 touch_mnt_namespace(source_mnt->mnt_ns);
         } else {
                 if (source_mnt->mnt_ns) {
+                       LIST_HEAD(head);
+
                         /* move from anon - the caller will destroy */
-                       list_del_init(&source_mnt->mnt_ns->list);
+                       for (p = source_mnt; p; p = next_mnt(p, source_mnt))
+                               move_from_ns(p, &head);
+                       list_del_init(&head);
                 }
                 if (beneath)
                         mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
@@ -2671,11 +2670,10 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
  
         lock_mount_hash();
         for (p = mnt; p; p = next_mnt(p, mnt)) {
-               p->mnt_ns = ns;
-               ns->mounts++;
+               mnt_add_to_ns(ns, p);
+               ns->nr_mounts++;
         }
         ns->root = mnt;
-       list_add_tail(&ns->list, &mnt->mnt_list);
         mntget(&mnt->mnt);
         unlock_mount_hash();
         namespace_unlock();
@@ -3738,9 +3736,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
         if (!anon)
                 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
         refcount_set(&new_ns->ns.count, 1);
-       INIT_LIST_HEAD(&new_ns->list);
+       new_ns->mounts = RB_ROOT;
         init_waitqueue_head(&new_ns->poll);
-       spin_lock_init(&new_ns->ns_lock);
         new_ns->user_ns = get_user_ns(user_ns);
         new_ns->ucounts = ucounts;
         return new_ns;
@@ -3787,7 +3784,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
                 unlock_mount_hash();
         }
         new_ns->root = new;
-       list_add_tail(&new_ns->list, &new->mnt_list);
  
         /*
          * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -3797,8 +3793,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
         p = old;
         q = new;
         while (p) {
-               q->mnt_ns = new_ns;
-               new_ns->mounts++;
+               mnt_add_to_ns(new_ns, q);
+               new_ns->nr_mounts++;
                 if (new_fs) {
                         if (&p->mnt == new_fs->root.mnt) {
                                 new_fs->root.mnt = mntget(&q->mnt);
@@ -3840,10 +3836,9 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name)
                 mntput(m);
                 return ERR_CAST(ns);
         }
-       mnt->mnt_ns = ns;
         ns->root = mnt;
-       ns->mounts++;
-       list_add(&mnt->mnt_list, &ns->list);
+       ns->nr_mounts++;
+       mnt_add_to_ns(ns, mnt);
  
         err = vfs_path_lookup(m->mnt_root, m,
                         name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
@@ -4021,10 +4016,9 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
                 goto err_path;
         }
         mnt = real_mount(newmount.mnt);
-       mnt->mnt_ns = ns;
         ns->root = mnt;
-       ns->mounts = 1;
-       list_add(&mnt->mnt_list, &ns->list);
+       ns->nr_mounts = 1;
+       mnt_add_to_ns(ns, mnt);
         mntget(newmount.mnt);
  
         /* Attach to an apparent O_PATH fd with a note that we need to unmount
@@ -4695,10 +4689,9 @@ static void __init init_mount_tree(void)
         if (IS_ERR(ns))
                 panic("Can't allocate initial namespace");
         m = real_mount(mnt);
-       m->mnt_ns = ns;
         ns->root = m;
-       ns->mounts = 1;
-       list_add(&m->mnt_list, &ns->list);
+       ns->nr_mounts = 1;
+       mnt_add_to_ns(ns, m);
         init_task.nsproxy->mnt_ns = ns;
         get_mnt_ns(ns);
  
@@ -4825,18 +4818,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
                                 int *new_mnt_flags)
  {
         int new_flags = *new_mnt_flags;
-       struct mount *mnt;
+       struct mount *mnt, *n;
         bool visible = false;
  
         down_read(&namespace_sem);
-       lock_ns_list(ns);
-       list_for_each_entry(mnt, &ns->list, mnt_list) {
+       rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
                 struct mount *child;
                 int mnt_flags;
  
-               if (mnt_is_cursor(mnt))
-                       continue;
-
                 if (mnt->mnt.mnt_sb->s_type != sb->s_type)
                         continue;
  
@@ -4884,7 +4873,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
         next:   ;
         }
  found:
-       unlock_ns_list(ns);
         up_read(&namespace_sem);
         return visible;
  }
diff --git a/fs/pnode.c b/fs/pnode.c

index e4d0340393d59702bf40b381005642c38a23e369..a799e0315cc9a3bab0492b4c39d0d5c7172449ea 100644 (file)
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -468,7 +468,7 @@ static void umount_one(struct mount *mnt, struct list_head *to_umount)
         mnt->mnt.mnt_flags |= MNT_UMOUNT;
         list_del_init(&mnt->mnt_child);
         list_del_init(&mnt->mnt_umounting);
-       list_move_tail(&mnt->mnt_list, to_umount);
+       move_from_ns(mnt, to_umount);
  }
  
  /*
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c

index 250eb5bf7b522b16476165de34917c5dbc2f8365..73d2274d5f59cf007be0aa2d59056dfe2e324578 100644 (file)
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -283,8 +283,6 @@ static int mounts_open_common(struct inode *inode, struct file *file,
         p->ns = ns;
         p->root = root;
         p->show = show;
-       INIT_LIST_HEAD(&p->cursor.mnt_list);
-       p->cursor.mnt.mnt_flags = MNT_CURSOR;
  
         return 0;
  
@@ -301,7 +299,6 @@ static int mounts_release(struct inode *inode, struct file *file)
         struct seq_file *m = file->private_data;
         struct proc_mounts *p = m->private;
         path_put(&p->root);
-       mnt_cursor_del(p->ns, &p->cursor);
         put_mnt_ns(p->ns);
         return seq_release_private(inode, file);
  }
diff --git a/include/linux/mount.h b/include/linux/mount.h

index ac3dd287619784dfbeb9d3859d6d2287a7d9d423..c34c18b4e8f36f27775c4df624890eb8e6060965 100644 (file)
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -50,8 +50,7 @@ struct path;
  #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
  
  #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
-                           MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \
-                           MNT_CURSOR)
+                           MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB)
  
  #define MNT_INTERNAL   0x4000
  
@@ -65,7 +64,7 @@ struct path;
  #define MNT_SYNC_UMOUNT                0x2000000
  #define MNT_MARKED             0x4000000
  #define MNT_UMOUNT             0x8000000
-#define MNT_CURSOR             0x10000000
+#define MNT_ONRB               0x10000000
  
  struct vfsmount {
         struct dentry *mnt_root;        /* root of the mounted tree */
author	Miklos Szeredi <mszeredi@redhat.com>
	Wed, 25 Oct 2023 14:02:00 +0000 (16:02 +0200)
committer	Christian Brauner <brauner@kernel.org>
	Sat, 18 Nov 2023 13:56:16 +0000 (14:56 +0100)
fs/mount.h		patch \| blob \| blame \| history
fs/namespace.c		patch \| blob \| blame \| history
fs/pnode.c		patch \| blob \| blame \| history
fs/proc_namespace.c		patch \| blob \| blame \| history
include/linux/mount.h		patch \| blob \| blame \| history