fs: lockless mntns rbtree lookup

author Christian Brauner <brauner@kernel.org>

Thu, 12 Dec 2024 23:03:42 +0000 (00:03 +0100)

committer Christian Brauner <brauner@kernel.org>

Thu, 9 Jan 2025 15:58:52 +0000 (16:58 +0100)
author Christian Brauner <brauner@kernel.org>
Thu, 12 Dec 2024 23:03:42 +0000 (00:03 +0100)
committer Christian Brauner <brauner@kernel.org>
Thu, 9 Jan 2025 15:58:52 +0000 (16:58 +0100)
diff --git a/fs/mount.h b/fs/mount.h

index 179f690a0c7223f04b57be181b7f0412ce6a7946..bd8d6c36b4214519758d3af589c827358dadc728 100644 (file)
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -12,7 +12,10 @@ struct mnt_namespace {
         struct user_namespace   *user_ns;
         struct ucounts          *ucounts;
         u64                     seq;    /* Sequence number to prevent loops */
-       wait_queue_head_t poll;
+       union {
+               wait_queue_head_t       poll;
+               struct rcu_head         mnt_ns_rcu;
+       };
         u64 event;
         unsigned int            nr_mounts; /* # of mounts in the namespace */
         unsigned int            pending_mounts;
diff --git a/fs/namespace.c b/fs/namespace.c

index 0ca2cda6bc16cfe0f15ec31146b56682b78ebf4d..d2dbbbc91cc08ae1c7ca4d41bd527de961936e2a 100644 (file)
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem);
  static HLIST_HEAD(unmounted);  /* protected by namespace_sem */
  static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
  static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
+
  static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
  
  struct mount_kattr {
@@ -105,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
   */
  __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
  
-static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
-{
-       u64 seq_b = ns->seq;
-
-       if (seq < seq_b)
-               return -1;
-       if (seq > seq_b)
-               return 1;
-       return 0;
-}
-
  static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
  {
         if (!node)
@@ -123,19 +114,41 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
         return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
  }
  
-static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
  {
         struct mnt_namespace *ns_a = node_to_mnt_ns(a);
         struct mnt_namespace *ns_b = node_to_mnt_ns(b);
         u64 seq_a = ns_a->seq;
+       u64 seq_b = ns_b->seq;
  
-       return mnt_ns_cmp(seq_a, ns_b) < 0;
+       if (seq_a < seq_b)
+               return -1;
+       if (seq_a > seq_b)
+               return 1;
+       return 0;
+}
+
+static inline void mnt_ns_tree_write_lock(void)
+{
+       write_lock(&mnt_ns_tree_lock);
+       write_seqcount_begin(&mnt_ns_tree_seqcount);
+}
+
+static inline void mnt_ns_tree_write_unlock(void)
+{
+       write_seqcount_end(&mnt_ns_tree_seqcount);
+       write_unlock(&mnt_ns_tree_lock);
  }
  
  static void mnt_ns_tree_add(struct mnt_namespace *ns)
  {
-       guard(write_lock)(&mnt_ns_tree_lock);
-       rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+       struct rb_node *node;
+
+       mnt_ns_tree_write_lock();
+       node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
+       mnt_ns_tree_write_unlock();
+
+       WARN_ON_ONCE(node);
  }
  
  static void mnt_ns_release(struct mnt_namespace *ns)
@@ -150,41 +163,33 @@ static void mnt_ns_release(struct mnt_namespace *ns)
  }
  DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
  
+static void mnt_ns_release_rcu(struct rcu_head *rcu)
+{
+       mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+}
+
  static void mnt_ns_tree_remove(struct mnt_namespace *ns)
  {
         /* remove from global mount namespace list */
         if (!is_anon_ns(ns)) {
-               guard(write_lock)(&mnt_ns_tree_lock);
+               mnt_ns_tree_write_lock();
                 rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+               mnt_ns_tree_write_unlock();
         }
  
-       mnt_ns_release(ns);
+       call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
  }
  
-/*
- * Returns the mount namespace which either has the specified id, or has the
- * next smallest id afer the specified one.
- */
-static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+static int mnt_ns_find(const void *key, const struct rb_node *node)
  {
-       struct rb_node *node = mnt_ns_tree.rb_node;
-       struct mnt_namespace *ret = NULL;
+       const u64 mnt_ns_id = *(u64 *)key;
+       const struct mnt_namespace *ns = node_to_mnt_ns(node);
  
-       lockdep_assert_held(&mnt_ns_tree_lock);
-
-       while (node) {
-               struct mnt_namespace *n = node_to_mnt_ns(node);
-
-               if (mnt_ns_id <= n->seq) {
-                       ret = node_to_mnt_ns(node);
-                       if (mnt_ns_id == n->seq)
-                               break;
-                       node = node->rb_left;
-               } else {
-                       node = node->rb_right;
-               }
-       }
-       return ret;
+       if (mnt_ns_id < ns->seq)
+               return -1;
+       if (mnt_ns_id > ns->seq)
+               return 1;
+       return 0;
  }
  
  /*
@@ -194,18 +199,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
   * namespace the @namespace_sem must first be acquired. If the namespace has
   * already shut down before acquiring @namespace_sem, {list,stat}mount() will
   * see that the mount rbtree of the namespace is empty.
+ *
+ * Note the lookup is lockless protected by a sequence counter. We only
+ * need to guard against false negatives as false positives aren't
+ * possible. So if we didn't find a mount namespace and the sequence
+ * counter has changed we need to retry. If the sequence counter is
+ * still the same we know the search actually failed.
   */
  static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
  {
-       struct mnt_namespace *ns;
+       struct mnt_namespace *ns;
+       struct rb_node *node;
+       unsigned int seq;
+
+       guard(rcu)();
+       do {
+               seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
+               node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
+               if (node)
+                       break;
+       } while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));
  
-       guard(read_lock)(&mnt_ns_tree_lock);
-       ns = mnt_ns_find_id_at(mnt_ns_id);
-       if (!ns || ns->seq != mnt_ns_id)
-               return NULL;
+       if (!node)
+               return NULL;
  
-       refcount_inc(&ns->passive);
-       return ns;
+       /*
+        * The last reference count is put with RCU delay so we can
+        * unconditonally acquire a reference here.
+        */
+       ns = node_to_mnt_ns(node);
+       refcount_inc(&ns->passive);
+       return ns;
  }
  
  static inline void lock_mount_hash(void)
author	Christian Brauner <brauner@kernel.org>
	Thu, 12 Dec 2024 23:03:42 +0000 (00:03 +0100)
committer	Christian Brauner <brauner@kernel.org>
	Thu, 9 Jan 2025 15:58:52 +0000 (16:58 +0100)
fs/mount.h		patch \| blob \| blame \| history
fs/namespace.c		patch \| blob \| blame \| history