git.ipfire.org Git - thirdparty/linux.git/commitdiff
wind ->s_roots via ->d_sib instead of ->d_hash
author Al Viro <viro@zeniv.linux.org.uk>
Sat, 18 Apr 2026 22:39:03 +0000 (18:39 -0400)
committer Al Viro <viro@zeniv.linux.org.uk>
Fri, 5 Jun 2026 04:34:56 +0000 (00:34 -0400)
shrink_dcache_for_umount() is supposed to handle the possibility of
some of the dentries to be evicted being in other threads' shrink
lists; it either kills them, leaving an empty husk to be freed by
the owner of shrink list whenever it gets around to that, or it
waits for the eviction in progress to get completed.

That relies upon dentry remaining attached to the tree until the
eviction reaches dentry_unlist() and its ->d_sib gets removed
from the list.  Unfortunately, the secondary roots are linked
via ->d_hash rather than ->d_sib, and they get removed from
that list before their inode references are dropped.

If shrink_dentry_list() from another thread ends up evicting
one of the secondary roots and gets to that point in dentry_kill()
when shrink_dcache_for_umount() is looking for secondary roots,
the latter will *not* notice anything, possibly leading to
warnings about busy inodes at umount time and all kinds of breakage
after that.

Moreover, shrink_dcache_for_umount() walks the list of secondary
roots with no protection whatsoever, so it might end up calling
dget() on a dentry that already passed through
lockref_mark_dead(&dentry->d_lockref);
ending up with corrupted refcount and possible UAF.

AFAICS, the most straightforward way to deal with that would be
to have secondary roots linked via ->d_sib rather than ->d_hash;
then they would remain on the list until killed, and we could
use d_add_waiter() machinery to wait for eviction in progress.

Changes:
* secondary roots look the same as ->s_root from d_unhashed()
and d_unlinked() POV now.
* secondary roots are represented as "no parent, but on ->d_sib"
instead of "no parent, but on ->d_hash".
* since ->d_sib is a plain hlist, we protect it with per-superblock
spinlock (sb->s_roots_lock) instead of the LSB of the head pointer (for
non-root dentries it would be protected by ->d_lock of parent).
* __d_obtain_alias() uses ->d_sib for linkage when allocating
a secondary root.
* d_splice_alias_ops() detects splicing of a secondary root and
removes it from the list before calling __d_move().
* dentry_unlist() detects eviction of a secondary root and
removes it from the list; no need to play the games for d_walk()'s sake,
since the latter is not going to look for the next sibling of those
anyway.
* ___d_drop() doesn't care about ->s_roots anymore.
* shrink_dcache_for_umount() uses proper locking for access to
the list of secondary roots and if it runs into one that is in the middle
of eviction waits for that to finish.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
fs/dcache.c
fs/super.c
include/linux/fs/super_types.h

index 1f4435b7f9cea8c5f014634bfcfd3669ed10995f..257eefc46f5ef61a69485080a27344876fa53f17 100644 (file)
@@ -43,8 +43,8 @@
  *   - i_dentry, d_alias, d_inode of aliases
  * dcache_hash_bucket lock protects:
  *   - the dcache hash table
- * s_roots bl list spinlock protects:
- *   - the s_roots list (see __d_drop)
+ * s_roots_lock protects:
+ *   - the s_roots list (see __d_move()/dentry_unlist()/d_obtain_root())
  * dentry->d_sb->s_dentry_lru_lock protects:
  *   - the dcache lru lists and counters
  * d_lock protects:
@@ -562,16 +562,7 @@ static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
 
 static void ___d_drop(struct dentry *dentry)
 {
-       struct hlist_bl_head *b;
-       /*
-        * Hashed dentries are normally on the dentry hashtable,
-        * with the exception of those newly allocated by
-        * d_obtain_root, which are always IS_ROOT:
-        */
-       if (unlikely(IS_ROOT(dentry)))
-               b = &dentry->d_sb->s_roots;
-       else
-               b = d_hash(dentry->d_name.hash);
+       struct hlist_bl_head *b = d_hash(dentry->d_name.hash);
 
        hlist_bl_lock(b);
        __hlist_bl_del(&dentry->d_hash);
@@ -654,6 +645,13 @@ static inline void d_complete_waiters(struct dentry *dentry)
        }
 }
 
+static void unlink_secondary_root(struct dentry *dentry)
+{
+       spin_lock(&dentry->d_sb->s_roots_lock);
+       hlist_del_init(&dentry->d_sib);
+       spin_unlock(&dentry->d_sb->s_roots_lock);
+}
+
 static inline void dentry_unlist(struct dentry *dentry)
 {
        struct dentry *next;
@@ -665,6 +663,10 @@ static inline void dentry_unlist(struct dentry *dentry)
        d_complete_waiters(dentry);
        if (unlikely(hlist_unhashed(&dentry->d_sib)))
                return;
+       if (unlikely(IS_ROOT(dentry))) {
+               unlink_secondary_root(dentry); // secondary root goes away
+               return;
+       }
        __hlist_del(&dentry->d_sib);
        /*
         * Cursors can move around the list of children.  While we'd been
@@ -1805,9 +1807,30 @@ void shrink_dcache_for_umount(struct super_block *sb)
        sb->s_root = NULL;
        do_one_tree(dentry);
 
-       while (!hlist_bl_empty(&sb->s_roots)) {
-               dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash));
-               do_one_tree(dentry);
+       for (;;) {
+               spin_lock(&sb->s_roots_lock);
+               dentry = hlist_entry_safe(sb->s_roots.first,
+                                         struct dentry, d_sib);
+               if (!dentry) {
+                       spin_unlock(&sb->s_roots_lock);
+                       break;
+               }
+               rcu_read_lock();
+               spin_unlock(&sb->s_roots_lock);
+               spin_lock(&dentry->d_lock);
+               rcu_read_unlock();
+               if (unlikely(dentry->d_lockref.count < 0)) {
+                       struct completion_list wait;
+                       bool need_wait = d_add_waiter(dentry, &wait);
+
+                       spin_unlock(&dentry->d_lock);
+                       if (need_wait)
+                               wait_for_completion(&wait.completion);
+               } else {
+                       dget_dlock(dentry);
+                       spin_unlock(&dentry->d_lock);
+                       do_one_tree(dentry);
+               }
        }
 }
 
@@ -2224,9 +2247,9 @@ static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected)
                __d_set_inode_and_type(new, inode, add_flags);
                hlist_add_head(&new->d_alias, &inode->i_dentry);
                if (!disconnected) {
-                       hlist_bl_lock(&sb->s_roots);
-                       hlist_bl_add_head(&new->d_hash, &sb->s_roots);
-                       hlist_bl_unlock(&sb->s_roots);
+                       spin_lock(&sb->s_roots_lock);
+                       hlist_add_head(&new->d_sib, &sb->s_roots);
+                       spin_unlock(&sb->s_roots_lock);
                }
                spin_unlock(&new->d_lock);
                spin_unlock(&inode->i_lock);
@@ -3238,6 +3261,12 @@ struct dentry *d_splice_alias_ops(struct inode *inode, struct dentry *dentry,
                                }
                                dput(old_parent);
                        } else {
+                               if (unlikely(!hlist_unhashed(&new->d_sib))) {
+                                       // secondary root getting spliced
+                                       spin_lock(&new->d_lock);
+                                       unlink_secondary_root(new);
+                                       spin_unlock(&new->d_lock);
+                               }
                                __d_move(new, dentry, false);
                                write_sequnlock(&rename_lock);
                        }
index 378e81efe643bd3c8156f5fa2c1b64419fd7bb23..fb44ebadda82a241d4e77de69e167c1a675b4d35 100644 (file)
@@ -359,6 +359,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
                s->s_iflags |= SB_I_NODEV;
        INIT_HLIST_NODE(&s->s_instances);
        INIT_HLIST_BL_HEAD(&s->s_roots);
+       spin_lock_init(&s->s_roots_lock);
        mutex_init(&s->s_sync_lock);
        INIT_LIST_HEAD(&s->s_inodes);
        spin_lock_init(&s->s_inode_list_lock);
index 383050e7fdf57c066efa235f083a2adc760afb80..23d1c2612d0cc0c568ea15a644a0f72ea294e3b2 100644 (file)
@@ -162,7 +162,8 @@ struct super_block {
        struct unicode_map                      *s_encoding;
        __u16                                   s_encoding_flags;
 #endif
-       struct hlist_bl_head                    s_roots;        /* alternate root dentries for NFS */
+       struct hlist_head                       s_roots;        /* alternate root dentries for NFS */
+       spinlock_t                              s_roots_lock;
        struct mount                            *s_mounts;      /* list of mounts; _not_ for fs use */
        struct block_device                     *s_bdev;        /* can go away once we use an accessor for @s_bdev_file */
        struct file                             *s_bdev_file;