From: Linus Torvalds Date: Fri, 5 Dec 2025 22:36:21 +0000 (-0800) Subject: Merge tag 'pull-persistency' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7cd122b55283d3ceef71a5b723ccaa03a72284b4;p=thirdparty%2Flinux.git Merge tag 'pull-persistency' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs Pull persistent dentry infrastructure and conversion from Al Viro: "Some filesystems use a kinda-sorta controlled dentry refcount leak to pin dentries of created objects in dcache (and undo it when removing those). A reference is grabbed and not released, but it's not actually _stored_ anywhere. That works, but it's hard to follow and verify; among other things, we have no way to tell _which_ of the increments is intended to be an unpaired one. Worse, on removal we need to decide whether the reference had already been dropped, which can be non-trivial if that removal is on umount and we need to figure out if this dentry is pinned due to e.g. unlink() not done. Usually that is handled by using kill_litter_super() as ->kill_sb(), but there are open-coded special cases of the same (consider e.g. /proc/self). Things get simpler if we introduce a new dentry flag (DCACHE_PERSISTENT) marking those "leaked" dentries. Having it set claims responsibility for +1 in refcount. The end result this series is aiming for: - get these unbalanced dget() and dput() replaced with new primitives that would, in addition to adjusting refcount, set and clear persistency flag. - instead of having kill_litter_super() mess with removing the remaining "leaked" references (e.g. for all tmpfs files that hadn't been removed prior to umount), have the regular shrink_dcache_for_umount() strip DCACHE_PERSISTENT of all dentries, dropping the corresponding reference if it had been set. After that kill_litter_super() becomes an equivalent of kill_anon_super(). Doing that in a single step is not feasible - it would affect too many places in too many filesystems. It has to be split into a series. This work has really started early in 2024; quite a few preliminary pieces have already gone into mainline. This chunk is finally getting to the meat of that stuff - infrastructure and most of the conversions to it. Some pieces are still sitting in the local branches, but the bulk of that stuff is here" * tag 'pull-persistency' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (54 commits) d_make_discardable(): warn if given a non-persistent dentry kill securityfs_recursive_remove() convert securityfs get rid of kill_litter_super() convert rust_binderfs convert nfsctl convert rpc_pipefs convert hypfs hypfs: swich hypfs_create_u64() to returning int hypfs: switch hypfs_create_str() to returning int hypfs: don't pin dentries twice convert gadgetfs gadgetfs: switch to simple_remove_by_name() convert functionfs functionfs: switch to simple_remove_by_name() functionfs: fix the open/removal races functionfs: need to cancel ->reset_work in ->kill_sb() functionfs: don't bother with ffs->ref in ffs_data_{opened,closed}() functionfs: don't abuse ffs_data_closed() on fs shutdown convert selinuxfs ... --- 7cd122b55283d3ceef71a5b723ccaa03a72284b4 diff --cc Documentation/filesystems/porting.rst index 078142d6587e1,4921b3b0662a7..3397937ed838e --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@@ -1319,11 -1314,5 +1319,18 @@@ instead **mandatory** +vfs_mkdir() now returns a dentry - the one returned by ->mkdir(). If +that dentry is different from the dentry passed in, including if it is +an IS_ERR() dentry pointer, the original dentry is dput(). + +When vfs_mkdir() returns an error, and so both dputs() the original +dentry and doesn't provide a replacement, it also unlocks the parent. +Consequently the return value from vfs_mkdir() can be passed to +end_creating() and the parent will be unlocked precisely when necessary. ++ ++--- ++ ++**mandatory** ++ + kill_litter_super() is gone; convert to DCACHE_PERSISTENT use (as all + in-tree filesystems have done). diff --cc fs/binfmt_misc.c index d7aec5b87c2b1,2093f9dcd3210..8cb1a94339b82 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@@ -800,44 -830,10 +827,12 @@@ static ssize_t bm_register_write(struc e->interp_file = f; } - inode_lock(d_inode(root)); - dentry = lookup_noperm(&QSTR(e->name), root); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - goto out; - - err = -EEXIST; - if (d_really_is_positive(dentry)) - goto out2; - - inode = bm_get_inode(sb, S_IFREG | 0644); - - err = -ENOMEM; - if (!inode) - goto out2; - - refcount_set(&e->users, 1); - e->dentry = dget(dentry); - inode->i_private = e; - inode->i_fop = &bm_entry_operations; - - d_instantiate(dentry, inode); - misc = i_binfmt_misc(inode); - write_lock(&misc->entries_lock); - list_add(&e->list, &misc->entries); - write_unlock(&misc->entries_lock); - - err = 0; - out2: - dput(dentry); - out: - inode_unlock(d_inode(root)); - + err = add_entry(e, sb); if (err) { - if (f) + if (f) { + exe_file_allow_write_access(f); filp_close(f, NULL); + } kfree(e); return err; } diff --cc fs/dcache.c index 9143fd502def9,824d620bb563b..c8e21b4423cff --- a/fs/dcache.c +++ b/fs/dcache.c @@@ -1980,9 -2012,17 +2013,11 @@@ void d_instantiate_new(struct dentry *e lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); + spin_lock(&entry->d_lock); __d_instantiate(entry, inode); + spin_unlock(&entry->d_lock); - WARN_ON(!(inode->i_state & I_NEW)); - inode->i_state &= ~I_NEW & ~I_CREATING; - /* - * Pairs with the barrier in prepare_to_wait_event() to make sure - * ___wait_var_event() either sees the bit cleared or - * waitqueue_active() check in wake_up_var() sees the waiter. - */ - smp_mb(); + WARN_ON(!(inode_state_read(inode) & I_NEW)); + inode_state_clear(inode, I_NEW | I_CREATING); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } diff --cc fs/debugfs/inode.c index 532bd7c46baf8,25a554331ac4f..5f15f855b4b91 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@@ -403,18 -403,17 +403,17 @@@ static struct dentry *debugfs_start_cre return dentry; } -static struct dentry *failed_creating(struct dentry *dentry) +static struct dentry *debugfs_failed_creating(struct dentry *dentry) { - inode_unlock(d_inode(dentry->d_parent)); - dput(dentry); + simple_done_creating(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return ERR_PTR(-ENOMEM); } -static struct dentry *end_creating(struct dentry *dentry) +static struct dentry *debugfs_end_creating(struct dentry *dentry) { - inode_unlock(d_inode(dentry->d_parent)); - return dentry; + simple_done_creating(dentry); + return dentry; // borrowed } static struct dentry *__debugfs_create_file(const char *name, umode_t mode, @@@ -456,9 -450,9 +450,9 @@@ DEBUGFS_I(inode)->raw = real_fops; DEBUGFS_I(inode)->aux = (void *)aux; - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return debugfs_end_creating(dentry); } struct dentry *debugfs_create_file_full(const char *name, umode_t mode, @@@ -602,10 -591,10 +591,10 @@@ struct dentry *debugfs_create_dir(cons /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return debugfs_end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_dir); @@@ -649,10 -633,10 +633,10 @@@ struct dentry *debugfs_create_automount DEBUGFS_I(inode)->automount = f; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return debugfs_end_creating(dentry); } EXPORT_SYMBOL(debugfs_create_automount); @@@ -704,8 -688,8 +688,8 @@@ struct dentry *debugfs_create_symlink(c inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_op = &debugfs_symlink_inode_operations; inode->i_link = link; - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); - return end_creating(dentry); + return debugfs_end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_symlink); diff --cc fs/libfs.c index 2d6657947abd2,0aa630e7eb00f..9264523be85cf --- a/fs/libfs.c +++ b/fs/libfs.c @@@ -2290,25 -2315,35 +2316,33 @@@ void stashed_dentry_prune(struct dentr cmpxchg(stashed, dentry, NULL); } -/* parent must be held exclusive */ +/** + * simple_start_creating - prepare to create a given name + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * Required lock is taken and a lookup in performed prior to creating an + * object in a directory. No permission checking is performed. + * + * Returns: a negative dentry on which vfs_create() or similar may + * be attempted, or an error. + */ struct dentry *simple_start_creating(struct dentry *parent, const char *name) { - struct dentry *dentry; - struct inode *dir = d_inode(parent); + struct qstr qname = QSTR(name); + int err; - inode_lock(dir); - if (unlikely(IS_DEADDIR(dir))) { - inode_unlock(dir); - return ERR_PTR(-ENOENT); - } - dentry = lookup_noperm(&QSTR(name), parent); - if (IS_ERR(dentry)) { - inode_unlock(dir); - return dentry; - } - if (dentry->d_inode) { - dput(dentry); - inode_unlock(dir); - return ERR_PTR(-EEXIST); - } - return dentry; + err = lookup_noperm_common(&qname, parent); + if (err) + return ERR_PTR(err); + return start_dirop(parent, &qname, LOOKUP_CREATE | LOOKUP_EXCL); } EXPORT_SYMBOL(simple_start_creating); + + /* parent must have been held exclusive since simple_start_creating() */ + void simple_done_creating(struct dentry *child) + { + inode_unlock(child->d_parent->d_inode); + dput(child); + } + EXPORT_SYMBOL(simple_done_creating); diff --cc security/apparmor/apparmorfs.c index e4114aa1e04f5,9b9090d38ea22..907bd2667e28c --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@@ -355,17 -355,22 +355,22 @@@ static void aafs_remove(struct dentry * if (!dentry || IS_ERR(dentry)) return; + /* ->d_parent is stable as rename is not supported */ dir = d_inode(dentry->d_parent); - inode_lock(dir); - if (simple_positive(dentry)) { + dentry = start_removing_dentry(dentry->d_parent, dentry); + if (!IS_ERR(dentry) && simple_positive(dentry)) { - if (d_is_dir(dentry)) - simple_rmdir(dir, dentry); - else - simple_unlink(dir, dentry); + if (d_is_dir(dentry)) { + if (!WARN_ON(!simple_empty(dentry))) { + __simple_rmdir(dir, dentry); + dput(dentry); + } + } else { + __simple_unlink(dir, dentry); + dput(dentry); + } d_delete(dentry); - dput(dentry); } - inode_unlock(dir); + end_removing(dentry); simple_release_fs(&aafs_mnt, &aafs_count); }