From: Greg Kroah-Hartman Date: Mon, 13 Aug 2018 17:12:51 +0000 (+0200) Subject: 4.9-stable patches X-Git-Tag: v4.18.1~28 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=00b5453c9f7dadb90e2aabf045c166273d7f48ae;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: proc-fix-proc_sys_prune_dcache-to-hold-a-sb-reference.patch proc-sysctl-don-t-grab-i_lock-under-sysctl_lock.patch proc-sysctl-prune-stale-dentries-during-unregistering.patch --- diff --git a/queue-4.9/proc-fix-proc_sys_prune_dcache-to-hold-a-sb-reference.patch b/queue-4.9/proc-fix-proc_sys_prune_dcache-to-hold-a-sb-reference.patch new file mode 100644 index 00000000000..af89e0475fc --- /dev/null +++ b/queue-4.9/proc-fix-proc_sys_prune_dcache-to-hold-a-sb-reference.patch @@ -0,0 +1,184 @@ +From 2fd1d2c4ceb2248a727696962cf3370dc9f5a0a4 Mon Sep 17 00:00:00 2001 +From: "Eric W. Biederman" +Date: Thu, 6 Jul 2017 08:41:06 -0500 +Subject: proc: Fix proc_sys_prune_dcache to hold a sb reference + +From: Eric W. Biederman + +commit 2fd1d2c4ceb2248a727696962cf3370dc9f5a0a4 upstream. + +Andrei Vagin writes: +FYI: This bug has been reproduced on 4.11.7 +> BUG: Dentry ffff895a3dd01240{i=4e7c09a,n=lo} still in use (1) [unmount of proc proc] +> ------------[ cut here ]------------ +> WARNING: CPU: 1 PID: 13588 at fs/dcache.c:1445 umount_check+0x6e/0x80 +> CPU: 1 PID: 13588 Comm: kworker/1:1 Not tainted 4.11.7-200.fc25.x86_64 #1 +> Hardware name: CompuLab sbc-flt1/fitlet, BIOS SBCFLT_0.08.04 06/27/2015 +> Workqueue: events proc_cleanup_work +> Call Trace: +> dump_stack+0x63/0x86 +> __warn+0xcb/0xf0 +> warn_slowpath_null+0x1d/0x20 +> umount_check+0x6e/0x80 +> d_walk+0xc6/0x270 +> ? dentry_free+0x80/0x80 +> do_one_tree+0x26/0x40 +> shrink_dcache_for_umount+0x2d/0x90 +> generic_shutdown_super+0x1f/0xf0 +> kill_anon_super+0x12/0x20 +> proc_kill_sb+0x40/0x50 +> deactivate_locked_super+0x43/0x70 +> deactivate_super+0x5a/0x60 +> cleanup_mnt+0x3f/0x90 +> mntput_no_expire+0x13b/0x190 +> kern_unmount+0x3e/0x50 +> pid_ns_release_proc+0x15/0x20 +> proc_cleanup_work+0x15/0x20 +> process_one_work+0x197/0x450 +> worker_thread+0x4e/0x4a0 +> kthread+0x109/0x140 +> ? process_one_work+0x450/0x450 +> ? kthread_park+0x90/0x90 +> ret_from_fork+0x2c/0x40 +> ---[ end trace e1c109611e5d0b41 ]--- +> VFS: Busy inodes after unmount of proc. Self-destruct in 5 seconds. Have a nice day... +> BUG: unable to handle kernel NULL pointer dereference at (null) +> IP: _raw_spin_lock+0xc/0x30 +> PGD 0 + +Fix this by taking a reference to the super block in proc_sys_prune_dcache. + +The superblock reference is the core of the fix however the sysctl_inodes +list is converted to a hlist so that hlist_del_init_rcu may be used. This +allows proc_sys_prune_dache to remove inodes the sysctl_inodes list, while +not causing problems for proc_sys_evict_inode when if it later choses to +remove the inode from the sysctl_inodes list. Removing inodes from the +sysctl_inodes list allows proc_sys_prune_dcache to have a progress +guarantee, while still being able to drop all locks. The fact that +head->unregistering is set in start_unregistering ensures that no more +inodes will be added to the the sysctl_inodes list. + +Previously the code did a dance where it delayed calling iput until the +next entry in the list was being considered to ensure the inode remained on +the sysctl_inodes list until the next entry was walked to. The structure +of the loop in this patch does not need that so is much easier to +understand and maintain. + +Cc: stable@vger.kernel.org +Reported-by: Andrei Vagin +Tested-by: Andrei Vagin +Fixes: ace0c791e6c3 ("proc/sysctl: Don't grab i_lock under sysctl_lock.") +Fixes: d6cffbbe9a7e ("proc/sysctl: prune stale dentries during unregistering") +Signed-off-by: "Eric W. Biederman" +Signed-off-by: Greg Kroah-Hartman + +--- + fs/proc/internal.h | 2 +- + fs/proc/proc_sysctl.c | 43 ++++++++++++++++++++++++++++++------------- + include/linux/sysctl.h | 2 +- + 3 files changed, 32 insertions(+), 15 deletions(-) + +--- a/fs/proc/internal.h ++++ b/fs/proc/internal.h +@@ -65,7 +65,7 @@ struct proc_inode { + struct proc_dir_entry *pde; + struct ctl_table_header *sysctl; + struct ctl_table *sysctl_entry; +- struct list_head sysctl_inodes; ++ struct hlist_node sysctl_inodes; + const struct proc_ns_operations *ns_ops; + struct inode vfs_inode; + }; +--- a/fs/proc/proc_sysctl.c ++++ b/fs/proc/proc_sysctl.c +@@ -190,7 +190,7 @@ static void init_header(struct ctl_table + head->set = set; + head->parent = NULL; + head->node = node; +- INIT_LIST_HEAD(&head->inodes); ++ INIT_HLIST_HEAD(&head->inodes); + if (node) { + struct ctl_table *entry; + for (entry = table; entry->procname; entry++, node++) +@@ -260,25 +260,42 @@ static void unuse_table(struct ctl_table + complete(p->unregistering); + } + +-/* called under sysctl_lock */ + static void proc_sys_prune_dcache(struct ctl_table_header *head) + { +- struct inode *inode, *prev = NULL; ++ struct inode *inode; + struct proc_inode *ei; ++ struct hlist_node *node; ++ struct super_block *sb; + + rcu_read_lock(); +- list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) { +- inode = igrab(&ei->vfs_inode); +- if (inode) { +- rcu_read_unlock(); +- iput(prev); +- prev = inode; +- d_prune_aliases(inode); ++ for (;;) { ++ node = hlist_first_rcu(&head->inodes); ++ if (!node) ++ break; ++ ei = hlist_entry(node, struct proc_inode, sysctl_inodes); ++ spin_lock(&sysctl_lock); ++ hlist_del_init_rcu(&ei->sysctl_inodes); ++ spin_unlock(&sysctl_lock); ++ ++ inode = &ei->vfs_inode; ++ sb = inode->i_sb; ++ if (!atomic_inc_not_zero(&sb->s_active)) ++ continue; ++ inode = igrab(inode); ++ rcu_read_unlock(); ++ if (unlikely(!inode)) { ++ deactivate_super(sb); + rcu_read_lock(); ++ continue; + } ++ ++ d_prune_aliases(inode); ++ iput(inode); ++ deactivate_super(sb); ++ ++ rcu_read_lock(); + } + rcu_read_unlock(); +- iput(prev); + } + + /* called under sysctl_lock, will reacquire if has to wait */ +@@ -464,7 +481,7 @@ static struct inode *proc_sys_make_inode + } + ei->sysctl = head; + ei->sysctl_entry = table; +- list_add_rcu(&ei->sysctl_inodes, &head->inodes); ++ hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes); + head->count++; + spin_unlock(&sysctl_lock); + +@@ -492,7 +509,7 @@ out: + void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) + { + spin_lock(&sysctl_lock); +- list_del_rcu(&PROC_I(inode)->sysctl_inodes); ++ hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes); + if (!--head->count) + kfree_rcu(head, rcu); + spin_unlock(&sysctl_lock); +--- a/include/linux/sysctl.h ++++ b/include/linux/sysctl.h +@@ -143,7 +143,7 @@ struct ctl_table_header + struct ctl_table_set *set; + struct ctl_dir *parent; + struct ctl_node *node; +- struct list_head inodes; /* head for proc_inode->sysctl_inodes */ ++ struct hlist_head inodes; /* head for proc_inode->sysctl_inodes */ + }; + + struct ctl_dir { diff --git a/queue-4.9/proc-sysctl-don-t-grab-i_lock-under-sysctl_lock.patch b/queue-4.9/proc-sysctl-don-t-grab-i_lock-under-sysctl_lock.patch new file mode 100644 index 00000000000..036058b50ea --- /dev/null +++ b/queue-4.9/proc-sysctl-don-t-grab-i_lock-under-sysctl_lock.patch @@ -0,0 +1,139 @@ +From ace0c791e6c3cf5ef37cad2df69f0d90ccc40ffb Mon Sep 17 00:00:00 2001 +From: "Eric W. Biederman" +Date: Mon, 20 Feb 2017 18:17:03 +1300 +Subject: proc/sysctl: Don't grab i_lock under sysctl_lock. + +From: Eric W. Biederman + +commit ace0c791e6c3cf5ef37cad2df69f0d90ccc40ffb upstream. + +Konstantin Khlebnikov writes: +> This patch has locking problem. I've got lockdep splat under LTP. +> +> [ 6633.115456] ====================================================== +> [ 6633.115502] [ INFO: possible circular locking dependency detected ] +> [ 6633.115553] 4.9.10-debug+ #9 Tainted: G L +> [ 6633.115584] ------------------------------------------------------- +> [ 6633.115627] ksm02/284980 is trying to acquire lock: +> [ 6633.115659] (&sb->s_type->i_lock_key#4){+.+...}, at: [] igrab+0x1e/0x80 +> [ 6633.115834] but task is already holding lock: +> [ 6633.115882] (sysctl_lock){+.+...}, at: [] unregister_sysctl_table+0x6b/0x110 +> [ 6633.116026] which lock already depends on the new lock. +> [ 6633.116026] +> [ 6633.116080] +> [ 6633.116080] the existing dependency chain (in reverse order) is: +> [ 6633.116117] +> -> #2 (sysctl_lock){+.+...}: +> -> #1 (&(&dentry->d_lockref.lock)->rlock){+.+...}: +> -> #0 (&sb->s_type->i_lock_key#4){+.+...}: +> +> d_lock nests inside i_lock +> sysctl_lock nests inside d_lock in d_compare +> +> This patch adds i_lock nesting inside sysctl_lock. + +Al Viro replied: +> Once ->unregistering is set, you can drop sysctl_lock just fine. So I'd +> try something like this - use rcu_read_lock() in proc_sys_prune_dcache(), +> drop sysctl_lock() before it and regain after. Make sure that no inodes +> are added to the list ones ->unregistering has been set and use RCU list +> primitives for modifying the inode list, with sysctl_lock still used to +> serialize its modifications. +> +> Freeing struct inode is RCU-delayed (see proc_destroy_inode()), so doing +> igrab() is safe there. Since we don't drop inode reference until after we'd +> passed beyond it in the list, list_for_each_entry_rcu() should be fine. + +I agree with Al Viro's analsysis of the situtation. + +Fixes: d6cffbbe9a7e ("proc/sysctl: prune stale dentries during unregistering") +Reported-by: Konstantin Khlebnikov +Tested-by: Konstantin Khlebnikov +Suggested-by: Al Viro +Signed-off-by: "Eric W. Biederman" +Signed-off-by: Greg Kroah-Hartman + +--- + fs/proc/proc_sysctl.c | 31 ++++++++++++++++++------------- + 1 file changed, 18 insertions(+), 13 deletions(-) + +--- a/fs/proc/proc_sysctl.c ++++ b/fs/proc/proc_sysctl.c +@@ -266,21 +266,19 @@ static void proc_sys_prune_dcache(struct + struct inode *inode, *prev = NULL; + struct proc_inode *ei; + +- list_for_each_entry(ei, &head->inodes, sysctl_inodes) { ++ rcu_read_lock(); ++ list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) { + inode = igrab(&ei->vfs_inode); + if (inode) { +- spin_unlock(&sysctl_lock); ++ rcu_read_unlock(); + iput(prev); + prev = inode; + d_prune_aliases(inode); +- spin_lock(&sysctl_lock); ++ rcu_read_lock(); + } + } +- if (prev) { +- spin_unlock(&sysctl_lock); +- iput(prev); +- spin_lock(&sysctl_lock); +- } ++ rcu_read_unlock(); ++ iput(prev); + } + + /* called under sysctl_lock, will reacquire if has to wait */ +@@ -296,10 +294,10 @@ static void start_unregistering(struct c + p->unregistering = &wait; + spin_unlock(&sysctl_lock); + wait_for_completion(&wait); +- spin_lock(&sysctl_lock); + } else { + /* anything non-NULL; we'll never dereference it */ + p->unregistering = ERR_PTR(-EINVAL); ++ spin_unlock(&sysctl_lock); + } + /* + * Prune dentries for unregistered sysctls: namespaced sysctls +@@ -310,6 +308,7 @@ static void start_unregistering(struct c + * do not remove from the list until nobody holds it; walking the + * list in do_sysctl() relies on that. + */ ++ spin_lock(&sysctl_lock); + erase_header(p); + } + +@@ -455,11 +454,17 @@ static struct inode *proc_sys_make_inode + inode->i_ino = get_next_ino(); + + ei = PROC_I(inode); +- ei->sysctl = head; +- ei->sysctl_entry = table; + + spin_lock(&sysctl_lock); +- list_add(&ei->sysctl_inodes, &head->inodes); ++ if (unlikely(head->unregistering)) { ++ spin_unlock(&sysctl_lock); ++ iput(inode); ++ inode = NULL; ++ goto out; ++ } ++ ei->sysctl = head; ++ ei->sysctl_entry = table; ++ list_add_rcu(&ei->sysctl_inodes, &head->inodes); + head->count++; + spin_unlock(&sysctl_lock); + +@@ -487,7 +492,7 @@ out: + void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) + { + spin_lock(&sysctl_lock); +- list_del(&PROC_I(inode)->sysctl_inodes); ++ list_del_rcu(&PROC_I(inode)->sysctl_inodes); + if (!--head->count) + kfree_rcu(head, rcu); + spin_unlock(&sysctl_lock); diff --git a/queue-4.9/proc-sysctl-prune-stale-dentries-during-unregistering.patch b/queue-4.9/proc-sysctl-prune-stale-dentries-during-unregistering.patch new file mode 100644 index 00000000000..6737d9d71a6 --- /dev/null +++ b/queue-4.9/proc-sysctl-prune-stale-dentries-during-unregistering.patch @@ -0,0 +1,218 @@ +From d6cffbbe9a7e51eb705182965a189457c17ba8a3 Mon Sep 17 00:00:00 2001 +From: Konstantin Khlebnikov +Date: Fri, 10 Feb 2017 10:35:02 +0300 +Subject: proc/sysctl: prune stale dentries during unregistering + +From: Konstantin Khlebnikov + +commit d6cffbbe9a7e51eb705182965a189457c17ba8a3 upstream. + +Currently unregistering sysctl table does not prune its dentries. +Stale dentries could slowdown sysctl operations significantly. + +For example, command: + + # for i in {1..100000} ; do unshare -n -- sysctl -a &> /dev/null ; done + creates a millions of stale denties around sysctls of loopback interface: + + # sysctl fs.dentry-state + fs.dentry-state = 25812579 24724135 45 0 0 0 + + All of them have matching names thus lookup have to scan though whole + hash chain and call d_compare (proc_sys_compare) which checks them + under system-wide spinlock (sysctl_lock). + + # time sysctl -a > /dev/null + real 1m12.806s + user 0m0.016s + sys 1m12.400s + +Currently only memory reclaimer could remove this garbage. +But without significant memory pressure this never happens. + +This patch collects sysctl inodes into list on sysctl table header and +prunes all their dentries once that table unregisters. + +Konstantin Khlebnikov writes: +> On 10.02.2017 10:47, Al Viro wrote: +>> how about >> the matching stats *after* that patch? +> +> dcache size doesn't grow endlessly, so stats are fine +> +> # sysctl fs.dentry-state +> fs.dentry-state = 92712 58376 45 0 0 0 +> +> # time sysctl -a &>/dev/null +> +> real 0m0.013s +> user 0m0.004s +> sys 0m0.008s + +Signed-off-by: Konstantin Khlebnikov +Suggested-by: Al Viro +Signed-off-by: Eric W. Biederman +Signed-off-by: Greg Kroah-Hartman + +--- + fs/proc/inode.c | 3 +- + fs/proc/internal.h | 7 ++++- + fs/proc/proc_sysctl.c | 59 +++++++++++++++++++++++++++++++++++-------------- + include/linux/sysctl.h | 1 + 4 files changed, 51 insertions(+), 19 deletions(-) + +--- a/fs/proc/inode.c ++++ b/fs/proc/inode.c +@@ -43,10 +43,11 @@ static void proc_evict_inode(struct inod + de = PDE(inode); + if (de) + pde_put(de); ++ + head = PROC_I(inode)->sysctl; + if (head) { + RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); +- sysctl_head_put(head); ++ proc_sys_evict_inode(inode, head); + } + } + +--- a/fs/proc/internal.h ++++ b/fs/proc/internal.h +@@ -65,6 +65,7 @@ struct proc_inode { + struct proc_dir_entry *pde; + struct ctl_table_header *sysctl; + struct ctl_table *sysctl_entry; ++ struct list_head sysctl_inodes; + const struct proc_ns_operations *ns_ops; + struct inode vfs_inode; + }; +@@ -249,10 +250,12 @@ extern void proc_thread_self_init(void); + */ + #ifdef CONFIG_PROC_SYSCTL + extern int proc_sys_init(void); +-extern void sysctl_head_put(struct ctl_table_header *); ++extern void proc_sys_evict_inode(struct inode *inode, ++ struct ctl_table_header *head); + #else + static inline void proc_sys_init(void) { } +-static inline void sysctl_head_put(struct ctl_table_header *head) { } ++static inline void proc_sys_evict_inode(struct inode *inode, ++ struct ctl_table_header *head) { } + #endif + + /* +--- a/fs/proc/proc_sysctl.c ++++ b/fs/proc/proc_sysctl.c +@@ -190,6 +190,7 @@ static void init_header(struct ctl_table + head->set = set; + head->parent = NULL; + head->node = node; ++ INIT_LIST_HEAD(&head->inodes); + if (node) { + struct ctl_table *entry; + for (entry = table; entry->procname; entry++, node++) +@@ -259,6 +260,29 @@ static void unuse_table(struct ctl_table + complete(p->unregistering); + } + ++/* called under sysctl_lock */ ++static void proc_sys_prune_dcache(struct ctl_table_header *head) ++{ ++ struct inode *inode, *prev = NULL; ++ struct proc_inode *ei; ++ ++ list_for_each_entry(ei, &head->inodes, sysctl_inodes) { ++ inode = igrab(&ei->vfs_inode); ++ if (inode) { ++ spin_unlock(&sysctl_lock); ++ iput(prev); ++ prev = inode; ++ d_prune_aliases(inode); ++ spin_lock(&sysctl_lock); ++ } ++ } ++ if (prev) { ++ spin_unlock(&sysctl_lock); ++ iput(prev); ++ spin_lock(&sysctl_lock); ++ } ++} ++ + /* called under sysctl_lock, will reacquire if has to wait */ + static void start_unregistering(struct ctl_table_header *p) + { +@@ -278,27 +302,17 @@ static void start_unregistering(struct c + p->unregistering = ERR_PTR(-EINVAL); + } + /* ++ * Prune dentries for unregistered sysctls: namespaced sysctls ++ * can have duplicate names and contaminate dcache very badly. ++ */ ++ proc_sys_prune_dcache(p); ++ /* + * do not remove from the list until nobody holds it; walking the + * list in do_sysctl() relies on that. + */ + erase_header(p); + } + +-static void sysctl_head_get(struct ctl_table_header *head) +-{ +- spin_lock(&sysctl_lock); +- head->count++; +- spin_unlock(&sysctl_lock); +-} +- +-void sysctl_head_put(struct ctl_table_header *head) +-{ +- spin_lock(&sysctl_lock); +- if (!--head->count) +- kfree_rcu(head, rcu); +- spin_unlock(&sysctl_lock); +-} +- + static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) + { + BUG_ON(!head); +@@ -440,11 +454,15 @@ static struct inode *proc_sys_make_inode + + inode->i_ino = get_next_ino(); + +- sysctl_head_get(head); + ei = PROC_I(inode); + ei->sysctl = head; + ei->sysctl_entry = table; + ++ spin_lock(&sysctl_lock); ++ list_add(&ei->sysctl_inodes, &head->inodes); ++ head->count++; ++ spin_unlock(&sysctl_lock); ++ + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mode = table->mode; + if (!S_ISDIR(table->mode)) { +@@ -466,6 +484,15 @@ out: + return inode; + } + ++void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) ++{ ++ spin_lock(&sysctl_lock); ++ list_del(&PROC_I(inode)->sysctl_inodes); ++ if (!--head->count) ++ kfree_rcu(head, rcu); ++ spin_unlock(&sysctl_lock); ++} ++ + static struct ctl_table_header *grab_header(struct inode *inode) + { + struct ctl_table_header *head = PROC_I(inode)->sysctl; +--- a/include/linux/sysctl.h ++++ b/include/linux/sysctl.h +@@ -143,6 +143,7 @@ struct ctl_table_header + struct ctl_table_set *set; + struct ctl_dir *parent; + struct ctl_node *node; ++ struct list_head inodes; /* head for proc_inode->sysctl_inodes */ + }; + + struct ctl_dir { diff --git a/queue-4.9/series b/queue-4.9/series index 75aa15537ec..b9abdc77775 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -12,3 +12,6 @@ root-dentries-need-rcu-delayed-freeing.patch make-sure-that-__dentry_kill-always-invalidates-d_seq-unhashed-or-not.patch fix-mntput-mntput-race.patch fix-__legitimize_mnt-mntput-race.patch +proc-sysctl-prune-stale-dentries-during-unregistering.patch +proc-sysctl-don-t-grab-i_lock-under-sysctl_lock.patch +proc-fix-proc_sys_prune_dcache-to-hold-a-sb-reference.patch