--- /dev/null
+From 2fd1d2c4ceb2248a727696962cf3370dc9f5a0a4 Mon Sep 17 00:00:00 2001
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+Date: Thu, 6 Jul 2017 08:41:06 -0500
+Subject: proc: Fix proc_sys_prune_dcache to hold a sb reference
+
+From: Eric W. Biederman <ebiederm@xmission.com>
+
+commit 2fd1d2c4ceb2248a727696962cf3370dc9f5a0a4 upstream.
+
+Andrei Vagin writes:
+FYI: This bug has been reproduced on 4.11.7
+> BUG: Dentry ffff895a3dd01240{i=4e7c09a,n=lo} still in use (1) [unmount of proc proc]
+> ------------[ cut here ]------------
+> WARNING: CPU: 1 PID: 13588 at fs/dcache.c:1445 umount_check+0x6e/0x80
+> CPU: 1 PID: 13588 Comm: kworker/1:1 Not tainted 4.11.7-200.fc25.x86_64 #1
+> Hardware name: CompuLab sbc-flt1/fitlet, BIOS SBCFLT_0.08.04 06/27/2015
+> Workqueue: events proc_cleanup_work
+> Call Trace:
+> dump_stack+0x63/0x86
+> __warn+0xcb/0xf0
+> warn_slowpath_null+0x1d/0x20
+> umount_check+0x6e/0x80
+> d_walk+0xc6/0x270
+> ? dentry_free+0x80/0x80
+> do_one_tree+0x26/0x40
+> shrink_dcache_for_umount+0x2d/0x90
+> generic_shutdown_super+0x1f/0xf0
+> kill_anon_super+0x12/0x20
+> proc_kill_sb+0x40/0x50
+> deactivate_locked_super+0x43/0x70
+> deactivate_super+0x5a/0x60
+> cleanup_mnt+0x3f/0x90
+> mntput_no_expire+0x13b/0x190
+> kern_unmount+0x3e/0x50
+> pid_ns_release_proc+0x15/0x20
+> proc_cleanup_work+0x15/0x20
+> process_one_work+0x197/0x450
+> worker_thread+0x4e/0x4a0
+> kthread+0x109/0x140
+> ? process_one_work+0x450/0x450
+> ? kthread_park+0x90/0x90
+> ret_from_fork+0x2c/0x40
+> ---[ end trace e1c109611e5d0b41 ]---
+> VFS: Busy inodes after unmount of proc. Self-destruct in 5 seconds. Have a nice day...
+> BUG: unable to handle kernel NULL pointer dereference at (null)
+> IP: _raw_spin_lock+0xc/0x30
+> PGD 0
+
+Fix this by taking a reference to the super block in proc_sys_prune_dcache.
+
+The superblock reference is the core of the fix however the sysctl_inodes
+list is converted to a hlist so that hlist_del_init_rcu may be used. This
+allows proc_sys_prune_dache to remove inodes the sysctl_inodes list, while
+not causing problems for proc_sys_evict_inode when if it later choses to
+remove the inode from the sysctl_inodes list. Removing inodes from the
+sysctl_inodes list allows proc_sys_prune_dcache to have a progress
+guarantee, while still being able to drop all locks. The fact that
+head->unregistering is set in start_unregistering ensures that no more
+inodes will be added to the the sysctl_inodes list.
+
+Previously the code did a dance where it delayed calling iput until the
+next entry in the list was being considered to ensure the inode remained on
+the sysctl_inodes list until the next entry was walked to. The structure
+of the loop in this patch does not need that so is much easier to
+understand and maintain.
+
+Cc: stable@vger.kernel.org
+Reported-by: Andrei Vagin <avagin@gmail.com>
+Tested-by: Andrei Vagin <avagin@openvz.org>
+Fixes: ace0c791e6c3 ("proc/sysctl: Don't grab i_lock under sysctl_lock.")
+Fixes: d6cffbbe9a7e ("proc/sysctl: prune stale dentries during unregistering")
+Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/proc/internal.h | 2 +-
+ fs/proc/proc_sysctl.c | 43 ++++++++++++++++++++++++++++++-------------
+ include/linux/sysctl.h | 2 +-
+ 3 files changed, 32 insertions(+), 15 deletions(-)
+
+--- a/fs/proc/internal.h
++++ b/fs/proc/internal.h
+@@ -65,7 +65,7 @@ struct proc_inode {
+ struct proc_dir_entry *pde;
+ struct ctl_table_header *sysctl;
+ struct ctl_table *sysctl_entry;
+- struct list_head sysctl_inodes;
++ struct hlist_node sysctl_inodes;
+ const struct proc_ns_operations *ns_ops;
+ struct inode vfs_inode;
+ };
+--- a/fs/proc/proc_sysctl.c
++++ b/fs/proc/proc_sysctl.c
+@@ -190,7 +190,7 @@ static void init_header(struct ctl_table
+ head->set = set;
+ head->parent = NULL;
+ head->node = node;
+- INIT_LIST_HEAD(&head->inodes);
++ INIT_HLIST_HEAD(&head->inodes);
+ if (node) {
+ struct ctl_table *entry;
+ for (entry = table; entry->procname; entry++, node++)
+@@ -260,25 +260,42 @@ static void unuse_table(struct ctl_table
+ complete(p->unregistering);
+ }
+
+-/* called under sysctl_lock */
+ static void proc_sys_prune_dcache(struct ctl_table_header *head)
+ {
+- struct inode *inode, *prev = NULL;
++ struct inode *inode;
+ struct proc_inode *ei;
++ struct hlist_node *node;
++ struct super_block *sb;
+
+ rcu_read_lock();
+- list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) {
+- inode = igrab(&ei->vfs_inode);
+- if (inode) {
+- rcu_read_unlock();
+- iput(prev);
+- prev = inode;
+- d_prune_aliases(inode);
++ for (;;) {
++ node = hlist_first_rcu(&head->inodes);
++ if (!node)
++ break;
++ ei = hlist_entry(node, struct proc_inode, sysctl_inodes);
++ spin_lock(&sysctl_lock);
++ hlist_del_init_rcu(&ei->sysctl_inodes);
++ spin_unlock(&sysctl_lock);
++
++ inode = &ei->vfs_inode;
++ sb = inode->i_sb;
++ if (!atomic_inc_not_zero(&sb->s_active))
++ continue;
++ inode = igrab(inode);
++ rcu_read_unlock();
++ if (unlikely(!inode)) {
++ deactivate_super(sb);
+ rcu_read_lock();
++ continue;
+ }
++
++ d_prune_aliases(inode);
++ iput(inode);
++ deactivate_super(sb);
++
++ rcu_read_lock();
+ }
+ rcu_read_unlock();
+- iput(prev);
+ }
+
+ /* called under sysctl_lock, will reacquire if has to wait */
+@@ -464,7 +481,7 @@ static struct inode *proc_sys_make_inode
+ }
+ ei->sysctl = head;
+ ei->sysctl_entry = table;
+- list_add_rcu(&ei->sysctl_inodes, &head->inodes);
++ hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes);
+ head->count++;
+ spin_unlock(&sysctl_lock);
+
+@@ -492,7 +509,7 @@ out:
+ void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
+ {
+ spin_lock(&sysctl_lock);
+- list_del_rcu(&PROC_I(inode)->sysctl_inodes);
++ hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes);
+ if (!--head->count)
+ kfree_rcu(head, rcu);
+ spin_unlock(&sysctl_lock);
+--- a/include/linux/sysctl.h
++++ b/include/linux/sysctl.h
+@@ -143,7 +143,7 @@ struct ctl_table_header
+ struct ctl_table_set *set;
+ struct ctl_dir *parent;
+ struct ctl_node *node;
+- struct list_head inodes; /* head for proc_inode->sysctl_inodes */
++ struct hlist_head inodes; /* head for proc_inode->sysctl_inodes */
+ };
+
+ struct ctl_dir {
--- /dev/null
+From ace0c791e6c3cf5ef37cad2df69f0d90ccc40ffb Mon Sep 17 00:00:00 2001
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+Date: Mon, 20 Feb 2017 18:17:03 +1300
+Subject: proc/sysctl: Don't grab i_lock under sysctl_lock.
+
+From: Eric W. Biederman <ebiederm@xmission.com>
+
+commit ace0c791e6c3cf5ef37cad2df69f0d90ccc40ffb upstream.
+
+Konstantin Khlebnikov <khlebnikov@yandex-team.ru> writes:
+> This patch has locking problem. I've got lockdep splat under LTP.
+>
+> [ 6633.115456] ======================================================
+> [ 6633.115502] [ INFO: possible circular locking dependency detected ]
+> [ 6633.115553] 4.9.10-debug+ #9 Tainted: G L
+> [ 6633.115584] -------------------------------------------------------
+> [ 6633.115627] ksm02/284980 is trying to acquire lock:
+> [ 6633.115659] (&sb->s_type->i_lock_key#4){+.+...}, at: [<ffffffff816bc1ce>] igrab+0x1e/0x80
+> [ 6633.115834] but task is already holding lock:
+> [ 6633.115882] (sysctl_lock){+.+...}, at: [<ffffffff817e379b>] unregister_sysctl_table+0x6b/0x110
+> [ 6633.116026] which lock already depends on the new lock.
+> [ 6633.116026]
+> [ 6633.116080]
+> [ 6633.116080] the existing dependency chain (in reverse order) is:
+> [ 6633.116117]
+> -> #2 (sysctl_lock){+.+...}:
+> -> #1 (&(&dentry->d_lockref.lock)->rlock){+.+...}:
+> -> #0 (&sb->s_type->i_lock_key#4){+.+...}:
+>
+> d_lock nests inside i_lock
+> sysctl_lock nests inside d_lock in d_compare
+>
+> This patch adds i_lock nesting inside sysctl_lock.
+
+Al Viro <viro@ZenIV.linux.org.uk> replied:
+> Once ->unregistering is set, you can drop sysctl_lock just fine. So I'd
+> try something like this - use rcu_read_lock() in proc_sys_prune_dcache(),
+> drop sysctl_lock() before it and regain after. Make sure that no inodes
+> are added to the list ones ->unregistering has been set and use RCU list
+> primitives for modifying the inode list, with sysctl_lock still used to
+> serialize its modifications.
+>
+> Freeing struct inode is RCU-delayed (see proc_destroy_inode()), so doing
+> igrab() is safe there. Since we don't drop inode reference until after we'd
+> passed beyond it in the list, list_for_each_entry_rcu() should be fine.
+
+I agree with Al Viro's analsysis of the situtation.
+
+Fixes: d6cffbbe9a7e ("proc/sysctl: prune stale dentries during unregistering")
+Reported-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Tested-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Suggested-by: Al Viro <viro@ZenIV.linux.org.uk>
+Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/proc/proc_sysctl.c | 31 ++++++++++++++++++-------------
+ 1 file changed, 18 insertions(+), 13 deletions(-)
+
+--- a/fs/proc/proc_sysctl.c
++++ b/fs/proc/proc_sysctl.c
+@@ -266,21 +266,19 @@ static void proc_sys_prune_dcache(struct
+ struct inode *inode, *prev = NULL;
+ struct proc_inode *ei;
+
+- list_for_each_entry(ei, &head->inodes, sysctl_inodes) {
++ rcu_read_lock();
++ list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) {
+ inode = igrab(&ei->vfs_inode);
+ if (inode) {
+- spin_unlock(&sysctl_lock);
++ rcu_read_unlock();
+ iput(prev);
+ prev = inode;
+ d_prune_aliases(inode);
+- spin_lock(&sysctl_lock);
++ rcu_read_lock();
+ }
+ }
+- if (prev) {
+- spin_unlock(&sysctl_lock);
+- iput(prev);
+- spin_lock(&sysctl_lock);
+- }
++ rcu_read_unlock();
++ iput(prev);
+ }
+
+ /* called under sysctl_lock, will reacquire if has to wait */
+@@ -296,10 +294,10 @@ static void start_unregistering(struct c
+ p->unregistering = &wait;
+ spin_unlock(&sysctl_lock);
+ wait_for_completion(&wait);
+- spin_lock(&sysctl_lock);
+ } else {
+ /* anything non-NULL; we'll never dereference it */
+ p->unregistering = ERR_PTR(-EINVAL);
++ spin_unlock(&sysctl_lock);
+ }
+ /*
+ * Prune dentries for unregistered sysctls: namespaced sysctls
+@@ -310,6 +308,7 @@ static void start_unregistering(struct c
+ * do not remove from the list until nobody holds it; walking the
+ * list in do_sysctl() relies on that.
+ */
++ spin_lock(&sysctl_lock);
+ erase_header(p);
+ }
+
+@@ -455,11 +454,17 @@ static struct inode *proc_sys_make_inode
+ inode->i_ino = get_next_ino();
+
+ ei = PROC_I(inode);
+- ei->sysctl = head;
+- ei->sysctl_entry = table;
+
+ spin_lock(&sysctl_lock);
+- list_add(&ei->sysctl_inodes, &head->inodes);
++ if (unlikely(head->unregistering)) {
++ spin_unlock(&sysctl_lock);
++ iput(inode);
++ inode = NULL;
++ goto out;
++ }
++ ei->sysctl = head;
++ ei->sysctl_entry = table;
++ list_add_rcu(&ei->sysctl_inodes, &head->inodes);
+ head->count++;
+ spin_unlock(&sysctl_lock);
+
+@@ -487,7 +492,7 @@ out:
+ void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
+ {
+ spin_lock(&sysctl_lock);
+- list_del(&PROC_I(inode)->sysctl_inodes);
++ list_del_rcu(&PROC_I(inode)->sysctl_inodes);
+ if (!--head->count)
+ kfree_rcu(head, rcu);
+ spin_unlock(&sysctl_lock);
--- /dev/null
+From d6cffbbe9a7e51eb705182965a189457c17ba8a3 Mon Sep 17 00:00:00 2001
+From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Date: Fri, 10 Feb 2017 10:35:02 +0300
+Subject: proc/sysctl: prune stale dentries during unregistering
+
+From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+
+commit d6cffbbe9a7e51eb705182965a189457c17ba8a3 upstream.
+
+Currently unregistering sysctl table does not prune its dentries.
+Stale dentries could slowdown sysctl operations significantly.
+
+For example, command:
+
+ # for i in {1..100000} ; do unshare -n -- sysctl -a &> /dev/null ; done
+ creates a millions of stale denties around sysctls of loopback interface:
+
+ # sysctl fs.dentry-state
+ fs.dentry-state = 25812579 24724135 45 0 0 0
+
+ All of them have matching names thus lookup have to scan though whole
+ hash chain and call d_compare (proc_sys_compare) which checks them
+ under system-wide spinlock (sysctl_lock).
+
+ # time sysctl -a > /dev/null
+ real 1m12.806s
+ user 0m0.016s
+ sys 1m12.400s
+
+Currently only memory reclaimer could remove this garbage.
+But without significant memory pressure this never happens.
+
+This patch collects sysctl inodes into list on sysctl table header and
+prunes all their dentries once that table unregisters.
+
+Konstantin Khlebnikov <khlebnikov@yandex-team.ru> writes:
+> On 10.02.2017 10:47, Al Viro wrote:
+>> how about >> the matching stats *after* that patch?
+>
+> dcache size doesn't grow endlessly, so stats are fine
+>
+> # sysctl fs.dentry-state
+> fs.dentry-state = 92712 58376 45 0 0 0
+>
+> # time sysctl -a &>/dev/null
+>
+> real 0m0.013s
+> user 0m0.004s
+> sys 0m0.008s
+
+Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/proc/inode.c | 3 +-
+ fs/proc/internal.h | 7 ++++-
+ fs/proc/proc_sysctl.c | 59 +++++++++++++++++++++++++++++++++++--------------
+ include/linux/sysctl.h | 1
+ 4 files changed, 51 insertions(+), 19 deletions(-)
+
+--- a/fs/proc/inode.c
++++ b/fs/proc/inode.c
+@@ -43,10 +43,11 @@ static void proc_evict_inode(struct inod
+ de = PDE(inode);
+ if (de)
+ pde_put(de);
++
+ head = PROC_I(inode)->sysctl;
+ if (head) {
+ RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
+- sysctl_head_put(head);
++ proc_sys_evict_inode(inode, head);
+ }
+ }
+
+--- a/fs/proc/internal.h
++++ b/fs/proc/internal.h
+@@ -65,6 +65,7 @@ struct proc_inode {
+ struct proc_dir_entry *pde;
+ struct ctl_table_header *sysctl;
+ struct ctl_table *sysctl_entry;
++ struct list_head sysctl_inodes;
+ const struct proc_ns_operations *ns_ops;
+ struct inode vfs_inode;
+ };
+@@ -249,10 +250,12 @@ extern void proc_thread_self_init(void);
+ */
+ #ifdef CONFIG_PROC_SYSCTL
+ extern int proc_sys_init(void);
+-extern void sysctl_head_put(struct ctl_table_header *);
++extern void proc_sys_evict_inode(struct inode *inode,
++ struct ctl_table_header *head);
+ #else
+ static inline void proc_sys_init(void) { }
+-static inline void sysctl_head_put(struct ctl_table_header *head) { }
++static inline void proc_sys_evict_inode(struct inode *inode,
++ struct ctl_table_header *head) { }
+ #endif
+
+ /*
+--- a/fs/proc/proc_sysctl.c
++++ b/fs/proc/proc_sysctl.c
+@@ -190,6 +190,7 @@ static void init_header(struct ctl_table
+ head->set = set;
+ head->parent = NULL;
+ head->node = node;
++ INIT_LIST_HEAD(&head->inodes);
+ if (node) {
+ struct ctl_table *entry;
+ for (entry = table; entry->procname; entry++, node++)
+@@ -259,6 +260,29 @@ static void unuse_table(struct ctl_table
+ complete(p->unregistering);
+ }
+
++/* called under sysctl_lock */
++static void proc_sys_prune_dcache(struct ctl_table_header *head)
++{
++ struct inode *inode, *prev = NULL;
++ struct proc_inode *ei;
++
++ list_for_each_entry(ei, &head->inodes, sysctl_inodes) {
++ inode = igrab(&ei->vfs_inode);
++ if (inode) {
++ spin_unlock(&sysctl_lock);
++ iput(prev);
++ prev = inode;
++ d_prune_aliases(inode);
++ spin_lock(&sysctl_lock);
++ }
++ }
++ if (prev) {
++ spin_unlock(&sysctl_lock);
++ iput(prev);
++ spin_lock(&sysctl_lock);
++ }
++}
++
+ /* called under sysctl_lock, will reacquire if has to wait */
+ static void start_unregistering(struct ctl_table_header *p)
+ {
+@@ -278,27 +302,17 @@ static void start_unregistering(struct c
+ p->unregistering = ERR_PTR(-EINVAL);
+ }
+ /*
++ * Prune dentries for unregistered sysctls: namespaced sysctls
++ * can have duplicate names and contaminate dcache very badly.
++ */
++ proc_sys_prune_dcache(p);
++ /*
+ * do not remove from the list until nobody holds it; walking the
+ * list in do_sysctl() relies on that.
+ */
+ erase_header(p);
+ }
+
+-static void sysctl_head_get(struct ctl_table_header *head)
+-{
+- spin_lock(&sysctl_lock);
+- head->count++;
+- spin_unlock(&sysctl_lock);
+-}
+-
+-void sysctl_head_put(struct ctl_table_header *head)
+-{
+- spin_lock(&sysctl_lock);
+- if (!--head->count)
+- kfree_rcu(head, rcu);
+- spin_unlock(&sysctl_lock);
+-}
+-
+ static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
+ {
+ BUG_ON(!head);
+@@ -440,11 +454,15 @@ static struct inode *proc_sys_make_inode
+
+ inode->i_ino = get_next_ino();
+
+- sysctl_head_get(head);
+ ei = PROC_I(inode);
+ ei->sysctl = head;
+ ei->sysctl_entry = table;
+
++ spin_lock(&sysctl_lock);
++ list_add(&ei->sysctl_inodes, &head->inodes);
++ head->count++;
++ spin_unlock(&sysctl_lock);
++
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mode = table->mode;
+ if (!S_ISDIR(table->mode)) {
+@@ -466,6 +484,15 @@ out:
+ return inode;
+ }
+
++void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
++{
++ spin_lock(&sysctl_lock);
++ list_del(&PROC_I(inode)->sysctl_inodes);
++ if (!--head->count)
++ kfree_rcu(head, rcu);
++ spin_unlock(&sysctl_lock);
++}
++
+ static struct ctl_table_header *grab_header(struct inode *inode)
+ {
+ struct ctl_table_header *head = PROC_I(inode)->sysctl;
+--- a/include/linux/sysctl.h
++++ b/include/linux/sysctl.h
+@@ -143,6 +143,7 @@ struct ctl_table_header
+ struct ctl_table_set *set;
+ struct ctl_dir *parent;
+ struct ctl_node *node;
++ struct list_head inodes; /* head for proc_inode->sysctl_inodes */
+ };
+
+ struct ctl_dir {
make-sure-that-__dentry_kill-always-invalidates-d_seq-unhashed-or-not.patch
fix-mntput-mntput-race.patch
fix-__legitimize_mnt-mntput-race.patch
+proc-sysctl-prune-stale-dentries-during-unregistering.patch
+proc-sysctl-don-t-grab-i_lock-under-sysctl_lock.patch
+proc-fix-proc_sys_prune_dcache-to-hold-a-sb-reference.patch