From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 1 Mar 2016 22:03:53 +0000 (-0800)
Subject: 4.4-stable patches
X-Git-Tag: v3.10.99~9
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=08cae923c52e2aa4f5195c7c8681fab5e0ec2168;p=thirdparty%2Fkernel%2Fstable-queue.git

4.4-stable patches

added patches:
	cpuset-make-mm-migration-asynchronous.patch
	pci-aer-flush-workqueue-on-device-remove-to-avoid-use-after-free.patch
---

diff --git a/queue-4.4/cpuset-make-mm-migration-asynchronous.patch b/queue-4.4/cpuset-make-mm-migration-asynchronous.patch
new file mode 100644
index 00000000000..80b4c0fb76b
--- /dev/null
+++ b/queue-4.4/cpuset-make-mm-migration-asynchronous.patch
@@ -0,0 +1,207 @@
+From e93ad19d05648397ef3bcb838d26aec06c245dc0 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Tue, 19 Jan 2016 12:18:41 -0500
+Subject: cpuset: make mm migration asynchronous
+
+From: Tejun Heo <tj@kernel.org>
+
+commit e93ad19d05648397ef3bcb838d26aec06c245dc0 upstream.
+
+If "cpuset.memory_migrate" is set, when a process is moved from one
+cpuset to another with a different memory node mask, pages used by
+the process are migrated to the new set of nodes.  This was performed
+synchronously in the ->attach() callback, which is synchronized
+against process management.  Recently, the synchronization was changed
+from a per-process rwsem to a global percpu rwsem for simplicity and
+optimization.
+
+Combined with the synchronous mm migration, this led to deadlocks
+because mm migration could schedule a work item which may in turn try
+to create a new worker blocking on the process management lock held
+from the cgroup process migration path.
+
+Such a heavy operation shouldn't be performed synchronously from that
+deep inside cgroup migration in the first place.  This patch punts the
+actual migration to an ordered workqueue and updates the cgroup process
+migration and cpuset config update paths to flush the workqueue after
+all locks are released.  This way, the operations still appear
+synchronous to userland without entangling mm migration with process
+management synchronization.  CPU hotplug can also invoke mm migration,
+but there's no reason for it to wait for mm migrations, so it doesn't
+synchronize against their completion.
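The following is a minimal sketch of the punt-then-flush pattern the
changelog describes.  It is illustrative only and not part of the patch:
every demo_* name is hypothetical, and the real implementation is the
kernel/cpuset.c hunk further down.

  #include <linux/errno.h>
  #include <linux/init.h>
  #include <linux/kernel.h>
  #include <linux/printk.h>
  #include <linux/slab.h>
  #include <linux/workqueue.h>

  /* ordered workqueue: items execute one at a time, in queueing order */
  static struct workqueue_struct *demo_wq;

  struct demo_work {
  	struct work_struct	work;
  	int			arg;	/* snapshot of what the deferred op needs */
  };

  static void demo_workfn(struct work_struct *work)
  {
  	struct demo_work *dw = container_of(work, struct demo_work, work);

  	/* runs on a kworker with none of the caller's locks held */
  	pr_info("demo: heavy operation for arg=%d\n", dw->arg);
  	kfree(dw);
  }

  /* called with management locks held: only queue, never block */
  static void demo_punt(int arg)
  {
  	struct demo_work *dw = kzalloc(sizeof(*dw), GFP_KERNEL);

  	if (!dw)
  		return;
  	dw->arg = arg;
  	INIT_WORK(&dw->work, demo_workfn);
  	queue_work(demo_wq, &dw->work);
  }

  /* called after all locks are released: userland still sees the
   * operation complete synchronously */
  static void demo_flush(void)
  {
  	flush_workqueue(demo_wq);
  }

  static int __init demo_init(void)
  {
  	demo_wq = alloc_ordered_workqueue("demo_wq", 0);
  	return demo_wq ? 0 : -ENOMEM;
  }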
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Reported-and-tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/cpuset.h |    6 ++++++
+ kernel/cgroup.c        |    3 ++-
+ kernel/cpuset.c        |   71 +++++++++++++++++++++++++++++++++----------------
+ 3 files changed, 57 insertions(+), 23 deletions(-)
+
+--- a/include/linux/cpuset.h
++++ b/include/linux/cpuset.h
+@@ -137,6 +137,8 @@ static inline void set_mems_allowed(node
+ 	task_unlock(current);
+ }
+ 
++extern void cpuset_post_attach_flush(void);
++
+ #else /* !CONFIG_CPUSETS */
+ 
+ static inline bool cpusets_enabled(void) { return false; }
+@@ -243,6 +245,10 @@ static inline bool read_mems_allowed_ret
+ 	return false;
+ }
+ 
++static inline void cpuset_post_attach_flush(void)
++{
++}
++
+ #endif /* !CONFIG_CPUSETS */
+ 
+ #endif /* _LINUX_CPUSET_H */
+--- a/kernel/cgroup.c
++++ b/kernel/cgroup.c
+@@ -57,7 +57,7 @@
+ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+ #include <linux/kthread.h>
+ #include <linux/delay.h>
+-
++#include <linux/cpuset.h>
+ #include <linux/atomic.h>
+ 
+ /*
+@@ -2764,6 +2764,7 @@ out_unlock_rcu:
+ out_unlock_threadgroup:
+ 	percpu_up_write(&cgroup_threadgroup_rwsem);
+ 	cgroup_kn_unlock(of->kn);
++	cpuset_post_attach_flush();
+ 	return ret ?: nbytes;
+ }
+ 
+--- a/kernel/cpuset.c
++++ b/kernel/cpuset.c
+@@ -286,6 +286,8 @@ static struct cpuset top_cpuset = {
+ static DEFINE_MUTEX(cpuset_mutex);
+ static DEFINE_SPINLOCK(callback_lock);
+ 
++static struct workqueue_struct *cpuset_migrate_mm_wq;
++
+ /*
+  * CPU / memory hotplug is handled asynchronously.
+  */
+@@ -971,31 +973,51 @@ static int update_cpumask(struct cpuset
+ }
+ 
+ /*
+- * cpuset_migrate_mm
+- *
+- * Migrate memory region from one set of nodes to another.
+- *
+- * Temporarilly set tasks mems_allowed to target nodes of migration,
+- * so that the migration code can allocate pages on these nodes.
+- *
+- * While the mm_struct we are migrating is typically from some
+- * other task, the task_struct mems_allowed that we are hacking
+- * is for our current task, which must allocate new pages for that
+- * migrating memory region.
++ * Migrate memory region from one set of nodes to another.  This is
++ * performed asynchronously as it can be called from process migration path
++ * holding locks involved in process management.  All mm migrations are
++ * performed in the queued order and can be waited for by flushing
++ * cpuset_migrate_mm_wq.
+  */
+ 
++struct cpuset_migrate_mm_work {
++	struct work_struct	work;
++	struct mm_struct	*mm;
++	nodemask_t		from;
++	nodemask_t		to;
++};
++
++static void cpuset_migrate_mm_workfn(struct work_struct *work)
++{
++	struct cpuset_migrate_mm_work *mwork =
++		container_of(work, struct cpuset_migrate_mm_work, work);
++
++	/* on a wq worker, no need to worry about %current's mems_allowed */
++	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
++	mmput(mwork->mm);
++	kfree(mwork);
++}
++
+ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
+ 							const nodemask_t *to)
+ {
+-	struct task_struct *tsk = current;
+-
+-	tsk->mems_allowed = *to;
++	struct cpuset_migrate_mm_work *mwork;
+ 
+-	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
++	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
++	if (mwork) {
++		mwork->mm = mm;
++		mwork->from = *from;
++		mwork->to = *to;
++		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
++		queue_work(cpuset_migrate_mm_wq, &mwork->work);
++	} else {
++		mmput(mm);
++	}
++}
+ 
+-	rcu_read_lock();
+-	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
+-	rcu_read_unlock();
++void cpuset_post_attach_flush(void)
++{
++	flush_workqueue(cpuset_migrate_mm_wq);
+ }
+ 
+ /*
+@@ -1096,7 +1118,8 @@ static void update_tasks_nodemask(struct
+ 		mpol_rebind_mm(mm, &cs->mems_allowed);
+ 		if (migrate)
+ 			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
+-		mmput(mm);
++		else
++			mmput(mm);
+ 	}
+ 	css_task_iter_end(&it);
+ 
+@@ -1541,11 +1564,11 @@ static void cpuset_attach(struct cgroup_
+ 			 * @old_mems_allowed is the right nodesets that we
+ 			 * migrate mm from.
+ 			 */
+-			if (is_memory_migrate(cs)) {
++			if (is_memory_migrate(cs))
+ 				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
+ 						  &cpuset_attach_nodemask_to);
+-			}
+-			mmput(mm);
++			else
++				mmput(mm);
+ 		}
+ 	}
+ 
+@@ -1710,6 +1733,7 @@ out_unlock:
+ 	mutex_unlock(&cpuset_mutex);
+ 	kernfs_unbreak_active_protection(of->kn);
+ 	css_put(&cs->css);
++	flush_workqueue(cpuset_migrate_mm_wq);
+ 	return retval ?: nbytes;
+ }
+ 
+@@ -2355,6 +2379,9 @@ void __init cpuset_init_smp(void)
+ 	top_cpuset.effective_mems = node_states[N_MEMORY];
+ 
+ 	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
++
++	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
++	BUG_ON(!cpuset_migrate_mm_wq);
+ }
+ 
+ /**
diff --git a/queue-4.4/pci-aer-flush-workqueue-on-device-remove-to-avoid-use-after-free.patch b/queue-4.4/pci-aer-flush-workqueue-on-device-remove-to-avoid-use-after-free.patch
new file mode 100644
index 00000000000..a085dffad89
--- /dev/null
+++ b/queue-4.4/pci-aer-flush-workqueue-on-device-remove-to-avoid-use-after-free.patch
@@ -0,0 +1,98 @@
+From 4ae2182b1e3407de369f8c5d799543b7db74221b Mon Sep 17 00:00:00 2001
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Mon, 25 Jan 2016 10:08:00 -0600
+Subject: PCI/AER: Flush workqueue on device remove to avoid use-after-free
+
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+commit 4ae2182b1e3407de369f8c5d799543b7db74221b upstream.
+
+A Root Port's AER structure (rpc) contains a queue of events.  aer_irq()
+enqueues AER status information and schedules aer_isr() to dequeue and
+process it.  When we remove a device, aer_remove() waits for the queue to
+be empty, then frees the rpc struct.
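The teardown described above can be sketched as follows.  This is
illustrative only, not the driver source: prod_idx, cons_idx, wait_release
and dpc_handler mirror fields visible in the diff below, everything else is
simplified.  Note that index equality only proves the queue is empty, not
that the worker has finished touching rpc, which is exactly the race the
next paragraph walks through.

  #include <linux/slab.h>
  #include <linux/wait.h>
  #include <linux/workqueue.h>

  struct demo_rpc {
  	unsigned int		prod_idx;	/* bumped by the IRQ handler */
  	unsigned int		cons_idx;	/* bumped by the worker */
  	wait_queue_head_t	wait_release;	/* woken when the queue drains */
  	struct work_struct	dpc_handler;	/* executes the worker function */
  };

  /* pre-patch removal path: wait for the queue to drain, then free */
  static void demo_remove(struct demo_rpc *rpc)
  {
  	wait_event(rpc->wait_release, rpc->prod_idx == rpc->cons_idx);
  	kfree(rpc);	/* the worker may still be dereferencing rpc */
  }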
+
+But aer_isr() references the rpc struct after dequeueing and possibly
+emptying the queue, which can cause a use-after-free error as in the
+following scenario with two threads, aer_isr() on the left and a
+concurrent aer_remove() on the right:
+
+  Thread A                         Thread B
+  --------                         --------
+  aer_irq():
+    rpc->prod_idx++
+                                   aer_remove():
+                                     wait_event(rpc->prod_idx == rpc->cons_idx)
+                                     # now blocked until queue becomes empty
+  aer_isr():                         # ...
+    rpc->cons_idx++                  # unblocked because queue is now empty
+    ...                              kfree(rpc)
+    mutex_unlock(&rpc->rpc_mutex)
+
+To prevent this problem, use flush_work() to wait until the last scheduled
+instance of aer_isr() has completed before freeing the rpc struct in
+aer_remove().
+
+I reproduced this use-after-free by flashing a device FPGA and
+re-enumerating the bus to find the new device.  With SLUB debug, this
+crashes with 0x6b bytes (POISON_FREE, the use-after-free magic number) in
+GPR25:
+
+  pcieport 0000:00:00.0: AER: Multiple Corrected error received: id=0000
+  Unable to handle kernel paging request for data at address 0x27ef9e3e
+  Workqueue: events aer_isr
+  GPR24: dd6aa000 6b6b6b6b 605f8378 605f8360 d99b12c0 604fc674 606b1704 d99b12c0
+  NIP [602f5328] pci_walk_bus+0xd4/0x104
+
+[bhelgaas: changelog, stable tag]
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/pci/pcie/aer/aerdrv.c      |    4 +---
+ drivers/pci/pcie/aer/aerdrv.h      |    1 -
+ drivers/pci/pcie/aer/aerdrv_core.c |    2 --
+ 3 files changed, 1 insertion(+), 6 deletions(-)
+
+--- a/drivers/pci/pcie/aer/aerdrv.c
++++ b/drivers/pci/pcie/aer/aerdrv.c
+@@ -262,7 +262,6 @@ static struct aer_rpc *aer_alloc_rpc(str
+ 	rpc->rpd = dev;
+ 	INIT_WORK(&rpc->dpc_handler, aer_isr);
+ 	mutex_init(&rpc->rpc_mutex);
+-	init_waitqueue_head(&rpc->wait_release);
+ 
+ 	/* Use PCIe bus function to store rpc into PCIe device */
+ 	set_service_data(dev, rpc);
+@@ -285,8 +284,7 @@ static void aer_remove(struct pcie_devic
+ 	if (rpc->isr)
+ 		free_irq(dev->irq, dev);
+ 
+-	wait_event(rpc->wait_release, rpc->prod_idx == rpc->cons_idx);
+-
++	flush_work(&rpc->dpc_handler);
+ 	aer_disable_rootport(rpc);
+ 	kfree(rpc);
+ 	set_service_data(dev, NULL);
+--- a/drivers/pci/pcie/aer/aerdrv.h
++++ b/drivers/pci/pcie/aer/aerdrv.h
+@@ -72,7 +72,6 @@ struct aer_rpc {
+ 					 *  recovery on the same
+ 					 *  root port hierarchy
+ 					 */
+-	wait_queue_head_t wait_release;
+ };
+ 
+ struct aer_broadcast_data {
+--- a/drivers/pci/pcie/aer/aerdrv_core.c
++++ b/drivers/pci/pcie/aer/aerdrv_core.c
+@@ -811,8 +811,6 @@ void aer_isr(struct work_struct *work)
+ 	while (get_e_source(rpc, &e_src))
+ 		aer_isr_one_error(p_device, &e_src);
+ 	mutex_unlock(&rpc->rpc_mutex);
+-
+-	wake_up(&rpc->wait_release);
+ }
+ 
+ /**
diff --git a/queue-4.4/series b/queue-4.4/series
index a55467c7ed3..11889c19809 100644
--- a/queue-4.4/series
+++ b/queue-4.4/series
@@ -286,3 +286,5 @@ qla2xxx-fix-stale-pointer-access.patch
 libata-fix-sff-host-state-machine-locking-while-polling.patch
 arcv2-star-9000950267-handle-return-from-intr-to-delay-slot-2.patch
 arcv2-smp-emulate-ipi-to-self-using-software-triggered-interrupt.patch
+pci-aer-flush-workqueue-on-device-remove-to-avoid-use-after-free.patch
+cpuset-make-mm-migration-asynchronous.patch
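For contrast with the earlier sketch, here is a minimal sketch of the
corrected teardown pattern that both patches in this commit rely on: wait
for the work item (or the whole workqueue) to finish before freeing
anything the worker can still touch.  demo_rpc is the hypothetical struct
from the sketch above, not the driver's real type.

  /*
   * Fixed removal path: flush_work() returns only after the last queued
   * or running instance of dpc_handler has completed, so freeing rpc can
   * no longer race with the worker.
   */
  static void demo_remove_fixed(struct demo_rpc *rpc)
  {
  	flush_work(&rpc->dpc_handler);
  	kfree(rpc);
  }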