--- /dev/null
+From e93ad19d05648397ef3bcb838d26aec06c245dc0 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Tue, 19 Jan 2016 12:18:41 -0500
+Subject: cpuset: make mm migration asynchronous
+
+From: Tejun Heo <tj@kernel.org>
+
+commit e93ad19d05648397ef3bcb838d26aec06c245dc0 upstream.
+
+If "cpuset.memory_migrate" is set, when a process is moved from one
+cpuset to another with a different memory node mask, pages in use by
+the process are migrated to the new set of nodes. This was performed
+synchronously in the ->attach() callback, which is synchronized
+against process management. Recently, the synchronization was changed
+from per-process rwsem to global percpu rwsem for simplicity and
+optimization.
+
+Combined with the synchronous mm migration, this led to deadlocks:
+mm migration could schedule a work item, which could in turn try to
+create a new worker and block on the process management lock already
+held by the cgroup process migration path.
+
+Such a heavy operation shouldn't be performed synchronously that deep
+inside cgroup migration in the first place. This patch punts the
+actual migration to an ordered workqueue and updates the cgroup process
+migration and cpuset config update paths to flush the workqueue after
+all locks are released. This way, the operations still appear
+synchronous to userland without entangling mm migration with process
+management synchronization. CPU hotplug can also invoke mm migration,
+but there is no reason for it to wait for the migrations, so it does
+not synchronize against their completion.
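+
+In condensed form (the full context is in the diff below), the heavy
+do_migrate_pages() call moves into a queued work item, and the paths
+that need the old synchronous behavior flush the workqueue only after
+their locks are dropped:
+
+        /* cpuset_migrate_mm(): runs with cgroup/cpuset locks held - only queue */
+        INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
+        queue_work(cpuset_migrate_mm_wq, &mwork->work);
+
+        /* cpuset_post_attach_flush(): called after all locks are released */
+        flush_workqueue(cpuset_migrate_mm_wq);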
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Reported-and-tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/cpuset.h | 6 ++++
+ kernel/cgroup.c | 3 +-
+ kernel/cpuset.c | 71 +++++++++++++++++++++++++++++++++----------------
+ 3 files changed, 57 insertions(+), 23 deletions(-)
+
+--- a/include/linux/cpuset.h
++++ b/include/linux/cpuset.h
+@@ -137,6 +137,8 @@ static inline void set_mems_allowed(node
+ task_unlock(current);
+ }
+
++extern void cpuset_post_attach_flush(void);
++
+ #else /* !CONFIG_CPUSETS */
+
+ static inline bool cpusets_enabled(void) { return false; }
+@@ -243,6 +245,10 @@ static inline bool read_mems_allowed_ret
+ return false;
+ }
+
++static inline void cpuset_post_attach_flush(void)
++{
++}
++
+ #endif /* !CONFIG_CPUSETS */
+
+ #endif /* _LINUX_CPUSET_H */
+--- a/kernel/cgroup.c
++++ b/kernel/cgroup.c
+@@ -57,7 +57,7 @@
+ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+ #include <linux/kthread.h>
+ #include <linux/delay.h>
+-
++#include <linux/cpuset.h>
+ #include <linux/atomic.h>
+
+ /*
+@@ -2764,6 +2764,7 @@ out_unlock_rcu:
+ out_unlock_threadgroup:
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_kn_unlock(of->kn);
++ cpuset_post_attach_flush();
+ return ret ?: nbytes;
+ }
+
+--- a/kernel/cpuset.c
++++ b/kernel/cpuset.c
+@@ -286,6 +286,8 @@ static struct cpuset top_cpuset = {
+ static DEFINE_MUTEX(cpuset_mutex);
+ static DEFINE_SPINLOCK(callback_lock);
+
++static struct workqueue_struct *cpuset_migrate_mm_wq;
++
+ /*
+ * CPU / memory hotplug is handled asynchronously.
+ */
+@@ -971,31 +973,51 @@ static int update_cpumask(struct cpuset
+ }
+
+ /*
+- * cpuset_migrate_mm
+- *
+- * Migrate memory region from one set of nodes to another.
+- *
+- * Temporarilly set tasks mems_allowed to target nodes of migration,
+- * so that the migration code can allocate pages on these nodes.
+- *
+- * While the mm_struct we are migrating is typically from some
+- * other task, the task_struct mems_allowed that we are hacking
+- * is for our current task, which must allocate new pages for that
+- * migrating memory region.
++ * Migrate memory region from one set of nodes to another. This is
++ * performed asynchronously as it can be called from process migration path
++ * holding locks involved in process management. All mm migrations are
++ * performed in the queued order and can be waited for by flushing
++ * cpuset_migrate_mm_wq.
+ */
+
++struct cpuset_migrate_mm_work {
++ struct work_struct work;
++ struct mm_struct *mm;
++ nodemask_t from;
++ nodemask_t to;
++};
++
++static void cpuset_migrate_mm_workfn(struct work_struct *work)
++{
++ struct cpuset_migrate_mm_work *mwork =
++ container_of(work, struct cpuset_migrate_mm_work, work);
++
++ /* on a wq worker, no need to worry about %current's mems_allowed */
++ do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
++ mmput(mwork->mm);
++ kfree(mwork);
++}
++
+ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
+ const nodemask_t *to)
+ {
+- struct task_struct *tsk = current;
+-
+- tsk->mems_allowed = *to;
++ struct cpuset_migrate_mm_work *mwork;
+
+- do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
++ mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
++ if (mwork) {
++ mwork->mm = mm;
++ mwork->from = *from;
++ mwork->to = *to;
++ INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
++ queue_work(cpuset_migrate_mm_wq, &mwork->work);
++ } else {
++ mmput(mm);
++ }
++}
+
+- rcu_read_lock();
+- guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
+- rcu_read_unlock();
++void cpuset_post_attach_flush(void)
++{
++ flush_workqueue(cpuset_migrate_mm_wq);
+ }
+
+ /*
+@@ -1096,7 +1118,8 @@ static void update_tasks_nodemask(struct
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate)
+ cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
+- mmput(mm);
++ else
++ mmput(mm);
+ }
+ css_task_iter_end(&it);
+
+@@ -1541,11 +1564,11 @@ static void cpuset_attach(struct cgroup_
+ * @old_mems_allowed is the right nodesets that we
+ * migrate mm from.
+ */
+- if (is_memory_migrate(cs)) {
++ if (is_memory_migrate(cs))
+ cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
+ &cpuset_attach_nodemask_to);
+- }
+- mmput(mm);
++ else
++ mmput(mm);
+ }
+ }
+
+@@ -1710,6 +1733,7 @@ out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ kernfs_unbreak_active_protection(of->kn);
+ css_put(&cs->css);
++ flush_workqueue(cpuset_migrate_mm_wq);
+ return retval ?: nbytes;
+ }
+
+@@ -2355,6 +2379,9 @@ void __init cpuset_init_smp(void)
+ top_cpuset.effective_mems = node_states[N_MEMORY];
+
+ register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
++
++ cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
++ BUG_ON(!cpuset_migrate_mm_wq);
+ }
+
+ /**
--- /dev/null
+From 4ae2182b1e3407de369f8c5d799543b7db74221b Mon Sep 17 00:00:00 2001
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Mon, 25 Jan 2016 10:08:00 -0600
+Subject: PCI/AER: Flush workqueue on device remove to avoid use-after-free
+
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+commit 4ae2182b1e3407de369f8c5d799543b7db74221b upstream.
+
+A Root Port's AER structure (rpc) contains a queue of events. aer_irq()
+enqueues AER status information and schedules aer_isr() to dequeue and
+process it. When we remove a device, aer_remove() waits for the queue to
+be empty, then frees the rpc struct.
+
+But aer_isr() references the rpc struct after dequeueing and possibly
+emptying the queue, which can cause a use-after-free error as in the
+following scenario with two threads, aer_isr() on the left and a
+concurrent aer_remove() on the right:
+
+ Thread A Thread B
+ -------- --------
+ aer_irq():
+ rpc->prod_idx++
+ aer_remove():
+ wait_event(rpc->prod_idx == rpc->cons_idx)
+ # now blocked until queue becomes empty
+ aer_isr(): # ...
+ rpc->cons_idx++ # unblocked because queue is now empty
+ ... kfree(rpc)
+ mutex_unlock(&rpc->rpc_mutex)
+
+To prevent this problem, use flush_work() to wait until the last scheduled
+instance of aer_isr() has completed before freeing the rpc struct in
+aer_remove().
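+
+In condensed form (see the diff below for the exact context), the
+remove path becomes:
+
+        flush_work(&rpc->dpc_handler);  /* wait for any queued/running aer_isr() */
+        aer_disable_rootport(rpc);
+        kfree(rpc);                     /* safe: aer_isr() can no longer touch rpc */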
+
+I reproduced this use-after-free by flashing a device FPGA and
+re-enumerating the bus to find the new device. With SLUB debug, this
+crashes with 0x6b bytes (POISON_FREE, the use-after-free magic number) in
+GPR25:
+
+ pcieport 0000:00:00.0: AER: Multiple Corrected error received: id=0000
+ Unable to handle kernel paging request for data at address 0x27ef9e3e
+ Workqueue: events aer_isr
+ GPR24: dd6aa000 6b6b6b6b 605f8378 605f8360 d99b12c0 604fc674 606b1704 d99b12c0
+ NIP [602f5328] pci_walk_bus+0xd4/0x104
+
+[bhelgaas: changelog, stable tag]
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/pci/pcie/aer/aerdrv.c | 4 +---
+ drivers/pci/pcie/aer/aerdrv.h | 1 -
+ drivers/pci/pcie/aer/aerdrv_core.c | 2 --
+ 3 files changed, 1 insertion(+), 6 deletions(-)
+
+--- a/drivers/pci/pcie/aer/aerdrv.c
++++ b/drivers/pci/pcie/aer/aerdrv.c
+@@ -262,7 +262,6 @@ static struct aer_rpc *aer_alloc_rpc(str
+ rpc->rpd = dev;
+ INIT_WORK(&rpc->dpc_handler, aer_isr);
+ mutex_init(&rpc->rpc_mutex);
+- init_waitqueue_head(&rpc->wait_release);
+
+ /* Use PCIe bus function to store rpc into PCIe device */
+ set_service_data(dev, rpc);
+@@ -285,8 +284,7 @@ static void aer_remove(struct pcie_devic
+ if (rpc->isr)
+ free_irq(dev->irq, dev);
+
+- wait_event(rpc->wait_release, rpc->prod_idx == rpc->cons_idx);
+-
++ flush_work(&rpc->dpc_handler);
+ aer_disable_rootport(rpc);
+ kfree(rpc);
+ set_service_data(dev, NULL);
+--- a/drivers/pci/pcie/aer/aerdrv.h
++++ b/drivers/pci/pcie/aer/aerdrv.h
+@@ -72,7 +72,6 @@ struct aer_rpc {
+ * recovery on the same
+ * root port hierarchy
+ */
+- wait_queue_head_t wait_release;
+ };
+
+ struct aer_broadcast_data {
+--- a/drivers/pci/pcie/aer/aerdrv_core.c
++++ b/drivers/pci/pcie/aer/aerdrv_core.c
+@@ -811,8 +811,6 @@ void aer_isr(struct work_struct *work)
+ while (get_e_source(rpc, &e_src))
+ aer_isr_one_error(p_device, &e_src);
+ mutex_unlock(&rpc->rpc_mutex);
+-
+- wake_up(&rpc->wait_release);
+ }
+
+ /**