--- /dev/null
+From e93ad19d05648397ef3bcb838d26aec06c245dc0 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Tue, 19 Jan 2016 12:18:41 -0500
+Subject: cpuset: make mm migration asynchronous
+
+From: Tejun Heo <tj@kernel.org>
+
+commit e93ad19d05648397ef3bcb838d26aec06c245dc0 upstream.
+
+If "cpuset.memory_migrate" is set, when a process is moved from one
+cpuset to another with a different memory node mask, pages in use by
+the process are migrated to the new set of nodes. This was performed
+synchronously in the ->attach() callback, which is synchronized
+against process management. Recently, the synchronization was changed
+from per-process rwsem to global percpu rwsem for simplicity and
+optimization.
+
+Combined with the synchronous mm migration, this led to deadlocks:
+mm migration could schedule a work item, which could in turn try to
+create a new worker and block on the process management lock already
+held by the cgroup process migration path.
+
+Such a heavy operation shouldn't be performed synchronously that deep
+inside cgroup migration in the first place. This patch punts the
+actual migration to an ordered workqueue and updates the cgroup process
+migration and cpuset config update paths to flush the workqueue after
+all locks are released. This way, the operations still appear
+synchronous to userland without entangling mm migration with process
+management synchronization. CPU hotplug can also invoke mm migration,
+but there is no reason for it to wait for the migrations, so it does
+not synchronize against their completion.
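+
+In condensed form (the full context is in the diff below), the heavy
+do_migrate_pages() call moves into a queued work item, and the paths
+that need the old synchronous behavior flush the workqueue only after
+their locks are dropped:
+
+        /* cpuset_migrate_mm(): runs with cgroup/cpuset locks held - only queue */
+        INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
+        queue_work(cpuset_migrate_mm_wq, &mwork->work);
+
+        /* cpuset_post_attach_flush(): called after all locks are released */
+        flush_workqueue(cpuset_migrate_mm_wq);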
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Reported-and-tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/cpuset.h | 6 ++++
+ kernel/cgroup.c | 3 +-
+ kernel/cpuset.c | 71 +++++++++++++++++++++++++++++++++----------------
+ 3 files changed, 57 insertions(+), 23 deletions(-)
+
+--- a/include/linux/cpuset.h
++++ b/include/linux/cpuset.h
+@@ -137,6 +137,8 @@ static inline void set_mems_allowed(node
+ task_unlock(current);
+ }
+
++extern void cpuset_post_attach_flush(void);
++
+ #else /* !CONFIG_CPUSETS */
+
+ static inline bool cpusets_enabled(void) { return false; }
+@@ -243,6 +245,10 @@ static inline bool read_mems_allowed_ret
+ return false;
+ }
+
++static inline void cpuset_post_attach_flush(void)
++{
++}
++
+ #endif /* !CONFIG_CPUSETS */
+
+ #endif /* _LINUX_CPUSET_H */
+--- a/kernel/cgroup.c
++++ b/kernel/cgroup.c
+@@ -57,7 +57,7 @@
+ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+ #include <linux/kthread.h>
+ #include <linux/delay.h>
+-
++#include <linux/cpuset.h>
+ #include <linux/atomic.h>
+
+ /*
+@@ -2764,6 +2764,7 @@ out_unlock_rcu:
+ out_unlock_threadgroup:
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_kn_unlock(of->kn);
++ cpuset_post_attach_flush();
+ return ret ?: nbytes;
+ }
+
+--- a/kernel/cpuset.c
++++ b/kernel/cpuset.c
+@@ -286,6 +286,8 @@ static struct cpuset top_cpuset = {
+ static DEFINE_MUTEX(cpuset_mutex);
+ static DEFINE_SPINLOCK(callback_lock);
+
++static struct workqueue_struct *cpuset_migrate_mm_wq;
++
+ /*
+ * CPU / memory hotplug is handled asynchronously.
+ */
+@@ -971,31 +973,51 @@ static int update_cpumask(struct cpuset
+ }
+
+ /*
+- * cpuset_migrate_mm
+- *
+- * Migrate memory region from one set of nodes to another.
+- *
+- * Temporarilly set tasks mems_allowed to target nodes of migration,
+- * so that the migration code can allocate pages on these nodes.
+- *
+- * While the mm_struct we are migrating is typically from some
+- * other task, the task_struct mems_allowed that we are hacking
+- * is for our current task, which must allocate new pages for that
+- * migrating memory region.
++ * Migrate memory region from one set of nodes to another. This is
++ * performed asynchronously as it can be called from process migration path
++ * holding locks involved in process management. All mm migrations are
++ * performed in the queued order and can be waited for by flushing
++ * cpuset_migrate_mm_wq.
+ */
+
++struct cpuset_migrate_mm_work {
++ struct work_struct work;
++ struct mm_struct *mm;
++ nodemask_t from;
++ nodemask_t to;
++};
++
++static void cpuset_migrate_mm_workfn(struct work_struct *work)
++{
++ struct cpuset_migrate_mm_work *mwork =
++ container_of(work, struct cpuset_migrate_mm_work, work);
++
++ /* on a wq worker, no need to worry about %current's mems_allowed */
++ do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
++ mmput(mwork->mm);
++ kfree(mwork);
++}
++
+ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
+ const nodemask_t *to)
+ {
+- struct task_struct *tsk = current;
+-
+- tsk->mems_allowed = *to;
++ struct cpuset_migrate_mm_work *mwork;
+
+- do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
++ mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
++ if (mwork) {
++ mwork->mm = mm;
++ mwork->from = *from;
++ mwork->to = *to;
++ INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
++ queue_work(cpuset_migrate_mm_wq, &mwork->work);
++ } else {
++ mmput(mm);
++ }
++}
+
+- rcu_read_lock();
+- guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
+- rcu_read_unlock();
++void cpuset_post_attach_flush(void)
++{
++ flush_workqueue(cpuset_migrate_mm_wq);
+ }
+
+ /*
+@@ -1096,7 +1118,8 @@ static void update_tasks_nodemask(struct
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate)
+ cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
+- mmput(mm);
++ else
++ mmput(mm);
+ }
+ css_task_iter_end(&it);
+
+@@ -1541,11 +1564,11 @@ static void cpuset_attach(struct cgroup_
+ * @old_mems_allowed is the right nodesets that we
+ * migrate mm from.
+ */
+- if (is_memory_migrate(cs)) {
++ if (is_memory_migrate(cs))
+ cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
+ &cpuset_attach_nodemask_to);
+- }
+- mmput(mm);
++ else
++ mmput(mm);
+ }
+ }
+
+@@ -1710,6 +1733,7 @@ out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ kernfs_unbreak_active_protection(of->kn);
+ css_put(&cs->css);
++ flush_workqueue(cpuset_migrate_mm_wq);
+ return retval ?: nbytes;
+ }
+
+@@ -2355,6 +2379,9 @@ void __init cpuset_init_smp(void)
+ top_cpuset.effective_mems = node_states[N_MEMORY];
+
+ register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
++
++ cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
++ BUG_ON(!cpuset_migrate_mm_wq);
+ }
+
+ /**
--- /dev/null
+From 4ae2182b1e3407de369f8c5d799543b7db74221b Mon Sep 17 00:00:00 2001
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Mon, 25 Jan 2016 10:08:00 -0600
+Subject: PCI/AER: Flush workqueue on device remove to avoid use-after-free
+
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+commit 4ae2182b1e3407de369f8c5d799543b7db74221b upstream.
+
+A Root Port's AER structure (rpc) contains a queue of events. aer_irq()
+enqueues AER status information and schedules aer_isr() to dequeue and
+process it. When we remove a device, aer_remove() waits for the queue to
+be empty, then frees the rpc struct.
+
+But aer_isr() references the rpc struct after dequeueing and possibly
+emptying the queue, which can cause a use-after-free error as in the
+following scenario with two threads, aer_isr() on the left and a
+concurrent aer_remove() on the right:
+
+ Thread A Thread B
+ -------- --------
+ aer_irq():
+ rpc->prod_idx++
+ aer_remove():
+ wait_event(rpc->prod_idx == rpc->cons_idx)
+ # now blocked until queue becomes empty
+ aer_isr(): # ...
+ rpc->cons_idx++ # unblocked because queue is now empty
+ ... kfree(rpc)
+ mutex_unlock(&rpc->rpc_mutex)
+
+To prevent this problem, use flush_work() to wait until the last scheduled
+instance of aer_isr() has completed before freeing the rpc struct in
+aer_remove().
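+
+In condensed form (see the diff below for the exact context), the
+remove path becomes:
+
+        flush_work(&rpc->dpc_handler);  /* wait for any queued/running aer_isr() */
+        aer_disable_rootport(rpc);
+        kfree(rpc);                     /* safe: aer_isr() can no longer touch rpc */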
+
+I reproduced this use-after-free by flashing a device FPGA and
+re-enumerating the bus to find the new device. With SLUB debug, this
+crashes with 0x6b bytes (POISON_FREE, the use-after-free magic number) in
+GPR25:
+
+ pcieport 0000:00:00.0: AER: Multiple Corrected error received: id=0000
+ Unable to handle kernel paging request for data at address 0x27ef9e3e
+ Workqueue: events aer_isr
+ GPR24: dd6aa000 6b6b6b6b 605f8378 605f8360 d99b12c0 604fc674 606b1704 d99b12c0
+ NIP [602f5328] pci_walk_bus+0xd4/0x104
+
+[bhelgaas: changelog, stable tag]
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/pci/pcie/aer/aerdrv.c | 4 +---
+ drivers/pci/pcie/aer/aerdrv.h | 1 -
+ drivers/pci/pcie/aer/aerdrv_core.c | 2 --
+ 3 files changed, 1 insertion(+), 6 deletions(-)
+
+--- a/drivers/pci/pcie/aer/aerdrv.c
++++ b/drivers/pci/pcie/aer/aerdrv.c
+@@ -262,7 +262,6 @@ static struct aer_rpc *aer_alloc_rpc(str
+ rpc->rpd = dev;
+ INIT_WORK(&rpc->dpc_handler, aer_isr);
+ mutex_init(&rpc->rpc_mutex);
+- init_waitqueue_head(&rpc->wait_release);
+
+ /* Use PCIe bus function to store rpc into PCIe device */
+ set_service_data(dev, rpc);
+@@ -285,8 +284,7 @@ static void aer_remove(struct pcie_devic
+ if (rpc->isr)
+ free_irq(dev->irq, dev);
+
+- wait_event(rpc->wait_release, rpc->prod_idx == rpc->cons_idx);
+-
++ flush_work(&rpc->dpc_handler);
+ aer_disable_rootport(rpc);
+ kfree(rpc);
+ set_service_data(dev, NULL);
+--- a/drivers/pci/pcie/aer/aerdrv.h
++++ b/drivers/pci/pcie/aer/aerdrv.h
+@@ -72,7 +72,6 @@ struct aer_rpc {
+ * recovery on the same
+ * root port hierarchy
+ */
+- wait_queue_head_t wait_release;
+ };
+
+ struct aer_broadcast_data {
+--- a/drivers/pci/pcie/aer/aerdrv_core.c
++++ b/drivers/pci/pcie/aer/aerdrv_core.c
+@@ -811,8 +811,6 @@ void aer_isr(struct work_struct *work)
+ while (get_e_source(rpc, &e_src))
+ aer_isr_one_error(p_device, &e_src);
+ mutex_unlock(&rpc->rpc_mutex);
+-
+- wake_up(&rpc->wait_release);
+ }
+
+ /**