From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 1 Mar 2016 22:03:53 +0000 (-0800)
Subject: 4.4-stable patches
X-Git-Tag: v3.10.99~9
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=08cae923c52e2aa4f5195c7c8681fab5e0ec2168;p=thirdparty%2Fkernel%2Fstable-queue.git

4.4-stable patches

added patches:
	cpuset-make-mm-migration-asynchronous.patch
	pci-aer-flush-workqueue-on-device-remove-to-avoid-use-after-free.patch
---

diff --git a/queue-4.4/cpuset-make-mm-migration-asynchronous.patch b/queue-4.4/cpuset-make-mm-migration-asynchronous.patch
new file mode 100644
index 00000000000..80b4c0fb76b
--- /dev/null
+++ b/queue-4.4/cpuset-make-mm-migration-asynchronous.patch
@@ -0,0 +1,207 @@
+From e93ad19d05648397ef3bcb838d26aec06c245dc0 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Tue, 19 Jan 2016 12:18:41 -0500
+Subject: cpuset: make mm migration asynchronous
+
+From: Tejun Heo <tj@kernel.org>
+
+commit e93ad19d05648397ef3bcb838d26aec06c245dc0 upstream.
+
+If "cpuset.memory_migrate" is set, when a process is moved from one
+cpuset to another with a different memory node mask, pages used by
+the process are migrated to the new set of nodes.  This was performed
+synchronously in the ->attach() callback, which is synchronized
+against process management.  Recently, the synchronization was changed
+from a per-process rwsem to a global percpu rwsem for simplicity and
+optimization.
+
+Combined with the synchronous mm migration, this led to deadlocks
+because mm migration could schedule a work item which may in turn try
+to create a new worker blocking on the process management lock held
+from the cgroup process migration path.
+
+Such a heavy operation shouldn't be performed synchronously from that
+deep inside cgroup migration in the first place.  This patch punts the
+actual migration to an ordered workqueue and updates the cgroup process
+migration and cpuset config update paths to flush the workqueue after
+all locks are released.  This way, the operations still appear
+synchronous to userland without entangling mm migration with process
+management synchronization.  CPU hotplug can also invoke mm migration,
+but there's no reason for it to wait for mm migrations, so it doesn't
+synchronize against their completion.
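The following is a minimal sketch of the punt-then-flush pattern the
changelog describes.  It is illustrative only and not part of the patch:
every demo_* name is hypothetical, and the real implementation is the
kernel/cpuset.c hunk further down.

  #include <linux/errno.h>
  #include <linux/init.h>
  #include <linux/kernel.h>
  #include <linux/printk.h>
  #include <linux/slab.h>
  #include <linux/workqueue.h>

  /* ordered workqueue: items execute one at a time, in queueing order */
  static struct workqueue_struct *demo_wq;

  struct demo_work {
  	struct work_struct	work;
  	int			arg;	/* snapshot of what the deferred op needs */
  };

  static void demo_workfn(struct work_struct *work)
  {
  	struct demo_work *dw = container_of(work, struct demo_work, work);

  	/* runs on a kworker with none of the caller's locks held */
  	pr_info("demo: heavy operation for arg=%d\n", dw->arg);
  	kfree(dw);
  }

  /* called with management locks held: only queue, never block */
  static void demo_punt(int arg)
  {
  	struct demo_work *dw = kzalloc(sizeof(*dw), GFP_KERNEL);

  	if (!dw)
  		return;
  	dw->arg = arg;
  	INIT_WORK(&dw->work, demo_workfn);
  	queue_work(demo_wq, &dw->work);
  }

  /* called after all locks are released: userland still sees the
   * operation complete synchronously */
  static void demo_flush(void)
  {
  	flush_workqueue(demo_wq);
  }

  static int __init demo_init(void)
  {
  	demo_wq = alloc_ordered_workqueue("demo_wq", 0);
  	return demo_wq ? 0 : -ENOMEM;
  }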
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Reported-and-tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/cpuset.h |    6 ++++++
+ kernel/cgroup.c        |    3 ++-
+ kernel/cpuset.c        |   71 +++++++++++++++++++++++++++++++++----------------
+ 3 files changed, 57 insertions(+), 23 deletions(-)
+
+--- a/include/linux/cpuset.h
++++ b/include/linux/cpuset.h
+@@ -137,6 +137,8 @@ static inline void set_mems_allowed(node
+ 	task_unlock(current);
+ }
+ 
++extern void cpuset_post_attach_flush(void);
++
+ #else /* !CONFIG_CPUSETS */
+ 
+ static inline bool cpusets_enabled(void) { return false; }
+@@ -243,6 +245,10 @@ static inline bool read_mems_allowed_ret
+ 	return false;
+ }
+ 
++static inline void cpuset_post_attach_flush(void)
++{
++}
++
+ #endif /* !CONFIG_CPUSETS */
+ 
+ #endif /* _LINUX_CPUSET_H */
+--- a/kernel/cgroup.c
++++ b/kernel/cgroup.c
+@@ -57,7 +57,7 @@
+ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+ #include <linux/kthread.h>
+ #include <linux/delay.h>
+-
++#include <linux/cpuset.h>
+ #include <linux/atomic.h>
+ 
+ /*
+@@ -2764,6 +2764,7 @@ out_unlock_rcu:
+ out_unlock_threadgroup:
+ 	percpu_up_write(&cgroup_threadgroup_rwsem);
+ 	cgroup_kn_unlock(of->kn);
++	cpuset_post_attach_flush();
+ 	return ret ?: nbytes;
+ }
+ 
+--- a/kernel/cpuset.c
++++ b/kernel/cpuset.c
+@@ -286,6 +286,8 @@ static struct cpuset top_cpuset = {
+ static DEFINE_MUTEX(cpuset_mutex);
+ static DEFINE_SPINLOCK(callback_lock);
+ 
++static struct workqueue_struct *cpuset_migrate_mm_wq;
++
+ /*
+  * CPU / memory hotplug is handled asynchronously.
+  */
+@@ -971,31 +973,51 @@ static int update_cpumask(struct cpuset
+ }
+ 
+ /*
+- * cpuset_migrate_mm
+- *
+- * Migrate memory region from one set of nodes to another.
+- *
+- * Temporarilly set tasks mems_allowed to target nodes of migration,
+- * so that the migration code can allocate pages on these nodes.
+- *
+- * While the mm_struct we are migrating is typically from some
+- * other task, the task_struct mems_allowed that we are hacking
+- * is for our current task, which must allocate new pages for that
+- * migrating memory region.
++ * Migrate memory region from one set of nodes to another.  This is
++ * performed asynchronously as it can be called from process migration path
++ * holding locks involved in process management.  All mm migrations are
++ * performed in the queued order and can be waited for by flushing
++ * cpuset_migrate_mm_wq.
+  */
+ 
++struct cpuset_migrate_mm_work {
++	struct work_struct	work;
++	struct mm_struct	*mm;
++	nodemask_t		from;
++	nodemask_t		to;
++};
++
++static void cpuset_migrate_mm_workfn(struct work_struct *work)
++{
++	struct cpuset_migrate_mm_work *mwork =
++		container_of(work, struct cpuset_migrate_mm_work, work);
++
++	/* on a wq worker, no need to worry about %current's mems_allowed */
++	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
++	mmput(mwork->mm);
++	kfree(mwork);
++}
++
+ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
+ 							const nodemask_t *to)
+ {
+-	struct task_struct *tsk = current;
+-
+-	tsk->mems_allowed = *to;
++	struct cpuset_migrate_mm_work *mwork;
+ 
+-	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
++	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
++	if (mwork) {
++		mwork->mm = mm;
++		mwork->from = *from;
++		mwork->to = *to;
++		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
++		queue_work(cpuset_migrate_mm_wq, &mwork->work);
++	} else {
++		mmput(mm);
++	}
++}
+ 
+-	rcu_read_lock();
+-	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
+-	rcu_read_unlock();
++void cpuset_post_attach_flush(void)
++{
++	flush_workqueue(cpuset_migrate_mm_wq);
+ }
+ 
+ /*
+@@ -1096,7 +1118,8 @@ static void update_tasks_nodemask(struct
+ 		mpol_rebind_mm(mm, &cs->mems_allowed);
+ 		if (migrate)
+ 			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
+-		mmput(mm);
++		else
++			mmput(mm);
+ 	}
+ 	css_task_iter_end(&it);
+ 
+@@ -1541,11 +1564,11 @@ static void cpuset_attach(struct cgroup_
+ 			 * @old_mems_allowed is the right nodesets that we
+ 			 * migrate mm from.
+ 			 */
+-			if (is_memory_migrate(cs)) {
++			if (is_memory_migrate(cs))
+ 				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
+ 						  &cpuset_attach_nodemask_to);
+-			}
+-			mmput(mm);
++			else
++				mmput(mm);
+ 		}
+ 	}
+ 
+@@ -1710,6 +1733,7 @@ out_unlock:
+ 	mutex_unlock(&cpuset_mutex);
+ 	kernfs_unbreak_active_protection(of->kn);
+ 	css_put(&cs->css);
++	flush_workqueue(cpuset_migrate_mm_wq);
+ 	return retval ?: nbytes;
+ }
+ 
+@@ -2355,6 +2379,9 @@ void __init cpuset_init_smp(void)
+ 	top_cpuset.effective_mems = node_states[N_MEMORY];
+ 
+ 	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
++
++	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
++	BUG_ON(!cpuset_migrate_mm_wq);
+ }
+ 
+ /**
diff --git a/queue-4.4/pci-aer-flush-workqueue-on-device-remove-to-avoid-use-after-free.patch b/queue-4.4/pci-aer-flush-workqueue-on-device-remove-to-avoid-use-after-free.patch
new file mode 100644
index 00000000000..a085dffad89
--- /dev/null
+++ b/queue-4.4/pci-aer-flush-workqueue-on-device-remove-to-avoid-use-after-free.patch
@@ -0,0 +1,98 @@
+From 4ae2182b1e3407de369f8c5d799543b7db74221b Mon Sep 17 00:00:00 2001
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Mon, 25 Jan 2016 10:08:00 -0600
+Subject: PCI/AER: Flush workqueue on device remove to avoid use-after-free
+
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+commit 4ae2182b1e3407de369f8c5d799543b7db74221b upstream.
+
+A Root Port's AER structure (rpc) contains a queue of events.  aer_irq()
+enqueues AER status information and schedules aer_isr() to dequeue and
+process it.  When we remove a device, aer_remove() waits for the queue to
+be empty, then frees the rpc struct.
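The teardown described above can be sketched as follows.  This is
illustrative only, not the driver source: prod_idx, cons_idx, wait_release
and dpc_handler mirror fields visible in the diff below, everything else is
simplified.  Note that index equality only proves the queue is empty, not
that the worker has finished touching rpc, which is exactly the race the
next paragraph walks through.

  #include <linux/slab.h>
  #include <linux/wait.h>
  #include <linux/workqueue.h>

  struct demo_rpc {
  	unsigned int		prod_idx;	/* bumped by the IRQ handler */
  	unsigned int		cons_idx;	/* bumped by the worker */
  	wait_queue_head_t	wait_release;	/* woken when the queue drains */
  	struct work_struct	dpc_handler;	/* executes the worker function */
  };

  /* pre-patch removal path: wait for the queue to drain, then free */
  static void demo_remove(struct demo_rpc *rpc)
  {
  	wait_event(rpc->wait_release, rpc->prod_idx == rpc->cons_idx);
  	kfree(rpc);	/* the worker may still be dereferencing rpc */
  }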
+
+But aer_isr() references the rpc struct after dequeueing and possibly
+emptying the queue, which can cause a use-after-free error as in the
+following scenario with two threads, aer_isr() on the left and a
+concurrent aer_remove() on the right:
+
+  Thread A                         Thread B
+  --------                         --------
+  aer_irq():
+    rpc->prod_idx++
+                                   aer_remove():
+                                     wait_event(rpc->prod_idx == rpc->cons_idx)
+                                     # now blocked until queue becomes empty
+  aer_isr():                         # ...
+    rpc->cons_idx++                  # unblocked because queue is now empty
+    ...                              kfree(rpc)
+    mutex_unlock(&rpc->rpc_mutex)
+
+To prevent this problem, use flush_work() to wait until the last scheduled
+instance of aer_isr() has completed before freeing the rpc struct in
+aer_remove().
+
+I reproduced this use-after-free by flashing a device FPGA and
+re-enumerating the bus to find the new device.  With SLUB debug, this
+crashes with 0x6b bytes (POISON_FREE, the use-after-free magic number) in
+GPR25:
+
+  pcieport 0000:00:00.0: AER: Multiple Corrected error received: id=0000
+  Unable to handle kernel paging request for data at address 0x27ef9e3e
+  Workqueue: events aer_isr
+  GPR24: dd6aa000 6b6b6b6b 605f8378 605f8360 d99b12c0 604fc674 606b1704 d99b12c0
+  NIP [602f5328] pci_walk_bus+0xd4/0x104
+
+[bhelgaas: changelog, stable tag]
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/pci/pcie/aer/aerdrv.c      |    4 +---
+ drivers/pci/pcie/aer/aerdrv.h      |    1 -
+ drivers/pci/pcie/aer/aerdrv_core.c |    2 --
+ 3 files changed, 1 insertion(+), 6 deletions(-)
+
+--- a/drivers/pci/pcie/aer/aerdrv.c
++++ b/drivers/pci/pcie/aer/aerdrv.c
+@@ -262,7 +262,6 @@ static struct aer_rpc *aer_alloc_rpc(str
+ 	rpc->rpd = dev;
+ 	INIT_WORK(&rpc->dpc_handler, aer_isr);
+ 	mutex_init(&rpc->rpc_mutex);
+-	init_waitqueue_head(&rpc->wait_release);
+ 
+ 	/* Use PCIe bus function to store rpc into PCIe device */
+ 	set_service_data(dev, rpc);
+@@ -285,8 +284,7 @@ static void aer_remove(struct pcie_devic
+ 	if (rpc->isr)
+ 		free_irq(dev->irq, dev);
+ 
+-	wait_event(rpc->wait_release, rpc->prod_idx == rpc->cons_idx);
+-
++	flush_work(&rpc->dpc_handler);
+ 	aer_disable_rootport(rpc);
+ 	kfree(rpc);
+ 	set_service_data(dev, NULL);
+--- a/drivers/pci/pcie/aer/aerdrv.h
++++ b/drivers/pci/pcie/aer/aerdrv.h
+@@ -72,7 +72,6 @@ struct aer_rpc {
+ 					 *  recovery on the same
+ 					 *  root port hierarchy
+ 					 */
+-	wait_queue_head_t wait_release;
+ };
+ 
+ struct aer_broadcast_data {
+--- a/drivers/pci/pcie/aer/aerdrv_core.c
++++ b/drivers/pci/pcie/aer/aerdrv_core.c
+@@ -811,8 +811,6 @@ void aer_isr(struct work_struct *work)
+ 	while (get_e_source(rpc, &e_src))
+ 		aer_isr_one_error(p_device, &e_src);
+ 	mutex_unlock(&rpc->rpc_mutex);
+-
+-	wake_up(&rpc->wait_release);
+ }
+ 
+ /**
diff --git a/queue-4.4/series b/queue-4.4/series
index a55467c7ed3..11889c19809 100644
--- a/queue-4.4/series
+++ b/queue-4.4/series
@@ -286,3 +286,5 @@ qla2xxx-fix-stale-pointer-access.patch
 libata-fix-sff-host-state-machine-locking-while-polling.patch
 arcv2-star-9000950267-handle-return-from-intr-to-delay-slot-2.patch
 arcv2-smp-emulate-ipi-to-self-using-software-triggered-interrupt.patch
+pci-aer-flush-workqueue-on-device-remove-to-avoid-use-after-free.patch
+cpuset-make-mm-migration-asynchronous.patch
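For contrast with the earlier sketch, here is a minimal sketch of the
corrected teardown pattern that both patches in this commit rely on: wait
for the work item (or the whole workqueue) to finish before freeing
anything the worker can still touch.  demo_rpc is the hypothetical struct
from the sketch above, not the driver's real type.

  /*
   * Fixed removal path: flush_work() returns only after the last queued
   * or running instance of dpc_handler has completed, so freeing rpc can
   * no longer race with the worker.
   */
  static void demo_remove_fixed(struct demo_rpc *rpc)
  {
  	flush_work(&rpc->dpc_handler);
  	kfree(rpc);
  }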