From: Greg Kroah-Hartman Date: Mon, 5 Aug 2019 06:03:20 +0000 (+0200) Subject: 4.9-stable patches X-Git-Tag: v4.4.188~14 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=205ab887b2917a8506df77b610e05bb141fcf824;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: coredump-fix-race-condition-between-collapse_huge_page-and-core-dumping.patch coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch infiniband-fix-race-condition-between-infiniband-mlx4-mlx5-driver-and-core-dumping.patch --- diff --git a/queue-4.9/coredump-fix-race-condition-between-collapse_huge_page-and-core-dumping.patch b/queue-4.9/coredump-fix-race-condition-between-collapse_huge_page-and-core-dumping.patch new file mode 100644 index 00000000000..8c920cef856 --- /dev/null +++ b/queue-4.9/coredump-fix-race-condition-between-collapse_huge_page-and-core-dumping.patch @@ -0,0 +1,99 @@ +From 59ea6d06cfa9247b586a695c21f94afa7183af74 Mon Sep 17 00:00:00 2001 +From: Andrea Arcangeli +Date: Thu, 13 Jun 2019 15:56:11 -0700 +Subject: coredump: fix race condition between collapse_huge_page() and core dumping + +From: Andrea Arcangeli + +commit 59ea6d06cfa9247b586a695c21f94afa7183af74 upstream. + +When fixing the race conditions between the coredump and the mmap_sem +holders outside the context of the process, we focused on +mmget_not_zero()/get_task_mm() callers in 04f5866e41fb70 ("coredump: fix +race condition between mmget_not_zero()/get_task_mm() and core +dumping"), but those aren't the only cases where the mmap_sem can be +taken outside of the context of the process as Michal Hocko noticed +while backporting that commit to older -stable kernels. + +If mmgrab() is called in the context of the process, but then the +mm_count reference is transferred outside the context of the process, +that can also be a problem if the mmap_sem has to be taken for writing +through that mm_count reference. + +khugepaged registration calls mmgrab() in the context of the process, +but the mmap_sem for writing is taken later in the context of the +khugepaged kernel thread. + +collapse_huge_page() after taking the mmap_sem for writing doesn't +modify any vma, so it's not obvious that it could cause a problem to the +coredump, but it happens to modify the pmd in a way that breaks an +invariant that pmd_trans_huge_lock() relies upon. collapse_huge_page() +needs the mmap_sem for writing just to block concurrent page faults that +call pmd_trans_huge_lock(). + +Specifically the invariant that "!pmd_trans_huge()" cannot become a +"pmd_trans_huge()" doesn't hold while collapse_huge_page() runs. + +The coredump will call __get_user_pages() without mmap_sem for reading, +which eventually can invoke a lockless page fault which will need a +functional pmd_trans_huge_lock(). + +So collapse_huge_page() needs to use mmget_still_valid() to check it's +not running concurrently with the coredump... as long as the coredump +can invoke page faults without holding the mmap_sem for reading. + +This has "Fixes: khugepaged" to facilitate backporting, but in my view +it's more a bug in the coredump code that will eventually have to be +rewritten to stop invoking page faults without the mmap_sem for reading. +So the long term plan is still to drop all mmget_still_valid(). + +Link: http://lkml.kernel.org/r/20190607161558.32104-1-aarcange@redhat.com +Fixes: ba76149f47d8 ("thp: khugepaged") +Signed-off-by: Andrea Arcangeli +Reported-by: Michal Hocko +Acked-by: Michal Hocko +Acked-by: Kirill A. Shutemov +Cc: Oleg Nesterov +Cc: Jann Horn +Cc: Hugh Dickins +Cc: Mike Rapoport +Cc: Mike Kravetz +Cc: Peter Xu +Cc: Jason Gunthorpe +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +[Ajay: Just adjusted to apply on v4.9] +Signed-off-by: Ajay Kaher +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/mm.h | 4 ++++ + mm/khugepaged.c | 3 +++ + 2 files changed, 7 insertions(+) + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1197,6 +1197,10 @@ void unmap_vmas(struct mmu_gather *tlb, + * followed by taking the mmap_sem for writing before modifying the + * vmas or anything the coredump pretends not to change from under it. + * ++ * It also has to be called when mmgrab() is used in the context of ++ * the process, but then the mm_count refcount is transferred outside ++ * the context of the process to run down_write() on that pinned mm. ++ * + * NOTE: find_extend_vma() called from GUP context is the only place + * that can modify the "mm" (notably the vm_start/end) under mmap_sem + * for reading and outside the context of the process, so it is also +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -1004,6 +1004,9 @@ static void collapse_huge_page(struct mm + * handled by the anon_vma lock + PG_lock. + */ + down_write(&mm->mmap_sem); ++ result = SCAN_ANY_PROCESS; ++ if (!mmget_still_valid(mm)) ++ goto out; + result = hugepage_vma_revalidate(mm, address, &vma); + if (result) + goto out; diff --git a/queue-4.9/coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch b/queue-4.9/coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch new file mode 100644 index 00000000000..91fb83859eb --- /dev/null +++ b/queue-4.9/coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch @@ -0,0 +1,217 @@ +From 04f5866e41fb70690e28397487d8bd8eea7d712a Mon Sep 17 00:00:00 2001 +From: Andrea Arcangeli +Date: Thu, 18 Apr 2019 17:50:52 -0700 +Subject: coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping + +From: Andrea Arcangeli + +commit 04f5866e41fb70690e28397487d8bd8eea7d712a upstream. + +The core dumping code has always run without holding the mmap_sem for +writing, despite that is the only way to ensure that the entire vma +layout will not change from under it. Only using some signal +serialization on the processes belonging to the mm is not nearly enough. +This was pointed out earlier. For example in Hugh's post from Jul 2017: + + https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils + + "Not strictly relevant here, but a related note: I was very surprised + to discover, only quite recently, how handle_mm_fault() may be called + without down_read(mmap_sem) - when core dumping. That seems a + misguided optimization to me, which would also be nice to correct" + +In particular because the growsdown and growsup can move the +vm_start/vm_end the various loops the core dump does around the vma will +not be consistent if page faults can happen concurrently. + +Pretty much all users calling mmget_not_zero()/get_task_mm() and then +taking the mmap_sem had the potential to introduce unexpected side +effects in the core dumping code. + +Adding mmap_sem for writing around the ->core_dump invocation is a +viable long term fix, but it requires removing all copy user and page +faults and to replace them with get_dump_page() for all binary formats +which is not suitable as a short term fix. + +For the time being this solution manually covers the places that can +confuse the core dump either by altering the vma layout or the vma flags +while it runs. Once ->core_dump runs under mmap_sem for writing the +function mmget_still_valid() can be dropped. + +Allowing mmap_sem protected sections to run in parallel with the +coredump provides some minor parallelism advantage to the swapoff code +(which seems to be safe enough by never mangling any vma field and can +keep doing swapins in parallel to the core dumping) and to some other +corner case. + +In order to facilitate the backporting I added "Fixes: 86039bd3b4e6" +however the side effect of this same race condition in /proc/pid/mem +should be reproducible since before 2.6.12-rc2 so I couldn't add any +other "Fixes:" because there's no hash beyond the git genesis commit. + +Because find_extend_vma() is the only location outside of the process +context that could modify the "mm" structures under mmap_sem for +reading, by adding the mmget_still_valid() check to it, all other cases +that take the mmap_sem for reading don't need the new check after +mmget_not_zero()/get_task_mm(). The expand_stack() in page fault +context also doesn't need the new check, because all tasks under core +dumping are frozen. + +Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com +Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization") +Signed-off-by: Andrea Arcangeli +Reported-by: Jann Horn +Suggested-by: Oleg Nesterov +Acked-by: Peter Xu +Reviewed-by: Mike Rapoport +Reviewed-by: Oleg Nesterov +Reviewed-by: Jann Horn +Acked-by: Jason Gunthorpe +Acked-by: Michal Hocko +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +[akaher@vmware.com: stable 4.9 backport +- handle binder_update_page_range - mhocko@suse.com] +Signed-off-by: Ajay Kaher +Signed-off-by: Greg Kroah-Hartman +--- + drivers/android/binder.c | 6 ++++++ + fs/proc/task_mmu.c | 18 ++++++++++++++++++ + fs/userfaultfd.c | 9 +++++++++ + include/linux/mm.h | 20 ++++++++++++++++++++ + mm/mmap.c | 6 +++++- + 5 files changed, 58 insertions(+), 1 deletion(-) + +--- a/drivers/android/binder.c ++++ b/drivers/android/binder.c +@@ -581,6 +581,12 @@ static int binder_update_page_range(stru + + if (mm) { + down_write(&mm->mmap_sem); ++ if (!mmget_still_valid(mm)) { ++ if (allocate == 0) ++ goto free_range; ++ goto err_no_vma; ++ } ++ + vma = proc->vma; + if (vma && mm != proc->vma_vm_mm) { + pr_err("%d: vma mm and task mm mismatch\n", +--- a/fs/proc/task_mmu.c ++++ b/fs/proc/task_mmu.c +@@ -1057,6 +1057,24 @@ static ssize_t clear_refs_write(struct f + count = -EINTR; + goto out_mm; + } ++ /* ++ * Avoid to modify vma->vm_flags ++ * without locked ops while the ++ * coredump reads the vm_flags. ++ */ ++ if (!mmget_still_valid(mm)) { ++ /* ++ * Silently return "count" ++ * like if get_task_mm() ++ * failed. FIXME: should this ++ * function have returned ++ * -ESRCH if get_task_mm() ++ * failed like if ++ * get_proc_task() fails? ++ */ ++ up_write(&mm->mmap_sem); ++ goto out_mm; ++ } + for (vma = mm->mmap; vma; vma = vma->vm_next) { + vma->vm_flags &= ~VM_SOFTDIRTY; + vma_set_page_prot(vma); +--- a/fs/userfaultfd.c ++++ b/fs/userfaultfd.c +@@ -479,6 +479,8 @@ static int userfaultfd_release(struct in + * taking the mmap_sem for writing. + */ + down_write(&mm->mmap_sem); ++ if (!mmget_still_valid(mm)) ++ goto skip_mm; + prev = NULL; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + cond_resched(); +@@ -501,6 +503,7 @@ static int userfaultfd_release(struct in + vma->vm_flags = new_flags; + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + } ++skip_mm: + up_write(&mm->mmap_sem); + mmput(mm); + wakeup: +@@ -802,6 +805,9 @@ static int userfaultfd_register(struct u + goto out; + + down_write(&mm->mmap_sem); ++ if (!mmget_still_valid(mm)) ++ goto out_unlock; ++ + vma = find_vma_prev(mm, start, &prev); + if (!vma) + goto out_unlock; +@@ -947,6 +953,9 @@ static int userfaultfd_unregister(struct + goto out; + + down_write(&mm->mmap_sem); ++ if (!mmget_still_valid(mm)) ++ goto out_unlock; ++ + vma = find_vma_prev(mm, start, &prev); + if (!vma) + goto out_unlock; +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1192,6 +1192,26 @@ void zap_page_range(struct vm_area_struc + unsigned long size, struct zap_details *); + void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, + unsigned long start, unsigned long end); ++/* ++ * This has to be called after a get_task_mm()/mmget_not_zero() ++ * followed by taking the mmap_sem for writing before modifying the ++ * vmas or anything the coredump pretends not to change from under it. ++ * ++ * NOTE: find_extend_vma() called from GUP context is the only place ++ * that can modify the "mm" (notably the vm_start/end) under mmap_sem ++ * for reading and outside the context of the process, so it is also ++ * the only case that holds the mmap_sem for reading that must call ++ * this function. Generally if the mmap_sem is hold for reading ++ * there's no need of this check after get_task_mm()/mmget_not_zero(). ++ * ++ * This function can be obsoleted and the check can be removed, after ++ * the coredump code will hold the mmap_sem for writing before ++ * invoking the ->core_dump methods. ++ */ ++static inline bool mmget_still_valid(struct mm_struct *mm) ++{ ++ return likely(!mm->core_state); ++} + + /** + * mm_walk - callbacks for walk_page_range +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -2448,7 +2448,8 @@ find_extend_vma(struct mm_struct *mm, un + vma = find_vma_prev(mm, addr, &prev); + if (vma && (vma->vm_start <= addr)) + return vma; +- if (!prev || expand_stack(prev, addr)) ++ /* don't alter vm_end if the coredump is running */ ++ if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr)) + return NULL; + if (prev->vm_flags & VM_LOCKED) + populate_vma_page_range(prev, addr, prev->vm_end, NULL); +@@ -2474,6 +2475,9 @@ find_extend_vma(struct mm_struct *mm, un + return vma; + if (!(vma->vm_flags & VM_GROWSDOWN)) + return NULL; ++ /* don't alter vm_start if the coredump is running */ ++ if (!mmget_still_valid(mm)) ++ return NULL; + start = vma->vm_start; + if (expand_stack(vma, addr)) + return NULL; diff --git a/queue-4.9/infiniband-fix-race-condition-between-infiniband-mlx4-mlx5-driver-and-core-dumping.patch b/queue-4.9/infiniband-fix-race-condition-between-infiniband-mlx4-mlx5-driver-and-core-dumping.patch new file mode 100644 index 00000000000..04f1f85eeda --- /dev/null +++ b/queue-4.9/infiniband-fix-race-condition-between-infiniband-mlx4-mlx5-driver-and-core-dumping.patch @@ -0,0 +1,66 @@ +From akaher@vmware.com Mon Aug 5 08:01:12 2019 +From: Ajay Kaher +Date: Sun, 4 Aug 2019 09:29:26 +0530 +Subject: infiniband: fix race condition between infiniband mlx4, mlx5 driver and core dumping +To: , , , , , , +Cc: srinidhir@vmware.com, linux-kernel@vger.kernel.org, linux-mm@kvack.org, amakhalov@vmware.com, sean.hefty@intel.com, srivatsa@csail.mit.edu, srivatsab@vmware.com, devel@driverdev.osuosl.org, linux-rdma@vger.kernel.org, bvikas@vmware.com, dledford@redhat.com, akaher@vmware.com, riandrews@android.com, hal.rosenstock@gmail.com, vsirnapalli@vmware.com, leonro@mellanox.com, jglisse@redhat.com, viro@zeniv.linux.org.uk, gregkh@linuxfoundation.org, yishaih@mellanox.com, matanb@mellanox.com, stable@vger.kernel.org, arve@android.com, linux-fsdevel@vger.kernel.org, akpm@linux-foundation.org, torvalds@linux-foundation.org, mike.kravetz@oracle.com +Message-ID: <1564891168-30016-2-git-send-email-akaher@vmware.com> + +From: Ajay Kaher + +This patch is the extension of following upstream commit to fix +the race condition between get_task_mm() and core dumping +for IB->mlx4 and IB->mlx5 drivers: + +commit 04f5866e41fb ("coredump: fix race condition between +mmget_not_zero()/get_task_mm() and core dumping")' + +Thanks to Jason for pointing this. + +Signed-off-by: Ajay Kaher +Reviewed-by: Jason Gunthorpe +Signed-off-by: Greg Kroah-Hartman +--- + drivers/infiniband/hw/mlx4/main.c | 4 +++- + drivers/infiniband/hw/mlx5/main.c | 3 +++ + 2 files changed, 6 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/hw/mlx4/main.c ++++ b/drivers/infiniband/hw/mlx4/main.c +@@ -1172,6 +1172,8 @@ static void mlx4_ib_disassociate_ucontex + * mlx4_ib_vma_close(). + */ + down_write(&owning_mm->mmap_sem); ++ if (!mmget_still_valid(owning_mm)) ++ goto skip_mm; + for (i = 0; i < HW_BAR_COUNT; i++) { + vma = context->hw_bar_info[i].vma; + if (!vma) +@@ -1190,7 +1192,7 @@ static void mlx4_ib_disassociate_ucontex + /* context going to be destroyed, should not access ops any more */ + context->hw_bar_info[i].vma->vm_ops = NULL; + } +- ++skip_mm: + up_write(&owning_mm->mmap_sem); + mmput(owning_mm); + put_task_struct(owning_process); +--- a/drivers/infiniband/hw/mlx5/main.c ++++ b/drivers/infiniband/hw/mlx5/main.c +@@ -1307,6 +1307,8 @@ static void mlx5_ib_disassociate_ucontex + * mlx5_ib_vma_close. + */ + down_write(&owning_mm->mmap_sem); ++ if (!mmget_still_valid(owning_mm)) ++ goto skip_mm; + list_for_each_entry_safe(vma_private, n, &context->vma_private_list, + list) { + vma = vma_private->vma; +@@ -1321,6 +1323,7 @@ static void mlx5_ib_disassociate_ucontex + list_del(&vma_private->list); + kfree(vma_private); + } ++skip_mm: + up_write(&owning_mm->mmap_sem); + mmput(owning_mm); + put_task_struct(owning_process); diff --git a/queue-4.9/series b/queue-4.9/series index a1cf112b95a..70662e29a32 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -32,3 +32,6 @@ s390-dasd-fix-endless-loop-after-read-unit-address-configuration.patch drivers-perf-arm_pmu-fix-failure-path-in-pm-notifier.patch xen-swiotlb-fix-condition-for-calling-xen_destroy_contiguous_region.patch ib-mlx5-fix-rss-toeplitz-setup-to-be-aligned-with-the-hw-specification.patch +coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch +infiniband-fix-race-condition-between-infiniband-mlx4-mlx5-driver-and-core-dumping.patch +coredump-fix-race-condition-between-collapse_huge_page-and-core-dumping.patch