From 3b96c11f067206ce02615f82e8b31a8a8769acd9 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 20 Jun 2019 16:32:24 +0200 Subject: [PATCH] 4.4-stable patches added patches: coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch --- ...ot_zero-get_task_mm-and-core-dumping.patch | 251 ++++++++++++++++++ queue-4.4/series | 1 + 2 files changed, 252 insertions(+) create mode 100644 queue-4.4/coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch diff --git a/queue-4.4/coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch b/queue-4.4/coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch new file mode 100644 index 0000000000..eee7a7f55a --- /dev/null +++ b/queue-4.4/coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch @@ -0,0 +1,251 @@ +From 04f5866e41fb70690e28397487d8bd8eea7d712a Mon Sep 17 00:00:00 2001 +From: Andrea Arcangeli +Date: Thu, 18 Apr 2019 17:50:52 -0700 +Subject: coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping + +From: Andrea Arcangeli + +commit 04f5866e41fb70690e28397487d8bd8eea7d712a upstream. + +The core dumping code has always run without holding the mmap_sem for +writing, despite that is the only way to ensure that the entire vma +layout will not change from under it. Only using some signal +serialization on the processes belonging to the mm is not nearly enough. +This was pointed out earlier. For example in Hugh's post from Jul 2017: + + https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils + + "Not strictly relevant here, but a related note: I was very surprised + to discover, only quite recently, how handle_mm_fault() may be called + without down_read(mmap_sem) - when core dumping. That seems a + misguided optimization to me, which would also be nice to correct" + +In particular because the growsdown and growsup can move the +vm_start/vm_end the various loops the core dump does around the vma will +not be consistent if page faults can happen concurrently. + +Pretty much all users calling mmget_not_zero()/get_task_mm() and then +taking the mmap_sem had the potential to introduce unexpected side +effects in the core dumping code. + +Adding mmap_sem for writing around the ->core_dump invocation is a +viable long term fix, but it requires removing all copy user and page +faults and to replace them with get_dump_page() for all binary formats +which is not suitable as a short term fix. + +For the time being this solution manually covers the places that can +confuse the core dump either by altering the vma layout or the vma flags +while it runs. Once ->core_dump runs under mmap_sem for writing the +function mmget_still_valid() can be dropped. + +Allowing mmap_sem protected sections to run in parallel with the +coredump provides some minor parallelism advantage to the swapoff code +(which seems to be safe enough by never mangling any vma field and can +keep doing swapins in parallel to the core dumping) and to some other +corner case. + +In order to facilitate the backporting I added "Fixes: 86039bd3b4e6" +however the side effect of this same race condition in /proc/pid/mem +should be reproducible since before 2.6.12-rc2 so I couldn't add any +other "Fixes:" because there's no hash beyond the git genesis commit. + +Because find_extend_vma() is the only location outside of the process +context that could modify the "mm" structures under mmap_sem for +reading, by adding the mmget_still_valid() check to it, all other cases +that take the mmap_sem for reading don't need the new check after +mmget_not_zero()/get_task_mm(). The expand_stack() in page fault +context also doesn't need the new check, because all tasks under core +dumping are frozen. + +Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com +Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization") +Signed-off-by: Andrea Arcangeli +Reported-by: Jann Horn +Suggested-by: Oleg Nesterov +Acked-by: Peter Xu +Reviewed-by: Mike Rapoport +Reviewed-by: Oleg Nesterov +Reviewed-by: Jann Horn +Acked-by: Jason Gunthorpe +Acked-by: Michal Hocko +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Cc: Joel Fernandes (Google) +[mhocko@suse.com: stable 4.4 backport + - drop infiniband part because of missing 5f9794dc94f59 + - drop userfaultfd_event_wait_completion hunk because of + missing 9cd75c3cd4c3d] + - handle binder_update_page_range because of missing 720c241924046 + - handle mlx5_ib_disassociate_ucontext - akaher@vmware.com +] +Signed-off-by: Michal Hocko +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/android/binder.c | 6 ++++++ + drivers/infiniband/hw/mlx4/main.c | 3 +++ + fs/proc/task_mmu.c | 18 ++++++++++++++++++ + fs/userfaultfd.c | 7 +++++++ + include/linux/mm.h | 21 +++++++++++++++++++++ + mm/mmap.c | 7 ++++++- + 6 files changed, 61 insertions(+), 1 deletion(-) + +--- a/drivers/android/binder.c ++++ b/drivers/android/binder.c +@@ -570,6 +570,12 @@ static int binder_update_page_range(stru + + if (mm) { + down_write(&mm->mmap_sem); ++ if (!mmget_still_valid(mm)) { ++ if (allocate == 0) ++ goto free_range; ++ goto err_no_vma; ++ } ++ + vma = proc->vma; + if (vma && mm != proc->vma_vm_mm) { + pr_err("%d: vma mm and task mm mismatch\n", +--- a/drivers/infiniband/hw/mlx4/main.c ++++ b/drivers/infiniband/hw/mlx4/main.c +@@ -1042,6 +1042,8 @@ static void mlx4_ib_disassociate_ucontex + * mlx4_ib_vma_close(). + */ + down_write(&owning_mm->mmap_sem); ++ if (!mmget_still_valid(owning_mm)) ++ goto skip_mm; + for (i = 0; i < HW_BAR_COUNT; i++) { + vma = context->hw_bar_info[i].vma; + if (!vma) +@@ -1061,6 +1063,7 @@ static void mlx4_ib_disassociate_ucontex + context->hw_bar_info[i].vma->vm_ops = NULL; + } + ++skip_mm: + up_write(&owning_mm->mmap_sem); + mmput(owning_mm); + put_task_struct(owning_process); +--- a/fs/proc/task_mmu.c ++++ b/fs/proc/task_mmu.c +@@ -947,6 +947,24 @@ static ssize_t clear_refs_write(struct f + continue; + up_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); ++ /* ++ * Avoid to modify vma->vm_flags ++ * without locked ops while the ++ * coredump reads the vm_flags. ++ */ ++ if (!mmget_still_valid(mm)) { ++ /* ++ * Silently return "count" ++ * like if get_task_mm() ++ * failed. FIXME: should this ++ * function have returned ++ * -ESRCH if get_task_mm() ++ * failed like if ++ * get_proc_task() fails? ++ */ ++ up_write(&mm->mmap_sem); ++ goto out_mm; ++ } + for (vma = mm->mmap; vma; vma = vma->vm_next) { + vma->vm_flags &= ~VM_SOFTDIRTY; + vma_set_page_prot(vma); +--- a/fs/userfaultfd.c ++++ b/fs/userfaultfd.c +@@ -446,6 +446,8 @@ static int userfaultfd_release(struct in + * taking the mmap_sem for writing. + */ + down_write(&mm->mmap_sem); ++ if (!mmget_still_valid(mm)) ++ goto skip_mm; + prev = NULL; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + cond_resched(); +@@ -468,6 +470,7 @@ static int userfaultfd_release(struct in + vma->vm_flags = new_flags; + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + } ++skip_mm: + up_write(&mm->mmap_sem); + mmput(mm); + wakeup: +@@ -769,6 +772,8 @@ static int userfaultfd_register(struct u + goto out; + + down_write(&mm->mmap_sem); ++ if (!mmget_still_valid(mm)) ++ goto out_unlock; + vma = find_vma_prev(mm, start, &prev); + if (!vma) + goto out_unlock; +@@ -914,6 +919,8 @@ static int userfaultfd_unregister(struct + goto out; + + down_write(&mm->mmap_sem); ++ if (!mmget_still_valid(mm)) ++ goto out_unlock; + vma = find_vma_prev(mm, start, &prev); + if (!vma) + goto out_unlock; +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1098,6 +1098,27 @@ void zap_page_range(struct vm_area_struc + void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, + unsigned long start, unsigned long end); + ++/* ++ * This has to be called after a get_task_mm()/mmget_not_zero() ++ * followed by taking the mmap_sem for writing before modifying the ++ * vmas or anything the coredump pretends not to change from under it. ++ * ++ * NOTE: find_extend_vma() called from GUP context is the only place ++ * that can modify the "mm" (notably the vm_start/end) under mmap_sem ++ * for reading and outside the context of the process, so it is also ++ * the only case that holds the mmap_sem for reading that must call ++ * this function. Generally if the mmap_sem is hold for reading ++ * there's no need of this check after get_task_mm()/mmget_not_zero(). ++ * ++ * This function can be obsoleted and the check can be removed, after ++ * the coredump code will hold the mmap_sem for writing before ++ * invoking the ->core_dump methods. ++ */ ++static inline bool mmget_still_valid(struct mm_struct *mm) ++{ ++ return likely(!mm->core_state); ++} ++ + /** + * mm_walk - callbacks for walk_page_range + * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -42,6 +42,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -2398,7 +2399,8 @@ find_extend_vma(struct mm_struct *mm, un + vma = find_vma_prev(mm, addr, &prev); + if (vma && (vma->vm_start <= addr)) + return vma; +- if (!prev || expand_stack(prev, addr)) ++ /* don't alter vm_end if the coredump is running */ ++ if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr)) + return NULL; + if (prev->vm_flags & VM_LOCKED) + populate_vma_page_range(prev, addr, prev->vm_end, NULL); +@@ -2424,6 +2426,9 @@ find_extend_vma(struct mm_struct *mm, un + return vma; + if (!(vma->vm_flags & VM_GROWSDOWN)) + return NULL; ++ /* don't alter vm_start if the coredump is running */ ++ if (!mmget_still_valid(mm)) ++ return NULL; + start = vma->vm_start; + if (expand_stack(vma, addr)) + return NULL; diff --git a/queue-4.4/series b/queue-4.4/series index 672cdd428e..8197be68b9 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -80,3 +80,4 @@ net-sh_eth-fix-mdio-access-in-sh_eth_close-for-r-car.patch scsi-libcxgbi-add-a-check-for-null-pointer-in-cxgbi_.patch scsi-libsas-delete-sas-port-if-expander-discover-fai.patch revert-crypto-crypto4xx-properly-set-iv-after-de-and-encrypt.patch +coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch -- 2.39.2