--- /dev/null
+From 04f5866e41fb70690e28397487d8bd8eea7d712a Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Thu, 18 Apr 2019 17:50:52 -0700
+Subject: coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping
+
+From: Andrea Arcangeli <aarcange@redhat.com>
+
+commit 04f5866e41fb70690e28397487d8bd8eea7d712a upstream.
+
+The core dumping code has always run without holding the mmap_sem for
+writing, despite that is the only way to ensure that the entire vma
+layout will not change from under it. Only using some signal
+serialization on the processes belonging to the mm is not nearly enough.
+This was pointed out earlier. For example in Hugh's post from Jul 2017:
+
+ https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils
+
+ "Not strictly relevant here, but a related note: I was very surprised
+ to discover, only quite recently, how handle_mm_fault() may be called
+ without down_read(mmap_sem) - when core dumping. That seems a
+ misguided optimization to me, which would also be nice to correct"
+
+In particular because the growsdown and growsup can move the
+vm_start/vm_end the various loops the core dump does around the vma will
+not be consistent if page faults can happen concurrently.
+
+Pretty much all users calling mmget_not_zero()/get_task_mm() and then
+taking the mmap_sem had the potential to introduce unexpected side
+effects in the core dumping code.
+
+Adding mmap_sem for writing around the ->core_dump invocation is a
+viable long term fix, but it requires removing all copy user and page
+faults and to replace them with get_dump_page() for all binary formats
+which is not suitable as a short term fix.
+
+For the time being this solution manually covers the places that can
+confuse the core dump either by altering the vma layout or the vma flags
+while it runs. Once ->core_dump runs under mmap_sem for writing the
+function mmget_still_valid() can be dropped.
+
+Allowing mmap_sem protected sections to run in parallel with the
+coredump provides some minor parallelism advantage to the swapoff code
+(which seems to be safe enough by never mangling any vma field and can
+keep doing swapins in parallel to the core dumping) and to some other
+corner case.
+
+In order to facilitate the backporting I added "Fixes: 86039bd3b4e6"
+however the side effect of this same race condition in /proc/pid/mem
+should be reproducible since before 2.6.12-rc2 so I couldn't add any
+other "Fixes:" because there's no hash beyond the git genesis commit.
+
+Because find_extend_vma() is the only location outside of the process
+context that could modify the "mm" structures under mmap_sem for
+reading, by adding the mmget_still_valid() check to it, all other cases
+that take the mmap_sem for reading don't need the new check after
+mmget_not_zero()/get_task_mm(). The expand_stack() in page fault
+context also doesn't need the new check, because all tasks under core
+dumping are frozen.
+
+Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com
+Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization")
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+Reported-by: Jann Horn <jannh@google.com>
+Suggested-by: Oleg Nesterov <oleg@redhat.com>
+Acked-by: Peter Xu <peterx@redhat.com>
+Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
+Reviewed-by: Oleg Nesterov <oleg@redhat.com>
+Reviewed-by: Jann Horn <jannh@google.com>
+Acked-by: Jason Gunthorpe <jgg@mellanox.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+[mhocko@suse.com: stable 4.4 backport
+ - drop infiniband part because of missing 5f9794dc94f59
+ - drop userfaultfd_event_wait_completion hunk because of
+ missing 9cd75c3cd4c3d]
+ - handle binder_update_page_range because of missing 720c241924046
+ - handle mlx5_ib_disassociate_ucontext - akaher@vmware.com
+]
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/android/binder.c | 6 ++++++
+ drivers/infiniband/hw/mlx4/main.c | 3 +++
+ fs/proc/task_mmu.c | 18 ++++++++++++++++++
+ fs/userfaultfd.c | 7 +++++++
+ include/linux/mm.h | 21 +++++++++++++++++++++
+ mm/mmap.c | 7 ++++++-
+ 6 files changed, 61 insertions(+), 1 deletion(-)
+
+--- a/drivers/android/binder.c
++++ b/drivers/android/binder.c
+@@ -570,6 +570,12 @@ static int binder_update_page_range(stru
+
+ if (mm) {
+ down_write(&mm->mmap_sem);
++ if (!mmget_still_valid(mm)) {
++ if (allocate == 0)
++ goto free_range;
++ goto err_no_vma;
++ }
++
+ vma = proc->vma;
+ if (vma && mm != proc->vma_vm_mm) {
+ pr_err("%d: vma mm and task mm mismatch\n",
+--- a/drivers/infiniband/hw/mlx4/main.c
++++ b/drivers/infiniband/hw/mlx4/main.c
+@@ -1042,6 +1042,8 @@ static void mlx4_ib_disassociate_ucontex
+ * mlx4_ib_vma_close().
+ */
+ down_write(&owning_mm->mmap_sem);
++ if (!mmget_still_valid(owning_mm))
++ goto skip_mm;
+ for (i = 0; i < HW_BAR_COUNT; i++) {
+ vma = context->hw_bar_info[i].vma;
+ if (!vma)
+@@ -1061,6 +1063,7 @@ static void mlx4_ib_disassociate_ucontex
+ context->hw_bar_info[i].vma->vm_ops = NULL;
+ }
+
++skip_mm:
+ up_write(&owning_mm->mmap_sem);
+ mmput(owning_mm);
+ put_task_struct(owning_process);
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -947,6 +947,24 @@ static ssize_t clear_refs_write(struct f
+ continue;
+ up_read(&mm->mmap_sem);
+ down_write(&mm->mmap_sem);
++ /*
++ * Avoid to modify vma->vm_flags
++ * without locked ops while the
++ * coredump reads the vm_flags.
++ */
++ if (!mmget_still_valid(mm)) {
++ /*
++ * Silently return "count"
++ * like if get_task_mm()
++ * failed. FIXME: should this
++ * function have returned
++ * -ESRCH if get_task_mm()
++ * failed like if
++ * get_proc_task() fails?
++ */
++ up_write(&mm->mmap_sem);
++ goto out_mm;
++ }
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ vma->vm_flags &= ~VM_SOFTDIRTY;
+ vma_set_page_prot(vma);
+--- a/fs/userfaultfd.c
++++ b/fs/userfaultfd.c
+@@ -446,6 +446,8 @@ static int userfaultfd_release(struct in
+ * taking the mmap_sem for writing.
+ */
+ down_write(&mm->mmap_sem);
++ if (!mmget_still_valid(mm))
++ goto skip_mm;
+ prev = NULL;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ cond_resched();
+@@ -468,6 +470,7 @@ static int userfaultfd_release(struct in
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ }
++skip_mm:
+ up_write(&mm->mmap_sem);
+ mmput(mm);
+ wakeup:
+@@ -769,6 +772,8 @@ static int userfaultfd_register(struct u
+ goto out;
+
+ down_write(&mm->mmap_sem);
++ if (!mmget_still_valid(mm))
++ goto out_unlock;
+ vma = find_vma_prev(mm, start, &prev);
+ if (!vma)
+ goto out_unlock;
+@@ -914,6 +919,8 @@ static int userfaultfd_unregister(struct
+ goto out;
+
+ down_write(&mm->mmap_sem);
++ if (!mmget_still_valid(mm))
++ goto out_unlock;
+ vma = find_vma_prev(mm, start, &prev);
+ if (!vma)
+ goto out_unlock;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -1098,6 +1098,27 @@ void zap_page_range(struct vm_area_struc
+ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
+ unsigned long start, unsigned long end);
+
++/*
++ * This has to be called after a get_task_mm()/mmget_not_zero()
++ * followed by taking the mmap_sem for writing before modifying the
++ * vmas or anything the coredump pretends not to change from under it.
++ *
++ * NOTE: find_extend_vma() called from GUP context is the only place
++ * that can modify the "mm" (notably the vm_start/end) under mmap_sem
++ * for reading and outside the context of the process, so it is also
++ * the only case that holds the mmap_sem for reading that must call
++ * this function. Generally if the mmap_sem is hold for reading
++ * there's no need of this check after get_task_mm()/mmget_not_zero().
++ *
++ * This function can be obsoleted and the check can be removed, after
++ * the coredump code will hold the mmap_sem for writing before
++ * invoking the ->core_dump methods.
++ */
++static inline bool mmget_still_valid(struct mm_struct *mm)
++{
++ return likely(!mm->core_state);
++}
++
+ /**
+ * mm_walk - callbacks for walk_page_range
+ * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -42,6 +42,7 @@
+ #include <linux/memory.h>
+ #include <linux/printk.h>
+ #include <linux/userfaultfd_k.h>
++#include <linux/mm.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/cacheflush.h>
+@@ -2398,7 +2399,8 @@ find_extend_vma(struct mm_struct *mm, un
+ vma = find_vma_prev(mm, addr, &prev);
+ if (vma && (vma->vm_start <= addr))
+ return vma;
+- if (!prev || expand_stack(prev, addr))
++ /* don't alter vm_end if the coredump is running */
++ if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
+ return NULL;
+ if (prev->vm_flags & VM_LOCKED)
+ populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+@@ -2424,6 +2426,9 @@ find_extend_vma(struct mm_struct *mm, un
+ return vma;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ return NULL;
++ /* don't alter vm_start if the coredump is running */
++ if (!mmget_still_valid(mm))
++ return NULL;
+ start = vma->vm_start;
+ if (expand_stack(vma, addr))
+ return NULL;