--- /dev/null
+From 59ea6d06cfa9247b586a695c21f94afa7183af74 Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Thu, 13 Jun 2019 15:56:11 -0700
+Subject: coredump: fix race condition between collapse_huge_page() and core dumping
+
+From: Andrea Arcangeli <aarcange@redhat.com>
+
+commit 59ea6d06cfa9247b586a695c21f94afa7183af74 upstream.
+
+When fixing the race conditions between the coredump and the mmap_sem
+holders outside the context of the process, we focused on
+mmget_not_zero()/get_task_mm() callers in 04f5866e41fb70 ("coredump: fix
+race condition between mmget_not_zero()/get_task_mm() and core
+dumping"), but those aren't the only cases where the mmap_sem can be
+taken outside of the context of the process, as Michal Hocko noticed
+while backporting that commit to older -stable kernels.
+
+If mmgrab() is called in the context of the process, but then the
+mm_count reference is transferred outside the context of the process,
+that can also be a problem if the mmap_sem has to be taken for writing
+through that mm_count reference.
+
+khugepaged registration calls mmgrab() in the context of the process,
+but the mmap_sem for writing is taken later in the context of the
+khugepaged kernel thread.
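+
+As a rough sketch of that flow (simplified and abbreviated, not the
+literal mm/khugepaged.c code):
+
+	/* process context: registration takes an mm_count pin */
+	mmgrab(mm);
+	insert_to_mm_slots_hash(mm, mm_slot);
+
+	/* khugepaged kernel thread, much later, outside the
+	 * context of the process:
+	 */
+	down_write(&mm->mmap_sem);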
+
+collapse_huge_page(), after taking the mmap_sem for writing, doesn't
+modify any vma, so it's not obvious that it could cause a problem for
+the coredump, but it happens to modify the pmd in a way that breaks an
+invariant that pmd_trans_huge_lock() relies upon. collapse_huge_page()
+needs the mmap_sem for writing just to block concurrent page faults that
+call pmd_trans_huge_lock().
+
+Specifically, the invariant that a "!pmd_trans_huge()" pmd cannot
+become "pmd_trans_huge()" doesn't hold while collapse_huge_page() runs.
+
+The coredump will call __get_user_pages() without mmap_sem for reading,
+which can eventually invoke a lockless page fault that needs a
+functional pmd_trans_huge_lock().
+
+So collapse_huge_page() needs to use mmget_still_valid() to check it's
+not running concurrently with the coredump... as long as the coredump
+can invoke page faults without holding the mmap_sem for reading.
+
+This has "Fixes: khugepaged" to facilitate backporting, but in my view
+it's more a bug in the coredump code that will eventually have to be
+rewritten to stop invoking page faults without the mmap_sem for reading.
+So the long term plan is still to drop all mmget_still_valid().
+
+Link: http://lkml.kernel.org/r/20190607161558.32104-1-aarcange@redhat.com
+Fixes: ba76149f47d8 ("thp: khugepaged")
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+Reported-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Jason Gunthorpe <jgg@mellanox.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[Ajay: Just adjusted to apply on v4.9]
+Signed-off-by: Ajay Kaher <akaher@vmware.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/mm.h | 4 ++++
+ mm/khugepaged.c | 3 +++
+ 2 files changed, 7 insertions(+)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -1197,6 +1197,10 @@ void unmap_vmas(struct mmu_gather *tlb,
+ * followed by taking the mmap_sem for writing before modifying the
+ * vmas or anything the coredump pretends not to change from under it.
+ *
++ * It also has to be called when mmgrab() is used in the context of
++ * the process, but then the mm_count refcount is transferred outside
++ * the context of the process to run down_write() on that pinned mm.
++ *
+ * NOTE: find_extend_vma() called from GUP context is the only place
+ * that can modify the "mm" (notably the vm_start/end) under mmap_sem
+ * for reading and outside the context of the process, so it is also
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -1004,6 +1004,9 @@ static void collapse_huge_page(struct mm
+ * handled by the anon_vma lock + PG_lock.
+ */
+ down_write(&mm->mmap_sem);
++ result = SCAN_ANY_PROCESS;
++ if (!mmget_still_valid(mm))
++ goto out;
+ result = hugepage_vma_revalidate(mm, address, &vma);
+ if (result)
+ goto out;
--- /dev/null
+From 04f5866e41fb70690e28397487d8bd8eea7d712a Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Thu, 18 Apr 2019 17:50:52 -0700
+Subject: coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping
+
+From: Andrea Arcangeli <aarcange@redhat.com>
+
+commit 04f5866e41fb70690e28397487d8bd8eea7d712a upstream.
+
+The core dumping code has always run without holding the mmap_sem for
+writing, even though that is the only way to ensure that the entire vma
+layout will not change from under it. Using only some signal
+serialization on the processes belonging to the mm is not nearly
+enough. This was pointed out earlier, for example in Hugh's post from
+Jul 2017:
+
+ https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils
+
+ "Not strictly relevant here, but a related note: I was very surprised
+ to discover, only quite recently, how handle_mm_fault() may be called
+ without down_read(mmap_sem) - when core dumping. That seems a
+ misguided optimization to me, which would also be nice to correct"
+
+In particular, because growsdown and growsup faults can move
+vm_start/vm_end, the various loops the core dump runs over the vmas
+will not be consistent if page faults can happen concurrently.
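+
+For instance, the ELF coredump dumps each vma's pages roughly along
+these lines (heavily simplified from fs/binfmt_elf.c), faulting memory
+in via get_dump_page() without holding the mmap_sem at all:
+
+	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+		struct page *page = get_dump_page(addr);
+		if (page) {	/* may run a page fault internally */
+			dump_emit(cprm, kmap(page), PAGE_SIZE);
+			kunmap(page);
+			put_page(page);
+		}
+	}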
+
+Pretty much all users calling mmget_not_zero()/get_task_mm() and then
+taking the mmap_sem had the potential to introduce unexpected side
+effects in the core dumping code.
+
+Taking the mmap_sem for writing around the ->core_dump invocation is a
+viable long term fix, but it requires removing all copy-user and page
+fault usage and replacing it with get_dump_page() in all binary
+formats, which is not suitable as a short term fix.
+
+For the time being this solution manually covers the places that can
+confuse the core dump by altering either the vma layout or the vma
+flags while it runs. Once ->core_dump runs under mmap_sem for writing,
+the function mmget_still_valid() can be dropped.
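+
+The pattern each covered place ends up following looks like this (a
+schematic sketch of the rule, not any single call site verbatim):
+
+	if (!mmget_not_zero(mm))	/* or get_task_mm() */
+		return;
+	down_write(&mm->mmap_sem);
+	if (!mmget_still_valid(mm))
+		/* the coredump is running: don't touch any vma */
+		goto out;
+	/* safe to change the vma layout or the vma flags here */
+out:
+	up_write(&mm->mmap_sem);
+	mmput(mm);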
+
+Allowing mmap_sem protected sections to run in parallel with the
+coredump provides a minor parallelism advantage to the swapoff code
+(which seems safe enough, since it never mangles any vma field and can
+keep doing swapins in parallel to the core dumping) and to a few other
+corner cases.
+
+In order to facilitate backporting I added "Fixes: 86039bd3b4e6";
+however, the side effect of this same race condition in /proc/pid/mem
+should be reproducible since before 2.6.12-rc2, so I couldn't add any
+other "Fixes:" because there's no hash beyond the git genesis commit.
+
+Because find_extend_vma() is the only place outside of the process
+context that can modify the "mm" structures under mmap_sem for reading,
+adding the mmget_still_valid() check there means that all other places
+that take the mmap_sem for reading don't need the new check after
+mmget_not_zero()/get_task_mm(). The expand_stack() in page fault
+context also doesn't need the new check, because all tasks under core
+dumping are frozen.
+
+Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com
+Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization")
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+Reported-by: Jann Horn <jannh@google.com>
+Suggested-by: Oleg Nesterov <oleg@redhat.com>
+Acked-by: Peter Xu <peterx@redhat.com>
+Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
+Reviewed-by: Oleg Nesterov <oleg@redhat.com>
+Reviewed-by: Jann Horn <jannh@google.com>
+Acked-by: Jason Gunthorpe <jgg@mellanox.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[akaher@vmware.com: stable 4.9 backport
+- handle binder_update_page_range - mhocko@suse.com]
+Signed-off-by: Ajay Kaher <akaher@vmware.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/android/binder.c | 6 ++++++
+ fs/proc/task_mmu.c | 18 ++++++++++++++++++
+ fs/userfaultfd.c | 9 +++++++++
+ include/linux/mm.h | 20 ++++++++++++++++++++
+ mm/mmap.c | 6 +++++-
+ 5 files changed, 58 insertions(+), 1 deletion(-)
+
+--- a/drivers/android/binder.c
++++ b/drivers/android/binder.c
+@@ -581,6 +581,12 @@ static int binder_update_page_range(stru
+
+ if (mm) {
+ down_write(&mm->mmap_sem);
++ if (!mmget_still_valid(mm)) {
++ if (allocate == 0)
++ goto free_range;
++ goto err_no_vma;
++ }
++
+ vma = proc->vma;
+ if (vma && mm != proc->vma_vm_mm) {
+ pr_err("%d: vma mm and task mm mismatch\n",
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -1057,6 +1057,24 @@ static ssize_t clear_refs_write(struct f
+ count = -EINTR;
+ goto out_mm;
+ }
++ /*
++			 * Avoid modifying vma->vm_flags
++ * without locked ops while the
++ * coredump reads the vm_flags.
++ */
++ if (!mmget_still_valid(mm)) {
++ /*
++			 * Silently return "count"
++			 * as if get_task_mm() had
++			 * failed. FIXME: should this
++			 * function have returned
++			 * -ESRCH if get_task_mm()
++			 * failed, as it does when
++			 * get_proc_task() fails?
++ */
++ up_write(&mm->mmap_sem);
++ goto out_mm;
++ }
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ vma->vm_flags &= ~VM_SOFTDIRTY;
+ vma_set_page_prot(vma);
+--- a/fs/userfaultfd.c
++++ b/fs/userfaultfd.c
+@@ -479,6 +479,8 @@ static int userfaultfd_release(struct in
+ * taking the mmap_sem for writing.
+ */
+ down_write(&mm->mmap_sem);
++ if (!mmget_still_valid(mm))
++ goto skip_mm;
+ prev = NULL;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ cond_resched();
+@@ -501,6 +503,7 @@ static int userfaultfd_release(struct in
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ }
++skip_mm:
+ up_write(&mm->mmap_sem);
+ mmput(mm);
+ wakeup:
+@@ -802,6 +805,9 @@ static int userfaultfd_register(struct u
+ goto out;
+
+ down_write(&mm->mmap_sem);
++ if (!mmget_still_valid(mm))
++ goto out_unlock;
++
+ vma = find_vma_prev(mm, start, &prev);
+ if (!vma)
+ goto out_unlock;
+@@ -947,6 +953,9 @@ static int userfaultfd_unregister(struct
+ goto out;
+
+ down_write(&mm->mmap_sem);
++ if (!mmget_still_valid(mm))
++ goto out_unlock;
++
+ vma = find_vma_prev(mm, start, &prev);
+ if (!vma)
+ goto out_unlock;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -1192,6 +1192,26 @@ void zap_page_range(struct vm_area_struc
+ unsigned long size, struct zap_details *);
+ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
+ unsigned long start, unsigned long end);
++/*
++ * This has to be called after a get_task_mm()/mmget_not_zero()
++ * followed by taking the mmap_sem for writing before modifying the
++ * vmas or anything the coredump pretends not to change from under it.
++ *
++ * NOTE: find_extend_vma() called from GUP context is the only place
++ * that can modify the "mm" (notably the vm_start/end) under mmap_sem
++ * for reading and outside the context of the process, so it is also
++ * the only case that holds the mmap_sem for reading that must call
++ * this function. Generally if the mmap_sem is held for reading
++ * there's no need for this check after get_task_mm()/mmget_not_zero().
++ *
++ * This function can be obsoleted and the check can be removed once
++ * the coredump code holds the mmap_sem for writing before
++ * invoking the ->core_dump methods.
++ */
++static inline bool mmget_still_valid(struct mm_struct *mm)
++{
++ return likely(!mm->core_state);
++}
+
+ /**
+ * mm_walk - callbacks for walk_page_range
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -2448,7 +2448,8 @@ find_extend_vma(struct mm_struct *mm, un
+ vma = find_vma_prev(mm, addr, &prev);
+ if (vma && (vma->vm_start <= addr))
+ return vma;
+- if (!prev || expand_stack(prev, addr))
++ /* don't alter vm_end if the coredump is running */
++ if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
+ return NULL;
+ if (prev->vm_flags & VM_LOCKED)
+ populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+@@ -2474,6 +2475,9 @@ find_extend_vma(struct mm_struct *mm, un
+ return vma;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ return NULL;
++ /* don't alter vm_start if the coredump is running */
++ if (!mmget_still_valid(mm))
++ return NULL;
+ start = vma->vm_start;
+ if (expand_stack(vma, addr))
+ return NULL;
--- /dev/null
+From akaher@vmware.com Mon Aug 5 08:01:12 2019
+From: Ajay Kaher <akaher@vmware.com>
+Date: Sun, 4 Aug 2019 09:29:26 +0530
+Subject: infiniband: fix race condition between infiniband mlx4, mlx5 driver and core dumping
+To: <aarcange@redhat.com>, <jannh@google.com>, <oleg@redhat.com>, <peterx@redhat.com>, <rppt@linux.ibm.com>, <jgg@mellanox.com>, <mhocko@suse.com>
+Cc: srinidhir@vmware.com, linux-kernel@vger.kernel.org, linux-mm@kvack.org, amakhalov@vmware.com, sean.hefty@intel.com, srivatsa@csail.mit.edu, srivatsab@vmware.com, devel@driverdev.osuosl.org, linux-rdma@vger.kernel.org, bvikas@vmware.com, dledford@redhat.com, akaher@vmware.com, riandrews@android.com, hal.rosenstock@gmail.com, vsirnapalli@vmware.com, leonro@mellanox.com, jglisse@redhat.com, viro@zeniv.linux.org.uk, gregkh@linuxfoundation.org, yishaih@mellanox.com, matanb@mellanox.com, stable@vger.kernel.org, arve@android.com, linux-fsdevel@vger.kernel.org, akpm@linux-foundation.org, torvalds@linux-foundation.org, mike.kravetz@oracle.com
+Message-ID: <1564891168-30016-2-git-send-email-akaher@vmware.com>
+
+From: Ajay Kaher <akaher@vmware.com>
+
+This patch is an extension of the following upstream commit, fixing
+the race condition between get_task_mm() and core dumping
+for the IB->mlx4 and IB->mlx5 drivers:
+
+commit 04f5866e41fb ("coredump: fix race condition between
+mmget_not_zero()/get_task_mm() and core dumping")
+
+Thanks to Jason for pointing this out.
+
+Signed-off-by: Ajay Kaher <akaher@vmware.com>
+Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/hw/mlx4/main.c | 4 +++-
+ drivers/infiniband/hw/mlx5/main.c | 3 +++
+ 2 files changed, 6 insertions(+), 1 deletion(-)
+
+--- a/drivers/infiniband/hw/mlx4/main.c
++++ b/drivers/infiniband/hw/mlx4/main.c
+@@ -1172,6 +1172,8 @@ static void mlx4_ib_disassociate_ucontex
+ * mlx4_ib_vma_close().
+ */
+ down_write(&owning_mm->mmap_sem);
++ if (!mmget_still_valid(owning_mm))
++ goto skip_mm;
+ for (i = 0; i < HW_BAR_COUNT; i++) {
+ vma = context->hw_bar_info[i].vma;
+ if (!vma)
+@@ -1190,7 +1192,7 @@ static void mlx4_ib_disassociate_ucontex
+ /* context going to be destroyed, should not access ops any more */
+ context->hw_bar_info[i].vma->vm_ops = NULL;
+ }
+-
++skip_mm:
+ up_write(&owning_mm->mmap_sem);
+ mmput(owning_mm);
+ put_task_struct(owning_process);
+--- a/drivers/infiniband/hw/mlx5/main.c
++++ b/drivers/infiniband/hw/mlx5/main.c
+@@ -1307,6 +1307,8 @@ static void mlx5_ib_disassociate_ucontex
+ * mlx5_ib_vma_close.
+ */
+ down_write(&owning_mm->mmap_sem);
++ if (!mmget_still_valid(owning_mm))
++ goto skip_mm;
+ list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
+ list) {
+ vma = vma_private->vma;
+@@ -1321,6 +1323,7 @@ static void mlx5_ib_disassociate_ucontex
+ list_del(&vma_private->list);
+ kfree(vma_private);
+ }
++skip_mm:
+ up_write(&owning_mm->mmap_sem);
+ mmput(owning_mm);
+ put_task_struct(owning_process);