From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 5 Aug 2019 06:03:20 +0000 (+0200)
Subject: 4.9-stable patches
X-Git-Tag: v4.4.188~14
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=205ab887b2917a8506df77b610e05bb141fcf824;p=thirdparty%2Fkernel%2Fstable-queue.git

4.9-stable patches

added patches:
	coredump-fix-race-condition-between-collapse_huge_page-and-core-dumping.patch
	coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch
	infiniband-fix-race-condition-between-infiniband-mlx4-mlx5-driver-and-core-dumping.patch
---

diff --git a/queue-4.9/coredump-fix-race-condition-between-collapse_huge_page-and-core-dumping.patch b/queue-4.9/coredump-fix-race-condition-between-collapse_huge_page-and-core-dumping.patch
new file mode 100644
index 00000000000..8c920cef856
--- /dev/null
+++ b/queue-4.9/coredump-fix-race-condition-between-collapse_huge_page-and-core-dumping.patch
@@ -0,0 +1,99 @@
+From 59ea6d06cfa9247b586a695c21f94afa7183af74 Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Thu, 13 Jun 2019 15:56:11 -0700
+Subject: coredump: fix race condition between collapse_huge_page() and core dumping
+
+From: Andrea Arcangeli <aarcange@redhat.com>
+
+commit 59ea6d06cfa9247b586a695c21f94afa7183af74 upstream.
+
+When fixing the race conditions between the coredump and the mmap_sem
+holders outside the context of the process, we focused on
+mmget_not_zero()/get_task_mm() callers in 04f5866e41fb70 ("coredump: fix
+race condition between mmget_not_zero()/get_task_mm() and core
+dumping"), but those aren't the only cases where the mmap_sem can be
+taken outside of the context of the process as Michal Hocko noticed
+while backporting that commit to older -stable kernels.
+
+If mmgrab() is called in the context of the process, but then the
+mm_count reference is transferred outside the context of the process,
+that can also be a problem if the mmap_sem has to be taken for writing
+through that mm_count reference.
+
+khugepaged registration calls mmgrab() in the context of the process,
+but the mmap_sem for writing is taken later in the context of the
+khugepaged kernel thread.
+
+collapse_huge_page() after taking the mmap_sem for writing doesn't
+modify any vma, so it's not obvious that it could cause a problem to the
+coredump, but it happens to modify the pmd in a way that breaks an
+invariant that pmd_trans_huge_lock() relies upon.  collapse_huge_page()
+needs the mmap_sem for writing just to block concurrent page faults that
+call pmd_trans_huge_lock().
+
+Specifically the invariant that "!pmd_trans_huge()" cannot become a
+"pmd_trans_huge()" doesn't hold while collapse_huge_page() runs.
+
+The coredump will call __get_user_pages() without mmap_sem for reading,
+which eventually can invoke a lockless page fault which will need a
+functional pmd_trans_huge_lock().
+
+So collapse_huge_page() needs to use mmget_still_valid() to check it's
+not running concurrently with the coredump...  as long as the coredump
+can invoke page faults without holding the mmap_sem for reading.
+
+This has "Fixes: khugepaged" to facilitate backporting, but in my view
+it's more a bug in the coredump code that will eventually have to be
+rewritten to stop invoking page faults without the mmap_sem for reading.
+So the long term plan is still to drop all mmget_still_valid().
+
+Link: http://lkml.kernel.org/r/20190607161558.32104-1-aarcange@redhat.com
+Fixes: ba76149f47d8 ("thp: khugepaged")
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+Reported-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Jason Gunthorpe <jgg@mellanox.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[Ajay: Just adjusted to apply on v4.9]
+Signed-off-by: Ajay Kaher <akaher@vmware.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/mm.h |    4 ++++
+ mm/khugepaged.c    |    3 +++
+ 2 files changed, 7 insertions(+)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -1197,6 +1197,10 @@ void unmap_vmas(struct mmu_gather *tlb,
+  * followed by taking the mmap_sem for writing before modifying the
+  * vmas or anything the coredump pretends not to change from under it.
+  *
++ * It also has to be called when mmgrab() is used in the context of
++ * the process, but then the mm_count refcount is transferred outside
++ * the context of the process to run down_write() on that pinned mm.
++ *
+  * NOTE: find_extend_vma() called from GUP context is the only place
+  * that can modify the "mm" (notably the vm_start/end) under mmap_sem
+  * for reading and outside the context of the process, so it is also
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -1004,6 +1004,9 @@ static void collapse_huge_page(struct mm
+ 	 * handled by the anon_vma lock + PG_lock.
+ 	 */
+ 	down_write(&mm->mmap_sem);
++	result = SCAN_ANY_PROCESS;
++	if (!mmget_still_valid(mm))
++		goto out;
+ 	result = hugepage_vma_revalidate(mm, address, &vma);
+ 	if (result)
+ 		goto out;
diff --git a/queue-4.9/coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch b/queue-4.9/coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch
new file mode 100644
index 00000000000..91fb83859eb
--- /dev/null
+++ b/queue-4.9/coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch
@@ -0,0 +1,217 @@
+From 04f5866e41fb70690e28397487d8bd8eea7d712a Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Thu, 18 Apr 2019 17:50:52 -0700
+Subject: coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping
+
+From: Andrea Arcangeli <aarcange@redhat.com>
+
+commit 04f5866e41fb70690e28397487d8bd8eea7d712a upstream.
+
+The core dumping code has always run without holding the mmap_sem for
+writing, despite that is the only way to ensure that the entire vma
+layout will not change from under it.  Only using some signal
+serialization on the processes belonging to the mm is not nearly enough.
+This was pointed out earlier.  For example in Hugh's post from Jul 2017:
+
+  https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils
+
+  "Not strictly relevant here, but a related note: I was very surprised
+   to discover, only quite recently, how handle_mm_fault() may be called
+   without down_read(mmap_sem) - when core dumping. That seems a
+   misguided optimization to me, which would also be nice to correct"
+
+In particular because the growsdown and growsup can move the
+vm_start/vm_end the various loops the core dump does around the vma will
+not be consistent if page faults can happen concurrently.
+
+Pretty much all users calling mmget_not_zero()/get_task_mm() and then
+taking the mmap_sem had the potential to introduce unexpected side
+effects in the core dumping code.
+
+Adding mmap_sem for writing around the ->core_dump invocation is a
+viable long term fix, but it requires removing all copy user and page
+faults and to replace them with get_dump_page() for all binary formats
+which is not suitable as a short term fix.
+
+For the time being this solution manually covers the places that can
+confuse the core dump either by altering the vma layout or the vma flags
+while it runs.  Once ->core_dump runs under mmap_sem for writing the
+function mmget_still_valid() can be dropped.
+
+Allowing mmap_sem protected sections to run in parallel with the
+coredump provides some minor parallelism advantage to the swapoff code
+(which seems to be safe enough by never mangling any vma field and can
+keep doing swapins in parallel to the core dumping) and to some other
+corner case.
+
+In order to facilitate the backporting I added "Fixes: 86039bd3b4e6"
+however the side effect of this same race condition in /proc/pid/mem
+should be reproducible since before 2.6.12-rc2 so I couldn't add any
+other "Fixes:" because there's no hash beyond the git genesis commit.
+
+Because find_extend_vma() is the only location outside of the process
+context that could modify the "mm" structures under mmap_sem for
+reading, by adding the mmget_still_valid() check to it, all other cases
+that take the mmap_sem for reading don't need the new check after
+mmget_not_zero()/get_task_mm().  The expand_stack() in page fault
+context also doesn't need the new check, because all tasks under core
+dumping are frozen.
+
+Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com
+Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization")
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+Reported-by: Jann Horn <jannh@google.com>
+Suggested-by: Oleg Nesterov <oleg@redhat.com>
+Acked-by: Peter Xu <peterx@redhat.com>
+Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
+Reviewed-by: Oleg Nesterov <oleg@redhat.com>
+Reviewed-by: Jann Horn <jannh@google.com>
+Acked-by: Jason Gunthorpe <jgg@mellanox.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[akaher@vmware.com: stable 4.9 backport
+-  handle binder_update_page_range - mhocko@suse.com]
+Signed-off-by: Ajay Kaher <akaher@vmware.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/android/binder.c |    6 ++++++
+ fs/proc/task_mmu.c       |   18 ++++++++++++++++++
+ fs/userfaultfd.c         |    9 +++++++++
+ include/linux/mm.h       |   20 ++++++++++++++++++++
+ mm/mmap.c                |    6 +++++-
+ 5 files changed, 58 insertions(+), 1 deletion(-)
+
+--- a/drivers/android/binder.c
++++ b/drivers/android/binder.c
+@@ -581,6 +581,12 @@ static int binder_update_page_range(stru
+ 
+ 	if (mm) {
+ 		down_write(&mm->mmap_sem);
++		if (!mmget_still_valid(mm)) {
++			if (allocate == 0)
++				goto free_range;
++			goto err_no_vma;
++		}
++
+ 		vma = proc->vma;
+ 		if (vma && mm != proc->vma_vm_mm) {
+ 			pr_err("%d: vma mm and task mm mismatch\n",
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -1057,6 +1057,24 @@ static ssize_t clear_refs_write(struct f
+ 					count = -EINTR;
+ 					goto out_mm;
+ 				}
++				/*
++				 * Avoid to modify vma->vm_flags
++				 * without locked ops while the
++				 * coredump reads the vm_flags.
++				 */
++				if (!mmget_still_valid(mm)) {
++					/*
++					 * Silently return "count"
++					 * like if get_task_mm()
++					 * failed. FIXME: should this
++					 * function have returned
++					 * -ESRCH if get_task_mm()
++					 * failed like if
++					 * get_proc_task() fails?
++					 */
++					up_write(&mm->mmap_sem);
++					goto out_mm;
++				}
+ 				for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ 					vma->vm_flags &= ~VM_SOFTDIRTY;
+ 					vma_set_page_prot(vma);
+--- a/fs/userfaultfd.c
++++ b/fs/userfaultfd.c
+@@ -479,6 +479,8 @@ static int userfaultfd_release(struct in
+ 	 * taking the mmap_sem for writing.
+ 	 */
+ 	down_write(&mm->mmap_sem);
++	if (!mmget_still_valid(mm))
++		goto skip_mm;
+ 	prev = NULL;
+ 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ 		cond_resched();
+@@ -501,6 +503,7 @@ static int userfaultfd_release(struct in
+ 		vma->vm_flags = new_flags;
+ 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ 	}
++skip_mm:
+ 	up_write(&mm->mmap_sem);
+ 	mmput(mm);
+ wakeup:
+@@ -802,6 +805,9 @@ static int userfaultfd_register(struct u
+ 		goto out;
+ 
+ 	down_write(&mm->mmap_sem);
++	if (!mmget_still_valid(mm))
++		goto out_unlock;
++
+ 	vma = find_vma_prev(mm, start, &prev);
+ 	if (!vma)
+ 		goto out_unlock;
+@@ -947,6 +953,9 @@ static int userfaultfd_unregister(struct
+ 		goto out;
+ 
+ 	down_write(&mm->mmap_sem);
++	if (!mmget_still_valid(mm))
++		goto out_unlock;
++
+ 	vma = find_vma_prev(mm, start, &prev);
+ 	if (!vma)
+ 		goto out_unlock;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -1192,6 +1192,26 @@ void zap_page_range(struct vm_area_struc
+ 		unsigned long size, struct zap_details *);
+ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
+ 		unsigned long start, unsigned long end);
++/*
++ * This has to be called after a get_task_mm()/mmget_not_zero()
++ * followed by taking the mmap_sem for writing before modifying the
++ * vmas or anything the coredump pretends not to change from under it.
++ *
++ * NOTE: find_extend_vma() called from GUP context is the only place
++ * that can modify the "mm" (notably the vm_start/end) under mmap_sem
++ * for reading and outside the context of the process, so it is also
++ * the only case that holds the mmap_sem for reading that must call
++ * this function. Generally if the mmap_sem is hold for reading
++ * there's no need of this check after get_task_mm()/mmget_not_zero().
++ *
++ * This function can be obsoleted and the check can be removed, after
++ * the coredump code will hold the mmap_sem for writing before
++ * invoking the ->core_dump methods.
++ */
++static inline bool mmget_still_valid(struct mm_struct *mm)
++{
++	return likely(!mm->core_state);
++}
+ 
+ /**
+  * mm_walk - callbacks for walk_page_range
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -2448,7 +2448,8 @@ find_extend_vma(struct mm_struct *mm, un
+ 	vma = find_vma_prev(mm, addr, &prev);
+ 	if (vma && (vma->vm_start <= addr))
+ 		return vma;
+-	if (!prev || expand_stack(prev, addr))
++	/* don't alter vm_end if the coredump is running */
++	if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
+ 		return NULL;
+ 	if (prev->vm_flags & VM_LOCKED)
+ 		populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+@@ -2474,6 +2475,9 @@ find_extend_vma(struct mm_struct *mm, un
+ 		return vma;
+ 	if (!(vma->vm_flags & VM_GROWSDOWN))
+ 		return NULL;
++	/* don't alter vm_start if the coredump is running */
++	if (!mmget_still_valid(mm))
++		return NULL;
+ 	start = vma->vm_start;
+ 	if (expand_stack(vma, addr))
+ 		return NULL;
diff --git a/queue-4.9/infiniband-fix-race-condition-between-infiniband-mlx4-mlx5-driver-and-core-dumping.patch b/queue-4.9/infiniband-fix-race-condition-between-infiniband-mlx4-mlx5-driver-and-core-dumping.patch
new file mode 100644
index 00000000000..04f1f85eeda
--- /dev/null
+++ b/queue-4.9/infiniband-fix-race-condition-between-infiniband-mlx4-mlx5-driver-and-core-dumping.patch
@@ -0,0 +1,66 @@
+From akaher@vmware.com  Mon Aug  5 08:01:12 2019
+From: Ajay Kaher <akaher@vmware.com>
+Date: Sun, 4 Aug 2019 09:29:26 +0530
+Subject: infiniband: fix race condition between infiniband mlx4, mlx5  driver and core dumping
+To: <aarcange@redhat.com>, <jannh@google.com>, <oleg@redhat.com>, <peterx@redhat.com>, <rppt@linux.ibm.com>, <jgg@mellanox.com>, <mhocko@suse.com>
+Cc: srinidhir@vmware.com, linux-kernel@vger.kernel.org, linux-mm@kvack.org, amakhalov@vmware.com, sean.hefty@intel.com, srivatsa@csail.mit.edu, srivatsab@vmware.com, devel@driverdev.osuosl.org, linux-rdma@vger.kernel.org, bvikas@vmware.com, dledford@redhat.com, akaher@vmware.com, riandrews@android.com, hal.rosenstock@gmail.com, vsirnapalli@vmware.com, leonro@mellanox.com, jglisse@redhat.com, viro@zeniv.linux.org.uk, gregkh@linuxfoundation.org, yishaih@mellanox.com, matanb@mellanox.com, stable@vger.kernel.org, arve@android.com, linux-fsdevel@vger.kernel.org, akpm@linux-foundation.org, torvalds@linux-foundation.org, mike.kravetz@oracle.com
+Message-ID: <1564891168-30016-2-git-send-email-akaher@vmware.com>
+
+From: Ajay Kaher <akaher@vmware.com>
+
+This patch is the extension of following upstream commit to fix
+the race condition between get_task_mm() and core dumping
+for IB->mlx4 and IB->mlx5 drivers:
+
+commit 04f5866e41fb ("coredump: fix race condition between
+mmget_not_zero()/get_task_mm() and core dumping")'
+
+Thanks to Jason for pointing this.
+
+Signed-off-by: Ajay Kaher <akaher@vmware.com>
+Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/hw/mlx4/main.c |    4 +++-
+ drivers/infiniband/hw/mlx5/main.c |    3 +++
+ 2 files changed, 6 insertions(+), 1 deletion(-)
+
+--- a/drivers/infiniband/hw/mlx4/main.c
++++ b/drivers/infiniband/hw/mlx4/main.c
+@@ -1172,6 +1172,8 @@ static void mlx4_ib_disassociate_ucontex
+ 	 * mlx4_ib_vma_close().
+ 	 */
+ 	down_write(&owning_mm->mmap_sem);
++	if (!mmget_still_valid(owning_mm))
++		goto skip_mm;
+ 	for (i = 0; i < HW_BAR_COUNT; i++) {
+ 		vma = context->hw_bar_info[i].vma;
+ 		if (!vma)
+@@ -1190,7 +1192,7 @@ static void mlx4_ib_disassociate_ucontex
+ 		/* context going to be destroyed, should not access ops any more */
+ 		context->hw_bar_info[i].vma->vm_ops = NULL;
+ 	}
+-
++skip_mm:
+ 	up_write(&owning_mm->mmap_sem);
+ 	mmput(owning_mm);
+ 	put_task_struct(owning_process);
+--- a/drivers/infiniband/hw/mlx5/main.c
++++ b/drivers/infiniband/hw/mlx5/main.c
+@@ -1307,6 +1307,8 @@ static void mlx5_ib_disassociate_ucontex
+ 	 * mlx5_ib_vma_close.
+ 	 */
+ 	down_write(&owning_mm->mmap_sem);
++	if (!mmget_still_valid(owning_mm))
++		goto skip_mm;
+ 	list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
+ 				 list) {
+ 		vma = vma_private->vma;
+@@ -1321,6 +1323,7 @@ static void mlx5_ib_disassociate_ucontex
+ 		list_del(&vma_private->list);
+ 		kfree(vma_private);
+ 	}
++skip_mm:
+ 	up_write(&owning_mm->mmap_sem);
+ 	mmput(owning_mm);
+ 	put_task_struct(owning_process);
diff --git a/queue-4.9/series b/queue-4.9/series
index a1cf112b95a..70662e29a32 100644
--- a/queue-4.9/series
+++ b/queue-4.9/series
@@ -32,3 +32,6 @@ s390-dasd-fix-endless-loop-after-read-unit-address-configuration.patch
 drivers-perf-arm_pmu-fix-failure-path-in-pm-notifier.patch
 xen-swiotlb-fix-condition-for-calling-xen_destroy_contiguous_region.patch
 ib-mlx5-fix-rss-toeplitz-setup-to-be-aligned-with-the-hw-specification.patch
+coredump-fix-race-condition-between-mmget_not_zero-get_task_mm-and-core-dumping.patch
+infiniband-fix-race-condition-between-infiniband-mlx4-mlx5-driver-and-core-dumping.patch
+coredump-fix-race-condition-between-collapse_huge_page-and-core-dumping.patch