From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 28 Jun 2023 18:28:31 +0000 (+0200)
Subject: 6.1-stable patches
X-Git-Tag: v6.4.1~52
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=3b50a3d88b9ce2f318e320184d39972b24a39b3f;p=thirdparty%2Fkernel%2Fstable-queue.git

6.1-stable patches

added patches:
	mm-hwpoison-try-to-recover-from-copy-on-write-faults.patch
	mm-hwpoison-when-copy-on-write-hits-poison-take-page-offline.patch
	mptcp-ensure-listener-is-unhashed-before-updating-the-sk-status.patch
---

diff --git a/queue-6.1/mm-hwpoison-try-to-recover-from-copy-on-write-faults.patch b/queue-6.1/mm-hwpoison-try-to-recover-from-copy-on-write-faults.patch
new file mode 100644
index 00000000000..63bc3270cb1
--- /dev/null
+++ b/queue-6.1/mm-hwpoison-try-to-recover-from-copy-on-write-faults.patch
@@ -0,0 +1,214 @@
+From a873dfe1032a132bf89f9e19a6ac44f5a0b78754 Mon Sep 17 00:00:00 2001
+From: Tony Luck <tony.luck@intel.com>
+Date: Fri, 21 Oct 2022 13:01:19 -0700
+Subject: mm, hwpoison: try to recover from copy-on write faults
+
+From: Tony Luck <tony.luck@intel.com>
+
+commit a873dfe1032a132bf89f9e19a6ac44f5a0b78754 upstream.
+
+Patch series "Copy-on-write poison recovery", v3.
+
+Part 1 deals with the process that triggered the copy on write fault with
+a store to a shared read-only page.  That process is sent a SIGBUS with
+the usual machine check decoration to specify the virtual address of the
+lost page, together with the scope.
+
+Part 2 sets up to asynchronously take the page with the uncorrected error
+offline to prevent additional machine check faults.  H/t to Miaohe Lin
+<linmiaohe@huawei.com> and Shuai Xue <xueshuai@linux.alibaba.com> for
+pointing me to the existing function to queue a call to memory_failure().
+
+On x86 there is some duplicate reporting (because the error is also
+signalled by the memory controller as well as by the core that triggered
+the machine check).  Console logs look like this:
+
+
+This patch (of 2):
+
+If the kernel is copying a page as the result of a copy-on-write
+fault and runs into an uncorrectable error, Linux will crash because
+it does not have recovery code for this case where poison is consumed
+by the kernel.
+
+It is easy to set up a test case. Just inject an error into a private
+page, fork(2), and have the child process write to the page.
+
+I wrapped that neatly into a test at:
+
+  git://git.kernel.org/pub/scm/linux/kernel/git/aegl/ras-tools.git
+
+just enable ACPI error injection and run:
+
+  # ./einj_mem-uc -f copy-on-write
+
+Add a new copy_mc_user_highpage() function that uses copy_mc_to_kernel()
+on architectures where that is available (currently x86 and powerpc).
+When an error is detected during the page copy, return VM_FAULT_HWPOISON
+to the caller of wp_page_copy(). This propagates up the call stack. Both x86
+and powerpc have code in their fault handler to deal with this code by
+sending a SIGBUS to the application.
+
+Note that this patch avoids a system crash and signals the process that
+triggered the copy-on-write action. It does not take any action for the
+memory error that is still in the shared page. To handle that a call to
+memory_failure() is needed. But this cannot be done from wp_page_copy()
+because it holds mmap_lock(). Perhaps the architecture fault handlers
+can deal with this loose end in a subsequent patch?
+
+On Intel/x86 this loose end will often be handled automatically because
+the memory controller provides an additional notification of the h/w
+poison in memory, the handler for this will call memory_failure(). This
+isn't a 100% solution. If there are multiple errors, not all may be
+logged in this way.
+
+[tony.luck@intel.com: add call to kmsan_unpoison_memory(), per Miaohe Lin]
+  Link: https://lkml.kernel.org/r/20221031201029.102123-2-tony.luck@intel.com
+Link: https://lkml.kernel.org/r/20221021200120.175753-1-tony.luck@intel.com
+Link: https://lkml.kernel.org/r/20221021200120.175753-2-tony.luck@intel.com
+Signed-off-by: Tony Luck <tony.luck@intel.com>
+Reviewed-by: Dan Williams <dan.j.williams@intel.com>
+Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
+Reviewed-by: Alexander Potapenko <glider@google.com>
+Tested-by: Shuai Xue <xueshuai@linux.alibaba.com>
+Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michael Ellerman <mpe@ellerman.id.au>
+Cc: Nicholas Piggin <npiggin@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Jane Chu <jane.chu@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/highmem.h |   26 ++++++++++++++++++++++++++
+ mm/memory.c             |   30 ++++++++++++++++++++----------
+ 2 files changed, 46 insertions(+), 10 deletions(-)
+
+--- a/include/linux/highmem.h
++++ b/include/linux/highmem.h
+@@ -319,6 +319,32 @@ static inline void copy_user_highpage(st
+ 
+ #endif
+ 
++#ifdef copy_mc_to_kernel
++static inline int copy_mc_user_highpage(struct page *to, struct page *from,
++					unsigned long vaddr, struct vm_area_struct *vma)
++{
++	unsigned long ret;
++	char *vfrom, *vto;
++
++	vfrom = kmap_local_page(from);
++	vto = kmap_local_page(to);
++	ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
++	if (!ret)
++		kmsan_unpoison_memory(page_address(to), PAGE_SIZE);
++	kunmap_local(vto);
++	kunmap_local(vfrom);
++
++	return ret;
++}
++#else
++static inline int copy_mc_user_highpage(struct page *to, struct page *from,
++					unsigned long vaddr, struct vm_area_struct *vma)
++{
++	copy_user_highpage(to, from, vaddr, vma);
++	return 0;
++}
++#endif
++
+ #ifndef __HAVE_ARCH_COPY_HIGHPAGE
+ 
+ static inline void copy_highpage(struct page *to, struct page *from)
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -2843,10 +2843,16 @@ static inline int pte_unmap_same(struct
+ 	return same;
+ }
+ 
+-static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
+-				       struct vm_fault *vmf)
++/*
++ * Return:
++ *	0:		copied succeeded
++ *	-EHWPOISON:	copy failed due to hwpoison in source page
++ *	-EAGAIN:	copied failed (some other reason)
++ */
++static inline int __wp_page_copy_user(struct page *dst, struct page *src,
++				      struct vm_fault *vmf)
+ {
+-	bool ret;
++	int ret;
+ 	void *kaddr;
+ 	void __user *uaddr;
+ 	bool locked = false;
+@@ -2855,8 +2861,9 @@ static inline bool __wp_page_copy_user(s
+ 	unsigned long addr = vmf->address;
+ 
+ 	if (likely(src)) {
+-		copy_user_highpage(dst, src, addr, vma);
+-		return true;
++		if (copy_mc_user_highpage(dst, src, addr, vma))
++			return -EHWPOISON;
++		return 0;
+ 	}
+ 
+ 	/*
+@@ -2883,7 +2890,7 @@ static inline bool __wp_page_copy_user(s
+ 			 * and update local tlb only
+ 			 */
+ 			update_mmu_tlb(vma, addr, vmf->pte);
+-			ret = false;
++			ret = -EAGAIN;
+ 			goto pte_unlock;
+ 		}
+ 
+@@ -2908,7 +2915,7 @@ static inline bool __wp_page_copy_user(s
+ 		if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ 			/* The PTE changed under us, update local tlb */
+ 			update_mmu_tlb(vma, addr, vmf->pte);
+-			ret = false;
++			ret = -EAGAIN;
+ 			goto pte_unlock;
+ 		}
+ 
+@@ -2927,7 +2934,7 @@ warn:
+ 		}
+ 	}
+ 
+-	ret = true;
++	ret = 0;
+ 
+ pte_unlock:
+ 	if (locked)
+@@ -3099,6 +3106,7 @@ static vm_fault_t wp_page_copy(struct vm
+ 	pte_t entry;
+ 	int page_copied = 0;
+ 	struct mmu_notifier_range range;
++	int ret;
+ 
+ 	delayacct_wpcopy_start();
+ 
+@@ -3116,19 +3124,21 @@ static vm_fault_t wp_page_copy(struct vm
+ 		if (!new_page)
+ 			goto oom;
+ 
+-		if (!__wp_page_copy_user(new_page, old_page, vmf)) {
++		ret = __wp_page_copy_user(new_page, old_page, vmf);
++		if (ret) {
+ 			/*
+ 			 * COW failed, if the fault was solved by other,
+ 			 * it's fine. If not, userspace would re-fault on
+ 			 * the same address and we will handle the fault
+ 			 * from the second attempt.
++			 * The -EHWPOISON case will not be retried.
+ 			 */
+ 			put_page(new_page);
+ 			if (old_page)
+ 				put_page(old_page);
+ 
+ 			delayacct_wpcopy_end();
+-			return 0;
++			return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
+ 		}
+ 		kmsan_copy_page_meta(new_page, old_page);
+ 	}
diff --git a/queue-6.1/mm-hwpoison-when-copy-on-write-hits-poison-take-page-offline.patch b/queue-6.1/mm-hwpoison-when-copy-on-write-hits-poison-take-page-offline.patch
new file mode 100644
index 00000000000..e7fb68e1fb2
--- /dev/null
+++ b/queue-6.1/mm-hwpoison-when-copy-on-write-hits-poison-take-page-offline.patch
@@ -0,0 +1,83 @@
+From d302c2398ba269e788a4f37ae57c07a7fcabaa42 Mon Sep 17 00:00:00 2001
+From: Tony Luck <tony.luck@intel.com>
+Date: Fri, 21 Oct 2022 13:01:20 -0700
+Subject: mm, hwpoison: when copy-on-write hits poison, take page offline
+
+From: Tony Luck <tony.luck@intel.com>
+
+commit d302c2398ba269e788a4f37ae57c07a7fcabaa42 upstream.
+
+Cannot call memory_failure() directly from the fault handler because
+mmap_lock (and others) are held.
+
+It is important, but not urgent, to mark the source page as h/w poisoned
+and unmap it from other tasks.
+
+Use memory_failure_queue() to request a call to memory_failure() for the
+page with the error.
+
+Also provide a stub version for CONFIG_MEMORY_FAILURE=n
+
+Link: https://lkml.kernel.org/r/20221021200120.175753-3-tony.luck@intel.com
+Signed-off-by: Tony Luck <tony.luck@intel.com>
+Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michael Ellerman <mpe@ellerman.id.au>
+Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Cc: Nicholas Piggin <npiggin@gmail.com>
+Cc: Shuai Xue <xueshuai@linux.alibaba.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+[ Due to missing commits
+  e591ef7d96d6e ("mm,hwpoison,hugetlb,memory_hotplug: hotremove memory section with hwpoisoned hugepage")
+  5033091de814a ("mm/hwpoison: introduce per-memory_block hwpoison counter")
+  The impact of e591ef7d96d6e is its introduction of an additional flag in
+  __get_huge_page_for_hwpoison() that serves as an indication a hwpoisoned
+  hugetlb page should have its migratable bit cleared.
+  The impact of 5033091de814a is contextual.
+  Resolve by ignoring both missing commits. - jane]
+Signed-off-by: Jane Chu <jane.chu@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h |    5 ++++-
+ mm/memory.c        |    4 +++-
+ 2 files changed, 7 insertions(+), 2 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3295,7 +3295,6 @@ enum mf_flags {
+ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
+ 		      unsigned long count, int mf_flags);
+ extern int memory_failure(unsigned long pfn, int flags);
+-extern void memory_failure_queue(unsigned long pfn, int flags);
+ extern void memory_failure_queue_kick(int cpu);
+ extern int unpoison_memory(unsigned long pfn);
+ extern int sysctl_memory_failure_early_kill;
+@@ -3304,8 +3303,12 @@ extern void shake_page(struct page *p);
+ extern atomic_long_t num_poisoned_pages __read_mostly;
+ extern int soft_offline_page(unsigned long pfn, int flags);
+ #ifdef CONFIG_MEMORY_FAILURE
++extern void memory_failure_queue(unsigned long pfn, int flags);
+ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags);
+ #else
++static inline void memory_failure_queue(unsigned long pfn, int flags)
++{
++}
+ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+ {
+ 	return 0;
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -2861,8 +2861,10 @@ static inline int __wp_page_copy_user(st
+ 	unsigned long addr = vmf->address;
+ 
+ 	if (likely(src)) {
+-		if (copy_mc_user_highpage(dst, src, addr, vma))
++		if (copy_mc_user_highpage(dst, src, addr, vma)) {
++			memory_failure_queue(page_to_pfn(src), 0);
+ 			return -EHWPOISON;
++		}
+ 		return 0;
+ 	}
+ 
diff --git a/queue-6.1/mptcp-ensure-listener-is-unhashed-before-updating-the-sk-status.patch b/queue-6.1/mptcp-ensure-listener-is-unhashed-before-updating-the-sk-status.patch
new file mode 100644
index 00000000000..e04bcc90684
--- /dev/null
+++ b/queue-6.1/mptcp-ensure-listener-is-unhashed-before-updating-the-sk-status.patch
@@ -0,0 +1,98 @@
+From 57fc0f1ceaa4016354cf6f88533e20b56190e41a Mon Sep 17 00:00:00 2001
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Tue, 20 Jun 2023 18:24:23 +0200
+Subject: mptcp: ensure listener is unhashed before updating the sk status
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit 57fc0f1ceaa4016354cf6f88533e20b56190e41a upstream.
+
+The MPTCP protocol accesses the listener subflow in a lockless
+manner in a couple of places (poll, diag). That works only if
+the msk itself leaves the listener status only after the
+subflow itself has been closed/disconnected. Otherwise we risk
+deadlock in diag, as reported by Christoph.
+
+Address the issue by ensuring that the first subflow (the listener
+one) is always disconnected before updating the msk socket status.
+
+Reported-by: Christoph Paasch <cpaasch@apple.com>
+Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/407
+Fixes: b29fcfb54cd7 ("mptcp: full disconnect implementation")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/pm_netlink.c |    1 +
+ net/mptcp/protocol.c   |   26 ++++++++++++++++++++------
+ 2 files changed, 21 insertions(+), 6 deletions(-)
+
+--- a/net/mptcp/pm_netlink.c
++++ b/net/mptcp/pm_netlink.c
+@@ -1039,6 +1039,7 @@ static int mptcp_pm_nl_create_listen_soc
+ 		return err;
+ 	}
+ 
++	inet_sk_state_store(newsk, TCP_LISTEN);
+ 	err = kernel_listen(ssock, backlog);
+ 	if (err) {
+ 		pr_warn("kernel_listen error, err=%d", err);
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -2400,12 +2400,6 @@ static void __mptcp_close_ssk(struct soc
+ 		kfree_rcu(subflow, rcu);
+ 	} else {
+ 		/* otherwise tcp will dispose of the ssk and subflow ctx */
+-		if (ssk->sk_state == TCP_LISTEN) {
+-			tcp_set_state(ssk, TCP_CLOSE);
+-			mptcp_subflow_queue_clean(sk, ssk);
+-			inet_csk_listen_stop(ssk);
+-		}
+-
+ 		__tcp_close(ssk, 0);
+ 
+ 		/* close acquired an extra ref */
+@@ -2939,6 +2933,24 @@ static __poll_t mptcp_check_readable(str
+ 	return EPOLLIN | EPOLLRDNORM;
+ }
+ 
++static void mptcp_check_listen_stop(struct sock *sk)
++{
++	struct sock *ssk;
++
++	if (inet_sk_state_load(sk) != TCP_LISTEN)
++		return;
++
++	ssk = mptcp_sk(sk)->first;
++	if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN))
++		return;
++
++	lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
++	mptcp_subflow_queue_clean(sk, ssk);
++	inet_csk_listen_stop(ssk);
++	tcp_set_state(ssk, TCP_CLOSE);
++	release_sock(ssk);
++}
++
+ bool __mptcp_close(struct sock *sk, long timeout)
+ {
+ 	struct mptcp_subflow_context *subflow;
+@@ -2949,6 +2961,7 @@ bool __mptcp_close(struct sock *sk, long
+ 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
+ 
+ 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
++		mptcp_check_listen_stop(sk);
+ 		inet_sk_state_store(sk, TCP_CLOSE);
+ 		goto cleanup;
+ 	}
+@@ -3062,6 +3075,7 @@ static int mptcp_disconnect(struct sock
+ 	if (msk->fastopening)
+ 		return -EBUSY;
+ 
++	mptcp_check_listen_stop(sk);
+ 	inet_sk_state_store(sk, TCP_CLOSE);
+ 
+ 	mptcp_stop_timer(sk);
diff --git a/queue-6.1/series b/queue-6.1/series
index eb7f63bed40..64f8a0aa3ae 100644
--- a/queue-6.1/series
+++ b/queue-6.1/series
@@ -1,2 +1,5 @@
 mm-mmap-fix-error-path-in-do_vmi_align_munmap.patch
 mm-mmap-fix-error-return-in-do_vmi_align_munmap.patch
+mptcp-ensure-listener-is-unhashed-before-updating-the-sk-status.patch
+mm-hwpoison-try-to-recover-from-copy-on-write-faults.patch
+mm-hwpoison-when-copy-on-write-hits-poison-take-page-offline.patch