From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 12 May 2025 09:39:35 +0000 (+0200)
Subject: 6.12-stable patches
X-Git-Tag: v5.15.183~53
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=6bd3f7d8e9a05e24ad636f6fa4627ba607ae0ec8;p=thirdparty%2Fkernel%2Fstable-queue.git

6.12-stable patches

added patches:
	drm-amd-display-shift-dmub-aux-reply-command-if-necessary.patch
	kvm-svm-forcibly-leave-smm-mode-on-shutdown-interception.patch
	mm-fix-folio_pte_batch-on-xen-pv.patch
	mm-huge_memory-fix-dereferencing-invalid-pmd-migration-entry.patch
	mm-userfaultfd-fix-uninitialized-output-field-for-eagain-race.patch
	mm-vmalloc-support-more-granular-vrealloc-sizing.patch
	selftests-mm-compaction_test-support-platform-with-huge-mount-of-memory.patch
	selftests-mm-fix-a-build-failure-on-powerpc.patch
	x86-mm-eliminate-window-where-tlb-flushes-may-be-inadvertently-skipped.patch
---

diff --git a/queue-6.12/drm-amd-display-shift-dmub-aux-reply-command-if-necessary.patch b/queue-6.12/drm-amd-display-shift-dmub-aux-reply-command-if-necessary.patch
new file mode 100644
index 0000000000..002f825e89
--- /dev/null
+++ b/queue-6.12/drm-amd-display-shift-dmub-aux-reply-command-if-necessary.patch
@@ -0,0 +1,47 @@
+From 5a3846648c0523fd850b7f0aec78c0139453ab8b Mon Sep 17 00:00:00 2001
+From: Wayne Lin <Wayne.Lin@amd.com>
+Date: Fri, 18 Apr 2025 16:31:59 +0800
+Subject: drm/amd/display: Shift DMUB AUX reply command if necessary
+
+From: Wayne Lin <Wayne.Lin@amd.com>
+
+commit 5a3846648c0523fd850b7f0aec78c0139453ab8b upstream.
+
+[Why]
+The defined value of the DMUB AUX reply command field was updated, but the
+DM receiving side was not adjusted accordingly.
+
+[How]
+Check the received reply command value to see if it's the updated version
+or not. Adjust it if necessary.
+
+Fixes: ead08b95fa50 ("drm/amd/display: Fix race condition in DPIA AUX transfer")
+Cc: Mario Limonciello <mario.limonciello@amd.com>
+Cc: Alex Deucher <alexander.deucher@amd.com>
+Reviewed-by: Ray Wu <ray.wu@amd.com>
+Signed-off-by: Wayne Lin <Wayne.Lin@amd.com>
+Signed-off-by: Ray Wu <ray.wu@amd.com>
+Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+(cherry picked from commit d5c9ade755a9afa210840708a12a8f44c0d532f4)
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |    5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+@@ -12535,8 +12535,11 @@ int amdgpu_dm_process_dmub_aux_transfer_
+ 		goto out;
+ 	}
+ 
++	payload->reply[0] = adev->dm.dmub_notify->aux_reply.command & 0xF;
++	if (adev->dm.dmub_notify->aux_reply.command & 0xF0)
++		/* The reply is stored in the top nibble of the command. */
++		payload->reply[0] = (adev->dm.dmub_notify->aux_reply.command >> 4) & 0xF;
+ 
+-	payload->reply[0] = adev->dm.dmub_notify->aux_reply.command;
+ 	if (!payload->write && p_notify->aux_reply.length &&
+ 			(payload->reply[0] == AUX_TRANSACTION_REPLY_AUX_ACK)) {
+ 
diff --git a/queue-6.12/kvm-svm-forcibly-leave-smm-mode-on-shutdown-interception.patch b/queue-6.12/kvm-svm-forcibly-leave-smm-mode-on-shutdown-interception.patch
new file mode 100644
index 0000000000..9db7ddfc27
--- /dev/null
+++ b/queue-6.12/kvm-svm-forcibly-leave-smm-mode-on-shutdown-interception.patch
@@ -0,0 +1,97 @@
+From a2620f8932fa9fdabc3d78ed6efb004ca409019f Mon Sep 17 00:00:00 2001
+From: Mikhail Lobanov <m.lobanov@rosa.ru>
+Date: Mon, 14 Apr 2025 20:12:06 +0300
+Subject: KVM: SVM: Forcibly leave SMM mode on SHUTDOWN interception
+
+From: Mikhail Lobanov <m.lobanov@rosa.ru>
+
+commit a2620f8932fa9fdabc3d78ed6efb004ca409019f upstream.
+
+Previously, commit ed129ec9057f ("KVM: x86: forcibly leave nested mode
+on vCPU reset") addressed an issue where a triple fault occurring in
+nested mode could lead to use-after-free scenarios. However, the commit
+did not handle the analogous situation for System Management Mode (SMM).
+
+This omission results in triggering a WARN when KVM forces a vCPU INIT
+after SHUTDOWN interception while the vCPU is in SMM. This situation was
+reproduced using Syzkaller by:
+
+  1) Creating a KVM VM and vCPU
+  2) Sending a KVM_SMI ioctl to explicitly enter SMM
+  3) Executing invalid instructions causing consecutive exceptions and
+     eventually a triple fault
+
+The issue manifests as follows:
+
+  WARNING: CPU: 0 PID: 25506 at arch/x86/kvm/x86.c:12112
+  kvm_vcpu_reset+0x1d2/0x1530 arch/x86/kvm/x86.c:12112
+  Modules linked in:
+  CPU: 0 PID: 25506 Comm: syz-executor.0 Not tainted
+  6.1.130-syzkaller-00157-g164fe5dde9b6 #0
+  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
+  BIOS 1.12.0-1 04/01/2014
+  RIP: 0010:kvm_vcpu_reset+0x1d2/0x1530 arch/x86/kvm/x86.c:12112
+  Call Trace:
+   <TASK>
+   shutdown_interception+0x66/0xb0 arch/x86/kvm/svm/svm.c:2136
+   svm_invoke_exit_handler+0x110/0x530 arch/x86/kvm/svm/svm.c:3395
+   svm_handle_exit+0x424/0x920 arch/x86/kvm/svm/svm.c:3457
+   vcpu_enter_guest arch/x86/kvm/x86.c:10959 [inline]
+   vcpu_run+0x2c43/0x5a90 arch/x86/kvm/x86.c:11062
+   kvm_arch_vcpu_ioctl_run+0x50f/0x1cf0 arch/x86/kvm/x86.c:11283
+   kvm_vcpu_ioctl+0x570/0xf00 arch/x86/kvm/../../../virt/kvm/kvm_main.c:4122
+   vfs_ioctl fs/ioctl.c:51 [inline]
+   __do_sys_ioctl fs/ioctl.c:870 [inline]
+   __se_sys_ioctl fs/ioctl.c:856 [inline]
+   __x64_sys_ioctl+0x19a/0x210 fs/ioctl.c:856
+   do_syscall_x64 arch/x86/entry/common.c:51 [inline]
+   do_syscall_64+0x35/0x80 arch/x86/entry/common.c:81
+   entry_SYSCALL_64_after_hwframe+0x6e/0xd8
+
+Architecturally, INIT is blocked when the CPU is in SMM, hence KVM's WARN()
+in kvm_vcpu_reset() to guard against KVM bugs, e.g. to detect improper
+emulation of INIT.  SHUTDOWN on SVM is a weird edge case where KVM needs to
+do _something_ sane with the VMCB, since it's technically undefined, and
+INIT is the least awful choice given KVM's ABI.
+
+So, double down on stuffing INIT on SHUTDOWN, and force the vCPU out of
+SMM to avoid any weirdness (and the WARN).
+
+Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
+
+Fixes: ed129ec9057f ("KVM: x86: forcibly leave nested mode on vCPU reset")
+Cc: stable@vger.kernel.org
+Suggested-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Mikhail Lobanov <m.lobanov@rosa.ru>
+Link: https://lore.kernel.org/r/20250414171207.155121-1-m.lobanov@rosa.ru
+[sean: massage changelog, make it clear this isn't architectural behavior]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/smm.c     |    1 +
+ arch/x86/kvm/svm/svm.c |    4 ++++
+ 2 files changed, 5 insertions(+)
+
+--- a/arch/x86/kvm/smm.c
++++ b/arch/x86/kvm/smm.c
+@@ -131,6 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vc
+ 
+ 	kvm_mmu_reset_context(vcpu);
+ }
++EXPORT_SYMBOL_GPL(kvm_smm_changed);
+ 
+ void process_smi(struct kvm_vcpu *vcpu)
+ {
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -2222,6 +2222,10 @@ static int shutdown_interception(struct
+ 	 */
+ 	if (!sev_es_guest(vcpu->kvm)) {
+ 		clear_page(svm->vmcb);
++#ifdef CONFIG_KVM_SMM
++		if (is_smm(vcpu))
++			kvm_smm_changed(vcpu, false);
++#endif
+ 		kvm_vcpu_reset(vcpu, true);
+ 	}
+ 
diff --git a/queue-6.12/mm-fix-folio_pte_batch-on-xen-pv.patch b/queue-6.12/mm-fix-folio_pte_batch-on-xen-pv.patch
new file mode 100644
index 0000000000..e6862e3b53
--- /dev/null
+++ b/queue-6.12/mm-fix-folio_pte_batch-on-xen-pv.patch
@@ -0,0 +1,153 @@
+From 7b08b74f3d99f6b801250683c751d391128799ec Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Petr=20Van=C4=9Bk?= <arkamar@atlas.cz>
+Date: Fri, 2 May 2025 23:50:19 +0200
+Subject: mm: fix folio_pte_batch() on XEN PV
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Petr Vaněk <arkamar@atlas.cz>
+
+commit 7b08b74f3d99f6b801250683c751d391128799ec upstream.
+
+On XEN PV, folio_pte_batch() can incorrectly batch beyond the end of a
+folio due to a corner case in pte_advance_pfn().  Specifically, when the
+PFN following the folio maps to an invalidated MFN,
+
+	expected_pte = pte_advance_pfn(expected_pte, nr);
+
+produces a pte_none().  If the actual next PTE in memory is also
+pte_none(), the pte_same() succeeds,
+
+	if (!pte_same(pte, expected_pte))
+		break;
+
+the loop is not broken, and batching continues into unrelated memory.
+
+For example, with a 4-page folio, the PTE layout might look like this:
+
+[   53.465673] [ T2552] folio_pte_batch: printing PTE values at addr=0x7f1ac9dc5000
+[   53.465674] [ T2552]   PTE[453] = 000000010085c125
+[   53.465679] [ T2552]   PTE[454] = 000000010085d125
+[   53.465682] [ T2552]   PTE[455] = 000000010085e125
+[   53.465684] [ T2552]   PTE[456] = 000000010085f125
+[   53.465686] [ T2552]   PTE[457] = 0000000000000000 <-- not present
+[   53.465689] [ T2552]   PTE[458] = 0000000101da7125
+
+pte_advance_pfn(PTE[456]) returns a pte_none() due to invalid PFN->MFN
+mapping.  The next actual PTE (PTE[457]) is also pte_none(), so the loop
+continues and includes PTE[457] in the batch, resulting in 5 batched
+entries for a 4-page folio.  This triggers the following warning:
+
+[   53.465751] [ T2552] page: refcount:85 mapcount:20 mapping:ffff88813ff4f6a8 index:0x110 pfn:0x10085c
+[   53.465754] [ T2552] head: order:2 mapcount:80 entire_mapcount:0 nr_pages_mapped:4 pincount:0
+[   53.465756] [ T2552] memcg:ffff888003573000
+[   53.465758] [ T2552] aops:0xffffffff8226fd20 ino:82467c dentry name(?):"libc.so.6"
+[   53.465761] [ T2552] flags: 0x2000000000416c(referenced|uptodate|lru|active|private|head|node=0|zone=2)
+[   53.465764] [ T2552] raw: 002000000000416c ffffea0004021f08 ffffea0004021908 ffff88813ff4f6a8
+[   53.465767] [ T2552] raw: 0000000000000110 ffff888133d8bd40 0000005500000013 ffff888003573000
+[   53.465768] [ T2552] head: 002000000000416c ffffea0004021f08 ffffea0004021908 ffff88813ff4f6a8
+[   53.465770] [ T2552] head: 0000000000000110 ffff888133d8bd40 0000005500000013 ffff888003573000
+[   53.465772] [ T2552] head: 0020000000000202 ffffea0004021701 000000040000004f 00000000ffffffff
+[   53.465774] [ T2552] head: 0000000300000003 8000000300000002 0000000000000013 0000000000000004
+[   53.465775] [ T2552] page dumped because: VM_WARN_ON_FOLIO((_Generic((page + nr_pages - 1), const struct page *: (const struct folio *)_compound_head(page + nr_pages - 1), struct page *: (struct folio *)_compound_head(page + nr_pages - 1))) != folio)
+
+Original code works as expected everywhere, except on XEN PV, where
+pte_advance_pfn() can yield a pte_none() after balloon inflation due to
+MFNs invalidation.  In XEN, pte_advance_pfn() ends up calling
+__pte()->xen_make_pte()->pte_pfn_to_mfn(), which returns pte_none() when
+mfn == INVALID_P2M_ENTRY.
+
+The pte_pfn_to_mfn() documents that nastiness:
+
+	If there's no mfn for the pfn, then just create an
+	empty non-present pte.  Unfortunately this loses
+	information about the original pfn, so
+	pte_mfn_to_pfn is asymmetric.
+
+While such hacks should certainly be removed, we can do better in
+folio_pte_batch() and simply check ahead of time how many PTEs we can
+possibly batch in our folio.
+
+This way, we can not only fix the issue but cleanup the code: removing the
+pte_pfn() check inside the loop body and avoiding end_ptr comparison +
+arithmetic.
+
+Link: https://lkml.kernel.org/r/20250502215019.822-2-arkamar@atlas.cz
+Fixes: f8d937761d65 ("mm/memory: optimize fork() with PTE-mapped THP")
+Co-developed-by: David Hildenbrand <david@redhat.com>
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Signed-off-by: Petr Vaněk <arkamar@atlas.cz>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/internal.h |   27 +++++++++++----------------
+ 1 file changed, 11 insertions(+), 16 deletions(-)
+
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -204,11 +204,9 @@ static inline int folio_pte_batch(struct
+ 		pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
+ 		bool *any_writable, bool *any_young, bool *any_dirty)
+ {
+-	unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
+-	const pte_t *end_ptep = start_ptep + max_nr;
+ 	pte_t expected_pte, *ptep;
+ 	bool writable, young, dirty;
+-	int nr;
++	int nr, cur_nr;
+ 
+ 	if (any_writable)
+ 		*any_writable = false;
+@@ -221,11 +219,15 @@ static inline int folio_pte_batch(struct
+ 	VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
+ 	VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
+ 
++	/* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
++	max_nr = min_t(unsigned long, max_nr,
++		       folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
++
+ 	nr = pte_batch_hint(start_ptep, pte);
+ 	expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
+ 	ptep = start_ptep + nr;
+ 
+-	while (ptep < end_ptep) {
++	while (nr < max_nr) {
+ 		pte = ptep_get(ptep);
+ 		if (any_writable)
+ 			writable = !!pte_write(pte);
+@@ -238,14 +240,6 @@ static inline int folio_pte_batch(struct
+ 		if (!pte_same(pte, expected_pte))
+ 			break;
+ 
+-		/*
+-		 * Stop immediately once we reached the end of the folio. In
+-		 * corner cases the next PFN might fall into a different
+-		 * folio.
+-		 */
+-		if (pte_pfn(pte) >= folio_end_pfn)
+-			break;
+-
+ 		if (any_writable)
+ 			*any_writable |= writable;
+ 		if (any_young)
+@@ -253,12 +247,13 @@ static inline int folio_pte_batch(struct
+ 		if (any_dirty)
+ 			*any_dirty |= dirty;
+ 
+-		nr = pte_batch_hint(ptep, pte);
+-		expected_pte = pte_advance_pfn(expected_pte, nr);
+-		ptep += nr;
++		cur_nr = pte_batch_hint(ptep, pte);
++		expected_pte = pte_advance_pfn(expected_pte, cur_nr);
++		ptep += cur_nr;
++		nr += cur_nr;
+ 	}
+ 
+-	return min(ptep - start_ptep, max_nr);
++	return min(nr, max_nr);
+ }
+ 
+ /**
diff --git a/queue-6.12/mm-huge_memory-fix-dereferencing-invalid-pmd-migration-entry.patch b/queue-6.12/mm-huge_memory-fix-dereferencing-invalid-pmd-migration-entry.patch
new file mode 100644
index 0000000000..21327f9636
--- /dev/null
+++ b/queue-6.12/mm-huge_memory-fix-dereferencing-invalid-pmd-migration-entry.patch
@@ -0,0 +1,90 @@
+From be6e843fc51a584672dfd9c4a6a24c8cb81d5fb7 Mon Sep 17 00:00:00 2001
+From: Gavin Guo <gavinguo@igalia.com>
+Date: Mon, 21 Apr 2025 19:35:36 +0800
+Subject: mm/huge_memory: fix dereferencing invalid pmd migration entry
+
+From: Gavin Guo <gavinguo@igalia.com>
+
+commit be6e843fc51a584672dfd9c4a6a24c8cb81d5fb7 upstream.
+
+When migrating a THP, concurrent access to the PMD migration entry during
+a deferred split scan can lead to an invalid address access, as
+illustrated below.  To prevent this invalid access, it is necessary to
+check the PMD migration entry and return early.  In this context, there is
+no need to use pmd_to_swp_entry and pfn_swap_entry_to_page to verify the
+equality of the target folio.  Since the PMD migration entry is locked, it
+cannot serve as the target.
+
+Mailing list discussion and explanation from Hugh Dickins: "An anon_vma
+lookup points to a location which may contain the folio of interest, but
+might instead contain another folio: and weeding out those other folios is
+precisely what the "folio != pmd_folio((*pmd)" check (and the "risk of
+replacing the wrong folio" comment a few lines above it) is for."
+
+BUG: unable to handle page fault for address: ffffea60001db008
+CPU: 0 UID: 0 PID: 2199114 Comm: tee Not tainted 6.14.0+ #4 NONE
+Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
+RIP: 0010:split_huge_pmd_locked+0x3b5/0x2b60
+Call Trace:
+<TASK>
+try_to_migrate_one+0x28c/0x3730
+rmap_walk_anon+0x4f6/0x770
+unmap_folio+0x196/0x1f0
+split_huge_page_to_list_to_order+0x9f6/0x1560
+deferred_split_scan+0xac5/0x12a0
+shrinker_debugfs_scan_write+0x376/0x470
+full_proxy_write+0x15c/0x220
+vfs_write+0x2fc/0xcb0
+ksys_write+0x146/0x250
+do_syscall_64+0x6a/0x120
+entry_SYSCALL_64_after_hwframe+0x76/0x7e
+
+The bug is found by syzkaller on an internal kernel, then confirmed on
+upstream.
+
+Link: https://lkml.kernel.org/r/20250421113536.3682201-1-gavinguo@igalia.com
+Link: https://lore.kernel.org/all/20250414072737.1698513-1-gavinguo@igalia.com/
+Link: https://lore.kernel.org/all/20250418085802.2973519-1-gavinguo@igalia.com/
+Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
+Signed-off-by: Gavin Guo <gavinguo@igalia.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Hugh Dickins <hughd@google.com>
+Acked-by: Zi Yan <ziy@nvidia.com>
+Reviewed-by: Gavin Shan <gshan@redhat.com>
+Cc: Florent Revest <revest@google.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Miaohe Lin <linmiaohe@huawei.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/huge_memory.c |   11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -2879,6 +2879,8 @@ static void __split_huge_pmd_locked(stru
+ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
+ 			   pmd_t *pmd, bool freeze, struct folio *folio)
+ {
++	bool pmd_migration = is_pmd_migration_entry(*pmd);
++
+ 	VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
+ 	VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
+ 	VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
+@@ -2889,9 +2891,12 @@ void split_huge_pmd_locked(struct vm_are
+ 	 * require a folio to check the PMD against. Otherwise, there
+ 	 * is a risk of replacing the wrong folio.
+ 	 */
+-	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
+-	    is_pmd_migration_entry(*pmd)) {
+-		if (folio && folio != pmd_folio(*pmd))
++	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || pmd_migration) {
++		/*
++		 * Do not apply pmd_folio() to a migration entry; and folio lock
++		 * guarantees that it must be of the wrong folio anyway.
++		 */
++		if (folio && (pmd_migration || folio != pmd_folio(*pmd)))
+ 			return;
+ 		__split_huge_pmd_locked(vma, pmd, address, freeze);
+ 	}
diff --git a/queue-6.12/mm-userfaultfd-fix-uninitialized-output-field-for-eagain-race.patch b/queue-6.12/mm-userfaultfd-fix-uninitialized-output-field-for-eagain-race.patch
new file mode 100644
index 0000000000..77642ff904
--- /dev/null
+++ b/queue-6.12/mm-userfaultfd-fix-uninitialized-output-field-for-eagain-race.patch
@@ -0,0 +1,128 @@
+From 95567729173e62e0e60a1f8ad9eb2e1320a8ccac Mon Sep 17 00:00:00 2001
+From: Peter Xu <peterx@redhat.com>
+Date: Thu, 24 Apr 2025 17:57:28 -0400
+Subject: mm/userfaultfd: fix uninitialized output field for -EAGAIN race
+
+From: Peter Xu <peterx@redhat.com>
+
+commit 95567729173e62e0e60a1f8ad9eb2e1320a8ccac upstream.
+
+While discussing some userfaultfd relevant issues recently, Andrea noticed
+a potential ABI breakage with -EAGAIN on almost all userfaultfd ioctl()s.
+
+Quote from Andrea, explaining how -EAGAIN was processed, and how this
+should fix it (taking example of UFFDIO_COPY ioctl):
+
+  The "mmap_changing" and "stale pmd" conditions are already reported as
+  -EAGAIN written in the copy field, this does not change it. This change
+  removes the subnormal case that left copy.copy uninitialized and required
+  apps to explicitly set the copy field to get deterministic
+  behavior (which is a requirement contrary to the documentation in both
+  the manpage and source code). In turn there's no alteration to backwards
+  compatibility as result of this change because userland will find the
+  copy field consistently set to -EAGAIN, and not anymore sometime -EAGAIN
+  and sometime uninitialized.
+
+  Even then the change only can make a difference to non cooperative users
+  of userfaultfd, so when UFFD_FEATURE_EVENT_* is enabled, which is not
+  true for the vast majority of apps using userfaultfd or this unintended
+  uninitialized field may have been noticed sooner.
+
+Meanwhile, since this bug has existed for years, it also affects almost all
+ioctl()s that were introduced later.  Besides UFFDIO_ZEROPAGE, these also
+get affected in the same way:
+
+  - UFFDIO_CONTINUE
+  - UFFDIO_POISON
+  - UFFDIO_MOVE
+
+This patch should have fixed all of them.
+
+Link: https://lkml.kernel.org/r/20250424215729.194656-2-peterx@redhat.com
+Fixes: df2cc96e7701 ("userfaultfd: prevent non-cooperative events vs mcopy_atomic races")
+Fixes: f619147104c8 ("userfaultfd: add UFFDIO_CONTINUE ioctl")
+Fixes: fc71884a5f59 ("mm: userfaultfd: add new UFFDIO_POISON ioctl")
+Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI")
+Signed-off-by: Peter Xu <peterx@redhat.com>
+Reported-by: Andrea Arcangeli <aarcange@redhat.com>
+Suggested-by: Andrea Arcangeli <aarcange@redhat.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Cc: Mike Rapoport <rppt@kernel.org>
+Cc: Axel Rasmussen <axelrasmussen@google.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/userfaultfd.c |   28 ++++++++++++++++++++++------
+ 1 file changed, 22 insertions(+), 6 deletions(-)
+
+--- a/fs/userfaultfd.c
++++ b/fs/userfaultfd.c
+@@ -1585,8 +1585,11 @@ static int userfaultfd_copy(struct userf
+ 	user_uffdio_copy = (struct uffdio_copy __user *) arg;
+ 
+ 	ret = -EAGAIN;
+-	if (atomic_read(&ctx->mmap_changing))
++	if (unlikely(atomic_read(&ctx->mmap_changing))) {
++		if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
++			return -EFAULT;
+ 		goto out;
++	}
+ 
+ 	ret = -EFAULT;
+ 	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
+@@ -1641,8 +1644,11 @@ static int userfaultfd_zeropage(struct u
+ 	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
+ 
+ 	ret = -EAGAIN;
+-	if (atomic_read(&ctx->mmap_changing))
++	if (unlikely(atomic_read(&ctx->mmap_changing))) {
++		if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
++			return -EFAULT;
+ 		goto out;
++	}
+ 
+ 	ret = -EFAULT;
+ 	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
+@@ -1744,8 +1750,11 @@ static int userfaultfd_continue(struct u
+ 	user_uffdio_continue = (struct uffdio_continue __user *)arg;
+ 
+ 	ret = -EAGAIN;
+-	if (atomic_read(&ctx->mmap_changing))
++	if (unlikely(atomic_read(&ctx->mmap_changing))) {
++		if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
++			return -EFAULT;
+ 		goto out;
++	}
+ 
+ 	ret = -EFAULT;
+ 	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
+@@ -1801,8 +1810,11 @@ static inline int userfaultfd_poison(str
+ 	user_uffdio_poison = (struct uffdio_poison __user *)arg;
+ 
+ 	ret = -EAGAIN;
+-	if (atomic_read(&ctx->mmap_changing))
++	if (unlikely(atomic_read(&ctx->mmap_changing))) {
++		if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
++			return -EFAULT;
+ 		goto out;
++	}
+ 
+ 	ret = -EFAULT;
+ 	if (copy_from_user(&uffdio_poison, user_uffdio_poison,
+@@ -1870,8 +1882,12 @@ static int userfaultfd_move(struct userf
+ 
+ 	user_uffdio_move = (struct uffdio_move __user *) arg;
+ 
+-	if (atomic_read(&ctx->mmap_changing))
+-		return -EAGAIN;
++	ret = -EAGAIN;
++	if (unlikely(atomic_read(&ctx->mmap_changing))) {
++		if (unlikely(put_user(ret, &user_uffdio_move->move)))
++			return -EFAULT;
++		goto out;
++	}
+ 
+ 	if (copy_from_user(&uffdio_move, user_uffdio_move,
+ 			   /* don't copy "move" last field */
diff --git a/queue-6.12/mm-vmalloc-support-more-granular-vrealloc-sizing.patch b/queue-6.12/mm-vmalloc-support-more-granular-vrealloc-sizing.patch
new file mode 100644
index 0000000000..276db1d7b4
--- /dev/null
+++ b/queue-6.12/mm-vmalloc-support-more-granular-vrealloc-sizing.patch
@@ -0,0 +1,122 @@
+From a0309faf1cb0622cac7c820150b7abf2024acff5 Mon Sep 17 00:00:00 2001
+From: Kees Cook <kees@kernel.org>
+Date: Fri, 25 Apr 2025 17:11:07 -0700
+Subject: mm: vmalloc: support more granular vrealloc() sizing
+
+From: Kees Cook <kees@kernel.org>
+
+commit a0309faf1cb0622cac7c820150b7abf2024acff5 upstream.
+
+Introduce struct vm_struct::requested_size so that the requested
+(re)allocation size is retained separately from the allocated area size.
+This means that KASAN will correctly poison the correct spans of requested
+bytes.  This also means we can support growing the usable portion of an
+allocation that can already be supported by the existing area's existing
+allocation.
+
+Link: https://lkml.kernel.org/r/20250426001105.it.679-kees@kernel.org
+Fixes: 3ddc2fefe6f3 ("mm: vmalloc: implement vrealloc()")
+Signed-off-by: Kees Cook <kees@kernel.org>
+Reported-by: Erhard Furtner <erhard_f@mailbox.org>
+Closes: https://lore.kernel.org/all/20250408192503.6149a816@outsider.home/
+Reviewed-by: Danilo Krummrich <dakr@kernel.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/vmalloc.h |    1 +
+ mm/vmalloc.c            |   31 ++++++++++++++++++++++++-------
+ 2 files changed, 25 insertions(+), 7 deletions(-)
+
+--- a/include/linux/vmalloc.h
++++ b/include/linux/vmalloc.h
+@@ -61,6 +61,7 @@ struct vm_struct {
+ 	unsigned int		nr_pages;
+ 	phys_addr_t		phys_addr;
+ 	const void		*caller;
++	unsigned long		requested_size;
+ };
+ 
+ struct vmap_area {
+--- a/mm/vmalloc.c
++++ b/mm/vmalloc.c
+@@ -1940,7 +1940,7 @@ static inline void setup_vmalloc_vm(stru
+ {
+ 	vm->flags = flags;
+ 	vm->addr = (void *)va->va_start;
+-	vm->size = va_size(va);
++	vm->size = vm->requested_size = va_size(va);
+ 	vm->caller = caller;
+ 	va->vm = vm;
+ }
+@@ -3128,6 +3128,7 @@ static struct vm_struct *__get_vm_area_n
+ 
+ 	area->flags = flags;
+ 	area->caller = caller;
++	area->requested_size = requested_size;
+ 
+ 	va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
+ 	if (IS_ERR(va)) {
+@@ -4067,6 +4068,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof);
+  */
+ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
+ {
++	struct vm_struct *vm = NULL;
++	size_t alloced_size = 0;
+ 	size_t old_size = 0;
+ 	void *n;
+ 
+@@ -4076,15 +4079,17 @@ void *vrealloc_noprof(const void *p, siz
+ 	}
+ 
+ 	if (p) {
+-		struct vm_struct *vm;
+-
+ 		vm = find_vm_area(p);
+ 		if (unlikely(!vm)) {
+ 			WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p);
+ 			return NULL;
+ 		}
+ 
+-		old_size = get_vm_area_size(vm);
++		alloced_size = get_vm_area_size(vm);
++		old_size = vm->requested_size;
++		if (WARN(alloced_size < old_size,
++			 "vrealloc() has mismatched area vs requested sizes (%p)\n", p))
++			return NULL;
+ 	}
+ 
+ 	/*
+@@ -4092,14 +4097,26 @@ void *vrealloc_noprof(const void *p, siz
+ 	 * would be a good heuristic for when to shrink the vm_area?
+ 	 */
+ 	if (size <= old_size) {
+-		/* Zero out spare memory. */
+-		if (want_init_on_alloc(flags))
++		/* Zero out "freed" memory. */
++		if (want_init_on_free())
+ 			memset((void *)p + size, 0, old_size - size);
++		vm->requested_size = size;
+ 		kasan_poison_vmalloc(p + size, old_size - size);
+-		kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL);
+ 		return (void *)p;
+ 	}
+ 
++	/*
++	 * We already have the bytes available in the allocation; use them.
++	 */
++	if (size <= alloced_size) {
++		kasan_unpoison_vmalloc(p + old_size, size - old_size,
++				       KASAN_VMALLOC_PROT_NORMAL);
++		/* Zero out "alloced" memory. */
++		if (want_init_on_alloc(flags))
++			memset((void *)p + old_size, 0, size - old_size);
++		vm->requested_size = size;
++	}
++
+ 	/* TODO: Grow the vm_area, i.e. allocate and map additional pages. */
+ 	n = __vmalloc_noprof(size, flags);
+ 	if (!n)
diff --git a/queue-6.12/selftests-mm-compaction_test-support-platform-with-huge-mount-of-memory.patch b/queue-6.12/selftests-mm-compaction_test-support-platform-with-huge-mount-of-memory.patch
new file mode 100644
index 0000000000..cb7881a499
--- /dev/null
+++ b/queue-6.12/selftests-mm-compaction_test-support-platform-with-huge-mount-of-memory.patch
@@ -0,0 +1,72 @@
+From ab00ddd802f80e31fc9639c652d736fe3913feae Mon Sep 17 00:00:00 2001
+From: Feng Tang <feng.tang@linux.alibaba.com>
+Date: Wed, 23 Apr 2025 18:36:45 +0800
+Subject: selftests/mm: compaction_test: support platform with huge mount of memory
+
+From: Feng Tang <feng.tang@linux.alibaba.com>
+
+commit ab00ddd802f80e31fc9639c652d736fe3913feae upstream.
+
+When running mm selftest to verify mm patches, 'compaction_test' case
+failed on an x86 server with 1TB memory.  And the root cause is that it
+has more free memory than the test supports.
+
+The test case tries to allocate 100000 huge pages, which is about 200 GB
+for that x86 server, and when it succeeds, it expects it's larger than 1/3
+of 80% of the free memory in system.  This logic only works for platform
+with 750 GB ( 200 / (1/3) / 80% ) or less free memory, and may raise false
+alarm for others.
+
+Fix it by changing the fixed page number to self-adjustable number
+according to the real number of free memory.
+
+Link: https://lkml.kernel.org/r/20250423103645.2758-1-feng.tang@linux.alibaba.com
+Fixes: bd67d5c15cc1 ("Test compaction of mlocked memory")
+Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
+Acked-by: Dev Jain <dev.jain@arm.com>
+Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Tested-by: Baolin Wang <baolin.wang@inux.alibaba.com>
+Cc: Shuah Khan <shuah@kernel.org>
+Cc: Sri Jayaramappa <sjayaram@akamai.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/mm/compaction_test.c |   19 ++++++++++++++-----
+ 1 file changed, 14 insertions(+), 5 deletions(-)
+
+--- a/tools/testing/selftests/mm/compaction_test.c
++++ b/tools/testing/selftests/mm/compaction_test.c
+@@ -90,6 +90,8 @@ int check_compaction(unsigned long mem_f
+ 	int compaction_index = 0;
+ 	char nr_hugepages[20] = {0};
+ 	char init_nr_hugepages[24] = {0};
++	char target_nr_hugepages[24] = {0};
++	int slen;
+ 
+ 	snprintf(init_nr_hugepages, sizeof(init_nr_hugepages),
+ 		 "%lu", initial_nr_hugepages);
+@@ -106,11 +108,18 @@ int check_compaction(unsigned long mem_f
+ 		goto out;
+ 	}
+ 
+-	/* Request a large number of huge pages. The Kernel will allocate
+-	   as much as it can */
+-	if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) {
+-		ksft_print_msg("Failed to write 100000 to /proc/sys/vm/nr_hugepages: %s\n",
+-			       strerror(errno));
++	/*
++	 * Request huge pages for about half of the free memory. The Kernel
++	 * will allocate as much as it can, and we expect it will get at least 1/3
++	 */
++	nr_hugepages_ul = mem_free / hugepage_size / 2;
++	snprintf(target_nr_hugepages, sizeof(target_nr_hugepages),
++		 "%lu", nr_hugepages_ul);
++
++	slen = strlen(target_nr_hugepages);
++	if (write(fd, target_nr_hugepages, slen) != slen) {
++		ksft_print_msg("Failed to write %lu to /proc/sys/vm/nr_hugepages: %s\n",
++			       nr_hugepages_ul, strerror(errno));
+ 		goto close_fd;
+ 	}
+ 
diff --git a/queue-6.12/selftests-mm-fix-a-build-failure-on-powerpc.patch b/queue-6.12/selftests-mm-fix-a-build-failure-on-powerpc.patch
new file mode 100644
index 0000000000..860ac47385
--- /dev/null
+++ b/queue-6.12/selftests-mm-fix-a-build-failure-on-powerpc.patch
@@ -0,0 +1,59 @@
+From 8cf6ecb18baac867585fe1cba5dde6dbf3b6d29a Mon Sep 17 00:00:00 2001
+From: "Nysal Jan K.A." <nysal@linux.ibm.com>
+Date: Mon, 28 Apr 2025 18:49:35 +0530
+Subject: selftests/mm: fix a build failure on powerpc
+
+From: Nysal Jan K.A. <nysal@linux.ibm.com>
+
+commit 8cf6ecb18baac867585fe1cba5dde6dbf3b6d29a upstream.
+
+The compiler is unaware of the size of code generated by the ".rept"
+assembler directive.  This results in the compiler emitting branch
+instructions where the offset to branch to exceeds the maximum allowed
+value, resulting in build failures like the following:
+
+  CC       protection_keys
+  /tmp/ccypKWAE.s: Assembler messages:
+  /tmp/ccypKWAE.s:2073: Error: operand out of range (0x0000000000020158
+  is not between 0xffffffffffff8000 and 0x0000000000007ffc)
+  /tmp/ccypKWAE.s:2509: Error: operand out of range (0x0000000000020130
+  is not between 0xffffffffffff8000 and 0x0000000000007ffc)
+
+Fix the issue by manually adding nop instructions using the preprocessor.
+
+Link: https://lkml.kernel.org/r/20250428131937.641989-2-nysal@linux.ibm.com
+Fixes: 46036188ea1f ("selftests/mm: build with -O2")
+Reported-by: Madhavan Srinivasan <maddy@linux.ibm.com>
+Signed-off-by: Nysal Jan K.A. <nysal@linux.ibm.com>
+Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
+Reviewed-by: Donet Tom <donettom@linux.ibm.com>
+Tested-by: Donet Tom <donettom@linux.ibm.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/mm/pkey-powerpc.h |   12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+--- a/tools/testing/selftests/mm/pkey-powerpc.h
++++ b/tools/testing/selftests/mm/pkey-powerpc.h
+@@ -102,8 +102,18 @@ void expect_fault_on_read_execonly_key(v
+ 	return;
+ }
+ 
++#define REPEAT_8(s) s s s s s s s s
++#define REPEAT_64(s) REPEAT_8(s) REPEAT_8(s) REPEAT_8(s) REPEAT_8(s) \
++		     REPEAT_8(s) REPEAT_8(s) REPEAT_8(s) REPEAT_8(s)
++#define REPEAT_512(s) REPEAT_64(s) REPEAT_64(s) REPEAT_64(s) REPEAT_64(s) \
++		      REPEAT_64(s) REPEAT_64(s) REPEAT_64(s) REPEAT_64(s)
++#define REPEAT_4096(s) REPEAT_512(s) REPEAT_512(s) REPEAT_512(s) REPEAT_512(s) \
++		       REPEAT_512(s) REPEAT_512(s) REPEAT_512(s) REPEAT_512(s)
++#define REPEAT_16384(s) REPEAT_4096(s) REPEAT_4096(s) \
++			REPEAT_4096(s) REPEAT_4096(s)
++
+ /* 4-byte instructions * 16384 = 64K page */
+-#define __page_o_noops() asm(".rept 16384 ; nop; .endr")
++#define __page_o_noops() asm(REPEAT_16384("nop\n"))
+ 
+ void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+ {
diff --git a/queue-6.12/series b/queue-6.12/series
index 8a24b85d32..7c40bbb591 100644
--- a/queue-6.12/series
+++ b/queue-6.12/series
@@ -69,3 +69,12 @@ staging-iio-adc-ad7816-correct-conditional-logic-for-store-mode.patch
 staging-bcm2835-camera-initialise-dev-in-v4l2_dev.patch
 staging-axis-fifo-remove-hardware-resets-for-user-errors.patch
 staging-axis-fifo-correct-handling-of-tx_fifo_depth-for-size-validation.patch
+x86-mm-eliminate-window-where-tlb-flushes-may-be-inadvertently-skipped.patch
+mm-fix-folio_pte_batch-on-xen-pv.patch
+mm-vmalloc-support-more-granular-vrealloc-sizing.patch
+mm-huge_memory-fix-dereferencing-invalid-pmd-migration-entry.patch
+mm-userfaultfd-fix-uninitialized-output-field-for-eagain-race.patch
+selftests-mm-compaction_test-support-platform-with-huge-mount-of-memory.patch
+selftests-mm-fix-a-build-failure-on-powerpc.patch
+kvm-svm-forcibly-leave-smm-mode-on-shutdown-interception.patch
+drm-amd-display-shift-dmub-aux-reply-command-if-necessary.patch
diff --git a/queue-6.12/x86-mm-eliminate-window-where-tlb-flushes-may-be-inadvertently-skipped.patch b/queue-6.12/x86-mm-eliminate-window-where-tlb-flushes-may-be-inadvertently-skipped.patch
new file mode 100644
index 0000000000..3644848a38
--- /dev/null
+++ b/queue-6.12/x86-mm-eliminate-window-where-tlb-flushes-may-be-inadvertently-skipped.patch
@@ -0,0 +1,131 @@
+From fea4e317f9e7e1f449ce90dedc27a2d2a95bee5a Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Thu, 8 May 2025 15:41:32 -0700
+Subject: x86/mm: Eliminate window where TLB flushes may be inadvertently skipped
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit fea4e317f9e7e1f449ce90dedc27a2d2a95bee5a upstream.
+
+tl;dr: There is a window in the mm switching code where the new CR3 is
+set and the CPU should be getting TLB flushes for the new mm.  But
+should_flush_tlb() has a bug and suppresses the flush.  Fix it by
+widening the window where should_flush_tlb() sends an IPI.
+
+Long Version:
+
+=== History ===
+
+There were a few things leading up to this.
+
+First, updating mm_cpumask() was observed to be too expensive, so it was
+made lazier.  But being lazy caused too many unnecessary IPIs to CPUs
+due to the now-lazy mm_cpumask().  So code was added to cull
+mm_cpumask() periodically[2].  But that culling was a bit too aggressive
+and skipped sending TLB flushes to CPUs that need them.  So here we are
+again.
+
+=== Problem ===
+
+The too-aggressive code in should_flush_tlb() strikes in this window:
+
+	// Turn on IPIs for this CPU/mm combination, but only
+	// if should_flush_tlb() agrees:
+	cpumask_set_cpu(cpu, mm_cpumask(next));
+
+	next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+	choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+	load_new_mm_cr3(need_flush);
+	// ^ After 'need_flush' is set to false, IPIs *MUST*
+	// be sent to this CPU and not be ignored.
+
+        this_cpu_write(cpu_tlbstate.loaded_mm, next);
+	// ^ Not until this point does should_flush_tlb()
+	// become true!
+
+should_flush_tlb() will suppress TLB flushes between load_new_mm_cr3()
+and writing to 'loaded_mm', which is a window where they should not be
+suppressed.  Whoops.
+
+=== Solution ===
+
+Thankfully, the fuzzy "just about to write CR3" window is already marked
+with loaded_mm==LOADED_MM_SWITCHING.  Simply checking for that state in
+should_flush_tlb() is sufficient to ensure that the CPU is targeted with
+an IPI.
+
+This will cause more TLB flush IPIs.  But the window is relatively small
+and I do not expect this to cause any kind of measurable performance
+impact.
+
+Update the comment where LOADED_MM_SWITCHING is written since it grew
+yet another user.
+
+Peter Z also raised a concern that should_flush_tlb() might not observe
+'loaded_mm' and 'is_lazy' in the same order that switch_mm_irqs_off()
+writes them.  Add a barrier to ensure that they are observed in the
+order they are written.
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Acked-by: Rik van Riel <riel@surriel.com>
+Link: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/ [1]
+Fixes: 6db2526c1d69 ("x86/mm/tlb: Only trim the mm_cpumask once a second") [2]
+Reported-by: Stephen Dolan <sdolan@janestreet.com>
+Cc: stable@vger.kernel.org
+Acked-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/mm/tlb.c |   23 +++++++++++++++++++++--
+ 1 file changed, 21 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -624,7 +624,11 @@ void switch_mm_irqs_off(struct mm_struct
+ 
+ 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+ 
+-		/* Let nmi_uaccess_okay() know that we're changing CR3. */
++		/*
++		 * Indicate that CR3 is about to change. nmi_uaccess_okay()
++		 * and others are sensitive to the window where mm_cpumask(),
++		 * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
++ 		 */
+ 		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+ 		barrier();
+ 	}
+@@ -895,8 +899,16 @@ done:
+ 
+ static bool should_flush_tlb(int cpu, void *data)
+ {
++	struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu);
+ 	struct flush_tlb_info *info = data;
+ 
++	/*
++	 * Order the 'loaded_mm' and 'is_lazy' against their
++	 * write ordering in switch_mm_irqs_off(). Ensure
++	 * 'is_lazy' is at least as new as 'loaded_mm'.
++	 */
++	smp_rmb();
++
+ 	/* Lazy TLB will get flushed at the next context switch. */
+ 	if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ 		return false;
+@@ -905,8 +917,15 @@ static bool should_flush_tlb(int cpu, vo
+ 	if (!info->mm)
+ 		return true;
+ 
++	/*
++	 * While switching, the remote CPU could have state from
++	 * either the prev or next mm. Assume the worst and flush.
++	 */
++	if (loaded_mm == LOADED_MM_SWITCHING)
++		return true;
++
+ 	/* The target mm is loaded, and the CPU is not lazy. */
+-	if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
++	if (loaded_mm == info->mm)
+ 		return true;
+ 
+ 	/* In cpumask, but not the loaded mm? Periodically remove by flushing. */