From: Greg Kroah-Hartman Date: Mon, 12 May 2025 09:39:35 +0000 (+0200) Subject: 6.12-stable patches X-Git-Tag: v5.15.183~53 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=6bd3f7d8e9a05e24ad636f6fa4627ba607ae0ec8;p=thirdparty%2Fkernel%2Fstable-queue.git 6.12-stable patches added patches: drm-amd-display-shift-dmub-aux-reply-command-if-necessary.patch kvm-svm-forcibly-leave-smm-mode-on-shutdown-interception.patch mm-fix-folio_pte_batch-on-xen-pv.patch mm-huge_memory-fix-dereferencing-invalid-pmd-migration-entry.patch mm-userfaultfd-fix-uninitialized-output-field-for-eagain-race.patch mm-vmalloc-support-more-granular-vrealloc-sizing.patch selftests-mm-compaction_test-support-platform-with-huge-mount-of-memory.patch selftests-mm-fix-a-build-failure-on-powerpc.patch x86-mm-eliminate-window-where-tlb-flushes-may-be-inadvertently-skipped.patch --- diff --git a/queue-6.12/drm-amd-display-shift-dmub-aux-reply-command-if-necessary.patch b/queue-6.12/drm-amd-display-shift-dmub-aux-reply-command-if-necessary.patch new file mode 100644 index 0000000000..002f825e89 --- /dev/null +++ b/queue-6.12/drm-amd-display-shift-dmub-aux-reply-command-if-necessary.patch @@ -0,0 +1,47 @@ +From 5a3846648c0523fd850b7f0aec78c0139453ab8b Mon Sep 17 00:00:00 2001 +From: Wayne Lin +Date: Fri, 18 Apr 2025 16:31:59 +0800 +Subject: drm/amd/display: Shift DMUB AUX reply command if necessary + +From: Wayne Lin + +commit 5a3846648c0523fd850b7f0aec78c0139453ab8b upstream. + +[Why] +Defined value of dmub AUX reply command field get updated but didn't +adjust dm receiving side accordingly. + +[How] +Check the received reply command value to see if it's updated version +or not. Adjust it if necessary. 
+ +Fixes: ead08b95fa50 ("drm/amd/display: Fix race condition in DPIA AUX transfer") +Cc: Mario Limonciello +Cc: Alex Deucher +Reviewed-by: Ray Wu +Signed-off-by: Wayne Lin +Signed-off-by: Ray Wu +Tested-by: Daniel Wheeler +Signed-off-by: Alex Deucher +(cherry picked from commit d5c9ade755a9afa210840708a12a8f44c0d532f4) +Cc: stable@vger.kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +@@ -12535,8 +12535,11 @@ int amdgpu_dm_process_dmub_aux_transfer_ + goto out; + } + ++ payload->reply[0] = adev->dm.dmub_notify->aux_reply.command & 0xF; ++ if (adev->dm.dmub_notify->aux_reply.command & 0xF0) ++ /* The reply is stored in the top nibble of the command. */ ++ payload->reply[0] = (adev->dm.dmub_notify->aux_reply.command >> 4) & 0xF; + +- payload->reply[0] = adev->dm.dmub_notify->aux_reply.command; + if (!payload->write && p_notify->aux_reply.length && + (payload->reply[0] == AUX_TRANSACTION_REPLY_AUX_ACK)) { + diff --git a/queue-6.12/kvm-svm-forcibly-leave-smm-mode-on-shutdown-interception.patch b/queue-6.12/kvm-svm-forcibly-leave-smm-mode-on-shutdown-interception.patch new file mode 100644 index 0000000000..9db7ddfc27 --- /dev/null +++ b/queue-6.12/kvm-svm-forcibly-leave-smm-mode-on-shutdown-interception.patch @@ -0,0 +1,97 @@ +From a2620f8932fa9fdabc3d78ed6efb004ca409019f Mon Sep 17 00:00:00 2001 +From: Mikhail Lobanov +Date: Mon, 14 Apr 2025 20:12:06 +0300 +Subject: KVM: SVM: Forcibly leave SMM mode on SHUTDOWN interception + +From: Mikhail Lobanov + +commit a2620f8932fa9fdabc3d78ed6efb004ca409019f upstream. + +Previously, commit ed129ec9057f ("KVM: x86: forcibly leave nested mode +on vCPU reset") addressed an issue where a triple fault occurring in +nested mode could lead to use-after-free scenarios. 
However, the commit +did not handle the analogous situation for System Management Mode (SMM). + +This omission results in triggering a WARN when KVM forces a vCPU INIT +after SHUTDOWN interception while the vCPU is in SMM. This situation was +reproduced using Syzkaller by: + + 1) Creating a KVM VM and vCPU + 2) Sending a KVM_SMI ioctl to explicitly enter SMM + 3) Executing invalid instructions causing consecutive exceptions and + eventually a triple fault + +The issue manifests as follows: + + WARNING: CPU: 0 PID: 25506 at arch/x86/kvm/x86.c:12112 + kvm_vcpu_reset+0x1d2/0x1530 arch/x86/kvm/x86.c:12112 + Modules linked in: + CPU: 0 PID: 25506 Comm: syz-executor.0 Not tainted + 6.1.130-syzkaller-00157-g164fe5dde9b6 #0 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), + BIOS 1.12.0-1 04/01/2014 + RIP: 0010:kvm_vcpu_reset+0x1d2/0x1530 arch/x86/kvm/x86.c:12112 + Call Trace: + + shutdown_interception+0x66/0xb0 arch/x86/kvm/svm/svm.c:2136 + svm_invoke_exit_handler+0x110/0x530 arch/x86/kvm/svm/svm.c:3395 + svm_handle_exit+0x424/0x920 arch/x86/kvm/svm/svm.c:3457 + vcpu_enter_guest arch/x86/kvm/x86.c:10959 [inline] + vcpu_run+0x2c43/0x5a90 arch/x86/kvm/x86.c:11062 + kvm_arch_vcpu_ioctl_run+0x50f/0x1cf0 arch/x86/kvm/x86.c:11283 + kvm_vcpu_ioctl+0x570/0xf00 arch/x86/kvm/../../../virt/kvm/kvm_main.c:4122 + vfs_ioctl fs/ioctl.c:51 [inline] + __do_sys_ioctl fs/ioctl.c:870 [inline] + __se_sys_ioctl fs/ioctl.c:856 [inline] + __x64_sys_ioctl+0x19a/0x210 fs/ioctl.c:856 + do_syscall_x64 arch/x86/entry/common.c:51 [inline] + do_syscall_64+0x35/0x80 arch/x86/entry/common.c:81 + entry_SYSCALL_64_after_hwframe+0x6e/0xd8 + +Architecturally, INIT is blocked when the CPU is in SMM, hence KVM's WARN() +in kvm_vcpu_reset() to guard against KVM bugs, e.g. to detect improper +emulation of INIT. SHUTDOWN on SVM is a weird edge case where KVM needs to +do _something_ sane with the VMCB, since it's technically undefined, and +INIT is the least awful choice given KVM's ABI. 
+ +So, double down on stuffing INIT on SHUTDOWN, and force the vCPU out of +SMM to avoid any weirdness (and the WARN). + +Found by Linux Verification Center (linuxtesting.org) with Syzkaller. + +Fixes: ed129ec9057f ("KVM: x86: forcibly leave nested mode on vCPU reset") +Cc: stable@vger.kernel.org +Suggested-by: Sean Christopherson +Signed-off-by: Mikhail Lobanov +Link: https://lore.kernel.org/r/20250414171207.155121-1-m.lobanov@rosa.ru +[sean: massage changelog, make it clear this isn't architectural behavior] +Signed-off-by: Sean Christopherson +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/smm.c | 1 + + arch/x86/kvm/svm/svm.c | 4 ++++ + 2 files changed, 5 insertions(+) + +--- a/arch/x86/kvm/smm.c ++++ b/arch/x86/kvm/smm.c +@@ -131,6 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vc + + kvm_mmu_reset_context(vcpu); + } ++EXPORT_SYMBOL_GPL(kvm_smm_changed); + + void process_smi(struct kvm_vcpu *vcpu) + { +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -2222,6 +2222,10 @@ static int shutdown_interception(struct + */ + if (!sev_es_guest(vcpu->kvm)) { + clear_page(svm->vmcb); ++#ifdef CONFIG_KVM_SMM ++ if (is_smm(vcpu)) ++ kvm_smm_changed(vcpu, false); ++#endif + kvm_vcpu_reset(vcpu, true); + } + diff --git a/queue-6.12/mm-fix-folio_pte_batch-on-xen-pv.patch b/queue-6.12/mm-fix-folio_pte_batch-on-xen-pv.patch new file mode 100644 index 0000000000..e6862e3b53 --- /dev/null +++ b/queue-6.12/mm-fix-folio_pte_batch-on-xen-pv.patch @@ -0,0 +1,153 @@ +From 7b08b74f3d99f6b801250683c751d391128799ec Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Petr=20Van=C4=9Bk?= +Date: Fri, 2 May 2025 23:50:19 +0200 +Subject: mm: fix folio_pte_batch() on XEN PV +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Petr Vaněk + +commit 7b08b74f3d99f6b801250683c751d391128799ec upstream. + +On XEN PV, folio_pte_batch() can incorrectly batch beyond the end of a +folio due to a corner case in pte_advance_pfn(). 
Specifically, when the +PFN following the folio maps to an invalidated MFN, + + expected_pte = pte_advance_pfn(expected_pte, nr); + +produces a pte_none(). If the actual next PTE in memory is also +pte_none(), the pte_same() succeeds, + + if (!pte_same(pte, expected_pte)) + break; + +the loop is not broken, and batching continues into unrelated memory. + +For example, with a 4-page folio, the PTE layout might look like this: + +[ 53.465673] [ T2552] folio_pte_batch: printing PTE values at addr=0x7f1ac9dc5000 +[ 53.465674] [ T2552] PTE[453] = 000000010085c125 +[ 53.465679] [ T2552] PTE[454] = 000000010085d125 +[ 53.465682] [ T2552] PTE[455] = 000000010085e125 +[ 53.465684] [ T2552] PTE[456] = 000000010085f125 +[ 53.465686] [ T2552] PTE[457] = 0000000000000000 <-- not present +[ 53.465689] [ T2552] PTE[458] = 0000000101da7125 + +pte_advance_pfn(PTE[456]) returns a pte_none() due to invalid PFN->MFN +mapping. The next actual PTE (PTE[457]) is also pte_none(), so the loop +continues and includes PTE[457] in the batch, resulting in 5 batched +entries for a 4-page folio. 
This triggers the following warning: + +[ 53.465751] [ T2552] page: refcount:85 mapcount:20 mapping:ffff88813ff4f6a8 index:0x110 pfn:0x10085c +[ 53.465754] [ T2552] head: order:2 mapcount:80 entire_mapcount:0 nr_pages_mapped:4 pincount:0 +[ 53.465756] [ T2552] memcg:ffff888003573000 +[ 53.465758] [ T2552] aops:0xffffffff8226fd20 ino:82467c dentry name(?):"libc.so.6" +[ 53.465761] [ T2552] flags: 0x2000000000416c(referenced|uptodate|lru|active|private|head|node=0|zone=2) +[ 53.465764] [ T2552] raw: 002000000000416c ffffea0004021f08 ffffea0004021908 ffff88813ff4f6a8 +[ 53.465767] [ T2552] raw: 0000000000000110 ffff888133d8bd40 0000005500000013 ffff888003573000 +[ 53.465768] [ T2552] head: 002000000000416c ffffea0004021f08 ffffea0004021908 ffff88813ff4f6a8 +[ 53.465770] [ T2552] head: 0000000000000110 ffff888133d8bd40 0000005500000013 ffff888003573000 +[ 53.465772] [ T2552] head: 0020000000000202 ffffea0004021701 000000040000004f 00000000ffffffff +[ 53.465774] [ T2552] head: 0000000300000003 8000000300000002 0000000000000013 0000000000000004 +[ 53.465775] [ T2552] page dumped because: VM_WARN_ON_FOLIO((_Generic((page + nr_pages - 1), const struct page *: (const struct folio *)_compound_head(page + nr_pages - 1), struct page *: (struct folio *)_compound_head(page + nr_pages - 1))) != folio) + +Original code works as expected everywhere, except on XEN PV, where +pte_advance_pfn() can yield a pte_none() after balloon inflation due to +MFNs invalidation. In XEN, pte_advance_pfn() ends up calling +__pte()->xen_make_pte()->pte_pfn_to_mfn(), which returns pte_none() when +mfn == INVALID_P2M_ENTRY. + +The pte_pfn_to_mfn() documents that nastiness: + + If there's no mfn for the pfn, then just create an + empty non-present pte. Unfortunately this loses + information about the original pfn, so + pte_mfn_to_pfn is asymmetric. 
+ +While such hacks should certainly be removed, we can do better in +folio_pte_batch() and simply check ahead of time how many PTEs we can +possibly batch in our folio. + +This way, we can not only fix the issue but cleanup the code: removing the +pte_pfn() check inside the loop body and avoiding end_ptr comparison + +arithmetic. + +Link: https://lkml.kernel.org/r/20250502215019.822-2-arkamar@atlas.cz +Fixes: f8d937761d65 ("mm/memory: optimize fork() with PTE-mapped THP") +Co-developed-by: David Hildenbrand +Signed-off-by: David Hildenbrand +Signed-off-by: Petr Vaněk +Cc: Ryan Roberts +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/internal.h | 27 +++++++++++---------------- + 1 file changed, 11 insertions(+), 16 deletions(-) + +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -204,11 +204,9 @@ static inline int folio_pte_batch(struct + pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, + bool *any_writable, bool *any_young, bool *any_dirty) + { +- unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); +- const pte_t *end_ptep = start_ptep + max_nr; + pte_t expected_pte, *ptep; + bool writable, young, dirty; +- int nr; ++ int nr, cur_nr; + + if (any_writable) + *any_writable = false; +@@ -221,11 +219,15 @@ static inline int folio_pte_batch(struct + VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); + VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio); + ++ /* Limit max_nr to the actual remaining PFNs in the folio we could batch. 
*/ ++ max_nr = min_t(unsigned long, max_nr, ++ folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte)); ++ + nr = pte_batch_hint(start_ptep, pte); + expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); + ptep = start_ptep + nr; + +- while (ptep < end_ptep) { ++ while (nr < max_nr) { + pte = ptep_get(ptep); + if (any_writable) + writable = !!pte_write(pte); +@@ -238,14 +240,6 @@ static inline int folio_pte_batch(struct + if (!pte_same(pte, expected_pte)) + break; + +- /* +- * Stop immediately once we reached the end of the folio. In +- * corner cases the next PFN might fall into a different +- * folio. +- */ +- if (pte_pfn(pte) >= folio_end_pfn) +- break; +- + if (any_writable) + *any_writable |= writable; + if (any_young) +@@ -253,12 +247,13 @@ static inline int folio_pte_batch(struct + if (any_dirty) + *any_dirty |= dirty; + +- nr = pte_batch_hint(ptep, pte); +- expected_pte = pte_advance_pfn(expected_pte, nr); +- ptep += nr; ++ cur_nr = pte_batch_hint(ptep, pte); ++ expected_pte = pte_advance_pfn(expected_pte, cur_nr); ++ ptep += cur_nr; ++ nr += cur_nr; + } + +- return min(ptep - start_ptep, max_nr); ++ return min(nr, max_nr); + } + + /** diff --git a/queue-6.12/mm-huge_memory-fix-dereferencing-invalid-pmd-migration-entry.patch b/queue-6.12/mm-huge_memory-fix-dereferencing-invalid-pmd-migration-entry.patch new file mode 100644 index 0000000000..21327f9636 --- /dev/null +++ b/queue-6.12/mm-huge_memory-fix-dereferencing-invalid-pmd-migration-entry.patch @@ -0,0 +1,90 @@ +From be6e843fc51a584672dfd9c4a6a24c8cb81d5fb7 Mon Sep 17 00:00:00 2001 +From: Gavin Guo +Date: Mon, 21 Apr 2025 19:35:36 +0800 +Subject: mm/huge_memory: fix dereferencing invalid pmd migration entry + +From: Gavin Guo + +commit be6e843fc51a584672dfd9c4a6a24c8cb81d5fb7 upstream. + +When migrating a THP, concurrent access to the PMD migration entry during +a deferred split scan can lead to an invalid address access, as +illustrated below. 
To prevent this invalid access, it is necessary to +check the PMD migration entry and return early. In this context, there is +no need to use pmd_to_swp_entry and pfn_swap_entry_to_page to verify the +equality of the target folio. Since the PMD migration entry is locked, it +cannot serve as the target. + +Mailing list discussion and explanation from Hugh Dickins: "An anon_vma +lookup points to a location which may contain the folio of interest, but +might instead contain another folio: and weeding out those other folios is +precisely what the "folio != pmd_folio(*pmd)" check (and the "risk of +replacing the wrong folio" comment a few lines above it) is for." + +BUG: unable to handle page fault for address: ffffea60001db008 +CPU: 0 UID: 0 PID: 2199114 Comm: tee Not tainted 6.14.0+ #4 NONE +Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 +RIP: 0010:split_huge_pmd_locked+0x3b5/0x2b60 +Call Trace: + +try_to_migrate_one+0x28c/0x3730 +rmap_walk_anon+0x4f6/0x770 +unmap_folio+0x196/0x1f0 +split_huge_page_to_list_to_order+0x9f6/0x1560 +deferred_split_scan+0xac5/0x12a0 +shrinker_debugfs_scan_write+0x376/0x470 +full_proxy_write+0x15c/0x220 +vfs_write+0x2fc/0xcb0 +ksys_write+0x146/0x250 +do_syscall_64+0x6a/0x120 +entry_SYSCALL_64_after_hwframe+0x76/0x7e + +The bug is found by syzkaller on an internal kernel, then confirmed on +upstream. 
+ +Link: https://lkml.kernel.org/r/20250421113536.3682201-1-gavinguo@igalia.com +Link: https://lore.kernel.org/all/20250414072737.1698513-1-gavinguo@igalia.com/ +Link: https://lore.kernel.org/all/20250418085802.2973519-1-gavinguo@igalia.com/ +Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path") +Signed-off-by: Gavin Guo +Acked-by: David Hildenbrand +Acked-by: Hugh Dickins +Acked-by: Zi Yan +Reviewed-by: Gavin Shan +Cc: Florent Revest +Cc: Matthew Wilcox (Oracle) +Cc: Miaohe Lin +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/huge_memory.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2879,6 +2879,8 @@ static void __split_huge_pmd_locked(stru + void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, bool freeze, struct folio *folio) + { ++ bool pmd_migration = is_pmd_migration_entry(*pmd); ++ + VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio)); + VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); + VM_WARN_ON_ONCE(folio && !folio_test_locked(folio)); +@@ -2889,9 +2891,12 @@ void split_huge_pmd_locked(struct vm_are + * require a folio to check the PMD against. Otherwise, there + * is a risk of replacing the wrong folio. + */ +- if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || +- is_pmd_migration_entry(*pmd)) { +- if (folio && folio != pmd_folio(*pmd)) ++ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || pmd_migration) { ++ /* ++ * Do not apply pmd_folio() to a migration entry; and folio lock ++ * guarantees that it must be of the wrong folio anyway. 
++ */ ++ if (folio && (pmd_migration || folio != pmd_folio(*pmd))) + return; + __split_huge_pmd_locked(vma, pmd, address, freeze); + } diff --git a/queue-6.12/mm-userfaultfd-fix-uninitialized-output-field-for-eagain-race.patch b/queue-6.12/mm-userfaultfd-fix-uninitialized-output-field-for-eagain-race.patch new file mode 100644 index 0000000000..77642ff904 --- /dev/null +++ b/queue-6.12/mm-userfaultfd-fix-uninitialized-output-field-for-eagain-race.patch @@ -0,0 +1,128 @@ +From 95567729173e62e0e60a1f8ad9eb2e1320a8ccac Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Thu, 24 Apr 2025 17:57:28 -0400 +Subject: mm/userfaultfd: fix uninitialized output field for -EAGAIN race + +From: Peter Xu + +commit 95567729173e62e0e60a1f8ad9eb2e1320a8ccac upstream. + +While discussing some userfaultfd relevant issues recently, Andrea noticed +a potential ABI breakage with -EAGAIN on almost all userfaultfd ioctl()s. + +Quote from Andrea, explaining how -EAGAIN was processed, and how this +should fix it (taking example of UFFDIO_COPY ioctl): + + The "mmap_changing" and "stale pmd" conditions are already reported as + -EAGAIN written in the copy field, this does not change it. This change + removes the subnormal case that left copy.copy uninitialized and required + apps to explicitly set the copy field to get deterministic + behavior (which is a requirement contrary to the documentation in both + the manpage and source code). In turn there's no alteration to backwards + compatibility as result of this change because userland will find the + copy field consistently set to -EAGAIN, and not anymore sometime -EAGAIN + and sometime uninitialized. + + Even then the change only can make a difference to non cooperative users + of userfaultfd, so when UFFD_FEATURE_EVENT_* is enabled, which is not + true for the vast majority of apps using userfaultfd or this unintended + uninitialized field may have been noticed sooner. 
+ +Meanwhile, since this bug existed for years, it also affects almost all +ioctl()s that were introduced later. Besides UFFDIO_ZEROPAGE, these also +get affected in the same way: + + - UFFDIO_CONTINUE + - UFFDIO_POISON + - UFFDIO_MOVE + +This patch should have fixed all of them. + +Link: https://lkml.kernel.org/r/20250424215729.194656-2-peterx@redhat.com +Fixes: df2cc96e7701 ("userfaultfd: prevent non-cooperative events vs mcopy_atomic races") +Fixes: f619147104c8 ("userfaultfd: add UFFDIO_CONTINUE ioctl") +Fixes: fc71884a5f59 ("mm: userfaultfd: add new UFFDIO_POISON ioctl") +Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI") +Signed-off-by: Peter Xu +Reported-by: Andrea Arcangeli +Suggested-by: Andrea Arcangeli +Reviewed-by: David Hildenbrand +Cc: Mike Rapoport +Cc: Axel Rasmussen +Cc: Suren Baghdasaryan +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + fs/userfaultfd.c | 28 ++++++++++++++++++++++------ + 1 file changed, 22 insertions(+), 6 deletions(-) + +--- a/fs/userfaultfd.c ++++ b/fs/userfaultfd.c +@@ -1585,8 +1585,11 @@ static int userfaultfd_copy(struct userf + user_uffdio_copy = (struct uffdio_copy __user *) arg; + + ret = -EAGAIN; +- if (atomic_read(&ctx->mmap_changing)) ++ if (unlikely(atomic_read(&ctx->mmap_changing))) { ++ if (unlikely(put_user(ret, &user_uffdio_copy->copy))) ++ return -EFAULT; + goto out; ++ } + + ret = -EFAULT; + if (copy_from_user(&uffdio_copy, user_uffdio_copy, +@@ -1641,8 +1644,11 @@ static int userfaultfd_zeropage(struct u + user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; + + ret = -EAGAIN; +- if (atomic_read(&ctx->mmap_changing)) ++ if (unlikely(atomic_read(&ctx->mmap_changing))) { ++ if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) ++ return -EFAULT; + goto out; ++ } + + ret = -EFAULT; + if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, +@@ -1744,8 +1750,11 @@ static int userfaultfd_continue(struct u + user_uffdio_continue = (struct uffdio_continue __user *)arg; 
+ + ret = -EAGAIN; +- if (atomic_read(&ctx->mmap_changing)) ++ if (unlikely(atomic_read(&ctx->mmap_changing))) { ++ if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) ++ return -EFAULT; + goto out; ++ } + + ret = -EFAULT; + if (copy_from_user(&uffdio_continue, user_uffdio_continue, +@@ -1801,8 +1810,11 @@ static inline int userfaultfd_poison(str + user_uffdio_poison = (struct uffdio_poison __user *)arg; + + ret = -EAGAIN; +- if (atomic_read(&ctx->mmap_changing)) ++ if (unlikely(atomic_read(&ctx->mmap_changing))) { ++ if (unlikely(put_user(ret, &user_uffdio_poison->updated))) ++ return -EFAULT; + goto out; ++ } + + ret = -EFAULT; + if (copy_from_user(&uffdio_poison, user_uffdio_poison, +@@ -1870,8 +1882,12 @@ static int userfaultfd_move(struct userf + + user_uffdio_move = (struct uffdio_move __user *) arg; + +- if (atomic_read(&ctx->mmap_changing)) +- return -EAGAIN; ++ ret = -EAGAIN; ++ if (unlikely(atomic_read(&ctx->mmap_changing))) { ++ if (unlikely(put_user(ret, &user_uffdio_move->move))) ++ return -EFAULT; ++ goto out; ++ } + + if (copy_from_user(&uffdio_move, user_uffdio_move, + /* don't copy "move" last field */ diff --git a/queue-6.12/mm-vmalloc-support-more-granular-vrealloc-sizing.patch b/queue-6.12/mm-vmalloc-support-more-granular-vrealloc-sizing.patch new file mode 100644 index 0000000000..276db1d7b4 --- /dev/null +++ b/queue-6.12/mm-vmalloc-support-more-granular-vrealloc-sizing.patch @@ -0,0 +1,122 @@ +From a0309faf1cb0622cac7c820150b7abf2024acff5 Mon Sep 17 00:00:00 2001 +From: Kees Cook +Date: Fri, 25 Apr 2025 17:11:07 -0700 +Subject: mm: vmalloc: support more granular vrealloc() sizing + +From: Kees Cook + +commit a0309faf1cb0622cac7c820150b7abf2024acff5 upstream. + +Introduce struct vm_struct::requested_size so that the requested +(re)allocation size is retained separately from the allocated area size. +This means that KASAN will correctly poison the correct spans of requested +bytes. 
This also means we can support growing the usable portion of an +allocation that can already be supported by the existing area's existing +allocation. + +Link: https://lkml.kernel.org/r/20250426001105.it.679-kees@kernel.org +Fixes: 3ddc2fefe6f3 ("mm: vmalloc: implement vrealloc()") +Signed-off-by: Kees Cook +Reported-by: Erhard Furtner +Closes: https://lore.kernel.org/all/20250408192503.6149a816@outsider.home/ +Reviewed-by: Danilo Krummrich +Cc: Michal Hocko +Cc: "Uladzislau Rezki (Sony)" +Cc: Vlastimil Babka +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/vmalloc.h | 1 + + mm/vmalloc.c | 31 ++++++++++++++++++++++++------- + 2 files changed, 25 insertions(+), 7 deletions(-) + +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -61,6 +61,7 @@ struct vm_struct { + unsigned int nr_pages; + phys_addr_t phys_addr; + const void *caller; ++ unsigned long requested_size; + }; + + struct vmap_area { +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -1940,7 +1940,7 @@ static inline void setup_vmalloc_vm(stru + { + vm->flags = flags; + vm->addr = (void *)va->va_start; +- vm->size = va_size(va); ++ vm->size = vm->requested_size = va_size(va); + vm->caller = caller; + va->vm = vm; + } +@@ -3128,6 +3128,7 @@ static struct vm_struct *__get_vm_area_n + + area->flags = flags; + area->caller = caller; ++ area->requested_size = requested_size; + + va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area); + if (IS_ERR(va)) { +@@ -4067,6 +4068,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof); + */ + void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) + { ++ struct vm_struct *vm = NULL; ++ size_t alloced_size = 0; + size_t old_size = 0; + void *n; + +@@ -4076,15 +4079,17 @@ void *vrealloc_noprof(const void *p, siz + } + + if (p) { +- struct vm_struct *vm; +- + vm = find_vm_area(p); + if (unlikely(!vm)) { + WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p); + return NULL; + } + +- old_size = 
get_vm_area_size(vm); ++ alloced_size = get_vm_area_size(vm); ++ old_size = vm->requested_size; ++ if (WARN(alloced_size < old_size, ++ "vrealloc() has mismatched area vs requested sizes (%p)\n", p)) ++ return NULL; + } + + /* +@@ -4092,14 +4097,26 @@ void *vrealloc_noprof(const void *p, siz + * would be a good heuristic for when to shrink the vm_area? + */ + if (size <= old_size) { +- /* Zero out spare memory. */ +- if (want_init_on_alloc(flags)) ++ /* Zero out "freed" memory. */ ++ if (want_init_on_free()) + memset((void *)p + size, 0, old_size - size); ++ vm->requested_size = size; + kasan_poison_vmalloc(p + size, old_size - size); +- kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL); + return (void *)p; + } + ++ /* ++ * We already have the bytes available in the allocation; use them. ++ */ ++ if (size <= alloced_size) { ++ kasan_unpoison_vmalloc(p + old_size, size - old_size, ++ KASAN_VMALLOC_PROT_NORMAL); ++ /* Zero out "alloced" memory. */ ++ if (want_init_on_alloc(flags)) ++ memset((void *)p + old_size, 0, size - old_size); ++ vm->requested_size = size; ++ } ++ + /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */ + n = __vmalloc_noprof(size, flags); + if (!n) diff --git a/queue-6.12/selftests-mm-compaction_test-support-platform-with-huge-mount-of-memory.patch b/queue-6.12/selftests-mm-compaction_test-support-platform-with-huge-mount-of-memory.patch new file mode 100644 index 0000000000..cb7881a499 --- /dev/null +++ b/queue-6.12/selftests-mm-compaction_test-support-platform-with-huge-mount-of-memory.patch @@ -0,0 +1,72 @@ +From ab00ddd802f80e31fc9639c652d736fe3913feae Mon Sep 17 00:00:00 2001 +From: Feng Tang +Date: Wed, 23 Apr 2025 18:36:45 +0800 +Subject: selftests/mm: compaction_test: support platform with huge mount of memory + +From: Feng Tang + +commit ab00ddd802f80e31fc9639c652d736fe3913feae upstream. + +When running mm selftest to verify mm patches, 'compaction_test' case +failed on an x86 server with 1TB memory. 
And the root cause is that it +has more free memory than the test supports. + +The test case tries to allocate 100000 huge pages, which is about 200 GB +for that x86 server, and when it succeeds, it expects it to be larger than 1/3 +of 80% of the free memory in the system. This logic only works for platforms +with 750 GB ( 200 / (1/3) / 80% ) or less free memory, and may raise a false +alarm for others. + +Fix it by changing the fixed page number to a self-adjusting number +based on the actual amount of free memory. + +Link: https://lkml.kernel.org/r/20250423103645.2758-1-feng.tang@linux.alibaba.com +Fixes: bd67d5c15cc1 ("Test compaction of mlocked memory") +Signed-off-by: Feng Tang +Acked-by: Dev Jain +Reviewed-by: Baolin Wang +Tested-by: Baolin Wang +Cc: Shuah Khan +Cc: Sri Jayaramappa +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/mm/compaction_test.c | 19 ++++++++++++++----- + 1 file changed, 14 insertions(+), 5 deletions(-) + +--- a/tools/testing/selftests/mm/compaction_test.c ++++ b/tools/testing/selftests/mm/compaction_test.c +@@ -90,6 +90,8 @@ int check_compaction(unsigned long mem_f + int compaction_index = 0; + char nr_hugepages[20] = {0}; + char init_nr_hugepages[24] = {0}; ++ char target_nr_hugepages[24] = {0}; ++ int slen; + + snprintf(init_nr_hugepages, sizeof(init_nr_hugepages), + "%lu", initial_nr_hugepages); +@@ -106,11 +108,18 @@ int check_compaction(unsigned long mem_f + goto out; + } + +- /* Request a large number of huge pages. The Kernel will allocate +- as much as it can */ +- if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) { +- ksft_print_msg("Failed to write 100000 to /proc/sys/vm/nr_hugepages: %s\n", +- strerror(errno)); ++ /* ++ * Request huge pages for about half of the free memory. 
The Kernel ++ * will allocate as much as it can, and we expect it will get at least 1/3 ++ */ ++ nr_hugepages_ul = mem_free / hugepage_size / 2; ++ snprintf(target_nr_hugepages, sizeof(target_nr_hugepages), ++ "%lu", nr_hugepages_ul); ++ ++ slen = strlen(target_nr_hugepages); ++ if (write(fd, target_nr_hugepages, slen) != slen) { ++ ksft_print_msg("Failed to write %lu to /proc/sys/vm/nr_hugepages: %s\n", ++ nr_hugepages_ul, strerror(errno)); + goto close_fd; + } + diff --git a/queue-6.12/selftests-mm-fix-a-build-failure-on-powerpc.patch b/queue-6.12/selftests-mm-fix-a-build-failure-on-powerpc.patch new file mode 100644 index 0000000000..860ac47385 --- /dev/null +++ b/queue-6.12/selftests-mm-fix-a-build-failure-on-powerpc.patch @@ -0,0 +1,59 @@ +From 8cf6ecb18baac867585fe1cba5dde6dbf3b6d29a Mon Sep 17 00:00:00 2001 +From: "Nysal Jan K.A." +Date: Mon, 28 Apr 2025 18:49:35 +0530 +Subject: selftests/mm: fix a build failure on powerpc + +From: Nysal Jan K.A. + +commit 8cf6ecb18baac867585fe1cba5dde6dbf3b6d29a upstream. + +The compiler is unaware of the size of code generated by the ".rept" +assembler directive. This results in the compiler emitting branch +instructions where the offset to branch to exceeds the maximum allowed +value, resulting in build failures like the following: + + CC protection_keys + /tmp/ccypKWAE.s: Assembler messages: + /tmp/ccypKWAE.s:2073: Error: operand out of range (0x0000000000020158 + is not between 0xffffffffffff8000 and 0x0000000000007ffc) + /tmp/ccypKWAE.s:2509: Error: operand out of range (0x0000000000020130 + is not between 0xffffffffffff8000 and 0x0000000000007ffc) + +Fix the issue by manually adding nop instructions using the preprocessor. + +Link: https://lkml.kernel.org/r/20250428131937.641989-2-nysal@linux.ibm.com +Fixes: 46036188ea1f ("selftests/mm: build with -O2") +Reported-by: Madhavan Srinivasan +Signed-off-by: Nysal Jan K.A. 
+Tested-by: Venkat Rao Bagalkote +Reviewed-by: Donet Tom +Tested-by: Donet Tom +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/mm/pkey-powerpc.h | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +--- a/tools/testing/selftests/mm/pkey-powerpc.h ++++ b/tools/testing/selftests/mm/pkey-powerpc.h +@@ -102,8 +102,18 @@ void expect_fault_on_read_execonly_key(v + return; + } + ++#define REPEAT_8(s) s s s s s s s s ++#define REPEAT_64(s) REPEAT_8(s) REPEAT_8(s) REPEAT_8(s) REPEAT_8(s) \ ++ REPEAT_8(s) REPEAT_8(s) REPEAT_8(s) REPEAT_8(s) ++#define REPEAT_512(s) REPEAT_64(s) REPEAT_64(s) REPEAT_64(s) REPEAT_64(s) \ ++ REPEAT_64(s) REPEAT_64(s) REPEAT_64(s) REPEAT_64(s) ++#define REPEAT_4096(s) REPEAT_512(s) REPEAT_512(s) REPEAT_512(s) REPEAT_512(s) \ ++ REPEAT_512(s) REPEAT_512(s) REPEAT_512(s) REPEAT_512(s) ++#define REPEAT_16384(s) REPEAT_4096(s) REPEAT_4096(s) \ ++ REPEAT_4096(s) REPEAT_4096(s) ++ + /* 4-byte instructions * 16384 = 64K page */ +-#define __page_o_noops() asm(".rept 16384 ; nop; .endr") ++#define __page_o_noops() asm(REPEAT_16384("nop\n")) + + void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) + { diff --git a/queue-6.12/series b/queue-6.12/series index 8a24b85d32..7c40bbb591 100644 --- a/queue-6.12/series +++ b/queue-6.12/series @@ -69,3 +69,12 @@ staging-iio-adc-ad7816-correct-conditional-logic-for-store-mode.patch staging-bcm2835-camera-initialise-dev-in-v4l2_dev.patch staging-axis-fifo-remove-hardware-resets-for-user-errors.patch staging-axis-fifo-correct-handling-of-tx_fifo_depth-for-size-validation.patch +x86-mm-eliminate-window-where-tlb-flushes-may-be-inadvertently-skipped.patch +mm-fix-folio_pte_batch-on-xen-pv.patch +mm-vmalloc-support-more-granular-vrealloc-sizing.patch +mm-huge_memory-fix-dereferencing-invalid-pmd-migration-entry.patch +mm-userfaultfd-fix-uninitialized-output-field-for-eagain-race.patch 
+selftests-mm-compaction_test-support-platform-with-huge-mount-of-memory.patch +selftests-mm-fix-a-build-failure-on-powerpc.patch +kvm-svm-forcibly-leave-smm-mode-on-shutdown-interception.patch +drm-amd-display-shift-dmub-aux-reply-command-if-necessary.patch diff --git a/queue-6.12/x86-mm-eliminate-window-where-tlb-flushes-may-be-inadvertently-skipped.patch b/queue-6.12/x86-mm-eliminate-window-where-tlb-flushes-may-be-inadvertently-skipped.patch new file mode 100644 index 0000000000..3644848a38 --- /dev/null +++ b/queue-6.12/x86-mm-eliminate-window-where-tlb-flushes-may-be-inadvertently-skipped.patch @@ -0,0 +1,131 @@ +From fea4e317f9e7e1f449ce90dedc27a2d2a95bee5a Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Thu, 8 May 2025 15:41:32 -0700 +Subject: x86/mm: Eliminate window where TLB flushes may be inadvertently skipped + +From: Dave Hansen + +commit fea4e317f9e7e1f449ce90dedc27a2d2a95bee5a upstream. + +tl;dr: There is a window in the mm switching code where the new CR3 is +set and the CPU should be getting TLB flushes for the new mm. But +should_flush_tlb() has a bug and suppresses the flush. Fix it by +widening the window where should_flush_tlb() sends an IPI. + +Long Version: + +=== History === + +There were a few things leading up to this. + +First, updating mm_cpumask() was observed to be too expensive, so it was +made lazier. But being lazy caused too many unnecessary IPIs to CPUs +due to the now-lazy mm_cpumask(). So code was added to cull +mm_cpumask() periodically[2]. But that culling was a bit too aggressive +and skipped sending TLB flushes to CPUs that need them. So here we are +again. 
+ +=== Problem === + +The too-aggressive code in should_flush_tlb() strikes in this window: + + // Turn on IPIs for this CPU/mm combination, but only + // if should_flush_tlb() agrees: + cpumask_set_cpu(cpu, mm_cpumask(next)); + + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + load_new_mm_cr3(need_flush); + // ^ After 'need_flush' is set to false, IPIs *MUST* + // be sent to this CPU and not be ignored. + + this_cpu_write(cpu_tlbstate.loaded_mm, next); + // ^ Not until this point does should_flush_tlb() + // become true! + +should_flush_tlb() will suppress TLB flushes between load_new_mm_cr3() +and writing to 'loaded_mm', which is a window where they should not be +suppressed. Whoops. + +=== Solution === + +Thankfully, the fuzzy "just about to write CR3" window is already marked +with loaded_mm==LOADED_MM_SWITCHING. Simply checking for that state in +should_flush_tlb() is sufficient to ensure that the CPU is targeted with +an IPI. + +This will cause more TLB flush IPIs. But the window is relatively small +and I do not expect this to cause any kind of measurable performance +impact. + +Update the comment where LOADED_MM_SWITCHING is written since it grew +yet another user. + +Peter Z also raised a concern that should_flush_tlb() might not observe +'loaded_mm' and 'is_lazy' in the same order that switch_mm_irqs_off() +writes them. Add a barrier to ensure that they are observed in the +order they are written. 
+ +Signed-off-by: Dave Hansen +Acked-by: Rik van Riel +Link: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/ [1] +Fixes: 6db2526c1d69 ("x86/mm/tlb: Only trim the mm_cpumask once a second") [2] +Reported-by: Stephen Dolan +Cc: stable@vger.kernel.org +Acked-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/mm/tlb.c | 23 +++++++++++++++++++++-- + 1 file changed, 21 insertions(+), 2 deletions(-) + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -624,7 +624,11 @@ void switch_mm_irqs_off(struct mm_struct + + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + +- /* Let nmi_uaccess_okay() know that we're changing CR3. */ ++ /* ++ * Indicate that CR3 is about to change. nmi_uaccess_okay() ++ * and others are sensitive to the window where mm_cpumask(), ++ * CR3 and cpu_tlbstate.loaded_mm are not all in sync. ++ */ + this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); + barrier(); + } +@@ -895,8 +899,16 @@ done: + + static bool should_flush_tlb(int cpu, void *data) + { ++ struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu); + struct flush_tlb_info *info = data; + ++ /* ++ * Order the 'loaded_mm' and 'is_lazy' against their ++ * write ordering in switch_mm_irqs_off(). Ensure ++ * 'is_lazy' is at least as new as 'loaded_mm'. ++ */ ++ smp_rmb(); ++ + /* Lazy TLB will get flushed at the next context switch. */ + if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) + return false; +@@ -905,8 +917,15 @@ static bool should_flush_tlb(int cpu, vo + if (!info->mm) + return true; + ++ /* ++ * While switching, the remote CPU could have state from ++ * either the prev or next mm. Assume the worst and flush. ++ */ ++ if (loaded_mm == LOADED_MM_SWITCHING) ++ return true; ++ + /* The target mm is loaded, and the CPU is not lazy. 
*/ +- if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm) ++ if (loaded_mm == info->mm) + return true; + + /* In cpumask, but not the loaded mm? Periodically remove by flushing. */