From: Greg Kroah-Hartman Date: Thu, 8 Jan 2026 13:36:36 +0000 (+0100) Subject: 6.1-stable patches X-Git-Tag: v6.1.160~33 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=319ce5851e41ea4ffcef6e8182042e895abb33d1;p=thirdparty%2Fkernel%2Fstable-queue.git 6.1-stable patches added patches: dmaengine-idxd-remove-improper-idxd_free.patch drm-vmwgfx-fix-a-null-ptr-access-in-the-cursor-snooper.patch kvm-arm64-sys_regs-disable-wuninitialized-const-pointer-warning.patch mm-mprotect-delete-pmd_none_or_clear_bad_unless_trans_huge.patch mm-mprotect-use-long-for-page-accountings-and-retval.patch sched-fair-proportional-newidle-balance.patch sched-fair-small-cleanup-to-sched_balance_newidle.patch sched-fair-small-cleanup-to-update_newidle_cost.patch usb-xhci-apply-the-link-chain-quirk-on-nec-isoc-endpoints.patch usb-xhci-move-link-chain-bit-quirk-checks-into-one-helper-function.patch x86-mm-pat-clear-vm_pat-if-copy_p4d_range-failed.patch x86-mm-pat-fix-vm_pat-handling-when-fork-fails-in-copy_page_range.patch --- diff --git a/queue-6.1/dmaengine-idxd-remove-improper-idxd_free.patch b/queue-6.1/dmaengine-idxd-remove-improper-idxd_free.patch new file mode 100644 index 0000000000..1f2de4dfd7 --- /dev/null +++ b/queue-6.1/dmaengine-idxd-remove-improper-idxd_free.patch @@ -0,0 +1,71 @@ +From stable+bounces-201132-greg=kroah.com@vger.kernel.org Tue Dec 16 04:16:52 2025 +From: lanbincn@139.com +Date: Tue, 16 Dec 2025 03:13:13 +0000 +Subject: dmaengine: idxd: Remove improper idxd_free +To: stable@vger.kernel.org +Cc: Yi Sun , Shuai Xue , Dave Jiang , Vinicius Costa Gomes , Vinod Koul , Bin Lan +Message-ID: <20251216031313.4853-1-lanbincn@139.com> + +From: Yi Sun + +[ Upstream commit f41c538881eec4dcf5961a242097d447f848cda6 ] + +The call to idxd_free() introduces a duplicate put_device() leading to a +reference count underflow: +refcount_t: underflow; use-after-free. +WARNING: CPU: 15 PID: 4428 at lib/refcount.c:28 refcount_warn_saturate+0xbe/0x110 +... +Call Trace: + + idxd_remove+0xe4/0x120 [idxd] + pci_device_remove+0x3f/0xb0 + device_release_driver_internal+0x197/0x200 + driver_detach+0x48/0x90 + bus_remove_driver+0x74/0xf0 + pci_unregister_driver+0x2e/0xb0 + idxd_exit_module+0x34/0x7a0 [idxd] + __do_sys_delete_module.constprop.0+0x183/0x280 + do_syscall_64+0x54/0xd70 + entry_SYSCALL_64_after_hwframe+0x76/0x7e + +The idxd_unregister_devices() which is invoked at the very beginning of +idxd_remove(), already takes care of the necessary put_device() through the +following call path: +idxd_unregister_devices() -> device_unregister() -> put_device() + +In addition, when CONFIG_DEBUG_KOBJECT_RELEASE is enabled, put_device() may +trigger asynchronous cleanup via schedule_delayed_work(). If idxd_free() is +called immediately after, it can result in a use-after-free. + +Remove the improper idxd_free() to avoid both the refcount underflow and +potential memory corruption during module unload. + +Fixes: d5449ff1b04d ("dmaengine: idxd: Add missing idxd cleanup to fix memory leak in remove call") +Signed-off-by: Yi Sun +Tested-by: Shuai Xue +Reviewed-by: Dave Jiang +Acked-by: Vinicius Costa Gomes + +Link: https://lore.kernel.org/r/20250729150313.1934101-2-yi.sun@intel.com +Signed-off-by: Vinod Koul +[ Slightly adjust the context. ] +Signed-off-by: Bin Lan +Signed-off-by: Greg Kroah-Hartman +--- +Without this patch, this issue can be reproduced in Linux-6.1.y +when the idxd module is removed. 
+Signed-off-by: Greg Kroah-Hartman +--- + drivers/dma/idxd/init.c | 1 - + 1 file changed, 1 deletion(-) + +--- a/drivers/dma/idxd/init.c ++++ b/drivers/dma/idxd/init.c +@@ -816,7 +816,6 @@ static void idxd_remove(struct pci_dev * + destroy_workqueue(idxd->wq); + perfmon_pmu_remove(idxd); + put_device(idxd_confdev(idxd)); +- idxd_free(idxd); + } + + static struct pci_driver idxd_pci_driver = { diff --git a/queue-6.1/drm-vmwgfx-fix-a-null-ptr-access-in-the-cursor-snooper.patch b/queue-6.1/drm-vmwgfx-fix-a-null-ptr-access-in-the-cursor-snooper.patch new file mode 100644 index 0000000000..e55395d2d6 --- /dev/null +++ b/queue-6.1/drm-vmwgfx-fix-a-null-ptr-access-in-the-cursor-snooper.patch @@ -0,0 +1,80 @@ +From stable+bounces-203358-greg=kroah.com@vger.kernel.org Wed Dec 24 09:57:13 2025 +From: Shivani Agarwal +Date: Wed, 24 Dec 2025 00:36:52 -0800 +Subject: drm/vmwgfx: Fix a null-ptr access in the cursor snooper +To: stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: zack.rusin@broadcom.com, bcm-kernel-feedback-list@broadcom.com, maarten.lankhorst@linux.intel.com, mripard@kernel.org, tzimmermann@suse.de, simona@ffwll.ch, airlied@gmail.com, brianp@vmware.com, dtor@vmware.com, airlied@redhat.com, thellstrom@vmware.com, dri-devel@lists.freedesktop.org, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Kuzey Arda Bulut , Ian Forbes , Sasha Levin , Shivani Agarwal +Message-ID: <20251224083652.614902-1-shivani.agarwal@broadcom.com> + +From: Zack Rusin + +[ Upstream commit 5ac2c0279053a2c5265d46903432fb26ae2d0da2 ] + +Check that the resource which is converted to a surface exists before +trying to use the cursor snooper on it. + +vmw_cmd_res_check allows explicit invalid (SVGA3D_INVALID_ID) identifiers +because some svga commands accept SVGA3D_INVALID_ID to mean "no surface", +unfortunately functions that accept the actual surfaces as objects might +(and in case of the cursor snooper, do not) be able to handle null +objects. Make sure that we validate not only the identifier (via the +vmw_cmd_res_check) but also check that the actual resource exists before +trying to do something with it. + +Fixes unchecked null-ptr reference in the snooping code. + +Signed-off-by: Zack Rusin +Fixes: c0951b797e7d ("drm/vmwgfx: Refactor resource management") +Reported-by: Kuzey Arda Bulut +Cc: Broadcom internal kernel review list +Cc: dri-devel@lists.freedesktop.org +Reviewed-by: Ian Forbes +Link: https://lore.kernel.org/r/20250917153655.1968583-1-zack.rusin@broadcom.com +Signed-off-by: Sasha Levin +[Shivani: Modified to apply on v5.10.y-v6.1.y] +Signed-off-by: Shivani Agarwal +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 17 ++++++++++++----- + 1 file changed, 12 insertions(+), 5 deletions(-) + +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c +@@ -1507,6 +1507,7 @@ static int vmw_cmd_dma(struct vmw_privat + SVGA3dCmdHeader *header) + { + struct vmw_buffer_object *vmw_bo = NULL; ++ struct vmw_resource *res; + struct vmw_surface *srf = NULL; + VMW_DECLARE_CMD_VAR(*cmd, SVGA3dCmdSurfaceDMA); + int ret; +@@ -1542,18 +1543,24 @@ static int vmw_cmd_dma(struct vmw_privat + + dirty = (cmd->body.transfer == SVGA3D_WRITE_HOST_VRAM) ? 
+ VMW_RES_DIRTY_SET : 0; +- ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface, +- dirty, user_surface_converter, +- &cmd->body.host.sid, NULL); ++ ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface, dirty, ++ user_surface_converter, &cmd->body.host.sid, ++ NULL); + if (unlikely(ret != 0)) { + if (unlikely(ret != -ERESTARTSYS)) + VMW_DEBUG_USER("could not find surface for DMA.\n"); + return ret; + } + +- srf = vmw_res_to_srf(sw_context->res_cache[vmw_res_surface].res); ++ res = sw_context->res_cache[vmw_res_surface].res; ++ if (!res) { ++ VMW_DEBUG_USER("Invalid DMA surface.\n"); ++ return -EINVAL; ++ } + +- vmw_kms_cursor_snoop(srf, sw_context->fp->tfile, &vmw_bo->base, header); ++ srf = vmw_res_to_srf(res); ++ vmw_kms_cursor_snoop(srf, sw_context->fp->tfile, &vmw_bo->base, ++ header); + + return 0; + } diff --git a/queue-6.1/kvm-arm64-sys_regs-disable-wuninitialized-const-pointer-warning.patch b/queue-6.1/kvm-arm64-sys_regs-disable-wuninitialized-const-pointer-warning.patch new file mode 100644 index 0000000000..b5f36d0a2d --- /dev/null +++ b/queue-6.1/kvm-arm64-sys_regs-disable-wuninitialized-const-pointer-warning.patch @@ -0,0 +1,47 @@ +From justinstitt@google.com Thu Jan 8 14:27:57 2026 +From: Justin Stitt +Date: Fri, 05 Dec 2025 14:51:41 -0800 +Subject: KVM: arm64: sys_regs: disable -Wuninitialized-const-pointer warning +To: Marc Zyngier , Oliver Upton , Alexandru Elisei , Joey Gouly , Suzuki K Poulose , Catalin Marinas , Zenghui Yu , Will Deacon , Nathan Chancellor , Christopher Covington +Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.cs.columbia.edu, linux-kernel@vger.kernel.org, llvm@lists.linux.dev, stable@vger.kernel.org, Justin Stitt +Message-ID: <20251205-stable-disable-unit-ptr-warn-v2-1-cec53a8f736b@google.com> + +From: Justin Stitt + +A new warning in Clang 22 [1] complains that @clidr passed to +get_clidr_el1() is an uninitialized const pointer. get_clidr_el1() +doesn't really care since it casts away the const-ness anyways -- it is +a false positive. + +This patch isn't needed for anything past 6.1 as this code section was +reworked in Commit 7af0c2534f4c ("KVM: arm64: Normalize cache +configuration") which incidentally removed the aforementioned warning. +Since there is no upstream equivalent, this patch just needs to be +applied to 6.1. + +Disable this warning for sys_regs.o instead of backporting the patches +from 6.2+ that modified this code area. 
+ +Cc: stable@vger.kernel.org +Fixes: 7c8c5e6a9101e ("arm64: KVM: system register handling") +Link: https://github.com/llvm/llvm-project/commit/00dacf8c22f065cb52efb14cd091d441f19b319e [1] +Reviewed-by: Nathan Chancellor +Signed-off-by: Justin Stitt +Reviewed-by: Tiffany Yang +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/Makefile | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/arch/arm64/kvm/Makefile ++++ b/arch/arm64/kvm/Makefile +@@ -24,6 +24,9 @@ kvm-y += arm.o mmu.o mmio.o psci.o hyper + + kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o + ++# Work around a false positive Clang 22 -Wuninitialized-const-pointer warning ++CFLAGS_sys_regs.o := $(call cc-disable-warning, uninitialized-const-pointer) ++ + always-y := hyp_constants.h hyp-constants.s + + define rule_gen_hyp_constants diff --git a/queue-6.1/mm-mprotect-delete-pmd_none_or_clear_bad_unless_trans_huge.patch b/queue-6.1/mm-mprotect-delete-pmd_none_or_clear_bad_unless_trans_huge.patch new file mode 100644 index 0000000000..472788c189 --- /dev/null +++ b/queue-6.1/mm-mprotect-delete-pmd_none_or_clear_bad_unless_trans_huge.patch @@ -0,0 +1,283 @@ +From stable+bounces-205076-greg=kroah.com@vger.kernel.org Tue Jan 6 12:49:13 2026 +From: Harry Yoo +Date: Tue, 6 Jan 2026 20:47:14 +0900 +Subject: mm/mprotect: delete pmd_none_or_clear_bad_unless_trans_huge() +To: stable@vger.kernel.org +Cc: Liam.Howlett@oracle.com, akpm@linux-foundation.org, baohua@kernel.org, baolin.wang@linux.alibaba.com, david@kernel.org, dev.jain@arm.com, hughd@google.com, jane.chu@oracle.com, jannh@google.com, kas@kernel.org, lance.yang@linux.dev, linux-mm@kvack.org, lorenzo.stoakes@oracle.com, npache@redhat.com, pfalcato@suse.de, ryan.roberts@arm.com, vbabka@suse.cz, ziy@nvidia.com, "Alistair Popple" , "Anshuman Khandual" , "Axel Rasmussen" , "Christophe Leroy" , "Christoph Hellwig" , "David Hildenbrand" , "Huang, Ying" , "Ira Weiny" , "Jason Gunthorpe" , "Kirill A . Shutemov" , "Lorenzo Stoakes" , "Matthew Wilcox" , "Mel Gorman" , "Miaohe Lin" , "Mike Kravetz" , "Mike Rapoport" , "Minchan Kim" , "Naoya Horiguchi" , "Pavel Tatashin" , "Peter Xu" , "Peter Zijlstra" , "Qi Zheng" , "Ralph Campbell" , "SeongJae Park" , "Song Liu" , "Steven Price" , "Suren Baghdasaryan" , "Thomas Hellström" , "Will Deacon" , "Yang Shi" , "Yu Zhao" , "Zack Rusin" , "Harry Yoo" +Message-ID: <20260106114715.80958-3-harry.yoo@oracle.com> + +From: Hugh Dickins + +commit 670ddd8cdcbd1d07a4571266ae3517f821728c3a upstream. + +change_pmd_range() had special pmd_none_or_clear_bad_unless_trans_huge(), +required to avoid "bad" choices when setting automatic NUMA hinting under +mmap_read_lock(); but most of that is already covered in pte_offset_map() +now. change_pmd_range() just wants a pmd_none() check before wasting time +on MMU notifiers, then checks on the read-once _pmd value to work out +what's needed for huge cases. If change_pte_range() returns -EAGAIN to +retry if pte_offset_map_lock() fails, nothing more special is needed. + +Link: https://lkml.kernel.org/r/725a42a9-91e9-c868-925-e3a5fd40bb4f@google.com +Signed-off-by: Hugh Dickins +Cc: Alistair Popple +Cc: Anshuman Khandual +Cc: Axel Rasmussen +Cc: Christophe Leroy +Cc: Christoph Hellwig +Cc: David Hildenbrand +Cc: "Huang, Ying" +Cc: Ira Weiny +Cc: Jason Gunthorpe +Cc: Kirill A. 
Shutemov +Cc: Lorenzo Stoakes +Cc: Matthew Wilcox +Cc: Mel Gorman +Cc: Miaohe Lin +Cc: Mike Kravetz +Cc: Mike Rapoport (IBM) +Cc: Minchan Kim +Cc: Naoya Horiguchi +Cc: Pavel Tatashin +Cc: Peter Xu +Cc: Peter Zijlstra +Cc: Qi Zheng +Cc: Ralph Campbell +Cc: Ryan Roberts +Cc: SeongJae Park +Cc: Song Liu +Cc: Steven Price +Cc: Suren Baghdasaryan +Cc: Thomas Hellström +Cc: Will Deacon +Cc: Yang Shi +Cc: Yu Zhao +Cc: Zack Rusin +Signed-off-by: Andrew Morton +[ Background: It was reported that a bad pmd is seen when automatic NUMA + balancing is marking page table entries as prot_numa: + + [2437548.196018] mm/pgtable-generic.c:50: bad pmd 00000000af22fc02(dffffffe71fbfe02) + [2437548.235022] Call Trace: + [2437548.238234] + [2437548.241060] dump_stack_lvl+0x46/0x61 + [2437548.245689] panic+0x106/0x2e5 + [2437548.249497] pmd_clear_bad+0x3c/0x3c + [2437548.253967] change_pmd_range.isra.0+0x34d/0x3a7 + [2437548.259537] change_p4d_range+0x156/0x20e + [2437548.264392] change_protection_range+0x116/0x1a9 + [2437548.269976] change_prot_numa+0x15/0x37 + [2437548.274774] task_numa_work+0x1b8/0x302 + [2437548.279512] task_work_run+0x62/0x95 + [2437548.283882] exit_to_user_mode_loop+0x1a4/0x1a9 + [2437548.289277] exit_to_user_mode_prepare+0xf4/0xfc + [2437548.294751] ? sysvec_apic_timer_interrupt+0x34/0x81 + [2437548.300677] irqentry_exit_to_user_mode+0x5/0x25 + [2437548.306153] asm_sysvec_apic_timer_interrupt+0x16/0x1b + + This is due to a race condition between change_prot_numa() and + THP migration because the kernel doesn't check is_swap_pmd() and + pmd_trans_huge() atomically: + + change_prot_numa() THP migration + ====================================================================== + - change_pmd_range() + -> is_swap_pmd() returns false, + meaning it's not a PMD migration + entry. + - do_huge_pmd_numa_page() + -> migrate_misplaced_page() sets + migration entries for the THP. + - change_pmd_range() + -> pmd_none_or_clear_bad_unless_trans_huge() + -> pmd_none() and pmd_trans_huge() returns false + - pmd_none_or_clear_bad_unless_trans_huge() + -> pmd_bad() returns true for the migration entry! + + The upstream commit 670ddd8cdcbd ("mm/mprotect: delete + pmd_none_or_clear_bad_unless_trans_huge()") closes this race condition + by checking is_swap_pmd() and pmd_trans_huge() atomically. + + Backporting note: + Unlike the mainline, pte_offset_map_lock() does not check if the pmd + entry is a migration entry or a hugepage; acquires PTL unconditionally + instead of returning failure. Therefore, it is necessary to keep the + !is_swap_pmd() && !pmd_trans_huge() && !pmd_devmap() check before + acquiring the PTL. + + After acquiring the lock, open-code the semantics of + pte_offset_map_lock() in the mainline kernel; change_pte_range() fails + if the pmd value has changed. This requires adding pmd_old parameter + (pmd_t value that is read before calling the function) to + change_pte_range(). 
] + +Signed-off-by: Harry Yoo +Acked-by: David Hildenbrand (Red Hat) +Signed-off-by: Greg Kroah-Hartman +--- + mm/mprotect.c | 101 ++++++++++++++++++++++++---------------------------------- + 1 file changed, 43 insertions(+), 58 deletions(-) + +--- a/mm/mprotect.c ++++ b/mm/mprotect.c +@@ -73,10 +73,12 @@ static inline bool can_change_pte_writab + } + + static long change_pte_range(struct mmu_gather *tlb, +- struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, +- unsigned long end, pgprot_t newprot, unsigned long cp_flags) ++ struct vm_area_struct *vma, pmd_t *pmd, pmd_t pmd_old, ++ unsigned long addr, unsigned long end, pgprot_t newprot, ++ unsigned long cp_flags) + { + pte_t *pte, oldpte; ++ pmd_t _pmd; + spinlock_t *ptl; + long pages = 0; + int target_node = NUMA_NO_NODE; +@@ -86,21 +88,15 @@ static long change_pte_range(struct mmu_ + + tlb_change_page_size(tlb, PAGE_SIZE); + +- /* +- * Can be called with only the mmap_lock for reading by +- * prot_numa so we must check the pmd isn't constantly +- * changing from under us from pmd_none to pmd_trans_huge +- * and/or the other way around. +- */ +- if (pmd_trans_unstable(pmd)) +- return 0; +- +- /* +- * The pmd points to a regular pte so the pmd can't change +- * from under us even if the mmap_lock is only hold for +- * reading. +- */ + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); ++ /* Make sure pmd didn't change after acquiring ptl */ ++ _pmd = pmd_read_atomic(pmd); ++ /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */ ++ barrier(); ++ if (!pmd_same(pmd_old, _pmd)) { ++ pte_unmap_unlock(pte, ptl); ++ return -EAGAIN; ++ } + + /* Get target node for single threaded private VMAs */ + if (prot_numa && !(vma->vm_flags & VM_SHARED) && +@@ -288,31 +284,6 @@ static long change_pte_range(struct mmu_ + return pages; + } + +-/* +- * Used when setting automatic NUMA hinting protection where it is +- * critical that a numa hinting PMD is not confused with a bad PMD. +- */ +-static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) +-{ +- pmd_t pmdval = pmd_read_atomic(pmd); +- +- /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */ +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE +- barrier(); +-#endif +- +- if (pmd_none(pmdval)) +- return 1; +- if (pmd_trans_huge(pmdval)) +- return 0; +- if (unlikely(pmd_bad(pmdval))) { +- pmd_clear_bad(pmd); +- return 1; +- } +- +- return 0; +-} +- + /* Return true if we're uffd wr-protecting file-backed memory, or false */ + static inline bool + uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) +@@ -360,22 +331,34 @@ static inline long change_pmd_range(stru + + pmd = pmd_offset(pud, addr); + do { +- long this_pages; +- ++ long ret; ++ pmd_t _pmd; ++again: + next = pmd_addr_end(addr, end); ++ _pmd = pmd_read_atomic(pmd); ++ /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */ ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++ barrier(); ++#endif + + change_pmd_prepare(vma, pmd, cp_flags); + /* + * Automatic NUMA balancing walks the tables with mmap_lock + * held for read. It's possible a parallel update to occur +- * between pmd_trans_huge() and a pmd_none_or_clear_bad() +- * check leading to a false positive and clearing. +- * Hence, it's necessary to atomically read the PMD value +- * for all the checks. ++ * between pmd_trans_huge(), is_swap_pmd(), and ++ * a pmd_none_or_clear_bad() check leading to a false positive ++ * and clearing. Hence, it's necessary to atomically read ++ * the PMD value for all the checks. 
+ */ +- if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) && +- pmd_none_or_clear_bad_unless_trans_huge(pmd)) +- goto next; ++ if (!is_swap_pmd(_pmd) && !pmd_devmap(_pmd) && !pmd_trans_huge(_pmd)) { ++ if (pmd_none(_pmd)) ++ goto next; ++ ++ if (pmd_bad(_pmd)) { ++ pmd_clear_bad(pmd); ++ goto next; ++ } ++ } + + /* invoke the mmu notifier if the pmd is populated */ + if (!range.start) { +@@ -385,7 +368,7 @@ static inline long change_pmd_range(stru + mmu_notifier_invalidate_range_start(&range); + } + +- if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { ++ if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) { + if ((next - addr != HPAGE_PMD_SIZE) || + uffd_wp_protect_file(vma, cp_flags)) { + __split_huge_pmd(vma, pmd, addr, false, NULL); +@@ -400,11 +383,11 @@ static inline long change_pmd_range(stru + * change_huge_pmd() does not defer TLB flushes, + * so no need to propagate the tlb argument. + */ +- int nr_ptes = change_huge_pmd(tlb, vma, pmd, +- addr, newprot, cp_flags); ++ ret = change_huge_pmd(tlb, vma, pmd, ++ addr, newprot, cp_flags); + +- if (nr_ptes) { +- if (nr_ptes == HPAGE_PMD_NR) { ++ if (ret) { ++ if (ret == HPAGE_PMD_NR) { + pages += HPAGE_PMD_NR; + nr_huge_updates++; + } +@@ -415,9 +398,11 @@ static inline long change_pmd_range(stru + } + /* fall through, the trans huge pmd just split */ + } +- this_pages = change_pte_range(tlb, vma, pmd, addr, next, +- newprot, cp_flags); +- pages += this_pages; ++ ret = change_pte_range(tlb, vma, pmd, _pmd, addr, next, ++ newprot, cp_flags); ++ if (ret < 0) ++ goto again; ++ pages += ret; + next: + cond_resched(); + } while (pmd++, addr = next, addr != end); diff --git a/queue-6.1/mm-mprotect-use-long-for-page-accountings-and-retval.patch b/queue-6.1/mm-mprotect-use-long-for-page-accountings-and-retval.patch new file mode 100644 index 0000000000..38eb3267a0 --- /dev/null +++ b/queue-6.1/mm-mprotect-use-long-for-page-accountings-and-retval.patch @@ -0,0 +1,223 @@ +From stable+bounces-205075-greg=kroah.com@vger.kernel.org Tue Jan 6 12:48:24 2026 +From: Harry Yoo +Date: Tue, 6 Jan 2026 20:47:13 +0900 +Subject: mm/mprotect: use long for page accountings and retval +To: stable@vger.kernel.org +Cc: Liam.Howlett@oracle.com, akpm@linux-foundation.org, baohua@kernel.org, baolin.wang@linux.alibaba.com, david@kernel.org, dev.jain@arm.com, hughd@google.com, jane.chu@oracle.com, jannh@google.com, kas@kernel.org, lance.yang@linux.dev, linux-mm@kvack.org, lorenzo.stoakes@oracle.com, npache@redhat.com, pfalcato@suse.de, ryan.roberts@arm.com, vbabka@suse.cz, ziy@nvidia.com, Peter Xu , Mike Kravetz , James Houghton , Andrea Arcangeli , Axel Rasmussen , David Hildenbrand , Muchun Song , Nadav Amit , Harry Yoo +Message-ID: <20260106114715.80958-2-harry.yoo@oracle.com> + +From: Peter Xu + +commit a79390f5d6a78647fd70856bd42b22d994de0ba2 upstream. + +Switch to use type "long" for page accountings and retval across the whole +procedure of change_protection(). + +The change should have shrinked the possible maximum page number to be +half comparing to previous (ULONG_MAX / 2), but it shouldn't overflow on +any system either because the maximum possible pages touched by change +protection should be ULONG_MAX / PAGE_SIZE. + +Two reasons to switch from "unsigned long" to "long": + + 1. It suites better on count_vm_numa_events(), whose 2nd parameter takes + a long type. + + 2. It paves way for returning negative (error) values in the future. 
+ +Currently the only caller that consumes this retval is change_prot_numa(), +where the unsigned long was converted to an int. Since at it, touching up +the numa code to also take a long, so it'll avoid any possible overflow +too during the int-size convertion. + +Link: https://lkml.kernel.org/r/20230104225207.1066932-3-peterx@redhat.com +Signed-off-by: Peter Xu +Acked-by: Mike Kravetz +Acked-by: James Houghton +Cc: Andrea Arcangeli +Cc: Axel Rasmussen +Cc: David Hildenbrand +Cc: Muchun Song +Cc: Nadav Amit +Signed-off-by: Andrew Morton +[ Adjust context ] +Signed-off-by: Harry Yoo +Acked-by: David Hildenbrand (Red Hat) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/hugetlb.h | 4 ++-- + include/linux/mm.h | 2 +- + mm/hugetlb.c | 4 ++-- + mm/mempolicy.c | 2 +- + mm/mprotect.c | 26 +++++++++++++------------- + 5 files changed, 19 insertions(+), 19 deletions(-) + +--- a/include/linux/hugetlb.h ++++ b/include/linux/hugetlb.h +@@ -233,7 +233,7 @@ void hugetlb_vma_lock_release(struct kre + + int pmd_huge(pmd_t pmd); + int pud_huge(pud_t pud); +-unsigned long hugetlb_change_protection(struct vm_area_struct *vma, ++long hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot, + unsigned long cp_flags); + +@@ -447,7 +447,7 @@ static inline void move_hugetlb_state(st + { + } + +-static inline unsigned long hugetlb_change_protection( ++static inline long hugetlb_change_protection( + struct vm_area_struct *vma, unsigned long address, + unsigned long end, pgprot_t newprot, + unsigned long cp_flags) +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2148,7 +2148,7 @@ extern unsigned long move_page_tables(st + #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ + MM_CP_UFFD_WP_RESOLVE) + +-extern unsigned long change_protection(struct mmu_gather *tlb, ++extern long change_protection(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgprot_t newprot, + unsigned long cp_flags); +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -6668,7 +6668,7 @@ long follow_hugetlb_page(struct mm_struc + return i ? 
i : err; + } + +-unsigned long hugetlb_change_protection(struct vm_area_struct *vma, ++long hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, + pgprot_t newprot, unsigned long cp_flags) + { +@@ -6677,7 +6677,7 @@ unsigned long hugetlb_change_protection( + pte_t *ptep; + pte_t pte; + struct hstate *h = hstate_vma(vma); +- unsigned long pages = 0, psize = huge_page_size(h); ++ long pages = 0, psize = huge_page_size(h); + bool shared_pmd = false; + struct mmu_notifier_range range; + unsigned long last_addr_mask; +--- a/mm/mempolicy.c ++++ b/mm/mempolicy.c +@@ -628,7 +628,7 @@ unsigned long change_prot_numa(struct vm + unsigned long addr, unsigned long end) + { + struct mmu_gather tlb; +- int nr_updated; ++ long nr_updated; + + tlb_gather_mmu(&tlb, vma->vm_mm); + +--- a/mm/mprotect.c ++++ b/mm/mprotect.c +@@ -72,13 +72,13 @@ static inline bool can_change_pte_writab + return true; + } + +-static unsigned long change_pte_range(struct mmu_gather *tlb, ++static long change_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) + { + pte_t *pte, oldpte; + spinlock_t *ptl; +- unsigned long pages = 0; ++ long pages = 0; + int target_node = NUMA_NO_NODE; + bool prot_numa = cp_flags & MM_CP_PROT_NUMA; + bool uffd_wp = cp_flags & MM_CP_UFFD_WP; +@@ -346,13 +346,13 @@ uffd_wp_protect_file(struct vm_area_stru + } \ + } while (0) + +-static inline unsigned long change_pmd_range(struct mmu_gather *tlb, ++static inline long change_pmd_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pud, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) + { + pmd_t *pmd; + unsigned long next; +- unsigned long pages = 0; ++ long pages = 0; + unsigned long nr_huge_updates = 0; + struct mmu_notifier_range range; + +@@ -360,7 +360,7 @@ static inline unsigned long change_pmd_r + + pmd = pmd_offset(pud, addr); + do { +- unsigned long this_pages; ++ long this_pages; + + next = pmd_addr_end(addr, end); + +@@ -430,13 +430,13 @@ next: + return pages; + } + +-static inline unsigned long change_pud_range(struct mmu_gather *tlb, ++static inline long change_pud_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) + { + pud_t *pud; + unsigned long next; +- unsigned long pages = 0; ++ long pages = 0; + + pud = pud_offset(p4d, addr); + do { +@@ -451,13 +451,13 @@ static inline unsigned long change_pud_r + return pages; + } + +-static inline unsigned long change_p4d_range(struct mmu_gather *tlb, ++static inline long change_p4d_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) + { + p4d_t *p4d; + unsigned long next; +- unsigned long pages = 0; ++ long pages = 0; + + p4d = p4d_offset(pgd, addr); + do { +@@ -472,14 +472,14 @@ static inline unsigned long change_p4d_r + return pages; + } + +-static unsigned long change_protection_range(struct mmu_gather *tlb, ++static long change_protection_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) + { + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + unsigned long next; +- unsigned long pages = 0; ++ long pages = 0; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); +@@ -498,12 +498,12 @@ static unsigned 
long change_protection_r + return pages; + } + +-unsigned long change_protection(struct mmu_gather *tlb, ++long change_protection(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgprot_t newprot, + unsigned long cp_flags) + { +- unsigned long pages; ++ long pages; + + BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL); + diff --git a/queue-6.1/sched-fair-proportional-newidle-balance.patch b/queue-6.1/sched-fair-proportional-newidle-balance.patch new file mode 100644 index 0000000000..25a5d0ef4e --- /dev/null +++ b/queue-6.1/sched-fair-proportional-newidle-balance.patch @@ -0,0 +1,207 @@ +From stable+bounces-198206-greg=kroah.com@vger.kernel.org Wed Dec 3 12:44:47 2025 +From: Ajay Kaher +Date: Wed, 3 Dec 2025 11:25:52 +0000 +Subject: sched/fair: Proportional newidle balance +To: stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason +Message-ID: <20251203112552.1738424-5-ajay.kaher@broadcom.com> + +From: Peter Zijlstra + +commit 33cf66d88306663d16e4759e9d24766b0aaa2e17 upstream. + +Add a randomized algorithm that runs newidle balancing proportional to +its success rate. + +This improves schbench significantly: + + 6.18-rc4: 2.22 Mrps/s + 6.18-rc4+revert: 2.04 Mrps/s + 6.18-rc4+revert+random: 2.18 Mrps/S + +Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%: + + 6.17: -6% + 6.17+revert: 0% + 6.17+revert+random: -1% + +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: Dietmar Eggemann +Tested-by: Dietmar Eggemann +Tested-by: Chris Mason +Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com +Link: https://patch.msgid.link/20251107161739.770122091@infradead.org +[ Ajay: Modified to apply on v6.1 ] +Signed-off-by: Ajay Kaher +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/sched/topology.h | 3 ++ + kernel/sched/core.c | 3 ++ + kernel/sched/fair.c | 44 +++++++++++++++++++++++++++++++++++++---- + kernel/sched/features.h | 5 ++++ + kernel/sched/sched.h | 7 ++++++ + kernel/sched/topology.c | 6 +++++ + 6 files changed, 64 insertions(+), 4 deletions(-) + +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -106,6 +106,9 @@ struct sched_domain { + unsigned int nr_balance_failed; /* initialise to 0 */ + + /* idle_balance() stats */ ++ unsigned int newidle_call; ++ unsigned int newidle_success; ++ unsigned int newidle_ratio; + u64 max_newidle_lb_cost; + unsigned long last_decay_max_lb_cost; + +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -112,6 +112,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_ + EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); + + DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); + + #ifdef CONFIG_SCHED_DEBUG + /* +@@ -9632,6 +9633,8 @@ void __init sched_init_smp(void) + { + sched_init_numa(NUMA_NO_NODE); + ++ prandom_init_once(&sched_rnd_state); ++ + /* + * There's no userspace yet to cause hotplug operations; hence all the + * CPU masks are stable and all blatant races in the below code cannot +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10935,11 +10935,27 @@ void update_max_interval(void) + max_load_balance_interval = 
HZ*num_online_cpus()/10; + } + +-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) ++static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success) ++{ ++ sd->newidle_call++; ++ sd->newidle_success += success; ++ ++ if (sd->newidle_call >= 1024) { ++ sd->newidle_ratio = sd->newidle_success; ++ sd->newidle_call /= 2; ++ sd->newidle_success /= 2; ++ } ++} ++ ++static inline bool ++update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success) + { + unsigned long next_decay = sd->last_decay_max_lb_cost + HZ; + unsigned long now = jiffies; + ++ if (cost) ++ update_newidle_stats(sd, success); ++ + if (cost > sd->max_newidle_lb_cost) { + /* + * Track max cost of a domain to make sure to not delay the +@@ -10987,7 +11003,7 @@ static void rebalance_domains(struct rq + * Decay the newidle max times here because this is a regular + * visit to all the domains. + */ +- need_decay = update_newidle_cost(sd, 0); ++ need_decay = update_newidle_cost(sd, 0, 0); + max_cost += sd->max_newidle_lb_cost; + + /* +@@ -11621,6 +11637,22 @@ static int sched_balance_newidle(struct + break; + + if (sd->flags & SD_BALANCE_NEWIDLE) { ++ unsigned int weight = 1; ++ ++ if (sched_feat(NI_RANDOM)) { ++ /* ++ * Throw a 1k sided dice; and only run ++ * newidle_balance according to the success ++ * rate. ++ */ ++ u32 d1k = sched_rng() % 1024; ++ weight = 1 + sd->newidle_ratio; ++ if (d1k > weight) { ++ update_newidle_stats(sd, 0); ++ continue; ++ } ++ weight = (1024 + weight/2) / weight; ++ } + + pulled_task = load_balance(this_cpu, this_rq, + sd, CPU_NEWLY_IDLE, +@@ -11628,10 +11660,14 @@ static int sched_balance_newidle(struct + + t1 = sched_clock_cpu(this_cpu); + domain_cost = t1 - t0; +- update_newidle_cost(sd, domain_cost); +- + curr_cost += domain_cost; + t0 = t1; ++ ++ /* ++ * Track max cost of a domain to make sure to not delay the ++ * next wakeup on the CPU. ++ */ ++ update_newidle_cost(sd, domain_cost, weight * !!pulled_task); + } + + /* +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -99,5 +99,10 @@ SCHED_FEAT(UTIL_EST_FASTUP, true) + + SCHED_FEAT(LATENCY_WARN, false) + ++/* ++ * Do newidle balancing proportional to its success rate using randomization. 
++ */ ++SCHED_FEAT(NI_RANDOM, true) ++ + SCHED_FEAT(ALT_PERIOD, true) + SCHED_FEAT(BASE_SLICE, true) +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -5,6 +5,7 @@ + #ifndef _KERNEL_SCHED_SCHED_H + #define _KERNEL_SCHED_SCHED_H + ++#include + #include + #include + #include +@@ -1190,6 +1191,12 @@ static inline bool is_migration_disabled + } + + DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++DECLARE_PER_CPU(struct rnd_state, sched_rnd_state); ++ ++static inline u32 sched_rng(void) ++{ ++ return prandom_u32_state(this_cpu_ptr(&sched_rnd_state)); ++} + + #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) + #define this_rq() this_cpu_ptr(&runqueues) +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1584,6 +1584,12 @@ sd_init(struct sched_domain_topology_lev + + .last_balance = jiffies, + .balance_interval = sd_weight, ++ ++ /* 50% success rate */ ++ .newidle_call = 512, ++ .newidle_success = 256, ++ .newidle_ratio = 512, ++ + .max_newidle_lb_cost = 0, + .last_decay_max_lb_cost = jiffies, + .child = child, diff --git a/queue-6.1/sched-fair-small-cleanup-to-sched_balance_newidle.patch b/queue-6.1/sched-fair-small-cleanup-to-sched_balance_newidle.patch new file mode 100644 index 0000000000..72b56fe73c --- /dev/null +++ b/queue-6.1/sched-fair-small-cleanup-to-sched_balance_newidle.patch @@ -0,0 +1,49 @@ +From stable+bounces-198204-greg=kroah.com@vger.kernel.org Wed Dec 3 12:44:16 2025 +From: Ajay Kaher +Date: Wed, 3 Dec 2025 11:25:50 +0000 +Subject: sched/fair: Small cleanup to sched_balance_newidle() +To: stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason +Message-ID: <20251203112552.1738424-3-ajay.kaher@broadcom.com> + +From: Peter Zijlstra + +commit e78e70dbf603c1425f15f32b455ca148c932f6c1 upstream. + +Pull out the !sd check to simplify code. 
+ +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: Dietmar Eggemann +Tested-by: Dietmar Eggemann +Tested-by: Chris Mason +Link: https://patch.msgid.link/20251107161739.525916173@infradead.org +[ Ajay: Modified to apply on v6.1 ] +Signed-off-by: Ajay Kaher +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/fair.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -11589,14 +11589,15 @@ static int sched_balance_newidle(struct + + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); ++ if (!sd) { ++ rcu_read_unlock(); ++ goto out; ++ } + + if (!READ_ONCE(this_rq->rd->overload) || +- (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { +- +- if (sd) +- update_next_balance(sd, &next_balance); ++ this_rq->avg_idle < sd->max_newidle_lb_cost) { ++ update_next_balance(sd, &next_balance); + rcu_read_unlock(); +- + goto out; + } + rcu_read_unlock(); diff --git a/queue-6.1/sched-fair-small-cleanup-to-update_newidle_cost.patch b/queue-6.1/sched-fair-small-cleanup-to-update_newidle_cost.patch new file mode 100644 index 0000000000..f774218fb6 --- /dev/null +++ b/queue-6.1/sched-fair-small-cleanup-to-update_newidle_cost.patch @@ -0,0 +1,58 @@ +From stable+bounces-198205-greg=kroah.com@vger.kernel.org Wed Dec 3 12:44:39 2025 +From: Ajay Kaher +Date: Wed, 3 Dec 2025 11:25:51 +0000 +Subject: sched/fair: Small cleanup to update_newidle_cost() +To: stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason +Message-ID: <20251203112552.1738424-4-ajay.kaher@broadcom.com> + +From: Peter Zijlstra + +commit 08d473dd8718e4a4d698b1113a14a40ad64a909b upstream. + +Simplify code by adding a few variables. + +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: Dietmar Eggemann +Tested-by: Dietmar Eggemann +Tested-by: Chris Mason +Link: https://patch.msgid.link/20251107161739.655208666@infradead.org +[ Ajay: Modified to apply on v6.1 ] +Signed-off-by: Ajay Kaher +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/fair.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10937,22 +10937,25 @@ void update_max_interval(void) + + static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) + { ++ unsigned long next_decay = sd->last_decay_max_lb_cost + HZ; ++ unsigned long now = jiffies; ++ + if (cost > sd->max_newidle_lb_cost) { + /* + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. + */ + sd->max_newidle_lb_cost = cost; +- sd->last_decay_max_lb_cost = jiffies; +- } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { ++ sd->last_decay_max_lb_cost = now; ++ ++ } else if (time_after(now, next_decay)) { + /* + * Decay the newidle max times by ~1% per second to ensure that + * it is not outdated and the current max cost is actually + * shorter. 
+ */ + sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; +- sd->last_decay_max_lb_cost = jiffies; +- ++ sd->last_decay_max_lb_cost = now; + return true; + } + diff --git a/queue-6.1/series b/queue-6.1/series index 625794102e..f0aa234ca7 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -604,3 +604,15 @@ iommu-qcom-use-the-asid-read-from-device-tree-if-specified.patch iommu-qcom-index-contexts-by-asid-number-to-allow-asid-0.patch iommu-qcom-fix-device-leak-on-of_xlate.patch virtio_console-fix-order-of-fields-cols-and-rows.patch +kvm-arm64-sys_regs-disable-wuninitialized-const-pointer-warning.patch +dmaengine-idxd-remove-improper-idxd_free.patch +x86-mm-pat-clear-vm_pat-if-copy_p4d_range-failed.patch +x86-mm-pat-fix-vm_pat-handling-when-fork-fails-in-copy_page_range.patch +mm-mprotect-use-long-for-page-accountings-and-retval.patch +mm-mprotect-delete-pmd_none_or_clear_bad_unless_trans_huge.patch +drm-vmwgfx-fix-a-null-ptr-access-in-the-cursor-snooper.patch +usb-xhci-move-link-chain-bit-quirk-checks-into-one-helper-function.patch +usb-xhci-apply-the-link-chain-quirk-on-nec-isoc-endpoints.patch +sched-fair-small-cleanup-to-sched_balance_newidle.patch +sched-fair-small-cleanup-to-update_newidle_cost.patch +sched-fair-proportional-newidle-balance.patch diff --git a/queue-6.1/usb-xhci-apply-the-link-chain-quirk-on-nec-isoc-endpoints.patch b/queue-6.1/usb-xhci-apply-the-link-chain-quirk-on-nec-isoc-endpoints.patch new file mode 100644 index 0000000000..ff49d56abd --- /dev/null +++ b/queue-6.1/usb-xhci-apply-the-link-chain-quirk-on-nec-isoc-endpoints.patch @@ -0,0 +1,116 @@ +From linux-usb+bounces-32050-greg=kroah.com@vger.kernel.org Thu Jan 8 10:14:16 2026 +From: Shivani Agarwal +Date: Thu, 8 Jan 2026 00:49:27 -0800 +Subject: usb: xhci: Apply the link chain quirk on NEC isoc endpoints +To: stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: mathias.nyman@intel.com, linux-usb@vger.kernel.org, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Michal Pecio , Mathias Nyman , Shivani Agarwal +Message-ID: <20260108084927.671785-3-shivani.agarwal@broadcom.com> + +From: Michal Pecio + +commit bb0ba4cb1065e87f9cc75db1fa454e56d0894d01 upstream. + +Two clearly different specimens of NEC uPD720200 (one with start/stop +bug, one without) were seen to cause IOMMU faults after some Missed +Service Errors. Faulting address is immediately after a transfer ring +segment and patched dynamic debug messages revealed that the MSE was +received when waiting for a TD near the end of that segment: + +[ 1.041954] xhci_hcd: Miss service interval error for slot 1 ep 2 expected TD DMA ffa08fe0 +[ 1.042120] xhci_hcd: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0005 address=0xffa09000 flags=0x0000] +[ 1.042146] xhci_hcd: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0005 address=0xffa09040 flags=0x0000] + +It gets even funnier if the next page is a ring segment accessible to +the HC. Below, it reports MSE in segment at ff1e8000, plows through a +zero-filled page at ff1e9000 and starts reporting events for TRBs in +page at ff1ea000 every microframe, instead of jumping to seg ff1e6000. 
+ +[ 7.041671] xhci_hcd: Miss service interval error for slot 1 ep 2 expected TD DMA ff1e8fe0 +[ 7.041999] xhci_hcd: Miss service interval error for slot 1 ep 2 expected TD DMA ff1e8fe0 +[ 7.042011] xhci_hcd: WARN: buffer overrun event for slot 1 ep 2 on endpoint +[ 7.042028] xhci_hcd: All TDs skipped for slot 1 ep 2. Clear skip flag. +[ 7.042134] xhci_hcd: WARN: buffer overrun event for slot 1 ep 2 on endpoint +[ 7.042138] xhci_hcd: ERROR Transfer event TRB DMA ptr not part of current TD ep_index 2 comp_code 31 +[ 7.042144] xhci_hcd: Looking for event-dma 00000000ff1ea040 trb-start 00000000ff1e6820 trb-end 00000000ff1e6820 +[ 7.042259] xhci_hcd: WARN: buffer overrun event for slot 1 ep 2 on endpoint +[ 7.042262] xhci_hcd: ERROR Transfer event TRB DMA ptr not part of current TD ep_index 2 comp_code 31 +[ 7.042266] xhci_hcd: Looking for event-dma 00000000ff1ea050 trb-start 00000000ff1e6820 trb-end 00000000ff1e6820 + +At some point completion events change from Isoch Buffer Overrun to +Short Packet and the HC finally finds cycle bit mismatch in ff1ec000. + +[ 7.098130] xhci_hcd: ERROR Transfer event TRB DMA ptr not part of current TD ep_index 2 comp_code 13 +[ 7.098132] xhci_hcd: Looking for event-dma 00000000ff1ecc50 trb-start 00000000ff1e6820 trb-end 00000000ff1e6820 +[ 7.098254] xhci_hcd: ERROR Transfer event TRB DMA ptr not part of current TD ep_index 2 comp_code 13 +[ 7.098256] xhci_hcd: Looking for event-dma 00000000ff1ecc60 trb-start 00000000ff1e6820 trb-end 00000000ff1e6820 +[ 7.098379] xhci_hcd: Overrun event on slot 1 ep 2 + +It's possible that data from the isochronous device were written to +random buffers of pending TDs on other endpoints (either IN or OUT), +other devices or even other HCs in the same IOMMU domain. + +Lastly, an error from a different USB device on another HC. Was it +caused by the above? I don't know, but it may have been. The disk +was working without any other issues and generated PCIe traffic to +starve the NEC of upstream BW and trigger those MSEs. The two HCs +shared one x1 slot by means of a commercial "PCIe splitter" board. + +[ 7.162604] usb 10-2: reset SuperSpeed USB device number 3 using xhci_hcd +[ 7.178990] sd 9:0:0:0: [sdb] tag#0 UNKNOWN(0x2003) Result: hostbyte=0x07 driverbyte=DRIVER_OK cmd_age=0s +[ 7.179001] sd 9:0:0:0: [sdb] tag#0 CDB: opcode=0x28 28 00 04 02 ae 00 00 02 00 00 +[ 7.179004] I/O error, dev sdb, sector 67284480 op 0x0:(READ) flags 0x80700 phys_seg 5 prio class 0 + +Fortunately, it appears that this ridiculous bug is avoided by setting +the chain bit of Link TRBs on isochronous rings. Other ancient HCs are +known which also expect the bit to be set and they ignore Link TRBs if +it's not. Reportedly, 0.95 spec guaranteed that the bit is set. + +The bandwidth-starved NEC HC running a 32KB/uframe UVC endpoint reports +tens of MSEs per second and runs into the bug within seconds. Chaining +Link TRBs allows the same workload to run for many minutes, many times. + +No negative side effects seen in UVC recording and UAC playback with a +few devices at full speed, high speed and SuperSpeed. + +The problem doesn't reproduce on the newer Renesas uPD720201/uPD720202 +and on old Etron EJ168 and VIA VL805 (but the VL805 has other bug). 
+ +[shorten line length of log snippets in commit messge -Mathias] + +Signed-off-by: Michal Pecio +Cc: stable@vger.kernel.org +Signed-off-by: Mathias Nyman +Link: https://lore.kernel.org/r/20250306144954.3507700-14-mathias.nyman@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +[Shivani: Modified to apply on v5.10.y-v6.1.y] +Signed-off-by: Shivani Agarwal +Signed-off-by: Greg Kroah-Hartman +--- + drivers/usb/host/xhci.h | 13 +++++++++++-- + 1 file changed, 11 insertions(+), 2 deletions(-) + +--- a/drivers/usb/host/xhci.h ++++ b/drivers/usb/host/xhci.h +@@ -1789,11 +1789,20 @@ static inline void xhci_write_64(struct + } + + +-/* Link TRB chain should always be set on 0.95 hosts, and AMD 0.96 ISOC rings */ ++/* ++ * Reportedly, some chapters of v0.95 spec said that Link TRB always has its chain bit set. ++ * Other chapters and later specs say that it should only be set if the link is inside a TD ++ * which continues from the end of one segment to the next segment. ++ * ++ * Some 0.95 hardware was found to misbehave if any link TRB doesn't have the chain bit set. ++ * ++ * 0.96 hardware from AMD and NEC was found to ignore unchained isochronous link TRBs when ++ * "resynchronizing the pipe" after a Missed Service Error. ++ */ + static inline bool xhci_link_chain_quirk(struct xhci_hcd *xhci, enum xhci_ring_type type) + { + return (xhci->quirks & XHCI_LINK_TRB_QUIRK) || +- (type == TYPE_ISOC && (xhci->quirks & XHCI_AMD_0x96_HOST)); ++ (type == TYPE_ISOC && (xhci->quirks & (XHCI_AMD_0x96_HOST | XHCI_NEC_HOST))); + } + + /* xHCI debugging */ diff --git a/queue-6.1/usb-xhci-move-link-chain-bit-quirk-checks-into-one-helper-function.patch b/queue-6.1/usb-xhci-move-link-chain-bit-quirk-checks-into-one-helper-function.patch new file mode 100644 index 0000000000..3728ec72cf --- /dev/null +++ b/queue-6.1/usb-xhci-move-link-chain-bit-quirk-checks-into-one-helper-function.patch @@ -0,0 +1,103 @@ +From shivani.agarwal@broadcom.com Thu Jan 8 10:10:31 2026 +From: Shivani Agarwal +Date: Thu, 8 Jan 2026 00:49:26 -0800 +Subject: usb: xhci: move link chain bit quirk checks into one helper function. +To: stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: mathias.nyman@intel.com, linux-usb@vger.kernel.org, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Niklas Neronin , Mathias Nyman , Shivani Agarwal +Message-ID: <20260108084927.671785-2-shivani.agarwal@broadcom.com> + +From: Niklas Neronin + +commit 7476a2215c07703db5e95efaa3fc5b9f957b9417 upstream. + +Older 0.95 xHCI hosts and some other specific newer hosts require the +chain bit to be set for Link TRBs even if the link TRB is not in the +middle of a transfer descriptor (TD). + +move the checks for all those cases into one xhci_link_chain_quirk() +function to clean up and avoid code duplication. + +No functional changes. 
+ +[skip renaming chain_links flag, reword commit message -Mathias] + +Signed-off-by: Niklas Neronin +Signed-off-by: Mathias Nyman +Link: https://lore.kernel.org/r/20240626124835.1023046-10-mathias.nyman@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +[Shivani: Modified to apply on v5.10.y-v6.1.y] +Signed-off-by: Shivani Agarwal +Signed-off-by: Greg Kroah-Hartman +--- + drivers/usb/host/xhci-mem.c | 10 ++-------- + drivers/usb/host/xhci-ring.c | 8 ++------ + drivers/usb/host/xhci.h | 7 +++++-- + 3 files changed, 9 insertions(+), 16 deletions(-) + +--- a/drivers/usb/host/xhci-mem.c ++++ b/drivers/usb/host/xhci-mem.c +@@ -133,10 +133,7 @@ static void xhci_link_rings(struct xhci_ + if (!ring || !first || !last) + return; + +- /* Set chain bit for 0.95 hosts, and for isoc rings on AMD 0.96 host */ +- chain_links = !!(xhci_link_trb_quirk(xhci) || +- (ring->type == TYPE_ISOC && +- (xhci->quirks & XHCI_AMD_0x96_HOST))); ++ chain_links = xhci_link_chain_quirk(xhci, ring->type); + + next = ring->enq_seg->next; + xhci_link_segments(ring->enq_seg, first, ring->type, chain_links); +@@ -326,10 +323,7 @@ static int xhci_alloc_segments_for_ring( + struct xhci_segment *prev; + bool chain_links; + +- /* Set chain bit for 0.95 hosts, and for isoc rings on AMD 0.96 host */ +- chain_links = !!(xhci_link_trb_quirk(xhci) || +- (type == TYPE_ISOC && +- (xhci->quirks & XHCI_AMD_0x96_HOST))); ++ chain_links = xhci_link_chain_quirk(xhci, type); + + prev = xhci_segment_alloc(xhci, cycle_state, max_packet, flags); + if (!prev) +--- a/drivers/usb/host/xhci-ring.c ++++ b/drivers/usb/host/xhci-ring.c +@@ -250,9 +250,7 @@ static void inc_enq(struct xhci_hcd *xhc + * AMD 0.96 host, carry over the chain bit of the previous TRB + * (which may mean the chain bit is cleared). + */ +- if (!(ring->type == TYPE_ISOC && +- (xhci->quirks & XHCI_AMD_0x96_HOST)) && +- !xhci_link_trb_quirk(xhci)) { ++ if (!xhci_link_chain_quirk(xhci, ring->type)) { + next->link.control &= cpu_to_le32(~TRB_CHAIN); + next->link.control |= cpu_to_le32(chain); + } +@@ -3355,9 +3353,7 @@ static int prepare_ring(struct xhci_hcd + /* If we're not dealing with 0.95 hardware or isoc rings + * on AMD 0.96 host, clear the chain bit. 
+ */ +- if (!xhci_link_trb_quirk(xhci) && +- !(ep_ring->type == TYPE_ISOC && +- (xhci->quirks & XHCI_AMD_0x96_HOST))) ++ if (!xhci_link_chain_quirk(xhci, ep_ring->type)) + ep_ring->enqueue->link.control &= + cpu_to_le32(~TRB_CHAIN); + else +--- a/drivers/usb/host/xhci.h ++++ b/drivers/usb/host/xhci.h +@@ -1788,9 +1788,12 @@ static inline void xhci_write_64(struct + lo_hi_writeq(val, regs); + } + +-static inline int xhci_link_trb_quirk(struct xhci_hcd *xhci) ++ ++/* Link TRB chain should always be set on 0.95 hosts, and AMD 0.96 ISOC rings */ ++static inline bool xhci_link_chain_quirk(struct xhci_hcd *xhci, enum xhci_ring_type type) + { +- return xhci->quirks & XHCI_LINK_TRB_QUIRK; ++ return (xhci->quirks & XHCI_LINK_TRB_QUIRK) || ++ (type == TYPE_ISOC && (xhci->quirks & XHCI_AMD_0x96_HOST)); + } + + /* xHCI debugging */ diff --git a/queue-6.1/x86-mm-pat-clear-vm_pat-if-copy_p4d_range-failed.patch b/queue-6.1/x86-mm-pat-clear-vm_pat-if-copy_p4d_range-failed.patch new file mode 100644 index 0000000000..b93af846b3 --- /dev/null +++ b/queue-6.1/x86-mm-pat-clear-vm_pat-if-copy_p4d_range-failed.patch @@ -0,0 +1,130 @@ +From stable+bounces-203368-greg=kroah.com@vger.kernel.org Wed Dec 24 11:43:39 2025 +From: Ajay Kaher +Date: Wed, 24 Dec 2025 10:24:31 +0000 +Subject: x86/mm/pat: clear VM_PAT if copy_p4d_range failed +To: stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: dave.hansen@linux.intel.com, luto@kernel.org, peterz@infradead.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, hpa@zytor.com, akpm@linux-foundation.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Ma Wupeng , syzbot+5f488e922d047d8f00cc@syzkaller.appspotmail.com, Alexander Ofitserov +Message-ID: <20251224102432.923410-2-ajay.kaher@broadcom.com> + +From: Ma Wupeng + +[ Upstream commit d155df53f31068c3340733d586eb9b3ddfd70fc5 ] + +Syzbot reports a warning in untrack_pfn(). Digging into the root we found +that this is due to memory allocation failure in pmd_alloc_one. And this +failure is produced due to failslab. + +In copy_page_range(), memory alloaction for pmd failed. During the error +handling process in copy_page_range(), mmput() is called to remove all +vmas. While untrack_pfn this empty pfn, warning happens. + +Here's a simplified flow: + +dup_mm + dup_mmap + copy_page_range + copy_p4d_range + copy_pud_range + copy_pmd_range + pmd_alloc + __pmd_alloc + pmd_alloc_one + page = alloc_pages(gfp, 0); + if (!page) + return NULL; + mmput + exit_mmap + unmap_vmas + unmap_single_vma + untrack_pfn + follow_phys + WARN_ON_ONCE(1); + +Since this vma is not generate successfully, we can clear flag VM_PAT. In +this case, untrack_pfn() will not be called while cleaning this vma. + +Function untrack_pfn_moved() has also been renamed to fit the new logic. 
+ +Link: https://lkml.kernel.org/r/20230217025615.1595558-1-mawupeng1@huawei.com +Signed-off-by: Ma Wupeng +Reported-by: +Signed-off-by: Andrew Morton +Signed-off-by: Alexander Ofitserov +Cc: stable@vger.kernel.org +[ Ajay: Modified to apply on v6.1 ] +Signed-off-by: Ajay Kaher +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/mm/pat/memtype.c | 12 ++++++++---- + include/linux/pgtable.h | 7 ++++--- + mm/memory.c | 1 + + mm/mremap.c | 2 +- + 4 files changed, 14 insertions(+), 8 deletions(-) + +--- a/arch/x86/mm/pat/memtype.c ++++ b/arch/x86/mm/pat/memtype.c +@@ -1137,11 +1137,15 @@ void untrack_pfn(struct vm_area_struct * + } + + /* +- * untrack_pfn_moved is called, while mremapping a pfnmap for a new region, +- * with the old vma after its pfnmap page table has been removed. The new +- * vma has a new pfnmap to the same pfn & cache type with VM_PAT set. ++ * untrack_pfn_clear is called if the following situation fits: ++ * ++ * 1) while mremapping a pfnmap for a new region, with the old vma after ++ * its pfnmap page table has been removed. The new vma has a new pfnmap ++ * to the same pfn & cache type with VM_PAT set. ++ * 2) while duplicating vm area, the new vma fails to copy the pgtable from ++ * old vma. + */ +-void untrack_pfn_moved(struct vm_area_struct *vma) ++void untrack_pfn_clear(struct vm_area_struct *vma) + { + vma->vm_flags &= ~VM_PAT; + } +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -1214,9 +1214,10 @@ static inline void untrack_pfn(struct vm + } + + /* +- * untrack_pfn_moved is called while mremapping a pfnmap for a new region. ++ * untrack_pfn_clear is called while mremapping a pfnmap for a new region ++ * or fails to copy pgtable during duplicate vm area. + */ +-static inline void untrack_pfn_moved(struct vm_area_struct *vma) ++static inline void untrack_pfn_clear(struct vm_area_struct *vma) + { + } + #else +@@ -1228,7 +1229,7 @@ extern void track_pfn_insert(struct vm_a + extern int track_pfn_copy(struct vm_area_struct *vma); + extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size); +-extern void untrack_pfn_moved(struct vm_area_struct *vma); ++extern void untrack_pfn_clear(struct vm_area_struct *vma); + #endif + + #ifdef CONFIG_MMU +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1335,6 +1335,7 @@ copy_page_range(struct vm_area_struct *d + continue; + if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, + addr, next))) { ++ untrack_pfn_clear(dst_vma); + ret = -ENOMEM; + break; + } +--- a/mm/mremap.c ++++ b/mm/mremap.c +@@ -682,7 +682,7 @@ static unsigned long move_vma(struct vm_ + + /* Tell pfnmap has moved from this vma */ + if (unlikely(vma->vm_flags & VM_PFNMAP)) +- untrack_pfn_moved(vma); ++ untrack_pfn_clear(vma); + + if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { + /* We always clear VM_LOCKED[ONFAULT] on the old vma */ diff --git a/queue-6.1/x86-mm-pat-fix-vm_pat-handling-when-fork-fails-in-copy_page_range.patch b/queue-6.1/x86-mm-pat-fix-vm_pat-handling-when-fork-fails-in-copy_page_range.patch new file mode 100644 index 0000000000..79d507c149 --- /dev/null +++ b/queue-6.1/x86-mm-pat-fix-vm_pat-handling-when-fork-fails-in-copy_page_range.patch @@ -0,0 +1,293 @@ +From stable+bounces-203369-greg=kroah.com@vger.kernel.org Wed Dec 24 11:43:43 2025 +From: Ajay Kaher +Date: Wed, 24 Dec 2025 10:24:32 +0000 +Subject: x86/mm/pat: Fix VM_PAT handling when fork() fails in copy_page_range() +To: stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: dave.hansen@linux.intel.com, luto@kernel.org, 
peterz@infradead.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, hpa@zytor.com, akpm@linux-foundation.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, xingwei lee , yuxin wang , Marius Fleischer , David Hildenbrand , Ingo Molnar , Rik van Riel , Linus Torvalds , Sasha Levin +Message-ID: <20251224102432.923410-3-ajay.kaher@broadcom.com> + +From: David Hildenbrand + +[ Upstream commit dc84bc2aba85a1508f04a936f9f9a15f64ebfb31 ] + +If track_pfn_copy() fails, we already added the dst VMA to the maple +tree. As fork() fails, we'll cleanup the maple tree, and stumble over +the dst VMA for which we neither performed any reservation nor copied +any page tables. + +Consequently untrack_pfn() will see VM_PAT and try obtaining the +PAT information from the page table -- which fails because the page +table was not copied. + +The easiest fix would be to simply clear the VM_PAT flag of the dst VMA +if track_pfn_copy() fails. However, the whole thing is about "simply" +clearing the VM_PAT flag is shaky as well: if we passed track_pfn_copy() +and performed a reservation, but copying the page tables fails, we'll +simply clear the VM_PAT flag, not properly undoing the reservation ... +which is also wrong. + +So let's fix it properly: set the VM_PAT flag only if the reservation +succeeded (leaving it clear initially), and undo the reservation if +anything goes wrong while copying the page tables: clearing the VM_PAT +flag after undoing the reservation. + +Note that any copied page table entries will get zapped when the VMA will +get removed later, after copy_page_range() succeeded; as VM_PAT is not set +then, we won't try cleaning VM_PAT up once more and untrack_pfn() will be +happy. Note that leaving these page tables in place without a reservation +is not a problem, as we are aborting fork(); this process will never run. + +A reproducer can trigger this usually at the first try: + + https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/reproducers/pat_fork.c + + WARNING: CPU: 26 PID: 11650 at arch/x86/mm/pat/memtype.c:983 get_pat_info+0xf6/0x110 + Modules linked in: ... + CPU: 26 UID: 0 PID: 11650 Comm: repro3 Not tainted 6.12.0-rc5+ #92 + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014 + RIP: 0010:get_pat_info+0xf6/0x110 + ... + Call Trace: + + ... + untrack_pfn+0x52/0x110 + unmap_single_vma+0xa6/0xe0 + unmap_vmas+0x105/0x1f0 + exit_mmap+0xf6/0x460 + __mmput+0x4b/0x120 + copy_process+0x1bf6/0x2aa0 + kernel_clone+0xab/0x440 + __do_sys_clone+0x66/0x90 + do_syscall_64+0x95/0x180 + +Likely this case was missed in: + + d155df53f310 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed") + +... and instead of undoing the reservation we simply cleared the VM_PAT flag. + +Keep the documentation of these functions in include/linux/pgtable.h, +one place is more than sufficient -- we should clean that up for the other +functions like track_pfn_remap/untrack_pfn separately. + +Fixes: d155df53f310 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed") +Fixes: 2ab640379a0a ("x86: PAT: hooks in generic vm code to help archs to track pfnmap regions - v3") +Reported-by: xingwei lee +Reported-by: yuxin wang +Reported-by: Marius Fleischer +Signed-off-by: David Hildenbrand +Signed-off-by: Ingo Molnar +Cc: Andy Lutomirski +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: "H. 
Peter Anvin" +Cc: Linus Torvalds +Cc: Andrew Morton +Cc: linux-mm@kvack.org +Link: https://lore.kernel.org/r/20250321112323.153741-1-david@redhat.com +Closes: https://lore.kernel.org/lkml/CABOYnLx_dnqzpCW99G81DmOr+2UzdmZMk=T3uxwNxwz+R1RAwg@mail.gmail.com/ +Closes: https://lore.kernel.org/lkml/CAJg=8jwijTP5fre8woS4JVJQ8iUA6v+iNcsOgtj9Zfpc3obDOQ@mail.gmail.com/ +Signed-off-by: Sasha Levin +Cc: stable@vger.kernel.org +[ Ajay: Modified to apply on v6.1 ] +Signed-off-by: Ajay Kaher +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/mm/pat/memtype.c | 52 ++++++++++++++++++++++++---------------------- + include/linux/pgtable.h | 28 +++++++++++++++++++----- + kernel/fork.c | 4 +++ + mm/memory.c | 11 +++------ + 4 files changed, 58 insertions(+), 37 deletions(-) + +--- a/arch/x86/mm/pat/memtype.c ++++ b/arch/x86/mm/pat/memtype.c +@@ -1029,29 +1029,42 @@ static int get_pat_info(struct vm_area_s + return -EINVAL; + } + +-/* +- * track_pfn_copy is called when vma that is covering the pfnmap gets +- * copied through copy_page_range(). +- * +- * If the vma has a linear pfn mapping for the entire range, we get the prot +- * from pte and reserve the entire vma range with single reserve_pfn_range call. +- */ +-int track_pfn_copy(struct vm_area_struct *vma) ++int track_pfn_copy(struct vm_area_struct *dst_vma, ++ struct vm_area_struct *src_vma, unsigned long *pfn) + { ++ const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start; + resource_size_t paddr; +- unsigned long vma_size = vma->vm_end - vma->vm_start; + pgprot_t pgprot; ++ int rc; + +- if (vma->vm_flags & VM_PAT) { +- if (get_pat_info(vma, &paddr, &pgprot)) +- return -EINVAL; +- /* reserve the whole chunk covered by vma. */ +- return reserve_pfn_range(paddr, vma_size, &pgprot, 1); +- } ++ if (!(src_vma->vm_flags & VM_PAT)) ++ return 0; + ++ /* ++ * Duplicate the PAT information for the dst VMA based on the src ++ * VMA. ++ */ ++ if (get_pat_info(src_vma, &paddr, &pgprot)) ++ return -EINVAL; ++ rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1); ++ if (rc) ++ return rc; ++ ++ /* Reservation for the destination VMA succeeded. */ ++ dst_vma->vm_flags |= VM_PAT; ++ *pfn = PHYS_PFN(paddr); + return 0; + } + ++void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn) ++{ ++ untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start); ++ /* ++ * Reservation was freed, any copied page tables will get cleaned ++ * up later, but without getting PAT involved again. ++ */ ++} ++ + /* + * prot is passed in as a parameter for the new mapping. If the vma has + * a linear pfn mapping for the entire range, or no vma is provided, +@@ -1136,15 +1149,6 @@ void untrack_pfn(struct vm_area_struct * + vma->vm_flags &= ~VM_PAT; + } + +-/* +- * untrack_pfn_clear is called if the following situation fits: +- * +- * 1) while mremapping a pfnmap for a new region, with the old vma after +- * its pfnmap page table has been removed. The new vma has a new pfnmap +- * to the same pfn & cache type with VM_PAT set. +- * 2) while duplicating vm area, the new vma fails to copy the pgtable from +- * old vma. +- */ + void untrack_pfn_clear(struct vm_area_struct *vma) + { + vma->vm_flags &= ~VM_PAT; +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -1195,15 +1195,26 @@ static inline void track_pfn_insert(stru + } + + /* +- * track_pfn_copy is called when vma that is covering the pfnmap gets +- * copied through copy_page_range(). ++ * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page ++ * tables copied during copy_page_range(). 
On success, stores the pfn to be ++ * passed to untrack_pfn_copy(). + */ +-static inline int track_pfn_copy(struct vm_area_struct *vma) ++static inline int track_pfn_copy(struct vm_area_struct *dst_vma, ++ struct vm_area_struct *src_vma, unsigned long *pfn) + { + return 0; + } + + /* ++ * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during ++ * copy_page_range(), but after track_pfn_copy() was already called. ++ */ ++static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma, ++ unsigned long pfn) ++{ ++} ++ ++/* + * untrack_pfn is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case pfn, size are zero). +@@ -1214,8 +1225,10 @@ static inline void untrack_pfn(struct vm + } + + /* +- * untrack_pfn_clear is called while mremapping a pfnmap for a new region +- * or fails to copy pgtable during duplicate vm area. ++ * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA: ++ * ++ * 1) During mremap() on the src VMA after the page tables were moved. ++ * 2) During fork() on the dst VMA, immediately after duplicating the src VMA. + */ + static inline void untrack_pfn_clear(struct vm_area_struct *vma) + { +@@ -1226,7 +1239,10 @@ extern int track_pfn_remap(struct vm_are + unsigned long size); + extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, + pfn_t pfn); +-extern int track_pfn_copy(struct vm_area_struct *vma); ++extern int track_pfn_copy(struct vm_area_struct *dst_vma, ++ struct vm_area_struct *src_vma, unsigned long *pfn); ++extern void untrack_pfn_copy(struct vm_area_struct *dst_vma, ++ unsigned long pfn); + extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size); + extern void untrack_pfn_clear(struct vm_area_struct *vma); +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -476,6 +476,10 @@ struct vm_area_struct *vm_area_dup(struc + *new = data_race(*orig); + INIT_LIST_HEAD(&new->anon_vma_chain); + dup_anon_vma_name(orig, new); ++ ++ /* track_pfn_copy() will later take care of copying internal state. 
*/ ++ if (unlikely(new->vm_flags & VM_PFNMAP)) ++ untrack_pfn_clear(new); + } + return new; + } +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1278,12 +1278,12 @@ int + copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) + { + pgd_t *src_pgd, *dst_pgd; +- unsigned long next; + unsigned long addr = src_vma->vm_start; + unsigned long end = src_vma->vm_end; + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; + struct mmu_notifier_range range; ++ unsigned long next, pfn; + bool is_cow; + int ret; + +@@ -1294,11 +1294,7 @@ copy_page_range(struct vm_area_struct *d + return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); + + if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { +- /* +- * We do not free on error cases below as remove_vma +- * gets called on error from higher level routine +- */ +- ret = track_pfn_copy(src_vma); ++ ret = track_pfn_copy(dst_vma, src_vma, &pfn); + if (ret) + return ret; + } +@@ -1335,7 +1331,6 @@ copy_page_range(struct vm_area_struct *d + continue; + if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, + addr, next))) { +- untrack_pfn_clear(dst_vma); + ret = -ENOMEM; + break; + } +@@ -1345,6 +1340,8 @@ copy_page_range(struct vm_area_struct *d + raw_write_seqcount_end(&src_mm->write_protect_seq); + mmu_notifier_invalidate_range_end(&range); + } ++ if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP)) ++ untrack_pfn_copy(dst_vma, pfn); + return ret; + } +
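
To make the reserve-then-commit flow of the last patch easier to follow, here is a minimal, self-contained userspace sketch of the same error-handling pattern. It is only a model: fake_vma, reserve_range(), fake_track_copy() and the other names are illustrative stand-ins rather than kernel APIs, and the sketch reproduces just the control flow of track_pfn_copy()/untrack_pfn_copy() as used by copy_page_range() in the hunks above.

/*
 * Userspace model of the reserve-then-commit pattern adopted above.
 * Everything here is an illustrative stand-in, not a kernel API; only
 * the control flow mirrors track_pfn_copy()/untrack_pfn_copy().
 */
#include <stdbool.h>
#include <stdio.h>

#define FAKE_VM_PFNMAP 0x1u
#define FAKE_VM_PAT    0x2u

struct fake_vma {
	unsigned long flags;
	unsigned long start;
	unsigned long end;
};

/* Stand-in for reserve_pfn_range(); returns 0 on success, like the kernel. */
static int reserve_range(unsigned long start, unsigned long size)
{
	(void)start;
	(void)size;
	return 0;
}

/* Stand-in for releasing the reservation again. */
static void unreserve_range(unsigned long start, unsigned long size)
{
	(void)start;
	(void)size;
}

/* Mirrors track_pfn_copy(): commit the flag only after a successful reserve. */
static int fake_track_copy(struct fake_vma *dst, const struct fake_vma *src,
			   unsigned long *start_out)
{
	int rc;

	if (!(src->flags & FAKE_VM_PAT))
		return 0;

	rc = reserve_range(src->start, src->end - src->start);
	if (rc)
		return rc;

	dst->flags |= FAKE_VM_PAT;	/* set only once the reservation exists */
	*start_out = src->start;
	return 0;
}

/* Mirrors untrack_pfn_copy(): undo the reservation, then drop the flag. */
static void fake_untrack_copy(struct fake_vma *dst, unsigned long start)
{
	if (!(dst->flags & FAKE_VM_PAT))
		return;		/* nothing was reserved for this destination */
	unreserve_range(start, dst->end - dst->start);
	dst->flags &= ~FAKE_VM_PAT;
}

/* Mirrors the copy_page_range() flow: reserve, copy, undo on failure. */
static int fake_copy_range(struct fake_vma *dst, const struct fake_vma *src,
			   bool copy_fails)
{
	unsigned long start = 0;
	int ret = 0;

	if (src->flags & FAKE_VM_PFNMAP) {
		ret = fake_track_copy(dst, src, &start);
		if (ret)
			return ret;
	}

	if (copy_fails)		/* stand-in for copy_p4d_range() returning an error */
		ret = -1;

	if (ret && (src->flags & FAKE_VM_PFNMAP))
		fake_untrack_copy(dst, start);

	return ret;
}

int main(void)
{
	struct fake_vma src = { FAKE_VM_PFNMAP | FAKE_VM_PAT, 0x1000, 0x2000 };
	struct fake_vma dst = { FAKE_VM_PFNMAP, 0x1000, 0x2000 };

	fake_copy_range(&dst, &src, true);
	printf("dst keeps PAT flag after failed copy: %s\n",
	       (dst.flags & FAKE_VM_PAT) ? "yes (the old bug)" : "no (fixed)");
	return 0;
}

The property that matters, and that the patch establishes, is that the destination only carries the PAT flag while a matching reservation exists: the flag is set after the reservation succeeds and cleared again when the reservation is undone, so a later teardown of the half-built address space never finds VM_PAT without backing state.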