--- /dev/null
+From stable+bounces-201132-greg=kroah.com@vger.kernel.org Tue Dec 16 04:16:52 2025
+From: lanbincn@139.com
+Date: Tue, 16 Dec 2025 03:13:13 +0000
+Subject: dmaengine: idxd: Remove improper idxd_free
+To: stable@vger.kernel.org
+Cc: Yi Sun <yi.sun@intel.com>, Shuai Xue <xueshuai@linux.alibaba.com>, Dave Jiang <dave.jiang@intel.com>, Vinicius Costa Gomes <vinicius.gomes@intel.com>, Vinod Koul <vkoul@kernel.org>, Bin Lan <lanbincn@139.com>
+Message-ID: <20251216031313.4853-1-lanbincn@139.com>
+
+From: Yi Sun <yi.sun@intel.com>
+
+[ Upstream commit f41c538881eec4dcf5961a242097d447f848cda6 ]
+
+The call to idxd_free() introduces a duplicate put_device() leading to a
+reference count underflow:
+refcount_t: underflow; use-after-free.
+WARNING: CPU: 15 PID: 4428 at lib/refcount.c:28 refcount_warn_saturate+0xbe/0x110
+...
+Call Trace:
+ <TASK>
+ idxd_remove+0xe4/0x120 [idxd]
+ pci_device_remove+0x3f/0xb0
+ device_release_driver_internal+0x197/0x200
+ driver_detach+0x48/0x90
+ bus_remove_driver+0x74/0xf0
+ pci_unregister_driver+0x2e/0xb0
+ idxd_exit_module+0x34/0x7a0 [idxd]
+ __do_sys_delete_module.constprop.0+0x183/0x280
+ do_syscall_64+0x54/0xd70
+ entry_SYSCALL_64_after_hwframe+0x76/0x7e
+
+The idxd_unregister_devices() which is invoked at the very beginning of
+idxd_remove(), already takes care of the necessary put_device() through the
+following call path:
+idxd_unregister_devices() -> device_unregister() -> put_device()
+
+In addition, when CONFIG_DEBUG_KOBJECT_RELEASE is enabled, put_device() may
+trigger asynchronous cleanup via schedule_delayed_work(). If idxd_free() is
+called immediately after, it can result in a use-after-free.
+
+Remove the improper idxd_free() to avoid both the refcount underflow and
+potential memory corruption during module unload.
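+
+Schematically, the unload path ends up dropping the same reference twice; a
+condensed view of idxd_remove() before this change (the hunk below is
+authoritative):
+
+  idxd_unregister_devices(idxd);    /* device_unregister() -> put_device() */
+  ...
+  destroy_workqueue(idxd->wq);
+  perfmon_pmu_remove(idxd);
+  put_device(idxd_confdev(idxd));
+  idxd_free(idxd);                  /* duplicate put_device() -> underflow */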
+
+Fixes: d5449ff1b04d ("dmaengine: idxd: Add missing idxd cleanup to fix memory leak in remove call")
+Signed-off-by: Yi Sun <yi.sun@intel.com>
+Tested-by: Shuai Xue <xueshuai@linux.alibaba.com>
+Reviewed-by: Dave Jiang <dave.jiang@intel.com>
+Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+
+Link: https://lore.kernel.org/r/20250729150313.1934101-2-yi.sun@intel.com
+Signed-off-by: Vinod Koul <vkoul@kernel.org>
+[ Slightly adjust the context. ]
+Signed-off-by: Bin Lan <lanbincn@139.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+Without this patch, this issue can be reproduced in Linux-6.1.y
+when the idxd module is removed.
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/dma/idxd/init.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/drivers/dma/idxd/init.c
++++ b/drivers/dma/idxd/init.c
+@@ -816,7 +816,6 @@ static void idxd_remove(struct pci_dev *
+ destroy_workqueue(idxd->wq);
+ perfmon_pmu_remove(idxd);
+ put_device(idxd_confdev(idxd));
+- idxd_free(idxd);
+ }
+
+ static struct pci_driver idxd_pci_driver = {
--- /dev/null
+From stable+bounces-203358-greg=kroah.com@vger.kernel.org Wed Dec 24 09:57:13 2025
+From: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Date: Wed, 24 Dec 2025 00:36:52 -0800
+Subject: drm/vmwgfx: Fix a null-ptr access in the cursor snooper
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: zack.rusin@broadcom.com, bcm-kernel-feedback-list@broadcom.com, maarten.lankhorst@linux.intel.com, mripard@kernel.org, tzimmermann@suse.de, simona@ffwll.ch, airlied@gmail.com, brianp@vmware.com, dtor@vmware.com, airlied@redhat.com, thellstrom@vmware.com, dri-devel@lists.freedesktop.org, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Kuzey Arda Bulut <kuzeyardabulut@gmail.com>, Ian Forbes <ian.forbes@broadcom.com>, Sasha Levin <sashal@kernel.org>, Shivani Agarwal <shivani.agarwal@broadcom.com>
+Message-ID: <20251224083652.614902-1-shivani.agarwal@broadcom.com>
+
+From: Zack Rusin <zack.rusin@broadcom.com>
+
+[ Upstream commit 5ac2c0279053a2c5265d46903432fb26ae2d0da2 ]
+
+Check that the resource which is converted to a surface exists before
+trying to use the cursor snooper on it.
+
+vmw_cmd_res_check allows explicitly invalid (SVGA3D_INVALID_ID) identifiers
+because some svga commands accept SVGA3D_INVALID_ID to mean "no surface".
+Unfortunately, functions that take the actual surface objects might not be
+able to handle null objects (and in the case of the cursor snooper, cannot).
+Make sure that we not only validate the identifier (via vmw_cmd_res_check)
+but also check that the actual resource exists before trying to do something
+with it.
+
+Fixes an unchecked null-ptr dereference in the snooping code.
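+
+Condensed, the added guard amounts to the following (the execbuf hunk below
+is the actual change):
+
+  res = sw_context->res_cache[vmw_res_surface].res;
+  if (!res) {                 /* SVGA3D_INVALID_ID: no surface bound */
+          VMW_DEBUG_USER("Invalid DMA surface.\n");
+          return -EINVAL;     /* never hand a NULL resource to the snooper */
+  }
+  srf = vmw_res_to_srf(res);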
+
+Signed-off-by: Zack Rusin <zack.rusin@broadcom.com>
+Fixes: c0951b797e7d ("drm/vmwgfx: Refactor resource management")
+Reported-by: Kuzey Arda Bulut <kuzeyardabulut@gmail.com>
+Cc: Broadcom internal kernel review list <bcm-kernel-feedback-list@broadcom.com>
+Cc: dri-devel@lists.freedesktop.org
+Reviewed-by: Ian Forbes <ian.forbes@broadcom.com>
+Link: https://lore.kernel.org/r/20250917153655.1968583-1-zack.rusin@broadcom.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+[Shivani: Modified to apply on v5.10.y-v6.1.y]
+Signed-off-by: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
+@@ -1507,6 +1507,7 @@ static int vmw_cmd_dma(struct vmw_privat
+ SVGA3dCmdHeader *header)
+ {
+ struct vmw_buffer_object *vmw_bo = NULL;
++ struct vmw_resource *res;
+ struct vmw_surface *srf = NULL;
+ VMW_DECLARE_CMD_VAR(*cmd, SVGA3dCmdSurfaceDMA);
+ int ret;
+@@ -1542,18 +1543,24 @@ static int vmw_cmd_dma(struct vmw_privat
+
+ dirty = (cmd->body.transfer == SVGA3D_WRITE_HOST_VRAM) ?
+ VMW_RES_DIRTY_SET : 0;
+- ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface,
+- dirty, user_surface_converter,
+- &cmd->body.host.sid, NULL);
++ ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface, dirty,
++ user_surface_converter, &cmd->body.host.sid,
++ NULL);
+ if (unlikely(ret != 0)) {
+ if (unlikely(ret != -ERESTARTSYS))
+ VMW_DEBUG_USER("could not find surface for DMA.\n");
+ return ret;
+ }
+
+- srf = vmw_res_to_srf(sw_context->res_cache[vmw_res_surface].res);
++ res = sw_context->res_cache[vmw_res_surface].res;
++ if (!res) {
++ VMW_DEBUG_USER("Invalid DMA surface.\n");
++ return -EINVAL;
++ }
+
+- vmw_kms_cursor_snoop(srf, sw_context->fp->tfile, &vmw_bo->base, header);
++ srf = vmw_res_to_srf(res);
++ vmw_kms_cursor_snoop(srf, sw_context->fp->tfile, &vmw_bo->base,
++ header);
+
+ return 0;
+ }
--- /dev/null
+From justinstitt@google.com Thu Jan 8 14:27:57 2026
+From: Justin Stitt <justinstitt@google.com>
+Date: Fri, 05 Dec 2025 14:51:41 -0800
+Subject: KVM: arm64: sys_regs: disable -Wuninitialized-const-pointer warning
+To: Marc Zyngier <maz@kernel.org>, Oliver Upton <oliver.upton@linux.dev>, Alexandru Elisei <alexandru.elisei@arm.com>, Joey Gouly <joey.gouly@arm.com>, Suzuki K Poulose <suzuki.poulose@arm.com>, Catalin Marinas <catalin.marinas@arm.com>, Zenghui Yu <yuzenghui@huawei.com>, Will Deacon <will@kernel.org>, Nathan Chancellor <nathan@kernel.org>, Christopher Covington <cov@codeaurora.org>
+Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.cs.columbia.edu, linux-kernel@vger.kernel.org, llvm@lists.linux.dev, stable@vger.kernel.org, Justin Stitt <justinstitt@google.com>
+Message-ID: <20251205-stable-disable-unit-ptr-warn-v2-1-cec53a8f736b@google.com>
+
+From: Justin Stitt <justinstitt@google.com>
+
+A new warning in Clang 22 [1] complains that @clidr passed to
+get_clidr_el1() is an uninitialized const pointer. get_clidr_el1()
+doesn't really care since it casts away the const-ness anyways -- it is
+a false positive.
+
+This patch isn't needed for anything past 6.1, as this code section was
+reworked in commit 7af0c2534f4c ("KVM: arm64: Normalize cache
+configuration"), which incidentally removed the aforementioned warning.
+Since there is no upstream equivalent, this patch just needs to be
+applied to 6.1.
+
+Disable this warning for sys_regs.o instead of backporting the patches
+from 6.2+ that modified this code area.
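+
+Reduced to its essence, the pattern Clang 22 flags looks like this
+(illustrative only, using a kernel-style u64; not the actual sys_regs.c
+code):
+
+  static void fill(const u64 *p) { *(u64 *)p = 0; }  /* casts the const away */
+
+  static u64 demo(void)
+  {
+          u64 val;        /* deliberately left uninitialized */
+
+          fill(&val);     /* -Wuninitialized-const-pointer fires here */
+          return val;
+  }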
+
+Cc: stable@vger.kernel.org
+Fixes: 7c8c5e6a9101e ("arm64: KVM: system register handling")
+Link: https://github.com/llvm/llvm-project/commit/00dacf8c22f065cb52efb14cd091d441f19b319e [1]
+Reviewed-by: Nathan Chancellor <nathan@kernel.org>
+Signed-off-by: Justin Stitt <justinstitt@google.com>
+Reviewed-by: Tiffany Yang <ynaffit@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/Makefile | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/arm64/kvm/Makefile
++++ b/arch/arm64/kvm/Makefile
+@@ -24,6 +24,9 @@ kvm-y += arm.o mmu.o mmio.o psci.o hyper
+
+ kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o
+
++# Work around a false positive Clang 22 -Wuninitialized-const-pointer warning
++CFLAGS_sys_regs.o := $(call cc-disable-warning, uninitialized-const-pointer)
++
+ always-y := hyp_constants.h hyp-constants.s
+
+ define rule_gen_hyp_constants
--- /dev/null
+From stable+bounces-205076-greg=kroah.com@vger.kernel.org Tue Jan 6 12:49:13 2026
+From: Harry Yoo <harry.yoo@oracle.com>
+Date: Tue, 6 Jan 2026 20:47:14 +0900
+Subject: mm/mprotect: delete pmd_none_or_clear_bad_unless_trans_huge()
+To: stable@vger.kernel.org
+Cc: Liam.Howlett@oracle.com, akpm@linux-foundation.org, baohua@kernel.org, baolin.wang@linux.alibaba.com, david@kernel.org, dev.jain@arm.com, hughd@google.com, jane.chu@oracle.com, jannh@google.com, kas@kernel.org, lance.yang@linux.dev, linux-mm@kvack.org, lorenzo.stoakes@oracle.com, npache@redhat.com, pfalcato@suse.de, ryan.roberts@arm.com, vbabka@suse.cz, ziy@nvidia.com, "Alistair Popple" <apopple@nvidia.com>, "Anshuman Khandual" <anshuman.khandual@arm.com>, "Axel Rasmussen" <axelrasmussen@google.com>, "Christophe Leroy" <christophe.leroy@csgroup.eu>, "Christoph Hellwig" <hch@infradead.org>, "David Hildenbrand" <david@redhat.com>, "Huang, Ying" <ying.huang@intel.com>, "Ira Weiny" <ira.weiny@intel.com>, "Jason Gunthorpe" <jgg@ziepe.ca>, "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>, "Lorenzo Stoakes" <lstoakes@gmail.com>, "Matthew Wilcox" <willy@infradead.org>, "Mel Gorman" <mgorman@techsingularity.net>, "Miaohe Lin" <linmiaohe@huawei.com>, "Mike Kravetz" <mike.kravetz@oracle.com>, "Mike Rapoport" <rppt@kernel.org>, "Minchan Kim" <minchan@kernel.org>, "Naoya Horiguchi" <naoya.horiguchi@nec.com>, "Pavel Tatashin" <pasha.tatashin@soleen.com>, "Peter Xu" <peterx@redhat.com>, "Peter Zijlstra" <peterz@infradead.org>, "Qi Zheng" <zhengqi.arch@bytedance.com>, "Ralph Campbell" <rcampbell@nvidia.com>, "SeongJae Park" <sj@kernel.org>, "Song Liu" <song@kernel.org>, "Steven Price" <steven.price@arm.com>, "Suren Baghdasaryan" <surenb@google.com>, "Thomas Hellström" <thomas.hellstrom@linux.intel.com>, "Will Deacon" <will@kernel.org>, "Yang Shi" <shy828301@gmail.com>, "Yu Zhao" <yuzhao@google.com>, "Zack Rusin" <zackr@vmware.com>, "Harry Yoo" <harry.yoo@oracle.com>
+Message-ID: <20260106114715.80958-3-harry.yoo@oracle.com>
+
+From: Hugh Dickins <hughd@google.com>
+
+commit 670ddd8cdcbd1d07a4571266ae3517f821728c3a upstream.
+
+change_pmd_range() had special pmd_none_or_clear_bad_unless_trans_huge(),
+required to avoid "bad" choices when setting automatic NUMA hinting under
+mmap_read_lock(); but most of that is already covered in pte_offset_map()
+now. change_pmd_range() just wants a pmd_none() check before wasting time
+on MMU notifiers, then checks on the read-once _pmd value to work out
+what's needed for huge cases. If change_pte_range() returns -EAGAIN to
+retry if pte_offset_map_lock() fails, nothing more special is needed.
+
+Link: https://lkml.kernel.org/r/725a42a9-91e9-c868-925-e3a5fd40bb4f@google.com
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Anshuman Khandual <anshuman.khandual@arm.com>
+Cc: Axel Rasmussen <axelrasmussen@google.com>
+Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
+Cc: Christoph Hellwig <hch@infradead.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Ira Weiny <ira.weiny@intel.com>
+Cc: Jason Gunthorpe <jgg@ziepe.ca>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Lorenzo Stoakes <lstoakes@gmail.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Mike Rapoport (IBM) <rppt@kernel.org>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Qi Zheng <zhengqi.arch@bytedance.com>
+Cc: Ralph Campbell <rcampbell@nvidia.com>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: SeongJae Park <sj@kernel.org>
+Cc: Song Liu <song@kernel.org>
+Cc: Steven Price <steven.price@arm.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Cc: Zack Rusin <zackr@vmware.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+[ Background: It was reported that a bad pmd is seen when automatic NUMA
+ balancing is marking page table entries as prot_numa:
+
+ [2437548.196018] mm/pgtable-generic.c:50: bad pmd 00000000af22fc02(dffffffe71fbfe02)
+ [2437548.235022] Call Trace:
+ [2437548.238234] <TASK>
+ [2437548.241060] dump_stack_lvl+0x46/0x61
+ [2437548.245689] panic+0x106/0x2e5
+ [2437548.249497] pmd_clear_bad+0x3c/0x3c
+ [2437548.253967] change_pmd_range.isra.0+0x34d/0x3a7
+ [2437548.259537] change_p4d_range+0x156/0x20e
+ [2437548.264392] change_protection_range+0x116/0x1a9
+ [2437548.269976] change_prot_numa+0x15/0x37
+ [2437548.274774] task_numa_work+0x1b8/0x302
+ [2437548.279512] task_work_run+0x62/0x95
+ [2437548.283882] exit_to_user_mode_loop+0x1a4/0x1a9
+ [2437548.289277] exit_to_user_mode_prepare+0xf4/0xfc
+ [2437548.294751] ? sysvec_apic_timer_interrupt+0x34/0x81
+ [2437548.300677] irqentry_exit_to_user_mode+0x5/0x25
+ [2437548.306153] asm_sysvec_apic_timer_interrupt+0x16/0x1b
+
+ This is due to a race condition between change_prot_numa() and
+ THP migration because the kernel doesn't check is_swap_pmd() and
+ pmd_trans_huge() atomically:
+
+ change_prot_numa() THP migration
+ ======================================================================
+ - change_pmd_range()
+ -> is_swap_pmd() returns false,
+ meaning it's not a PMD migration
+ entry.
+ - do_huge_pmd_numa_page()
+ -> migrate_misplaced_page() sets
+ migration entries for the THP.
+ - change_pmd_range()
+ -> pmd_none_or_clear_bad_unless_trans_huge()
+ -> pmd_none() and pmd_trans_huge() returns false
+ - pmd_none_or_clear_bad_unless_trans_huge()
+ -> pmd_bad() returns true for the migration entry!
+
+ The upstream commit 670ddd8cdcbd ("mm/mprotect: delete
+ pmd_none_or_clear_bad_unless_trans_huge()") closes this race condition
+ by checking is_swap_pmd() and pmd_trans_huge() atomically.
+
+ Backporting note:
+ Unlike in the mainline, pte_offset_map_lock() does not check whether the
+ pmd entry is a migration entry or a hugepage; it acquires the PTL
+ unconditionally instead of returning failure. Therefore, it is necessary
+ to keep the
+ !is_swap_pmd() && !pmd_trans_huge() && !pmd_devmap() check before
+ acquiring the PTL.
+
+ After acquiring the lock, open-code the semantics of
+ pte_offset_map_lock() in the mainline kernel; change_pte_range() fails
+ if the pmd value has changed. This requires adding a pmd_old parameter
+ (pmd_t value that is read before calling the function) to
+ change_pte_range(). ]
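+
+Condensed, the re-validation this backport adds looks as follows (the
+mprotect.c hunk below is authoritative):
+
+  pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+  if (!pmd_same(pmd_old, pmd_read_atomic(pmd))) {
+          pte_unmap_unlock(pte, ptl);
+          return -EAGAIN;   /* change_pmd_range() retries from "again:" */
+  }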
+
+Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
+Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/mprotect.c | 101 ++++++++++++++++++++++++----------------------------------
+ 1 file changed, 43 insertions(+), 58 deletions(-)
+
+--- a/mm/mprotect.c
++++ b/mm/mprotect.c
+@@ -73,10 +73,12 @@ static inline bool can_change_pte_writab
+ }
+
+ static long change_pte_range(struct mmu_gather *tlb,
+- struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
+- unsigned long end, pgprot_t newprot, unsigned long cp_flags)
++ struct vm_area_struct *vma, pmd_t *pmd, pmd_t pmd_old,
++ unsigned long addr, unsigned long end, pgprot_t newprot,
++ unsigned long cp_flags)
+ {
+ pte_t *pte, oldpte;
++ pmd_t _pmd;
+ spinlock_t *ptl;
+ long pages = 0;
+ int target_node = NUMA_NO_NODE;
+@@ -86,21 +88,15 @@ static long change_pte_range(struct mmu_
+
+ tlb_change_page_size(tlb, PAGE_SIZE);
+
+- /*
+- * Can be called with only the mmap_lock for reading by
+- * prot_numa so we must check the pmd isn't constantly
+- * changing from under us from pmd_none to pmd_trans_huge
+- * and/or the other way around.
+- */
+- if (pmd_trans_unstable(pmd))
+- return 0;
+-
+- /*
+- * The pmd points to a regular pte so the pmd can't change
+- * from under us even if the mmap_lock is only hold for
+- * reading.
+- */
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
++ /* Make sure pmd didn't change after acquiring ptl */
++ _pmd = pmd_read_atomic(pmd);
++ /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
++ barrier();
++ if (!pmd_same(pmd_old, _pmd)) {
++ pte_unmap_unlock(pte, ptl);
++ return -EAGAIN;
++ }
+
+ /* Get target node for single threaded private VMAs */
+ if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
+@@ -288,31 +284,6 @@ static long change_pte_range(struct mmu_
+ return pages;
+ }
+
+-/*
+- * Used when setting automatic NUMA hinting protection where it is
+- * critical that a numa hinting PMD is not confused with a bad PMD.
+- */
+-static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
+-{
+- pmd_t pmdval = pmd_read_atomic(pmd);
+-
+- /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
+-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+- barrier();
+-#endif
+-
+- if (pmd_none(pmdval))
+- return 1;
+- if (pmd_trans_huge(pmdval))
+- return 0;
+- if (unlikely(pmd_bad(pmdval))) {
+- pmd_clear_bad(pmd);
+- return 1;
+- }
+-
+- return 0;
+-}
+-
+ /* Return true if we're uffd wr-protecting file-backed memory, or false */
+ static inline bool
+ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
+@@ -360,22 +331,34 @@ static inline long change_pmd_range(stru
+
+ pmd = pmd_offset(pud, addr);
+ do {
+- long this_pages;
+-
++ long ret;
++ pmd_t _pmd;
++again:
+ next = pmd_addr_end(addr, end);
++ _pmd = pmd_read_atomic(pmd);
++ /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++ barrier();
++#endif
+
+ change_pmd_prepare(vma, pmd, cp_flags);
+ /*
+ * Automatic NUMA balancing walks the tables with mmap_lock
+ * held for read. It's possible a parallel update to occur
+- * between pmd_trans_huge() and a pmd_none_or_clear_bad()
+- * check leading to a false positive and clearing.
+- * Hence, it's necessary to atomically read the PMD value
+- * for all the checks.
++ * between pmd_trans_huge(), is_swap_pmd(), and
++ * a pmd_none_or_clear_bad() check leading to a false positive
++ * and clearing. Hence, it's necessary to atomically read
++ * the PMD value for all the checks.
+ */
+- if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
+- pmd_none_or_clear_bad_unless_trans_huge(pmd))
+- goto next;
++ if (!is_swap_pmd(_pmd) && !pmd_devmap(_pmd) && !pmd_trans_huge(_pmd)) {
++ if (pmd_none(_pmd))
++ goto next;
++
++ if (pmd_bad(_pmd)) {
++ pmd_clear_bad(pmd);
++ goto next;
++ }
++ }
+
+ /* invoke the mmu notifier if the pmd is populated */
+ if (!range.start) {
+@@ -385,7 +368,7 @@ static inline long change_pmd_range(stru
+ mmu_notifier_invalidate_range_start(&range);
+ }
+
+- if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
++ if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
+ if ((next - addr != HPAGE_PMD_SIZE) ||
+ uffd_wp_protect_file(vma, cp_flags)) {
+ __split_huge_pmd(vma, pmd, addr, false, NULL);
+@@ -400,11 +383,11 @@ static inline long change_pmd_range(stru
+ * change_huge_pmd() does not defer TLB flushes,
+ * so no need to propagate the tlb argument.
+ */
+- int nr_ptes = change_huge_pmd(tlb, vma, pmd,
+- addr, newprot, cp_flags);
++ ret = change_huge_pmd(tlb, vma, pmd,
++ addr, newprot, cp_flags);
+
+- if (nr_ptes) {
+- if (nr_ptes == HPAGE_PMD_NR) {
++ if (ret) {
++ if (ret == HPAGE_PMD_NR) {
+ pages += HPAGE_PMD_NR;
+ nr_huge_updates++;
+ }
+@@ -415,9 +398,11 @@ static inline long change_pmd_range(stru
+ }
+ /* fall through, the trans huge pmd just split */
+ }
+- this_pages = change_pte_range(tlb, vma, pmd, addr, next,
+- newprot, cp_flags);
+- pages += this_pages;
++ ret = change_pte_range(tlb, vma, pmd, _pmd, addr, next,
++ newprot, cp_flags);
++ if (ret < 0)
++ goto again;
++ pages += ret;
+ next:
+ cond_resched();
+ } while (pmd++, addr = next, addr != end);
--- /dev/null
+From stable+bounces-205075-greg=kroah.com@vger.kernel.org Tue Jan 6 12:48:24 2026
+From: Harry Yoo <harry.yoo@oracle.com>
+Date: Tue, 6 Jan 2026 20:47:13 +0900
+Subject: mm/mprotect: use long for page accountings and retval
+To: stable@vger.kernel.org
+Cc: Liam.Howlett@oracle.com, akpm@linux-foundation.org, baohua@kernel.org, baolin.wang@linux.alibaba.com, david@kernel.org, dev.jain@arm.com, hughd@google.com, jane.chu@oracle.com, jannh@google.com, kas@kernel.org, lance.yang@linux.dev, linux-mm@kvack.org, lorenzo.stoakes@oracle.com, npache@redhat.com, pfalcato@suse.de, ryan.roberts@arm.com, vbabka@suse.cz, ziy@nvidia.com, Peter Xu <peterx@redhat.com>, Mike Kravetz <mike.kravetz@oracle.com>, James Houghton <jthoughton@google.com>, Andrea Arcangeli <aarcange@redhat.com>, Axel Rasmussen <axelrasmussen@google.com>, David Hildenbrand <david@redhat.com>, Muchun Song <songmuchun@bytedance.com>, Nadav Amit <nadav.amit@gmail.com>, Harry Yoo <harry.yoo@oracle.com>
+Message-ID: <20260106114715.80958-2-harry.yoo@oracle.com>
+
+From: Peter Xu <peterx@redhat.com>
+
+commit a79390f5d6a78647fd70856bd42b22d994de0ba2 upstream.
+
+Switch to use type "long" for page accountings and retval across the whole
+procedure of change_protection().
+
+The change shrinks the possible maximum page number to half of what it was
+before (ULONG_MAX / 2), but it shouldn't overflow on any system either,
+because the maximum possible number of pages touched by change_protection()
+is ULONG_MAX / PAGE_SIZE.
+
+Two reasons to switch from "unsigned long" to "long":
+
+ 1. It suits count_vm_numa_events() better, whose 2nd parameter takes
+ a long type.
+
+ 2. It paves way for returning negative (error) values in the future.
+
+Currently the only caller that consumes this retval is change_prot_numa(),
+where the unsigned long was converted to an int. While at it, touch up
+the numa code to also take a long, so it avoids any possible overflow
+during the int-size conversion.
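+
+As a quick sanity check of the sizes involved: on a 64-bit system with 4K
+pages, ULONG_MAX / PAGE_SIZE = 2^64 / 2^12 = 2^52 pages at most, which is
+far below LONG_MAX (2^63 - 1), so the signed type cannot overflow here.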
+
+Link: https://lkml.kernel.org/r/20230104225207.1066932-3-peterx@redhat.com
+Signed-off-by: Peter Xu <peterx@redhat.com>
+Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
+Acked-by: James Houghton <jthoughton@google.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Axel Rasmussen <axelrasmussen@google.com>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Muchun Song <songmuchun@bytedance.com>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+[ Adjust context ]
+Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
+Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/hugetlb.h | 4 ++--
+ include/linux/mm.h | 2 +-
+ mm/hugetlb.c | 4 ++--
+ mm/mempolicy.c | 2 +-
+ mm/mprotect.c | 26 +++++++++++++-------------
+ 5 files changed, 19 insertions(+), 19 deletions(-)
+
+--- a/include/linux/hugetlb.h
++++ b/include/linux/hugetlb.h
+@@ -233,7 +233,7 @@ void hugetlb_vma_lock_release(struct kre
+
+ int pmd_huge(pmd_t pmd);
+ int pud_huge(pud_t pud);
+-unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
++long hugetlb_change_protection(struct vm_area_struct *vma,
+ unsigned long address, unsigned long end, pgprot_t newprot,
+ unsigned long cp_flags);
+
+@@ -447,7 +447,7 @@ static inline void move_hugetlb_state(st
+ {
+ }
+
+-static inline unsigned long hugetlb_change_protection(
++static inline long hugetlb_change_protection(
+ struct vm_area_struct *vma, unsigned long address,
+ unsigned long end, pgprot_t newprot,
+ unsigned long cp_flags)
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2148,7 +2148,7 @@ extern unsigned long move_page_tables(st
+ #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \
+ MM_CP_UFFD_WP_RESOLVE)
+
+-extern unsigned long change_protection(struct mmu_gather *tlb,
++extern long change_protection(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgprot_t newprot,
+ unsigned long cp_flags);
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -6668,7 +6668,7 @@ long follow_hugetlb_page(struct mm_struc
+ return i ? i : err;
+ }
+
+-unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
++long hugetlb_change_protection(struct vm_area_struct *vma,
+ unsigned long address, unsigned long end,
+ pgprot_t newprot, unsigned long cp_flags)
+ {
+@@ -6677,7 +6677,7 @@ unsigned long hugetlb_change_protection(
+ pte_t *ptep;
+ pte_t pte;
+ struct hstate *h = hstate_vma(vma);
+- unsigned long pages = 0, psize = huge_page_size(h);
++ long pages = 0, psize = huge_page_size(h);
+ bool shared_pmd = false;
+ struct mmu_notifier_range range;
+ unsigned long last_addr_mask;
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -628,7 +628,7 @@ unsigned long change_prot_numa(struct vm
+ unsigned long addr, unsigned long end)
+ {
+ struct mmu_gather tlb;
+- int nr_updated;
++ long nr_updated;
+
+ tlb_gather_mmu(&tlb, vma->vm_mm);
+
+--- a/mm/mprotect.c
++++ b/mm/mprotect.c
+@@ -72,13 +72,13 @@ static inline bool can_change_pte_writab
+ return true;
+ }
+
+-static unsigned long change_pte_range(struct mmu_gather *tlb,
++static long change_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
+ {
+ pte_t *pte, oldpte;
+ spinlock_t *ptl;
+- unsigned long pages = 0;
++ long pages = 0;
+ int target_node = NUMA_NO_NODE;
+ bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+ bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+@@ -346,13 +346,13 @@ uffd_wp_protect_file(struct vm_area_stru
+ } \
+ } while (0)
+
+-static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
++static inline long change_pmd_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
+ {
+ pmd_t *pmd;
+ unsigned long next;
+- unsigned long pages = 0;
++ long pages = 0;
+ unsigned long nr_huge_updates = 0;
+ struct mmu_notifier_range range;
+
+@@ -360,7 +360,7 @@ static inline unsigned long change_pmd_r
+
+ pmd = pmd_offset(pud, addr);
+ do {
+- unsigned long this_pages;
++ long this_pages;
+
+ next = pmd_addr_end(addr, end);
+
+@@ -430,13 +430,13 @@ next:
+ return pages;
+ }
+
+-static inline unsigned long change_pud_range(struct mmu_gather *tlb,
++static inline long change_pud_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
+ {
+ pud_t *pud;
+ unsigned long next;
+- unsigned long pages = 0;
++ long pages = 0;
+
+ pud = pud_offset(p4d, addr);
+ do {
+@@ -451,13 +451,13 @@ static inline unsigned long change_pud_r
+ return pages;
+ }
+
+-static inline unsigned long change_p4d_range(struct mmu_gather *tlb,
++static inline long change_p4d_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr,
+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
+ {
+ p4d_t *p4d;
+ unsigned long next;
+- unsigned long pages = 0;
++ long pages = 0;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+@@ -472,14 +472,14 @@ static inline unsigned long change_p4d_r
+ return pages;
+ }
+
+-static unsigned long change_protection_range(struct mmu_gather *tlb,
++static long change_protection_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr,
+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
+ {
+ struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd;
+ unsigned long next;
+- unsigned long pages = 0;
++ long pages = 0;
+
+ BUG_ON(addr >= end);
+ pgd = pgd_offset(mm, addr);
+@@ -498,12 +498,12 @@ static unsigned long change_protection_r
+ return pages;
+ }
+
+-unsigned long change_protection(struct mmu_gather *tlb,
++long change_protection(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgprot_t newprot,
+ unsigned long cp_flags)
+ {
+- unsigned long pages;
++ long pages;
+
+ BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
+
--- /dev/null
+From stable+bounces-198206-greg=kroah.com@vger.kernel.org Wed Dec 3 12:44:47 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed, 3 Dec 2025 11:25:52 +0000
+Subject: sched/fair: Proportional newidle balance
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason <clm@meta.com>
+Message-ID: <20251203112552.1738424-5-ajay.kaher@broadcom.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 33cf66d88306663d16e4759e9d24766b0aaa2e17 upstream.
+
+Add a randomized algorithm that runs newidle balancing proportional to
+its success rate.
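+
+For intuition (numbers approximate; see the fair.c hunk below): with a
+recent success rate of about 25%, the tracked ratio is roughly 256 out of
+the 1024-call window, so
+
+  weight = 1 + newidle_ratio                  ~ 257
+  P(run) = P(d1k <= weight)                   ~ 257/1024 ~ 1/4
+  success weight = (1024 + weight/2) / weight ~ 4
+
+i.e. newidle balancing runs on roughly one idle entry in four, and each
+successful pull is counted about four-fold so that the measured success
+rate is not biased by the skipped attempts.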
+
+This improves schbench significantly:
+
+ 6.18-rc4: 2.22 Mrps/s
+ 6.18-rc4+revert: 2.04 Mrps/s
+ 6.18-rc4+revert+random: 2.18 Mrps/s
+
+Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:
+
+ 6.17: -6%
+ 6.17+revert: 0%
+ 6.17+revert+random: -1%
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Chris Mason <clm@meta.com>
+Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com
+Link: https://patch.msgid.link/20251107161739.770122091@infradead.org
+[ Ajay: Modified to apply on v6.1 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/sched/topology.h | 3 ++
+ kernel/sched/core.c | 3 ++
+ kernel/sched/fair.c | 44 +++++++++++++++++++++++++++++++++++++----
+ kernel/sched/features.h | 5 ++++
+ kernel/sched/sched.h | 7 ++++++
+ kernel/sched/topology.c | 6 +++++
+ 6 files changed, 64 insertions(+), 4 deletions(-)
+
+--- a/include/linux/sched/topology.h
++++ b/include/linux/sched/topology.h
+@@ -106,6 +106,9 @@ struct sched_domain {
+ unsigned int nr_balance_failed; /* initialise to 0 */
+
+ /* idle_balance() stats */
++ unsigned int newidle_call;
++ unsigned int newidle_success;
++ unsigned int newidle_ratio;
+ u64 max_newidle_lb_cost;
+ unsigned long last_decay_max_lb_cost;
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -112,6 +112,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_
+ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
++DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
+
+ #ifdef CONFIG_SCHED_DEBUG
+ /*
+@@ -9632,6 +9633,8 @@ void __init sched_init_smp(void)
+ {
+ sched_init_numa(NUMA_NO_NODE);
+
++ prandom_init_once(&sched_rnd_state);
++
+ /*
+ * There's no userspace yet to cause hotplug operations; hence all the
+ * CPU masks are stable and all blatant races in the below code cannot
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -10935,11 +10935,27 @@ void update_max_interval(void)
+ max_load_balance_interval = HZ*num_online_cpus()/10;
+ }
+
+-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
++static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
++{
++ sd->newidle_call++;
++ sd->newidle_success += success;
++
++ if (sd->newidle_call >= 1024) {
++ sd->newidle_ratio = sd->newidle_success;
++ sd->newidle_call /= 2;
++ sd->newidle_success /= 2;
++ }
++}
++
++static inline bool
++update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
+ {
+ unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
+ unsigned long now = jiffies;
+
++ if (cost)
++ update_newidle_stats(sd, success);
++
+ if (cost > sd->max_newidle_lb_cost) {
+ /*
+ * Track max cost of a domain to make sure to not delay the
+@@ -10987,7 +11003,7 @@ static void rebalance_domains(struct rq
+ * Decay the newidle max times here because this is a regular
+ * visit to all the domains.
+ */
+- need_decay = update_newidle_cost(sd, 0);
++ need_decay = update_newidle_cost(sd, 0, 0);
+ max_cost += sd->max_newidle_lb_cost;
+
+ /*
+@@ -11621,6 +11637,22 @@ static int sched_balance_newidle(struct
+ break;
+
+ if (sd->flags & SD_BALANCE_NEWIDLE) {
++ unsigned int weight = 1;
++
++ if (sched_feat(NI_RANDOM)) {
++ /*
++ * Throw a 1k sided dice; and only run
++ * newidle_balance according to the success
++ * rate.
++ */
++ u32 d1k = sched_rng() % 1024;
++ weight = 1 + sd->newidle_ratio;
++ if (d1k > weight) {
++ update_newidle_stats(sd, 0);
++ continue;
++ }
++ weight = (1024 + weight/2) / weight;
++ }
+
+ pulled_task = load_balance(this_cpu, this_rq,
+ sd, CPU_NEWLY_IDLE,
+@@ -11628,10 +11660,14 @@ static int sched_balance_newidle(struct
+
+ t1 = sched_clock_cpu(this_cpu);
+ domain_cost = t1 - t0;
+- update_newidle_cost(sd, domain_cost);
+-
+ curr_cost += domain_cost;
+ t0 = t1;
++
++ /*
++ * Track max cost of a domain to make sure to not delay the
++ * next wakeup on the CPU.
++ */
++ update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
+ }
+
+ /*
+--- a/kernel/sched/features.h
++++ b/kernel/sched/features.h
+@@ -99,5 +99,10 @@ SCHED_FEAT(UTIL_EST_FASTUP, true)
+
+ SCHED_FEAT(LATENCY_WARN, false)
+
++/*
++ * Do newidle balancing proportional to its success rate using randomization.
++ */
++SCHED_FEAT(NI_RANDOM, true)
++
+ SCHED_FEAT(ALT_PERIOD, true)
+ SCHED_FEAT(BASE_SLICE, true)
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -5,6 +5,7 @@
+ #ifndef _KERNEL_SCHED_SCHED_H
+ #define _KERNEL_SCHED_SCHED_H
+
++#include <linux/prandom.h>
+ #include <linux/sched/affinity.h>
+ #include <linux/sched/autogroup.h>
+ #include <linux/sched/cpufreq.h>
+@@ -1190,6 +1191,12 @@ static inline bool is_migration_disabled
+ }
+
+ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
++DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
++
++static inline u32 sched_rng(void)
++{
++ return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
++}
+
+ #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
+ #define this_rq() this_cpu_ptr(&runqueues)
+--- a/kernel/sched/topology.c
++++ b/kernel/sched/topology.c
+@@ -1584,6 +1584,12 @@ sd_init(struct sched_domain_topology_lev
+
+ .last_balance = jiffies,
+ .balance_interval = sd_weight,
++
++ /* 50% success rate */
++ .newidle_call = 512,
++ .newidle_success = 256,
++ .newidle_ratio = 512,
++
+ .max_newidle_lb_cost = 0,
+ .last_decay_max_lb_cost = jiffies,
+ .child = child,
--- /dev/null
+From stable+bounces-198204-greg=kroah.com@vger.kernel.org Wed Dec 3 12:44:16 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed, 3 Dec 2025 11:25:50 +0000
+Subject: sched/fair: Small cleanup to sched_balance_newidle()
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason <clm@meta.com>
+Message-ID: <20251203112552.1738424-3-ajay.kaher@broadcom.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit e78e70dbf603c1425f15f32b455ca148c932f6c1 upstream.
+
+Pull out the !sd check to simplify code.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Chris Mason <clm@meta.com>
+Link: https://patch.msgid.link/20251107161739.525916173@infradead.org
+[ Ajay: Modified to apply on v6.1 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/fair.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -11589,14 +11589,15 @@ static int sched_balance_newidle(struct
+
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);
++ if (!sd) {
++ rcu_read_unlock();
++ goto out;
++ }
+
+ if (!READ_ONCE(this_rq->rd->overload) ||
+- (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
+-
+- if (sd)
+- update_next_balance(sd, &next_balance);
++ this_rq->avg_idle < sd->max_newidle_lb_cost) {
++ update_next_balance(sd, &next_balance);
+ rcu_read_unlock();
+-
+ goto out;
+ }
+ rcu_read_unlock();
--- /dev/null
+From stable+bounces-198205-greg=kroah.com@vger.kernel.org Wed Dec 3 12:44:39 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed, 3 Dec 2025 11:25:51 +0000
+Subject: sched/fair: Small cleanup to update_newidle_cost()
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason <clm@meta.com>
+Message-ID: <20251203112552.1738424-4-ajay.kaher@broadcom.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 08d473dd8718e4a4d698b1113a14a40ad64a909b upstream.
+
+Simplify code by adding a few variables.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Chris Mason <clm@meta.com>
+Link: https://patch.msgid.link/20251107161739.655208666@infradead.org
+[ Ajay: Modified to apply on v6.1 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/fair.c | 11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -10937,22 +10937,25 @@ void update_max_interval(void)
+
+ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+ {
++ unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
++ unsigned long now = jiffies;
++
+ if (cost > sd->max_newidle_lb_cost) {
+ /*
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
+ */
+ sd->max_newidle_lb_cost = cost;
+- sd->last_decay_max_lb_cost = jiffies;
+- } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
++ sd->last_decay_max_lb_cost = now;
++
++ } else if (time_after(now, next_decay)) {
+ /*
+ * Decay the newidle max times by ~1% per second to ensure that
+ * it is not outdated and the current max cost is actually
+ * shorter.
+ */
+ sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
+- sd->last_decay_max_lb_cost = jiffies;
+-
++ sd->last_decay_max_lb_cost = now;
+ return true;
+ }
+
iommu-qcom-index-contexts-by-asid-number-to-allow-asid-0.patch
iommu-qcom-fix-device-leak-on-of_xlate.patch
virtio_console-fix-order-of-fields-cols-and-rows.patch
+kvm-arm64-sys_regs-disable-wuninitialized-const-pointer-warning.patch
+dmaengine-idxd-remove-improper-idxd_free.patch
+x86-mm-pat-clear-vm_pat-if-copy_p4d_range-failed.patch
+x86-mm-pat-fix-vm_pat-handling-when-fork-fails-in-copy_page_range.patch
+mm-mprotect-use-long-for-page-accountings-and-retval.patch
+mm-mprotect-delete-pmd_none_or_clear_bad_unless_trans_huge.patch
+drm-vmwgfx-fix-a-null-ptr-access-in-the-cursor-snooper.patch
+usb-xhci-move-link-chain-bit-quirk-checks-into-one-helper-function.patch
+usb-xhci-apply-the-link-chain-quirk-on-nec-isoc-endpoints.patch
+sched-fair-small-cleanup-to-sched_balance_newidle.patch
+sched-fair-small-cleanup-to-update_newidle_cost.patch
+sched-fair-proportional-newidle-balance.patch
--- /dev/null
+From linux-usb+bounces-32050-greg=kroah.com@vger.kernel.org Thu Jan 8 10:14:16 2026
+From: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Date: Thu, 8 Jan 2026 00:49:27 -0800
+Subject: usb: xhci: Apply the link chain quirk on NEC isoc endpoints
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mathias.nyman@intel.com, linux-usb@vger.kernel.org, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Michal Pecio <michal.pecio@gmail.com>, Mathias Nyman <mathias.nyman@linux.intel.com>, Shivani Agarwal <shivani.agarwal@broadcom.com>
+Message-ID: <20260108084927.671785-3-shivani.agarwal@broadcom.com>
+
+From: Michal Pecio <michal.pecio@gmail.com>
+
+commit bb0ba4cb1065e87f9cc75db1fa454e56d0894d01 upstream.
+
+Two clearly different specimens of NEC uPD720200 (one with start/stop
+bug, one without) were seen to cause IOMMU faults after some Missed
+Service Errors. Faulting address is immediately after a transfer ring
+segment and patched dynamic debug messages revealed that the MSE was
+received when waiting for a TD near the end of that segment:
+
+[ 1.041954] xhci_hcd: Miss service interval error for slot 1 ep 2 expected TD DMA ffa08fe0
+[ 1.042120] xhci_hcd: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0005 address=0xffa09000 flags=0x0000]
+[ 1.042146] xhci_hcd: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0005 address=0xffa09040 flags=0x0000]
+
+It gets even funnier if the next page is a ring segment accessible to
+the HC. Below, it reports MSE in segment at ff1e8000, plows through a
+zero-filled page at ff1e9000 and starts reporting events for TRBs in
+page at ff1ea000 every microframe, instead of jumping to seg ff1e6000.
+
+[ 7.041671] xhci_hcd: Miss service interval error for slot 1 ep 2 expected TD DMA ff1e8fe0
+[ 7.041999] xhci_hcd: Miss service interval error for slot 1 ep 2 expected TD DMA ff1e8fe0
+[ 7.042011] xhci_hcd: WARN: buffer overrun event for slot 1 ep 2 on endpoint
+[ 7.042028] xhci_hcd: All TDs skipped for slot 1 ep 2. Clear skip flag.
+[ 7.042134] xhci_hcd: WARN: buffer overrun event for slot 1 ep 2 on endpoint
+[ 7.042138] xhci_hcd: ERROR Transfer event TRB DMA ptr not part of current TD ep_index 2 comp_code 31
+[ 7.042144] xhci_hcd: Looking for event-dma 00000000ff1ea040 trb-start 00000000ff1e6820 trb-end 00000000ff1e6820
+[ 7.042259] xhci_hcd: WARN: buffer overrun event for slot 1 ep 2 on endpoint
+[ 7.042262] xhci_hcd: ERROR Transfer event TRB DMA ptr not part of current TD ep_index 2 comp_code 31
+[ 7.042266] xhci_hcd: Looking for event-dma 00000000ff1ea050 trb-start 00000000ff1e6820 trb-end 00000000ff1e6820
+
+At some point completion events change from Isoch Buffer Overrun to
+Short Packet and the HC finally finds cycle bit mismatch in ff1ec000.
+
+[ 7.098130] xhci_hcd: ERROR Transfer event TRB DMA ptr not part of current TD ep_index 2 comp_code 13
+[ 7.098132] xhci_hcd: Looking for event-dma 00000000ff1ecc50 trb-start 00000000ff1e6820 trb-end 00000000ff1e6820
+[ 7.098254] xhci_hcd: ERROR Transfer event TRB DMA ptr not part of current TD ep_index 2 comp_code 13
+[ 7.098256] xhci_hcd: Looking for event-dma 00000000ff1ecc60 trb-start 00000000ff1e6820 trb-end 00000000ff1e6820
+[ 7.098379] xhci_hcd: Overrun event on slot 1 ep 2
+
+It's possible that data from the isochronous device were written to
+random buffers of pending TDs on other endpoints (either IN or OUT),
+other devices or even other HCs in the same IOMMU domain.
+
+Lastly, an error from a different USB device on another HC. Was it
+caused by the above? I don't know, but it may have been. The disk
+was working without any other issues and generated PCIe traffic to
+starve the NEC of upstream BW and trigger those MSEs. The two HCs
+shared one x1 slot by means of a commercial "PCIe splitter" board.
+
+[ 7.162604] usb 10-2: reset SuperSpeed USB device number 3 using xhci_hcd
+[ 7.178990] sd 9:0:0:0: [sdb] tag#0 UNKNOWN(0x2003) Result: hostbyte=0x07 driverbyte=DRIVER_OK cmd_age=0s
+[ 7.179001] sd 9:0:0:0: [sdb] tag#0 CDB: opcode=0x28 28 00 04 02 ae 00 00 02 00 00
+[ 7.179004] I/O error, dev sdb, sector 67284480 op 0x0:(READ) flags 0x80700 phys_seg 5 prio class 0
+
+Fortunately, it appears that this ridiculous bug is avoided by setting
+the chain bit of Link TRBs on isochronous rings. Other ancient HCs are
+known which also expect the bit to be set and they ignore Link TRBs if
+it's not. Reportedly, 0.95 spec guaranteed that the bit is set.
+
+The bandwidth-starved NEC HC running a 32KB/uframe UVC endpoint reports
+tens of MSEs per second and runs into the bug within seconds. Chaining
+Link TRBs allows the same workload to run for many minutes, many times.
+
+No negative side effects seen in UVC recording and UAC playback with a
+few devices at full speed, high speed and SuperSpeed.
+
+The problem doesn't reproduce on the newer Renesas uPD720201/uPD720202
+and on old Etron EJ168 and VIA VL805 (but the VL805 has other bug).
+
+[shorten line length of log snippets in commit message -Mathias]
+
+Signed-off-by: Michal Pecio <michal.pecio@gmail.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
+Link: https://lore.kernel.org/r/20250306144954.3507700-14-mathias.nyman@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+[Shivani: Modified to apply on v5.10.y-v6.1.y]
+Signed-off-by: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/usb/host/xhci.h | 13 +++++++++++--
+ 1 file changed, 11 insertions(+), 2 deletions(-)
+
+--- a/drivers/usb/host/xhci.h
++++ b/drivers/usb/host/xhci.h
+@@ -1789,11 +1789,20 @@ static inline void xhci_write_64(struct
+ }
+
+
+-/* Link TRB chain should always be set on 0.95 hosts, and AMD 0.96 ISOC rings */
++/*
++ * Reportedly, some chapters of v0.95 spec said that Link TRB always has its chain bit set.
++ * Other chapters and later specs say that it should only be set if the link is inside a TD
++ * which continues from the end of one segment to the next segment.
++ *
++ * Some 0.95 hardware was found to misbehave if any link TRB doesn't have the chain bit set.
++ *
++ * 0.96 hardware from AMD and NEC was found to ignore unchained isochronous link TRBs when
++ * "resynchronizing the pipe" after a Missed Service Error.
++ */
+ static inline bool xhci_link_chain_quirk(struct xhci_hcd *xhci, enum xhci_ring_type type)
+ {
+ return (xhci->quirks & XHCI_LINK_TRB_QUIRK) ||
+- (type == TYPE_ISOC && (xhci->quirks & XHCI_AMD_0x96_HOST));
++ (type == TYPE_ISOC && (xhci->quirks & (XHCI_AMD_0x96_HOST | XHCI_NEC_HOST)));
+ }
+
+ /* xHCI debugging */
--- /dev/null
+From shivani.agarwal@broadcom.com Thu Jan 8 10:10:31 2026
+From: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Date: Thu, 8 Jan 2026 00:49:26 -0800
+Subject: usb: xhci: move link chain bit quirk checks into one helper function.
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mathias.nyman@intel.com, linux-usb@vger.kernel.org, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Niklas Neronin <niklas.neronin@linux.intel.com>, Mathias Nyman <mathias.nyman@linux.intel.com>, Shivani Agarwal <shivani.agarwal@broadcom.com>
+Message-ID: <20260108084927.671785-2-shivani.agarwal@broadcom.com>
+
+From: Niklas Neronin <niklas.neronin@linux.intel.com>
+
+commit 7476a2215c07703db5e95efaa3fc5b9f957b9417 upstream.
+
+Older 0.95 xHCI hosts and some other specific newer hosts require the
+chain bit to be set for Link TRBs even if the link TRB is not in the
+middle of a transfer descriptor (TD).
+
+Move the checks for all those cases into one xhci_link_chain_quirk()
+function to clean up and avoid code duplication.
+
+No functional changes.
+
+[skip renaming chain_links flag, reword commit message -Mathias]
+
+Signed-off-by: Niklas Neronin <niklas.neronin@linux.intel.com>
+Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
+Link: https://lore.kernel.org/r/20240626124835.1023046-10-mathias.nyman@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+[Shivani: Modified to apply on v5.10.y-v6.1.y]
+Signed-off-by: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/usb/host/xhci-mem.c | 10 ++--------
+ drivers/usb/host/xhci-ring.c | 8 ++------
+ drivers/usb/host/xhci.h | 7 +++++--
+ 3 files changed, 9 insertions(+), 16 deletions(-)
+
+--- a/drivers/usb/host/xhci-mem.c
++++ b/drivers/usb/host/xhci-mem.c
+@@ -133,10 +133,7 @@ static void xhci_link_rings(struct xhci_
+ if (!ring || !first || !last)
+ return;
+
+- /* Set chain bit for 0.95 hosts, and for isoc rings on AMD 0.96 host */
+- chain_links = !!(xhci_link_trb_quirk(xhci) ||
+- (ring->type == TYPE_ISOC &&
+- (xhci->quirks & XHCI_AMD_0x96_HOST)));
++ chain_links = xhci_link_chain_quirk(xhci, ring->type);
+
+ next = ring->enq_seg->next;
+ xhci_link_segments(ring->enq_seg, first, ring->type, chain_links);
+@@ -326,10 +323,7 @@ static int xhci_alloc_segments_for_ring(
+ struct xhci_segment *prev;
+ bool chain_links;
+
+- /* Set chain bit for 0.95 hosts, and for isoc rings on AMD 0.96 host */
+- chain_links = !!(xhci_link_trb_quirk(xhci) ||
+- (type == TYPE_ISOC &&
+- (xhci->quirks & XHCI_AMD_0x96_HOST)));
++ chain_links = xhci_link_chain_quirk(xhci, type);
+
+ prev = xhci_segment_alloc(xhci, cycle_state, max_packet, flags);
+ if (!prev)
+--- a/drivers/usb/host/xhci-ring.c
++++ b/drivers/usb/host/xhci-ring.c
+@@ -250,9 +250,7 @@ static void inc_enq(struct xhci_hcd *xhc
+ * AMD 0.96 host, carry over the chain bit of the previous TRB
+ * (which may mean the chain bit is cleared).
+ */
+- if (!(ring->type == TYPE_ISOC &&
+- (xhci->quirks & XHCI_AMD_0x96_HOST)) &&
+- !xhci_link_trb_quirk(xhci)) {
++ if (!xhci_link_chain_quirk(xhci, ring->type)) {
+ next->link.control &= cpu_to_le32(~TRB_CHAIN);
+ next->link.control |= cpu_to_le32(chain);
+ }
+@@ -3355,9 +3353,7 @@ static int prepare_ring(struct xhci_hcd
+ /* If we're not dealing with 0.95 hardware or isoc rings
+ * on AMD 0.96 host, clear the chain bit.
+ */
+- if (!xhci_link_trb_quirk(xhci) &&
+- !(ep_ring->type == TYPE_ISOC &&
+- (xhci->quirks & XHCI_AMD_0x96_HOST)))
++ if (!xhci_link_chain_quirk(xhci, ep_ring->type))
+ ep_ring->enqueue->link.control &=
+ cpu_to_le32(~TRB_CHAIN);
+ else
+--- a/drivers/usb/host/xhci.h
++++ b/drivers/usb/host/xhci.h
+@@ -1788,9 +1788,12 @@ static inline void xhci_write_64(struct
+ lo_hi_writeq(val, regs);
+ }
+
+-static inline int xhci_link_trb_quirk(struct xhci_hcd *xhci)
++
++/* Link TRB chain should always be set on 0.95 hosts, and AMD 0.96 ISOC rings */
++static inline bool xhci_link_chain_quirk(struct xhci_hcd *xhci, enum xhci_ring_type type)
+ {
+- return xhci->quirks & XHCI_LINK_TRB_QUIRK;
++ return (xhci->quirks & XHCI_LINK_TRB_QUIRK) ||
++ (type == TYPE_ISOC && (xhci->quirks & XHCI_AMD_0x96_HOST));
+ }
+
+ /* xHCI debugging */
--- /dev/null
+From stable+bounces-203368-greg=kroah.com@vger.kernel.org Wed Dec 24 11:43:39 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed, 24 Dec 2025 10:24:31 +0000
+Subject: x86/mm/pat: clear VM_PAT if copy_p4d_range failed
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: dave.hansen@linux.intel.com, luto@kernel.org, peterz@infradead.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, hpa@zytor.com, akpm@linux-foundation.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Ma Wupeng <mawupeng1@huawei.com>, syzbot+5f488e922d047d8f00cc@syzkaller.appspotmail.com, Alexander Ofitserov <oficerovas@altlinux.org>
+Message-ID: <20251224102432.923410-2-ajay.kaher@broadcom.com>
+
+From: Ma Wupeng <mawupeng1@huawei.com>
+
+[ Upstream commit d155df53f31068c3340733d586eb9b3ddfd70fc5 ]
+
+Syzbot reports a warning in untrack_pfn(). Digging into the root cause, we
+found that this is due to a memory allocation failure in pmd_alloc_one(),
+and that this failure is produced by failslab.
+
+In copy_page_range(), the memory allocation for the pmd failed. During the
+error handling in copy_page_range(), mmput() is called to remove all vmas.
+When untrack_pfn() walks this empty pfn mapping, the warning happens.
+
+Here's a simplified flow:
+
+dup_mm
+ dup_mmap
+ copy_page_range
+ copy_p4d_range
+ copy_pud_range
+ copy_pmd_range
+ pmd_alloc
+ __pmd_alloc
+ pmd_alloc_one
+ page = alloc_pages(gfp, 0);
+ if (!page)
+ return NULL;
+ mmput
+ exit_mmap
+ unmap_vmas
+ unmap_single_vma
+ untrack_pfn
+ follow_phys
+ WARN_ON_ONCE(1);
+
+Since this vma was not set up successfully, we can clear the VM_PAT flag.
+In this case, untrack_pfn() will not be called while cleaning up this vma.
+
+Function untrack_pfn_moved() has also been renamed to fit the new logic.
+
+Link: https://lkml.kernel.org/r/20230217025615.1595558-1-mawupeng1@huawei.com
+Signed-off-by: Ma Wupeng <mawupeng1@huawei.com>
+Reported-by: <syzbot+5f488e922d047d8f00cc@syzkaller.appspotmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Alexander Ofitserov <oficerovas@altlinux.org>
+Cc: stable@vger.kernel.org
+[ Ajay: Modified to apply on v6.1 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/mm/pat/memtype.c | 12 ++++++++----
+ include/linux/pgtable.h | 7 ++++---
+ mm/memory.c | 1 +
+ mm/mremap.c | 2 +-
+ 4 files changed, 14 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/mm/pat/memtype.c
++++ b/arch/x86/mm/pat/memtype.c
+@@ -1137,11 +1137,15 @@ void untrack_pfn(struct vm_area_struct *
+ }
+
+ /*
+- * untrack_pfn_moved is called, while mremapping a pfnmap for a new region,
+- * with the old vma after its pfnmap page table has been removed. The new
+- * vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
++ * untrack_pfn_clear is called if the following situation fits:
++ *
++ * 1) while mremapping a pfnmap for a new region, with the old vma after
++ * its pfnmap page table has been removed. The new vma has a new pfnmap
++ * to the same pfn & cache type with VM_PAT set.
++ * 2) while duplicating vm area, the new vma fails to copy the pgtable from
++ * old vma.
+ */
+-void untrack_pfn_moved(struct vm_area_struct *vma)
++void untrack_pfn_clear(struct vm_area_struct *vma)
+ {
+ vma->vm_flags &= ~VM_PAT;
+ }
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -1214,9 +1214,10 @@ static inline void untrack_pfn(struct vm
+ }
+
+ /*
+- * untrack_pfn_moved is called while mremapping a pfnmap for a new region.
++ * untrack_pfn_clear is called while mremapping a pfnmap for a new region
++ * or fails to copy pgtable during duplicate vm area.
+ */
+-static inline void untrack_pfn_moved(struct vm_area_struct *vma)
++static inline void untrack_pfn_clear(struct vm_area_struct *vma)
+ {
+ }
+ #else
+@@ -1228,7 +1229,7 @@ extern void track_pfn_insert(struct vm_a
+ extern int track_pfn_copy(struct vm_area_struct *vma);
+ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
+ unsigned long size);
+-extern void untrack_pfn_moved(struct vm_area_struct *vma);
++extern void untrack_pfn_clear(struct vm_area_struct *vma);
+ #endif
+
+ #ifdef CONFIG_MMU
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -1335,6 +1335,7 @@ copy_page_range(struct vm_area_struct *d
+ continue;
+ if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
+ addr, next))) {
++ untrack_pfn_clear(dst_vma);
+ ret = -ENOMEM;
+ break;
+ }
+--- a/mm/mremap.c
++++ b/mm/mremap.c
+@@ -682,7 +682,7 @@ static unsigned long move_vma(struct vm_
+
+ /* Tell pfnmap has moved from this vma */
+ if (unlikely(vma->vm_flags & VM_PFNMAP))
+- untrack_pfn_moved(vma);
++ untrack_pfn_clear(vma);
+
+ if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
+ /* We always clear VM_LOCKED[ONFAULT] on the old vma */
--- /dev/null
+From stable+bounces-203369-greg=kroah.com@vger.kernel.org Wed Dec 24 11:43:43 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed, 24 Dec 2025 10:24:32 +0000
+Subject: x86/mm/pat: Fix VM_PAT handling when fork() fails in copy_page_range()
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: dave.hansen@linux.intel.com, luto@kernel.org, peterz@infradead.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, hpa@zytor.com, akpm@linux-foundation.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, xingwei lee <xrivendell7@gmail.com>, yuxin wang <wang1315768607@163.com>, Marius Fleischer <fleischermarius@gmail.com>, David Hildenbrand <david@redhat.com>, Ingo Molnar <mingo@kernel.org>, Rik van Riel <riel@surriel.com>, Linus Torvalds <torvalds@linux-foundation.org>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20251224102432.923410-3-ajay.kaher@broadcom.com>
+
+From: David Hildenbrand <david@redhat.com>
+
+[ Upstream commit dc84bc2aba85a1508f04a936f9f9a15f64ebfb31 ]
+
+If track_pfn_copy() fails, we already added the dst VMA to the maple
+tree. As fork() fails, we'll cleanup the maple tree, and stumble over
+the dst VMA for which we neither performed any reservation nor copied
+any page tables.
+
+Consequently untrack_pfn() will see VM_PAT and try obtaining the
+PAT information from the page table -- which fails because the page
+table was not copied.
+
+The easiest fix would be to simply clear the VM_PAT flag of the dst VMA
+if track_pfn_copy() fails. However, the whole thing about "simply"
+clearing the VM_PAT flag is shaky as well: if we passed track_pfn_copy()
+and performed a reservation, but copying the page tables fails, we'll
+simply clear the VM_PAT flag, not properly undoing the reservation ...
+which is also wrong.
+
+So let's fix it properly: set the VM_PAT flag only if the reservation
+succeeded (leaving it clear initially), and undo the reservation if
+anything goes wrong while copying the page tables: clearing the VM_PAT
+flag after undoing the reservation.
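+
+Condensed from the hunks below, the resulting control flow in
+copy_page_range() is roughly the following (the page-table copy loop and
+mmu notifier handling are elided):
+
+  unsigned long pfn;
+
+  if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
+          /* Reserves the range; sets VM_PAT on dst_vma only on success. */
+          ret = track_pfn_copy(dst_vma, src_vma, &pfn);
+          if (ret)
+                  return ret;
+  }
+
+  /* ... copy_p4d_range() loop, may set ret = -ENOMEM and break ... */
+
+  if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
+          untrack_pfn_copy(dst_vma, pfn);  /* undo reservation, clear VM_PAT */
+
+  return ret;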
+
+Note that any copied page table entries will get zapped when the VMA gets
+removed later, after copy_page_range() failed; as VM_PAT is not set then,
+we won't try cleaning VM_PAT up once more and untrack_pfn() will be
+happy. Note that leaving these page tables in place without a reservation
+is not a problem, as we are aborting fork(); this process will never run.
+
+A reproducer can trigger this usually at the first try:
+
+ https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/reproducers/pat_fork.c
+
+ WARNING: CPU: 26 PID: 11650 at arch/x86/mm/pat/memtype.c:983 get_pat_info+0xf6/0x110
+ Modules linked in: ...
+ CPU: 26 UID: 0 PID: 11650 Comm: repro3 Not tainted 6.12.0-rc5+ #92
+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014
+ RIP: 0010:get_pat_info+0xf6/0x110
+ ...
+ Call Trace:
+ <TASK>
+ ...
+ untrack_pfn+0x52/0x110
+ unmap_single_vma+0xa6/0xe0
+ unmap_vmas+0x105/0x1f0
+ exit_mmap+0xf6/0x460
+ __mmput+0x4b/0x120
+ copy_process+0x1bf6/0x2aa0
+ kernel_clone+0xab/0x440
+ __do_sys_clone+0x66/0x90
+ do_syscall_64+0x95/0x180
+
+Likely this case was missed in:
+
+ d155df53f310 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed")
+
+... and instead of undoing the reservation we simply cleared the VM_PAT flag.
+
+Keep the documentation of these functions in include/linux/pgtable.h;
+one place is more than sufficient -- we should clean that up for the other
+functions like track_pfn_remap/untrack_pfn separately.
+
+Fixes: d155df53f310 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed")
+Fixes: 2ab640379a0a ("x86: PAT: hooks in generic vm code to help archs to track pfnmap regions - v3")
+Reported-by: xingwei lee <xrivendell7@gmail.com>
+Reported-by: yuxin wang <wang1315768607@163.com>
+Reported-by: Marius Fleischer <fleischermarius@gmail.com>
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@surriel.com>
+Cc: "H. Peter Anvin" <hpa@zytor.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: linux-mm@kvack.org
+Link: https://lore.kernel.org/r/20250321112323.153741-1-david@redhat.com
+Closes: https://lore.kernel.org/lkml/CABOYnLx_dnqzpCW99G81DmOr+2UzdmZMk=T3uxwNxwz+R1RAwg@mail.gmail.com/
+Closes: https://lore.kernel.org/lkml/CAJg=8jwijTP5fre8woS4JVJQ8iUA6v+iNcsOgtj9Zfpc3obDOQ@mail.gmail.com/
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Cc: stable@vger.kernel.org
+[ Ajay: Modified to apply on v6.1 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/mm/pat/memtype.c | 52 ++++++++++++++++++++++++----------------------
+ include/linux/pgtable.h | 28 +++++++++++++++++++-----
+ kernel/fork.c | 4 +++
+ mm/memory.c | 11 +++------
+ 4 files changed, 58 insertions(+), 37 deletions(-)
+
+--- a/arch/x86/mm/pat/memtype.c
++++ b/arch/x86/mm/pat/memtype.c
+@@ -1029,29 +1029,42 @@ static int get_pat_info(struct vm_area_s
+ return -EINVAL;
+ }
+
+-/*
+- * track_pfn_copy is called when vma that is covering the pfnmap gets
+- * copied through copy_page_range().
+- *
+- * If the vma has a linear pfn mapping for the entire range, we get the prot
+- * from pte and reserve the entire vma range with single reserve_pfn_range call.
+- */
+-int track_pfn_copy(struct vm_area_struct *vma)
++int track_pfn_copy(struct vm_area_struct *dst_vma,
++ struct vm_area_struct *src_vma, unsigned long *pfn)
+ {
++ const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
+ resource_size_t paddr;
+- unsigned long vma_size = vma->vm_end - vma->vm_start;
+ pgprot_t pgprot;
++ int rc;
+
+- if (vma->vm_flags & VM_PAT) {
+- if (get_pat_info(vma, &paddr, &pgprot))
+- return -EINVAL;
+- /* reserve the whole chunk covered by vma. */
+- return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
+- }
++ if (!(src_vma->vm_flags & VM_PAT))
++ return 0;
+
++ /*
++ * Duplicate the PAT information for the dst VMA based on the src
++ * VMA.
++ */
++ if (get_pat_info(src_vma, &paddr, &pgprot))
++ return -EINVAL;
++ rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
++ if (rc)
++ return rc;
++
++ /* Reservation for the destination VMA succeeded. */
++ dst_vma->vm_flags |= VM_PAT;
++ *pfn = PHYS_PFN(paddr);
+ return 0;
+ }
+
++void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
++{
++ untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start);
++ /*
++ * Reservation was freed, any copied page tables will get cleaned
++ * up later, but without getting PAT involved again.
++ */
++}
++
+ /*
+ * prot is passed in as a parameter for the new mapping. If the vma has
+ * a linear pfn mapping for the entire range, or no vma is provided,
+@@ -1136,15 +1149,6 @@ void untrack_pfn(struct vm_area_struct *
+ vma->vm_flags &= ~VM_PAT;
+ }
+
+-/*
+- * untrack_pfn_clear is called if the following situation fits:
+- *
+- * 1) while mremapping a pfnmap for a new region, with the old vma after
+- * its pfnmap page table has been removed. The new vma has a new pfnmap
+- * to the same pfn & cache type with VM_PAT set.
+- * 2) while duplicating vm area, the new vma fails to copy the pgtable from
+- * old vma.
+- */
+ void untrack_pfn_clear(struct vm_area_struct *vma)
+ {
+ vma->vm_flags &= ~VM_PAT;
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -1195,15 +1195,26 @@ static inline void track_pfn_insert(stru
+ }
+
+ /*
+- * track_pfn_copy is called when vma that is covering the pfnmap gets
+- * copied through copy_page_range().
++ * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page
++ * tables copied during copy_page_range(). On success, stores the pfn to be
++ * passed to untrack_pfn_copy().
+ */
+-static inline int track_pfn_copy(struct vm_area_struct *vma)
++static inline int track_pfn_copy(struct vm_area_struct *dst_vma,
++ struct vm_area_struct *src_vma, unsigned long *pfn)
+ {
+ return 0;
+ }
+
+ /*
++ * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during
++ * copy_page_range(), but after track_pfn_copy() was already called.
++ */
++static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma,
++ unsigned long pfn)
++{
++}
++
++/*
+ * untrack_pfn is called while unmapping a pfnmap for a region.
+ * untrack can be called for a specific region indicated by pfn and size or
+ * can be for the entire vma (in which case pfn, size are zero).
+@@ -1214,8 +1225,10 @@ static inline void untrack_pfn(struct vm
+ }
+
+ /*
+- * untrack_pfn_clear is called while mremapping a pfnmap for a new region
+- * or fails to copy pgtable during duplicate vm area.
++ * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA:
++ *
++ * 1) During mremap() on the src VMA after the page tables were moved.
++ * 2) During fork() on the dst VMA, immediately after duplicating the src VMA.
+ */
+ static inline void untrack_pfn_clear(struct vm_area_struct *vma)
+ {
+@@ -1226,7 +1239,10 @@ extern int track_pfn_remap(struct vm_are
+ unsigned long size);
+ extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
+ pfn_t pfn);
+-extern int track_pfn_copy(struct vm_area_struct *vma);
++extern int track_pfn_copy(struct vm_area_struct *dst_vma,
++ struct vm_area_struct *src_vma, unsigned long *pfn);
++extern void untrack_pfn_copy(struct vm_area_struct *dst_vma,
++ unsigned long pfn);
+ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
+ unsigned long size);
+ extern void untrack_pfn_clear(struct vm_area_struct *vma);
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -476,6 +476,10 @@ struct vm_area_struct *vm_area_dup(struc
+ *new = data_race(*orig);
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ dup_anon_vma_name(orig, new);
++
++ /* track_pfn_copy() will later take care of copying internal state. */
++ if (unlikely(new->vm_flags & VM_PFNMAP))
++ untrack_pfn_clear(new);
+ }
+ return new;
+ }
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -1278,12 +1278,12 @@ int
+ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
+ {
+ pgd_t *src_pgd, *dst_pgd;
+- unsigned long next;
+ unsigned long addr = src_vma->vm_start;
+ unsigned long end = src_vma->vm_end;
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
+ struct mmu_notifier_range range;
++ unsigned long next, pfn;
+ bool is_cow;
+ int ret;
+
+@@ -1294,11 +1294,7 @@ copy_page_range(struct vm_area_struct *d
+ return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
+
+ if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
+- /*
+- * We do not free on error cases below as remove_vma
+- * gets called on error from higher level routine
+- */
+- ret = track_pfn_copy(src_vma);
++ ret = track_pfn_copy(dst_vma, src_vma, &pfn);
+ if (ret)
+ return ret;
+ }
+@@ -1335,7 +1331,6 @@ copy_page_range(struct vm_area_struct *d
+ continue;
+ if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
+ addr, next))) {
+- untrack_pfn_clear(dst_vma);
+ ret = -ENOMEM;
+ break;
+ }
+@@ -1345,6 +1340,8 @@ copy_page_range(struct vm_area_struct *d
+ raw_write_seqcount_end(&src_mm->write_protect_seq);
+ mmu_notifier_invalidate_range_end(&range);
+ }
++ if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
++ untrack_pfn_copy(dst_vma, pfn);
+ return ret;
+ }
+