git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.1-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Thu, 8 Jan 2026 13:36:36 +0000 (14:36 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Thu, 8 Jan 2026 13:36:36 +0000 (14:36 +0100)
added patches:
dmaengine-idxd-remove-improper-idxd_free.patch
drm-vmwgfx-fix-a-null-ptr-access-in-the-cursor-snooper.patch
kvm-arm64-sys_regs-disable-wuninitialized-const-pointer-warning.patch
mm-mprotect-delete-pmd_none_or_clear_bad_unless_trans_huge.patch
mm-mprotect-use-long-for-page-accountings-and-retval.patch
sched-fair-proportional-newidle-balance.patch
sched-fair-small-cleanup-to-sched_balance_newidle.patch
sched-fair-small-cleanup-to-update_newidle_cost.patch
usb-xhci-apply-the-link-chain-quirk-on-nec-isoc-endpoints.patch
usb-xhci-move-link-chain-bit-quirk-checks-into-one-helper-function.patch
x86-mm-pat-clear-vm_pat-if-copy_p4d_range-failed.patch
x86-mm-pat-fix-vm_pat-handling-when-fork-fails-in-copy_page_range.patch

13 files changed:
queue-6.1/dmaengine-idxd-remove-improper-idxd_free.patch [new file with mode: 0644]
queue-6.1/drm-vmwgfx-fix-a-null-ptr-access-in-the-cursor-snooper.patch [new file with mode: 0644]
queue-6.1/kvm-arm64-sys_regs-disable-wuninitialized-const-pointer-warning.patch [new file with mode: 0644]
queue-6.1/mm-mprotect-delete-pmd_none_or_clear_bad_unless_trans_huge.patch [new file with mode: 0644]
queue-6.1/mm-mprotect-use-long-for-page-accountings-and-retval.patch [new file with mode: 0644]
queue-6.1/sched-fair-proportional-newidle-balance.patch [new file with mode: 0644]
queue-6.1/sched-fair-small-cleanup-to-sched_balance_newidle.patch [new file with mode: 0644]
queue-6.1/sched-fair-small-cleanup-to-update_newidle_cost.patch [new file with mode: 0644]
queue-6.1/series
queue-6.1/usb-xhci-apply-the-link-chain-quirk-on-nec-isoc-endpoints.patch [new file with mode: 0644]
queue-6.1/usb-xhci-move-link-chain-bit-quirk-checks-into-one-helper-function.patch [new file with mode: 0644]
queue-6.1/x86-mm-pat-clear-vm_pat-if-copy_p4d_range-failed.patch [new file with mode: 0644]
queue-6.1/x86-mm-pat-fix-vm_pat-handling-when-fork-fails-in-copy_page_range.patch [new file with mode: 0644]

diff --git a/queue-6.1/dmaengine-idxd-remove-improper-idxd_free.patch b/queue-6.1/dmaengine-idxd-remove-improper-idxd_free.patch
new file mode 100644 (file)
index 0000000..1f2de4d
--- /dev/null
@@ -0,0 +1,71 @@
+From stable+bounces-201132-greg=kroah.com@vger.kernel.org Tue Dec 16 04:16:52 2025
+From: lanbincn@139.com
+Date: Tue, 16 Dec 2025 03:13:13 +0000
+Subject: dmaengine: idxd: Remove improper idxd_free
+To: stable@vger.kernel.org
+Cc: Yi Sun <yi.sun@intel.com>, Shuai Xue <xueshuai@linux.alibaba.com>, Dave Jiang <dave.jiang@intel.com>, Vinicius Costa Gomes <vinicius.gomes@intel.com>, Vinod Koul <vkoul@kernel.org>, Bin Lan <lanbincn@139.com>
+Message-ID: <20251216031313.4853-1-lanbincn@139.com>
+
+From: Yi Sun <yi.sun@intel.com>
+
+[ Upstream commit f41c538881eec4dcf5961a242097d447f848cda6 ]
+
+The call to idxd_free() introduces a duplicate put_device() leading to a
+reference count underflow:
+refcount_t: underflow; use-after-free.
+WARNING: CPU: 15 PID: 4428 at lib/refcount.c:28 refcount_warn_saturate+0xbe/0x110
+...
+Call Trace:
+ <TASK>
+  idxd_remove+0xe4/0x120 [idxd]
+  pci_device_remove+0x3f/0xb0
+  device_release_driver_internal+0x197/0x200
+  driver_detach+0x48/0x90
+  bus_remove_driver+0x74/0xf0
+  pci_unregister_driver+0x2e/0xb0
+  idxd_exit_module+0x34/0x7a0 [idxd]
+  __do_sys_delete_module.constprop.0+0x183/0x280
+  do_syscall_64+0x54/0xd70
+  entry_SYSCALL_64_after_hwframe+0x76/0x7e
+
+The idxd_unregister_devices(), which is invoked at the very beginning of
+idxd_remove(), already takes care of the necessary put_device() through the
+following call path:
+idxd_unregister_devices() -> device_unregister() -> put_device()
+
+In addition, when CONFIG_DEBUG_KOBJECT_RELEASE is enabled, put_device() may
+trigger asynchronous cleanup via schedule_delayed_work(). If idxd_free() is
+called immediately after, it can result in a use-after-free.
+
+Remove the improper idxd_free() to avoid both the refcount underflow and
+potential memory corruption during module unload.
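+
+A minimal userspace model of the double put (illustrative only; struct obj
+and put() below are made up for the example and are not kernel code):
+
+    #include <assert.h>
+    #include <stdio.h>
+
+    struct obj { int refcount; };
+
+    static void put(struct obj *o)
+    {
+            assert(o->refcount > 0);  /* refcount_t would warn: underflow */
+            if (--o->refcount == 0)
+                    printf("released\n");
+    }
+
+    int main(void)
+    {
+            struct obj dev = { .refcount = 1 };
+
+            put(&dev);  /* device_unregister() path drops the last reference */
+            put(&dev);  /* a second, idxd_free()-style put underflows */
+            return 0;
+    }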
+
+Fixes: d5449ff1b04d ("dmaengine: idxd: Add missing idxd cleanup to fix memory leak in remove call")
+Signed-off-by: Yi Sun <yi.sun@intel.com>
+Tested-by: Shuai Xue <xueshuai@linux.alibaba.com>
+Reviewed-by: Dave Jiang <dave.jiang@intel.com>
+Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+
+Link: https://lore.kernel.org/r/20250729150313.1934101-2-yi.sun@intel.com
+Signed-off-by: Vinod Koul <vkoul@kernel.org>
+[ Slightly adjust the context. ]
+Signed-off-by: Bin Lan <lanbincn@139.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+Without this patch, this issue can be reproduced in Linux-6.1.y 
+when the idxd module is removed.
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/dma/idxd/init.c |    1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/drivers/dma/idxd/init.c
++++ b/drivers/dma/idxd/init.c
+@@ -816,7 +816,6 @@ static void idxd_remove(struct pci_dev *
+       destroy_workqueue(idxd->wq);
+       perfmon_pmu_remove(idxd);
+       put_device(idxd_confdev(idxd));
+-      idxd_free(idxd);
+ }
+ static struct pci_driver idxd_pci_driver = {
diff --git a/queue-6.1/drm-vmwgfx-fix-a-null-ptr-access-in-the-cursor-snooper.patch b/queue-6.1/drm-vmwgfx-fix-a-null-ptr-access-in-the-cursor-snooper.patch
new file mode 100644 (file)
index 0000000..e55395d
--- /dev/null
@@ -0,0 +1,80 @@
+From stable+bounces-203358-greg=kroah.com@vger.kernel.org Wed Dec 24 09:57:13 2025
+From: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Date: Wed, 24 Dec 2025 00:36:52 -0800
+Subject: drm/vmwgfx: Fix a null-ptr access in the cursor snooper
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: zack.rusin@broadcom.com, bcm-kernel-feedback-list@broadcom.com, maarten.lankhorst@linux.intel.com, mripard@kernel.org, tzimmermann@suse.de, simona@ffwll.ch, airlied@gmail.com, brianp@vmware.com, dtor@vmware.com, airlied@redhat.com, thellstrom@vmware.com, dri-devel@lists.freedesktop.org, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Kuzey Arda Bulut <kuzeyardabulut@gmail.com>, Ian Forbes <ian.forbes@broadcom.com>, Sasha Levin <sashal@kernel.org>, Shivani Agarwal <shivani.agarwal@broadcom.com>
+Message-ID: <20251224083652.614902-1-shivani.agarwal@broadcom.com>
+
+From: Zack Rusin <zack.rusin@broadcom.com>
+
+[ Upstream commit 5ac2c0279053a2c5265d46903432fb26ae2d0da2 ]
+
+Check that the resource which is converted to a surface exists before
+trying to use the cursor snooper on it.
+
+vmw_cmd_res_check() allows explicitly invalid (SVGA3D_INVALID_ID) identifiers
+because some svga commands accept SVGA3D_INVALID_ID to mean "no surface".
+Unfortunately, functions that take the actual surface object might not be
+able to handle a null object (and the cursor snooper cannot). Make sure that
+we validate not only the identifier (via vmw_cmd_res_check()) but also check
+that the actual resource exists before trying to do something with it.
+
+Fixes unchecked null-ptr reference in the snooping code.
+
+Signed-off-by: Zack Rusin <zack.rusin@broadcom.com>
+Fixes: c0951b797e7d ("drm/vmwgfx: Refactor resource management")
+Reported-by: Kuzey Arda Bulut <kuzeyardabulut@gmail.com>
+Cc: Broadcom internal kernel review list <bcm-kernel-feedback-list@broadcom.com>
+Cc: dri-devel@lists.freedesktop.org
+Reviewed-by: Ian Forbes <ian.forbes@broadcom.com>
+Link: https://lore.kernel.org/r/20250917153655.1968583-1-zack.rusin@broadcom.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+[Shivani: Modified to apply on v5.10.y-v6.1.y]
+Signed-off-by: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c |   17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
+@@ -1507,6 +1507,7 @@ static int vmw_cmd_dma(struct vmw_privat
+                      SVGA3dCmdHeader *header)
+ {
+       struct vmw_buffer_object *vmw_bo = NULL;
++      struct vmw_resource *res;
+       struct vmw_surface *srf = NULL;
+       VMW_DECLARE_CMD_VAR(*cmd, SVGA3dCmdSurfaceDMA);
+       int ret;
+@@ -1542,18 +1543,24 @@ static int vmw_cmd_dma(struct vmw_privat
+       dirty = (cmd->body.transfer == SVGA3D_WRITE_HOST_VRAM) ?
+               VMW_RES_DIRTY_SET : 0;
+-      ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface,
+-                              dirty, user_surface_converter,
+-                              &cmd->body.host.sid, NULL);
++      ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface, dirty,
++                              user_surface_converter, &cmd->body.host.sid,
++                              NULL);
+       if (unlikely(ret != 0)) {
+               if (unlikely(ret != -ERESTARTSYS))
+                       VMW_DEBUG_USER("could not find surface for DMA.\n");
+               return ret;
+       }
+-      srf = vmw_res_to_srf(sw_context->res_cache[vmw_res_surface].res);
++      res = sw_context->res_cache[vmw_res_surface].res;
++      if (!res) {
++              VMW_DEBUG_USER("Invalid DMA surface.\n");
++              return -EINVAL;
++      }
+-      vmw_kms_cursor_snoop(srf, sw_context->fp->tfile, &vmw_bo->base, header);
++      srf = vmw_res_to_srf(res);
++      vmw_kms_cursor_snoop(srf, sw_context->fp->tfile, &vmw_bo->base,
++                           header);
+       return 0;
+ }
diff --git a/queue-6.1/kvm-arm64-sys_regs-disable-wuninitialized-const-pointer-warning.patch b/queue-6.1/kvm-arm64-sys_regs-disable-wuninitialized-const-pointer-warning.patch
new file mode 100644 (file)
index 0000000..b5f36d0
--- /dev/null
@@ -0,0 +1,47 @@
+From justinstitt@google.com  Thu Jan  8 14:27:57 2026
+From: Justin Stitt <justinstitt@google.com>
+Date: Fri, 05 Dec 2025 14:51:41 -0800
+Subject: KVM: arm64: sys_regs: disable -Wuninitialized-const-pointer warning
+To: Marc Zyngier <maz@kernel.org>, Oliver Upton <oliver.upton@linux.dev>,  Alexandru Elisei <alexandru.elisei@arm.com>, Joey Gouly <joey.gouly@arm.com>,  Suzuki K Poulose <suzuki.poulose@arm.com>, Catalin Marinas <catalin.marinas@arm.com>,  Zenghui Yu <yuzenghui@huawei.com>, Will Deacon <will@kernel.org>,  Nathan Chancellor <nathan@kernel.org>, Christopher Covington <cov@codeaurora.org>
+Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.cs.columbia.edu,  linux-kernel@vger.kernel.org, llvm@lists.linux.dev, stable@vger.kernel.org,  Justin Stitt <justinstitt@google.com>
+Message-ID: <20251205-stable-disable-unit-ptr-warn-v2-1-cec53a8f736b@google.com>
+
+From: Justin Stitt <justinstitt@google.com>
+
+A new warning in Clang 22 [1] complains that @clidr passed to
+get_clidr_el1() is an uninitialized const pointer. get_clidr_el1()
+doesn't really care since it casts away the const-ness anyway -- it is
+a false positive.
+
+This patch isn't needed for anything past 6.1 as this code section was
+reworked in Commit 7af0c2534f4c ("KVM: arm64: Normalize cache
+configuration") which incidentally removed the aforementioned warning.
+Since there is no upstream equivalent, this patch just needs to be
+applied to 6.1.
+
+Disable this warning for sys_regs.o instead of backporting the patches
+from 6.2+ that modified this code area.
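+
+A reduced example of the pattern the warning fires on (illustrative only;
+the names below are made up and are not the actual KVM code):
+
+    /* callee casts away the const and initializes the pointee */
+    static void init_val(const unsigned long *p)
+    {
+            *(unsigned long *)p = 42;
+    }
+
+    unsigned long get_val(void)
+    {
+            unsigned long v;   /* uninitialized at the call site */
+
+            init_val(&v);      /* Clang 22 warns: uninitialized const pointer */
+            return v;          /* fine in practice: init_val() wrote through it */
+    }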
+
+Cc: stable@vger.kernel.org
+Fixes: 7c8c5e6a9101e ("arm64: KVM: system register handling")
+Link: https://github.com/llvm/llvm-project/commit/00dacf8c22f065cb52efb14cd091d441f19b319e [1]
+Reviewed-by: Nathan Chancellor <nathan@kernel.org>
+Signed-off-by: Justin Stitt <justinstitt@google.com>
+Reviewed-by: Tiffany Yang <ynaffit@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/Makefile |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/arm64/kvm/Makefile
++++ b/arch/arm64/kvm/Makefile
+@@ -24,6 +24,9 @@ kvm-y += arm.o mmu.o mmio.o psci.o hyper
+ kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu.o
++# Work around a false positive Clang 22 -Wuninitialized-const-pointer warning
++CFLAGS_sys_regs.o := $(call cc-disable-warning, uninitialized-const-pointer)
++
+ always-y := hyp_constants.h hyp-constants.s
+ define rule_gen_hyp_constants
diff --git a/queue-6.1/mm-mprotect-delete-pmd_none_or_clear_bad_unless_trans_huge.patch b/queue-6.1/mm-mprotect-delete-pmd_none_or_clear_bad_unless_trans_huge.patch
new file mode 100644 (file)
index 0000000..472788c
--- /dev/null
@@ -0,0 +1,283 @@
+From stable+bounces-205076-greg=kroah.com@vger.kernel.org Tue Jan  6 12:49:13 2026
+From: Harry Yoo <harry.yoo@oracle.com>
+Date: Tue,  6 Jan 2026 20:47:14 +0900
+Subject: mm/mprotect: delete pmd_none_or_clear_bad_unless_trans_huge()
+To: stable@vger.kernel.org
+Cc: Liam.Howlett@oracle.com, akpm@linux-foundation.org, baohua@kernel.org, baolin.wang@linux.alibaba.com, david@kernel.org, dev.jain@arm.com, hughd@google.com, jane.chu@oracle.com, jannh@google.com, kas@kernel.org, lance.yang@linux.dev, linux-mm@kvack.org, lorenzo.stoakes@oracle.com, npache@redhat.com, pfalcato@suse.de, ryan.roberts@arm.com, vbabka@suse.cz, ziy@nvidia.com, "Alistair Popple" <apopple@nvidia.com>, "Anshuman Khandual" <anshuman.khandual@arm.com>, "Axel Rasmussen" <axelrasmussen@google.com>, "Christophe Leroy" <christophe.leroy@csgroup.eu>, "Christoph Hellwig" <hch@infradead.org>, "David Hildenbrand" <david@redhat.com>, "Huang, Ying" <ying.huang@intel.com>, "Ira Weiny" <ira.weiny@intel.com>, "Jason Gunthorpe" <jgg@ziepe.ca>, "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>, "Lorenzo Stoakes" <lstoakes@gmail.com>, "Matthew Wilcox" <willy@infradead.org>, "Mel Gorman" <mgorman@techsingularity.net>, "Miaohe Lin" <linmiaohe@huawei.com>, "Mike Kravetz" <mike.kravetz@oracle.com>, "Mike Rapoport" <rppt@kernel.org>, "Minchan Kim" <minchan@kernel.org>, "Naoya Horiguchi" <naoya.horiguchi@nec.com>, "Pavel Tatashin" <pasha.tatashin@soleen.com>, "Peter Xu" <peterx@redhat.com>, "Peter Zijlstra" <peterz@infradead.org>, "Qi Zheng" <zhengqi.arch@bytedance.com>, "Ralph Campbell" <rcampbell@nvidia.com>, "SeongJae Park" <sj@kernel.org>, "Song Liu" <song@kernel.org>, "Steven Price" <steven.price@arm.com>, "Suren Baghdasaryan" <surenb@google.com>, "Thomas Hellström" <thomas.hellstrom@linux.intel.com>, "Will Deacon" <will@kernel.org>, "Yang Shi" <shy828301@gmail.com>, "Yu Zhao" <yuzhao@google.com>, "Zack Rusin" <zackr@vmware.com>, "Harry Yoo" <harry.yoo@oracle.com>
+Message-ID: <20260106114715.80958-3-harry.yoo@oracle.com>
+
+From: Hugh Dickins <hughd@google.com>
+
+commit 670ddd8cdcbd1d07a4571266ae3517f821728c3a upstream.
+
+change_pmd_range() had special pmd_none_or_clear_bad_unless_trans_huge(),
+required to avoid "bad" choices when setting automatic NUMA hinting under
+mmap_read_lock(); but most of that is already covered in pte_offset_map()
+now.  change_pmd_range() just wants a pmd_none() check before wasting time
+on MMU notifiers, then checks on the read-once _pmd value to work out
+what's needed for huge cases.  If change_pte_range() returns -EAGAIN to
+retry if pte_offset_map_lock() fails, nothing more special is needed.
+
+Link: https://lkml.kernel.org/r/725a42a9-91e9-c868-925-e3a5fd40bb4f@google.com
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Anshuman Khandual <anshuman.khandual@arm.com>
+Cc: Axel Rasmussen <axelrasmussen@google.com>
+Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
+Cc: Christoph Hellwig <hch@infradead.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Ira Weiny <ira.weiny@intel.com>
+Cc: Jason Gunthorpe <jgg@ziepe.ca>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Lorenzo Stoakes <lstoakes@gmail.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Mike Rapoport (IBM) <rppt@kernel.org>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Qi Zheng <zhengqi.arch@bytedance.com>
+Cc: Ralph Campbell <rcampbell@nvidia.com>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: SeongJae Park <sj@kernel.org>
+Cc: Song Liu <song@kernel.org>
+Cc: Steven Price <steven.price@arm.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Cc: Zack Rusin <zackr@vmware.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+[ Background: It was reported that a bad pmd is seen when automatic NUMA
+  balancing is marking page table entries as prot_numa:
+
+      [2437548.196018] mm/pgtable-generic.c:50: bad pmd 00000000af22fc02(dffffffe71fbfe02)
+      [2437548.235022] Call Trace:
+      [2437548.238234]  <TASK>
+      [2437548.241060]  dump_stack_lvl+0x46/0x61
+      [2437548.245689]  panic+0x106/0x2e5
+      [2437548.249497]  pmd_clear_bad+0x3c/0x3c
+      [2437548.253967]  change_pmd_range.isra.0+0x34d/0x3a7
+      [2437548.259537]  change_p4d_range+0x156/0x20e
+      [2437548.264392]  change_protection_range+0x116/0x1a9
+      [2437548.269976]  change_prot_numa+0x15/0x37
+      [2437548.274774]  task_numa_work+0x1b8/0x302
+      [2437548.279512]  task_work_run+0x62/0x95
+      [2437548.283882]  exit_to_user_mode_loop+0x1a4/0x1a9
+      [2437548.289277]  exit_to_user_mode_prepare+0xf4/0xfc
+      [2437548.294751]  ? sysvec_apic_timer_interrupt+0x34/0x81
+      [2437548.300677]  irqentry_exit_to_user_mode+0x5/0x25
+      [2437548.306153]  asm_sysvec_apic_timer_interrupt+0x16/0x1b
+
+    This is due to a race condition between change_prot_numa() and
+    THP migration because the kernel doesn't check is_swap_pmd() and
+    pmd_trans_huge() atomically:
+
+    change_prot_numa()                      THP migration
+    ======================================================================
+    - change_pmd_range()
+    -> is_swap_pmd() returns false,
+    meaning it's not a PMD migration
+    entry.
+                                      - do_huge_pmd_numa_page()
+                                      -> migrate_misplaced_page() sets
+                                         migration entries for the THP.
+    - change_pmd_range()
+    -> pmd_none_or_clear_bad_unless_trans_huge()
+    -> pmd_none() and pmd_trans_huge() returns false
+    - pmd_none_or_clear_bad_unless_trans_huge()
+    -> pmd_bad() returns true for the migration entry!
+
+  The upstream commit 670ddd8cdcbd ("mm/mprotect: delete
+  pmd_none_or_clear_bad_unless_trans_huge()") closes this race condition
+  by checking is_swap_pmd() and pmd_trans_huge() atomically.
+
+  Backporting note:
+    Unlike the mainline, pte_offset_map_lock() does not check whether the pmd
+    entry is a migration entry or a hugepage; it acquires the PTL
+    unconditionally instead of returning failure. Therefore, it is necessary
+    to keep the !is_swap_pmd() && !pmd_trans_huge() && !pmd_devmap() check
+    before acquiring the PTL.
+
+    After acquiring the lock, open-code the semantics of
+    pte_offset_map_lock() in the mainline kernel: change_pte_range() fails
+    if the pmd value has changed. This requires adding a pmd_old parameter
+    (the pmd_t value read before calling the function) to
+    change_pte_range(). ]
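+
+A runnable userspace model of the snapshot/lock/recheck retry this backport
+adds (illustrative only; the atomic variable stands in for the pmd entry and
+-1 plays the role of -EAGAIN):
+
+    #include <stdatomic.h>
+    #include <stdio.h>
+
+    static _Atomic unsigned long pmd;
+
+    /* in the kernel this comparison runs under the PTE lock */
+    static int change_pte_range_model(unsigned long snapshot)
+    {
+            return atomic_load(&pmd) == snapshot ? 0 : -1;
+    }
+
+    int main(void)
+    {
+            unsigned long snap;
+            int tries = 0;
+
+            atomic_store(&pmd, 1);
+            do {
+                    snap = atomic_load(&pmd);       /* read-once, like pmd_read_atomic() */
+                    if (tries++ == 0)
+                            atomic_store(&pmd, 2);  /* simulate a racing THP migration */
+            } while (change_pte_range_model(snap) < 0);
+
+            printf("succeeded after %d tries\n", tries);
+            return 0;
+    }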
+
+Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
+Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/mprotect.c |  101 ++++++++++++++++++++++++----------------------------------
+ 1 file changed, 43 insertions(+), 58 deletions(-)
+
+--- a/mm/mprotect.c
++++ b/mm/mprotect.c
+@@ -73,10 +73,12 @@ static inline bool can_change_pte_writab
+ }
+ static long change_pte_range(struct mmu_gather *tlb,
+-              struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
+-              unsigned long end, pgprot_t newprot, unsigned long cp_flags)
++              struct vm_area_struct *vma, pmd_t *pmd, pmd_t pmd_old,
++              unsigned long addr, unsigned long end, pgprot_t newprot,
++              unsigned long cp_flags)
+ {
+       pte_t *pte, oldpte;
++      pmd_t _pmd;
+       spinlock_t *ptl;
+       long pages = 0;
+       int target_node = NUMA_NO_NODE;
+@@ -86,21 +88,15 @@ static long change_pte_range(struct mmu_
+       tlb_change_page_size(tlb, PAGE_SIZE);
+-      /*
+-       * Can be called with only the mmap_lock for reading by
+-       * prot_numa so we must check the pmd isn't constantly
+-       * changing from under us from pmd_none to pmd_trans_huge
+-       * and/or the other way around.
+-       */
+-      if (pmd_trans_unstable(pmd))
+-              return 0;
+-
+-      /*
+-       * The pmd points to a regular pte so the pmd can't change
+-       * from under us even if the mmap_lock is only hold for
+-       * reading.
+-       */
+       pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
++      /* Make sure pmd didn't change after acquiring ptl */
++      _pmd = pmd_read_atomic(pmd);
++      /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
++      barrier();
++      if (!pmd_same(pmd_old, _pmd)) {
++              pte_unmap_unlock(pte, ptl);
++              return -EAGAIN;
++      }
+       /* Get target node for single threaded private VMAs */
+       if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
+@@ -288,31 +284,6 @@ static long change_pte_range(struct mmu_
+       return pages;
+ }
+-/*
+- * Used when setting automatic NUMA hinting protection where it is
+- * critical that a numa hinting PMD is not confused with a bad PMD.
+- */
+-static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
+-{
+-      pmd_t pmdval = pmd_read_atomic(pmd);
+-
+-      /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
+-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+-      barrier();
+-#endif
+-
+-      if (pmd_none(pmdval))
+-              return 1;
+-      if (pmd_trans_huge(pmdval))
+-              return 0;
+-      if (unlikely(pmd_bad(pmdval))) {
+-              pmd_clear_bad(pmd);
+-              return 1;
+-      }
+-
+-      return 0;
+-}
+-
+ /* Return true if we're uffd wr-protecting file-backed memory, or false */
+ static inline bool
+ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
+@@ -360,22 +331,34 @@ static inline long change_pmd_range(stru
+       pmd = pmd_offset(pud, addr);
+       do {
+-              long this_pages;
+-
++              long ret;
++              pmd_t _pmd;
++again:
+               next = pmd_addr_end(addr, end);
++              _pmd = pmd_read_atomic(pmd);
++              /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++              barrier();
++#endif
+               change_pmd_prepare(vma, pmd, cp_flags);
+               /*
+                * Automatic NUMA balancing walks the tables with mmap_lock
+                * held for read. It's possible a parallel update to occur
+-               * between pmd_trans_huge() and a pmd_none_or_clear_bad()
+-               * check leading to a false positive and clearing.
+-               * Hence, it's necessary to atomically read the PMD value
+-               * for all the checks.
++               * between pmd_trans_huge(), is_swap_pmd(), and
++               * a pmd_none_or_clear_bad() check leading to a false positive
++               * and clearing. Hence, it's necessary to atomically read
++               * the PMD value for all the checks.
+                */
+-              if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
+-                   pmd_none_or_clear_bad_unless_trans_huge(pmd))
+-                      goto next;
++              if (!is_swap_pmd(_pmd) && !pmd_devmap(_pmd) && !pmd_trans_huge(_pmd)) {
++                      if (pmd_none(_pmd))
++                              goto next;
++
++                      if (pmd_bad(_pmd)) {
++                              pmd_clear_bad(pmd);
++                              goto next;
++                      }
++              }
+               /* invoke the mmu notifier if the pmd is populated */
+               if (!range.start) {
+@@ -385,7 +368,7 @@ static inline long change_pmd_range(stru
+                       mmu_notifier_invalidate_range_start(&range);
+               }
+-              if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
++              if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
+                       if ((next - addr != HPAGE_PMD_SIZE) ||
+                           uffd_wp_protect_file(vma, cp_flags)) {
+                               __split_huge_pmd(vma, pmd, addr, false, NULL);
+@@ -400,11 +383,11 @@ static inline long change_pmd_range(stru
+                                * change_huge_pmd() does not defer TLB flushes,
+                                * so no need to propagate the tlb argument.
+                                */
+-                              int nr_ptes = change_huge_pmd(tlb, vma, pmd,
+-                                              addr, newprot, cp_flags);
++                              ret = change_huge_pmd(tlb, vma, pmd,
++                                                    addr, newprot, cp_flags);
+-                              if (nr_ptes) {
+-                                      if (nr_ptes == HPAGE_PMD_NR) {
++                              if (ret) {
++                                      if (ret == HPAGE_PMD_NR) {
+                                               pages += HPAGE_PMD_NR;
+                                               nr_huge_updates++;
+                                       }
+@@ -415,9 +398,11 @@ static inline long change_pmd_range(stru
+                       }
+                       /* fall through, the trans huge pmd just split */
+               }
+-              this_pages = change_pte_range(tlb, vma, pmd, addr, next,
+-                                            newprot, cp_flags);
+-              pages += this_pages;
++              ret = change_pte_range(tlb, vma, pmd, _pmd, addr, next,
++                                     newprot, cp_flags);
++              if (ret < 0)
++                      goto again;
++              pages += ret;
+ next:
+               cond_resched();
+       } while (pmd++, addr = next, addr != end);
diff --git a/queue-6.1/mm-mprotect-use-long-for-page-accountings-and-retval.patch b/queue-6.1/mm-mprotect-use-long-for-page-accountings-and-retval.patch
new file mode 100644 (file)
index 0000000..38eb326
--- /dev/null
@@ -0,0 +1,223 @@
+From stable+bounces-205075-greg=kroah.com@vger.kernel.org Tue Jan  6 12:48:24 2026
+From: Harry Yoo <harry.yoo@oracle.com>
+Date: Tue,  6 Jan 2026 20:47:13 +0900
+Subject: mm/mprotect: use long for page accountings and retval
+To: stable@vger.kernel.org
+Cc: Liam.Howlett@oracle.com, akpm@linux-foundation.org, baohua@kernel.org, baolin.wang@linux.alibaba.com, david@kernel.org, dev.jain@arm.com, hughd@google.com, jane.chu@oracle.com, jannh@google.com, kas@kernel.org, lance.yang@linux.dev, linux-mm@kvack.org, lorenzo.stoakes@oracle.com, npache@redhat.com, pfalcato@suse.de, ryan.roberts@arm.com, vbabka@suse.cz, ziy@nvidia.com, Peter Xu <peterx@redhat.com>, Mike Kravetz <mike.kravetz@oracle.com>, James Houghton <jthoughton@google.com>, Andrea Arcangeli <aarcange@redhat.com>, Axel Rasmussen <axelrasmussen@google.com>, David Hildenbrand <david@redhat.com>, Muchun Song <songmuchun@bytedance.com>, Nadav Amit <nadav.amit@gmail.com>, Harry Yoo <harry.yoo@oracle.com>
+Message-ID: <20260106114715.80958-2-harry.yoo@oracle.com>
+
+From: Peter Xu <peterx@redhat.com>
+
+commit a79390f5d6a78647fd70856bd42b22d994de0ba2 upstream.
+
+Switch to use type "long" for page accountings and retval across the whole
+procedure of change_protection().
+
+The change halves the maximum representable page count compared to before
+(down to ULONG_MAX / 2), but it shouldn't overflow on any system either,
+because the maximum possible number of pages touched by change_protection()
+is ULONG_MAX / PAGE_SIZE.
+
+Two reasons to switch from "unsigned long" to "long":
+
+  1. It suits count_vm_numa_events() better, whose 2nd parameter takes
+     a long type.
+
+  2. It paves way for returning negative (error) values in the future.
+
+Currently the only caller that consumes this retval is change_prot_numa(),
+where the unsigned long was converted to an int.  While at it, touch up
+the NUMA code to also take a long, so it avoids any possible overflow
+during the int-size conversion.
+
+Link: https://lkml.kernel.org/r/20230104225207.1066932-3-peterx@redhat.com
+Signed-off-by: Peter Xu <peterx@redhat.com>
+Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
+Acked-by: James Houghton <jthoughton@google.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Axel Rasmussen <axelrasmussen@google.com>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Muchun Song <songmuchun@bytedance.com>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+[ Adjust context ]
+Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
+Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/hugetlb.h |    4 ++--
+ include/linux/mm.h      |    2 +-
+ mm/hugetlb.c            |    4 ++--
+ mm/mempolicy.c          |    2 +-
+ mm/mprotect.c           |   26 +++++++++++++-------------
+ 5 files changed, 19 insertions(+), 19 deletions(-)
+
+--- a/include/linux/hugetlb.h
++++ b/include/linux/hugetlb.h
+@@ -233,7 +233,7 @@ void hugetlb_vma_lock_release(struct kre
+ int pmd_huge(pmd_t pmd);
+ int pud_huge(pud_t pud);
+-unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
++long hugetlb_change_protection(struct vm_area_struct *vma,
+               unsigned long address, unsigned long end, pgprot_t newprot,
+               unsigned long cp_flags);
+@@ -447,7 +447,7 @@ static inline void move_hugetlb_state(st
+ {
+ }
+-static inline unsigned long hugetlb_change_protection(
++static inline long hugetlb_change_protection(
+                       struct vm_area_struct *vma, unsigned long address,
+                       unsigned long end, pgprot_t newprot,
+                       unsigned long cp_flags)
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2148,7 +2148,7 @@ extern unsigned long move_page_tables(st
+ #define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
+                                           MM_CP_UFFD_WP_RESOLVE)
+-extern unsigned long change_protection(struct mmu_gather *tlb,
++extern long change_protection(struct mmu_gather *tlb,
+                             struct vm_area_struct *vma, unsigned long start,
+                             unsigned long end, pgprot_t newprot,
+                             unsigned long cp_flags);
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -6668,7 +6668,7 @@ long follow_hugetlb_page(struct mm_struc
+       return i ? i : err;
+ }
+-unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
++long hugetlb_change_protection(struct vm_area_struct *vma,
+               unsigned long address, unsigned long end,
+               pgprot_t newprot, unsigned long cp_flags)
+ {
+@@ -6677,7 +6677,7 @@ unsigned long hugetlb_change_protection(
+       pte_t *ptep;
+       pte_t pte;
+       struct hstate *h = hstate_vma(vma);
+-      unsigned long pages = 0, psize = huge_page_size(h);
++      long pages = 0, psize = huge_page_size(h);
+       bool shared_pmd = false;
+       struct mmu_notifier_range range;
+       unsigned long last_addr_mask;
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -628,7 +628,7 @@ unsigned long change_prot_numa(struct vm
+                       unsigned long addr, unsigned long end)
+ {
+       struct mmu_gather tlb;
+-      int nr_updated;
++      long nr_updated;
+       tlb_gather_mmu(&tlb, vma->vm_mm);
+--- a/mm/mprotect.c
++++ b/mm/mprotect.c
+@@ -72,13 +72,13 @@ static inline bool can_change_pte_writab
+       return true;
+ }
+-static unsigned long change_pte_range(struct mmu_gather *tlb,
++static long change_pte_range(struct mmu_gather *tlb,
+               struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
+               unsigned long end, pgprot_t newprot, unsigned long cp_flags)
+ {
+       pte_t *pte, oldpte;
+       spinlock_t *ptl;
+-      unsigned long pages = 0;
++      long pages = 0;
+       int target_node = NUMA_NO_NODE;
+       bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+       bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+@@ -346,13 +346,13 @@ uffd_wp_protect_file(struct vm_area_stru
+               }                                                       \
+       } while (0)
+-static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
++static inline long change_pmd_range(struct mmu_gather *tlb,
+               struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
+               unsigned long end, pgprot_t newprot, unsigned long cp_flags)
+ {
+       pmd_t *pmd;
+       unsigned long next;
+-      unsigned long pages = 0;
++      long pages = 0;
+       unsigned long nr_huge_updates = 0;
+       struct mmu_notifier_range range;
+@@ -360,7 +360,7 @@ static inline unsigned long change_pmd_r
+       pmd = pmd_offset(pud, addr);
+       do {
+-              unsigned long this_pages;
++              long this_pages;
+               next = pmd_addr_end(addr, end);
+@@ -430,13 +430,13 @@ next:
+       return pages;
+ }
+-static inline unsigned long change_pud_range(struct mmu_gather *tlb,
++static inline long change_pud_range(struct mmu_gather *tlb,
+               struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
+               unsigned long end, pgprot_t newprot, unsigned long cp_flags)
+ {
+       pud_t *pud;
+       unsigned long next;
+-      unsigned long pages = 0;
++      long pages = 0;
+       pud = pud_offset(p4d, addr);
+       do {
+@@ -451,13 +451,13 @@ static inline unsigned long change_pud_r
+       return pages;
+ }
+-static inline unsigned long change_p4d_range(struct mmu_gather *tlb,
++static inline long change_p4d_range(struct mmu_gather *tlb,
+               struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr,
+               unsigned long end, pgprot_t newprot, unsigned long cp_flags)
+ {
+       p4d_t *p4d;
+       unsigned long next;
+-      unsigned long pages = 0;
++      long pages = 0;
+       p4d = p4d_offset(pgd, addr);
+       do {
+@@ -472,14 +472,14 @@ static inline unsigned long change_p4d_r
+       return pages;
+ }
+-static unsigned long change_protection_range(struct mmu_gather *tlb,
++static long change_protection_range(struct mmu_gather *tlb,
+               struct vm_area_struct *vma, unsigned long addr,
+               unsigned long end, pgprot_t newprot, unsigned long cp_flags)
+ {
+       struct mm_struct *mm = vma->vm_mm;
+       pgd_t *pgd;
+       unsigned long next;
+-      unsigned long pages = 0;
++      long pages = 0;
+       BUG_ON(addr >= end);
+       pgd = pgd_offset(mm, addr);
+@@ -498,12 +498,12 @@ static unsigned long change_protection_r
+       return pages;
+ }
+-unsigned long change_protection(struct mmu_gather *tlb,
++long change_protection(struct mmu_gather *tlb,
+                      struct vm_area_struct *vma, unsigned long start,
+                      unsigned long end, pgprot_t newprot,
+                      unsigned long cp_flags)
+ {
+-      unsigned long pages;
++      long pages;
+       BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
diff --git a/queue-6.1/sched-fair-proportional-newidle-balance.patch b/queue-6.1/sched-fair-proportional-newidle-balance.patch
new file mode 100644 (file)
index 0000000..25a5d0e
--- /dev/null
@@ -0,0 +1,207 @@
+From stable+bounces-198206-greg=kroah.com@vger.kernel.org Wed Dec  3 12:44:47 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed,  3 Dec 2025 11:25:52 +0000
+Subject: sched/fair: Proportional newidle balance
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason <clm@meta.com>
+Message-ID: <20251203112552.1738424-5-ajay.kaher@broadcom.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 33cf66d88306663d16e4759e9d24766b0aaa2e17 upstream.
+
+Add a randomized algorithm that runs newidle balancing proportional to
+its success rate.
+
+This improves schbench significantly:
+
+ 6.18-rc4:                     2.22 Mrps/s
+ 6.18-rc4+revert:              2.04 Mrps/s
+ 6.18-rc4+revert+random:       2.18 Mrps/s
+
+Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:
+
+ 6.17:                 -6%
+ 6.17+revert:           0%
+ 6.17+revert+random:   -1%
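+
+A small userspace model of the bookkeeping the patch adds (illustrative only;
+call/success/ratio mirror the new newidle_* fields in struct sched_domain):
+
+    #include <stdio.h>
+    #include <stdlib.h>
+
+    struct ni_stats { unsigned int call, success, ratio; };
+
+    /* fold one balance attempt into a slowly decaying success ratio */
+    static void update_stats(struct ni_stats *s, unsigned int success)
+    {
+            s->call++;
+            s->success += success;
+            if (s->call >= 1024) {      /* refresh ratio, keep half the history */
+                    s->ratio = s->success;
+                    s->call /= 2;
+                    s->success /= 2;
+            }
+    }
+
+    int main(void)
+    {
+            /* sd_init() in the patch seeds a 50% success rate */
+            struct ni_stats s = { .call = 512, .success = 256, .ratio = 512 };
+            unsigned int i, ran = 0;
+
+            for (i = 0; i < 100000; i++) {
+                    unsigned int d1k = rand() % 1024;       /* like sched_rng() % 1024 */
+
+                    if (d1k > 1 + s.ratio) {        /* dice says skip this round */
+                            update_stats(&s, 0);
+                            continue;
+                    }
+                    ran++;
+                    /* say 1 in 20 of the balances we actually run succeed; the real
+                     * patch additionally weights each success to compensate for the
+                     * skipped rounds */
+                    update_stats(&s, (rand() % 20 == 0));
+            }
+            printf("ran %u of %u newidle attempts, ratio %u/1024\n", ran, i, s.ratio);
+            return 0;
+    }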
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Chris Mason <clm@meta.com>
+Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com
+Link: https://patch.msgid.link/20251107161739.770122091@infradead.org
+[ Ajay: Modified to apply on v6.1 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/sched/topology.h |    3 ++
+ kernel/sched/core.c            |    3 ++
+ kernel/sched/fair.c            |   44 +++++++++++++++++++++++++++++++++++++----
+ kernel/sched/features.h        |    5 ++++
+ kernel/sched/sched.h           |    7 ++++++
+ kernel/sched/topology.c        |    6 +++++
+ 6 files changed, 64 insertions(+), 4 deletions(-)
+
+--- a/include/linux/sched/topology.h
++++ b/include/linux/sched/topology.h
+@@ -106,6 +106,9 @@ struct sched_domain {
+       unsigned int nr_balance_failed; /* initialise to 0 */
+       /* idle_balance() stats */
++      unsigned int newidle_call;
++      unsigned int newidle_success;
++      unsigned int newidle_ratio;
+       u64 max_newidle_lb_cost;
+       unsigned long last_decay_max_lb_cost;
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -112,6 +112,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_
+ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
++DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
+ #ifdef CONFIG_SCHED_DEBUG
+ /*
+@@ -9632,6 +9633,8 @@ void __init sched_init_smp(void)
+ {
+       sched_init_numa(NUMA_NO_NODE);
++      prandom_init_once(&sched_rnd_state);
++
+       /*
+        * There's no userspace yet to cause hotplug operations; hence all the
+        * CPU masks are stable and all blatant races in the below code cannot
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -10935,11 +10935,27 @@ void update_max_interval(void)
+       max_load_balance_interval = HZ*num_online_cpus()/10;
+ }
+-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
++static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
++{
++      sd->newidle_call++;
++      sd->newidle_success += success;
++
++      if (sd->newidle_call >= 1024) {
++              sd->newidle_ratio = sd->newidle_success;
++              sd->newidle_call /= 2;
++              sd->newidle_success /= 2;
++      }
++}
++
++static inline bool
++update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
+ {
+       unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
+       unsigned long now = jiffies;
++      if (cost)
++              update_newidle_stats(sd, success);
++
+       if (cost > sd->max_newidle_lb_cost) {
+               /*
+                * Track max cost of a domain to make sure to not delay the
+@@ -10987,7 +11003,7 @@ static void rebalance_domains(struct rq
+                * Decay the newidle max times here because this is a regular
+                * visit to all the domains.
+                */
+-              need_decay = update_newidle_cost(sd, 0);
++              need_decay = update_newidle_cost(sd, 0, 0);
+               max_cost += sd->max_newidle_lb_cost;
+               /*
+@@ -11621,6 +11637,22 @@ static int sched_balance_newidle(struct
+                       break;
+               if (sd->flags & SD_BALANCE_NEWIDLE) {
++                      unsigned int weight = 1;
++
++                      if (sched_feat(NI_RANDOM)) {
++                              /*
++                               * Throw a 1k sided dice; and only run
++                               * newidle_balance according to the success
++                               * rate.
++                               */
++                              u32 d1k = sched_rng() % 1024;
++                              weight = 1 + sd->newidle_ratio;
++                              if (d1k > weight) {
++                                      update_newidle_stats(sd, 0);
++                                      continue;
++                              }
++                              weight = (1024 + weight/2) / weight;
++                      }
+                       pulled_task = load_balance(this_cpu, this_rq,
+                                                  sd, CPU_NEWLY_IDLE,
+@@ -11628,10 +11660,14 @@ static int sched_balance_newidle(struct
+                       t1 = sched_clock_cpu(this_cpu);
+                       domain_cost = t1 - t0;
+-                      update_newidle_cost(sd, domain_cost);
+-
+                       curr_cost += domain_cost;
+                       t0 = t1;
++
++                      /*
++                       * Track max cost of a domain to make sure to not delay the
++                       * next wakeup on the CPU.
++                       */
++                      update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
+               }
+               /*
+--- a/kernel/sched/features.h
++++ b/kernel/sched/features.h
+@@ -99,5 +99,10 @@ SCHED_FEAT(UTIL_EST_FASTUP, true)
+ SCHED_FEAT(LATENCY_WARN, false)
++/*
++ * Do newidle balancing proportional to its success rate using randomization.
++ */
++SCHED_FEAT(NI_RANDOM, true)
++
+ SCHED_FEAT(ALT_PERIOD, true)
+ SCHED_FEAT(BASE_SLICE, true)
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -5,6 +5,7 @@
+ #ifndef _KERNEL_SCHED_SCHED_H
+ #define _KERNEL_SCHED_SCHED_H
++#include <linux/prandom.h>
+ #include <linux/sched/affinity.h>
+ #include <linux/sched/autogroup.h>
+ #include <linux/sched/cpufreq.h>
+@@ -1190,6 +1191,12 @@ static inline bool is_migration_disabled
+ }
+ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
++DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
++
++static inline u32 sched_rng(void)
++{
++      return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
++}
+ #define cpu_rq(cpu)           (&per_cpu(runqueues, (cpu)))
+ #define this_rq()             this_cpu_ptr(&runqueues)
+--- a/kernel/sched/topology.c
++++ b/kernel/sched/topology.c
+@@ -1584,6 +1584,12 @@ sd_init(struct sched_domain_topology_lev
+               .last_balance           = jiffies,
+               .balance_interval       = sd_weight,
++
++              /* 50% success rate */
++              .newidle_call           = 512,
++              .newidle_success        = 256,
++              .newidle_ratio          = 512,
++
+               .max_newidle_lb_cost    = 0,
+               .last_decay_max_lb_cost = jiffies,
+               .child                  = child,
diff --git a/queue-6.1/sched-fair-small-cleanup-to-sched_balance_newidle.patch b/queue-6.1/sched-fair-small-cleanup-to-sched_balance_newidle.patch
new file mode 100644 (file)
index 0000000..72b56fe
--- /dev/null
@@ -0,0 +1,49 @@
+From stable+bounces-198204-greg=kroah.com@vger.kernel.org Wed Dec  3 12:44:16 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed,  3 Dec 2025 11:25:50 +0000
+Subject: sched/fair: Small cleanup to sched_balance_newidle()
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason <clm@meta.com>
+Message-ID: <20251203112552.1738424-3-ajay.kaher@broadcom.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit e78e70dbf603c1425f15f32b455ca148c932f6c1 upstream.
+
+Pull out the !sd check to simplify code.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Chris Mason <clm@meta.com>
+Link: https://patch.msgid.link/20251107161739.525916173@infradead.org
+[ Ajay: Modified to apply on v6.1 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/fair.c |   11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -11589,14 +11589,15 @@ static int sched_balance_newidle(struct
+       rcu_read_lock();
+       sd = rcu_dereference_check_sched_domain(this_rq->sd);
++      if (!sd) {
++              rcu_read_unlock();
++              goto out;
++      }
+       if (!READ_ONCE(this_rq->rd->overload) ||
+-          (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
+-
+-              if (sd)
+-                      update_next_balance(sd, &next_balance);
++          this_rq->avg_idle < sd->max_newidle_lb_cost) {
++              update_next_balance(sd, &next_balance);
+               rcu_read_unlock();
+-
+               goto out;
+       }
+       rcu_read_unlock();
diff --git a/queue-6.1/sched-fair-small-cleanup-to-update_newidle_cost.patch b/queue-6.1/sched-fair-small-cleanup-to-update_newidle_cost.patch
new file mode 100644 (file)
index 0000000..f774218
--- /dev/null
@@ -0,0 +1,58 @@
+From stable+bounces-198205-greg=kroah.com@vger.kernel.org Wed Dec  3 12:44:39 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed,  3 Dec 2025 11:25:51 +0000
+Subject: sched/fair: Small cleanup to update_newidle_cost()
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason <clm@meta.com>
+Message-ID: <20251203112552.1738424-4-ajay.kaher@broadcom.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 08d473dd8718e4a4d698b1113a14a40ad64a909b upstream.
+
+Simplify code by adding a few variables.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Chris Mason <clm@meta.com>
+Link: https://patch.msgid.link/20251107161739.655208666@infradead.org
+[ Ajay: Modified to apply on v6.1 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/fair.c |   11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -10937,22 +10937,25 @@ void update_max_interval(void)
+ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+ {
++      unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
++      unsigned long now = jiffies;
++
+       if (cost > sd->max_newidle_lb_cost) {
+               /*
+                * Track max cost of a domain to make sure to not delay the
+                * next wakeup on the CPU.
+                */
+               sd->max_newidle_lb_cost = cost;
+-              sd->last_decay_max_lb_cost = jiffies;
+-      } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
++              sd->last_decay_max_lb_cost = now;
++
++      } else if (time_after(now, next_decay)) {
+               /*
+                * Decay the newidle max times by ~1% per second to ensure that
+                * it is not outdated and the current max cost is actually
+                * shorter.
+                */
+               sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
+-              sd->last_decay_max_lb_cost = jiffies;
+-
++              sd->last_decay_max_lb_cost = now;
+               return true;
+       }
diff --git a/queue-6.1/series b/queue-6.1/series
index 625794102e407b3bc2ef95933d3dc87afaba4a43..f0aa234ca793a268ce7b1f87ab13dd3afc845538 100644 (file)
@@ -604,3 +604,15 @@ iommu-qcom-use-the-asid-read-from-device-tree-if-specified.patch
 iommu-qcom-index-contexts-by-asid-number-to-allow-asid-0.patch
 iommu-qcom-fix-device-leak-on-of_xlate.patch
 virtio_console-fix-order-of-fields-cols-and-rows.patch
+kvm-arm64-sys_regs-disable-wuninitialized-const-pointer-warning.patch
+dmaengine-idxd-remove-improper-idxd_free.patch
+x86-mm-pat-clear-vm_pat-if-copy_p4d_range-failed.patch
+x86-mm-pat-fix-vm_pat-handling-when-fork-fails-in-copy_page_range.patch
+mm-mprotect-use-long-for-page-accountings-and-retval.patch
+mm-mprotect-delete-pmd_none_or_clear_bad_unless_trans_huge.patch
+drm-vmwgfx-fix-a-null-ptr-access-in-the-cursor-snooper.patch
+usb-xhci-move-link-chain-bit-quirk-checks-into-one-helper-function.patch
+usb-xhci-apply-the-link-chain-quirk-on-nec-isoc-endpoints.patch
+sched-fair-small-cleanup-to-sched_balance_newidle.patch
+sched-fair-small-cleanup-to-update_newidle_cost.patch
+sched-fair-proportional-newidle-balance.patch
diff --git a/queue-6.1/usb-xhci-apply-the-link-chain-quirk-on-nec-isoc-endpoints.patch b/queue-6.1/usb-xhci-apply-the-link-chain-quirk-on-nec-isoc-endpoints.patch
new file mode 100644 (file)
index 0000000..ff49d56
--- /dev/null
@@ -0,0 +1,116 @@
+From linux-usb+bounces-32050-greg=kroah.com@vger.kernel.org Thu Jan  8 10:14:16 2026
+From: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Date: Thu,  8 Jan 2026 00:49:27 -0800
+Subject: usb: xhci: Apply the link chain quirk on NEC isoc endpoints
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mathias.nyman@intel.com, linux-usb@vger.kernel.org, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Michal Pecio <michal.pecio@gmail.com>, Mathias Nyman <mathias.nyman@linux.intel.com>, Shivani Agarwal <shivani.agarwal@broadcom.com>
+Message-ID: <20260108084927.671785-3-shivani.agarwal@broadcom.com>
+
+From: Michal Pecio <michal.pecio@gmail.com>
+
+commit bb0ba4cb1065e87f9cc75db1fa454e56d0894d01 upstream.
+
+Two clearly different specimens of NEC uPD720200 (one with start/stop
+bug, one without) were seen to cause IOMMU faults after some Missed
+Service Errors. Faulting address is immediately after a transfer ring
+segment and patched dynamic debug messages revealed that the MSE was
+received when waiting for a TD near the end of that segment:
+
+[ 1.041954] xhci_hcd: Miss service interval error for slot 1 ep 2 expected TD DMA ffa08fe0
+[ 1.042120] xhci_hcd: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0005 address=0xffa09000 flags=0x0000]
+[ 1.042146] xhci_hcd: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0005 address=0xffa09040 flags=0x0000]
+
+It gets even funnier if the next page is a ring segment accessible to
+the HC. Below, it reports MSE in segment at ff1e8000, plows through a
+zero-filled page at ff1e9000 and starts reporting events for TRBs in
+page at ff1ea000 every microframe, instead of jumping to seg ff1e6000.
+
+[ 7.041671] xhci_hcd: Miss service interval error for slot 1 ep 2 expected TD DMA ff1e8fe0
+[ 7.041999] xhci_hcd: Miss service interval error for slot 1 ep 2 expected TD DMA ff1e8fe0
+[ 7.042011] xhci_hcd: WARN: buffer overrun event for slot 1 ep 2 on endpoint
+[ 7.042028] xhci_hcd: All TDs skipped for slot 1 ep 2. Clear skip flag.
+[ 7.042134] xhci_hcd: WARN: buffer overrun event for slot 1 ep 2 on endpoint
+[ 7.042138] xhci_hcd: ERROR Transfer event TRB DMA ptr not part of current TD ep_index 2 comp_code 31
+[ 7.042144] xhci_hcd: Looking for event-dma 00000000ff1ea040 trb-start 00000000ff1e6820 trb-end 00000000ff1e6820
+[ 7.042259] xhci_hcd: WARN: buffer overrun event for slot 1 ep 2 on endpoint
+[ 7.042262] xhci_hcd: ERROR Transfer event TRB DMA ptr not part of current TD ep_index 2 comp_code 31
+[ 7.042266] xhci_hcd: Looking for event-dma 00000000ff1ea050 trb-start 00000000ff1e6820 trb-end 00000000ff1e6820
+
+At some point completion events change from Isoch Buffer Overrun to
+Short Packet and the HC finally finds cycle bit mismatch in ff1ec000.
+
+[ 7.098130] xhci_hcd: ERROR Transfer event TRB DMA ptr not part of current TD ep_index 2 comp_code 13
+[ 7.098132] xhci_hcd: Looking for event-dma 00000000ff1ecc50 trb-start 00000000ff1e6820 trb-end 00000000ff1e6820
+[ 7.098254] xhci_hcd: ERROR Transfer event TRB DMA ptr not part of current TD ep_index 2 comp_code 13
+[ 7.098256] xhci_hcd: Looking for event-dma 00000000ff1ecc60 trb-start 00000000ff1e6820 trb-end 00000000ff1e6820
+[ 7.098379] xhci_hcd: Overrun event on slot 1 ep 2
+
+It's possible that data from the isochronous device were written to
+random buffers of pending TDs on other endpoints (either IN or OUT),
+other devices or even other HCs in the same IOMMU domain.
+
+Lastly, an error from a different USB device on another HC. Was it
+caused by the above? I don't know, but it may have been. The disk
+was working without any other issues and generated PCIe traffic to
+starve the NEC of upstream BW and trigger those MSEs. The two HCs
+shared one x1 slot by means of a commercial "PCIe splitter" board.
+
+[ 7.162604] usb 10-2: reset SuperSpeed USB device number 3 using xhci_hcd
+[ 7.178990] sd 9:0:0:0: [sdb] tag#0 UNKNOWN(0x2003) Result: hostbyte=0x07 driverbyte=DRIVER_OK cmd_age=0s
+[ 7.179001] sd 9:0:0:0: [sdb] tag#0 CDB: opcode=0x28 28 00 04 02 ae 00 00 02 00 00
+[ 7.179004] I/O error, dev sdb, sector 67284480 op 0x0:(READ) flags 0x80700 phys_seg 5 prio class 0
+
+Fortunately, it appears that this ridiculous bug is avoided by setting
+the chain bit of Link TRBs on isochronous rings. Other ancient HCs are
+known which also expect the bit to be set and they ignore Link TRBs if
+it's not. Reportedly, 0.95 spec guaranteed that the bit is set.
+
+The bandwidth-starved NEC HC running a 32KB/uframe UVC endpoint reports
+tens of MSEs per second and runs into the bug within seconds. Chaining
+Link TRBs allows the same workload to run for many minutes, many times.
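+
+A tiny sketch of what the fix changes when ring segments are linked
+(illustrative only; the macros stand in for the real xHCI field encodings):
+
+    #define TRB_TYPE_LINK   (6u << 10)
+    #define TRB_CHAIN       (1u << 4)
+
+    /* flags for the last TRB of a segment, which points at the next segment */
+    static unsigned int link_trb_flags(int host_0_95, int amd96_or_nec, int isoc_ring)
+    {
+            unsigned int flags = TRB_TYPE_LINK;
+
+            /* 0.95 hosts want every link TRB chained; AMD 0.96 and (with this
+             * patch) NEC hosts need it at least on isochronous rings, e.g. when
+             * resyncing after a Missed Service Error */
+            if (host_0_95 || (isoc_ring && amd96_or_nec))
+                    flags |= TRB_CHAIN;
+            return flags;
+    }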
+
+No negative side effects seen in UVC recording and UAC playback with a
+few devices at full speed, high speed and SuperSpeed.
+
+The problem doesn't reproduce on the newer Renesas uPD720201/uPD720202
+and on old Etron EJ168 and VIA VL805 (but the VL805 has other bugs).
+
+[shorten line length of log snippets in commit message -Mathias]
+
+Signed-off-by: Michal Pecio <michal.pecio@gmail.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
+Link: https://lore.kernel.org/r/20250306144954.3507700-14-mathias.nyman@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+[Shivani: Modified to apply on v5.10.y-v6.1.y]
+Signed-off-by: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/usb/host/xhci.h |   13 +++++++++++--
+ 1 file changed, 11 insertions(+), 2 deletions(-)
+
+--- a/drivers/usb/host/xhci.h
++++ b/drivers/usb/host/xhci.h
+@@ -1789,11 +1789,20 @@ static inline void xhci_write_64(struct
+ }
+-/* Link TRB chain should always be set on 0.95 hosts, and AMD 0.96 ISOC rings */
++/*
++ * Reportedly, some chapters of v0.95 spec said that Link TRB always has its chain bit set.
++ * Other chapters and later specs say that it should only be set if the link is inside a TD
++ * which continues from the end of one segment to the next segment.
++ *
++ * Some 0.95 hardware was found to misbehave if any link TRB doesn't have the chain bit set.
++ *
++ * 0.96 hardware from AMD and NEC was found to ignore unchained isochronous link TRBs when
++ * "resynchronizing the pipe" after a Missed Service Error.
++ */
+ static inline bool xhci_link_chain_quirk(struct xhci_hcd *xhci, enum xhci_ring_type type)
+ {
+       return (xhci->quirks & XHCI_LINK_TRB_QUIRK) ||
+-             (type == TYPE_ISOC && (xhci->quirks & XHCI_AMD_0x96_HOST));
++             (type == TYPE_ISOC && (xhci->quirks & (XHCI_AMD_0x96_HOST | XHCI_NEC_HOST)));
+ }
+ /* xHCI debugging */
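
A note on the quirk above: the patched check sets the chain bit on every Link
TRB when the host is a 0.95 design, and on isochronous rings when the host
carries the AMD 0.96 or NEC quirk. A minimal standalone C sketch of that
decision is shown below; the quirk constants, the enum and the helper name are
illustrative stand-ins for this example only, not the kernel's definitions,
and only the boolean logic mirrors the patched xhci_link_chain_quirk().

/*
 * Illustrative userspace model of the link-TRB chain-bit decision.
 * Constants and values are made up; only the logic follows the patch.
 */
#include <stdbool.h>
#include <stdio.h>

#define XHCI_LINK_TRB_QUIRK  (1u << 0)  /* 0.95 hosts: chain every Link TRB */
#define XHCI_AMD_0x96_HOST   (1u << 1)  /* AMD 0.96: chain links on isoc rings */
#define XHCI_NEC_HOST        (1u << 2)  /* NEC: chain isoc links after MSEs */

enum ring_type { TYPE_CTRL, TYPE_BULK, TYPE_ISOC };

/* Should Link TRBs on a ring of this type get the chain bit set? */
static bool link_chain_quirk(unsigned int quirks, enum ring_type type)
{
        return (quirks & XHCI_LINK_TRB_QUIRK) ||
               (type == TYPE_ISOC &&
                (quirks & (XHCI_AMD_0x96_HOST | XHCI_NEC_HOST)));
}

int main(void)
{
        /* NEC host: only isochronous rings get chained Link TRBs. */
        printf("NEC bulk: %d\n", link_chain_quirk(XHCI_NEC_HOST, TYPE_BULK));
        printf("NEC isoc: %d\n", link_chain_quirk(XHCI_NEC_HOST, TYPE_ISOC));
        /* 0.95 host: every ring type gets chained Link TRBs. */
        printf("0.95 bulk: %d\n", link_chain_quirk(XHCI_LINK_TRB_QUIRK, TYPE_BULK));
        return 0;
}

Expected output is 0, 1, 1: with only the NEC quirk set, bulk rings keep
unchained Link TRBs while isochronous rings get the workaround.
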
diff --git a/queue-6.1/usb-xhci-move-link-chain-bit-quirk-checks-into-one-helper-function.patch b/queue-6.1/usb-xhci-move-link-chain-bit-quirk-checks-into-one-helper-function.patch
new file mode 100644 (file)
index 0000000..3728ec7
--- /dev/null
@@ -0,0 +1,103 @@
+From shivani.agarwal@broadcom.com Thu Jan  8 10:10:31 2026
+From: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Date: Thu,  8 Jan 2026 00:49:26 -0800
+Subject: usb: xhci: move link chain bit quirk checks into one helper function.
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mathias.nyman@intel.com, linux-usb@vger.kernel.org, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Niklas Neronin <niklas.neronin@linux.intel.com>, Mathias Nyman <mathias.nyman@linux.intel.com>, Shivani Agarwal <shivani.agarwal@broadcom.com>
+Message-ID: <20260108084927.671785-2-shivani.agarwal@broadcom.com>
+
+From: Niklas Neronin <niklas.neronin@linux.intel.com>
+
+commit 7476a2215c07703db5e95efaa3fc5b9f957b9417 upstream.
+
+Older 0.95 xHCI hosts and some other specific newer hosts require the
+chain bit to be set for Link TRBs even if the link TRB is not in the
+middle of a transfer descriptor (TD).
+
+Move the checks for all those cases into one xhci_link_chain_quirk()
+function to clean up and avoid code duplication.
+
+No functional changes.
+
+[skip renaming chain_links flag, reword commit message -Mathias]
+
+Signed-off-by: Niklas Neronin <niklas.neronin@linux.intel.com>
+Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
+Link: https://lore.kernel.org/r/20240626124835.1023046-10-mathias.nyman@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+[Shivani: Modified to apply on v5.10.y-v6.1.y]
+Signed-off-by: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/usb/host/xhci-mem.c  |   10 ++--------
+ drivers/usb/host/xhci-ring.c |    8 ++------
+ drivers/usb/host/xhci.h      |    7 +++++--
+ 3 files changed, 9 insertions(+), 16 deletions(-)
+
+--- a/drivers/usb/host/xhci-mem.c
++++ b/drivers/usb/host/xhci-mem.c
+@@ -133,10 +133,7 @@ static void xhci_link_rings(struct xhci_
+       if (!ring || !first || !last)
+               return;
+-      /* Set chain bit for 0.95 hosts, and for isoc rings on AMD 0.96 host */
+-      chain_links = !!(xhci_link_trb_quirk(xhci) ||
+-                       (ring->type == TYPE_ISOC &&
+-                        (xhci->quirks & XHCI_AMD_0x96_HOST)));
++      chain_links = xhci_link_chain_quirk(xhci, ring->type);
+       next = ring->enq_seg->next;
+       xhci_link_segments(ring->enq_seg, first, ring->type, chain_links);
+@@ -326,10 +323,7 @@ static int xhci_alloc_segments_for_ring(
+       struct xhci_segment *prev;
+       bool chain_links;
+-      /* Set chain bit for 0.95 hosts, and for isoc rings on AMD 0.96 host */
+-      chain_links = !!(xhci_link_trb_quirk(xhci) ||
+-                       (type == TYPE_ISOC &&
+-                        (xhci->quirks & XHCI_AMD_0x96_HOST)));
++      chain_links = xhci_link_chain_quirk(xhci, type);
+       prev = xhci_segment_alloc(xhci, cycle_state, max_packet, flags);
+       if (!prev)
+--- a/drivers/usb/host/xhci-ring.c
++++ b/drivers/usb/host/xhci-ring.c
+@@ -250,9 +250,7 @@ static void inc_enq(struct xhci_hcd *xhc
+                * AMD 0.96 host, carry over the chain bit of the previous TRB
+                * (which may mean the chain bit is cleared).
+                */
+-              if (!(ring->type == TYPE_ISOC &&
+-                    (xhci->quirks & XHCI_AMD_0x96_HOST)) &&
+-                  !xhci_link_trb_quirk(xhci)) {
++              if (!xhci_link_chain_quirk(xhci, ring->type)) {
+                       next->link.control &= cpu_to_le32(~TRB_CHAIN);
+                       next->link.control |= cpu_to_le32(chain);
+               }
+@@ -3355,9 +3353,7 @@ static int prepare_ring(struct xhci_hcd
+               /* If we're not dealing with 0.95 hardware or isoc rings
+                * on AMD 0.96 host, clear the chain bit.
+                */
+-              if (!xhci_link_trb_quirk(xhci) &&
+-                  !(ep_ring->type == TYPE_ISOC &&
+-                    (xhci->quirks & XHCI_AMD_0x96_HOST)))
++              if (!xhci_link_chain_quirk(xhci, ep_ring->type))
+                       ep_ring->enqueue->link.control &=
+                               cpu_to_le32(~TRB_CHAIN);
+               else
+--- a/drivers/usb/host/xhci.h
++++ b/drivers/usb/host/xhci.h
+@@ -1788,9 +1788,12 @@ static inline void xhci_write_64(struct
+       lo_hi_writeq(val, regs);
+ }
+-static inline int xhci_link_trb_quirk(struct xhci_hcd *xhci)
++
++/* Link TRB chain should always be set on 0.95 hosts, and AMD 0.96 ISOC rings */
++static inline bool xhci_link_chain_quirk(struct xhci_hcd *xhci, enum xhci_ring_type type)
+ {
+-      return xhci->quirks & XHCI_LINK_TRB_QUIRK;
++      return (xhci->quirks & XHCI_LINK_TRB_QUIRK) ||
++             (type == TYPE_ISOC && (xhci->quirks & XHCI_AMD_0x96_HOST));
+ }
+ /* xHCI debugging */
diff --git a/queue-6.1/x86-mm-pat-clear-vm_pat-if-copy_p4d_range-failed.patch b/queue-6.1/x86-mm-pat-clear-vm_pat-if-copy_p4d_range-failed.patch
new file mode 100644 (file)
index 0000000..b93af84
--- /dev/null
@@ -0,0 +1,130 @@
+From stable+bounces-203368-greg=kroah.com@vger.kernel.org Wed Dec 24 11:43:39 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed, 24 Dec 2025 10:24:31 +0000
+Subject: x86/mm/pat: clear VM_PAT if copy_p4d_range failed
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: dave.hansen@linux.intel.com, luto@kernel.org, peterz@infradead.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, hpa@zytor.com, akpm@linux-foundation.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Ma Wupeng <mawupeng1@huawei.com>, syzbot+5f488e922d047d8f00cc@syzkaller.appspotmail.com, Alexander Ofitserov <oficerovas@altlinux.org>
+Message-ID: <20251224102432.923410-2-ajay.kaher@broadcom.com>
+
+From: Ma Wupeng <mawupeng1@huawei.com>
+
+[ Upstream commit d155df53f31068c3340733d586eb9b3ddfd70fc5 ]
+
+Syzbot reports a warning in untrack_pfn().  Digging into the root cause,
+we found that this is due to a memory allocation failure in pmd_alloc_one().
+This failure is produced by failslab.
+
+In copy_page_range(), the memory allocation for a pmd failed.  During the
+error handling in copy_page_range(), mmput() is called to remove all
+vmas.  When untrack_pfn() processes this empty pfn, the warning fires.
+
+Here's a simplified flow:
+
+dup_mm
+  dup_mmap
+    copy_page_range
+      copy_p4d_range
+        copy_pud_range
+          copy_pmd_range
+            pmd_alloc
+              __pmd_alloc
+                pmd_alloc_one
+                  page = alloc_pages(gfp, 0);
+                    if (!page)
+                      return NULL;
+    mmput
+        exit_mmap
+          unmap_vmas
+            unmap_single_vma
+              untrack_pfn
+                follow_phys
+                  WARN_ON_ONCE(1);
+
+Since this vma was not generated successfully, we can clear the VM_PAT
+flag.  In that case, untrack_pfn() will not be called while cleaning up
+this vma.
+
+Function untrack_pfn_moved() has also been renamed to fit the new logic.
+
+Link: https://lkml.kernel.org/r/20230217025615.1595558-1-mawupeng1@huawei.com
+Signed-off-by: Ma Wupeng <mawupeng1@huawei.com>
+Reported-by: <syzbot+5f488e922d047d8f00cc@syzkaller.appspotmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Alexander Ofitserov <oficerovas@altlinux.org>
+Cc: stable@vger.kernel.org
+[ Ajay: Modified to apply on v6.1 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/mm/pat/memtype.c |   12 ++++++++----
+ include/linux/pgtable.h   |    7 ++++---
+ mm/memory.c               |    1 +
+ mm/mremap.c               |    2 +-
+ 4 files changed, 14 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/mm/pat/memtype.c
++++ b/arch/x86/mm/pat/memtype.c
+@@ -1137,11 +1137,15 @@ void untrack_pfn(struct vm_area_struct *
+ }
+ /*
+- * untrack_pfn_moved is called, while mremapping a pfnmap for a new region,
+- * with the old vma after its pfnmap page table has been removed.  The new
+- * vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
++ * untrack_pfn_clear is called if the following situation fits:
++ *
++ * 1) while mremapping a pfnmap for a new region,  with the old vma after
++ * its pfnmap page table has been removed.  The new vma has a new pfnmap
++ * to the same pfn & cache type with VM_PAT set.
++ * 2) while duplicating vm area, the new vma fails to copy the pgtable from
++ * old vma.
+  */
+-void untrack_pfn_moved(struct vm_area_struct *vma)
++void untrack_pfn_clear(struct vm_area_struct *vma)
+ {
+       vma->vm_flags &= ~VM_PAT;
+ }
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -1214,9 +1214,10 @@ static inline void untrack_pfn(struct vm
+ }
+ /*
+- * untrack_pfn_moved is called while mremapping a pfnmap for a new region.
++ * untrack_pfn_clear is called while mremapping a pfnmap for a new region
++ * or fails to copy pgtable during duplicate vm area.
+  */
+-static inline void untrack_pfn_moved(struct vm_area_struct *vma)
++static inline void untrack_pfn_clear(struct vm_area_struct *vma)
+ {
+ }
+ #else
+@@ -1228,7 +1229,7 @@ extern void track_pfn_insert(struct vm_a
+ extern int track_pfn_copy(struct vm_area_struct *vma);
+ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
+                       unsigned long size);
+-extern void untrack_pfn_moved(struct vm_area_struct *vma);
++extern void untrack_pfn_clear(struct vm_area_struct *vma);
+ #endif
+ #ifdef CONFIG_MMU
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -1335,6 +1335,7 @@ copy_page_range(struct vm_area_struct *d
+                       continue;
+               if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
+                                           addr, next))) {
++                      untrack_pfn_clear(dst_vma);
+                       ret = -ENOMEM;
+                       break;
+               }
+--- a/mm/mremap.c
++++ b/mm/mremap.c
+@@ -682,7 +682,7 @@ static unsigned long move_vma(struct vm_
+       /* Tell pfnmap has moved from this vma */
+       if (unlikely(vma->vm_flags & VM_PFNMAP))
+-              untrack_pfn_moved(vma);
++              untrack_pfn_clear(vma);
+       if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
+               /* We always clear VM_LOCKED[ONFAULT] on the old vma */
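
Before the follow-up fix below, the patch above takes the simpler route: when
copy_p4d_range() fails for a VM_PFNMAP mapping, it clears VM_PAT on the
destination vma so that the later teardown path never calls untrack_pfn() on
page tables that were never populated. The short userspace sketch below models
only that idea; the struct, the flag value and the function names are invented
for the example and are not the kernel API.

#include <stdbool.h>
#include <stdio.h>

#define VM_PAT_FLAG 0x1u                /* stand-in for the real VM_PAT bit */

struct fake_vma { unsigned int flags; };

static bool copy_page_tables(struct fake_vma *dst)
{
        (void)dst;
        return false;                   /* simulate pmd_alloc_one() failing */
}

static void teardown(const struct fake_vma *vma)
{
        if (vma->flags & VM_PAT_FLAG)
                printf("would untrack the pfn range here (and warn)\n");
        else
                printf("teardown skips untracking, no warning\n");
}

int main(void)
{
        struct fake_vma dst = { .flags = VM_PAT_FLAG };

        if (!copy_page_tables(&dst))
                dst.flags &= ~VM_PAT_FLAG;      /* the untrack_pfn_clear() step */

        teardown(&dst);
        return 0;
}

With the flag cleared on failure, the teardown path takes the second branch,
which is what stops the untrack_pfn() warning in the syzbot report.
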
diff --git a/queue-6.1/x86-mm-pat-fix-vm_pat-handling-when-fork-fails-in-copy_page_range.patch b/queue-6.1/x86-mm-pat-fix-vm_pat-handling-when-fork-fails-in-copy_page_range.patch
new file mode 100644 (file)
index 0000000..79d507c
--- /dev/null
@@ -0,0 +1,293 @@
+From stable+bounces-203369-greg=kroah.com@vger.kernel.org Wed Dec 24 11:43:43 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed, 24 Dec 2025 10:24:32 +0000
+Subject: x86/mm/pat: Fix VM_PAT handling when fork() fails in copy_page_range()
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: dave.hansen@linux.intel.com, luto@kernel.org, peterz@infradead.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, hpa@zytor.com, akpm@linux-foundation.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, xingwei lee <xrivendell7@gmail.com>, yuxin wang <wang1315768607@163.com>, Marius Fleischer <fleischermarius@gmail.com>, David Hildenbrand <david@redhat.com>, Ingo Molnar <mingo@kernel.org>, Rik van Riel <riel@surriel.com>, Linus Torvalds <torvalds@linux-foundation.org>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20251224102432.923410-3-ajay.kaher@broadcom.com>
+
+From: David Hildenbrand <david@redhat.com>
+
+[ Upstream commit dc84bc2aba85a1508f04a936f9f9a15f64ebfb31 ]
+
+If track_pfn_copy() fails, we already added the dst VMA to the maple
+tree. As fork() fails, we'll cleanup the maple tree, and stumble over
+the dst VMA for which we neither performed any reservation nor copied
+any page tables.
+
+Consequently untrack_pfn() will see VM_PAT and try obtaining the
+PAT information from the page table -- which fails because the page
+table was not copied.
+
+The easiest fix would be to simply clear the VM_PAT flag of the dst VMA
+if track_pfn_copy() fails. However, the whole thing about "simply"
+clearing the VM_PAT flag is shaky as well: if we passed track_pfn_copy()
+and performed a reservation, but copying the page tables fails, we'll
+simply clear the VM_PAT flag, not properly undoing the reservation ...
+which is also wrong.
+
+So let's fix it properly: set the VM_PAT flag only if the reservation
+succeeded (leaving it clear initially), and undo the reservation if
+anything goes wrong while copying the page tables: clearing the VM_PAT
+flag after undoing the reservation.
+
+Note that any copied page table entries will get zapped when the VMA gets
+removed later, after copy_page_range() succeeded; as VM_PAT is not set
+then, we won't try cleaning VM_PAT up once more and untrack_pfn() will be
+happy. Note that leaving these page tables in place without a reservation
+is not a problem, as we are aborting fork(); this process will never run.
+
+A reproducer can trigger this usually at the first try:
+
+  https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/reproducers/pat_fork.c
+
+  WARNING: CPU: 26 PID: 11650 at arch/x86/mm/pat/memtype.c:983 get_pat_info+0xf6/0x110
+  Modules linked in: ...
+  CPU: 26 UID: 0 PID: 11650 Comm: repro3 Not tainted 6.12.0-rc5+ #92
+  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014
+  RIP: 0010:get_pat_info+0xf6/0x110
+  ...
+  Call Trace:
+   <TASK>
+   ...
+   untrack_pfn+0x52/0x110
+   unmap_single_vma+0xa6/0xe0
+   unmap_vmas+0x105/0x1f0
+   exit_mmap+0xf6/0x460
+   __mmput+0x4b/0x120
+   copy_process+0x1bf6/0x2aa0
+   kernel_clone+0xab/0x440
+   __do_sys_clone+0x66/0x90
+   do_syscall_64+0x95/0x180
+
+Likely this case was missed in:
+
+  d155df53f310 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed")
+
+... and instead of undoing the reservation we simply cleared the VM_PAT flag.
+
+Keep the documentation of these functions in include/linux/pgtable.h,
+one place is more than sufficient -- we should clean that up for the other
+functions like track_pfn_remap/untrack_pfn separately.
+
+Fixes: d155df53f310 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed")
+Fixes: 2ab640379a0a ("x86: PAT: hooks in generic vm code to help archs to track pfnmap regions - v3")
+Reported-by: xingwei lee <xrivendell7@gmail.com>
+Reported-by: yuxin wang <wang1315768607@163.com>
+Reported-by: Marius Fleischer <fleischermarius@gmail.com>
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@surriel.com>
+Cc: "H. Peter Anvin" <hpa@zytor.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: linux-mm@kvack.org
+Link: https://lore.kernel.org/r/20250321112323.153741-1-david@redhat.com
+Closes: https://lore.kernel.org/lkml/CABOYnLx_dnqzpCW99G81DmOr+2UzdmZMk=T3uxwNxwz+R1RAwg@mail.gmail.com/
+Closes: https://lore.kernel.org/lkml/CAJg=8jwijTP5fre8woS4JVJQ8iUA6v+iNcsOgtj9Zfpc3obDOQ@mail.gmail.com/
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Cc: stable@vger.kernel.org
+[ Ajay: Modified to apply on v6.1 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/mm/pat/memtype.c |   52 ++++++++++++++++++++++++----------------------
+ include/linux/pgtable.h   |   28 +++++++++++++++++++-----
+ kernel/fork.c             |    4 +++
+ mm/memory.c               |   11 +++------
+ 4 files changed, 58 insertions(+), 37 deletions(-)
+
+--- a/arch/x86/mm/pat/memtype.c
++++ b/arch/x86/mm/pat/memtype.c
+@@ -1029,29 +1029,42 @@ static int get_pat_info(struct vm_area_s
+       return -EINVAL;
+ }
+-/*
+- * track_pfn_copy is called when vma that is covering the pfnmap gets
+- * copied through copy_page_range().
+- *
+- * If the vma has a linear pfn mapping for the entire range, we get the prot
+- * from pte and reserve the entire vma range with single reserve_pfn_range call.
+- */
+-int track_pfn_copy(struct vm_area_struct *vma)
++int track_pfn_copy(struct vm_area_struct *dst_vma,
++              struct vm_area_struct *src_vma, unsigned long *pfn)
+ {
++      const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
+       resource_size_t paddr;
+-      unsigned long vma_size = vma->vm_end - vma->vm_start;
+       pgprot_t pgprot;
++      int rc;
+-      if (vma->vm_flags & VM_PAT) {
+-              if (get_pat_info(vma, &paddr, &pgprot))
+-                      return -EINVAL;
+-              /* reserve the whole chunk covered by vma. */
+-              return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
+-      }
++      if (!(src_vma->vm_flags & VM_PAT))
++              return 0;
++      /*
++       * Duplicate the PAT information for the dst VMA based on the src
++       * VMA.
++       */
++      if (get_pat_info(src_vma, &paddr, &pgprot))
++              return -EINVAL;
++      rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
++      if (rc)
++              return rc;
++
++      /* Reservation for the destination VMA succeeded. */
++      dst_vma->vm_flags |= VM_PAT;
++      *pfn = PHYS_PFN(paddr);
+       return 0;
+ }
++void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
++{
++      untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start);
++      /*
++       * Reservation was freed, any copied page tables will get cleaned
++       * up later, but without getting PAT involved again.
++       */
++}
++
+ /*
+  * prot is passed in as a parameter for the new mapping. If the vma has
+  * a linear pfn mapping for the entire range, or no vma is provided,
+@@ -1136,15 +1149,6 @@ void untrack_pfn(struct vm_area_struct *
+               vma->vm_flags &= ~VM_PAT;
+ }
+-/*
+- * untrack_pfn_clear is called if the following situation fits:
+- *
+- * 1) while mremapping a pfnmap for a new region,  with the old vma after
+- * its pfnmap page table has been removed.  The new vma has a new pfnmap
+- * to the same pfn & cache type with VM_PAT set.
+- * 2) while duplicating vm area, the new vma fails to copy the pgtable from
+- * old vma.
+- */
+ void untrack_pfn_clear(struct vm_area_struct *vma)
+ {
+       vma->vm_flags &= ~VM_PAT;
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -1195,15 +1195,26 @@ static inline void track_pfn_insert(stru
+ }
+ /*
+- * track_pfn_copy is called when vma that is covering the pfnmap gets
+- * copied through copy_page_range().
++ * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page
++ * tables copied during copy_page_range(). On success, stores the pfn to be
++ * passed to untrack_pfn_copy().
+  */
+-static inline int track_pfn_copy(struct vm_area_struct *vma)
++static inline int track_pfn_copy(struct vm_area_struct *dst_vma,
++              struct vm_area_struct *src_vma, unsigned long *pfn)
+ {
+       return 0;
+ }
+ /*
++ * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during
++ * copy_page_range(), but after track_pfn_copy() was already called.
++ */
++static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma,
++              unsigned long pfn)
++{
++}
++
++/*
+  * untrack_pfn is called while unmapping a pfnmap for a region.
+  * untrack can be called for a specific region indicated by pfn and size or
+  * can be for the entire vma (in which case pfn, size are zero).
+@@ -1214,8 +1225,10 @@ static inline void untrack_pfn(struct vm
+ }
+ /*
+- * untrack_pfn_clear is called while mremapping a pfnmap for a new region
+- * or fails to copy pgtable during duplicate vm area.
++ * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA:
++ *
++ * 1) During mremap() on the src VMA after the page tables were moved.
++ * 2) During fork() on the dst VMA, immediately after duplicating the src VMA.
+  */
+ static inline void untrack_pfn_clear(struct vm_area_struct *vma)
+ {
+@@ -1226,7 +1239,10 @@ extern int track_pfn_remap(struct vm_are
+                          unsigned long size);
+ extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
+                            pfn_t pfn);
+-extern int track_pfn_copy(struct vm_area_struct *vma);
++extern int track_pfn_copy(struct vm_area_struct *dst_vma,
++              struct vm_area_struct *src_vma, unsigned long *pfn);
++extern void untrack_pfn_copy(struct vm_area_struct *dst_vma,
++              unsigned long pfn);
+ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
+                       unsigned long size);
+ extern void untrack_pfn_clear(struct vm_area_struct *vma);
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -476,6 +476,10 @@ struct vm_area_struct *vm_area_dup(struc
+               *new = data_race(*orig);
+               INIT_LIST_HEAD(&new->anon_vma_chain);
+               dup_anon_vma_name(orig, new);
++
++              /* track_pfn_copy() will later take care of copying internal state. */
++              if (unlikely(new->vm_flags & VM_PFNMAP))
++                      untrack_pfn_clear(new);
+       }
+       return new;
+ }
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -1278,12 +1278,12 @@ int
+ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
+ {
+       pgd_t *src_pgd, *dst_pgd;
+-      unsigned long next;
+       unsigned long addr = src_vma->vm_start;
+       unsigned long end = src_vma->vm_end;
+       struct mm_struct *dst_mm = dst_vma->vm_mm;
+       struct mm_struct *src_mm = src_vma->vm_mm;
+       struct mmu_notifier_range range;
++      unsigned long next, pfn;
+       bool is_cow;
+       int ret;
+@@ -1294,11 +1294,7 @@ copy_page_range(struct vm_area_struct *d
+               return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
+       if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
+-              /*
+-               * We do not free on error cases below as remove_vma
+-               * gets called on error from higher level routine
+-               */
+-              ret = track_pfn_copy(src_vma);
++              ret = track_pfn_copy(dst_vma, src_vma, &pfn);
+               if (ret)
+                       return ret;
+       }
+@@ -1335,7 +1331,6 @@ copy_page_range(struct vm_area_struct *d
+                       continue;
+               if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
+                                           addr, next))) {
+-                      untrack_pfn_clear(dst_vma);
+                       ret = -ENOMEM;
+                       break;
+               }
+@@ -1345,6 +1340,8 @@ copy_page_range(struct vm_area_struct *d
+               raw_write_seqcount_end(&src_mm->write_protect_seq);
+               mmu_notifier_invalidate_range_end(&range);
+       }
++      if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
++              untrack_pfn_copy(dst_vma, pfn);
+       return ret;
+ }
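
To make the ordering in this last fix concrete, the sketch below models it in
plain C with a counter standing in for the PAT reservation: the destination
vma starts with the flag clear, gains it only after the reservation succeeds,
and the reservation is released (and the flag cleared again) if copying the
page tables then fails. Every name and type here is illustrative, not the
kernel's; only the sequence of steps follows the patch.

#include <stdbool.h>
#include <stdio.h>

#define VM_PAT_FLAG 0x1u

struct fake_vma { unsigned int flags; };

static int reservations;                /* stand-in for the memtype reservation */

/* Modeled on track_pfn_copy(): reserve first, set the flag only on success. */
static int track_copy(struct fake_vma *dst, const struct fake_vma *src,
                      unsigned long *pfn)
{
        if (!(src->flags & VM_PAT_FLAG))
                return 0;
        reservations++;                 /* reserve_pfn_range() succeeded */
        dst->flags |= VM_PAT_FLAG;      /* only now does dst own a reservation */
        *pfn = 0x1000;                  /* remember what to undo later */
        return 0;
}

/* Modeled on untrack_pfn_copy(): drop the reservation and clear the flag. */
static void untrack_copy(struct fake_vma *dst, unsigned long pfn)
{
        (void)pfn;
        reservations--;
        dst->flags &= ~VM_PAT_FLAG;
}

static bool copy_page_tables(void)
{
        return false;                   /* simulate the fork()-time -ENOMEM */
}

int main(void)
{
        struct fake_vma src = { .flags = VM_PAT_FLAG };
        struct fake_vma dst = { .flags = 0 };   /* vm_area_dup() cleared it */
        unsigned long pfn = 0;

        if (track_copy(&dst, &src, &pfn))
                return 1;
        if (!copy_page_tables() && (src.flags & VM_PAT_FLAG))
                untrack_copy(&dst, pfn);

        printf("reservations left: %d, dst flag: %u\n",
               reservations, dst.flags & VM_PAT_FLAG);
        return 0;
}

Both numbers come out zero: nothing is leaked and the aborted child's vma no
longer claims a reservation it does not hold, so its removal stays quiet.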