--- /dev/null
+From 5037b342825df7094a4906d1e2a9674baab50cb2 Mon Sep 17 00:00:00 2001
+From: Robbie Ko <robbieko@synology.com>
+Date: Thu, 11 Dec 2025 13:30:33 +0800
+Subject: btrfs: fix deadlock in wait_current_trans() due to ignored transaction type
+
+From: Robbie Ko <robbieko@synology.com>
+
+commit 5037b342825df7094a4906d1e2a9674baab50cb2 upstream.
+
+When wait_current_trans() is called during start_transaction(), it
+currently waits for a blocked transaction without considering whether
+the given transaction type actually needs to wait for that particular
+transaction state. The btrfs_blocked_trans_types[] array already defines
+which transaction types should wait for which transaction states, but
+this check was missing in wait_current_trans().
+
+This can lead to a deadlock scenario involving two transactions and
+pending ordered extents:
+
+ 1. Transaction A is in TRANS_STATE_COMMIT_DOING state
+
+ 2. A worker processing an ordered extent calls start_transaction()
+ with TRANS_JOIN
+
+ 3. join_transaction() returns -EBUSY because Transaction A is in
+ TRANS_STATE_COMMIT_DOING
+
+ 4. Transaction A moves to TRANS_STATE_UNBLOCKED and completes
+
+ 5. A new Transaction B is created (TRANS_STATE_RUNNING)
+
+ 6. The ordered extent from step 2 is added to Transaction B's
+ pending ordered extents
+
+ 7. Another task immediately starts committing Transaction B, which
+    enters TRANS_STATE_COMMIT_START
+
+ 8. The worker finally reaches wait_current_trans(), sees Transaction B
+ in TRANS_STATE_COMMIT_START (a blocked state), and waits
+ unconditionally
+
+ 9. However, TRANS_JOIN should NOT wait for TRANS_STATE_COMMIT_START
+ according to btrfs_blocked_trans_types[]
+
+ 10. Transaction B is waiting for pending ordered extents to complete
+
+ 11. Deadlock: Transaction B waits for ordered extent, ordered extent
+ waits for Transaction B
+
+This can be illustrated by the following call stacks:
+ CPU0                                        CPU1
+ btrfs_finish_ordered_io()
+   start_transaction(TRANS_JOIN)
+     join_transaction()
+     # -EBUSY (Transaction A is
+     #  TRANS_STATE_COMMIT_DOING)
+                                             # Transaction A completes
+                                             # Transaction B created
+                                             # ordered extent added to
+                                             #  Transaction B's pending list
+                                             btrfs_commit_transaction()
+                                             # Transaction B enters
+                                             #  TRANS_STATE_COMMIT_START
+                                             # waiting for pending ordered
+                                             #  extents
+     wait_current_trans()
+     # waits for Transaction B
+     # (should not wait!)
+
+Task bstore_kv_sync in btrfs_commit_transaction waiting for ordered
+extents:
+
+ __schedule+0x2e7/0x8a0
+ schedule+0x64/0xe0
+ btrfs_commit_transaction+0xbf7/0xda0 [btrfs]
+ btrfs_sync_file+0x342/0x4d0 [btrfs]
+ __x64_sys_fdatasync+0x4b/0x80
+ do_syscall_64+0x33/0x40
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+Task kworker in wait_current_trans waiting for transaction commit:
+
+ Workqueue: btrfs-syno_nocow btrfs_work_helper [btrfs]
+ __schedule+0x2e7/0x8a0
+ schedule+0x64/0xe0
+ wait_current_trans+0xb0/0x110 [btrfs]
+ start_transaction+0x346/0x5b0 [btrfs]
+ btrfs_finish_ordered_io.isra.0+0x49b/0x9c0 [btrfs]
+ btrfs_work_helper+0xe8/0x350 [btrfs]
+ process_one_work+0x1d3/0x3c0
+ worker_thread+0x4d/0x3e0
+ kthread+0x12d/0x150
+ ret_from_fork+0x1f/0x30
+
+Fix this by passing the transaction type to wait_current_trans() and
+checking btrfs_blocked_trans_types[cur_trans->state] against the given
+type before deciding to wait. This ensures that transaction types which
+are allowed to join during certain blocked states will not unnecessarily
+wait and cause deadlocks.
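+
+For reference, btrfs_blocked_trans_types[] maps each transaction state to
+the set of handle types that must wait in that state. The entries relevant
+to this deadlock look roughly like this (abridged, illustrative sketch;
+see fs/btrfs/transaction.c for the authoritative table):
+
+    [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH),
+            /* __TRANS_JOIN is absent: an ordered-extent worker may still
+             * join while the commit is only starting. */
+    [TRANS_STATE_COMMIT_DOING] = (__TRANS_START | __TRANS_ATTACH |
+                                  __TRANS_JOIN | __TRANS_JOIN_NOSTART),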
+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Robbie Ko <robbieko@synology.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Cc: Motiejus Jakštys <motiejus@jakstys.lt>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/transaction.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -518,13 +518,14 @@ static inline int is_transaction_blocked
+ * when this is done, it is safe to start a new transaction, but the current
+ * transaction might not be fully on disk.
+ */
+-static void wait_current_trans(struct btrfs_fs_info *fs_info)
++static void wait_current_trans(struct btrfs_fs_info *fs_info, unsigned int type)
+ {
+ struct btrfs_transaction *cur_trans;
+
+ spin_lock(&fs_info->trans_lock);
+ cur_trans = fs_info->running_transaction;
+- if (cur_trans && is_transaction_blocked(cur_trans)) {
++ if (cur_trans && is_transaction_blocked(cur_trans) &&
++ (btrfs_blocked_trans_types[cur_trans->state] & type)) {
+ refcount_inc(&cur_trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+@@ -699,12 +700,12 @@ again:
+ sb_start_intwrite(fs_info->sb);
+
+ if (may_wait_transaction(fs_info, type))
+- wait_current_trans(fs_info);
++ wait_current_trans(fs_info, type);
+
+ do {
+ ret = join_transaction(fs_info, type);
+ if (ret == -EBUSY) {
+- wait_current_trans(fs_info);
++ wait_current_trans(fs_info, type);
+ if (unlikely(type == TRANS_ATTACH ||
+ type == TRANS_JOIN_NOSTART))
+ ret = -ENOENT;
+@@ -1001,7 +1002,7 @@ out:
+
+ void btrfs_throttle(struct btrfs_fs_info *fs_info)
+ {
+- wait_current_trans(fs_info);
++ wait_current_trans(fs_info, TRANS_START);
+ }
+
+ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
--- /dev/null
+From 3644f4411713f52bf231574aa8759e3d8e20b341 Mon Sep 17 00:00:00 2001
+From: Nathan Chancellor <nathan@kernel.org>
+Date: Wed, 22 Oct 2025 00:49:08 +0200
+Subject: HID: intel-ish-hid: Fix -Wcast-function-type-strict in devm_ishtp_alloc_workqueue()
+
+From: Nathan Chancellor <nathan@kernel.org>
+
+commit 3644f4411713f52bf231574aa8759e3d8e20b341 upstream.
+
+Clang warns (or errors with CONFIG_WERROR=y / W=e):
+
+ drivers/hid/intel-ish-hid/ipc/ipc.c:935:36: error: cast from 'void (*)(struct workqueue_struct *)' to 'void (*)(void *)' converts to incompatible function type [-Werror,-Wcast-function-type-strict]
+ 935 | if (devm_add_action_or_reset(dev, (void (*)(void *))destroy_workqueue,
+ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ include/linux/device/devres.h:168:34: note: expanded from macro 'devm_add_action_or_reset'
+ 168 | __devm_add_action_or_ireset(dev, action, data, #action)
+ | ^~~~~~
+
+This warning points out that a kernel control flow integrity (kCFI /
+CONFIG_CFI=y) violation will occur due to this function cast when
+destroy_workqueue() is indirectly called via devm_action_release(),
+because the prototype of destroy_workqueue() does not match the
+prototype of (*action)().
+
+Use a local function with the correct prototype to wrap
+destroy_workqueue() to resolve the warning and CFI violation.
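+
+A minimal standalone illustration of the pattern (userspace sketch with
+hypothetical names; the actual fix is the hunk below): the devres core
+only knows the action as void (*)(void *) and calls it indirectly, so
+kCFI requires the target function to be defined with exactly that
+prototype, which a thin wrapper provides without any cast.
+
+    #include <stdio.h>
+
+    struct workqueue { int id; };                  /* stand-in type */
+
+    static void destroy_wq(struct workqueue *wq)   /* "real" prototype */
+    {
+            printf("destroy %d\n", wq->id);
+    }
+
+    typedef void (*devres_action_t)(void *);       /* what the core expects */
+
+    static void destroy_wq_action(void *data)      /* matching wrapper */
+    {
+            destroy_wq(data);
+    }
+
+    int main(void)
+    {
+            struct workqueue wq = { .id = 1 };
+            devres_action_t action = destroy_wq_action;  /* no cast needed */
+            action(&wq);                                 /* CFI-clean call */
+            return 0;
+    }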
+
+Reported-by: kernel test robot <lkp@intel.com>
+Closes: https://lore.kernel.org/oe-kbuild-all/202510190103.qTZvfdjj-lkp@intel.com/
+Closes: https://github.com/ClangBuiltLinux/linux/issues/2139
+Fixes: 0d30dae38fe0 ("HID: intel-ish-hid: Use dedicated unbound workqueues to prevent resume blocking")
+Signed-off-by: Nathan Chancellor <nathan@kernel.org>
+Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Reviewed-by: Zhang Lixu <lixu.zhang@intel.com>
+Signed-off-by: Jiri Kosina <jkosina@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/hid/intel-ish-hid/ipc/ipc.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/drivers/hid/intel-ish-hid/ipc/ipc.c
++++ b/drivers/hid/intel-ish-hid/ipc/ipc.c
+@@ -933,6 +933,11 @@ static const struct ishtp_hw_ops ish_hw_
+ .dma_no_cache_snooping = _dma_no_cache_snooping
+ };
+
++static void ishtp_free_workqueue(void *wq)
++{
++ destroy_workqueue(wq);
++}
++
+ static struct workqueue_struct *devm_ishtp_alloc_workqueue(struct device *dev)
+ {
+ struct workqueue_struct *wq;
+@@ -941,8 +946,7 @@ static struct workqueue_struct *devm_ish
+ if (!wq)
+ return NULL;
+
+- if (devm_add_action_or_reset(dev, (void (*)(void *))destroy_workqueue,
+- wq))
++ if (devm_add_action_or_reset(dev, ishtp_free_workqueue, wq))
+ return NULL;
+
+ return wq;
--- /dev/null
+From 0d30dae38fe01cd1de358c6039a0b1184689fe51 Mon Sep 17 00:00:00 2001
+From: Zhang Lixu <lixu.zhang@intel.com>
+Date: Fri, 10 Oct 2025 13:52:54 +0800
+Subject: HID: intel-ish-hid: Use dedicated unbound workqueues to prevent resume blocking
+
+From: Zhang Lixu <lixu.zhang@intel.com>
+
+commit 0d30dae38fe01cd1de358c6039a0b1184689fe51 upstream.
+
+During suspend/resume tests with S2IDLE, some ISH functional failures were
+observed because of delays in executing the ISH resume handler. The resume
+handler uses schedule_work() to do the actual work. schedule_work() uses
+system_wq, which is a per-CPU workqueue. Although the queuing is not bound
+to a particular CPU, it prefers the local CPU of the caller unless that is
+prohibited.
+
+Users of this work queue are not supposed to queue long running work.
+But in practice, there are scenarios where long running work items are
+queued on other unbound workqueues, occupying the CPU. As a result, the
+ISH resume handler may not get a chance to execute in a timely manner.
+
+In one scenario, one of the ish_resume_handler() executions was delayed
+nearly 1 second because another work item on an unbound workqueue occupied
+the same CPU. This delay caused ISH functionality failures.
+
+A similar issue was previously observed where the ISH HID driver timed out
+while getting the HID descriptor during S4 resume in the recovery kernel,
+likely caused by the same workqueue contention problem.
+
+Create dedicated unbound workqueues for all ISH operations to allow work
+items to execute on any available CPU, eliminating CPU-specific bottlenecks
+and improving resume reliability under varying system loads. ISH also has
+three components: a bus driver which implements the ISH protocols, a PCI
+interface layer, and a HID interface. Use one dedicated workqueue for all
+of them.
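+
+In short, the change replaces system_wq usage with a driver-owned unbound
+workqueue, roughly (sketch only; the hunks below are the actual changes):
+
+    /* before: shared, CPU-local system_wq */
+    schedule_work(&resume_work);
+
+    /* after: dedicated WQ_UNBOUND queue, runnable on any idle CPU */
+    dev->unbound_wq = alloc_workqueue("ishtp_unbound", WQ_UNBOUND, 0);
+    queue_work(dev->unbound_wq, &resume_work);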
+
+Signed-off-by: Zhang Lixu <lixu.zhang@intel.com>
+Signed-off-by: Jiri Kosina <jkosina@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/hid/intel-ish-hid/ipc/ipc.c | 21 ++++++++++++++++++++-
+ drivers/hid/intel-ish-hid/ipc/pci-ish.c | 2 +-
+ drivers/hid/intel-ish-hid/ishtp-hid-client.c | 4 ++--
+ drivers/hid/intel-ish-hid/ishtp/bus.c | 18 +++++++++++++++++-
+ drivers/hid/intel-ish-hid/ishtp/hbm.c | 4 ++--
+ drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h | 3 +++
+ include/linux/intel-ish-client-if.h | 2 ++
+ 7 files changed, 47 insertions(+), 7 deletions(-)
+
+--- a/drivers/hid/intel-ish-hid/ipc/ipc.c
++++ b/drivers/hid/intel-ish-hid/ipc/ipc.c
+@@ -628,7 +628,7 @@ static void recv_ipc(struct ishtp_device
+ if (!ishtp_dev) {
+ ishtp_dev = dev;
+ }
+- schedule_work(&fw_reset_work);
++ queue_work(dev->unbound_wq, &fw_reset_work);
+ break;
+
+ case MNG_RESET_NOTIFY_ACK:
+@@ -933,6 +933,21 @@ static const struct ishtp_hw_ops ish_hw_
+ .dma_no_cache_snooping = _dma_no_cache_snooping
+ };
+
++static struct workqueue_struct *devm_ishtp_alloc_workqueue(struct device *dev)
++{
++ struct workqueue_struct *wq;
++
++ wq = alloc_workqueue("ishtp_unbound_%d", WQ_UNBOUND, 0, dev->id);
++ if (!wq)
++ return NULL;
++
++ if (devm_add_action_or_reset(dev, (void (*)(void *))destroy_workqueue,
++ wq))
++ return NULL;
++
++ return wq;
++}
++
+ /**
+ * ish_dev_init() -Initialize ISH devoce
+ * @pdev: PCI device
+@@ -953,6 +968,10 @@ struct ishtp_device *ish_dev_init(struct
+ if (!dev)
+ return NULL;
+
++ dev->unbound_wq = devm_ishtp_alloc_workqueue(&pdev->dev);
++ if (!dev->unbound_wq)
++ return NULL;
++
+ dev->devc = &pdev->dev;
+ ishtp_device_init(dev);
+
+--- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c
++++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c
+@@ -384,7 +384,7 @@ static int __maybe_unused ish_resume(str
+ ish_resume_device = device;
+ dev->resume_flag = 1;
+
+- schedule_work(&resume_work);
++ queue_work(dev->unbound_wq, &resume_work);
+
+ return 0;
+ }
+--- a/drivers/hid/intel-ish-hid/ishtp-hid-client.c
++++ b/drivers/hid/intel-ish-hid/ishtp-hid-client.c
+@@ -860,7 +860,7 @@ static int hid_ishtp_cl_reset(struct ish
+ hid_ishtp_trace(client_data, "%s hid_ishtp_cl %p\n", __func__,
+ hid_ishtp_cl);
+
+- schedule_work(&client_data->work);
++ queue_work(ishtp_get_workqueue(cl_device), &client_data->work);
+
+ return 0;
+ }
+@@ -902,7 +902,7 @@ static int hid_ishtp_cl_resume(struct de
+
+ hid_ishtp_trace(client_data, "%s hid_ishtp_cl %p\n", __func__,
+ hid_ishtp_cl);
+- schedule_work(&client_data->resume_work);
++ queue_work(ishtp_get_workqueue(cl_device), &client_data->resume_work);
+ return 0;
+ }
+
+--- a/drivers/hid/intel-ish-hid/ishtp/bus.c
++++ b/drivers/hid/intel-ish-hid/ishtp/bus.c
+@@ -541,7 +541,7 @@ void ishtp_cl_bus_rx_event(struct ishtp_
+ return;
+
+ if (device->event_cb)
+- schedule_work(&device->event_work);
++ queue_work(device->ishtp_dev->unbound_wq, &device->event_work);
+ }
+
+ /**
+@@ -877,6 +877,22 @@ struct device *ishtp_get_pci_device(stru
+ EXPORT_SYMBOL(ishtp_get_pci_device);
+
+ /**
++ * ishtp_get_workqueue - Retrieve the workqueue associated with an ISHTP device
++ * @cl_device: Pointer to the ISHTP client device structure
++ *
++ * Returns the workqueue_struct pointer (unbound_wq) associated with the given
++ * ISHTP client device. This workqueue is typically used for scheduling work
++ * related to the device.
++ *
++ * Return: Pointer to struct workqueue_struct.
++ */
++struct workqueue_struct *ishtp_get_workqueue(struct ishtp_cl_device *cl_device)
++{
++ return cl_device->ishtp_dev->unbound_wq;
++}
++EXPORT_SYMBOL(ishtp_get_workqueue);
++
++/**
+ * ishtp_trace_callback() - Return trace callback
+ * @cl_device: ISH-TP client device instance
+ *
+--- a/drivers/hid/intel-ish-hid/ishtp/hbm.c
++++ b/drivers/hid/intel-ish-hid/ishtp/hbm.c
+@@ -573,7 +573,7 @@ void ishtp_hbm_dispatch(struct ishtp_dev
+
+ /* Start firmware loading process if it has loader capability */
+ if (version_res->host_version_supported & ISHTP_SUPPORT_CAP_LOADER)
+- schedule_work(&dev->work_fw_loader);
++ queue_work(dev->unbound_wq, &dev->work_fw_loader);
+
+ dev->version.major_version = HBM_MAJOR_VERSION;
+ dev->version.minor_version = HBM_MINOR_VERSION;
+@@ -864,7 +864,7 @@ void recv_hbm(struct ishtp_device *dev,
+ dev->rd_msg_fifo_tail = (dev->rd_msg_fifo_tail + IPC_PAYLOAD_SIZE) %
+ (RD_INT_FIFO_SIZE * IPC_PAYLOAD_SIZE);
+ spin_unlock_irqrestore(&dev->rd_msg_spinlock, flags);
+- schedule_work(&dev->bh_hbm_work);
++ queue_work(dev->unbound_wq, &dev->bh_hbm_work);
+ eoi:
+ return;
+ }
+--- a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
++++ b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
+@@ -175,6 +175,9 @@ struct ishtp_device {
+ struct hbm_version version;
+ int transfer_path; /* Choice of transfer path: IPC or DMA */
+
++ /* Alloc a dedicated unbound workqueue for ishtp device */
++ struct workqueue_struct *unbound_wq;
++
+ /* work structure for scheduling firmware loading tasks */
+ struct work_struct work_fw_loader;
+ /* waitq for waiting for command response from the firmware loader */
+--- a/include/linux/intel-ish-client-if.h
++++ b/include/linux/intel-ish-client-if.h
+@@ -87,6 +87,8 @@ bool ishtp_wait_resume(struct ishtp_devi
+ ishtp_print_log ishtp_trace_callback(struct ishtp_cl_device *cl_device);
+ /* Get device pointer of PCI device for DMA acces */
+ struct device *ishtp_get_pci_device(struct ishtp_cl_device *cl_device);
++/* Get the ISHTP workqueue */
++struct workqueue_struct *ishtp_get_workqueue(struct ishtp_cl_device *cl_device);
+
+ struct ishtp_cl *ishtp_cl_allocate(struct ishtp_cl_device *cl_device);
+ void ishtp_cl_free(struct ishtp_cl *cl);
--- /dev/null
+From e37d5a2d60a338c5917c45296bac65da1382eda5 Mon Sep 17 00:00:00 2001
+From: Lu Baolu <baolu.lu@linux.intel.com>
+Date: Wed, 22 Oct 2025 16:26:34 +0800
+Subject: iommu/sva: invalidate stale IOTLB entries for kernel address space
+
+From: Lu Baolu <baolu.lu@linux.intel.com>
+
+commit e37d5a2d60a338c5917c45296bac65da1382eda5 upstream.
+
+Introduce a new IOMMU interface to flush IOTLB paging cache entries for
+the CPU kernel address space. This interface is invoked from the x86
+architecture code that manages combined user and kernel page tables,
+specifically before any kernel page table page is freed and reused.
+
+This addresses the main issue with vfree() which is a common occurrence
+and can be triggered by unprivileged users. While this resolves the
+primary problem, it doesn't address some extremely rare case related to
+memory unplug of memory that was present as reserved memory at boot, which
+cannot be triggered by unprivileged users. The discussion can be found at
+the link below.
+
+Enable SVA on x86 architecture since the IOMMU can now receive
+notification to flush the paging cache before freeing the CPU kernel page
+table pages.
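+
+Condensed, the flow added by this patch is (sketch assembled from the
+hunks below, declarations omitted):
+
+    /* mm side: flush once per batch, before freeing the pages */
+    iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL);
+    list_for_each_entry_safe(pt, next, &page_list, pt_list)
+            __pagetable_free(pt);
+
+    /* IOMMU side: notify every mm that currently has an SVA binding */
+    list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
+            mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm,
+                                                        start, end);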
+
+Link: https://lkml.kernel.org/r/20251022082635.2462433-9-baolu.lu@linux.intel.com
+Link: https://lore.kernel.org/linux-iommu/04983c62-3b1d-40d4-93ae-34ca04b827e5@intel.com/
+Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Suggested-by: Jann Horn <jannh@google.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Betkov <bp@alien8.de>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Robin Murohy <robin.murphy@arm.com>
+Cc: Thomas Gleinxer <tglx@linutronix.de>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yi Lai <yi1.lai@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/Kconfig | 1 +
+ drivers/iommu/iommu-sva.c | 32 ++++++++++++++++++++++++++++----
+ include/linux/iommu.h | 4 ++++
+ mm/pgtable-generic.c | 2 ++
+ 4 files changed, 35 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -279,6 +279,7 @@ config X86
+ select HAVE_PCI
+ select HAVE_PERF_REGS
+ select HAVE_PERF_USER_STACK_DUMP
++ select ASYNC_KERNEL_PGTABLE_FREE if IOMMU_SVA
+ select MMU_GATHER_RCU_TABLE_FREE
+ select MMU_GATHER_MERGE_VMAS
+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK
+--- a/drivers/iommu/iommu-sva.c
++++ b/drivers/iommu/iommu-sva.c
+@@ -10,6 +10,8 @@
+ #include "iommu-priv.h"
+
+ static DEFINE_MUTEX(iommu_sva_lock);
++static bool iommu_sva_present;
++static LIST_HEAD(iommu_sva_mms);
+ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
+ struct mm_struct *mm);
+
+@@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc
+ return ERR_PTR(-ENOSPC);
+ }
+ iommu_mm->pasid = pasid;
++ iommu_mm->mm = mm;
+ INIT_LIST_HEAD(&iommu_mm->sva_domains);
+ /*
+ * Make sure the write to mm->iommu_mm is not reordered in front of
+@@ -77,9 +80,6 @@ struct iommu_sva *iommu_sva_bind_device(
+ if (!group)
+ return ERR_PTR(-ENODEV);
+
+- if (IS_ENABLED(CONFIG_X86))
+- return ERR_PTR(-EOPNOTSUPP);
+-
+ mutex_lock(&iommu_sva_lock);
+
+ /* Allocate mm->pasid if necessary. */
+@@ -135,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device(
+ if (ret)
+ goto out_free_domain;
+ domain->users = 1;
+- list_add(&domain->next, &mm->iommu_mm->sva_domains);
+
++ if (list_empty(&iommu_mm->sva_domains)) {
++ if (list_empty(&iommu_sva_mms))
++ iommu_sva_present = true;
++ list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
++ }
++ list_add(&domain->next, &iommu_mm->sva_domains);
+ out:
+ refcount_set(&handle->users, 1);
+ mutex_unlock(&iommu_sva_lock);
+@@ -178,6 +183,13 @@ void iommu_sva_unbind_device(struct iomm
+ list_del(&domain->next);
+ iommu_domain_free(domain);
+ }
++
++ if (list_empty(&iommu_mm->sva_domains)) {
++ list_del(&iommu_mm->mm_list_elm);
++ if (list_empty(&iommu_sva_mms))
++ iommu_sva_present = false;
++ }
++
+ mutex_unlock(&iommu_sva_lock);
+ kfree(handle);
+ }
+@@ -315,3 +327,15 @@ static struct iommu_domain *iommu_sva_do
+
+ return domain;
+ }
++
++void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
++{
++ struct iommu_mm_data *iommu_mm;
++
++ guard(mutex)(&iommu_sva_lock);
++ if (!iommu_sva_present)
++ return;
++
++ list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
++ mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
++}
+--- a/include/linux/iommu.h
++++ b/include/linux/iommu.h
+@@ -1134,7 +1134,9 @@ struct iommu_sva {
+
+ struct iommu_mm_data {
+ u32 pasid;
++ struct mm_struct *mm;
+ struct list_head sva_domains;
++ struct list_head mm_list_elm;
+ };
+
+ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode);
+@@ -1615,6 +1617,7 @@ struct iommu_sva *iommu_sva_bind_device(
+ struct mm_struct *mm);
+ void iommu_sva_unbind_device(struct iommu_sva *handle);
+ u32 iommu_sva_get_pasid(struct iommu_sva *handle);
++void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end);
+ #else
+ static inline struct iommu_sva *
+ iommu_sva_bind_device(struct device *dev, struct mm_struct *mm)
+@@ -1639,6 +1642,7 @@ static inline u32 mm_get_enqcmd_pasid(st
+ }
+
+ static inline void mm_pasid_drop(struct mm_struct *mm) {}
++static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {}
+ #endif /* CONFIG_IOMMU_SVA */
+
+ #ifdef CONFIG_IOMMU_IOPF
+--- a/mm/pgtable-generic.c
++++ b/mm/pgtable-generic.c
+@@ -13,6 +13,7 @@
+ #include <linux/swap.h>
+ #include <linux/swapops.h>
+ #include <linux/mm_inline.h>
++#include <linux/iommu.h>
+ #include <asm/pgalloc.h>
+ #include <asm/tlb.h>
+
+@@ -430,6 +431,7 @@ static void kernel_pgtable_work_func(str
+ list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
+ spin_unlock(&kernel_pgtable_work.lock);
+
++ iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL);
+ list_for_each_entry_safe(pt, next, &page_list, pt_list)
+ __pagetable_free(pt);
+ }
--- /dev/null
+From 977870522af34359b461060597ee3a86f27450d6 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Wed, 22 Oct 2025 16:26:29 +0800
+Subject: mm: actually mark kernel page table pages
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 977870522af34359b461060597ee3a86f27450d6 upstream.
+
+Now that the API is in place, mark kernel page table pages just after they
+are allocated. Unmark them just before they are freed.
+
+Note: Unconditionally clearing the 'kernel' marking (via
+ptdesc_clear_kernel()) would be functionally identical to what is here.
+But having the if() makes it logically clear that this function can be
+used for kernel and non-kernel page tables.
+
+Link: https://lkml.kernel.org/r/20251022082635.2462433-4-baolu.lu@linux.intel.com
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Betkov <bp@alien8.de>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Robin Murohy <robin.murphy@arm.com>
+Cc: Thomas Gleinxer <tglx@linutronix.de>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vasant Hegde <vasant.hegde@amd.com>
+Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yi Lai <yi1.lai@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/asm-generic/pgalloc.h | 18 ++++++++++++++++++
+ include/linux/mm.h | 3 +++
+ 2 files changed, 21 insertions(+)
+
+--- a/include/asm-generic/pgalloc.h
++++ b/include/asm-generic/pgalloc.h
+@@ -28,6 +28,8 @@ static inline pte_t *__pte_alloc_one_ker
+ return NULL;
+ }
+
++ ptdesc_set_kernel(ptdesc);
++
+ return ptdesc_address(ptdesc);
+ }
+ #define __pte_alloc_one_kernel(...) alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__))
+@@ -146,6 +148,10 @@ static inline pmd_t *pmd_alloc_one_nopro
+ pagetable_free(ptdesc);
+ return NULL;
+ }
++
++ if (mm == &init_mm)
++ ptdesc_set_kernel(ptdesc);
++
+ return ptdesc_address(ptdesc);
+ }
+ #define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__))
+@@ -179,6 +185,10 @@ static inline pud_t *__pud_alloc_one_nop
+ return NULL;
+
+ pagetable_pud_ctor(ptdesc);
++
++ if (mm == &init_mm)
++ ptdesc_set_kernel(ptdesc);
++
+ return ptdesc_address(ptdesc);
+ }
+ #define __pud_alloc_one(...) alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__))
+@@ -233,6 +243,10 @@ static inline p4d_t *__p4d_alloc_one_nop
+ return NULL;
+
+ pagetable_p4d_ctor(ptdesc);
++
++ if (mm == &init_mm)
++ ptdesc_set_kernel(ptdesc);
++
+ return ptdesc_address(ptdesc);
+ }
+ #define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__))
+@@ -277,6 +291,10 @@ static inline pgd_t *__pgd_alloc_noprof(
+ return NULL;
+
+ pagetable_pgd_ctor(ptdesc);
++
++ if (mm == &init_mm)
++ ptdesc_set_kernel(ptdesc);
++
+ return ptdesc_address(ptdesc);
+ }
+ #define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__))
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3042,6 +3042,9 @@ static inline void pagetable_free(struct
+ {
+ struct page *page = ptdesc_page(pt);
+
++ if (ptdesc_test_kernel(pt))
++ ptdesc_clear_kernel(pt);
++
+ __free_pages(page, compound_order(page));
+ }
+
--- /dev/null
+From 27bfafac65d87c58639f5d7af1353ec1e7886963 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Wed, 22 Oct 2025 16:26:28 +0800
+Subject: mm: add a ptdesc flag to mark kernel page tables
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 27bfafac65d87c58639f5d7af1353ec1e7886963 upstream.
+
+The page tables used to map the kernel and userspace often have very
+different handling rules. There are frequently *_kernel() variants of
+functions just for kernel page tables. That's not great and has led to
+code duplication.
+
+Instead of having completely separate call paths, allow a 'ptdesc' to be
+marked as being for kernel mappings. Introduce helpers to set and clear
+this status.
+
+Note: this uses the PG_referenced bit. Page flags are a great fit for
+this since it is truly a single bit of information. Use PG_referenced
+itself because it's a fairly benign flag (as opposed to things like
+PG_lock). It's also (according to Willy) unlikely to go away any time
+soon.
+
+PG_referenced is not in PAGE_FLAGS_CHECK_AT_FREE. It does not need to be
+cleared before freeing the page, and pages coming out of the allocator
+should have it cleared. Regardless, introduce an API to clear it anyway.
+Having symmetry in the API makes it easier to change the underlying
+implementation later, like if there was a need to move to a
+PAGE_FLAGS_CHECK_AT_FREE bit.
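+
+Intended usage, as wired up by the follow-up patches in this series
+(sketch):
+
+    /* allocation: only page tables for the kernel's mm get marked */
+    if (mm == &init_mm)
+            ptdesc_set_kernel(ptdesc);
+
+    /* free: the flag selects the special handling for kernel tables */
+    if (ptdesc_test_kernel(pt))
+            ptdesc_clear_kernel(pt);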
+
+Link: https://lkml.kernel.org/r/20251022082635.2462433-3-baolu.lu@linux.intel.com
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Betkov <bp@alien8.de>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Robin Murohy <robin.murphy@arm.com>
+Cc: Thomas Gleinxer <tglx@linutronix.de>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vasant Hegde <vasant.hegde@amd.com>
+Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yi Lai <yi1.lai@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h | 41 +++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 41 insertions(+)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2947,6 +2947,7 @@ static inline pmd_t *pmd_alloc(struct mm
+ #endif /* CONFIG_MMU */
+
+ enum pt_flags {
++ PT_kernel = PG_referenced,
+ PT_reserved = PG_reserved,
+ /* High bits are used for zone/node/section */
+ };
+@@ -2973,6 +2974,46 @@ static inline bool pagetable_is_reserved
+ }
+
+ /**
++ * ptdesc_set_kernel - Mark a ptdesc used to map the kernel
++ * @ptdesc: The ptdesc to be marked
++ *
++ * Kernel page tables often need special handling. Set a flag so that
++ * the handling code knows this ptdesc will not be used for userspace.
++ */
++static inline void ptdesc_set_kernel(struct ptdesc *ptdesc)
++{
++ set_bit(PT_kernel, &ptdesc->pt_flags.f);
++}
++
++/**
++ * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel
++ * @ptdesc: The ptdesc to be unmarked
++ *
++ * Use when the ptdesc is no longer used to map the kernel and no longer
++ * needs special handling.
++ */
++static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc)
++{
++ /*
++ * Note: the 'PG_referenced' bit does not strictly need to be
++ * cleared before freeing the page. But this is nice for
++ * symmetry.
++ */
++ clear_bit(PT_kernel, &ptdesc->pt_flags.f);
++}
++
++/**
++ * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel
++ * @ptdesc: The ptdesc being tested
++ *
++ * Call to tell if the ptdesc used to map the kernel.
++ */
++static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc)
++{
++ return test_bit(PT_kernel, &ptdesc->pt_flags.f);
++}
++
++/**
+ * pagetable_alloc - Allocate pagetables
+ * @gfp: GFP flags
+ * @order: desired pagetable order
--- /dev/null
+From 5ba2f0a1556479638ac11a3c201421f5515e89f5 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Wed, 22 Oct 2025 16:26:33 +0800
+Subject: mm: introduce deferred freeing for kernel page tables
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 5ba2f0a1556479638ac11a3c201421f5515e89f5 upstream.
+
+This introduces a conditional asynchronous mechanism, enabled by
+CONFIG_ASYNC_KERNEL_PGTABLE_FREE. When enabled, this mechanism defers the
+freeing of pages that are used as page tables for kernel address mappings.
+These pages are now queued to a work struct instead of being freed
+immediately.
+
+This deferred freeing allows for batch-freeing of page tables, providing a
+safe context for performing a single expensive operation (TLB flush) for a
+batch of kernel page tables instead of performing that expensive operation
+for each page table.
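+
+The point of the batching is to turn a per-page expensive operation into a
+per-batch one, roughly (sketch; expensive_flush() and for_each_freed_pt()
+are hypothetical placeholders for illustration):
+
+    /* synchronous freeing: one expensive flush per page table */
+    for_each_freed_pt(pt) {
+            expensive_flush();
+            __pagetable_free(pt);
+    }
+
+    /* deferred freeing (this patch): queue pages, then in the worker
+     * pay for the expensive flush once for the whole batch */
+    expensive_flush();
+    list_for_each_entry_safe(pt, next, &page_list, pt_list)
+            __pagetable_free(pt);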
+
+Link: https://lkml.kernel.org/r/20251022082635.2462433-8-baolu.lu@linux.intel.com
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Betkov <bp@alien8.de>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Robin Murohy <robin.murphy@arm.com>
+Cc: Thomas Gleinxer <tglx@linutronix.de>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vasant Hegde <vasant.hegde@amd.com>
+Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yi Lai <yi1.lai@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h | 16 +++++++++++++---
+ mm/Kconfig | 3 +++
+ mm/pgtable-generic.c | 37 +++++++++++++++++++++++++++++++++++++
+ 3 files changed, 53 insertions(+), 3 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3038,6 +3038,14 @@ static inline void __pagetable_free(stru
+ __free_pages(page, compound_order(page));
+ }
+
++#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
++void pagetable_free_kernel(struct ptdesc *pt);
++#else
++static inline void pagetable_free_kernel(struct ptdesc *pt)
++{
++ __pagetable_free(pt);
++}
++#endif
+ /**
+ * pagetable_free - Free pagetables
+ * @pt: The page table descriptor
+@@ -3047,10 +3055,12 @@ static inline void __pagetable_free(stru
+ */
+ static inline void pagetable_free(struct ptdesc *pt)
+ {
+- if (ptdesc_test_kernel(pt))
++ if (ptdesc_test_kernel(pt)) {
+ ptdesc_clear_kernel(pt);
+-
+- __pagetable_free(pt);
++ pagetable_free_kernel(pt);
++ } else {
++ __pagetable_free(pt);
++ }
+ }
+
+ #if defined(CONFIG_SPLIT_PTE_PTLOCKS)
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -915,6 +915,9 @@ config HAVE_GIGANTIC_FOLIOS
+ def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \
+ (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+
++config ASYNC_KERNEL_PGTABLE_FREE
++ def_bool n
++
+ # TODO: Allow to be enabled without THP
+ config ARCH_SUPPORTS_HUGE_PFNMAP
+ def_bool n
+--- a/mm/pgtable-generic.c
++++ b/mm/pgtable-generic.c
+@@ -406,3 +406,40 @@ again:
+ pte_unmap_unlock(pte, ptl);
+ goto again;
+ }
++
++#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
++static void kernel_pgtable_work_func(struct work_struct *work);
++
++static struct {
++ struct list_head list;
++ /* protect above ptdesc lists */
++ spinlock_t lock;
++ struct work_struct work;
++} kernel_pgtable_work = {
++ .list = LIST_HEAD_INIT(kernel_pgtable_work.list),
++ .lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock),
++ .work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func),
++};
++
++static void kernel_pgtable_work_func(struct work_struct *work)
++{
++ struct ptdesc *pt, *next;
++ LIST_HEAD(page_list);
++
++ spin_lock(&kernel_pgtable_work.lock);
++ list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
++ spin_unlock(&kernel_pgtable_work.lock);
++
++ list_for_each_entry_safe(pt, next, &page_list, pt_list)
++ __pagetable_free(pt);
++}
++
++void pagetable_free_kernel(struct ptdesc *pt)
++{
++ spin_lock(&kernel_pgtable_work.lock);
++ list_add(&pt->pt_list, &kernel_pgtable_work.list);
++ spin_unlock(&kernel_pgtable_work.lock);
++
++ schedule_work(&kernel_pgtable_work.work);
++}
++#endif
--- /dev/null
+From 01894295672335ff304beed4359f30d14d5765f2 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Wed, 22 Oct 2025 16:26:31 +0800
+Subject: mm: introduce pure page table freeing function
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 01894295672335ff304beed4359f30d14d5765f2 upstream.
+
+The pages used for ptdescs are currently freed back to the allocator in a
+single location. They will shortly be freed from a second location.
+
+Create a simple helper that just frees them back to the allocator.
+
+Link: https://lkml.kernel.org/r/20251022082635.2462433-6-baolu.lu@linux.intel.com
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Betkov <bp@alien8.de>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Robin Murohy <robin.murphy@arm.com>
+Cc: Thomas Gleinxer <tglx@linutronix.de>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vasant Hegde <vasant.hegde@amd.com>
+Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yi Lai <yi1.lai@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h | 11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3031,6 +3031,13 @@ static inline struct ptdesc *pagetable_a
+ }
+ #define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))
+
++static inline void __pagetable_free(struct ptdesc *pt)
++{
++ struct page *page = ptdesc_page(pt);
++
++ __free_pages(page, compound_order(page));
++}
++
+ /**
+ * pagetable_free - Free pagetables
+ * @pt: The page table descriptor
+@@ -3040,12 +3047,10 @@ static inline struct ptdesc *pagetable_a
+ */
+ static inline void pagetable_free(struct ptdesc *pt)
+ {
+- struct page *page = ptdesc_page(pt);
+-
+ if (ptdesc_test_kernel(pt))
+ ptdesc_clear_kernel(pt);
+
+- __free_pages(page, compound_order(page));
++ __pagetable_free(pt);
+ }
+
+ #if defined(CONFIG_SPLIT_PTE_PTLOCKS)
--- /dev/null
+From fc4b909c368f3a7b08c895dd5926476b58e85312 Mon Sep 17 00:00:00 2001
+From: Joshua Hahn <joshua.hahnjy@gmail.com>
+Date: Tue, 14 Oct 2025 07:50:09 -0700
+Subject: mm/page_alloc: batch page freeing in decay_pcp_high
+
+From: Joshua Hahn <joshua.hahnjy@gmail.com>
+
+commit fc4b909c368f3a7b08c895dd5926476b58e85312 upstream.
+
+It is possible for pcp->count - pcp->high to exceed pcp->batch by a lot.
+When this happens, we should perform batching to ensure that
+free_pcppages_bulk isn't called with too many pages to free at once and
+starve out other threads that need the pcp or zone lock.
+
+Since we are still only freeing the difference between the initial
+pcp->count and pcp->high values, there should be no change to how many
+pages are freed.
+
+Link: https://lkml.kernel.org/r/20251014145011.3427205-3-joshua.hahnjy@gmail.com
+Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
+Suggested-by: Chris Mason <clm@fb.com>
+Suggested-by: Andrew Morton <akpm@linux-foundation.org>
+Co-developed-by: Johannes Weiner <hannes@cmpxchg.org>
+Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Brendan Jackman <jackmanb@google.com>
+Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: SeongJae Park <sj@kernel.org>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Zi Yan <ziy@nvidia.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 038a102535eb ("mm/page_alloc: prevent pcp corruption with SMP=n")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/page_alloc.c | 9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -2554,7 +2554,7 @@ static int rmqueue_bulk(struct zone *zon
+ */
+ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+ {
+- int high_min, to_drain, batch;
++ int high_min, to_drain, to_drain_batched, batch;
+ bool todo = false;
+
+ high_min = READ_ONCE(pcp->high_min);
+@@ -2572,11 +2572,14 @@ bool decay_pcp_high(struct zone *zone, s
+ }
+
+ to_drain = pcp->count - pcp->high;
+- if (to_drain > 0) {
++ while (to_drain > 0) {
++ to_drain_batched = min(to_drain, batch);
+ spin_lock(&pcp->lock);
+- free_pcppages_bulk(zone, to_drain, pcp, 0);
++ free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
+ spin_unlock(&pcp->lock);
+ todo = true;
++
++ to_drain -= to_drain_batched;
+ }
+
+ return todo;
--- /dev/null
+From 038a102535eb49e10e93eafac54352fcc5d78847 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Mon, 5 Jan 2026 16:08:56 +0100
+Subject: mm/page_alloc: prevent pcp corruption with SMP=n
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 038a102535eb49e10e93eafac54352fcc5d78847 upstream.
+
+The kernel test robot has reported:
+
+ BUG: spinlock trylock failure on UP on CPU#0, kcompactd0/28
+ lock: 0xffff888807e35ef0, .magic: dead4ead, .owner: kcompactd0/28, .owner_cpu: 0
+ CPU: 0 UID: 0 PID: 28 Comm: kcompactd0 Not tainted 6.18.0-rc5-00127-ga06157804399 #1 PREEMPT 8cc09ef94dcec767faa911515ce9e609c45db470
+ Call Trace:
+ <IRQ>
+ __dump_stack (lib/dump_stack.c:95)
+ dump_stack_lvl (lib/dump_stack.c:123)
+ dump_stack (lib/dump_stack.c:130)
+ spin_dump (kernel/locking/spinlock_debug.c:71)
+ do_raw_spin_trylock (kernel/locking/spinlock_debug.c:?)
+ _raw_spin_trylock (include/linux/spinlock_api_smp.h:89 kernel/locking/spinlock.c:138)
+ __free_frozen_pages (mm/page_alloc.c:2973)
+ ___free_pages (mm/page_alloc.c:5295)
+ __free_pages (mm/page_alloc.c:5334)
+ tlb_remove_table_rcu (include/linux/mm.h:? include/linux/mm.h:3122 include/asm-generic/tlb.h:220 mm/mmu_gather.c:227 mm/mmu_gather.c:290)
+ ? __cfi_tlb_remove_table_rcu (mm/mmu_gather.c:289)
+ ? rcu_core (kernel/rcu/tree.c:?)
+ rcu_core (include/linux/rcupdate.h:341 kernel/rcu/tree.c:2607 kernel/rcu/tree.c:2861)
+ rcu_core_si (kernel/rcu/tree.c:2879)
+ handle_softirqs (arch/x86/include/asm/jump_label.h:36 include/trace/events/irq.h:142 kernel/softirq.c:623)
+ __irq_exit_rcu (arch/x86/include/asm/jump_label.h:36 kernel/softirq.c:725)
+ irq_exit_rcu (kernel/softirq.c:741)
+ sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1052)
+ </IRQ>
+ <TASK>
+ RIP: 0010:_raw_spin_unlock_irqrestore (arch/x86/include/asm/preempt.h:95 include/linux/spinlock_api_smp.h:152 kernel/locking/spinlock.c:194)
+ free_pcppages_bulk (mm/page_alloc.c:1494)
+ drain_pages_zone (include/linux/spinlock.h:391 mm/page_alloc.c:2632)
+ __drain_all_pages (mm/page_alloc.c:2731)
+ drain_all_pages (mm/page_alloc.c:2747)
+ kcompactd (mm/compaction.c:3115)
+ kthread (kernel/kthread.c:465)
+ ? __cfi_kcompactd (mm/compaction.c:3166)
+ ? __cfi_kthread (kernel/kthread.c:412)
+ ret_from_fork (arch/x86/kernel/process.c:164)
+ ? __cfi_kthread (kernel/kthread.c:412)
+ ret_from_fork_asm (arch/x86/entry/entry_64.S:255)
+ </TASK>
+
+Matthew has analyzed the report and identified that in drain_pages_zone()
+we are in a section protected by spin_lock(&pcp->lock) and then get an
+interrupt that attempts spin_trylock() on the same lock. The code is
+designed to work this way without disabling IRQs and occasionally fail the
+trylock with a fallback. However, the SMP=n spinlock implementation
+assumes spin_trylock() will always succeed, and thus it's normally a
+no-op. Here the enabled lock debugging catches the problem, but otherwise
+it could cause a corruption of the pcp structure.
+
+The problem has been introduced by commit 574907741599 ("mm/page_alloc:
+leave IRQs enabled for per-cpu page allocations"). The pcp locking scheme
+recognizes the need for disabling IRQs to prevent nesting spin_trylock()
+sections on SMP=n, but the need to prevent the nesting in spin_lock() has
+not been recognized. Fix it by introducing local wrappers that change the
+spin_lock() to spin_lock_iqsave() with SMP=n and use them in all places
+that do spin_lock(&pcp->lock).
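+
+Concretely, the hazardous SMP=n interleaving and the fixed callers look
+roughly like this (sketch; on UP, spin_lock() only disables preemption and
+spin_trylock() unconditionally reports success):
+
+    /* task context, IRQs enabled (e.g. drain_pages_zone()): */
+    spin_lock(&pcp->lock);                  /* UP: just preempt_disable() */
+    free_pcppages_bulk(zone, to_drain, pcp, 0);
+            /* IRQ arrives here; the softirq frees a page: */
+            if (pcp_spin_trylock(pcp))      /* UP: always "succeeds" */
+                    ...                     /* second writer corrupts pcp */
+
+    /* fixed callers: */
+    pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+    free_pcppages_bulk(zone, to_drain, pcp, 0);
+    pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);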
+
+[vbabka@suse.cz: add pcp_ prefix to the spin_lock_irqsave wrappers, per Steven]
+Link: https://lkml.kernel.org/r/20260105-fix-pcp-up-v1-1-5579662d2071@suse.cz
+Fixes: 574907741599 ("mm/page_alloc: leave IRQs enabled for per-cpu page allocations")
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Reported-by: kernel test robot <oliver.sang@intel.com>
+Closes: https://lore.kernel.org/oe-lkp/202512101320.e2f2dd6f-lkp@intel.com
+Analyzed-by: Matthew Wilcox <willy@infradead.org>
+Link: https://lore.kernel.org/all/aUW05pyc9nZkvY-1@casper.infradead.org/
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Cc: Brendan Jackman <jackmanb@google.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Zi Yan <ziy@nvidia.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/page_alloc.c | 47 +++++++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 39 insertions(+), 8 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -166,6 +166,33 @@ static DEFINE_MUTEX(pcp_batch_high_lock)
+ #define pcp_spin_unlock(ptr) \
+ pcpu_spin_unlock(lock, ptr)
+
++/*
++ * With the UP spinlock implementation, when we spin_lock(&pcp->lock) (for i.e.
++ * a potentially remote cpu drain) and get interrupted by an operation that
++ * attempts pcp_spin_trylock(), we can't rely on the trylock failure due to UP
++ * spinlock assumptions making the trylock a no-op. So we have to turn that
++ * spin_lock() to a spin_lock_irqsave(). This works because on UP there are no
++ * remote cpu's so we can only be locking the only existing local one.
++ */
++#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
++static inline void __flags_noop(unsigned long *flags) { }
++#define pcp_spin_lock_maybe_irqsave(ptr, flags) \
++({ \
++ __flags_noop(&(flags)); \
++ spin_lock(&(ptr)->lock); \
++})
++#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \
++({ \
++ spin_unlock(&(ptr)->lock); \
++ __flags_noop(&(flags)); \
++})
++#else
++#define pcp_spin_lock_maybe_irqsave(ptr, flags) \
++ spin_lock_irqsave(&(ptr)->lock, flags)
++#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \
++ spin_unlock_irqrestore(&(ptr)->lock, flags)
++#endif
++
+ #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+ DEFINE_PER_CPU(int, numa_node);
+ EXPORT_PER_CPU_SYMBOL(numa_node);
+@@ -2555,6 +2582,7 @@ static int rmqueue_bulk(struct zone *zon
+ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+ {
+ int high_min, to_drain, to_drain_batched, batch;
++ unsigned long UP_flags;
+ bool todo = false;
+
+ high_min = READ_ONCE(pcp->high_min);
+@@ -2574,9 +2602,9 @@ bool decay_pcp_high(struct zone *zone, s
+ to_drain = pcp->count - pcp->high;
+ while (to_drain > 0) {
+ to_drain_batched = min(to_drain, batch);
+- spin_lock(&pcp->lock);
++ pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
+- spin_unlock(&pcp->lock);
++ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ todo = true;
+
+ to_drain -= to_drain_batched;
+@@ -2593,14 +2621,15 @@ bool decay_pcp_high(struct zone *zone, s
+ */
+ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
+ {
++ unsigned long UP_flags;
+ int to_drain, batch;
+
+ batch = READ_ONCE(pcp->batch);
+ to_drain = min(pcp->count, batch);
+ if (to_drain > 0) {
+- spin_lock(&pcp->lock);
++ pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ free_pcppages_bulk(zone, to_drain, pcp, 0);
+- spin_unlock(&pcp->lock);
++ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ }
+ }
+ #endif
+@@ -2611,10 +2640,11 @@ void drain_zone_pages(struct zone *zone,
+ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
+ {
+ struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
++ unsigned long UP_flags;
+ int count;
+
+ do {
+- spin_lock(&pcp->lock);
++ pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ count = pcp->count;
+ if (count) {
+ int to_drain = min(count,
+@@ -2623,7 +2653,7 @@ static void drain_pages_zone(unsigned in
+ free_pcppages_bulk(zone, to_drain, pcp, 0);
+ count -= to_drain;
+ }
+- spin_unlock(&pcp->lock);
++ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ } while (count);
+ }
+
+@@ -6081,6 +6111,7 @@ static void zone_pcp_update_cacheinfo(st
+ {
+ struct per_cpu_pages *pcp;
+ struct cpu_cacheinfo *cci;
++ unsigned long UP_flags;
+
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ cci = get_cpu_cacheinfo(cpu);
+@@ -6091,12 +6122,12 @@ static void zone_pcp_update_cacheinfo(st
+ * This can reduce zone lock contention without hurting
+ * cache-hot pages sharing.
+ */
+- spin_lock(&pcp->lock);
++ pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
+ pcp->flags |= PCPF_FREE_HIGH_BATCH;
+ else
+ pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
+- spin_unlock(&pcp->lock);
++ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ }
+
+ void setup_pcp_cacheinfo(unsigned int cpu)
--- /dev/null
+From 0acc67c4030c39f39ac90413cc5d0abddd3a9527 Mon Sep 17 00:00:00 2001
+From: Joshua Hahn <joshua.hahnjy@gmail.com>
+Date: Tue, 14 Oct 2025 07:50:08 -0700
+Subject: mm/page_alloc/vmstat: simplify refresh_cpu_vm_stats change detection
+
+From: Joshua Hahn <joshua.hahnjy@gmail.com>
+
+commit 0acc67c4030c39f39ac90413cc5d0abddd3a9527 upstream.
+
+Patch series "mm/page_alloc: Batch callers of free_pcppages_bulk", v5.
+
+Motivation & Approach
+=====================
+
+While testing workloads with high sustained memory pressure on large
+machines in the Meta fleet (1Tb memory, 316 CPUs), we saw an unexpectedly
+high number of softlockups. Further investigation showed that the zone
+lock in free_pcppages_bulk was being held for a long time, and that the
+function was called to free 2k+ pages over 100 times just during boot.
+
+This starves other processes of the zone lock, which can
+lead to the system stalling as multiple threads cannot make progress
+without the locks. We can see these issues manifesting as warnings:
+
+[ 4512.591979] rcu: INFO: rcu_sched self-detected stall on CPU
+[ 4512.604370] rcu: 20-....: (9312 ticks this GP) idle=a654/1/0x4000000000000000 softirq=309340/309344 fqs=5426
+[ 4512.626401] rcu: hardirqs softirqs csw/system
+[ 4512.638793] rcu: number: 0 145 0
+[ 4512.651177] rcu: cputime: 30 10410 174 ==> 10558(ms)
+[ 4512.666657] rcu: (t=21077 jiffies g=783665 q=1242213 ncpus=316)
+
+While these warnings don't indicate a crash or a kernel panic, they do
+point to the underlying issue of lock contention. To prevent starvation
+in both locks, batch the freeing of pages using pcp->batch.
+
+Because free_pcppages_bulk is called with the pcp lock and acquires the
+zone lock, relinquishing and reacquiring the locks is only effective when
+both of them are broken together (unless the system was built with queued
+spinlocks). Thus, instead of modifying free_pcppages_bulk to break both
+locks, batch the freeing from its callers.
+
+A similar fix has been implemented in the Meta fleet, and we have seen
+significantly less softlockups.
+
+Testing
+=======
+The following are a few synthetic benchmarks, made on three machines. The
+first is a large machine with 754GiB memory and 316 processors.
+The second is a relatively smaller machine with 251GiB memory and 176
+processors. The third and final is the smallest of the three, which has 62GiB
+memory and 36 processors.
+
+On all machines, I kick off a kernel build with -j$(nproc).
+Negative delta is better (faster compilation).
+
+Large machine (754GiB memory, 316 processors)
+make -j$(nproc)
++------------+---------------+-----------+
+| Metric (s) | Variation (%) | Delta(%) |
++------------+---------------+-----------+
+| real | 0.8070 | - 1.4865 |
+| user | 0.2823 | + 0.4081 |
+| sys | 5.0267 | -11.8737 |
++------------+---------------+-----------+
+
+Medium machine (251GiB memory, 176 processors)
+make -j$(nproc)
++------------+---------------+----------+
+| Metric (s) | Variation (%) | Delta(%) |
++------------+---------------+----------+
+| real | 0.2806 | +0.0351 |
+| user | 0.0994 | +0.3170 |
+| sys | 0.6229 | -0.6277 |
++------------+---------------+----------+
+
+Small machine (62GiB memory, 36 processors)
+make -j$(nproc)
++------------+---------------+----------+
+| Metric (s) | Variation (%) | Delta(%) |
++------------+---------------+----------+
+| real | 0.1503 | -2.6585 |
+| user | 0.0431 | -2.2984 |
+| sys | 0.1870 | -3.2013 |
++------------+---------------+----------+
+
+Here, variation is the coefficient of variation, i.e. standard deviation
+/ mean.
+
+Based on these results, it seems like there are varying degrees to how
+much lock contention this reduces. For the largest and smallest machines
+that I ran the tests on, it seems like there is quite some significant
+reduction. There are also some performance increases visible from
+userspace.
+
+Interestingly, the performance gains don't scale with the size of the
+machine, but rather there seems to be a dip in the gains for the
+medium-sized machine. One possible theory is that because the high
+watermark depends on both memory and the number of local CPUs, what
+impacts zone contention the most is not these individual values, but
+rather the ratio of mem:processors.
+
+
+This patch (of 5):
+
+Currently, refresh_cpu_vm_stats returns an int, indicating how many
+changes were made during its updates. Using this information, callers
+like vmstat_update can heuristically determine if more work will be done
+in the future.
+
+However, all of refresh_cpu_vm_stats's callers either (a) ignore the
+result, only caring about performing the updates, or (b) only care about
+whether changes were made, but not *how many* changes were made.
+
+Simplify the code by returning a bool instead to indicate if updates
+were made.
+
+In addition, simplify fold_diff and decay_pcp_high to return a bool
+for the same reason.
+
+Link: https://lkml.kernel.org/r/20251014145011.3427205-1-joshua.hahnjy@gmail.com
+Link: https://lkml.kernel.org/r/20251014145011.3427205-2-joshua.hahnjy@gmail.com
+Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
+Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
+Reviewed-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Jackman <jackmanb@google.com>
+Cc: Chris Mason <clm@fb.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Zi Yan <ziy@nvidia.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 038a102535eb ("mm/page_alloc: prevent pcp corruption with SMP=n")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/gfp.h | 2 +-
+ mm/page_alloc.c | 8 ++++----
+ mm/vmstat.c | 28 +++++++++++++++-------------
+ 3 files changed, 20 insertions(+), 18 deletions(-)
+
+--- a/include/linux/gfp.h
++++ b/include/linux/gfp.h
+@@ -387,7 +387,7 @@ extern void free_pages(unsigned long add
+ #define free_page(addr) free_pages((addr), 0)
+
+ void page_alloc_init_cpuhp(void);
+-int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
++bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
+ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
+ void drain_all_pages(struct zone *zone);
+ void drain_local_pages(struct zone *zone);
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -2552,10 +2552,10 @@ static int rmqueue_bulk(struct zone *zon
+ * Called from the vmstat counter updater to decay the PCP high.
+ * Return whether there are addition works to do.
+ */
+-int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
++bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+ {
+ int high_min, to_drain, batch;
+- int todo = 0;
++ bool todo = false;
+
+ high_min = READ_ONCE(pcp->high_min);
+ batch = READ_ONCE(pcp->batch);
+@@ -2568,7 +2568,7 @@ int decay_pcp_high(struct zone *zone, st
+ pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
+ pcp->high - (pcp->high >> 3), high_min);
+ if (pcp->high > high_min)
+- todo++;
++ todo = true;
+ }
+
+ to_drain = pcp->count - pcp->high;
+@@ -2576,7 +2576,7 @@ int decay_pcp_high(struct zone *zone, st
+ spin_lock(&pcp->lock);
+ free_pcppages_bulk(zone, to_drain, pcp, 0);
+ spin_unlock(&pcp->lock);
+- todo++;
++ todo = true;
+ }
+
+ return todo;
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -771,25 +771,25 @@ EXPORT_SYMBOL(dec_node_page_state);
+
+ /*
+ * Fold a differential into the global counters.
+- * Returns the number of counters updated.
++ * Returns whether counters were updated.
+ */
+ static int fold_diff(int *zone_diff, int *node_diff)
+ {
+ int i;
+- int changes = 0;
++ bool changed = false;
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (zone_diff[i]) {
+ atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
+- changes++;
++ changed = true;
+ }
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ if (node_diff[i]) {
+ atomic_long_add(node_diff[i], &vm_node_stat[i]);
+- changes++;
++ changed = true;
+ }
+- return changes;
++ return changed;
+ }
+
+ /*
+@@ -806,16 +806,16 @@ static int fold_diff(int *zone_diff, int
+ * with the global counters. These could cause remote node cache line
+ * bouncing and will have to be only done when necessary.
+ *
+- * The function returns the number of global counters updated.
++ * The function returns whether global counters were updated.
+ */
+-static int refresh_cpu_vm_stats(bool do_pagesets)
++static bool refresh_cpu_vm_stats(bool do_pagesets)
+ {
+ struct pglist_data *pgdat;
+ struct zone *zone;
+ int i;
+ int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+ int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
+- int changes = 0;
++ bool changed = false;
+
+ for_each_populated_zone(zone) {
+ struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+@@ -839,7 +839,8 @@ static int refresh_cpu_vm_stats(bool do_
+ if (do_pagesets) {
+ cond_resched();
+
+- changes += decay_pcp_high(zone, this_cpu_ptr(pcp));
++ if (decay_pcp_high(zone, this_cpu_ptr(pcp)))
++ changed = true;
+ #ifdef CONFIG_NUMA
+ /*
+ * Deal with draining the remote pageset of this
+@@ -861,13 +862,13 @@ static int refresh_cpu_vm_stats(bool do_
+ }
+
+ if (__this_cpu_dec_return(pcp->expire)) {
+- changes++;
++ changed = true;
+ continue;
+ }
+
+ if (__this_cpu_read(pcp->count)) {
+ drain_zone_pages(zone, this_cpu_ptr(pcp));
+- changes++;
++ changed = true;
+ }
+ #endif
+ }
+@@ -887,8 +888,9 @@ static int refresh_cpu_vm_stats(bool do_
+ }
+ }
+
+- changes += fold_diff(global_zone_diff, global_node_diff);
+- return changes;
++ if (fold_diff(global_zone_diff, global_node_diff))
++ changed = true;
++ return changed;
+ }
+
+ /*
dmaengine-ti-dma-crossbar-fix-device-leak-on-dra7x-route-allocation.patch
dmaengine-ti-dma-crossbar-fix-device-leak-on-am335x-route-allocation.patch
dmaengine-ti-k3-udma-fix-device-leak-on-udma-lookup.patch
+mm-add-a-ptdesc-flag-to-mark-kernel-page-tables.patch
+mm-actually-mark-kernel-page-table-pages.patch
+x86-mm-use-ptdesc-when-freeing-pmd-pages.patch
+mm-introduce-pure-page-table-freeing-function.patch
+x86-mm-use-pagetable_free.patch
+mm-introduce-deferred-freeing-for-kernel-page-tables.patch
+iommu-sva-invalidate-stale-iotlb-entries-for-kernel-address-space.patch
+hid-intel-ish-hid-use-dedicated-unbound-workqueues-to-prevent-resume-blocking.patch
+hid-intel-ish-hid-fix-wcast-function-type-strict-in-devm_ishtp_alloc_workqueue.patch
+btrfs-fix-deadlock-in-wait_current_trans-due-to-ignored-transaction-type.patch
+mm-page_alloc-vmstat-simplify-refresh_cpu_vm_stats-change-detection.patch
+mm-page_alloc-batch-page-freeing-in-decay_pcp_high.patch
+mm-page_alloc-prevent-pcp-corruption-with-smp-n.patch
--- /dev/null
+From bf9e4e30f3538391745a99bc2268ec4f5e4a401e Mon Sep 17 00:00:00 2001
+From: Lu Baolu <baolu.lu@linux.intel.com>
+Date: Wed, 22 Oct 2025 16:26:32 +0800
+Subject: x86/mm: use pagetable_free()
+
+From: Lu Baolu <baolu.lu@linux.intel.com>
+
+commit bf9e4e30f3538391745a99bc2268ec4f5e4a401e upstream.
+
+The kernel's memory management subsystem provides a dedicated interface,
+pagetable_free(), for freeing page table pages. Update two call sites to
+use pagetable_free() instead of the lower-level __free_page() or
+free_pages(). This improves code consistency and clarity, and ensures the
+correct freeing mechanism is used.
+
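+As a rough sketch of the call-site pattern (simplified and illustrative,
+not the literal kernel code; free_page_table_page() is a made-up name, and
+the sketch only assumes that pagetable_free() takes a struct ptdesc * and
+frees its backing page or pages), the conversion looks like:
+
+  static void free_page_table_page(struct page *page)
+  {
+          /* old style: free the raw page directly, e.g.
+           *   __free_page(page);  or  __free_pages(page, order);
+           * new style: go through the page-table-specific interface */
+          pagetable_free(page_ptdesc(page));
+  }
+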
+Link: https://lkml.kernel.org/r/20251022082635.2462433-7-baolu.lu@linux.intel.com
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: Kevin Tian <kevin.tian@intel.com>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Robin Murphy <robin.murphy@arm.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vasant Hegde <vasant.hegde@amd.com>
+Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yi Lai <yi1.lai@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/mm/init_64.c | 2 +-
+ arch/x86/mm/pat/set_memory.c | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/mm/init_64.c
++++ b/arch/x86/mm/init_64.c
+@@ -1031,7 +1031,7 @@ static void __meminit free_pagetable(str
+ free_reserved_pages(page, nr_pages);
+ #endif
+ } else {
+- __free_pages(page, order);
++ pagetable_free(page_ptdesc(page));
+ }
+ }
+
+--- a/arch/x86/mm/pat/set_memory.c
++++ b/arch/x86/mm/pat/set_memory.c
+@@ -429,7 +429,7 @@ static void cpa_collapse_large_pages(str
+
+ list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) {
+ list_del(&ptdesc->pt_list);
+- __free_page(ptdesc_page(ptdesc));
++ pagetable_free(ptdesc);
+ }
+ }
+
--- /dev/null
+From 412d000346ea38ac4b9bb715a86c73ef89d90dea Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Wed, 22 Oct 2025 16:26:30 +0800
+Subject: x86/mm: use 'ptdesc' when freeing PMD pages
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 412d000346ea38ac4b9bb715a86c73ef89d90dea upstream.
+
+There are a billion ways to refer to a physical memory address. One of
+the x86 PMD freeing code locations chooses to use a 'pte_t *' to point to a
+PMD page and then call a PTE-specific freeing function for it. That's a
+bit wonky.
+
+Just use a 'struct ptdesc *' instead. Its entire purpose is to refer to
+page table pages. It also means being able to remove an explicit cast.
+
+Right now, pte_free_kernel() is a one-liner that calls
+pagetable_dtor_free(). Effectively, all this patch does is remove one
+superfluous __pa(__va(paddr)) conversion and then call
+pagetable_dtor_free() directly instead of through a helper.
+
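+As a sketch of the equivalence this relies on (simplified, not the literal
+kernel functions; free_pmd_page_old() and free_pmd_page_new() are
+illustrative names, and the bodies only use the existing helpers that also
+appear in the diff below), both paths free the same page table page:
+
+  /* Old path: physical address in the PMD -> kernel virtual address,
+   * then pte_free_kernel() converts that back to a page/ptdesc before
+   * calling pagetable_dtor_free(). */
+  static void free_pmd_page_old(pmd_t *pmd)
+  {
+          pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
+
+          pte_free_kernel(&init_mm, pte);
+  }
+
+  /* New path: physical address in the PMD -> struct page -> struct
+   * ptdesc, freed with the page table destructor directly, with no cast
+   * and no virtual-address round trip. */
+  static void free_pmd_page_new(pmd_t *pmd)
+  {
+          struct ptdesc *pt = page_ptdesc(pmd_page(*pmd));
+
+          pagetable_dtor_free(pt);
+  }
+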
+Link: https://lkml.kernel.org/r/20251022082635.2462433-5-baolu.lu@linux.intel.com
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Robin Murphy <robin.murphy@arm.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vasant Hegde <vasant.hegde@amd.com>
+Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yi Lai <yi1.lai@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/mm/pgtable.c | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -729,7 +729,7 @@ int pmd_clear_huge(pmd_t *pmd)
+ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+ {
+ pmd_t *pmd, *pmd_sv;
+- pte_t *pte;
++ struct ptdesc *pt;
+ int i;
+
+ pmd = pud_pgtable(*pud);
+@@ -750,8 +750,8 @@ int pud_free_pmd_page(pud_t *pud, unsign
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(pmd_sv[i])) {
+- pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+- pte_free_kernel(&init_mm, pte);
++ pt = page_ptdesc(pmd_page(pmd_sv[i]));
++ pagetable_dtor_free(pt);
+ }
+ }
+
+@@ -772,15 +772,15 @@ int pud_free_pmd_page(pud_t *pud, unsign
+ */
+ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+ {
+- pte_t *pte;
++ struct ptdesc *pt;
+
+- pte = (pte_t *)pmd_page_vaddr(*pmd);
++ pt = page_ptdesc(pmd_page(*pmd));
+ pmd_clear(pmd);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+- pte_free_kernel(&init_mm, pte);
++ pagetable_dtor_free(pt);
+
+ return 1;
+ }