From 433e4e2921c5b04b47ad46575bba4f9b97f31888 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 21 Jan 2026 16:15:37 +0100 Subject: [PATCH] 6.18-stable patches added patches: btrfs-fix-deadlock-in-wait_current_trans-due-to-ignored-transaction-type.patch hid-intel-ish-hid-fix-wcast-function-type-strict-in-devm_ishtp_alloc_workqueue.patch hid-intel-ish-hid-use-dedicated-unbound-workqueues-to-prevent-resume-blocking.patch iommu-sva-invalidate-stale-iotlb-entries-for-kernel-address-space.patch mm-actually-mark-kernel-page-table-pages.patch mm-add-a-ptdesc-flag-to-mark-kernel-page-tables.patch mm-introduce-deferred-freeing-for-kernel-page-tables.patch mm-introduce-pure-page-table-freeing-function.patch mm-page_alloc-batch-page-freeing-in-decay_pcp_high.patch mm-page_alloc-prevent-pcp-corruption-with-smp-n.patch mm-page_alloc-vmstat-simplify-refresh_cpu_vm_stats-change-detection.patch x86-mm-use-pagetable_free.patch x86-mm-use-ptdesc-when-freeing-pmd-pages.patch --- ...rans-due-to-ignored-transaction-type.patch | 154 ++++++++++ ...strict-in-devm_ishtp_alloc_workqueue.patch | 64 ++++ ...orkqueues-to-prevent-resume-blocking.patch | 201 +++++++++++++ ...tlb-entries-for-kernel-address-space.patch | 194 ++++++++++++ ...ctually-mark-kernel-page-table-pages.patch | 118 ++++++++ ...desc-flag-to-mark-kernel-page-tables.patch | 121 ++++++++ ...erred-freeing-for-kernel-page-tables.patch | 143 +++++++++ ...uce-pure-page-table-freeing-function.patch | 77 +++++ ...batch-page-freeing-in-decay_pcp_high.patch | 66 +++++ ...oc-prevent-pcp-corruption-with-smp-n.patch | 210 +++++++++++++ ...efresh_cpu_vm_stats-change-detection.patch | 276 ++++++++++++++++++ queue-6.18/series | 13 + queue-6.18/x86-mm-use-pagetable_free.patch | 71 +++++ ...mm-use-ptdesc-when-freeing-pmd-pages.patch | 96 ++++++ 14 files changed, 1804 insertions(+) create mode 100644 queue-6.18/btrfs-fix-deadlock-in-wait_current_trans-due-to-ignored-transaction-type.patch create mode 100644 queue-6.18/hid-intel-ish-hid-fix-wcast-function-type-strict-in-devm_ishtp_alloc_workqueue.patch create mode 100644 queue-6.18/hid-intel-ish-hid-use-dedicated-unbound-workqueues-to-prevent-resume-blocking.patch create mode 100644 queue-6.18/iommu-sva-invalidate-stale-iotlb-entries-for-kernel-address-space.patch create mode 100644 queue-6.18/mm-actually-mark-kernel-page-table-pages.patch create mode 100644 queue-6.18/mm-add-a-ptdesc-flag-to-mark-kernel-page-tables.patch create mode 100644 queue-6.18/mm-introduce-deferred-freeing-for-kernel-page-tables.patch create mode 100644 queue-6.18/mm-introduce-pure-page-table-freeing-function.patch create mode 100644 queue-6.18/mm-page_alloc-batch-page-freeing-in-decay_pcp_high.patch create mode 100644 queue-6.18/mm-page_alloc-prevent-pcp-corruption-with-smp-n.patch create mode 100644 queue-6.18/mm-page_alloc-vmstat-simplify-refresh_cpu_vm_stats-change-detection.patch create mode 100644 queue-6.18/x86-mm-use-pagetable_free.patch create mode 100644 queue-6.18/x86-mm-use-ptdesc-when-freeing-pmd-pages.patch diff --git a/queue-6.18/btrfs-fix-deadlock-in-wait_current_trans-due-to-ignored-transaction-type.patch b/queue-6.18/btrfs-fix-deadlock-in-wait_current_trans-due-to-ignored-transaction-type.patch new file mode 100644 index 0000000000..9ff64649a9 --- /dev/null +++ b/queue-6.18/btrfs-fix-deadlock-in-wait_current_trans-due-to-ignored-transaction-type.patch @@ -0,0 +1,154 @@ +From 5037b342825df7094a4906d1e2a9674baab50cb2 Mon Sep 17 00:00:00 2001 +From: Robbie Ko +Date: Thu, 11 Dec 2025 13:30:33 +0800 +Subject: btrfs: fix 
deadlock in wait_current_trans() due to ignored transaction type + +From: Robbie Ko + +commit 5037b342825df7094a4906d1e2a9674baab50cb2 upstream. + +When wait_current_trans() is called during start_transaction(), it +currently waits for a blocked transaction without considering whether +the given transaction type actually needs to wait for that particular +transaction state. The btrfs_blocked_trans_types[] array already defines +which transaction types should wait for which transaction states, but +this check was missing in wait_current_trans(). + +This can lead to a deadlock scenario involving two transactions and +pending ordered extents: + + 1. Transaction A is in TRANS_STATE_COMMIT_DOING state + + 2. A worker processing an ordered extent calls start_transaction() + with TRANS_JOIN + + 3. join_transaction() returns -EBUSY because Transaction A is in + TRANS_STATE_COMMIT_DOING + + 4. Transaction A moves to TRANS_STATE_UNBLOCKED and completes + + 5. A new Transaction B is created (TRANS_STATE_RUNNING) + + 6. The ordered extent from step 2 is added to Transaction B's + pending ordered extents + + 7. Transaction B immediately starts commit by another task and + enters TRANS_STATE_COMMIT_START + + 8. The worker finally reaches wait_current_trans(), sees Transaction B + in TRANS_STATE_COMMIT_START (a blocked state), and waits + unconditionally + + 9. However, TRANS_JOIN should NOT wait for TRANS_STATE_COMMIT_START + according to btrfs_blocked_trans_types[] + + 10. Transaction B is waiting for pending ordered extents to complete + + 11. Deadlock: Transaction B waits for ordered extent, ordered extent + waits for Transaction B + +This can be illustrated by the following call stacks: + CPU0 CPU1 + btrfs_finish_ordered_io() + start_transaction(TRANS_JOIN) + join_transaction() + # -EBUSY (Transaction A is + # TRANS_STATE_COMMIT_DOING) + # Transaction A completes + # Transaction B created + # ordered extent added to + # Transaction B's pending list + btrfs_commit_transaction() + # Transaction B enters + # TRANS_STATE_COMMIT_START + # waiting for pending ordered + # extents + wait_current_trans() + # waits for Transaction B + # (should not wait!) + +Task bstore_kv_sync in btrfs_commit_transaction waiting for ordered +extents: + + __schedule+0x2e7/0x8a0 + schedule+0x64/0xe0 + btrfs_commit_transaction+0xbf7/0xda0 [btrfs] + btrfs_sync_file+0x342/0x4d0 [btrfs] + __x64_sys_fdatasync+0x4b/0x80 + do_syscall_64+0x33/0x40 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +Task kworker in wait_current_trans waiting for transaction commit: + + Workqueue: btrfs-syno_nocow btrfs_work_helper [btrfs] + __schedule+0x2e7/0x8a0 + schedule+0x64/0xe0 + wait_current_trans+0xb0/0x110 [btrfs] + start_transaction+0x346/0x5b0 [btrfs] + btrfs_finish_ordered_io.isra.0+0x49b/0x9c0 [btrfs] + btrfs_work_helper+0xe8/0x350 [btrfs] + process_one_work+0x1d3/0x3c0 + worker_thread+0x4d/0x3e0 + kthread+0x12d/0x150 + ret_from_fork+0x1f/0x30 + +Fix this by passing the transaction type to wait_current_trans() and +checking btrfs_blocked_trans_types[cur_trans->state] against the given +type before deciding to wait. This ensures that transaction types which +are allowed to join during certain blocked states will not unnecessarily +wait and cause deadlocks. 
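(For illustration only: the masking idea in a minimal userspace sketch. The state
names and mask values below are simplified placeholders, not the real
btrfs_blocked_trans_types[] table in fs/btrfs/transaction.c.)

  #include <stdio.h>
  #include <stdbool.h>

  #define TRANS_START	(1U << 0)
  #define TRANS_JOIN	(1U << 1)

  enum trans_state {
  	STATE_RUNNING,
  	STATE_COMMIT_START,
  	STATE_COMMIT_DOING,
  	NR_STATES,
  };

  /* Which transaction types must wait in each state (placeholder values). */
  static const unsigned int blocked_types[NR_STATES] = {
  	[STATE_RUNNING]      = 0,
  	[STATE_COMMIT_START] = TRANS_START,		/* a JOIN may still enter */
  	[STATE_COMMIT_DOING] = TRANS_START | TRANS_JOIN,
  };

  static bool must_wait(enum trans_state state, unsigned int type)
  {
  	return blocked_types[state] & type;
  }

  int main(void)
  {
  	/* The worker's TRANS_JOIN is not blocked by COMMIT_START ... */
  	printf("JOIN  vs COMMIT_START: wait=%d\n",
  	       must_wait(STATE_COMMIT_START, TRANS_JOIN));
  	/* ... whereas a fresh TRANS_START would have to wait. */
  	printf("START vs COMMIT_START: wait=%d\n",
  	       must_wait(STATE_COMMIT_START, TRANS_START));
  	return 0;
  }

With such a per-state mask consulted before sleeping, the worker in step 8 above
joins instead of waiting, and the cycle never forms.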
+ +Reviewed-by: Filipe Manana +Signed-off-by: Robbie Ko +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Cc: Motiejus Jakštys +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/transaction.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -518,13 +518,14 @@ static inline int is_transaction_blocked + * when this is done, it is safe to start a new transaction, but the current + * transaction might not be fully on disk. + */ +-static void wait_current_trans(struct btrfs_fs_info *fs_info) ++static void wait_current_trans(struct btrfs_fs_info *fs_info, unsigned int type) + { + struct btrfs_transaction *cur_trans; + + spin_lock(&fs_info->trans_lock); + cur_trans = fs_info->running_transaction; +- if (cur_trans && is_transaction_blocked(cur_trans)) { ++ if (cur_trans && is_transaction_blocked(cur_trans) && ++ (btrfs_blocked_trans_types[cur_trans->state] & type)) { + refcount_inc(&cur_trans->use_count); + spin_unlock(&fs_info->trans_lock); + +@@ -699,12 +700,12 @@ again: + sb_start_intwrite(fs_info->sb); + + if (may_wait_transaction(fs_info, type)) +- wait_current_trans(fs_info); ++ wait_current_trans(fs_info, type); + + do { + ret = join_transaction(fs_info, type); + if (ret == -EBUSY) { +- wait_current_trans(fs_info); ++ wait_current_trans(fs_info, type); + if (unlikely(type == TRANS_ATTACH || + type == TRANS_JOIN_NOSTART)) + ret = -ENOENT; +@@ -1001,7 +1002,7 @@ out: + + void btrfs_throttle(struct btrfs_fs_info *fs_info) + { +- wait_current_trans(fs_info); ++ wait_current_trans(fs_info, TRANS_START); + } + + bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) diff --git a/queue-6.18/hid-intel-ish-hid-fix-wcast-function-type-strict-in-devm_ishtp_alloc_workqueue.patch b/queue-6.18/hid-intel-ish-hid-fix-wcast-function-type-strict-in-devm_ishtp_alloc_workqueue.patch new file mode 100644 index 0000000000..56c494c9e8 --- /dev/null +++ b/queue-6.18/hid-intel-ish-hid-fix-wcast-function-type-strict-in-devm_ishtp_alloc_workqueue.patch @@ -0,0 +1,64 @@ +From 3644f4411713f52bf231574aa8759e3d8e20b341 Mon Sep 17 00:00:00 2001 +From: Nathan Chancellor +Date: Wed, 22 Oct 2025 00:49:08 +0200 +Subject: HID: intel-ish-hid: Fix -Wcast-function-type-strict in devm_ishtp_alloc_workqueue() + +From: Nathan Chancellor + +commit 3644f4411713f52bf231574aa8759e3d8e20b341 upstream. + +Clang warns (or errors with CONFIG_WERROR=y / W=e): + + drivers/hid/intel-ish-hid/ipc/ipc.c:935:36: error: cast from 'void (*)(struct workqueue_struct *)' to 'void (*)(void *)' converts to incompatible function type [-Werror,-Wcast-function-type-strict] + 935 | if (devm_add_action_or_reset(dev, (void (*)(void *))destroy_workqueue, + | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + include/linux/device/devres.h:168:34: note: expanded from macro 'devm_add_action_or_reset' + 168 | __devm_add_action_or_ireset(dev, action, data, #action) + | ^~~~~~ + +This warning is pointing out a kernel control flow integrity (kCFI / +CONFIG_CFI=y) violation will occur due to this function cast when the +destroy_workqueue() is indirectly called via devm_action_release() +because the prototype of destroy_workqueue() does not match the +prototype of (*action)(). + +Use a local function with the correct prototype to wrap +destroy_workqueue() to resolve the warning and CFI violation. 
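(The wrapper pattern in a self-contained userspace sketch, not the kernel code:
run_action() stands in for devm_add_action_or_reset() and expects exactly
void (*)(void *), so it is handed a correctly typed wrapper -- analogous to the
ishtp_free_workqueue() helper added below -- rather than a casted function
pointer.)

  #include <stdio.h>
  #include <stdlib.h>

  struct box { int v; };

  /* Cleanup function with its own natural prototype. */
  static void destroy_box(struct box *b)
  {
  	free(b);
  }

  /* Correctly typed wrapper with the prototype the registry expects. */
  static void destroy_box_action(void *data)
  {
  	destroy_box(data);
  }

  /* Stand-in for a cleanup registry: expects void (*)(void *). */
  static void run_action(void (*action)(void *), void *data)
  {
  	action(data);	/* indirect call; this is what kCFI checks in the kernel */
  }

  int main(void)
  {
  	struct box *b = malloc(sizeof(*b));

  	if (!b)
  		return 1;
  	/* Wrong: run_action((void (*)(void *))destroy_box, b); -- undefined
  	 * behaviour, and exactly the kind of cast that trips kCFI. */
  	run_action(destroy_box_action, b);
  	puts("cleanup ran through a correctly typed wrapper");
  	return 0;
  }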
+ +Reported-by: kernel test robot +Closes: https://lore.kernel.org/oe-kbuild-all/202510190103.qTZvfdjj-lkp@intel.com/ +Closes: https://github.com/ClangBuiltLinux/linux/issues/2139 +Fixes: 0d30dae38fe0 ("HID: intel-ish-hid: Use dedicated unbound workqueues to prevent resume blocking") +Signed-off-by: Nathan Chancellor +Acked-by: Srinivas Pandruvada +Reviewed-by: Zhang Lixu +Signed-off-by: Jiri Kosina +Signed-off-by: Greg Kroah-Hartman +--- + drivers/hid/intel-ish-hid/ipc/ipc.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/drivers/hid/intel-ish-hid/ipc/ipc.c ++++ b/drivers/hid/intel-ish-hid/ipc/ipc.c +@@ -933,6 +933,11 @@ static const struct ishtp_hw_ops ish_hw_ + .dma_no_cache_snooping = _dma_no_cache_snooping + }; + ++static void ishtp_free_workqueue(void *wq) ++{ ++ destroy_workqueue(wq); ++} ++ + static struct workqueue_struct *devm_ishtp_alloc_workqueue(struct device *dev) + { + struct workqueue_struct *wq; +@@ -941,8 +946,7 @@ static struct workqueue_struct *devm_ish + if (!wq) + return NULL; + +- if (devm_add_action_or_reset(dev, (void (*)(void *))destroy_workqueue, +- wq)) ++ if (devm_add_action_or_reset(dev, ishtp_free_workqueue, wq)) + return NULL; + + return wq; diff --git a/queue-6.18/hid-intel-ish-hid-use-dedicated-unbound-workqueues-to-prevent-resume-blocking.patch b/queue-6.18/hid-intel-ish-hid-use-dedicated-unbound-workqueues-to-prevent-resume-blocking.patch new file mode 100644 index 0000000000..deab1d993e --- /dev/null +++ b/queue-6.18/hid-intel-ish-hid-use-dedicated-unbound-workqueues-to-prevent-resume-blocking.patch @@ -0,0 +1,201 @@ +From 0d30dae38fe01cd1de358c6039a0b1184689fe51 Mon Sep 17 00:00:00 2001 +From: Zhang Lixu +Date: Fri, 10 Oct 2025 13:52:54 +0800 +Subject: HID: intel-ish-hid: Use dedicated unbound workqueues to prevent resume blocking + +From: Zhang Lixu + +commit 0d30dae38fe01cd1de358c6039a0b1184689fe51 upstream. + +During suspend/resume tests with S2IDLE, some ISH functional failures were +observed because of delay in executing ISH resume handler. Here +schedule_work() is used from resume handler to do actual work. +schedule_work() uses system_wq, which is a per CPU work queue. Although +the queuing is not bound to a CPU, but it prefers local CPU of the caller, +unless prohibited. + +Users of this work queue are not supposed to queue long running work. +But in practice, there are scenarios where long running work items are +queued on other unbound workqueues, occupying the CPU. As a result, the +ISH resume handler may not get a chance to execute in a timely manner. + +In one scenario, one of the ish_resume_handler() executions was delayed +nearly 1 second because another work item on an unbound workqueue occupied +the same CPU. This delay causes ISH functionality failures. + +A similar issue was previously observed where the ISH HID driver timed out +while getting the HID descriptor during S4 resume in the recovery kernel, +likely caused by the same workqueue contention problem. + +Create dedicated unbound workqueues for all ISH operations to allow work +items to execute on any available CPU, eliminating CPU-specific bottlenecks +and improving resume reliability under varying system loads. Also ISH has +three different components, a bus driver which implements ISH protocols, a +PCI interface layer and HID interface. Use one dedicated work queue for all +of them. 
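(The core of the scheme, reduced to a generic self-contained module sketch;
demo names only, not the ISH driver code.)

  // SPDX-License-Identifier: GPL-2.0
  #include <linux/module.h>
  #include <linux/workqueue.h>

  static struct workqueue_struct *demo_wq;

  static void demo_resume_fn(struct work_struct *work)
  {
  	pr_info("resume work ran on an unbound workqueue\n");
  }
  static DECLARE_WORK(demo_resume_work, demo_resume_fn);

  static int __init demo_init(void)
  {
  	/* WQ_UNBOUND: the work may run on any idle CPU, so a busy local
  	 * CPU cannot delay it the way it can with schedule_work(). */
  	demo_wq = alloc_workqueue("demo_unbound", WQ_UNBOUND, 0);
  	if (!demo_wq)
  		return -ENOMEM;

  	queue_work(demo_wq, &demo_resume_work);	/* instead of schedule_work() */
  	return 0;
  }

  static void __exit demo_exit(void)
  {
  	destroy_workqueue(demo_wq);	/* drains pending work first */
  }

  module_init(demo_init);
  module_exit(demo_exit);
  MODULE_LICENSE("GPL");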
+ +Signed-off-by: Zhang Lixu +Signed-off-by: Jiri Kosina +Signed-off-by: Greg Kroah-Hartman +--- + drivers/hid/intel-ish-hid/ipc/ipc.c | 21 ++++++++++++++++++++- + drivers/hid/intel-ish-hid/ipc/pci-ish.c | 2 +- + drivers/hid/intel-ish-hid/ishtp-hid-client.c | 4 ++-- + drivers/hid/intel-ish-hid/ishtp/bus.c | 18 +++++++++++++++++- + drivers/hid/intel-ish-hid/ishtp/hbm.c | 4 ++-- + drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h | 3 +++ + include/linux/intel-ish-client-if.h | 2 ++ + 7 files changed, 47 insertions(+), 7 deletions(-) + +--- a/drivers/hid/intel-ish-hid/ipc/ipc.c ++++ b/drivers/hid/intel-ish-hid/ipc/ipc.c +@@ -628,7 +628,7 @@ static void recv_ipc(struct ishtp_device + if (!ishtp_dev) { + ishtp_dev = dev; + } +- schedule_work(&fw_reset_work); ++ queue_work(dev->unbound_wq, &fw_reset_work); + break; + + case MNG_RESET_NOTIFY_ACK: +@@ -933,6 +933,21 @@ static const struct ishtp_hw_ops ish_hw_ + .dma_no_cache_snooping = _dma_no_cache_snooping + }; + ++static struct workqueue_struct *devm_ishtp_alloc_workqueue(struct device *dev) ++{ ++ struct workqueue_struct *wq; ++ ++ wq = alloc_workqueue("ishtp_unbound_%d", WQ_UNBOUND, 0, dev->id); ++ if (!wq) ++ return NULL; ++ ++ if (devm_add_action_or_reset(dev, (void (*)(void *))destroy_workqueue, ++ wq)) ++ return NULL; ++ ++ return wq; ++} ++ + /** + * ish_dev_init() -Initialize ISH devoce + * @pdev: PCI device +@@ -953,6 +968,10 @@ struct ishtp_device *ish_dev_init(struct + if (!dev) + return NULL; + ++ dev->unbound_wq = devm_ishtp_alloc_workqueue(&pdev->dev); ++ if (!dev->unbound_wq) ++ return NULL; ++ + dev->devc = &pdev->dev; + ishtp_device_init(dev); + +--- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c ++++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c +@@ -384,7 +384,7 @@ static int __maybe_unused ish_resume(str + ish_resume_device = device; + dev->resume_flag = 1; + +- schedule_work(&resume_work); ++ queue_work(dev->unbound_wq, &resume_work); + + return 0; + } +--- a/drivers/hid/intel-ish-hid/ishtp-hid-client.c ++++ b/drivers/hid/intel-ish-hid/ishtp-hid-client.c +@@ -860,7 +860,7 @@ static int hid_ishtp_cl_reset(struct ish + hid_ishtp_trace(client_data, "%s hid_ishtp_cl %p\n", __func__, + hid_ishtp_cl); + +- schedule_work(&client_data->work); ++ queue_work(ishtp_get_workqueue(cl_device), &client_data->work); + + return 0; + } +@@ -902,7 +902,7 @@ static int hid_ishtp_cl_resume(struct de + + hid_ishtp_trace(client_data, "%s hid_ishtp_cl %p\n", __func__, + hid_ishtp_cl); +- schedule_work(&client_data->resume_work); ++ queue_work(ishtp_get_workqueue(cl_device), &client_data->resume_work); + return 0; + } + +--- a/drivers/hid/intel-ish-hid/ishtp/bus.c ++++ b/drivers/hid/intel-ish-hid/ishtp/bus.c +@@ -541,7 +541,7 @@ void ishtp_cl_bus_rx_event(struct ishtp_ + return; + + if (device->event_cb) +- schedule_work(&device->event_work); ++ queue_work(device->ishtp_dev->unbound_wq, &device->event_work); + } + + /** +@@ -877,6 +877,22 @@ struct device *ishtp_get_pci_device(stru + EXPORT_SYMBOL(ishtp_get_pci_device); + + /** ++ * ishtp_get_workqueue - Retrieve the workqueue associated with an ISHTP device ++ * @cl_device: Pointer to the ISHTP client device structure ++ * ++ * Returns the workqueue_struct pointer (unbound_wq) associated with the given ++ * ISHTP client device. This workqueue is typically used for scheduling work ++ * related to the device. ++ * ++ * Return: Pointer to struct workqueue_struct. 
++ */ ++struct workqueue_struct *ishtp_get_workqueue(struct ishtp_cl_device *cl_device) ++{ ++ return cl_device->ishtp_dev->unbound_wq; ++} ++EXPORT_SYMBOL(ishtp_get_workqueue); ++ ++/** + * ishtp_trace_callback() - Return trace callback + * @cl_device: ISH-TP client device instance + * +--- a/drivers/hid/intel-ish-hid/ishtp/hbm.c ++++ b/drivers/hid/intel-ish-hid/ishtp/hbm.c +@@ -573,7 +573,7 @@ void ishtp_hbm_dispatch(struct ishtp_dev + + /* Start firmware loading process if it has loader capability */ + if (version_res->host_version_supported & ISHTP_SUPPORT_CAP_LOADER) +- schedule_work(&dev->work_fw_loader); ++ queue_work(dev->unbound_wq, &dev->work_fw_loader); + + dev->version.major_version = HBM_MAJOR_VERSION; + dev->version.minor_version = HBM_MINOR_VERSION; +@@ -864,7 +864,7 @@ void recv_hbm(struct ishtp_device *dev, + dev->rd_msg_fifo_tail = (dev->rd_msg_fifo_tail + IPC_PAYLOAD_SIZE) % + (RD_INT_FIFO_SIZE * IPC_PAYLOAD_SIZE); + spin_unlock_irqrestore(&dev->rd_msg_spinlock, flags); +- schedule_work(&dev->bh_hbm_work); ++ queue_work(dev->unbound_wq, &dev->bh_hbm_work); + eoi: + return; + } +--- a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h ++++ b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h +@@ -175,6 +175,9 @@ struct ishtp_device { + struct hbm_version version; + int transfer_path; /* Choice of transfer path: IPC or DMA */ + ++ /* Alloc a dedicated unbound workqueue for ishtp device */ ++ struct workqueue_struct *unbound_wq; ++ + /* work structure for scheduling firmware loading tasks */ + struct work_struct work_fw_loader; + /* waitq for waiting for command response from the firmware loader */ +--- a/include/linux/intel-ish-client-if.h ++++ b/include/linux/intel-ish-client-if.h +@@ -87,6 +87,8 @@ bool ishtp_wait_resume(struct ishtp_devi + ishtp_print_log ishtp_trace_callback(struct ishtp_cl_device *cl_device); + /* Get device pointer of PCI device for DMA acces */ + struct device *ishtp_get_pci_device(struct ishtp_cl_device *cl_device); ++/* Get the ISHTP workqueue */ ++struct workqueue_struct *ishtp_get_workqueue(struct ishtp_cl_device *cl_device); + + struct ishtp_cl *ishtp_cl_allocate(struct ishtp_cl_device *cl_device); + void ishtp_cl_free(struct ishtp_cl *cl); diff --git a/queue-6.18/iommu-sva-invalidate-stale-iotlb-entries-for-kernel-address-space.patch b/queue-6.18/iommu-sva-invalidate-stale-iotlb-entries-for-kernel-address-space.patch new file mode 100644 index 0000000000..ccf8869377 --- /dev/null +++ b/queue-6.18/iommu-sva-invalidate-stale-iotlb-entries-for-kernel-address-space.patch @@ -0,0 +1,194 @@ +From e37d5a2d60a338c5917c45296bac65da1382eda5 Mon Sep 17 00:00:00 2001 +From: Lu Baolu +Date: Wed, 22 Oct 2025 16:26:34 +0800 +Subject: iommu/sva: invalidate stale IOTLB entries for kernel address space + +From: Lu Baolu + +commit e37d5a2d60a338c5917c45296bac65da1382eda5 upstream. + +Introduce a new IOMMU interface to flush IOTLB paging cache entries for +the CPU kernel address space. This interface is invoked from the x86 +architecture code that manages combined user and kernel page tables, +specifically before any kernel page table page is freed and reused. + +This addresses the main issue with vfree() which is a common occurrence +and can be triggered by unprivileged users. While this resolves the +primary problem, it doesn't address some extremely rare case related to +memory unplug of memory that was present as reserved memory at boot, which +cannot be triggered by unprivileged users. The discussion can be found at +the link below. 
+ +Enable SVA on x86 architecture since the IOMMU can now receive +notification to flush the paging cache before freeing the CPU kernel page +table pages. + +Link: https://lkml.kernel.org/r/20251022082635.2462433-9-baolu.lu@linux.intel.com +Link: https://lore.kernel.org/linux-iommu/04983c62-3b1d-40d4-93ae-34ca04b827e5@intel.com/ +Co-developed-by: Jason Gunthorpe +Signed-off-by: Jason Gunthorpe +Signed-off-by: Lu Baolu +Suggested-by: Jann Horn +Reviewed-by: Jason Gunthorpe +Reviewed-by: Vasant Hegde +Reviewed-by: Kevin Tian +Cc: Alistair Popple +Cc: Andy Lutomirski +Cc: Borislav Betkov +Cc: Dave Hansen +Cc: David Hildenbrand +Cc: Ingo Molnar +Cc: Jean-Philippe Brucker +Cc: Joerg Roedel +Cc: Liam Howlett +Cc: Lorenzo Stoakes +Cc: Matthew Wilcox (Oracle) +Cc: Michal Hocko +Cc: Mike Rapoport (Microsoft) +Cc: Peter Zijlstra +Cc: Robin Murohy +Cc: Thomas Gleinxer +Cc: "Uladzislau Rezki (Sony)" +Cc: Vinicius Costa Gomes +Cc: Vlastimil Babka +Cc: Will Deacon +Cc: Yi Lai +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/Kconfig | 1 + + drivers/iommu/iommu-sva.c | 32 ++++++++++++++++++++++++++++---- + include/linux/iommu.h | 4 ++++ + mm/pgtable-generic.c | 2 ++ + 4 files changed, 35 insertions(+), 4 deletions(-) + +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -279,6 +279,7 @@ config X86 + select HAVE_PCI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select ASYNC_KERNEL_PGTABLE_FREE if IOMMU_SVA + select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS + select HAVE_POSIX_CPU_TIMERS_TASK_WORK +--- a/drivers/iommu/iommu-sva.c ++++ b/drivers/iommu/iommu-sva.c +@@ -10,6 +10,8 @@ + #include "iommu-priv.h" + + static DEFINE_MUTEX(iommu_sva_lock); ++static bool iommu_sva_present; ++static LIST_HEAD(iommu_sva_mms); + static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm); + +@@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc + return ERR_PTR(-ENOSPC); + } + iommu_mm->pasid = pasid; ++ iommu_mm->mm = mm; + INIT_LIST_HEAD(&iommu_mm->sva_domains); + /* + * Make sure the write to mm->iommu_mm is not reordered in front of +@@ -77,9 +80,6 @@ struct iommu_sva *iommu_sva_bind_device( + if (!group) + return ERR_PTR(-ENODEV); + +- if (IS_ENABLED(CONFIG_X86)) +- return ERR_PTR(-EOPNOTSUPP); +- + mutex_lock(&iommu_sva_lock); + + /* Allocate mm->pasid if necessary. 
*/ +@@ -135,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device( + if (ret) + goto out_free_domain; + domain->users = 1; +- list_add(&domain->next, &mm->iommu_mm->sva_domains); + ++ if (list_empty(&iommu_mm->sva_domains)) { ++ if (list_empty(&iommu_sva_mms)) ++ iommu_sva_present = true; ++ list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms); ++ } ++ list_add(&domain->next, &iommu_mm->sva_domains); + out: + refcount_set(&handle->users, 1); + mutex_unlock(&iommu_sva_lock); +@@ -178,6 +183,13 @@ void iommu_sva_unbind_device(struct iomm + list_del(&domain->next); + iommu_domain_free(domain); + } ++ ++ if (list_empty(&iommu_mm->sva_domains)) { ++ list_del(&iommu_mm->mm_list_elm); ++ if (list_empty(&iommu_sva_mms)) ++ iommu_sva_present = false; ++ } ++ + mutex_unlock(&iommu_sva_lock); + kfree(handle); + } +@@ -315,3 +327,15 @@ static struct iommu_domain *iommu_sva_do + + return domain; + } ++ ++void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) ++{ ++ struct iommu_mm_data *iommu_mm; ++ ++ guard(mutex)(&iommu_sva_lock); ++ if (!iommu_sva_present) ++ return; ++ ++ list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm) ++ mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end); ++} +--- a/include/linux/iommu.h ++++ b/include/linux/iommu.h +@@ -1134,7 +1134,9 @@ struct iommu_sva { + + struct iommu_mm_data { + u32 pasid; ++ struct mm_struct *mm; + struct list_head sva_domains; ++ struct list_head mm_list_elm; + }; + + int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); +@@ -1615,6 +1617,7 @@ struct iommu_sva *iommu_sva_bind_device( + struct mm_struct *mm); + void iommu_sva_unbind_device(struct iommu_sva *handle); + u32 iommu_sva_get_pasid(struct iommu_sva *handle); ++void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end); + #else + static inline struct iommu_sva * + iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) +@@ -1639,6 +1642,7 @@ static inline u32 mm_get_enqcmd_pasid(st + } + + static inline void mm_pasid_drop(struct mm_struct *mm) {} ++static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {} + #endif /* CONFIG_IOMMU_SVA */ + + #ifdef CONFIG_IOMMU_IOPF +--- a/mm/pgtable-generic.c ++++ b/mm/pgtable-generic.c +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -430,6 +431,7 @@ static void kernel_pgtable_work_func(str + list_splice_tail_init(&kernel_pgtable_work.list, &page_list); + spin_unlock(&kernel_pgtable_work.lock); + ++ iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL); + list_for_each_entry_safe(pt, next, &page_list, pt_list) + __pagetable_free(pt); + } diff --git a/queue-6.18/mm-actually-mark-kernel-page-table-pages.patch b/queue-6.18/mm-actually-mark-kernel-page-table-pages.patch new file mode 100644 index 0000000000..cecf10d4c3 --- /dev/null +++ b/queue-6.18/mm-actually-mark-kernel-page-table-pages.patch @@ -0,0 +1,118 @@ +From 977870522af34359b461060597ee3a86f27450d6 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Wed, 22 Oct 2025 16:26:29 +0800 +Subject: mm: actually mark kernel page table pages + +From: Dave Hansen + +commit 977870522af34359b461060597ee3a86f27450d6 upstream. + +Now that the API is in place, mark kernel page table pages just after they +are allocated. Unmark them just before they are freed. + +Note: Unconditionally clearing the 'kernel' marking (via +ptdesc_clear_kernel()) would be functionally identical to what is here. 
+But having the if() makes it logically clear that this function can be +used for kernel and non-kernel page tables. + +Link: https://lkml.kernel.org/r/20251022082635.2462433-4-baolu.lu@linux.intel.com +Signed-off-by: Dave Hansen +Signed-off-by: Lu Baolu +Reviewed-by: Jason Gunthorpe +Reviewed-by: Kevin Tian +Acked-by: David Hildenbrand +Acked-by: Mike Rapoport (Microsoft) +Cc: Alistair Popple +Cc: Andy Lutomirski +Cc: Borislav Betkov +Cc: Ingo Molnar +Cc: Jann Horn +Cc: Jean-Philippe Brucker +Cc: Joerg Roedel +Cc: Liam Howlett +Cc: Lorenzo Stoakes +Cc: Matthew Wilcox (Oracle) +Cc: Michal Hocko +Cc: Peter Zijlstra +Cc: Robin Murohy +Cc: Thomas Gleinxer +Cc: "Uladzislau Rezki (Sony)" +Cc: Vasant Hegde +Cc: Vinicius Costa Gomes +Cc: Vlastimil Babka +Cc: Will Deacon +Cc: Yi Lai +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/asm-generic/pgalloc.h | 18 ++++++++++++++++++ + include/linux/mm.h | 3 +++ + 2 files changed, 21 insertions(+) + +--- a/include/asm-generic/pgalloc.h ++++ b/include/asm-generic/pgalloc.h +@@ -28,6 +28,8 @@ static inline pte_t *__pte_alloc_one_ker + return NULL; + } + ++ ptdesc_set_kernel(ptdesc); ++ + return ptdesc_address(ptdesc); + } + #define __pte_alloc_one_kernel(...) alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__)) +@@ -146,6 +148,10 @@ static inline pmd_t *pmd_alloc_one_nopro + pagetable_free(ptdesc); + return NULL; + } ++ ++ if (mm == &init_mm) ++ ptdesc_set_kernel(ptdesc); ++ + return ptdesc_address(ptdesc); + } + #define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__)) +@@ -179,6 +185,10 @@ static inline pud_t *__pud_alloc_one_nop + return NULL; + + pagetable_pud_ctor(ptdesc); ++ ++ if (mm == &init_mm) ++ ptdesc_set_kernel(ptdesc); ++ + return ptdesc_address(ptdesc); + } + #define __pud_alloc_one(...) alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__)) +@@ -233,6 +243,10 @@ static inline p4d_t *__p4d_alloc_one_nop + return NULL; + + pagetable_p4d_ctor(ptdesc); ++ ++ if (mm == &init_mm) ++ ptdesc_set_kernel(ptdesc); ++ + return ptdesc_address(ptdesc); + } + #define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) +@@ -277,6 +291,10 @@ static inline pgd_t *__pgd_alloc_noprof( + return NULL; + + pagetable_pgd_ctor(ptdesc); ++ ++ if (mm == &init_mm) ++ ptdesc_set_kernel(ptdesc); ++ + return ptdesc_address(ptdesc); + } + #define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -3042,6 +3042,9 @@ static inline void pagetable_free(struct + { + struct page *page = ptdesc_page(pt); + ++ if (ptdesc_test_kernel(pt)) ++ ptdesc_clear_kernel(pt); ++ + __free_pages(page, compound_order(page)); + } + diff --git a/queue-6.18/mm-add-a-ptdesc-flag-to-mark-kernel-page-tables.patch b/queue-6.18/mm-add-a-ptdesc-flag-to-mark-kernel-page-tables.patch new file mode 100644 index 0000000000..ed6ebf51bf --- /dev/null +++ b/queue-6.18/mm-add-a-ptdesc-flag-to-mark-kernel-page-tables.patch @@ -0,0 +1,121 @@ +From 27bfafac65d87c58639f5d7af1353ec1e7886963 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Wed, 22 Oct 2025 16:26:28 +0800 +Subject: mm: add a ptdesc flag to mark kernel page tables + +From: Dave Hansen + +commit 27bfafac65d87c58639f5d7af1353ec1e7886963 upstream. + +The page tables used to map the kernel and userspace often have very +different handling rules. There are frequently *_kernel() variants of +functions just for kernel page tables. That's not great and has lead to +code duplication. 
+ +Instead of having completely separate call paths, allow a 'ptdesc' to be +marked as being for kernel mappings. Introduce helpers to set and clear +this status. + +Note: this uses the PG_referenced bit. Page flags are a great fit for +this since it is truly a single bit of information. Use PG_referenced +itself because it's a fairly benign flag (as opposed to things like +PG_lock). It's also (according to Willy) unlikely to go away any time +soon. + +PG_referenced is not in PAGE_FLAGS_CHECK_AT_FREE. It does not need to be +cleared before freeing the page, and pages coming out of the allocator +should have it cleared. Regardless, introduce an API to clear it anyway. +Having symmetry in the API makes it easier to change the underlying +implementation later, like if there was a need to move to a +PAGE_FLAGS_CHECK_AT_FREE bit. + +Link: https://lkml.kernel.org/r/20251022082635.2462433-3-baolu.lu@linux.intel.com +Signed-off-by: Dave Hansen +Signed-off-by: Lu Baolu +Reviewed-by: Jason Gunthorpe +Reviewed-by: Kevin Tian +Acked-by: David Hildenbrand +Acked-by: Mike Rapoport (Microsoft) +Cc: Alistair Popple +Cc: Andy Lutomirski +Cc: Borislav Betkov +Cc: Ingo Molnar +Cc: Jann Horn +Cc: Jean-Philippe Brucker +Cc: Joerg Roedel +Cc: Liam Howlett +Cc: Lorenzo Stoakes +Cc: Matthew Wilcox (Oracle) +Cc: Michal Hocko +Cc: Peter Zijlstra +Cc: Robin Murohy +Cc: Thomas Gleinxer +Cc: "Uladzislau Rezki (Sony)" +Cc: Vasant Hegde +Cc: Vinicius Costa Gomes +Cc: Vlastimil Babka +Cc: Will Deacon +Cc: Yi Lai +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/mm.h | 41 +++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 41 insertions(+) + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2947,6 +2947,7 @@ static inline pmd_t *pmd_alloc(struct mm + #endif /* CONFIG_MMU */ + + enum pt_flags { ++ PT_kernel = PG_referenced, + PT_reserved = PG_reserved, + /* High bits are used for zone/node/section */ + }; +@@ -2973,6 +2974,46 @@ static inline bool pagetable_is_reserved + } + + /** ++ * ptdesc_set_kernel - Mark a ptdesc used to map the kernel ++ * @ptdesc: The ptdesc to be marked ++ * ++ * Kernel page tables often need special handling. Set a flag so that ++ * the handling code knows this ptdesc will not be used for userspace. ++ */ ++static inline void ptdesc_set_kernel(struct ptdesc *ptdesc) ++{ ++ set_bit(PT_kernel, &ptdesc->pt_flags.f); ++} ++ ++/** ++ * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel ++ * @ptdesc: The ptdesc to be unmarked ++ * ++ * Use when the ptdesc is no longer used to map the kernel and no longer ++ * needs special handling. ++ */ ++static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc) ++{ ++ /* ++ * Note: the 'PG_referenced' bit does not strictly need to be ++ * cleared before freeing the page. But this is nice for ++ * symmetry. ++ */ ++ clear_bit(PT_kernel, &ptdesc->pt_flags.f); ++} ++ ++/** ++ * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel ++ * @ptdesc: The ptdesc being tested ++ * ++ * Call to tell if the ptdesc used to map the kernel. 
++ */ ++static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc) ++{ ++ return test_bit(PT_kernel, &ptdesc->pt_flags.f); ++} ++ ++/** + * pagetable_alloc - Allocate pagetables + * @gfp: GFP flags + * @order: desired pagetable order diff --git a/queue-6.18/mm-introduce-deferred-freeing-for-kernel-page-tables.patch b/queue-6.18/mm-introduce-deferred-freeing-for-kernel-page-tables.patch new file mode 100644 index 0000000000..f02de4d9a0 --- /dev/null +++ b/queue-6.18/mm-introduce-deferred-freeing-for-kernel-page-tables.patch @@ -0,0 +1,143 @@ +From 5ba2f0a1556479638ac11a3c201421f5515e89f5 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Wed, 22 Oct 2025 16:26:33 +0800 +Subject: mm: introduce deferred freeing for kernel page tables + +From: Dave Hansen + +commit 5ba2f0a1556479638ac11a3c201421f5515e89f5 upstream. + +This introduces a conditional asynchronous mechanism, enabled by +CONFIG_ASYNC_KERNEL_PGTABLE_FREE. When enabled, this mechanism defers the +freeing of pages that are used as page tables for kernel address mappings. +These pages are now queued to a work struct instead of being freed +immediately. + +This deferred freeing allows for batch-freeing of page tables, providing a +safe context for performing a single expensive operation (TLB flush) for a +batch of kernel page tables instead of performing that expensive operation +for each page table. + +Link: https://lkml.kernel.org/r/20251022082635.2462433-8-baolu.lu@linux.intel.com +Signed-off-by: Dave Hansen +Signed-off-by: Lu Baolu +Reviewed-by: Jason Gunthorpe +Reviewed-by: Kevin Tian +Acked-by: David Hildenbrand +Acked-by: Mike Rapoport (Microsoft) +Cc: Alistair Popple +Cc: Andy Lutomirski +Cc: Borislav Betkov +Cc: Ingo Molnar +Cc: Jann Horn +Cc: Jean-Philippe Brucker +Cc: Joerg Roedel +Cc: Liam Howlett +Cc: Lorenzo Stoakes +Cc: Matthew Wilcox (Oracle) +Cc: Michal Hocko +Cc: Peter Zijlstra +Cc: Robin Murohy +Cc: Thomas Gleinxer +Cc: "Uladzislau Rezki (Sony)" +Cc: Vasant Hegde +Cc: Vinicius Costa Gomes +Cc: Vlastimil Babka +Cc: Will Deacon +Cc: Yi Lai +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/mm.h | 16 +++++++++++++--- + mm/Kconfig | 3 +++ + mm/pgtable-generic.c | 37 +++++++++++++++++++++++++++++++++++++ + 3 files changed, 53 insertions(+), 3 deletions(-) + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -3038,6 +3038,14 @@ static inline void __pagetable_free(stru + __free_pages(page, compound_order(page)); + } + ++#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE ++void pagetable_free_kernel(struct ptdesc *pt); ++#else ++static inline void pagetable_free_kernel(struct ptdesc *pt) ++{ ++ __pagetable_free(pt); ++} ++#endif + /** + * pagetable_free - Free pagetables + * @pt: The page table descriptor +@@ -3047,10 +3055,12 @@ static inline void __pagetable_free(stru + */ + static inline void pagetable_free(struct ptdesc *pt) + { +- if (ptdesc_test_kernel(pt)) ++ if (ptdesc_test_kernel(pt)) { + ptdesc_clear_kernel(pt); +- +- __pagetable_free(pt); ++ pagetable_free_kernel(pt); ++ } else { ++ __pagetable_free(pt); ++ } + } + + #if defined(CONFIG_SPLIT_PTE_PTLOCKS) +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -915,6 +915,9 @@ config HAVE_GIGANTIC_FOLIOS + def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \ + (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + ++config ASYNC_KERNEL_PGTABLE_FREE ++ def_bool n ++ + # TODO: Allow to be enabled without THP + config ARCH_SUPPORTS_HUGE_PFNMAP + def_bool n +--- a/mm/pgtable-generic.c ++++ b/mm/pgtable-generic.c +@@ -406,3 +406,40 @@ again: + 
pte_unmap_unlock(pte, ptl); + goto again; + } ++ ++#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE ++static void kernel_pgtable_work_func(struct work_struct *work); ++ ++static struct { ++ struct list_head list; ++ /* protect above ptdesc lists */ ++ spinlock_t lock; ++ struct work_struct work; ++} kernel_pgtable_work = { ++ .list = LIST_HEAD_INIT(kernel_pgtable_work.list), ++ .lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock), ++ .work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func), ++}; ++ ++static void kernel_pgtable_work_func(struct work_struct *work) ++{ ++ struct ptdesc *pt, *next; ++ LIST_HEAD(page_list); ++ ++ spin_lock(&kernel_pgtable_work.lock); ++ list_splice_tail_init(&kernel_pgtable_work.list, &page_list); ++ spin_unlock(&kernel_pgtable_work.lock); ++ ++ list_for_each_entry_safe(pt, next, &page_list, pt_list) ++ __pagetable_free(pt); ++} ++ ++void pagetable_free_kernel(struct ptdesc *pt) ++{ ++ spin_lock(&kernel_pgtable_work.lock); ++ list_add(&pt->pt_list, &kernel_pgtable_work.list); ++ spin_unlock(&kernel_pgtable_work.lock); ++ ++ schedule_work(&kernel_pgtable_work.work); ++} ++#endif diff --git a/queue-6.18/mm-introduce-pure-page-table-freeing-function.patch b/queue-6.18/mm-introduce-pure-page-table-freeing-function.patch new file mode 100644 index 0000000000..8719749901 --- /dev/null +++ b/queue-6.18/mm-introduce-pure-page-table-freeing-function.patch @@ -0,0 +1,77 @@ +From 01894295672335ff304beed4359f30d14d5765f2 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Wed, 22 Oct 2025 16:26:31 +0800 +Subject: mm: introduce pure page table freeing function + +From: Dave Hansen + +commit 01894295672335ff304beed4359f30d14d5765f2 upstream. + +The pages used for ptdescs are currently freed back to the allocator in a +single location. They will shortly be freed from a second location. + +Create a simple helper that just frees them back to the allocator. + +Link: https://lkml.kernel.org/r/20251022082635.2462433-6-baolu.lu@linux.intel.com +Signed-off-by: Dave Hansen +Signed-off-by: Lu Baolu +Reviewed-by: Jason Gunthorpe +Reviewed-by: Kevin Tian +Acked-by: David Hildenbrand +Acked-by: Mike Rapoport (Microsoft) +Cc: Alistair Popple +Cc: Andy Lutomirski +Cc: Borislav Betkov +Cc: Ingo Molnar +Cc: Jann Horn +Cc: Jean-Philippe Brucker +Cc: Joerg Roedel +Cc: Liam Howlett +Cc: Lorenzo Stoakes +Cc: Matthew Wilcox (Oracle) +Cc: Michal Hocko +Cc: Peter Zijlstra +Cc: Robin Murohy +Cc: Thomas Gleinxer +Cc: "Uladzislau Rezki (Sony)" +Cc: Vasant Hegde +Cc: Vinicius Costa Gomes +Cc: Vlastimil Babka +Cc: Will Deacon +Cc: Yi Lai +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/mm.h | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -3031,6 +3031,13 @@ static inline struct ptdesc *pagetable_a + } + #define pagetable_alloc(...) 
alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) + ++static inline void __pagetable_free(struct ptdesc *pt) ++{ ++ struct page *page = ptdesc_page(pt); ++ ++ __free_pages(page, compound_order(page)); ++} ++ + /** + * pagetable_free - Free pagetables + * @pt: The page table descriptor +@@ -3040,12 +3047,10 @@ static inline struct ptdesc *pagetable_a + */ + static inline void pagetable_free(struct ptdesc *pt) + { +- struct page *page = ptdesc_page(pt); +- + if (ptdesc_test_kernel(pt)) + ptdesc_clear_kernel(pt); + +- __free_pages(page, compound_order(page)); ++ __pagetable_free(pt); + } + + #if defined(CONFIG_SPLIT_PTE_PTLOCKS) diff --git a/queue-6.18/mm-page_alloc-batch-page-freeing-in-decay_pcp_high.patch b/queue-6.18/mm-page_alloc-batch-page-freeing-in-decay_pcp_high.patch new file mode 100644 index 0000000000..8e387e09ad --- /dev/null +++ b/queue-6.18/mm-page_alloc-batch-page-freeing-in-decay_pcp_high.patch @@ -0,0 +1,66 @@ +From fc4b909c368f3a7b08c895dd5926476b58e85312 Mon Sep 17 00:00:00 2001 +From: Joshua Hahn +Date: Tue, 14 Oct 2025 07:50:09 -0700 +Subject: mm/page_alloc: batch page freeing in decay_pcp_high + +From: Joshua Hahn + +commit fc4b909c368f3a7b08c895dd5926476b58e85312 upstream. + +It is possible for pcp->count - pcp->high to exceed pcp->batch by a lot. +When this happens, we should perform batching to ensure that +free_pcppages_bulk isn't called with too many pages to free at once and +starve out other threads that need the pcp or zone lock. + +Since we are still only freeing the difference between the initial +pcp->count and pcp->high values, there should be no change to how many +pages are freed. + +Link: https://lkml.kernel.org/r/20251014145011.3427205-3-joshua.hahnjy@gmail.com +Signed-off-by: Joshua Hahn +Suggested-by: Chris Mason +Suggested-by: Andrew Morton +Co-developed-by: Johannes Weiner +Reviewed-by: Vlastimil Babka +Cc: Brendan Jackman +Cc: "Kirill A. 
Shutemov" +Cc: Michal Hocko +Cc: SeongJae Park +Cc: Suren Baghdasaryan +Cc: Zi Yan +Signed-off-by: Andrew Morton +Stable-dep-of: 038a102535eb ("mm/page_alloc: prevent pcp corruption with SMP=n") +Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman +--- + mm/page_alloc.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -2554,7 +2554,7 @@ static int rmqueue_bulk(struct zone *zon + */ + bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) + { +- int high_min, to_drain, batch; ++ int high_min, to_drain, to_drain_batched, batch; + bool todo = false; + + high_min = READ_ONCE(pcp->high_min); +@@ -2572,11 +2572,14 @@ bool decay_pcp_high(struct zone *zone, s + } + + to_drain = pcp->count - pcp->high; +- if (to_drain > 0) { ++ while (to_drain > 0) { ++ to_drain_batched = min(to_drain, batch); + spin_lock(&pcp->lock); +- free_pcppages_bulk(zone, to_drain, pcp, 0); ++ free_pcppages_bulk(zone, to_drain_batched, pcp, 0); + spin_unlock(&pcp->lock); + todo = true; ++ ++ to_drain -= to_drain_batched; + } + + return todo; diff --git a/queue-6.18/mm-page_alloc-prevent-pcp-corruption-with-smp-n.patch b/queue-6.18/mm-page_alloc-prevent-pcp-corruption-with-smp-n.patch new file mode 100644 index 0000000000..b304fea6ab --- /dev/null +++ b/queue-6.18/mm-page_alloc-prevent-pcp-corruption-with-smp-n.patch @@ -0,0 +1,210 @@ +From 038a102535eb49e10e93eafac54352fcc5d78847 Mon Sep 17 00:00:00 2001 +From: Vlastimil Babka +Date: Mon, 5 Jan 2026 16:08:56 +0100 +Subject: mm/page_alloc: prevent pcp corruption with SMP=n + +From: Vlastimil Babka + +commit 038a102535eb49e10e93eafac54352fcc5d78847 upstream. + +The kernel test robot has reported: + + BUG: spinlock trylock failure on UP on CPU#0, kcompactd0/28 + lock: 0xffff888807e35ef0, .magic: dead4ead, .owner: kcompactd0/28, .owner_cpu: 0 + CPU: 0 UID: 0 PID: 28 Comm: kcompactd0 Not tainted 6.18.0-rc5-00127-ga06157804399 #1 PREEMPT 8cc09ef94dcec767faa911515ce9e609c45db470 + Call Trace: + + __dump_stack (lib/dump_stack.c:95) + dump_stack_lvl (lib/dump_stack.c:123) + dump_stack (lib/dump_stack.c:130) + spin_dump (kernel/locking/spinlock_debug.c:71) + do_raw_spin_trylock (kernel/locking/spinlock_debug.c:?) + _raw_spin_trylock (include/linux/spinlock_api_smp.h:89 kernel/locking/spinlock.c:138) + __free_frozen_pages (mm/page_alloc.c:2973) + ___free_pages (mm/page_alloc.c:5295) + __free_pages (mm/page_alloc.c:5334) + tlb_remove_table_rcu (include/linux/mm.h:? include/linux/mm.h:3122 include/asm-generic/tlb.h:220 mm/mmu_gather.c:227 mm/mmu_gather.c:290) + ? __cfi_tlb_remove_table_rcu (mm/mmu_gather.c:289) + ? rcu_core (kernel/rcu/tree.c:?) + rcu_core (include/linux/rcupdate.h:341 kernel/rcu/tree.c:2607 kernel/rcu/tree.c:2861) + rcu_core_si (kernel/rcu/tree.c:2879) + handle_softirqs (arch/x86/include/asm/jump_label.h:36 include/trace/events/irq.h:142 kernel/softirq.c:623) + __irq_exit_rcu (arch/x86/include/asm/jump_label.h:36 kernel/softirq.c:725) + irq_exit_rcu (kernel/softirq.c:741) + sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1052) + + + RIP: 0010:_raw_spin_unlock_irqrestore (arch/x86/include/asm/preempt.h:95 include/linux/spinlock_api_smp.h:152 kernel/locking/spinlock.c:194) + free_pcppages_bulk (mm/page_alloc.c:1494) + drain_pages_zone (include/linux/spinlock.h:391 mm/page_alloc.c:2632) + __drain_all_pages (mm/page_alloc.c:2731) + drain_all_pages (mm/page_alloc.c:2747) + kcompactd (mm/compaction.c:3115) + kthread (kernel/kthread.c:465) + ? 
__cfi_kcompactd (mm/compaction.c:3166) + ? __cfi_kthread (kernel/kthread.c:412) + ret_from_fork (arch/x86/kernel/process.c:164) + ? __cfi_kthread (kernel/kthread.c:412) + ret_from_fork_asm (arch/x86/entry/entry_64.S:255) + + +Matthew has analyzed the report and identified that in drain_page_zone() +we are in a section protected by spin_lock(&pcp->lock) and then get an +interrupt that attempts spin_trylock() on the same lock. The code is +designed to work this way without disabling IRQs and occasionally fail the +trylock with a fallback. However, the SMP=n spinlock implementation +assumes spin_trylock() will always succeed, and thus it's normally a +no-op. Here the enabled lock debugging catches the problem, but otherwise +it could cause a corruption of the pcp structure. + +The problem has been introduced by commit 574907741599 ("mm/page_alloc: +leave IRQs enabled for per-cpu page allocations"). The pcp locking scheme +recognizes the need for disabling IRQs to prevent nesting spin_trylock() +sections on SMP=n, but the need to prevent the nesting in spin_lock() has +not been recognized. Fix it by introducing local wrappers that change the +spin_lock() to spin_lock_iqsave() with SMP=n and use them in all places +that do spin_lock(&pcp->lock). + +[vbabka@suse.cz: add pcp_ prefix to the spin_lock_irqsave wrappers, per Steven] +Link: https://lkml.kernel.org/r/20260105-fix-pcp-up-v1-1-5579662d2071@suse.cz +Fixes: 574907741599 ("mm/page_alloc: leave IRQs enabled for per-cpu page allocations") +Signed-off-by: Vlastimil Babka +Reported-by: kernel test robot +Closes: https://lore.kernel.org/oe-lkp/202512101320.e2f2dd6f-lkp@intel.com +Analyzed-by: Matthew Wilcox +Link: https://lore.kernel.org/all/aUW05pyc9nZkvY-1@casper.infradead.org/ +Acked-by: Mel Gorman +Cc: Brendan Jackman +Cc: Johannes Weiner +Cc: Michal Hocko +Cc: Sebastian Andrzej Siewior +Cc: Steven Rostedt +Cc: Suren Baghdasaryan +Cc: Zi Yan +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman +--- + mm/page_alloc.c | 47 +++++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 39 insertions(+), 8 deletions(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -166,6 +166,33 @@ static DEFINE_MUTEX(pcp_batch_high_lock) + #define pcp_spin_unlock(ptr) \ + pcpu_spin_unlock(lock, ptr) + ++/* ++ * With the UP spinlock implementation, when we spin_lock(&pcp->lock) (for i.e. ++ * a potentially remote cpu drain) and get interrupted by an operation that ++ * attempts pcp_spin_trylock(), we can't rely on the trylock failure due to UP ++ * spinlock assumptions making the trylock a no-op. So we have to turn that ++ * spin_lock() to a spin_lock_irqsave(). This works because on UP there are no ++ * remote cpu's so we can only be locking the only existing local one. 
++ */ ++#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) ++static inline void __flags_noop(unsigned long *flags) { } ++#define pcp_spin_lock_maybe_irqsave(ptr, flags) \ ++({ \ ++ __flags_noop(&(flags)); \ ++ spin_lock(&(ptr)->lock); \ ++}) ++#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \ ++({ \ ++ spin_unlock(&(ptr)->lock); \ ++ __flags_noop(&(flags)); \ ++}) ++#else ++#define pcp_spin_lock_maybe_irqsave(ptr, flags) \ ++ spin_lock_irqsave(&(ptr)->lock, flags) ++#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \ ++ spin_unlock_irqrestore(&(ptr)->lock, flags) ++#endif ++ + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID + DEFINE_PER_CPU(int, numa_node); + EXPORT_PER_CPU_SYMBOL(numa_node); +@@ -2555,6 +2582,7 @@ static int rmqueue_bulk(struct zone *zon + bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) + { + int high_min, to_drain, to_drain_batched, batch; ++ unsigned long UP_flags; + bool todo = false; + + high_min = READ_ONCE(pcp->high_min); +@@ -2574,9 +2602,9 @@ bool decay_pcp_high(struct zone *zone, s + to_drain = pcp->count - pcp->high; + while (to_drain > 0) { + to_drain_batched = min(to_drain, batch); +- spin_lock(&pcp->lock); ++ pcp_spin_lock_maybe_irqsave(pcp, UP_flags); + free_pcppages_bulk(zone, to_drain_batched, pcp, 0); +- spin_unlock(&pcp->lock); ++ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); + todo = true; + + to_drain -= to_drain_batched; +@@ -2593,14 +2621,15 @@ bool decay_pcp_high(struct zone *zone, s + */ + void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) + { ++ unsigned long UP_flags; + int to_drain, batch; + + batch = READ_ONCE(pcp->batch); + to_drain = min(pcp->count, batch); + if (to_drain > 0) { +- spin_lock(&pcp->lock); ++ pcp_spin_lock_maybe_irqsave(pcp, UP_flags); + free_pcppages_bulk(zone, to_drain, pcp, 0); +- spin_unlock(&pcp->lock); ++ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); + } + } + #endif +@@ -2611,10 +2640,11 @@ void drain_zone_pages(struct zone *zone, + static void drain_pages_zone(unsigned int cpu, struct zone *zone) + { + struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); ++ unsigned long UP_flags; + int count; + + do { +- spin_lock(&pcp->lock); ++ pcp_spin_lock_maybe_irqsave(pcp, UP_flags); + count = pcp->count; + if (count) { + int to_drain = min(count, +@@ -2623,7 +2653,7 @@ static void drain_pages_zone(unsigned in + free_pcppages_bulk(zone, to_drain, pcp, 0); + count -= to_drain; + } +- spin_unlock(&pcp->lock); ++ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); + } while (count); + } + +@@ -6081,6 +6111,7 @@ static void zone_pcp_update_cacheinfo(st + { + struct per_cpu_pages *pcp; + struct cpu_cacheinfo *cci; ++ unsigned long UP_flags; + + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + cci = get_cpu_cacheinfo(cpu); +@@ -6091,12 +6122,12 @@ static void zone_pcp_update_cacheinfo(st + * This can reduce zone lock contention without hurting + * cache-hot pages sharing. 
+ */ +- spin_lock(&pcp->lock); ++ pcp_spin_lock_maybe_irqsave(pcp, UP_flags); + if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) + pcp->flags |= PCPF_FREE_HIGH_BATCH; + else + pcp->flags &= ~PCPF_FREE_HIGH_BATCH; +- spin_unlock(&pcp->lock); ++ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); + } + + void setup_pcp_cacheinfo(unsigned int cpu) diff --git a/queue-6.18/mm-page_alloc-vmstat-simplify-refresh_cpu_vm_stats-change-detection.patch b/queue-6.18/mm-page_alloc-vmstat-simplify-refresh_cpu_vm_stats-change-detection.patch new file mode 100644 index 0000000000..c0b7888e4f --- /dev/null +++ b/queue-6.18/mm-page_alloc-vmstat-simplify-refresh_cpu_vm_stats-change-detection.patch @@ -0,0 +1,276 @@ +From 0acc67c4030c39f39ac90413cc5d0abddd3a9527 Mon Sep 17 00:00:00 2001 +From: Joshua Hahn +Date: Tue, 14 Oct 2025 07:50:08 -0700 +Subject: mm/page_alloc/vmstat: simplify refresh_cpu_vm_stats change detection + +From: Joshua Hahn + +commit 0acc67c4030c39f39ac90413cc5d0abddd3a9527 upstream. + +Patch series "mm/page_alloc: Batch callers of free_pcppages_bulk", v5. + +Motivation & Approach +===================== + +While testing workloads with high sustained memory pressure on large +machines in the Meta fleet (1Tb memory, 316 CPUs), we saw an unexpectedly +high number of softlockups. Further investigation showed that the zone +lock in free_pcppages_bulk was being held for a long time, and was called +to free 2k+ pages over 100 times just during boot. + +This causes starvation in other processes for the zone lock, which can +lead to the system stalling as multiple threads cannot make progress +without the locks. We can see these issues manifesting as warnings: + +[ 4512.591979] rcu: INFO: rcu_sched self-detected stall on CPU +[ 4512.604370] rcu: 20-....: (9312 ticks this GP) idle=a654/1/0x4000000000000000 softirq=309340/309344 fqs=5426 +[ 4512.626401] rcu: hardirqs softirqs csw/system +[ 4512.638793] rcu: number: 0 145 0 +[ 4512.651177] rcu: cputime: 30 10410 174 ==> 10558(ms) +[ 4512.666657] rcu: (t=21077 jiffies g=783665 q=1242213 ncpus=316) + +While these warnings don't indicate a crash or a kernel panic, they do +point to the underlying issue of lock contention. To prevent starvation +in both locks, batch the freeing of pages using pcp->batch. + +Because free_pcppages_bulk is called with the pcp lock and acquires the +zone lock, relinquishing and reacquiring the locks are only effective when +both of them are broken together (unless the system was built with queued +spinlocks). Thus, instead of modifying free_pcppages_bulk to break both +locks, batch the freeing from its callers instead. + +A similar fix has been implemented in the Meta fleet, and we have seen +significantly less softlockups. + +Testing +======= +The following are a few synthetic benchmarks, made on three machines. The +first is a large machine with 754GiB memory and 316 processors. +The second is a relatively smaller machine with 251GiB memory and 176 +processors. The third and final is the smallest of the three, which has 62GiB +memory and 36 processors. + +On all machines, I kick off a kernel build with -j$(nproc). +Negative delta is better (faster compilation). 
+ +Large machine (754GiB memory, 316 processors) +make -j$(nproc) ++------------+---------------+-----------+ +| Metric (s) | Variation (%) | Delta(%) | ++------------+---------------+-----------+ +| real | 0.8070 | - 1.4865 | +| user | 0.2823 | + 0.4081 | +| sys | 5.0267 | -11.8737 | ++------------+---------------+-----------+ + +Medium machine (251GiB memory, 176 processors) +make -j$(nproc) ++------------+---------------+----------+ +| Metric (s) | Variation (%) | Delta(%) | ++------------+---------------+----------+ +| real | 0.2806 | +0.0351 | +| user | 0.0994 | +0.3170 | +| sys | 0.6229 | -0.6277 | ++------------+---------------+----------+ + +Small machine (62GiB memory, 36 processors) +make -j$(nproc) ++------------+---------------+----------+ +| Metric (s) | Variation (%) | Delta(%) | ++------------+---------------+----------+ +| real | 0.1503 | -2.6585 | +| user | 0.0431 | -2.2984 | +| sys | 0.1870 | -3.2013 | ++------------+---------------+----------+ + +Here, variation is the coefficient of variation, i.e. standard deviation +/ mean. + +Based on these results, it seems like there are varying degrees to how +much lock contention this reduces. For the largest and smallest machines +that I ran the tests on, it seems like there is quite some significant +reduction. There is also some performance increases visible from +userspace. + +Interestingly, the performance gains don't scale with the size of the +machine, but rather there seems to be a dip in the gain there is for the +medium-sized machine. One possible theory is that because the high +watermark depends on both memory and the number of local CPUs, what +impacts zone contention the most is not these individual values, but +rather the ratio of mem:processors. + + +This patch (of 5): + +Currently, refresh_cpu_vm_stats returns an int, indicating how many +changes were made during its updates. Using this information, callers +like vmstat_update can heuristically determine if more work will be done +in the future. + +However, all of refresh_cpu_vm_stats's callers either (a) ignore the +result, only caring about performing the updates, or (b) only care about +whether changes were made, but not *how many* changes were made. + +Simplify the code by returning a bool instead to indicate if updates +were made. + +In addition, simplify fold_diff and decay_pcp_high to return a bool +for the same reason. + +Link: https://lkml.kernel.org/r/20251014145011.3427205-1-joshua.hahnjy@gmail.com +Link: https://lkml.kernel.org/r/20251014145011.3427205-2-joshua.hahnjy@gmail.com +Signed-off-by: Joshua Hahn +Reviewed-by: Vlastimil Babka +Reviewed-by: SeongJae Park +Cc: Brendan Jackman +Cc: Chris Mason +Cc: Johannes Weiner +Cc: "Kirill A. 
Shutemov" +Cc: Michal Hocko +Cc: Suren Baghdasaryan +Cc: Zi Yan +Signed-off-by: Andrew Morton +Stable-dep-of: 038a102535eb ("mm/page_alloc: prevent pcp corruption with SMP=n") +Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/gfp.h | 2 +- + mm/page_alloc.c | 8 ++++---- + mm/vmstat.c | 28 +++++++++++++++------------- + 3 files changed, 20 insertions(+), 18 deletions(-) + +--- a/include/linux/gfp.h ++++ b/include/linux/gfp.h +@@ -387,7 +387,7 @@ extern void free_pages(unsigned long add + #define free_page(addr) free_pages((addr), 0) + + void page_alloc_init_cpuhp(void); +-int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); ++bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); + void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); + void drain_all_pages(struct zone *zone); + void drain_local_pages(struct zone *zone); +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -2552,10 +2552,10 @@ static int rmqueue_bulk(struct zone *zon + * Called from the vmstat counter updater to decay the PCP high. + * Return whether there are addition works to do. + */ +-int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) ++bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) + { + int high_min, to_drain, batch; +- int todo = 0; ++ bool todo = false; + + high_min = READ_ONCE(pcp->high_min); + batch = READ_ONCE(pcp->batch); +@@ -2568,7 +2568,7 @@ int decay_pcp_high(struct zone *zone, st + pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX), + pcp->high - (pcp->high >> 3), high_min); + if (pcp->high > high_min) +- todo++; ++ todo = true; + } + + to_drain = pcp->count - pcp->high; +@@ -2576,7 +2576,7 @@ int decay_pcp_high(struct zone *zone, st + spin_lock(&pcp->lock); + free_pcppages_bulk(zone, to_drain, pcp, 0); + spin_unlock(&pcp->lock); +- todo++; ++ todo = true; + } + + return todo; +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -771,25 +771,25 @@ EXPORT_SYMBOL(dec_node_page_state); + + /* + * Fold a differential into the global counters. +- * Returns the number of counters updated. ++ * Returns whether counters were updated. + */ + static int fold_diff(int *zone_diff, int *node_diff) + { + int i; +- int changes = 0; ++ bool changed = false; + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (zone_diff[i]) { + atomic_long_add(zone_diff[i], &vm_zone_stat[i]); +- changes++; ++ changed = true; + } + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + if (node_diff[i]) { + atomic_long_add(node_diff[i], &vm_node_stat[i]); +- changes++; ++ changed = true; + } +- return changes; ++ return changed; + } + + /* +@@ -806,16 +806,16 @@ static int fold_diff(int *zone_diff, int + * with the global counters. These could cause remote node cache line + * bouncing and will have to be only done when necessary. + * +- * The function returns the number of global counters updated. ++ * The function returns whether global counters were updated. 
+ */ +-static int refresh_cpu_vm_stats(bool do_pagesets) ++static bool refresh_cpu_vm_stats(bool do_pagesets) + { + struct pglist_data *pgdat; + struct zone *zone; + int i; + int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; +- int changes = 0; ++ bool changed = false; + + for_each_populated_zone(zone) { + struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; +@@ -839,7 +839,8 @@ static int refresh_cpu_vm_stats(bool do_ + if (do_pagesets) { + cond_resched(); + +- changes += decay_pcp_high(zone, this_cpu_ptr(pcp)); ++ if (decay_pcp_high(zone, this_cpu_ptr(pcp))) ++ changed = true; + #ifdef CONFIG_NUMA + /* + * Deal with draining the remote pageset of this +@@ -861,13 +862,13 @@ static int refresh_cpu_vm_stats(bool do_ + } + + if (__this_cpu_dec_return(pcp->expire)) { +- changes++; ++ changed = true; + continue; + } + + if (__this_cpu_read(pcp->count)) { + drain_zone_pages(zone, this_cpu_ptr(pcp)); +- changes++; ++ changed = true; + } + #endif + } +@@ -887,8 +888,9 @@ static int refresh_cpu_vm_stats(bool do_ + } + } + +- changes += fold_diff(global_zone_diff, global_node_diff); +- return changes; ++ if (fold_diff(global_zone_diff, global_node_diff)) ++ changed = true; ++ return changed; + } + + /* diff --git a/queue-6.18/series b/queue-6.18/series index 2de29eaa05..cb86c1f6d8 100644 --- a/queue-6.18/series +++ b/queue-6.18/series @@ -181,3 +181,16 @@ dmaengine-stm32-dmamux-fix-of-node-leak-on-route-allocation-failure.patch dmaengine-ti-dma-crossbar-fix-device-leak-on-dra7x-route-allocation.patch dmaengine-ti-dma-crossbar-fix-device-leak-on-am335x-route-allocation.patch dmaengine-ti-k3-udma-fix-device-leak-on-udma-lookup.patch +mm-add-a-ptdesc-flag-to-mark-kernel-page-tables.patch +mm-actually-mark-kernel-page-table-pages.patch +x86-mm-use-ptdesc-when-freeing-pmd-pages.patch +mm-introduce-pure-page-table-freeing-function.patch +x86-mm-use-pagetable_free.patch +mm-introduce-deferred-freeing-for-kernel-page-tables.patch +iommu-sva-invalidate-stale-iotlb-entries-for-kernel-address-space.patch +hid-intel-ish-hid-use-dedicated-unbound-workqueues-to-prevent-resume-blocking.patch +hid-intel-ish-hid-fix-wcast-function-type-strict-in-devm_ishtp_alloc_workqueue.patch +btrfs-fix-deadlock-in-wait_current_trans-due-to-ignored-transaction-type.patch +mm-page_alloc-vmstat-simplify-refresh_cpu_vm_stats-change-detection.patch +mm-page_alloc-batch-page-freeing-in-decay_pcp_high.patch +mm-page_alloc-prevent-pcp-corruption-with-smp-n.patch diff --git a/queue-6.18/x86-mm-use-pagetable_free.patch b/queue-6.18/x86-mm-use-pagetable_free.patch new file mode 100644 index 0000000000..6d7149abb6 --- /dev/null +++ b/queue-6.18/x86-mm-use-pagetable_free.patch @@ -0,0 +1,71 @@ +From bf9e4e30f3538391745a99bc2268ec4f5e4a401e Mon Sep 17 00:00:00 2001 +From: Lu Baolu +Date: Wed, 22 Oct 2025 16:26:32 +0800 +Subject: x86/mm: use pagetable_free() + +From: Lu Baolu + +commit bf9e4e30f3538391745a99bc2268ec4f5e4a401e upstream. + +The kernel's memory management subsystem provides a dedicated interface, +pagetable_free(), for freeing page table pages. Updates two call sites to +use pagetable_free() instead of the lower-level __free_page() or +free_pages(). This improves code consistency and clarity, and ensures the +correct freeing mechanism is used. 
+ +Link: https://lkml.kernel.org/r/20251022082635.2462433-7-baolu.lu@linux.intel.com +Signed-off-by: Lu Baolu +Reviewed-by: Jason Gunthorpe +Acked-by: David Hildenbrand +Acked-by: Mike Rapoport (Microsoft) +Cc: Alistair Popple +Cc: Andy Lutomirski +Cc: Borislav Betkov +Cc: Dave Hansen +Cc: Ingo Molnar +Cc: Jann Horn +Cc: Jean-Philippe Brucker +Cc: Joerg Roedel +Cc: Kevin Tian +Cc: Liam Howlett +Cc: Lorenzo Stoakes +Cc: Matthew Wilcox (Oracle) +Cc: Michal Hocko +Cc: Peter Zijlstra +Cc: Robin Murohy +Cc: Thomas Gleinxer +Cc: "Uladzislau Rezki (Sony)" +Cc: Vasant Hegde +Cc: Vinicius Costa Gomes +Cc: Vlastimil Babka +Cc: Will Deacon +Cc: Yi Lai +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/mm/init_64.c | 2 +- + arch/x86/mm/pat/set_memory.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/x86/mm/init_64.c ++++ b/arch/x86/mm/init_64.c +@@ -1031,7 +1031,7 @@ static void __meminit free_pagetable(str + free_reserved_pages(page, nr_pages); + #endif + } else { +- __free_pages(page, order); ++ pagetable_free(page_ptdesc(page)); + } + } + +--- a/arch/x86/mm/pat/set_memory.c ++++ b/arch/x86/mm/pat/set_memory.c +@@ -429,7 +429,7 @@ static void cpa_collapse_large_pages(str + + list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) { + list_del(&ptdesc->pt_list); +- __free_page(ptdesc_page(ptdesc)); ++ pagetable_free(ptdesc); + } + } + diff --git a/queue-6.18/x86-mm-use-ptdesc-when-freeing-pmd-pages.patch b/queue-6.18/x86-mm-use-ptdesc-when-freeing-pmd-pages.patch new file mode 100644 index 0000000000..43234f21a9 --- /dev/null +++ b/queue-6.18/x86-mm-use-ptdesc-when-freeing-pmd-pages.patch @@ -0,0 +1,96 @@ +From 412d000346ea38ac4b9bb715a86c73ef89d90dea Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Wed, 22 Oct 2025 16:26:30 +0800 +Subject: x86/mm: use 'ptdesc' when freeing PMD pages + +From: Dave Hansen + +commit 412d000346ea38ac4b9bb715a86c73ef89d90dea upstream. + +There are a billion ways to refer to a physical memory address. One of +the x86 PMD freeing code location chooses to use a 'pte_t *' to point to a +PMD page and then call a PTE-specific freeing function for it. That's a +bit wonky. + +Just use a 'struct ptdesc *' instead. Its entire purpose is to refer to +page table pages. It also means being able to remove an explicit cast. + +Right now, pte_free_kernel() is a one-liner that calls +pagetable_dtor_free(). Effectively, all this patch does is remove one +superfluous __pa(__va(paddr)) conversion and then call +pagetable_dtor_free() directly instead of through a helper. 
+ +Link: https://lkml.kernel.org/r/20251022082635.2462433-5-baolu.lu@linux.intel.com +Signed-off-by: Dave Hansen +Signed-off-by: Lu Baolu +Reviewed-by: Jason Gunthorpe +Reviewed-by: Kevin Tian +Cc: Alistair Popple +Cc: Andy Lutomirski +Cc: Borislav Betkov +Cc: David Hildenbrand +Cc: Ingo Molnar +Cc: Jann Horn +Cc: Jean-Philippe Brucker +Cc: Joerg Roedel +Cc: Liam Howlett +Cc: Lorenzo Stoakes +Cc: Matthew Wilcox (Oracle) +Cc: Michal Hocko +Cc: Mike Rapoport (Microsoft) +Cc: Peter Zijlstra +Cc: Robin Murohy +Cc: Thomas Gleinxer +Cc: "Uladzislau Rezki (Sony)" +Cc: Vasant Hegde +Cc: Vinicius Costa Gomes +Cc: Vlastimil Babka +Cc: Will Deacon +Cc: Yi Lai +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/mm/pgtable.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -729,7 +729,7 @@ int pmd_clear_huge(pmd_t *pmd) + int pud_free_pmd_page(pud_t *pud, unsigned long addr) + { + pmd_t *pmd, *pmd_sv; +- pte_t *pte; ++ struct ptdesc *pt; + int i; + + pmd = pud_pgtable(*pud); +@@ -750,8 +750,8 @@ int pud_free_pmd_page(pud_t *pud, unsign + + for (i = 0; i < PTRS_PER_PMD; i++) { + if (!pmd_none(pmd_sv[i])) { +- pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); +- pte_free_kernel(&init_mm, pte); ++ pt = page_ptdesc(pmd_page(pmd_sv[i])); ++ pagetable_dtor_free(pt); + } + } + +@@ -772,15 +772,15 @@ int pud_free_pmd_page(pud_t *pud, unsign + */ + int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) + { +- pte_t *pte; ++ struct ptdesc *pt; + +- pte = (pte_t *)pmd_page_vaddr(*pmd); ++ pt = page_ptdesc(pmd_page(*pmd)); + pmd_clear(pmd); + + /* INVLPG to clear all paging-structure caches */ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); + +- pte_free_kernel(&init_mm, pte); ++ pagetable_dtor_free(pt); + + return 1; + } -- 2.47.3
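
[Editor's note, not part of the queued patches above.] The refresh_cpu_vm_stats change-detection patch boils down to one pattern: every caller only tests whether any counter moved, so an int "number of changes" return can become a bool without losing information. Below is a minimal, self-contained userspace C sketch of that pattern; it is not kernel code, and the names fold_counters(), refresh_stats(), NR_ITEMS and global_stat are hypothetical stand-ins for fold_diff(), refresh_cpu_vm_stats() and the vmstat arrays.

/*
 * Standalone sketch (userspace, hypothetical names): callers only care
 * *whether* anything changed, so the change count becomes a bool.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_ITEMS 4

static long global_stat[NR_ITEMS];

/* Analogue of fold_diff(): fold per-CPU deltas into the globals. */
static bool fold_counters(const int *diff)
{
	bool changed = false;

	for (int i = 0; i < NR_ITEMS; i++) {
		if (diff[i]) {
			global_stat[i] += diff[i];
			changed = true;
		}
	}
	return changed;
}

/* Analogue of refresh_cpu_vm_stats(): propagate the bool upward. */
static bool refresh_stats(int *per_cpu_diff)
{
	bool changed = false;

	if (fold_counters(per_cpu_diff))
		changed = true;
	return changed;
}

int main(void)
{
	int diff[NR_ITEMS] = { 0, 3, 0, -1 };

	/* A caller, like vmstat_update(), only checks truthiness. */
	if (refresh_stats(diff))
		printf("counters changed, schedule more work\n");
	else
		printf("no changes\n");
	return 0;
}

The same reasoning applies to decay_pcp_high() in the queued patch: its two "todo++" sites never needed a count, only a "did anything happen" signal, which is why the backport can switch them to "todo = true" without changing caller behaviour.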