git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.18-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 21 Jan 2026 15:15:37 +0000 (16:15 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 21 Jan 2026 15:15:37 +0000 (16:15 +0100)
added patches:
btrfs-fix-deadlock-in-wait_current_trans-due-to-ignored-transaction-type.patch
hid-intel-ish-hid-fix-wcast-function-type-strict-in-devm_ishtp_alloc_workqueue.patch
hid-intel-ish-hid-use-dedicated-unbound-workqueues-to-prevent-resume-blocking.patch
iommu-sva-invalidate-stale-iotlb-entries-for-kernel-address-space.patch
mm-actually-mark-kernel-page-table-pages.patch
mm-add-a-ptdesc-flag-to-mark-kernel-page-tables.patch
mm-introduce-deferred-freeing-for-kernel-page-tables.patch
mm-introduce-pure-page-table-freeing-function.patch
mm-page_alloc-batch-page-freeing-in-decay_pcp_high.patch
mm-page_alloc-prevent-pcp-corruption-with-smp-n.patch
mm-page_alloc-vmstat-simplify-refresh_cpu_vm_stats-change-detection.patch
x86-mm-use-pagetable_free.patch
x86-mm-use-ptdesc-when-freeing-pmd-pages.patch

14 files changed:
queue-6.18/btrfs-fix-deadlock-in-wait_current_trans-due-to-ignored-transaction-type.patch [new file with mode: 0644]
queue-6.18/hid-intel-ish-hid-fix-wcast-function-type-strict-in-devm_ishtp_alloc_workqueue.patch [new file with mode: 0644]
queue-6.18/hid-intel-ish-hid-use-dedicated-unbound-workqueues-to-prevent-resume-blocking.patch [new file with mode: 0644]
queue-6.18/iommu-sva-invalidate-stale-iotlb-entries-for-kernel-address-space.patch [new file with mode: 0644]
queue-6.18/mm-actually-mark-kernel-page-table-pages.patch [new file with mode: 0644]
queue-6.18/mm-add-a-ptdesc-flag-to-mark-kernel-page-tables.patch [new file with mode: 0644]
queue-6.18/mm-introduce-deferred-freeing-for-kernel-page-tables.patch [new file with mode: 0644]
queue-6.18/mm-introduce-pure-page-table-freeing-function.patch [new file with mode: 0644]
queue-6.18/mm-page_alloc-batch-page-freeing-in-decay_pcp_high.patch [new file with mode: 0644]
queue-6.18/mm-page_alloc-prevent-pcp-corruption-with-smp-n.patch [new file with mode: 0644]
queue-6.18/mm-page_alloc-vmstat-simplify-refresh_cpu_vm_stats-change-detection.patch [new file with mode: 0644]
queue-6.18/series
queue-6.18/x86-mm-use-pagetable_free.patch [new file with mode: 0644]
queue-6.18/x86-mm-use-ptdesc-when-freeing-pmd-pages.patch [new file with mode: 0644]

diff --git a/queue-6.18/btrfs-fix-deadlock-in-wait_current_trans-due-to-ignored-transaction-type.patch b/queue-6.18/btrfs-fix-deadlock-in-wait_current_trans-due-to-ignored-transaction-type.patch
new file mode 100644 (file)
index 0000000..9ff6464
--- /dev/null
@@ -0,0 +1,154 @@
+From 5037b342825df7094a4906d1e2a9674baab50cb2 Mon Sep 17 00:00:00 2001
+From: Robbie Ko <robbieko@synology.com>
+Date: Thu, 11 Dec 2025 13:30:33 +0800
+Subject: btrfs: fix deadlock in wait_current_trans() due to ignored transaction type
+
+From: Robbie Ko <robbieko@synology.com>
+
+commit 5037b342825df7094a4906d1e2a9674baab50cb2 upstream.
+
+When wait_current_trans() is called during start_transaction(), it
+currently waits for a blocked transaction without considering whether
+the given transaction type actually needs to wait for that particular
+transaction state. The btrfs_blocked_trans_types[] array already defines
+which transaction types should wait for which transaction states, but
+this check was missing in wait_current_trans().
+
+This can lead to a deadlock scenario involving two transactions and
+pending ordered extents:
+
+  1. Transaction A is in TRANS_STATE_COMMIT_DOING state
+
+  2. A worker processing an ordered extent calls start_transaction()
+     with TRANS_JOIN
+
+  3. join_transaction() returns -EBUSY because Transaction A is in
+     TRANS_STATE_COMMIT_DOING
+
+  4. Transaction A moves to TRANS_STATE_UNBLOCKED and completes
+
+  5. A new Transaction B is created (TRANS_STATE_RUNNING)
+
+  6. The ordered extent from step 2 is added to Transaction B's
+     pending ordered extents
+
+  7. Transaction B immediately starts commit by another task and
+     enters TRANS_STATE_COMMIT_START
+
+  8. The worker finally reaches wait_current_trans(), sees Transaction B
+     in TRANS_STATE_COMMIT_START (a blocked state), and waits
+     unconditionally
+
+  9. However, TRANS_JOIN should NOT wait for TRANS_STATE_COMMIT_START
+     according to btrfs_blocked_trans_types[]
+
+  10. Transaction B is waiting for pending ordered extents to complete
+
+  11. Deadlock: Transaction B waits for ordered extent, ordered extent
+      waits for Transaction B
+
+This can be illustrated by the following call stacks:
+  CPU0                              CPU1
+                                    btrfs_finish_ordered_io()
+                                      start_transaction(TRANS_JOIN)
+                                        join_transaction()
+                                          # -EBUSY (Transaction A is
+                                          # TRANS_STATE_COMMIT_DOING)
+  # Transaction A completes
+  # Transaction B created
+  # ordered extent added to
+  # Transaction B's pending list
+  btrfs_commit_transaction()
+    # Transaction B enters
+    # TRANS_STATE_COMMIT_START
+    # waiting for pending ordered
+    # extents
+                                        wait_current_trans()
+                                          # waits for Transaction B
+                                          # (should not wait!)
+
+Task bstore_kv_sync in btrfs_commit_transaction waiting for ordered
+extents:
+
+  __schedule+0x2e7/0x8a0
+  schedule+0x64/0xe0
+  btrfs_commit_transaction+0xbf7/0xda0 [btrfs]
+  btrfs_sync_file+0x342/0x4d0 [btrfs]
+  __x64_sys_fdatasync+0x4b/0x80
+  do_syscall_64+0x33/0x40
+  entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+Task kworker in wait_current_trans waiting for transaction commit:
+
+  Workqueue: btrfs-syno_nocow btrfs_work_helper [btrfs]
+  __schedule+0x2e7/0x8a0
+  schedule+0x64/0xe0
+  wait_current_trans+0xb0/0x110 [btrfs]
+  start_transaction+0x346/0x5b0 [btrfs]
+  btrfs_finish_ordered_io.isra.0+0x49b/0x9c0 [btrfs]
+  btrfs_work_helper+0xe8/0x350 [btrfs]
+  process_one_work+0x1d3/0x3c0
+  worker_thread+0x4d/0x3e0
+  kthread+0x12d/0x150
+  ret_from_fork+0x1f/0x30
+
+Fix this by passing the transaction type to wait_current_trans() and
+checking btrfs_blocked_trans_types[cur_trans->state] against the given
+type before deciding to wait. This ensures that transaction types which
+are allowed to join during certain blocked states will not unnecessarily
+wait and cause deadlocks.
+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Robbie Ko <robbieko@synology.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Cc: Motiejus Jakštys <motiejus@jakstys.lt>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/transaction.c |   11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -518,13 +518,14 @@ static inline int is_transaction_blocked
+  * when this is done, it is safe to start a new transaction, but the current
+  * transaction might not be fully on disk.
+  */
+-static void wait_current_trans(struct btrfs_fs_info *fs_info)
++static void wait_current_trans(struct btrfs_fs_info *fs_info, unsigned int type)
+ {
+       struct btrfs_transaction *cur_trans;
+       spin_lock(&fs_info->trans_lock);
+       cur_trans = fs_info->running_transaction;
+-      if (cur_trans && is_transaction_blocked(cur_trans)) {
++      if (cur_trans && is_transaction_blocked(cur_trans) &&
++          (btrfs_blocked_trans_types[cur_trans->state] & type)) {
+               refcount_inc(&cur_trans->use_count);
+               spin_unlock(&fs_info->trans_lock);
+@@ -699,12 +700,12 @@ again:
+               sb_start_intwrite(fs_info->sb);
+       if (may_wait_transaction(fs_info, type))
+-              wait_current_trans(fs_info);
++              wait_current_trans(fs_info, type);
+       do {
+               ret = join_transaction(fs_info, type);
+               if (ret == -EBUSY) {
+-                      wait_current_trans(fs_info);
++                      wait_current_trans(fs_info, type);
+                       if (unlikely(type == TRANS_ATTACH ||
+                                    type == TRANS_JOIN_NOSTART))
+                               ret = -ENOENT;
+@@ -1001,7 +1002,7 @@ out:
+ void btrfs_throttle(struct btrfs_fs_info *fs_info)
+ {
+-      wait_current_trans(fs_info);
++      wait_current_trans(fs_info, TRANS_START);
+ }
+ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
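Condensed from the diff above, the essence of the fix is one extra bitmask test in wait_current_trans(): only wait when the blocked-state mask for the current transaction state actually includes the caller's type. The sketch below is illustrative only; it omits the waiting and reference-dropping details shown in the full diff.

	static void wait_current_trans(struct btrfs_fs_info *fs_info, unsigned int type)
	{
		struct btrfs_transaction *cur_trans;

		spin_lock(&fs_info->trans_lock);
		cur_trans = fs_info->running_transaction;
		if (cur_trans && is_transaction_blocked(cur_trans) &&
		    (btrfs_blocked_trans_types[cur_trans->state] & type)) {
			/* Only now is waiting correct: the current state really
			 * blocks this transaction type (so TRANS_JOIN no longer
			 * waits on TRANS_STATE_COMMIT_START). */
			refcount_inc(&cur_trans->use_count);
			spin_unlock(&fs_info->trans_lock);
			/* ... wait for cur_trans to leave the blocked state ... */
			return;
		}
		spin_unlock(&fs_info->trans_lock);
	}

Callers pass their own type (start_transaction() passes the requested type, btrfs_throttle() passes TRANS_START), as shown in the diff.
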
diff --git a/queue-6.18/hid-intel-ish-hid-fix-wcast-function-type-strict-in-devm_ishtp_alloc_workqueue.patch b/queue-6.18/hid-intel-ish-hid-fix-wcast-function-type-strict-in-devm_ishtp_alloc_workqueue.patch
new file mode 100644 (file)
index 0000000..56c494c
--- /dev/null
@@ -0,0 +1,64 @@
+From 3644f4411713f52bf231574aa8759e3d8e20b341 Mon Sep 17 00:00:00 2001
+From: Nathan Chancellor <nathan@kernel.org>
+Date: Wed, 22 Oct 2025 00:49:08 +0200
+Subject: HID: intel-ish-hid: Fix -Wcast-function-type-strict in devm_ishtp_alloc_workqueue()
+
+From: Nathan Chancellor <nathan@kernel.org>
+
+commit 3644f4411713f52bf231574aa8759e3d8e20b341 upstream.
+
+Clang warns (or errors with CONFIG_WERROR=y / W=e):
+
+  drivers/hid/intel-ish-hid/ipc/ipc.c:935:36: error: cast from 'void (*)(struct workqueue_struct *)' to 'void (*)(void *)' converts to incompatible function type [-Werror,-Wcast-function-type-strict]
+    935 |         if (devm_add_action_or_reset(dev, (void (*)(void *))destroy_workqueue,
+        |                                           ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  include/linux/device/devres.h:168:34: note: expanded from macro 'devm_add_action_or_reset'
+    168 |         __devm_add_action_or_ireset(dev, action, data, #action)
+        |                                         ^~~~~~
+
+This warning points out that a kernel control flow integrity (kCFI /
+CONFIG_CFI=y) violation will occur due to this function cast when
+destroy_workqueue() is indirectly called via devm_action_release(),
+because the prototype of destroy_workqueue() does not match the
+prototype of (*action)().
+
+Use a local function with the correct prototype to wrap
+destroy_workqueue() to resolve the warning and CFI violation.
+
+Reported-by: kernel test robot <lkp@intel.com>
+Closes: https://lore.kernel.org/oe-kbuild-all/202510190103.qTZvfdjj-lkp@intel.com/
+Closes: https://github.com/ClangBuiltLinux/linux/issues/2139
+Fixes: 0d30dae38fe0 ("HID: intel-ish-hid: Use dedicated unbound workqueues to prevent resume blocking")
+Signed-off-by: Nathan Chancellor <nathan@kernel.org>
+Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Reviewed-by: Zhang Lixu <lixu.zhang@intel.com>
+Signed-off-by: Jiri Kosina <jkosina@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/hid/intel-ish-hid/ipc/ipc.c |    8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/drivers/hid/intel-ish-hid/ipc/ipc.c
++++ b/drivers/hid/intel-ish-hid/ipc/ipc.c
+@@ -933,6 +933,11 @@ static const struct ishtp_hw_ops ish_hw_
+       .dma_no_cache_snooping = _dma_no_cache_snooping
+ };
++static void ishtp_free_workqueue(void *wq)
++{
++      destroy_workqueue(wq);
++}
++
+ static struct workqueue_struct *devm_ishtp_alloc_workqueue(struct device *dev)
+ {
+       struct workqueue_struct *wq;
+@@ -941,8 +946,7 @@ static struct workqueue_struct *devm_ish
+       if (!wq)
+               return NULL;
+-      if (devm_add_action_or_reset(dev, (void (*)(void *))destroy_workqueue,
+-                                   wq))
++      if (devm_add_action_or_reset(dev, ishtp_free_workqueue, wq))
+               return NULL;
+       return wq;
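The pattern above generalizes: under kCFI, a devm release action must really have the void (*)(void *) prototype, so a one-line wrapper replaces the function-pointer cast. A minimal sketch of the pattern as used in the diff (all names are taken from the patch):

	/* Wrapper with the exact prototype devm_add_action_or_reset() expects,
	 * so the indirect call through devm_action_release() passes kCFI checks. */
	static void ishtp_free_workqueue(void *wq)
	{
		destroy_workqueue(wq);
	}

	/* ... inside devm_ishtp_alloc_workqueue(): */
		if (devm_add_action_or_reset(dev, ishtp_free_workqueue, wq))
			return NULL;
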
diff --git a/queue-6.18/hid-intel-ish-hid-use-dedicated-unbound-workqueues-to-prevent-resume-blocking.patch b/queue-6.18/hid-intel-ish-hid-use-dedicated-unbound-workqueues-to-prevent-resume-blocking.patch
new file mode 100644 (file)
index 0000000..deab1d9
--- /dev/null
@@ -0,0 +1,201 @@
+From 0d30dae38fe01cd1de358c6039a0b1184689fe51 Mon Sep 17 00:00:00 2001
+From: Zhang Lixu <lixu.zhang@intel.com>
+Date: Fri, 10 Oct 2025 13:52:54 +0800
+Subject: HID: intel-ish-hid: Use dedicated unbound workqueues to prevent resume blocking
+
+From: Zhang Lixu <lixu.zhang@intel.com>
+
+commit 0d30dae38fe01cd1de358c6039a0b1184689fe51 upstream.
+
+During suspend/resume tests with S2IDLE, some ISH functional failures were
+observed because of a delay in executing the ISH resume handler. Here,
+schedule_work() is used from the resume handler to do the actual work.
+schedule_work() uses system_wq, which is a per-CPU workqueue. Although
+the queuing is not bound to a CPU, it prefers the local CPU of the caller,
+unless prohibited.
+
+Users of this work queue are not supposed to queue long running work.
+But in practice, there are scenarios where long running work items are
+queued on other unbound workqueues, occupying the CPU. As a result, the
+ISH resume handler may not get a chance to execute in a timely manner.
+
+In one scenario, one of the ish_resume_handler() executions was delayed
+nearly 1 second because another work item on an unbound workqueue occupied
+the same CPU. This delay causes ISH functionality failures.
+
+A similar issue was previously observed where the ISH HID driver timed out
+while getting the HID descriptor during S4 resume in the recovery kernel,
+likely caused by the same workqueue contention problem.
+
+Create dedicated unbound workqueues for all ISH operations to allow work
+items to execute on any available CPU, eliminating CPU-specific bottlenecks
+and improving resume reliability under varying system loads. Also, ISH has
+three different components: a bus driver which implements the ISH protocols,
+a PCI interface layer, and a HID interface. Use one dedicated workqueue for
+all of them.
+
+Signed-off-by: Zhang Lixu <lixu.zhang@intel.com>
+Signed-off-by: Jiri Kosina <jkosina@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/hid/intel-ish-hid/ipc/ipc.c          |   21 ++++++++++++++++++++-
+ drivers/hid/intel-ish-hid/ipc/pci-ish.c      |    2 +-
+ drivers/hid/intel-ish-hid/ishtp-hid-client.c |    4 ++--
+ drivers/hid/intel-ish-hid/ishtp/bus.c        |   18 +++++++++++++++++-
+ drivers/hid/intel-ish-hid/ishtp/hbm.c        |    4 ++--
+ drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h  |    3 +++
+ include/linux/intel-ish-client-if.h          |    2 ++
+ 7 files changed, 47 insertions(+), 7 deletions(-)
+
+--- a/drivers/hid/intel-ish-hid/ipc/ipc.c
++++ b/drivers/hid/intel-ish-hid/ipc/ipc.c
+@@ -628,7 +628,7 @@ static void        recv_ipc(struct ishtp_device
+               if (!ishtp_dev) {
+                       ishtp_dev = dev;
+               }
+-              schedule_work(&fw_reset_work);
++              queue_work(dev->unbound_wq, &fw_reset_work);
+               break;
+       case MNG_RESET_NOTIFY_ACK:
+@@ -933,6 +933,21 @@ static const struct ishtp_hw_ops ish_hw_
+       .dma_no_cache_snooping = _dma_no_cache_snooping
+ };
++static struct workqueue_struct *devm_ishtp_alloc_workqueue(struct device *dev)
++{
++      struct workqueue_struct *wq;
++
++      wq = alloc_workqueue("ishtp_unbound_%d", WQ_UNBOUND, 0, dev->id);
++      if (!wq)
++              return NULL;
++
++      if (devm_add_action_or_reset(dev, (void (*)(void *))destroy_workqueue,
++                                   wq))
++              return NULL;
++
++      return wq;
++}
++
+ /**
+  * ish_dev_init() -Initialize ISH devoce
+  * @pdev: PCI device
+@@ -953,6 +968,10 @@ struct ishtp_device *ish_dev_init(struct
+       if (!dev)
+               return NULL;
++      dev->unbound_wq = devm_ishtp_alloc_workqueue(&pdev->dev);
++      if (!dev->unbound_wq)
++              return NULL;
++
+       dev->devc = &pdev->dev;
+       ishtp_device_init(dev);
+--- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c
++++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c
+@@ -384,7 +384,7 @@ static int __maybe_unused ish_resume(str
+       ish_resume_device = device;
+       dev->resume_flag = 1;
+-      schedule_work(&resume_work);
++      queue_work(dev->unbound_wq, &resume_work);
+       return 0;
+ }
+--- a/drivers/hid/intel-ish-hid/ishtp-hid-client.c
++++ b/drivers/hid/intel-ish-hid/ishtp-hid-client.c
+@@ -860,7 +860,7 @@ static int hid_ishtp_cl_reset(struct ish
+       hid_ishtp_trace(client_data, "%s hid_ishtp_cl %p\n", __func__,
+                       hid_ishtp_cl);
+-      schedule_work(&client_data->work);
++      queue_work(ishtp_get_workqueue(cl_device), &client_data->work);
+       return 0;
+ }
+@@ -902,7 +902,7 @@ static int hid_ishtp_cl_resume(struct de
+       hid_ishtp_trace(client_data, "%s hid_ishtp_cl %p\n", __func__,
+                       hid_ishtp_cl);
+-      schedule_work(&client_data->resume_work);
++      queue_work(ishtp_get_workqueue(cl_device), &client_data->resume_work);
+       return 0;
+ }
+--- a/drivers/hid/intel-ish-hid/ishtp/bus.c
++++ b/drivers/hid/intel-ish-hid/ishtp/bus.c
+@@ -541,7 +541,7 @@ void ishtp_cl_bus_rx_event(struct ishtp_
+               return;
+       if (device->event_cb)
+-              schedule_work(&device->event_work);
++              queue_work(device->ishtp_dev->unbound_wq, &device->event_work);
+ }
+ /**
+@@ -877,6 +877,22 @@ struct device *ishtp_get_pci_device(stru
+ EXPORT_SYMBOL(ishtp_get_pci_device);
+ /**
++ * ishtp_get_workqueue - Retrieve the workqueue associated with an ISHTP device
++ * @cl_device: Pointer to the ISHTP client device structure
++ *
++ * Returns the workqueue_struct pointer (unbound_wq) associated with the given
++ * ISHTP client device. This workqueue is typically used for scheduling work
++ * related to the device.
++ *
++ * Return: Pointer to struct workqueue_struct.
++ */
++struct workqueue_struct *ishtp_get_workqueue(struct ishtp_cl_device *cl_device)
++{
++      return cl_device->ishtp_dev->unbound_wq;
++}
++EXPORT_SYMBOL(ishtp_get_workqueue);
++
++/**
+  * ishtp_trace_callback() - Return trace callback
+  * @cl_device: ISH-TP client device instance
+  *
+--- a/drivers/hid/intel-ish-hid/ishtp/hbm.c
++++ b/drivers/hid/intel-ish-hid/ishtp/hbm.c
+@@ -573,7 +573,7 @@ void ishtp_hbm_dispatch(struct ishtp_dev
+               /* Start firmware loading process if it has loader capability */
+               if (version_res->host_version_supported & ISHTP_SUPPORT_CAP_LOADER)
+-                      schedule_work(&dev->work_fw_loader);
++                      queue_work(dev->unbound_wq, &dev->work_fw_loader);
+               dev->version.major_version = HBM_MAJOR_VERSION;
+               dev->version.minor_version = HBM_MINOR_VERSION;
+@@ -864,7 +864,7 @@ void       recv_hbm(struct ishtp_device *dev,
+       dev->rd_msg_fifo_tail = (dev->rd_msg_fifo_tail + IPC_PAYLOAD_SIZE) %
+               (RD_INT_FIFO_SIZE * IPC_PAYLOAD_SIZE);
+       spin_unlock_irqrestore(&dev->rd_msg_spinlock, flags);
+-      schedule_work(&dev->bh_hbm_work);
++      queue_work(dev->unbound_wq, &dev->bh_hbm_work);
+ eoi:
+       return;
+ }
+--- a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
++++ b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
+@@ -175,6 +175,9 @@ struct ishtp_device {
+       struct hbm_version version;
+       int transfer_path; /* Choice of transfer path: IPC or DMA */
++      /* Alloc a dedicated unbound workqueue for ishtp device */
++      struct workqueue_struct *unbound_wq;
++
+       /* work structure for scheduling firmware loading tasks */
+       struct work_struct work_fw_loader;
+       /* waitq for waiting for command response from the firmware loader */
+--- a/include/linux/intel-ish-client-if.h
++++ b/include/linux/intel-ish-client-if.h
+@@ -87,6 +87,8 @@ bool ishtp_wait_resume(struct ishtp_devi
+ ishtp_print_log ishtp_trace_callback(struct ishtp_cl_device *cl_device);
+ /* Get device pointer of PCI device for DMA acces */
+ struct device *ishtp_get_pci_device(struct ishtp_cl_device *cl_device);
++/* Get the ISHTP workqueue */
++struct workqueue_struct *ishtp_get_workqueue(struct ishtp_cl_device *cl_device);
+ struct ishtp_cl *ishtp_cl_allocate(struct ishtp_cl_device *cl_device);
+ void ishtp_cl_free(struct ishtp_cl *cl);
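Taken together with the -Wcast-function-type-strict follow-up earlier in this queue, the allocation side of this change reduces to the sketch below: one devm-managed WQ_UNBOUND workqueue per ISH device, which callers then use via queue_work() instead of schedule_work(). Names come from the two diffs; the destroy wrapper is the one added by the follow-up fix.

	static void ishtp_free_workqueue(void *wq)
	{
		destroy_workqueue(wq);
	}

	static struct workqueue_struct *devm_ishtp_alloc_workqueue(struct device *dev)
	{
		struct workqueue_struct *wq;

		/* WQ_UNBOUND: work items may run on any available CPU, so a
		 * long-running item elsewhere cannot starve the ISH resume path. */
		wq = alloc_workqueue("ishtp_unbound_%d", WQ_UNBOUND, 0, dev->id);
		if (!wq)
			return NULL;

		if (devm_add_action_or_reset(dev, ishtp_free_workqueue, wq))
			return NULL;

		return wq;
	}

	/* Callers then queue onto the device's queue, e.g. in ish_resume(): */
		queue_work(dev->unbound_wq, &resume_work);
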
diff --git a/queue-6.18/iommu-sva-invalidate-stale-iotlb-entries-for-kernel-address-space.patch b/queue-6.18/iommu-sva-invalidate-stale-iotlb-entries-for-kernel-address-space.patch
new file mode 100644 (file)
index 0000000..ccf8869
--- /dev/null
@@ -0,0 +1,194 @@
+From e37d5a2d60a338c5917c45296bac65da1382eda5 Mon Sep 17 00:00:00 2001
+From: Lu Baolu <baolu.lu@linux.intel.com>
+Date: Wed, 22 Oct 2025 16:26:34 +0800
+Subject: iommu/sva: invalidate stale IOTLB entries for kernel address space
+
+From: Lu Baolu <baolu.lu@linux.intel.com>
+
+commit e37d5a2d60a338c5917c45296bac65da1382eda5 upstream.
+
+Introduce a new IOMMU interface to flush IOTLB paging cache entries for
+the CPU kernel address space.  This interface is invoked from the x86
+architecture code that manages combined user and kernel page tables,
+specifically before any kernel page table page is freed and reused.
+
+This addresses the main issue with vfree(), which is a common occurrence
+and can be triggered by unprivileged users.  While this resolves the
+primary problem, it doesn't address an extremely rare case related to
+memory unplug of memory that was present as reserved memory at boot, which
+cannot be triggered by unprivileged users.  The discussion can be found at
+the link below.
+
+Enable SVA on the x86 architecture, since the IOMMU can now receive a
+notification to flush the paging cache before the CPU kernel page table
+pages are freed.
+
+Link: https://lkml.kernel.org/r/20251022082635.2462433-9-baolu.lu@linux.intel.com
+Link: https://lore.kernel.org/linux-iommu/04983c62-3b1d-40d4-93ae-34ca04b827e5@intel.com/
+Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Suggested-by: Jann Horn <jannh@google.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Robin Murphy <robin.murphy@arm.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yi Lai <yi1.lai@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/Kconfig          |    1 +
+ drivers/iommu/iommu-sva.c |   32 ++++++++++++++++++++++++++++----
+ include/linux/iommu.h     |    4 ++++
+ mm/pgtable-generic.c      |    2 ++
+ 4 files changed, 35 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -279,6 +279,7 @@ config X86
+       select HAVE_PCI
+       select HAVE_PERF_REGS
+       select HAVE_PERF_USER_STACK_DUMP
++      select ASYNC_KERNEL_PGTABLE_FREE        if IOMMU_SVA
+       select MMU_GATHER_RCU_TABLE_FREE
+       select MMU_GATHER_MERGE_VMAS
+       select HAVE_POSIX_CPU_TIMERS_TASK_WORK
+--- a/drivers/iommu/iommu-sva.c
++++ b/drivers/iommu/iommu-sva.c
+@@ -10,6 +10,8 @@
+ #include "iommu-priv.h"
+ static DEFINE_MUTEX(iommu_sva_lock);
++static bool iommu_sva_present;
++static LIST_HEAD(iommu_sva_mms);
+ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
+                                                  struct mm_struct *mm);
+@@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc
+               return ERR_PTR(-ENOSPC);
+       }
+       iommu_mm->pasid = pasid;
++      iommu_mm->mm = mm;
+       INIT_LIST_HEAD(&iommu_mm->sva_domains);
+       /*
+        * Make sure the write to mm->iommu_mm is not reordered in front of
+@@ -77,9 +80,6 @@ struct iommu_sva *iommu_sva_bind_device(
+       if (!group)
+               return ERR_PTR(-ENODEV);
+-      if (IS_ENABLED(CONFIG_X86))
+-              return ERR_PTR(-EOPNOTSUPP);
+-
+       mutex_lock(&iommu_sva_lock);
+       /* Allocate mm->pasid if necessary. */
+@@ -135,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device(
+       if (ret)
+               goto out_free_domain;
+       domain->users = 1;
+-      list_add(&domain->next, &mm->iommu_mm->sva_domains);
++      if (list_empty(&iommu_mm->sva_domains)) {
++              if (list_empty(&iommu_sva_mms))
++                      iommu_sva_present = true;
++              list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
++      }
++      list_add(&domain->next, &iommu_mm->sva_domains);
+ out:
+       refcount_set(&handle->users, 1);
+       mutex_unlock(&iommu_sva_lock);
+@@ -178,6 +183,13 @@ void iommu_sva_unbind_device(struct iomm
+               list_del(&domain->next);
+               iommu_domain_free(domain);
+       }
++
++      if (list_empty(&iommu_mm->sva_domains)) {
++              list_del(&iommu_mm->mm_list_elm);
++              if (list_empty(&iommu_sva_mms))
++                      iommu_sva_present = false;
++      }
++
+       mutex_unlock(&iommu_sva_lock);
+       kfree(handle);
+ }
+@@ -315,3 +327,15 @@ static struct iommu_domain *iommu_sva_do
+       return domain;
+ }
++
++void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
++{
++      struct iommu_mm_data *iommu_mm;
++
++      guard(mutex)(&iommu_sva_lock);
++      if (!iommu_sva_present)
++              return;
++
++      list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
++              mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
++}
+--- a/include/linux/iommu.h
++++ b/include/linux/iommu.h
+@@ -1134,7 +1134,9 @@ struct iommu_sva {
+ struct iommu_mm_data {
+       u32                     pasid;
++      struct mm_struct        *mm;
+       struct list_head        sva_domains;
++      struct list_head        mm_list_elm;
+ };
+ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode);
+@@ -1615,6 +1617,7 @@ struct iommu_sva *iommu_sva_bind_device(
+                                       struct mm_struct *mm);
+ void iommu_sva_unbind_device(struct iommu_sva *handle);
+ u32 iommu_sva_get_pasid(struct iommu_sva *handle);
++void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end);
+ #else
+ static inline struct iommu_sva *
+ iommu_sva_bind_device(struct device *dev, struct mm_struct *mm)
+@@ -1639,6 +1642,7 @@ static inline u32 mm_get_enqcmd_pasid(st
+ }
+ static inline void mm_pasid_drop(struct mm_struct *mm) {}
++static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {}
+ #endif /* CONFIG_IOMMU_SVA */
+ #ifdef CONFIG_IOMMU_IOPF
+--- a/mm/pgtable-generic.c
++++ b/mm/pgtable-generic.c
+@@ -13,6 +13,7 @@
+ #include <linux/swap.h>
+ #include <linux/swapops.h>
+ #include <linux/mm_inline.h>
++#include <linux/iommu.h>
+ #include <asm/pgalloc.h>
+ #include <asm/tlb.h>
+@@ -430,6 +431,7 @@ static void kernel_pgtable_work_func(str
+       list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
+       spin_unlock(&kernel_pgtable_work.lock);
++      iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL);
+       list_for_each_entry_safe(pt, next, &page_list, pt_list)
+               __pagetable_free(pt);
+ }
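The new interface is deliberately coarse: it walks every mm that currently has an SVA-bound device and invalidates the whole kernel address range through the existing secondary-TLB notifier. A condensed sketch of the helper and its single call site, both taken from the diff above:

	void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
	{
		struct iommu_mm_data *iommu_mm;

		guard(mutex)(&iommu_sva_lock);
		if (!iommu_sva_present)
			return;

		/* Flush stale kernel-VA IOTLB entries for every SVA-bound mm. */
		list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
			mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm,
								    start, end);
	}

	/* Called from the deferred kernel page-table free worker, before the
	 * queued page-table pages are handed back to the page allocator: */
		iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL);
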
diff --git a/queue-6.18/mm-actually-mark-kernel-page-table-pages.patch b/queue-6.18/mm-actually-mark-kernel-page-table-pages.patch
new file mode 100644 (file)
index 0000000..cecf10d
--- /dev/null
@@ -0,0 +1,118 @@
+From 977870522af34359b461060597ee3a86f27450d6 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Wed, 22 Oct 2025 16:26:29 +0800
+Subject: mm: actually mark kernel page table pages
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 977870522af34359b461060597ee3a86f27450d6 upstream.
+
+Now that the API is in place, mark kernel page table pages just after they
+are allocated.  Unmark them just before they are freed.
+
+Note: Unconditionally clearing the 'kernel' marking (via
+ptdesc_clear_kernel()) would be functionally identical to what is here.
+But having the if() makes it logically clear that this function can be
+used for kernel and non-kernel page tables.
+
+Link: https://lkml.kernel.org/r/20251022082635.2462433-4-baolu.lu@linux.intel.com
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Robin Murphy <robin.murphy@arm.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vasant Hegde <vasant.hegde@amd.com>
+Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yi Lai <yi1.lai@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/asm-generic/pgalloc.h |   18 ++++++++++++++++++
+ include/linux/mm.h            |    3 +++
+ 2 files changed, 21 insertions(+)
+
+--- a/include/asm-generic/pgalloc.h
++++ b/include/asm-generic/pgalloc.h
+@@ -28,6 +28,8 @@ static inline pte_t *__pte_alloc_one_ker
+               return NULL;
+       }
++      ptdesc_set_kernel(ptdesc);
++
+       return ptdesc_address(ptdesc);
+ }
+ #define __pte_alloc_one_kernel(...)   alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__))
+@@ -146,6 +148,10 @@ static inline pmd_t *pmd_alloc_one_nopro
+               pagetable_free(ptdesc);
+               return NULL;
+       }
++
++      if (mm == &init_mm)
++              ptdesc_set_kernel(ptdesc);
++
+       return ptdesc_address(ptdesc);
+ }
+ #define pmd_alloc_one(...)    alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__))
+@@ -179,6 +185,10 @@ static inline pud_t *__pud_alloc_one_nop
+               return NULL;
+       pagetable_pud_ctor(ptdesc);
++
++      if (mm == &init_mm)
++              ptdesc_set_kernel(ptdesc);
++
+       return ptdesc_address(ptdesc);
+ }
+ #define __pud_alloc_one(...)  alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__))
+@@ -233,6 +243,10 @@ static inline p4d_t *__p4d_alloc_one_nop
+               return NULL;
+       pagetable_p4d_ctor(ptdesc);
++
++      if (mm == &init_mm)
++              ptdesc_set_kernel(ptdesc);
++
+       return ptdesc_address(ptdesc);
+ }
+ #define __p4d_alloc_one(...)  alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__))
+@@ -277,6 +291,10 @@ static inline pgd_t *__pgd_alloc_noprof(
+               return NULL;
+       pagetable_pgd_ctor(ptdesc);
++
++      if (mm == &init_mm)
++              ptdesc_set_kernel(ptdesc);
++
+       return ptdesc_address(ptdesc);
+ }
+ #define __pgd_alloc(...)      alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__))
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3042,6 +3042,9 @@ static inline void pagetable_free(struct
+ {
+       struct page *page = ptdesc_page(pt);
++      if (ptdesc_test_kernel(pt))
++              ptdesc_clear_kernel(pt);
++
+       __free_pages(page, compound_order(page));
+ }
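Each allocator follows the same two-line pattern after its constructor: the ptdesc is marked as a kernel page table only when it is being allocated for init_mm. Below is an approximate reconstruction of the pud case; the gfp selection lines are surrounding context that is not visible in the diff above and are included only to make the sketch self-contained.

	static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
	{
		gfp_t gfp = GFP_PGTABLE_USER;
		struct ptdesc *ptdesc;

		if (mm == &init_mm)
			gfp = GFP_PGTABLE_KERNEL;
		ptdesc = pagetable_alloc_noprof(gfp, 0);
		if (!ptdesc)
			return NULL;
		pagetable_pud_ctor(ptdesc);

		/* Kernel page tables (init_mm) get the marking; user ones do not. */
		if (mm == &init_mm)
			ptdesc_set_kernel(ptdesc);

		return ptdesc_address(ptdesc);
	}
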
diff --git a/queue-6.18/mm-add-a-ptdesc-flag-to-mark-kernel-page-tables.patch b/queue-6.18/mm-add-a-ptdesc-flag-to-mark-kernel-page-tables.patch
new file mode 100644 (file)
index 0000000..ed6ebf5
--- /dev/null
@@ -0,0 +1,121 @@
+From 27bfafac65d87c58639f5d7af1353ec1e7886963 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Wed, 22 Oct 2025 16:26:28 +0800
+Subject: mm: add a ptdesc flag to mark kernel page tables
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 27bfafac65d87c58639f5d7af1353ec1e7886963 upstream.
+
+The page tables used to map the kernel and userspace often have very
+different handling rules.  There are frequently *_kernel() variants of
+functions just for kernel page tables.  That's not great and has led to
+code duplication.
+
+Instead of having completely separate call paths, allow a 'ptdesc' to be
+marked as being for kernel mappings.  Introduce helpers to set and clear
+this status.
+
+Note: this uses the PG_referenced bit.  Page flags are a great fit for
+this since it is truly a single bit of information.  Use PG_referenced
+itself because it's a fairly benign flag (as opposed to things like
+PG_locked).  It's also (according to Willy) unlikely to go away any time
+soon.
+
+PG_referenced is not in PAGE_FLAGS_CHECK_AT_FREE.  It does not need to be
+cleared before freeing the page, and pages coming out of the allocator
+should have it cleared.  Regardless, introduce an API to clear it anyway.
+Having symmetry in the API makes it easier to change the underlying
+implementation later, like if there was a need to move to a
+PAGE_FLAGS_CHECK_AT_FREE bit.
+
+Link: https://lkml.kernel.org/r/20251022082635.2462433-3-baolu.lu@linux.intel.com
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Robin Murphy <robin.murphy@arm.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vasant Hegde <vasant.hegde@amd.com>
+Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yi Lai <yi1.lai@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h |   41 +++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 41 insertions(+)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2947,6 +2947,7 @@ static inline pmd_t *pmd_alloc(struct mm
+ #endif /* CONFIG_MMU */
+ enum pt_flags {
++      PT_kernel = PG_referenced,
+       PT_reserved = PG_reserved,
+       /* High bits are used for zone/node/section */
+ };
+@@ -2973,6 +2974,46 @@ static inline bool pagetable_is_reserved
+ }
+ /**
++ * ptdesc_set_kernel - Mark a ptdesc used to map the kernel
++ * @ptdesc: The ptdesc to be marked
++ *
++ * Kernel page tables often need special handling. Set a flag so that
++ * the handling code knows this ptdesc will not be used for userspace.
++ */
++static inline void ptdesc_set_kernel(struct ptdesc *ptdesc)
++{
++      set_bit(PT_kernel, &ptdesc->pt_flags.f);
++}
++
++/**
++ * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel
++ * @ptdesc: The ptdesc to be unmarked
++ *
++ * Use when the ptdesc is no longer used to map the kernel and no longer
++ * needs special handling.
++ */
++static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc)
++{
++      /*
++       * Note: the 'PG_referenced' bit does not strictly need to be
++       * cleared before freeing the page. But this is nice for
++       * symmetry.
++       */
++      clear_bit(PT_kernel, &ptdesc->pt_flags.f);
++}
++
++/**
++ * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel
++ * @ptdesc: The ptdesc being tested
++ *
++ * Call to tell if the ptdesc used to map the kernel.
++ */
++static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc)
++{
++      return test_bit(PT_kernel, &ptdesc->pt_flags.f);
++}
++
++/**
+  * pagetable_alloc - Allocate pagetables
+  * @gfp:    GFP flags
+  * @order:  desired pagetable order
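The three helpers added above are thin set_bit()/clear_bit()/test_bit() wrappers on the ptdesc flags word, with PT_kernel aliased to PG_referenced. A short usage sketch with a hypothetical caller (the helper names and GFP_PGTABLE_KERNEL are from the kernel; the surrounding code is illustrative only):

	struct ptdesc *pt = pagetable_alloc(GFP_PGTABLE_KERNEL, 0);

	ptdesc_set_kernel(pt);            /* mark: this table maps the kernel */

	if (ptdesc_test_kernel(pt)) {
		/* choose the kernel-only handling path, e.g. deferred freeing */
	}

	ptdesc_clear_kernel(pt);          /* symmetry: unmark again before freeing */
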
diff --git a/queue-6.18/mm-introduce-deferred-freeing-for-kernel-page-tables.patch b/queue-6.18/mm-introduce-deferred-freeing-for-kernel-page-tables.patch
new file mode 100644 (file)
index 0000000..f02de4d
--- /dev/null
@@ -0,0 +1,143 @@
+From 5ba2f0a1556479638ac11a3c201421f5515e89f5 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Wed, 22 Oct 2025 16:26:33 +0800
+Subject: mm: introduce deferred freeing for kernel page tables
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 5ba2f0a1556479638ac11a3c201421f5515e89f5 upstream.
+
+This introduces a conditional asynchronous mechanism, enabled by
+CONFIG_ASYNC_KERNEL_PGTABLE_FREE.  When enabled, this mechanism defers the
+freeing of pages that are used as page tables for kernel address mappings.
+These pages are now queued to a work struct instead of being freed
+immediately.
+
+This deferred freeing allows for batch-freeing of page tables, providing a
+safe context for performing a single expensive operation (TLB flush) for a
+batch of kernel page tables instead of performing that expensive operation
+for each page table.
+
+Link: https://lkml.kernel.org/r/20251022082635.2462433-8-baolu.lu@linux.intel.com
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Robin Murphy <robin.murphy@arm.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vasant Hegde <vasant.hegde@amd.com>
+Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yi Lai <yi1.lai@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h   |   16 +++++++++++++---
+ mm/Kconfig           |    3 +++
+ mm/pgtable-generic.c |   37 +++++++++++++++++++++++++++++++++++++
+ 3 files changed, 53 insertions(+), 3 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3038,6 +3038,14 @@ static inline void __pagetable_free(stru
+       __free_pages(page, compound_order(page));
+ }
++#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
++void pagetable_free_kernel(struct ptdesc *pt);
++#else
++static inline void pagetable_free_kernel(struct ptdesc *pt)
++{
++      __pagetable_free(pt);
++}
++#endif
+ /**
+  * pagetable_free - Free pagetables
+  * @pt:       The page table descriptor
+@@ -3047,10 +3055,12 @@ static inline void __pagetable_free(stru
+  */
+ static inline void pagetable_free(struct ptdesc *pt)
+ {
+-      if (ptdesc_test_kernel(pt))
++      if (ptdesc_test_kernel(pt)) {
+               ptdesc_clear_kernel(pt);
+-
+-      __pagetable_free(pt);
++              pagetable_free_kernel(pt);
++      } else {
++              __pagetable_free(pt);
++      }
+ }
+ #if defined(CONFIG_SPLIT_PTE_PTLOCKS)
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -915,6 +915,9 @@ config HAVE_GIGANTIC_FOLIOS
+       def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \
+                (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
++config ASYNC_KERNEL_PGTABLE_FREE
++      def_bool n
++
+ # TODO: Allow to be enabled without THP
+ config ARCH_SUPPORTS_HUGE_PFNMAP
+       def_bool n
+--- a/mm/pgtable-generic.c
++++ b/mm/pgtable-generic.c
+@@ -406,3 +406,40 @@ again:
+       pte_unmap_unlock(pte, ptl);
+       goto again;
+ }
++
++#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
++static void kernel_pgtable_work_func(struct work_struct *work);
++
++static struct {
++      struct list_head list;
++      /* protect above ptdesc lists */
++      spinlock_t lock;
++      struct work_struct work;
++} kernel_pgtable_work = {
++      .list = LIST_HEAD_INIT(kernel_pgtable_work.list),
++      .lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock),
++      .work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func),
++};
++
++static void kernel_pgtable_work_func(struct work_struct *work)
++{
++      struct ptdesc *pt, *next;
++      LIST_HEAD(page_list);
++
++      spin_lock(&kernel_pgtable_work.lock);
++      list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
++      spin_unlock(&kernel_pgtable_work.lock);
++
++      list_for_each_entry_safe(pt, next, &page_list, pt_list)
++              __pagetable_free(pt);
++}
++
++void pagetable_free_kernel(struct ptdesc *pt)
++{
++      spin_lock(&kernel_pgtable_work.lock);
++      list_add(&pt->pt_list, &kernel_pgtable_work.list);
++      spin_unlock(&kernel_pgtable_work.lock);
++
++      schedule_work(&kernel_pgtable_work.work);
++}
++#endif
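The mechanism is a single global list plus one work item: pagetable_free_kernel() queues the ptdesc and kicks the worker, and the worker splices the list out and frees the whole batch, which is where one expensive operation (such as the IOTLB flush added later in this queue) can cover many page tables at once. The dispatch in pagetable_free(), condensed from the diff above:

	static inline void pagetable_free(struct ptdesc *pt)
	{
		if (ptdesc_test_kernel(pt)) {
			ptdesc_clear_kernel(pt);
			/* CONFIG_ASYNC_KERNEL_PGTABLE_FREE: queue for batched,
			 * deferred freeing in kernel_pgtable_work_func(). */
			pagetable_free_kernel(pt);
		} else {
			/* User page tables are still freed immediately. */
			__pagetable_free(pt);
		}
	}
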
diff --git a/queue-6.18/mm-introduce-pure-page-table-freeing-function.patch b/queue-6.18/mm-introduce-pure-page-table-freeing-function.patch
new file mode 100644 (file)
index 0000000..8719749
--- /dev/null
@@ -0,0 +1,77 @@
+From 01894295672335ff304beed4359f30d14d5765f2 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Wed, 22 Oct 2025 16:26:31 +0800
+Subject: mm: introduce pure page table freeing function
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 01894295672335ff304beed4359f30d14d5765f2 upstream.
+
+The pages used for ptdescs are currently freed back to the allocator in a
+single location.  They will shortly be freed from a second location.
+
+Create a simple helper that just frees them back to the allocator.
+
+Link: https://lkml.kernel.org/r/20251022082635.2462433-6-baolu.lu@linux.intel.com
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Robin Murphy <robin.murphy@arm.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vasant Hegde <vasant.hegde@amd.com>
+Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yi Lai <yi1.lai@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h |   11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3031,6 +3031,13 @@ static inline struct ptdesc *pagetable_a
+ }
+ #define pagetable_alloc(...)  alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))
++static inline void __pagetable_free(struct ptdesc *pt)
++{
++      struct page *page = ptdesc_page(pt);
++
++      __free_pages(page, compound_order(page));
++}
++
+ /**
+  * pagetable_free - Free pagetables
+  * @pt:       The page table descriptor
+@@ -3040,12 +3047,10 @@ static inline struct ptdesc *pagetable_a
+  */
+ static inline void pagetable_free(struct ptdesc *pt)
+ {
+-      struct page *page = ptdesc_page(pt);
+-
+       if (ptdesc_test_kernel(pt))
+               ptdesc_clear_kernel(pt);
+-      __free_pages(page, compound_order(page));
++      __pagetable_free(pt);
+ }
+ #if defined(CONFIG_SPLIT_PTE_PTLOCKS)
diff --git a/queue-6.18/mm-page_alloc-batch-page-freeing-in-decay_pcp_high.patch b/queue-6.18/mm-page_alloc-batch-page-freeing-in-decay_pcp_high.patch
new file mode 100644 (file)
index 0000000..8e387e0
--- /dev/null
@@ -0,0 +1,66 @@
+From fc4b909c368f3a7b08c895dd5926476b58e85312 Mon Sep 17 00:00:00 2001
+From: Joshua Hahn <joshua.hahnjy@gmail.com>
+Date: Tue, 14 Oct 2025 07:50:09 -0700
+Subject: mm/page_alloc: batch page freeing in decay_pcp_high
+
+From: Joshua Hahn <joshua.hahnjy@gmail.com>
+
+commit fc4b909c368f3a7b08c895dd5926476b58e85312 upstream.
+
+It is possible for pcp->count - pcp->high to exceed pcp->batch by a lot.
+When this happens, we should perform batching to ensure that
+free_pcppages_bulk() isn't called with too many pages to free at once,
+starving out other threads that need the pcp or zone lock.
+
+Since we are still only freeing the difference between the initial
+pcp->count and pcp->high values, there should be no change to how many
+pages are freed.
+
+Link: https://lkml.kernel.org/r/20251014145011.3427205-3-joshua.hahnjy@gmail.com
+Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
+Suggested-by: Chris Mason <clm@fb.com>
+Suggested-by: Andrew Morton <akpm@linux-foundation.org>
+Co-developed-by: Johannes Weiner <hannes@cmpxchg.org>
+Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Brendan Jackman <jackmanb@google.com>
+Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: SeongJae Park <sj@kernel.org>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Zi Yan <ziy@nvidia.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 038a102535eb ("mm/page_alloc: prevent pcp corruption with SMP=n")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/page_alloc.c |    9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -2554,7 +2554,7 @@ static int rmqueue_bulk(struct zone *zon
+  */
+ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+ {
+-      int high_min, to_drain, batch;
++      int high_min, to_drain, to_drain_batched, batch;
+       bool todo = false;
+       high_min = READ_ONCE(pcp->high_min);
+@@ -2572,11 +2572,14 @@ bool decay_pcp_high(struct zone *zone, s
+       }
+       to_drain = pcp->count - pcp->high;
+-      if (to_drain > 0) {
++      while (to_drain > 0) {
++              to_drain_batched = min(to_drain, batch);
+               spin_lock(&pcp->lock);
+-              free_pcppages_bulk(zone, to_drain, pcp, 0);
++              free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
+               spin_unlock(&pcp->lock);
+               todo = true;
++
++              to_drain -= to_drain_batched;
+       }
+       return todo;
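The change keeps the total number of freed pages the same but caps each pcp-lock hold at pcp->batch pages. Condensed from the diff above (the todo bookkeeping is omitted here):

	to_drain = pcp->count - pcp->high;
	while (to_drain > 0) {
		int to_drain_batched = min(to_drain, batch);

		/* Free at most one batch per lock hold so other users of the
		 * pcp and zone locks are not starved. */
		spin_lock(&pcp->lock);
		free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
		spin_unlock(&pcp->lock);

		to_drain -= to_drain_batched;
	}
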
diff --git a/queue-6.18/mm-page_alloc-prevent-pcp-corruption-with-smp-n.patch b/queue-6.18/mm-page_alloc-prevent-pcp-corruption-with-smp-n.patch
new file mode 100644 (file)
index 0000000..b304fea
--- /dev/null
@@ -0,0 +1,210 @@
+From 038a102535eb49e10e93eafac54352fcc5d78847 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Mon, 5 Jan 2026 16:08:56 +0100
+Subject: mm/page_alloc: prevent pcp corruption with SMP=n
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 038a102535eb49e10e93eafac54352fcc5d78847 upstream.
+
+The kernel test robot has reported:
+
+ BUG: spinlock trylock failure on UP on CPU#0, kcompactd0/28
+  lock: 0xffff888807e35ef0, .magic: dead4ead, .owner: kcompactd0/28, .owner_cpu: 0
+ CPU: 0 UID: 0 PID: 28 Comm: kcompactd0 Not tainted 6.18.0-rc5-00127-ga06157804399 #1 PREEMPT  8cc09ef94dcec767faa911515ce9e609c45db470
+ Call Trace:
+  <IRQ>
+  __dump_stack (lib/dump_stack.c:95)
+  dump_stack_lvl (lib/dump_stack.c:123)
+  dump_stack (lib/dump_stack.c:130)
+  spin_dump (kernel/locking/spinlock_debug.c:71)
+  do_raw_spin_trylock (kernel/locking/spinlock_debug.c:?)
+  _raw_spin_trylock (include/linux/spinlock_api_smp.h:89 kernel/locking/spinlock.c:138)
+  __free_frozen_pages (mm/page_alloc.c:2973)
+  ___free_pages (mm/page_alloc.c:5295)
+  __free_pages (mm/page_alloc.c:5334)
+  tlb_remove_table_rcu (include/linux/mm.h:? include/linux/mm.h:3122 include/asm-generic/tlb.h:220 mm/mmu_gather.c:227 mm/mmu_gather.c:290)
+  ? __cfi_tlb_remove_table_rcu (mm/mmu_gather.c:289)
+  ? rcu_core (kernel/rcu/tree.c:?)
+  rcu_core (include/linux/rcupdate.h:341 kernel/rcu/tree.c:2607 kernel/rcu/tree.c:2861)
+  rcu_core_si (kernel/rcu/tree.c:2879)
+  handle_softirqs (arch/x86/include/asm/jump_label.h:36 include/trace/events/irq.h:142 kernel/softirq.c:623)
+  __irq_exit_rcu (arch/x86/include/asm/jump_label.h:36 kernel/softirq.c:725)
+  irq_exit_rcu (kernel/softirq.c:741)
+  sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1052)
+  </IRQ>
+  <TASK>
+ RIP: 0010:_raw_spin_unlock_irqrestore (arch/x86/include/asm/preempt.h:95 include/linux/spinlock_api_smp.h:152 kernel/locking/spinlock.c:194)
+  free_pcppages_bulk (mm/page_alloc.c:1494)
+  drain_pages_zone (include/linux/spinlock.h:391 mm/page_alloc.c:2632)
+  __drain_all_pages (mm/page_alloc.c:2731)
+  drain_all_pages (mm/page_alloc.c:2747)
+  kcompactd (mm/compaction.c:3115)
+  kthread (kernel/kthread.c:465)
+  ? __cfi_kcompactd (mm/compaction.c:3166)
+  ? __cfi_kthread (kernel/kthread.c:412)
+  ret_from_fork (arch/x86/kernel/process.c:164)
+  ? __cfi_kthread (kernel/kthread.c:412)
+  ret_from_fork_asm (arch/x86/entry/entry_64.S:255)
+  </TASK>
+
+Matthew has analyzed the report and identified that in drain_pages_zone()
+we are in a section protected by spin_lock(&pcp->lock) and then get an
+interrupt that attempts spin_trylock() on the same lock.  The code is
+designed to work this way without disabling IRQs and occasionally fail the
+trylock with a fallback.  However, the SMP=n spinlock implementation
+assumes spin_trylock() will always succeed, and thus it's normally a
+no-op.  Here the enabled lock debugging catches the problem, but otherwise
+it could cause a corruption of the pcp structure.
+
+The problem has been introduced by commit 574907741599 ("mm/page_alloc:
+leave IRQs enabled for per-cpu page allocations").  The pcp locking scheme
+recognizes the need for disabling IRQs to prevent nesting spin_trylock()
+sections on SMP=n, but the need to prevent the nesting in spin_lock() has
+not been recognized.  Fix it by introducing local wrappers that change the
+spin_lock() to spin_lock_iqsave() with SMP=n and use them in all places
+that do spin_lock(&pcp->lock).
+
+[vbabka@suse.cz: add pcp_ prefix to the spin_lock_irqsave wrappers, per Steven]
+Link: https://lkml.kernel.org/r/20260105-fix-pcp-up-v1-1-5579662d2071@suse.cz
+Fixes: 574907741599 ("mm/page_alloc: leave IRQs enabled for per-cpu page allocations")
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Reported-by: kernel test robot <oliver.sang@intel.com>
+Closes: https://lore.kernel.org/oe-lkp/202512101320.e2f2dd6f-lkp@intel.com
+Analyzed-by: Matthew Wilcox <willy@infradead.org>
+Link: https://lore.kernel.org/all/aUW05pyc9nZkvY-1@casper.infradead.org/
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Cc: Brendan Jackman <jackmanb@google.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Zi Yan <ziy@nvidia.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/page_alloc.c |   47 +++++++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 39 insertions(+), 8 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -166,6 +166,33 @@ static DEFINE_MUTEX(pcp_batch_high_lock)
+ #define pcp_spin_unlock(ptr)                                          \
+       pcpu_spin_unlock(lock, ptr)
++/*
++ * With the UP spinlock implementation, when we spin_lock(&pcp->lock) (for i.e.
++ * a potentially remote cpu drain) and get interrupted by an operation that
++ * attempts pcp_spin_trylock(), we can't rely on the trylock failure due to UP
++ * spinlock assumptions making the trylock a no-op. So we have to turn that
++ * spin_lock() to a spin_lock_irqsave(). This works because on UP there are no
++ * remote cpu's so we can only be locking the only existing local one.
++ */
++#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
++static inline void __flags_noop(unsigned long *flags) { }
++#define pcp_spin_lock_maybe_irqsave(ptr, flags)               \
++({                                                    \
++       __flags_noop(&(flags));                        \
++       spin_lock(&(ptr)->lock);                       \
++})
++#define pcp_spin_unlock_maybe_irqrestore(ptr, flags)  \
++({                                                    \
++       spin_unlock(&(ptr)->lock);                     \
++       __flags_noop(&(flags));                        \
++})
++#else
++#define pcp_spin_lock_maybe_irqsave(ptr, flags)               \
++              spin_lock_irqsave(&(ptr)->lock, flags)
++#define pcp_spin_unlock_maybe_irqrestore(ptr, flags)  \
++              spin_unlock_irqrestore(&(ptr)->lock, flags)
++#endif
++
+ #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+ DEFINE_PER_CPU(int, numa_node);
+ EXPORT_PER_CPU_SYMBOL(numa_node);
+@@ -2555,6 +2582,7 @@ static int rmqueue_bulk(struct zone *zon
+ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+ {
+       int high_min, to_drain, to_drain_batched, batch;
++      unsigned long UP_flags;
+       bool todo = false;
+       high_min = READ_ONCE(pcp->high_min);
+@@ -2574,9 +2602,9 @@ bool decay_pcp_high(struct zone *zone, s
+       to_drain = pcp->count - pcp->high;
+       while (to_drain > 0) {
+               to_drain_batched = min(to_drain, batch);
+-              spin_lock(&pcp->lock);
++              pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+               free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
+-              spin_unlock(&pcp->lock);
++              pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+               todo = true;
+               to_drain -= to_drain_batched;
+@@ -2593,14 +2621,15 @@ bool decay_pcp_high(struct zone *zone, s
+  */
+ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
+ {
++      unsigned long UP_flags;
+       int to_drain, batch;
+       batch = READ_ONCE(pcp->batch);
+       to_drain = min(pcp->count, batch);
+       if (to_drain > 0) {
+-              spin_lock(&pcp->lock);
++              pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+               free_pcppages_bulk(zone, to_drain, pcp, 0);
+-              spin_unlock(&pcp->lock);
++              pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+       }
+ }
+ #endif
+@@ -2611,10 +2640,11 @@ void drain_zone_pages(struct zone *zone,
+ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
+ {
+       struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
++      unsigned long UP_flags;
+       int count;
+       do {
+-              spin_lock(&pcp->lock);
++              pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+               count = pcp->count;
+               if (count) {
+                       int to_drain = min(count,
+@@ -2623,7 +2653,7 @@ static void drain_pages_zone(unsigned in
+                       free_pcppages_bulk(zone, to_drain, pcp, 0);
+                       count -= to_drain;
+               }
+-              spin_unlock(&pcp->lock);
++              pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+       } while (count);
+ }
+@@ -6081,6 +6111,7 @@ static void zone_pcp_update_cacheinfo(st
+ {
+       struct per_cpu_pages *pcp;
+       struct cpu_cacheinfo *cci;
++      unsigned long UP_flags;
+       pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+       cci = get_cpu_cacheinfo(cpu);
+@@ -6091,12 +6122,12 @@ static void zone_pcp_update_cacheinfo(st
+        * This can reduce zone lock contention without hurting
+        * cache-hot pages sharing.
+        */
+-      spin_lock(&pcp->lock);
++      pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+       if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
+               pcp->flags |= PCPF_FREE_HIGH_BATCH;
+       else
+               pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
+-      spin_unlock(&pcp->lock);
++      pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ }
+ void setup_pcp_cacheinfo(unsigned int cpu)
diff --git a/queue-6.18/mm-page_alloc-vmstat-simplify-refresh_cpu_vm_stats-change-detection.patch b/queue-6.18/mm-page_alloc-vmstat-simplify-refresh_cpu_vm_stats-change-detection.patch
new file mode 100644 (file)
index 0000000..c0b7888
--- /dev/null
@@ -0,0 +1,276 @@
+From 0acc67c4030c39f39ac90413cc5d0abddd3a9527 Mon Sep 17 00:00:00 2001
+From: Joshua Hahn <joshua.hahnjy@gmail.com>
+Date: Tue, 14 Oct 2025 07:50:08 -0700
+Subject: mm/page_alloc/vmstat: simplify refresh_cpu_vm_stats change detection
+
+From: Joshua Hahn <joshua.hahnjy@gmail.com>
+
+commit 0acc67c4030c39f39ac90413cc5d0abddd3a9527 upstream.
+
+Patch series "mm/page_alloc: Batch callers of free_pcppages_bulk", v5.
+
+Motivation & Approach
+=====================
+
+While testing workloads with high sustained memory pressure on large
+machines in the Meta fleet (1TB memory, 316 CPUs), we saw an unexpectedly
+high number of softlockups.  Further investigation showed that the zone
+lock in free_pcppages_bulk was being held for a long time, and that
+free_pcppages_bulk was called to free 2k+ pages over 100 times just
+during boot.
+
+This starves other processes waiting for the zone lock, which can lead
+to the system stalling as multiple threads cannot make progress without
+the locks.  We can see these issues manifesting as warnings:
+
+[ 4512.591979] rcu: INFO: rcu_sched self-detected stall on CPU
+[ 4512.604370] rcu:     20-....: (9312 ticks this GP) idle=a654/1/0x4000000000000000 softirq=309340/309344 fqs=5426
+[ 4512.626401] rcu:              hardirqs   softirqs   csw/system
+[ 4512.638793] rcu:      number:        0        145            0
+[ 4512.651177] rcu:     cputime:       30      10410          174   ==> 10558(ms)
+[ 4512.666657] rcu:     (t=21077 jiffies g=783665 q=1242213 ncpus=316)
+
+While these warnings don't indicate a crash or a kernel panic, they do
+point to the underlying issue of lock contention.  To prevent starvation
+in both locks, batch the freeing of pages using pcp->batch.
+
+Because free_pcppages_bulk is called with the pcp lock and acquires the
+zone lock, relinquishing and reacquiring the locks are only effective when
+both of them are broken together (unless the system was built with queued
+spinlocks).  Thus, instead of modifying free_pcppages_bulk to break both
+locks, batch the freeing from its callers instead.
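+
+The caller-side loop then has roughly the following shape.  The sketch
+below is standalone and illustrative only: a pthread mutex and a dummy
+free_batch() stand in for the pcp/zone spinlocks and free_pcppages_bulk(),
+and drain_in_batches() is a made-up name; the point is just the per-batch
+lock/unlock:
+
+  #include <pthread.h>
+
+  static pthread_mutex_t items_lock = PTHREAD_MUTEX_INITIALIZER;
+  static long nr_items;                      /* analogous to pcp->count */
+
+  static void free_batch(long n)             /* stand-in for free_pcppages_bulk() */
+  {
+          nr_items -= n;
+  }
+
+  void drain_in_batches(long to_drain, long batch)
+  {
+          while (to_drain > 0) {
+                  long chunk = to_drain < batch ? to_drain : batch;
+
+                  pthread_mutex_lock(&items_lock);   /* hold the lock per batch only */
+                  free_batch(chunk);
+                  pthread_mutex_unlock(&items_lock); /* other lockers can make progress here */
+
+                  to_drain -= chunk;
+          }
+  }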
+
+A similar fix has been implemented in the Meta fleet, and we have seen
+significantly fewer softlockups.
+
+Testing
+=======
+The following are a few synthetic benchmarks, made on three machines. The
+first is a large machine with 754GiB memory and 316 processors.
+The second is a relatively smaller machine with 251GiB memory and 176
+processors. The third and final is the smallest of the three, which has 62GiB
+memory and 36 processors.
+
+On all machines, I kick off a kernel build with -j$(nproc).
+Negative delta is better (faster compilation).
+
+Large machine (754GiB memory, 316 processors)
+make -j$(nproc)
++------------+---------------+-----------+
+| Metric (s) | Variation (%) | Delta(%)  |
++------------+---------------+-----------+
+| real       |        0.8070 |  - 1.4865 |
+| user       |        0.2823 |  + 0.4081 |
+| sys        |        5.0267 |  -11.8737 |
++------------+---------------+-----------+
+
+Medium machine (251GiB memory, 176 processors)
+make -j$(nproc)
++------------+---------------+----------+
+| Metric (s) | Variation (%) | Delta(%) |
++------------+---------------+----------+
+| real       |        0.2806 |  +0.0351 |
+| user       |        0.0994 |  +0.3170 |
+| sys        |        0.6229 |  -0.6277 |
++------------+---------------+----------+
+
+Small machine (62GiB memory, 36 processors)
+make -j$(nproc)
++------------+---------------+----------+
+| Metric (s) | Variation (%) | Delta(%) |
++------------+---------------+----------+
+| real       |        0.1503 |  -2.6585 |
+| user       |        0.0431 |  -2.2984 |
+| sys        |        0.1870 |  -3.2013 |
++------------+---------------+----------+
+
+Here, variation is the coefficient of variation, i.e.  standard deviation
+/ mean.
+
+Based on these results, it seems like there are varying degrees to how
+much lock contention this reduces.  For the largest and smallest machines
+that I ran the tests on, there is quite a significant reduction.  There
+are also some performance increases visible from userspace.
+
+Interestingly, the performance gains don't scale with the size of the
+machine, but rather there seems to be a dip in the gain for the
+medium-sized machine.  One possible theory is that because the high
+watermark depends on both memory and the number of local CPUs, what
+impacts zone contention the most is not these individual values, but
+rather the ratio of mem:processors.
+
+
+This patch (of 5):
+
+Currently, refresh_cpu_vm_stats returns an int, indicating how many
+changes were made during its updates.  Using this information, callers
+like vmstat_update can heuristically determine if more work will be done
+in the future.
+
+However, all of refresh_cpu_vm_stats's callers either (a) ignore the
+result, only caring about performing the updates, or (b) only care about
+whether changes were made, but not *how many* changes were made.
+
+Simplify the code by returning a bool instead to indicate if updates
+were made.
+
+In addition, simplify fold_diff and decay_pcp_high to return a bool
+for the same reason.
+
+Link: https://lkml.kernel.org/r/20251014145011.3427205-1-joshua.hahnjy@gmail.com
+Link: https://lkml.kernel.org/r/20251014145011.3427205-2-joshua.hahnjy@gmail.com
+Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
+Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
+Reviewed-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Jackman <jackmanb@google.com>
+Cc: Chris Mason <clm@fb.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Zi Yan <ziy@nvidia.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 038a102535eb ("mm/page_alloc: prevent pcp corruption with SMP=n")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/gfp.h |    2 +-
+ mm/page_alloc.c     |    8 ++++----
+ mm/vmstat.c         |   28 +++++++++++++++-------------
+ 3 files changed, 20 insertions(+), 18 deletions(-)
+
+--- a/include/linux/gfp.h
++++ b/include/linux/gfp.h
+@@ -387,7 +387,7 @@ extern void free_pages(unsigned long add
+ #define free_page(addr) free_pages((addr), 0)
+ void page_alloc_init_cpuhp(void);
+-int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
++bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
+ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
+ void drain_all_pages(struct zone *zone);
+ void drain_local_pages(struct zone *zone);
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -2552,10 +2552,10 @@ static int rmqueue_bulk(struct zone *zon
+  * Called from the vmstat counter updater to decay the PCP high.
+  * Return whether there are addition works to do.
+  */
+-int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
++bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+ {
+       int high_min, to_drain, batch;
+-      int todo = 0;
++      bool todo = false;
+       high_min = READ_ONCE(pcp->high_min);
+       batch = READ_ONCE(pcp->batch);
+@@ -2568,7 +2568,7 @@ int decay_pcp_high(struct zone *zone, st
+               pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
+                                pcp->high - (pcp->high >> 3), high_min);
+               if (pcp->high > high_min)
+-                      todo++;
++                      todo = true;
+       }
+       to_drain = pcp->count - pcp->high;
+@@ -2576,7 +2576,7 @@ int decay_pcp_high(struct zone *zone, st
+               spin_lock(&pcp->lock);
+               free_pcppages_bulk(zone, to_drain, pcp, 0);
+               spin_unlock(&pcp->lock);
+-              todo++;
++              todo = true;
+       }
+       return todo;
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -771,25 +771,25 @@ EXPORT_SYMBOL(dec_node_page_state);
+ /*
+  * Fold a differential into the global counters.
+- * Returns the number of counters updated.
++ * Returns whether counters were updated.
+  */
+ static int fold_diff(int *zone_diff, int *node_diff)
+ {
+       int i;
+-      int changes = 0;
++      bool changed = false;
+       for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+               if (zone_diff[i]) {
+                       atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
+-                      changes++;
++                      changed = true;
+       }
+       for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+               if (node_diff[i]) {
+                       atomic_long_add(node_diff[i], &vm_node_stat[i]);
+-                      changes++;
++                      changed = true;
+       }
+-      return changes;
++      return changed;
+ }
+ /*
+@@ -806,16 +806,16 @@ static int fold_diff(int *zone_diff, int
+  * with the global counters. These could cause remote node cache line
+  * bouncing and will have to be only done when necessary.
+  *
+- * The function returns the number of global counters updated.
++ * The function returns whether global counters were updated.
+  */
+-static int refresh_cpu_vm_stats(bool do_pagesets)
++static bool refresh_cpu_vm_stats(bool do_pagesets)
+ {
+       struct pglist_data *pgdat;
+       struct zone *zone;
+       int i;
+       int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+       int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
+-      int changes = 0;
++      bool changed = false;
+       for_each_populated_zone(zone) {
+               struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+@@ -839,7 +839,8 @@ static int refresh_cpu_vm_stats(bool do_
+               if (do_pagesets) {
+                       cond_resched();
+-                      changes += decay_pcp_high(zone, this_cpu_ptr(pcp));
++                      if (decay_pcp_high(zone, this_cpu_ptr(pcp)))
++                              changed = true;
+ #ifdef CONFIG_NUMA
+                       /*
+                        * Deal with draining the remote pageset of this
+@@ -861,13 +862,13 @@ static int refresh_cpu_vm_stats(bool do_
+                       }
+                       if (__this_cpu_dec_return(pcp->expire)) {
+-                              changes++;
++                              changed = true;
+                               continue;
+                       }
+                       if (__this_cpu_read(pcp->count)) {
+                               drain_zone_pages(zone, this_cpu_ptr(pcp));
+-                              changes++;
++                              changed = true;
+                       }
+ #endif
+               }
+@@ -887,8 +888,9 @@ static int refresh_cpu_vm_stats(bool do_
+               }
+       }
+-      changes += fold_diff(global_zone_diff, global_node_diff);
+-      return changes;
++      if (fold_diff(global_zone_diff, global_node_diff))
++              changed = true;
++      return changed;
+ }
+ /*
diff --git a/queue-6.18/series b/queue-6.18/series
index 2de29eaa05671a987ffde3cba0e4b17f8b21ca3e..cb86c1f6d84b0a3281fda9ae20907100abf8f384 100644 (file)
@@ -181,3 +181,16 @@ dmaengine-stm32-dmamux-fix-of-node-leak-on-route-allocation-failure.patch
 dmaengine-ti-dma-crossbar-fix-device-leak-on-dra7x-route-allocation.patch
 dmaengine-ti-dma-crossbar-fix-device-leak-on-am335x-route-allocation.patch
 dmaengine-ti-k3-udma-fix-device-leak-on-udma-lookup.patch
+mm-add-a-ptdesc-flag-to-mark-kernel-page-tables.patch
+mm-actually-mark-kernel-page-table-pages.patch
+x86-mm-use-ptdesc-when-freeing-pmd-pages.patch
+mm-introduce-pure-page-table-freeing-function.patch
+x86-mm-use-pagetable_free.patch
+mm-introduce-deferred-freeing-for-kernel-page-tables.patch
+iommu-sva-invalidate-stale-iotlb-entries-for-kernel-address-space.patch
+hid-intel-ish-hid-use-dedicated-unbound-workqueues-to-prevent-resume-blocking.patch
+hid-intel-ish-hid-fix-wcast-function-type-strict-in-devm_ishtp_alloc_workqueue.patch
+btrfs-fix-deadlock-in-wait_current_trans-due-to-ignored-transaction-type.patch
+mm-page_alloc-vmstat-simplify-refresh_cpu_vm_stats-change-detection.patch
+mm-page_alloc-batch-page-freeing-in-decay_pcp_high.patch
+mm-page_alloc-prevent-pcp-corruption-with-smp-n.patch
diff --git a/queue-6.18/x86-mm-use-pagetable_free.patch b/queue-6.18/x86-mm-use-pagetable_free.patch
new file mode 100644 (file)
index 0000000..6d7149a
--- /dev/null
@@ -0,0 +1,71 @@
+From bf9e4e30f3538391745a99bc2268ec4f5e4a401e Mon Sep 17 00:00:00 2001
+From: Lu Baolu <baolu.lu@linux.intel.com>
+Date: Wed, 22 Oct 2025 16:26:32 +0800
+Subject: x86/mm: use pagetable_free()
+
+From: Lu Baolu <baolu.lu@linux.intel.com>
+
+commit bf9e4e30f3538391745a99bc2268ec4f5e4a401e upstream.
+
+The kernel's memory management subsystem provides a dedicated interface,
+pagetable_free(), for freeing page table pages.  Updates two call sites to
+use pagetable_free() instead of the lower-level __free_page() or
+free_pages().  This improves code consistency and clarity, and ensures the
+correct freeing mechanism is used.
+
+Link: https://lkml.kernel.org/r/20251022082635.2462433-7-baolu.lu@linux.intel.com
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: Kevin Tian <kevin.tian@intel.com>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Robin Murphy <robin.murphy@arm.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vasant Hegde <vasant.hegde@amd.com>
+Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yi Lai <yi1.lai@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/mm/init_64.c        |    2 +-
+ arch/x86/mm/pat/set_memory.c |    2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/mm/init_64.c
++++ b/arch/x86/mm/init_64.c
+@@ -1031,7 +1031,7 @@ static void __meminit free_pagetable(str
+               free_reserved_pages(page, nr_pages);
+ #endif
+       } else {
+-              __free_pages(page, order);
++              pagetable_free(page_ptdesc(page));
+       }
+ }
+--- a/arch/x86/mm/pat/set_memory.c
++++ b/arch/x86/mm/pat/set_memory.c
+@@ -429,7 +429,7 @@ static void cpa_collapse_large_pages(str
+       list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) {
+               list_del(&ptdesc->pt_list);
+-              __free_page(ptdesc_page(ptdesc));
++              pagetable_free(ptdesc);
+       }
+ }
diff --git a/queue-6.18/x86-mm-use-ptdesc-when-freeing-pmd-pages.patch b/queue-6.18/x86-mm-use-ptdesc-when-freeing-pmd-pages.patch
new file mode 100644 (file)
index 0000000..43234f2
--- /dev/null
@@ -0,0 +1,96 @@
+From 412d000346ea38ac4b9bb715a86c73ef89d90dea Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Wed, 22 Oct 2025 16:26:30 +0800
+Subject: x86/mm: use 'ptdesc' when freeing PMD pages
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 412d000346ea38ac4b9bb715a86c73ef89d90dea upstream.
+
+There are a billion ways to refer to a physical memory address.  One of
+the x86 PMD freeing code locations chooses to use a 'pte_t *' to point to a
+PMD page and then call a PTE-specific freeing function for it.  That's a
+bit wonky.
+
+Just use a 'struct ptdesc *' instead.  Its entire purpose is to refer to
+page table pages.  It also means being able to remove an explicit cast.
+
+Right now, pte_free_kernel() is a one-liner that calls
+pagetable_dtor_free().  Effectively, all this patch does is remove one
+superfluous __pa(__va(paddr)) conversion and then call
+pagetable_dtor_free() directly instead of through a helper.
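+
+For reference, that helper is roughly the following (as described above;
+this is a sketch of the generic definition, not part of this patch):
+
+  static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+  {
+          pagetable_dtor_free(virt_to_ptdesc(pte));
+  }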
+
+Link: https://lkml.kernel.org/r/20251022082635.2462433-5-baolu.lu@linux.intel.com
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Robin Murphy <robin.murphy@arm.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vasant Hegde <vasant.hegde@amd.com>
+Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yi Lai <yi1.lai@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/mm/pgtable.c |   12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -729,7 +729,7 @@ int pmd_clear_huge(pmd_t *pmd)
+ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+ {
+       pmd_t *pmd, *pmd_sv;
+-      pte_t *pte;
++      struct ptdesc *pt;
+       int i;
+       pmd = pud_pgtable(*pud);
+@@ -750,8 +750,8 @@ int pud_free_pmd_page(pud_t *pud, unsign
+       for (i = 0; i < PTRS_PER_PMD; i++) {
+               if (!pmd_none(pmd_sv[i])) {
+-                      pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+-                      pte_free_kernel(&init_mm, pte);
++                      pt = page_ptdesc(pmd_page(pmd_sv[i]));
++                      pagetable_dtor_free(pt);
+               }
+       }
+@@ -772,15 +772,15 @@ int pud_free_pmd_page(pud_t *pud, unsign
+  */
+ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+ {
+-      pte_t *pte;
++      struct ptdesc *pt;
+-      pte = (pte_t *)pmd_page_vaddr(*pmd);
++      pt = page_ptdesc(pmd_page(*pmd));
+       pmd_clear(pmd);
+       /* INVLPG to clear all paging-structure caches */
+       flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+-      pte_free_kernel(&init_mm, pte);
++      pagetable_dtor_free(pt);
+       return 1;
+ }