git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
mm: introduce deferred freeing for kernel page tables
author Dave Hansen <dave.hansen@linux.intel.com>
Wed, 22 Oct 2025 08:26:33 +0000 (16:26 +0800)
committer Andrew Morton <akpm@linux-foundation.org>
Mon, 17 Nov 2025 01:28:18 +0000 (17:28 -0800)
This introduces a conditional asynchronous mechanism, enabled by
CONFIG_ASYNC_KERNEL_PGTABLE_FREE.  When enabled, this mechanism defers the
freeing of pages that are used as page tables for kernel address mappings.
These pages are now queued to a work struct instead of being freed
immediately.

This deferred freeing allows for batch-freeing of page tables, providing a
safe context for performing a single expensive operation (TLB flush) for a
batch of kernel page tables instead of performing that expensive operation
for each page table.

Link: https://lkml.kernel.org/r/20251022082635.2462433-8-baolu.lu@linux.intel.com
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/mm.h
mm/Kconfig
mm/pgtable-generic.c

index 88c0a0fae43af058f63252a9be1b9b35f8d67c03..a6fd9f5aaf30cc4f100dc019efec73f5dffcce13 100644 (file)
@@ -3053,6 +3053,14 @@ static inline void __pagetable_free(struct ptdesc *pt)
        __free_pages(page, compound_order(page));
 }
 
+#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
+void pagetable_free_kernel(struct ptdesc *pt);
+#else
+static inline void pagetable_free_kernel(struct ptdesc *pt)
+{
+       __pagetable_free(pt);
+}
+#endif
 /**
  * pagetable_free - Free pagetables
  * @pt:        The page table descriptor
@@ -3062,10 +3070,12 @@ static inline void __pagetable_free(struct ptdesc *pt)
  */
 static inline void pagetable_free(struct ptdesc *pt)
 {
-       if (ptdesc_test_kernel(pt))
+       if (ptdesc_test_kernel(pt)) {
                ptdesc_clear_kernel(pt);
-
-       __pagetable_free(pt);
+               pagetable_free_kernel(pt);
+       } else {
+               __pagetable_free(pt);
+       }
 }
 
 #if defined(CONFIG_SPLIT_PTE_PTLOCKS)
index 4971436c869747fc655f7999d8aaafeeb4d1e1ae..682a5c39a1a6ff711283dab85976a5f767a07932 100644 (file)
@@ -906,6 +906,9 @@ config HAVE_GIGANTIC_FOLIOS
        def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \
                 (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
 
+config ASYNC_KERNEL_PGTABLE_FREE
+       def_bool n
+
 # TODO: Allow to be enabled without THP
 config ARCH_SUPPORTS_HUGE_PFNMAP
        def_bool n
index 567e2d084071e3de09b45a96fc83695a1c9f1b43..1c7caa8ef164c4264a595be315c8ff11eacfcb56 100644 (file)
@@ -406,3 +406,40 @@ again:
        pte_unmap_unlock(pte, ptl);
        goto again;
 }
+
+#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
+static void kernel_pgtable_work_func(struct work_struct *work);
+
+static struct {
+       struct list_head list;
+       /* protect above ptdesc lists */
+       spinlock_t lock;
+       struct work_struct work;
+} kernel_pgtable_work = {
+       .list = LIST_HEAD_INIT(kernel_pgtable_work.list),
+       .lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock),
+       .work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func),
+};
+
+static void kernel_pgtable_work_func(struct work_struct *work)
+{
+       struct ptdesc *pt, *next;
+       LIST_HEAD(page_list);
+
+       spin_lock(&kernel_pgtable_work.lock);
+       list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
+       spin_unlock(&kernel_pgtable_work.lock);
+
+       list_for_each_entry_safe(pt, next, &page_list, pt_list)
+               __pagetable_free(pt);
+}
+
+void pagetable_free_kernel(struct ptdesc *pt)
+{
+       spin_lock(&kernel_pgtable_work.lock);
+       list_add(&pt->pt_list, &kernel_pgtable_work.list);
+       spin_unlock(&kernel_pgtable_work.lock);
+
+       schedule_work(&kernel_pgtable_work.work);
+}
+#endif