git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
mm: rmap: support batched checks of the references for large folios
author Baolin Wang <baolin.wang@linux.alibaba.com>
Mon, 9 Feb 2026 14:07:24 +0000 (22:07 +0800)
committer Andrew Morton <akpm@linux-foundation.org>
Thu, 12 Feb 2026 23:43:00 +0000 (15:43 -0800)
Patch series "support batch checking of references and unmapping for large
folios", v6.

Currently, folio_referenced_one() always checks the young flag for each
PTE sequentially, which is inefficient for large folios.  This
inefficiency is especially noticeable when reclaiming clean file-backed
large folios, where folio_referenced() is observed as a significant
performance hotspot.

Moreover, on Arm architecture, which supports contiguous PTEs, there is
already an optimization to clear the young flags for PTEs within a
contiguous range.  However, this is not sufficient.  We can extend this to
perform batched operations for the entire large folio (which might exceed
the contiguous range: CONT_PTE_SIZE).

Similar to folio_referenced_one(), we can also apply batched unmapping for
large file folios to optimize the performance of file folio reclamation.
By supporting batched checking of the young flags, flushing TLB entries,
and unmapping, I observed significant performance improvements in my
performance tests for file folio reclamation.  Please check the
performance data in the commit message of each patch.

This patch (of 5):

Currently, folio_referenced_one() always checks the young flag for each
PTE sequentially, which is inefficient for large folios.  This
inefficiency is especially noticeable when reclaiming clean file-backed
large folios, where folio_referenced() is observed as a significant
performance hotspot.

Moreover, on Arm64 architecture, which supports contiguous PTEs, there is
already an optimization to clear the young flags for PTEs within a
contiguous range.  However, this is not sufficient.  We can extend this to
perform batched operations for the entire large folio (which might exceed
the contiguous range: CONT_PTE_SIZE).

Introduce a new API: clear_flush_young_ptes() to facilitate batched
checking of the young flags and flushing TLB entries, thereby improving
performance during large folio reclamation.  It will be overridden in the
following patches by architectures that implement a more efficient batch
operation.

While we are at it, rename ptep_clear_flush_young_notify() to
clear_flush_young_ptes_notify() to indicate that this is a batch
operation.

Link: https://lkml.kernel.org/r/cover.1770645603.git.baolin.wang@linux.alibaba.com
Link: https://lkml.kernel.org/r/12132694536834262062d1fb304f8f8a064b6750.1770645603.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Barry Song <baohua@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/mmu_notifier.h
include/linux/pgtable.h
mm/rmap.c

index d1094c2d5fb61e5e39dcabb48072964f1f64aee3..07a2bbaf86e905133bd80c9bd211be6e16bfa30f 100644 (file)
@@ -515,16 +515,17 @@ static inline void mmu_notifier_range_init_owner(
        range->owner = owner;
 }
 
-#define ptep_clear_flush_young_notify(__vma, __address, __ptep)                \
+#define clear_flush_young_ptes_notify(__vma, __address, __ptep, __nr)  \
 ({                                                                     \
        int __young;                                                    \
        struct vm_area_struct *___vma = __vma;                          \
        unsigned long ___address = __address;                           \
-       __young = ptep_clear_flush_young(___vma, ___address, __ptep);   \
+       unsigned int ___nr = __nr;                                      \
+       __young = clear_flush_young_ptes(___vma, ___address, __ptep, ___nr);    \
        __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
                                                  ___address,           \
                                                  ___address +          \
-                                                       PAGE_SIZE);     \
+                                                 ___nr * PAGE_SIZE);   \
        __young;                                                        \
 })
 
@@ -650,7 +651,7 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
 
 #define mmu_notifier_range_update_to_read_only(r) false
 
-#define ptep_clear_flush_young_notify ptep_clear_flush_young
+#define clear_flush_young_ptes_notify clear_flush_young_ptes
 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
 #define ptep_clear_young_notify ptep_test_and_clear_young
 #define pmdp_clear_young_notify pmdp_test_and_clear_young
index 21b67d9375558e205842f403a28b2b5ad33efbbf..a50df42a893fb53b32b20cadbbea745702fb2aeb 100644 (file)
@@ -1068,6 +1068,41 @@ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
 }
 #endif
 
+#ifndef clear_flush_young_ptes
+/**
+ * clear_flush_young_ptes - Mark PTEs that map consecutive pages of the same
+ *                         folio as old and flush the TLB.
+ * @vma: The virtual memory area the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries whose access bit is cleared; must be at least 1.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_clear_flush_young() that ORs the per-entry results together.
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock.  The PTEs map consecutive
+ * pages that belong to the same folio.  The PTEs are all in the same PMD.
+ */
+static inline int clear_flush_young_ptes(struct vm_area_struct *vma,
+               unsigned long addr, pte_t *ptep, unsigned int nr)
+{
+       int young = 0;
+
+       for (;;) {
+               young |= ptep_clear_flush_young(vma, addr, ptep);
+               if (--nr == 0)
+                       break;
+               ptep++;
+               addr += PAGE_SIZE;
+       }
+
+       return young;
+}
+#endif
+
 /*
  * On some architectures hardware does not set page access bit when accessing
  * memory page, it is responsibility of software setting this bit. It brings
index ab099405151ff91c6281665f083e51ebcc731d0e..3dbc2c4e02dc88c794c0ac6f95276daf0c481a8a 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -913,9 +913,11 @@ static bool folio_referenced_one(struct folio *folio,
        struct folio_referenced_arg *pra = arg;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        int ptes = 0, referenced = 0;
+       unsigned int nr;
 
        while (page_vma_mapped_walk(&pvmw)) {
                address = pvmw.address;
+               nr = 1;
 
                if (vma->vm_flags & VM_LOCKED) {
                        ptes++;
@@ -960,9 +962,21 @@ static bool folio_referenced_one(struct folio *folio,
                        if (lru_gen_look_around(&pvmw))
                                referenced++;
                } else if (pvmw.pte) {
-                       if (ptep_clear_flush_young_notify(vma, address,
-                                               pvmw.pte))
+                       if (folio_test_large(folio)) {
+                               unsigned long end_addr = pmd_addr_end(address, vma->vm_end);
+                               unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT;
+                               pte_t pteval = ptep_get(pvmw.pte);
+
+                               nr = folio_pte_batch(folio, pvmw.pte,
+                                                    pteval, max_nr);
+                       }
+
+                       ptes += nr;
+                       if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr))
                                referenced++;
+                       /* Skip the batched PTEs */
+                       pvmw.pte += nr - 1;
+                       pvmw.address += (nr - 1) * PAGE_SIZE;
                } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
                        if (pmdp_clear_flush_young_notify(vma, address,
                                                pvmw.pmd))
@@ -972,7 +986,15 @@ static bool folio_referenced_one(struct folio *folio,
                        WARN_ON_ONCE(1);
                }
 
-               pra->mapcount--;
+               pra->mapcount -= nr;
+               /*
+                * If we are sure that we batched the entire folio,
+                * we can just optimize and stop right here.
+                */
+               if (ptes == pvmw.nr_pages) {
+                       page_vma_mapped_walk_done(&pvmw);
+                       break;
+               }
        }
 
        if (referenced)