git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
mm: support batched checking of the young flag for MGLRU
author: Baolin Wang <baolin.wang@linux.alibaba.com>
Fri, 6 Mar 2026 06:43:41 +0000 (14:43 +0800)
committer: Andrew Morton <akpm@linux-foundation.org>
Sun, 5 Apr 2026 20:53:16 +0000 (13:53 -0700)
Use the batched helper test_and_clear_young_ptes_notify() to check and
clear the young flag, improving performance during large folio
reclamation when MGLRU is enabled.

Meanwhile, we can also support batched checking of the young and dirty
flags when MGLRU walks the mm's page table to update the folios'
generation counter.  Since MGLRU also checks the PTE dirty bit, use
folio_pte_batch_flags() with FPB_MERGE_YOUNG_DIRTY set to detect batches
of PTEs for a large folio.

Then we can remove ptep_test_and_clear_young_notify(), since it now has
no users.

Note that we also update the 'young' counter and 'mm_stats[MM_LEAF_YOUNG]'
counter with the batched count in the lru_gen_look_around() and
walk_pte_range().  However, the batched operations may inflate these two
counters, because in a large folio not all PTEs may have been accessed.
(Additionally, tracking how many PTEs have been accessed within a large
folio is not very meaningful, since the mm core actually tracks
access/dirty on a per-folio basis, not per page).  The impact analysis is
as follows:

1. The 'mm_stats[MM_LEAF_YOUNG]' counter has no functional impact and
   is mainly for debugging.

2. The 'young' counter is used to decide whether to place the current
   PMD entry into the bloom filters by suitable_to_scan() (so that next
   time we can check whether it has been accessed again), which may set
   the hash bit in the bloom filters for a PMD entry that hasn't seen much
   access.  However, bloom filters inherently allow some error, so this
   effect appears negligible.

Link: https://lkml.kernel.org/r/378f4acf7d07410aa7c2e4b49d56bb165918eb34.1772778858.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Rik van Riel <riel@surriel.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yuanchu Xie <yuanchu@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/mmzone.h
mm/internal.h
mm/rmap.c
mm/vmscan.c

index 5c3ae03487548bfb5e8ad1c56d036292600a8ab6..3f651baf7e2b562483cb4766d5dcd8b476fef20e 100644 (file)
@@ -684,7 +684,7 @@ struct lru_gen_memcg {
 
 void lru_gen_init_pgdat(struct pglist_data *pgdat);
 void lru_gen_init_lruvec(struct lruvec *lruvec);
-bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr);
 
 void lru_gen_init_memcg(struct mem_cgroup *memcg);
 void lru_gen_exit_memcg(struct mem_cgroup *memcg);
@@ -703,7 +703,8 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
 {
 }
 
-static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw,
+               unsigned int nr)
 {
        return false;
 }
index 1357dc04f0651011790f754a93210d92e6bc1787..4ab833b8bcdf5473f9c7da91ae0a5af1111e0a95 100644 (file)
@@ -1848,10 +1848,4 @@ static inline int pmdp_test_and_clear_young_notify(struct vm_area_struct *vma,
 
 #endif /* CONFIG_MMU_NOTIFIER */
 
-static inline int ptep_test_and_clear_young_notify(struct vm_area_struct *vma,
-               unsigned long addr, pte_t *ptep)
-{
-       return test_and_clear_young_ptes_notify(vma, addr, ptep, 1);
-}
-
 #endif /* __MM_INTERNAL_H */
index cd48f34f11b5b00b2977544b7dec00a20e05a50d..abe4712a220cf04f3ce4c000b905d79875a63c8a 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -965,25 +965,20 @@ static bool folio_referenced_one(struct folio *folio,
                        return false;
                }
 
+               if (pvmw.pte && folio_test_large(folio)) {
+                       const unsigned long end_addr = pmd_addr_end(address, vma->vm_end);
+                       const unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT;
+                       pte_t pteval = ptep_get(pvmw.pte);
+
+                       nr = folio_pte_batch(folio, pvmw.pte, pteval, max_nr);
+               }
+
                if (lru_gen_enabled() && pvmw.pte) {
-                       if (lru_gen_look_around(&pvmw))
+                       if (lru_gen_look_around(&pvmw, nr))
                                referenced++;
                } else if (pvmw.pte) {
-                       if (folio_test_large(folio)) {
-                               unsigned long end_addr = pmd_addr_end(address, vma->vm_end);
-                               unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT;
-                               pte_t pteval = ptep_get(pvmw.pte);
-
-                               nr = folio_pte_batch(folio, pvmw.pte,
-                                                    pteval, max_nr);
-                       }
-
-                       ptes += nr;
                        if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr))
                                referenced++;
-                       /* Skip the batched PTEs */
-                       pvmw.pte += nr - 1;
-                       pvmw.address += (nr - 1) * PAGE_SIZE;
                } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
                        if (pmdp_clear_flush_young_notify(vma, address,
                                                pvmw.pmd))
@@ -993,6 +988,7 @@ static bool folio_referenced_one(struct folio *folio,
                        WARN_ON_ONCE(1);
                }
 
+               ptes += nr;
                pra->mapcount -= nr;
                /*
                 * If we are sure that we batched the entire folio,
@@ -1002,6 +998,10 @@ static bool folio_referenced_one(struct folio *folio,
                        page_vma_mapped_walk_done(&pvmw);
                        break;
                }
+
+               /* Skip the batched PTEs */
+               pvmw.pte += nr - 1;
+               pvmw.address += (nr - 1) * PAGE_SIZE;
        }
 
        if (referenced)
index 7ab9e1cdccd2776a4f87a089baf0519064dcd944..3a4a0a81c8719c89b814b7e9bc00c2f9da75a047 100644 (file)
@@ -3499,6 +3499,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
        struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
        DEFINE_MAX_SEQ(walk->lruvec);
        int gen = lru_gen_from_seq(max_seq);
+       unsigned int nr;
        pmd_t pmdval;
 
        pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl);
@@ -3517,11 +3518,13 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 
        lazy_mmu_mode_enable();
 restart:
-       for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
+       for (i = pte_index(start), addr = start; addr != end; i += nr, addr += nr * PAGE_SIZE) {
                unsigned long pfn;
                struct folio *folio;
-               pte_t ptent = ptep_get(pte + i);
+               pte_t *cur_pte = pte + i;
+               pte_t ptent = ptep_get(cur_pte);
 
+               nr = 1;
                total++;
                walk->mm_stats[MM_LEAF_TOTAL]++;
 
@@ -3533,7 +3536,16 @@ restart:
                if (!folio)
                        continue;
 
-               if (!ptep_test_and_clear_young_notify(args->vma, addr, pte + i))
+               if (folio_test_large(folio)) {
+                       const unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
+
+                       nr = folio_pte_batch_flags(folio, NULL, cur_pte, &ptent,
+                                                  max_nr, FPB_MERGE_YOUNG_DIRTY);
+                       total += nr - 1;
+                       walk->mm_stats[MM_LEAF_TOTAL] += nr - 1;
+               }
+
+               if (!test_and_clear_young_ptes_notify(args->vma, addr, cur_pte, nr))
                        continue;
 
                if (last != folio) {
@@ -3546,8 +3558,8 @@ restart:
                if (pte_dirty(ptent))
                        dirty = true;
 
-               young++;
-               walk->mm_stats[MM_LEAF_YOUNG]++;
+               young += nr;
+               walk->mm_stats[MM_LEAF_YOUNG] += nr;
        }
 
        walk_update_folio(walk, last, gen, dirty);
@@ -4191,7 +4203,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  * the PTE table to the Bloom filter. This forms a feedback loop between the
  * eviction and the aging.
  */
-bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr)
 {
        int i;
        bool dirty;
@@ -4214,7 +4226,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
        lockdep_assert_held(pvmw->ptl);
        VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
 
-       if (!ptep_test_and_clear_young_notify(vma, addr, pte))
+       if (!test_and_clear_young_ptes_notify(vma, addr, pte, nr))
                return false;
 
        if (spin_is_contended(pvmw->ptl))
@@ -4248,10 +4260,12 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 
        pte -= (addr - start) / PAGE_SIZE;
 
-       for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+       for (i = 0, addr = start; addr != end;
+            i += nr, pte += nr, addr += nr * PAGE_SIZE) {
                unsigned long pfn;
-               pte_t ptent = ptep_get(pte + i);
+               pte_t ptent = ptep_get(pte);
 
+               nr = 1;
                pfn = get_pte_pfn(ptent, vma, addr, pgdat);
                if (pfn == -1)
                        continue;
@@ -4260,7 +4274,14 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
                if (!folio)
                        continue;
 
-               if (!ptep_test_and_clear_young_notify(vma, addr, pte + i))
+               if (folio_test_large(folio)) {
+                       const unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
+
+                       nr = folio_pte_batch_flags(folio, NULL, pte, &ptent,
+                                                  max_nr, FPB_MERGE_YOUNG_DIRTY);
+               }
+
+               if (!test_and_clear_young_ptes_notify(vma, addr, pte, nr))
                        continue;
 
                if (last != folio) {
@@ -4273,7 +4294,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
                if (pte_dirty(ptent))
                        dirty = true;
 
-               young++;
+               young += nr;
        }
 
        walk_update_folio(walk, last, gen, dirty);