mm: vmscan: prepare for reparenting MGLRU folios

author Qi Zheng <zhengqi.arch@bytedance.com>

Thu, 5 Mar 2026 11:52:44 +0000 (19:52 +0800)

committer Andrew Morton <akpm@linux-foundation.org>

Sat, 18 Apr 2026 07:10:47 +0000 (00:10 -0700)
author Qi Zheng <zhengqi.arch@bytedance.com>
Thu, 5 Mar 2026 11:52:44 +0000 (19:52 +0800)
committer Andrew Morton <akpm@linux-foundation.org>
Sat, 18 Apr 2026 07:10:47 +0000 (00:10 -0700)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 4a20df1322580248414c311ee09023a6f6be7687..20f920dede65662170cd7419622d2d2c8d9a09a9 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -692,6 +692,9 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg);
  void lru_gen_offline_memcg(struct mem_cgroup *memcg);
  void lru_gen_release_memcg(struct mem_cgroup *memcg);
  void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid);
+void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid);
+bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid);
+void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid);
  
  #else /* !CONFIG_LRU_GEN */
  
@@ -733,6 +736,20 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
  {
  }
  
+static inline void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid)
+{
+}
+
+static inline bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid)
+{
+       return true;
+}
+
+static inline
+void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid)
+{
+}
+
  #endif /* CONFIG_LRU_GEN */
  
  struct lruvec {
diff --git a/mm/vmscan.c b/mm/vmscan.c

index d225e84b52637b9b33cdc59cc4ecef3a1ba3e8bc..8472aa4bddd53acbc4a24b3bc745a15b226388d5 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4426,6 +4426,148 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
                 lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
  }
  
+bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid)
+{
+       struct lruvec *lruvec = get_lruvec(memcg, nid);
+       int type;
+
+       for (type = 0; type < ANON_AND_FILE; type++) {
+               if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
+                       return false;
+       }
+
+       return true;
+}
+
+static void try_to_inc_max_seq_nowalk(struct mem_cgroup *memcg,
+                                     struct lruvec *lruvec)
+{
+       struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+       struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
+       int swappiness = mem_cgroup_swappiness(memcg);
+       DEFINE_MAX_SEQ(lruvec);
+       bool success = false;
+
+       /*
+        * We are not iterating the mm_list here, updating mm_state->seq is just
+        * to make mm walkers work properly.
+        */
+       if (mm_state) {
+               spin_lock(&mm_list->lock);
+               VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
+               if (max_seq > mm_state->seq) {
+                       WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+                       success = true;
+               }
+               spin_unlock(&mm_list->lock);
+       } else {
+               success = true;
+       }
+
+       if (success)
+               inc_max_seq(lruvec, max_seq, swappiness);
+}
+
+/*
+ * We need to ensure that the folios of the child memcg can be reparented to
+ * the same gen of the parent memcg, so the number of gens of the parent memcg
+ * needs to be incremented to MAX_NR_GENS before reparenting.
+ */
+void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid)
+{
+       struct lruvec *lruvec = get_lruvec(memcg, nid);
+       int type;
+
+       for (type = 0; type < ANON_AND_FILE; type++) {
+               while (get_nr_gens(lruvec, type) < MAX_NR_GENS) {
+                       try_to_inc_max_seq_nowalk(memcg, lruvec);
+                       cond_resched();
+               }
+       }
+}
+
+/*
+ * Compared to traditional LRU, MGLRU faces the following challenges:
+ *
+ * 1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, and
+ *    the number of generations of the parent and child memcg may differ,
+ *    so we cannot simply transfer MGLRU folios in the child memcg to the
+ *    parent memcg as we did for traditional LRU folios.
+ * 2. The generation information is stored in folio->flags, but we cannot
+ *    traverse these folios while holding the lru lock, otherwise it may
+ *    cause a soft lockup.
+ * 3. In walk_update_folio(), the gen of folio and corresponding lru size
+ *    may be updated, but the folio is not immediately moved to the
+ *    corresponding lru list. Therefore, there may be folios of different
+ *    generations on an LRU list.
+ * 4. In lru_gen_del_folio(), the generation to which the folio belongs is
+ *    found based on the generation information in folio->flags, and the
+ *    corresponding LRU size will be updated. Therefore, we need to update
+ *    the lru size correctly during reparenting, otherwise the lru size may
+ *    be updated incorrectly in lru_gen_del_folio().
+ *
+ * Finally, we choose a compromise method, which is to splice the lru list in
+ * the child memcg to the lru list of the same generation in the parent memcg
+ * during reparenting.
+ *
+ * The same generation has different meanings in the parent and child memcg,
+ * so this compromise method will cause the LRU inversion problem. But as the
+ * system runs, this problem will be fixed automatically.
+ */
+static void __lru_gen_reparent_memcg(struct lruvec *child_lruvec, struct lruvec *parent_lruvec,
+                                    int zone, int type)
+{
+       struct lru_gen_folio *child_lrugen, *parent_lrugen;
+       enum lru_list lru = type * LRU_INACTIVE_FILE;
+       int i;
+
+       child_lrugen = &child_lruvec->lrugen;
+       parent_lrugen = &parent_lruvec->lrugen;
+
+       for (i = 0; i < get_nr_gens(child_lruvec, type); i++) {
+               int gen = lru_gen_from_seq(child_lrugen->max_seq - i);
+               long nr_pages = child_lrugen->nr_pages[gen][type][zone];
+               int child_lru_active = lru_gen_is_active(child_lruvec, gen) ? LRU_ACTIVE : 0;
+               int parent_lru_active = lru_gen_is_active(parent_lruvec, gen) ? LRU_ACTIVE : 0;
+
+               /* Assuming that child pages are colder than parent pages */
+               list_splice_tail_init(&child_lrugen->folios[gen][type][zone],
+                                     &parent_lrugen->folios[gen][type][zone]);
+
+               WRITE_ONCE(child_lrugen->nr_pages[gen][type][zone], 0);
+               WRITE_ONCE(parent_lrugen->nr_pages[gen][type][zone],
+                          parent_lrugen->nr_pages[gen][type][zone] + nr_pages);
+
+               if (lru_gen_is_active(child_lruvec, gen) != lru_gen_is_active(parent_lruvec, gen)) {
+                       __update_lru_size(child_lruvec, lru + child_lru_active, zone, -nr_pages);
+                       __update_lru_size(parent_lruvec, lru + parent_lru_active, zone, nr_pages);
+               }
+       }
+}
+
+void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid)
+{
+       struct lruvec *child_lruvec, *parent_lruvec;
+       int type, zid;
+       struct zone *zone;
+       enum lru_list lru;
+
+       child_lruvec = get_lruvec(memcg, nid);
+       parent_lruvec = get_lruvec(parent, nid);
+
+       for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1)
+               for (type = 0; type < ANON_AND_FILE; type++)
+                       __lru_gen_reparent_memcg(child_lruvec, parent_lruvec, zid, type);
+
+       for_each_lru(lru) {
+               for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) {
+                       unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid);
+
+                       mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size);
+               }
+       }
+}
+
  #endif /* CONFIG_MEMCG */
  
  /******************************************************************************
author	Qi Zheng <zhengqi.arch@bytedance.com>
	Thu, 5 Mar 2026 11:52:44 +0000 (19:52 +0800)
committer	Andrew Morton <akpm@linux-foundation.org>
	Sat, 18 Apr 2026 07:10:47 +0000 (00:10 -0700)
include/linux/mmzone.h		patch \| blob \| blame \| history
mm/vmscan.c		patch \| blob \| blame \| history