git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
mm: vmscan: prepare for reparenting traditional LRU folios
author Qi Zheng <zhengqi.arch@bytedance.com>
Thu, 5 Mar 2026 11:52:43 +0000 (19:52 +0800)
committer Andrew Morton <akpm@linux-foundation.org>
Sat, 18 Apr 2026 07:10:46 +0000 (00:10 -0700)
To resolve the dying memcg issue, we need to reparent LRU folios of child
memcg to its parent memcg.  For traditional LRU list, each lruvec of every
memcg comprises four LRU lists.  Due to the symmetry of the LRU lists, it
is feasible to transfer the LRU lists from a memcg to its parent memcg
during the reparenting process.

This commit implements the specific function, which will be used during
the reparenting process.

Link: https://lore.kernel.org/a92d217a9fc82bd0c401210204a095caaf615b1c.1772711148.git.zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Muchun Song <muchun.song@linux.dev>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Allen Pais <apais@linux.microsoft.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Imran Khan <imran.f.khan@oracle.com>
Cc: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Vlastimil Babka <vbabka@kernel.org>
Cc: Wei Xu <weixugc@google.com>
Cc: Yosry Ahmed <yosry@kernel.org>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/swap.h
mm/swap.c
mm/vmscan.c

index ea08e2afa2b4fc0afcb8d8cd1e06f232aaf19a13..d653fe050b8f66bd33683b23f2da2ab31bcef861 100644 (file)
@@ -546,6 +546,8 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 
        return READ_ONCE(memcg->swappiness);
 }
+
+void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid);
 #else
 static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
 {
@@ -610,5 +612,24 @@ static inline bool mem_cgroup_swap_full(struct folio *folio)
 }
 #endif
 
+/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
+ * and including the specified highidx
+ * @zone: The current zone in the iterator
+ * @pgdat: The pgdat whose node_zones are being iterated
+ * @idx: The index variable
+ * @highidx: The index of the highest zone to return
+ *
+ * Walks zone indexes 0..@highidx and skips every zone for which managed_zone() is false; the
+ * statement that follows the macro is the loop body (it binds to the trailing "else"). The
+ * zone iterator is left in an invalid state afterwards and must be reinitialized before reuse.
+ */
+#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \
+       for ((idx) = 0, (zone) = (pgdat)->node_zones;           \
+           (idx) <= (highidx);                                 \
+           (idx)++, (zone)++)                                  \
+               if (!managed_zone(zone))                        \
+                       continue;                               \
+               else
+
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
index bcd2b52e5def1b054603df52b02657ff5b56f2c7..5cc44f0de98772ee05946d10ba4c66f2de465727 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1090,6 +1090,39 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
        fbatch->nr = j;
 }
 
+#ifdef CONFIG_MEMCG
+/*
+ * lruvec_reparent_lru - hand one LRU list of a child lruvec over to its parent
+ * @child_lruvec: lruvec whose list and per-zone sizes are read
+ * @parent_lruvec: lruvec that receives the list entries and the size accounting
+ * @lru: which LRU list to transfer
+ * @nid: node whose managed zones are walked for the size accounting
+ *
+ * NOTE(review): no lock is taken in this function; presumably the caller
+ * serializes access to both lruvecs - confirm against the reparenting caller.
+ */
+static void lruvec_reparent_lru(struct lruvec *child_lruvec,
+                               struct lruvec *parent_lruvec,
+                               enum lru_list lru, int nid)
+{
+       int zid;
+       struct zone *zone;
+
+       /*
+        * Append the child's list to the tail of the parent's list and
+        * reinitialize the child's list head. The unevictable list is not
+        * spliced; only its sizes are transferred below.
+        * NOTE(review): presumably lists[LRU_UNEVICTABLE] is not used to link
+        * folios - confirm.
+        */
+       if (lru != LRU_UNEVICTABLE)
+               list_splice_tail_init(&child_lruvec->lists[lru], &parent_lruvec->lists[lru]);
+
+       /*
+        * For every managed zone of the node, add the child's size for this
+        * LRU to the parent. The child's own counters are not written here.
+        * NOTE(review): confirm that leaving them untouched is intended, and
+        * that passing an unsigned long size matches the callee's parameter type.
+        */
+       for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) {
+               unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid);
+
+               mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size);
+       }
+}
+
+/*
+ * lru_reparent_memcg - transfer a memcg's traditional LRU state on one node to @parent
+ * @memcg: memory cgroup whose per-node lruvec is the source
+ * @parent: memory cgroup whose per-node lruvec is the destination
+ * @nid: node id selecting which pair of lruvecs to operate on
+ *
+ * Adds the child's anon_cost and file_cost to the parent's, then hands over
+ * every LRU list with lruvec_reparent_lru(). The child's cost fields are
+ * not reset here.
+ * NOTE(review): no locking is visible in this function; presumably the
+ * caller provides it - confirm.
+ */
+void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid)
+{
+       enum lru_list lru;
+       struct lruvec *child_lruvec, *parent_lruvec;
+
+       child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+       parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
+       /* Fold the child's accumulated reclaim costs into the parent. */
+       parent_lruvec->anon_cost += child_lruvec->anon_cost;
+       parent_lruvec->file_cost += child_lruvec->file_cost;
+
+       /* Transfer each LRU list and its per-zone sizes in turn. */
+       for_each_lru(lru)
+               lruvec_reparent_lru(child_lruvec, parent_lruvec, lru, nid);
+}
+#endif
+
 static const struct ctl_table swap_sysctl_table[] = {
        {
                .procname       = "page-cluster",
index d4b649abe645fb4c11b12c1da57b10a8d6103c49..d225e84b52637b9b33cdc59cc4ecef3a1ba3e8bc 100644 (file)
@@ -269,25 +269,6 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
 }
 #endif
 
-/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
- * and including the specified highidx
- * @zone: The current zone in the iterator
- * @pgdat: The pgdat which node_zones are being iterated
- * @idx: The index variable
- * @highidx: The index of the highest zone to return
- *
- * This macro iterates through all managed zones up to and including the specified highidx.
- * The zone iterator enters an invalid state after macro call and must be reinitialized
- * before it can be used again.
- */
-#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \
-       for ((idx) = 0, (zone) = (pgdat)->node_zones;           \
-           (idx) <= (highidx);                                 \
-           (idx)++, (zone)++)                                  \
-               if (!managed_zone(zone))                        \
-                       continue;                               \
-               else
-
 static void set_task_reclaim_state(struct task_struct *task,
                                   struct reclaim_state *rs)
 {