git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
mm: memcontrol: prepare for reparenting non-hierarchical stats
authorQi Zheng <zhengqi.arch@bytedance.com>
Thu, 5 Mar 2026 11:52:48 +0000 (19:52 +0800)
committerAndrew Morton <akpm@linux-foundation.org>
Sat, 18 Apr 2026 07:10:47 +0000 (00:10 -0700)
To resolve the dying memcg issue, we need to reparent LRU folios of child
memcg to its parent memcg.  This could cause problems for non-hierarchical
stats.

As Yosry Ahmed pointed out:

In short, if memory is charged to a dying cgroup at the time of
reparenting, when the memory gets uncharged the stats updates will occur
at the parent. This will update both hierarchical and non-hierarchical
stats of the parent, which would corrupt the parent's non-hierarchical
stats (because those counters were never incremented when the memory was
charged).

Now we have the following two types of non-hierarchical stats, and they
are only used in CONFIG_MEMCG_V1:

a. memcg->vmstats->state_local[i]
b. pn->lruvec_stats->state_local[i]

To ensure that these non-hierarchical stats work properly, we need to
reparent these non-hierarchical stats after reparenting LRU folios. To
this end, this commit makes the following preparations:

1. implement reparent_state_local() to reparent non-hierarchical stats
2. make css_killed_work_fn() be called as an RCU work item, and implement
   get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid races
   between mod_memcg_state()/mod_memcg_lruvec_state()
   and reparent_state_local()

Link: https://lore.kernel.org/e862995c45a7101a541284b6ebee5e5c32c89066.1772711148.git.zhengqi.arch@bytedance.com
Co-developed-by: Yosry Ahmed <yosry@kernel.org>
Signed-off-by: Yosry Ahmed <yosry@kernel.org>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Allen Pais <apais@linux.microsoft.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Imran Khan <imran.f.khan@oracle.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Vlastimil Babka <vbabka@kernel.org>
Cc: Wei Xu <weixugc@google.com>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
kernel/cgroup/cgroup.c
mm/memcontrol-v1.c
mm/memcontrol-v1.h
mm/memcontrol.c

index 01fc2a93f3ef229cfb4d5038df9cd2b42ce57439..babf7b45604881de312e3c91e0f695e2bfd246eb 100644 (file)
@@ -6050,8 +6050,9 @@ out_unlock:
  */
 static void css_killed_work_fn(struct work_struct *work)
 {
-       struct cgroup_subsys_state *css =
-               container_of(work, struct cgroup_subsys_state, destroy_work);
+       struct cgroup_subsys_state *css;
+
+       css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork);
 
        cgroup_lock();
 
@@ -6072,8 +6073,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
                container_of(ref, struct cgroup_subsys_state, refcnt);
 
        if (atomic_dec_and_test(&css->online_cnt)) {
-               INIT_WORK(&css->destroy_work, css_killed_work_fn);
-               queue_work(cgroup_offline_wq, &css->destroy_work);
+               INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
+               queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
        }
 }
 
index 437cd25784fe56f1f7c94745d18bfb98769ba400..8380adfa0f688304071d0cacdf6702520d1048a9 100644 (file)
@@ -1884,6 +1884,22 @@ static const unsigned int memcg1_events[] = {
        PGMAJFAULT,
 };
 
+void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
+               reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
+}
+
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+       int i;
+
+       for (i = 0; i < NR_LRU_LISTS; i++)
+               reparent_memcg_lruvec_state_local(memcg, parent, i);
+}
+
 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
 {
        unsigned long memory, memsw;
index 1b969294ea6a0a0aea69bab3e49f20179d297a65..f92f81108d5ed20a5b8d04a12e90143b6a675c95 100644 (file)
@@ -73,6 +73,13 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
                           unsigned long nr_memory, int nid);
 
 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
+void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
+
+void reparent_memcg_state_local(struct mem_cgroup *memcg,
+                               struct mem_cgroup *parent, int idx);
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+                                      struct mem_cgroup *parent, int idx);
 
 void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
 static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
index 271d4c6307b6e6c4189b9dccf14b64ea89e8c193..c9e5ea0d9fc62ddd731ae5836803950b2816db48 100644 (file)
@@ -225,6 +225,34 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
        return objcg;
 }
 
+#ifdef CONFIG_MEMCG_V1
+static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
+
+static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+       if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+               return;
+
+       /*
+        * Reparent stats exposed non-hierarchically. Flush @memcg's stats first
+        * to read its stats accurately, and conservatively flush @parent's
+        * stats after reparenting to avoid hiding a potentially large stat
+        * update (e.g. from callers of mem_cgroup_flush_stats_ratelimited()).
+        */
+       __mem_cgroup_flush_stats(memcg, true);
+
+       /* The following counts are all non-hierarchical and need to be reparented. */
+       reparent_memcg1_state_local(memcg, parent);
+       reparent_memcg1_lruvec_state_local(memcg, parent);
+
+       __mem_cgroup_flush_stats(parent, true);
+}
+#else
+static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+}
+#endif
+
 static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
 {
        spin_lock_irq(&objcg_lock);
@@ -472,6 +500,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
        return x;
 }
 
+#ifdef CONFIG_MEMCG_V1
+static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
+                                    enum node_stat_item idx, int val);
+
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+                                      struct mem_cgroup *parent, int idx)
+{
+       int nid;
+
+       for_each_node(nid) {
+               struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+               struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
+               unsigned long value = lruvec_page_state_local(child_lruvec, idx);
+               struct mem_cgroup_per_node *child_pn, *parent_pn;
+
+               child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
+               parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
+
+               __mod_memcg_lruvec_state(child_pn, idx, -value);
+               __mod_memcg_lruvec_state(parent_pn, idx, value);
+       }
+}
+#endif
+
 /* Subset of vm_event_item to report for memcg event stats */
 static const unsigned int memcg_vm_event_stat[] = {
 #ifdef CONFIG_MEMCG_V1
@@ -717,6 +769,42 @@ static int memcg_state_val_in_pages(int idx, int val)
                return max(val * unit / PAGE_SIZE, 1UL);
 }
 
+#ifdef CONFIG_MEMCG_V1
+/*
+ * Used in mod_memcg_state() and mod_memcg_lruvec_state() to avoid race with
+ * reparenting of non-hierarchical state_locals.
+ */
+static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
+{
+       if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+               return memcg;
+
+       rcu_read_lock();
+
+       while (memcg_is_dying(memcg))
+               memcg = parent_mem_cgroup(memcg);
+
+       return memcg;
+}
+
+static inline void get_non_dying_memcg_end(void)
+{
+       if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+               return;
+
+       rcu_read_unlock();
+}
+#else
+static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
+{
+       return memcg;
+}
+
+static inline void get_non_dying_memcg_end(void)
+{
+}
+#endif
+
 static void __mod_memcg_state(struct mem_cgroup *memcg,
                              enum memcg_stat_item idx, int val)
 {
@@ -768,6 +856,15 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 #endif
        return x;
 }
+
+void reparent_memcg_state_local(struct mem_cgroup *memcg,
+                               struct mem_cgroup *parent, int idx)
+{
+       unsigned long value = memcg_page_state_local(memcg, idx);
+
+       __mod_memcg_state(memcg, idx, -value);
+       __mod_memcg_state(parent, idx, value);
+}
 #endif
 
 static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,