git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
mm: memcontrol: convert objcg to be per-memcg per-node type
authorQi Zheng <zhengqi.arch@bytedance.com>
Thu, 5 Mar 2026 11:52:49 +0000 (19:52 +0800)
committerAndrew Morton <akpm@linux-foundation.org>
Sat, 18 Apr 2026 07:10:47 +0000 (00:10 -0700)
Convert objcg to be a per-memcg per-node type, so that when reparenting LRU
folios later, we can hold the lru lock at the node level, thus avoiding
holding too many lru locks at once.

[zhengqi.arch@bytedance.com: reset pn->orig_objcg to NULL]
Link: https://lore.kernel.org/20260309112939.31937-1-qi.zheng@linux.dev
[akpm@linux-foundation.org: fix comment typo, per Usama.  Reflow comment to 80 cols]
[devnexen@gmail.com: fix obj_cgroup leak in mem_cgroup_css_online() error path]
Link: https://lore.kernel.org/20260322193631.45457-1-devnexen@gmail.com
[devnexen@gmail.com: add newline, per Qi Zheng]
Link: https://lore.kernel.org/20260323063007.7783-1-devnexen@gmail.com
Link: https://lore.kernel.org/56c04b1c5d54f75ccdc12896df6c1ca35403ecc3.1772711148.git.zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: David Carlier <devnexen@gmail.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Allen Pais <apais@linux.microsoft.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Imran Khan <imran.f.khan@oracle.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Vlastimil Babka <vbabka@kernel.org>
Cc: Wei Xu <weixugc@google.com>
Cc: Yosry Ahmed <yosry@kernel.org>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Usama Arif <usama.arif@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/memcontrol.h
include/linux/sched.h
mm/memcontrol.c

index 12982875073e35fa714aa67e017a46921fa1a339..3e836b56bfcb84942286082d8c2a7c6f16303f33 100644 (file)
@@ -115,6 +115,16 @@ struct mem_cgroup_per_node {
        unsigned long           lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
        struct mem_cgroup_reclaim_iter  iter;
 
+       /*
+        * objcg is wiped out as a part of the objcg reparenting process.
+        * orig_objcg preserves a pointer (and a reference) to the original
+        * objcg until the end of life of memcg.
+        */
+       struct obj_cgroup __rcu *objcg;
+       struct obj_cgroup       *orig_objcg;
+       /* list of inherited objcgs, protected by objcg_lock */
+       struct list_head objcg_list;
+
 #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
        /* slab stats for nmi context */
        atomic_t                slab_reclaimable;
@@ -179,6 +189,7 @@ struct obj_cgroup {
                struct list_head list; /* protected by objcg_lock */
                struct rcu_head rcu;
        };
+       bool is_root;
 };
 
 /*
@@ -257,15 +268,6 @@ struct mem_cgroup {
        seqlock_t               socket_pressure_seqlock;
 #endif
        int kmemcg_id;
-       /*
-        * memcg->objcg is wiped out as a part of the objcg repaprenting
-        * process. memcg->orig_objcg preserves a pointer (and a reference)
-        * to the original objcg until the end of live of memcg.
-        */
-       struct obj_cgroup __rcu *objcg;
-       struct obj_cgroup       *orig_objcg;
-       /* list of inherited objcgs, protected by objcg_lock */
-       struct list_head objcg_list;
 
        struct memcg_vmstats_percpu __percpu *vmstats_percpu;
 
@@ -332,7 +334,6 @@ struct mem_cgroup {
 #define MEMCG_CHARGE_BATCH 64U
 
 extern struct mem_cgroup *root_mem_cgroup;
-extern struct obj_cgroup *root_obj_cgroup;
 
 enum page_memcg_data_flags {
        /* page->memcg_data is a pointer to an slabobj_ext vector */
@@ -551,7 +552,7 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 
 static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg)
 {
-       return objcg == root_obj_cgroup;
+       return objcg->is_root;
 }
 
 static inline bool mem_cgroup_disabled(void)
index 5a5d3dbc9cdf332c46a96012fceb611a9e944e53..0d27775546f8ce4c737e5d1b24935a2dfc220543 100644 (file)
@@ -1533,7 +1533,7 @@ struct task_struct {
        /* Used by memcontrol for targeted memcg charge: */
        struct mem_cgroup               *active_memcg;
 
-       /* Cache for current->cgroups->memcg->objcg lookups: */
+       /* Cache for current->cgroups->memcg->nodeinfo[nid]->objcg lookups: */
        struct obj_cgroup               *objcg;
 #endif
 
index c9e5ea0d9fc62ddd731ae5836803950b2816db48..1aaa66f729b345bf8f7876f0c9dbf193e54c77c9 100644 (file)
@@ -83,8 +83,6 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
 struct mem_cgroup *root_mem_cgroup __read_mostly;
 EXPORT_SYMBOL(root_mem_cgroup);
 
-struct obj_cgroup *root_obj_cgroup __read_mostly;
-
 /* Active memory cgroup to use from an interrupt context */
 DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
 EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
@@ -209,18 +207,21 @@ static struct obj_cgroup *obj_cgroup_alloc(void)
 }
 
 static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memcg,
-                                                        struct mem_cgroup *parent)
+                                                        struct mem_cgroup *parent,
+                                                        int nid)
 {
        struct obj_cgroup *objcg, *iter;
+       struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+       struct mem_cgroup_per_node *parent_pn = parent->nodeinfo[nid];
 
-       objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+       objcg = rcu_replace_pointer(pn->objcg, NULL, true);
        /* 1) Ready to reparent active objcg. */
-       list_add(&objcg->list, &memcg->objcg_list);
+       list_add(&objcg->list, &pn->objcg_list);
        /* 2) Reparent active objcg and already reparented objcgs to parent. */
-       list_for_each_entry(iter, &memcg->objcg_list, list)
+       list_for_each_entry(iter, &pn->objcg_list, list)
                WRITE_ONCE(iter->memcg, parent);
        /* 3) Move already reparented objcgs to the parent's list */
-       list_splice(&memcg->objcg_list, &parent->objcg_list);
+       list_splice(&pn->objcg_list, &parent_pn->objcg_list);
 
        return objcg;
 }
@@ -267,14 +268,17 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg)
 {
        struct obj_cgroup *objcg;
        struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+       int nid;
 
-       reparent_locks(memcg, parent);
+       for_each_node(nid) {
+               reparent_locks(memcg, parent);
 
-       objcg = __memcg_reparent_objcgs(memcg, parent);
+               objcg = __memcg_reparent_objcgs(memcg, parent, nid);
 
-       reparent_unlocks(memcg, parent);
+               reparent_unlocks(memcg, parent);
 
-       percpu_ref_kill(&objcg->refcnt);
+               percpu_ref_kill(&objcg->refcnt);
+       }
 }
 
 /*
@@ -2830,8 +2834,10 @@ struct mem_cgroup *mem_cgroup_from_virt(void *p)
 
 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
 {
+       int nid = numa_node_id();
+
        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
-               struct obj_cgroup *objcg = rcu_dereference(memcg->objcg);
+               struct obj_cgroup *objcg = rcu_dereference(memcg->nodeinfo[nid]->objcg);
 
                if (likely(objcg && obj_cgroup_tryget(objcg)))
                        return objcg;
@@ -2895,6 +2901,7 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
 {
        struct mem_cgroup *memcg;
        struct obj_cgroup *objcg;
+       int nid = numa_node_id();
 
        if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
                return NULL;
@@ -2911,14 +2918,14 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
                 * Objcg reference is kept by the task, so it's safe
                 * to use the objcg by the current task.
                 */
-               return objcg ? : root_obj_cgroup;
+               return objcg ? : rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
        }
 
        memcg = this_cpu_read(int_active_memcg);
        if (unlikely(memcg))
                goto from_memcg;
 
-       return root_obj_cgroup;
+       return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
 
 from_memcg:
        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
@@ -2928,12 +2935,12 @@ from_memcg:
                 * away and can be used within the scope without any additional
                 * protection.
                 */
-               objcg = rcu_dereference_check(memcg->objcg, 1);
+               objcg = rcu_dereference_check(memcg->nodeinfo[nid]->objcg, 1);
                if (likely(objcg))
                        return objcg;
        }
 
-       return root_obj_cgroup;
+       return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
 }
 
 struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
@@ -3876,6 +3883,8 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
        if (!pn->lruvec_stats_percpu)
                goto fail;
 
+       INIT_LIST_HEAD(&pn->objcg_list);
+
        lruvec_init(&pn->lruvec);
        pn->memcg = memcg;
 
@@ -3890,10 +3899,14 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
        int node;
 
-       obj_cgroup_put(memcg->orig_objcg);
+       for_each_node(node) {
+               struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+               if (!pn)
+                       continue;
 
-       for_each_node(node)
-               free_mem_cgroup_per_node_info(memcg->nodeinfo[node]);
+               obj_cgroup_put(pn->orig_objcg);
+               free_mem_cgroup_per_node_info(pn);
+       }
        memcg1_free_events(memcg);
        kfree(memcg->vmstats);
        free_percpu(memcg->vmstats_percpu);
@@ -3964,7 +3977,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
 #endif
        memcg1_memcg_init(memcg);
        memcg->kmemcg_id = -1;
-       INIT_LIST_HEAD(&memcg->objcg_list);
 #ifdef CONFIG_CGROUP_WRITEBACK
        INIT_LIST_HEAD(&memcg->cgwb_list);
        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
@@ -4041,6 +4053,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct obj_cgroup *objcg;
+       int nid;
 
        memcg_online_kmem(memcg);
 
@@ -4052,17 +4065,19 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
        if (alloc_shrinker_info(memcg))
                goto offline_kmem;
 
-       objcg = obj_cgroup_alloc();
-       if (!objcg)
-               goto free_shrinker;
+       for_each_node(nid) {
+               objcg = obj_cgroup_alloc();
+               if (!objcg)
+                       goto free_objcg;
 
-       if (unlikely(mem_cgroup_is_root(memcg)))
-               root_obj_cgroup = objcg;
+               if (unlikely(mem_cgroup_is_root(memcg)))
+                       objcg->is_root = true;
 
-       objcg->memcg = memcg;
-       rcu_assign_pointer(memcg->objcg, objcg);
-       obj_cgroup_get(objcg);
-       memcg->orig_objcg = objcg;
+               objcg->memcg = memcg;
+               rcu_assign_pointer(memcg->nodeinfo[nid]->objcg, objcg);
+               obj_cgroup_get(objcg);
+               memcg->nodeinfo[nid]->orig_objcg = objcg;
+       }
 
        if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled())
                queue_delayed_work(system_dfl_wq, &stats_flush_dwork,
@@ -4086,7 +4101,24 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
        xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL);
 
        return 0;
-free_shrinker:
+free_objcg:
+       for_each_node(nid) {
+               struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+
+               objcg = rcu_replace_pointer(pn->objcg, NULL, true);
+               if (objcg)
+                       percpu_ref_kill(&objcg->refcnt);
+
+               if (pn->orig_objcg) {
+                       obj_cgroup_put(pn->orig_objcg);
+                       /*
+                        * Reset pn->orig_objcg to NULL to prevent
+                        * obj_cgroup_put() from being called again in
+                        * __mem_cgroup_free().
+                        */
+                       pn->orig_objcg = NULL;
+               }
+       }
        free_shrinker_info(memcg);
 offline_kmem:
        memcg_offline_kmem(memcg);