git.ipfire.org Git - thirdparty/linux.git/commitdiff
mm/memcg, swap: store cgroup id in cluster table directly
author Kairui Song <kasong@tencent.com>
Sun, 17 May 2026 15:39:49 +0000 (23:39 +0800)
committer Andrew Morton <akpm@linux-foundation.org>
Tue, 2 Jun 2026 22:22:23 +0000 (15:22 -0700)
Drop the usage of the swap_cgroup_ctrl, and use the dynamic cluster table
instead.

The per-cluster memcg table is 1024 / 512 bytes on most archs, and does
not need RCU protection: the cgroup data is only read and written under
the cluster lock.  That keeps things simple, lets the allocation use plain
kmalloc with immediate kfree (no deferred free), and keeps fragmentation
acceptable.

[akpm@linux-foundation.org: memcgv1: don't compile swap functions when CONFIG_SWAP=n]
Link: https://lore.kernel.org/202605281711.bSeZlErK-lkp@intel.com
[akpm@linux-foundation.org: fix CONFIG_SWAP=n build]
Link: https://lore.kernel.org/20260517-swap-table-p4-v5-10-88ae43e064c7@tencent.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Acked-by: Chris Li <chrisl@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lorenzo Stoakes <ljs@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Youngjun Park <youngjun.park@lge.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/memcontrol.h
include/linux/swap.h
mm/memcontrol-v1.c
mm/memcontrol.c
mm/swap.h
mm/swap_state.c
mm/swap_table.h
mm/swapfile.c
mm/vmscan.c

index a013f37f24aa05ac704764ef3f3d00d17ff7162a..8f2662db166b48648ecb6f2269ed7b0f174d2c13 100644 (file)
@@ -29,6 +29,7 @@ struct obj_cgroup;
 struct page;
 struct mm_struct;
 struct kmem_cache;
+struct swap_cluster_info;
 
 /* Cgroup-specific page state, on top of universal node page state */
 enum memcg_stat_item {
@@ -1899,9 +1900,6 @@ static inline void mem_cgroup_exit_user_fault(void)
        current->in_user_fault = 0;
 }
 
-void __memcg1_swapout(struct folio *folio);
-void memcg1_swapin(struct folio *folio);
-
 #else /* CONFIG_MEMCG_V1 */
 static inline
 unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
@@ -1929,14 +1927,23 @@ static inline void mem_cgroup_exit_user_fault(void)
 {
 }
 
-static inline void __memcg1_swapout(struct folio *folio)
+#endif /* CONFIG_MEMCG_V1 */
+
+#if defined(CONFIG_MEMCG_V1) && defined(CONFIG_SWAP)
+
+void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci);
+void memcg1_swapin(struct folio *folio);
+
+#else
+
+static inline void __memcg1_swapout(struct folio *folio,
+               struct swap_cluster_info *ci)
 {
 }
 
 static inline void memcg1_swapin(struct folio *folio)
 {
 }
-
-#endif /* CONFIG_MEMCG_V1 */
+#endif
 
 #endif /* _LINUX_MEMCONTROL_H */
index f907d3df52d0f0b13d531e311e4a8c8878e86a39..200e7c345f26abeaa0c768cf298047f8d9d163e2 100644 (file)
@@ -579,12 +579,12 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio)
        return __mem_cgroup_try_charge_swap(folio);
 }
 
-extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
-static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+extern void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages);
+static inline void mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages)
 {
        if (mem_cgroup_disabled())
                return;
-       __mem_cgroup_uncharge_swap(entry, nr_pages);
+       __mem_cgroup_uncharge_swap(id, nr_pages);
 }
 
 extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
@@ -595,7 +595,7 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio)
        return 0;
 }
 
-static inline void mem_cgroup_uncharge_swap(swp_entry_t entry,
+static inline void mem_cgroup_uncharge_swap(unsigned short id,
                                            unsigned int nr_pages)
 {
 }
index 36c507d81dc514d75c737cb8980e8060415fbd7a..517b21236672bcac1f5b749bc81a88618a33ddc6 100644 (file)
@@ -14,6 +14,7 @@
 
 #include "internal.h"
 #include "swap.h"
+#include "swap_table.h"
 #include "memcontrol-v1.h"
 
 /*
@@ -603,17 +604,19 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
        local_irq_restore(flags);
 }
 
+#ifdef CONFIG_SWAP
 /**
  * __memcg1_swapout - transfer a memsw charge to swap
  * @folio: folio whose memsw charge to transfer
+ * @ci: the locked swap cluster holding the swap entries
  *
  * Transfer the memsw charge of @folio to the swap entry stored in
  * folio->swap.
  *
- * Context: folio must be isolated, unmapped, locked and is just about
- * to be freed, and caller must disable IRQs.
+ * Context: folio must be isolated, unmapped, locked and is just about to
+ * be freed, and caller must disable IRQs and hold the swap cluster lock.
  */
-void __memcg1_swapout(struct folio *folio)
+void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci)
 {
        struct mem_cgroup *memcg, *swap_memcg;
        struct obj_cgroup *objcg;
@@ -646,7 +649,8 @@ void __memcg1_swapout(struct folio *folio)
        swap_memcg = mem_cgroup_private_id_get_online(memcg, nr_entries);
        mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
 
-       swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), folio->swap);
+       __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_entries,
+                         mem_cgroup_private_id(swap_memcg));
 
        folio_unqueue_deferred_split(folio);
        folio->memcg_data = 0;
@@ -661,8 +665,7 @@ void __memcg1_swapout(struct folio *folio)
        }
 
        /*
-        * Interrupts should be disabled here because the caller holds the
-        * i_pages lock which is taken with interrupts-off. It is
+        * The caller must hold the swap cluster lock with IRQ off. It is
         * important here to have the interrupts disabled because it is the
         * only synchronisation we have for updating the per-CPU variables.
         */
@@ -677,7 +680,7 @@ void __memcg1_swapout(struct folio *folio)
 }
 
 /**
- * memcg1_swapin - uncharge swap slot
+ * memcg1_swapin - uncharge swap slot on swapin
  * @folio: folio being swapped in
  *
  * Call this function after successfully adding the charged
@@ -687,6 +690,10 @@ void __memcg1_swapout(struct folio *folio)
  */
 void memcg1_swapin(struct folio *folio)
 {
+       struct swap_cluster_info *ci;
+       unsigned long nr_pages;
+       unsigned short id;
+
        VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
        VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
 
@@ -702,15 +709,22 @@ void memcg1_swapin(struct folio *folio)
         * correspond 1:1 to page and swap slot lifetimes: we charge the
         * page to memory here, and uncharge swap when the slot is freed.
         */
-       if (do_memsw_account()) {
-               /*
-                * The swap entry might not get freed for a long time,
-                * let's not wait for it.  The page already received a
-                * memory+swap charge, drop the swap entry duplicate.
-                */
-               mem_cgroup_uncharge_swap(folio->swap, folio_nr_pages(folio));
-       }
+       if (!do_memsw_account())
+               return;
+
+       /*
+        * The swap entry might not get freed for a long time,
+        * let's not wait for it.  The page already received a
+        * memory+swap charge, drop the swap entry duplicate.
+        */
+       nr_pages = folio_nr_pages(folio);
+       ci = swap_cluster_get_and_lock(folio);
+       id = __swap_cgroup_clear(ci, swp_cluster_offset(folio->swap),
+                                nr_pages);
+       swap_cluster_unlock(ci);
+       mem_cgroup_uncharge_swap(id, nr_pages);
 }
+#endif
 
 void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
                           unsigned long nr_memory, int nid)
index 1b58b314cb186884d0a8937a6f126e74c207d717..beecfc6f376dc5347068a6b84711a32f3c56fc2a 100644 (file)
@@ -64,6 +64,7 @@
 #include <linux/sched/isolation.h>
 #include <linux/kmemleak.h>
 #include "internal.h"
+#include "swap_table.h"
 #include <net/sock.h>
 #include <net/ip.h>
 #include "slab.h"
@@ -5479,6 +5480,7 @@ int __init mem_cgroup_init(void)
 int __mem_cgroup_try_charge_swap(struct folio *folio)
 {
        unsigned int nr_pages = folio_nr_pages(folio);
+       struct swap_cluster_info *ci;
        struct page_counter *counter;
        struct mem_cgroup *memcg;
        struct obj_cgroup *objcg;
@@ -5512,22 +5514,23 @@ int __mem_cgroup_try_charge_swap(struct folio *folio)
        }
        mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
 
-       swap_cgroup_record(folio, mem_cgroup_private_id(memcg), folio->swap);
+       ci = swap_cluster_get_and_lock(folio);
+       __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_pages,
+                         mem_cgroup_private_id(memcg));
+       swap_cluster_unlock(ci);
 
        return 0;
 }
 
 /**
  * __mem_cgroup_uncharge_swap - uncharge swap space
- * @entry: swap entry to uncharge
+ * @id: cgroup id to uncharge
  * @nr_pages: the amount of swap space to uncharge
  */
-void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages)
 {
        struct mem_cgroup *memcg;
-       unsigned short id;
 
-       id = swap_cgroup_clear(entry, nr_pages);
        rcu_read_lock();
        memcg = mem_cgroup_from_private_id(id);
        if (memcg) {
index 8e57e943162461311e4c7772138e9daa7d17f36c..5b2f095fff6e2c3998d6f75cfac6a9107bd5b1c8 100644 (file)
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -5,6 +5,7 @@
 #include <linux/atomic.h> /* for atomic_long_t */
 struct mempolicy;
 struct swap_iocb;
+struct swap_memcg_table;
 
 extern int page_cluster;
 
@@ -38,6 +39,9 @@ struct swap_cluster_info {
        u8 order;
        atomic_long_t __rcu *table;     /* Swap table entries, see mm/swap_table.h */
        unsigned int *extend_table;     /* For large swap count, protected by ci->lock */
+#ifdef CONFIG_MEMCG
+       struct swap_memcg_table *memcg_table;   /* Swap table entries' cgroup record */
+#endif
        struct list_head list;
 };
 
index bdd949ae004485a0e0a95df062db3a0c8a0bdb71..873cb3f26337463e262e590a7036bc79685c10da 100644 (file)
@@ -179,21 +179,19 @@ static int __swap_cache_add_check(struct swap_cluster_info *ci,
        if (shadowp && swp_tb_is_shadow(old_tb))
                *shadowp = swp_tb_to_shadow(old_tb);
        if (memcg_id)
-               *memcg_id = lookup_swap_cgroup_id(targ_entry);
+               *memcg_id = __swap_cgroup_get(ci, ci_off);
 
        if (nr == 1)
                return 0;
 
-       targ_entry.val = round_down(targ_entry.val, nr);
        ci_off = round_down(ci_off, nr);
        ci_end = ci_off + nr;
        do {
                old_tb = __swap_table_get(ci, ci_off);
                if (unlikely(swp_tb_is_folio(old_tb) ||
                             !__swp_tb_get_count(old_tb) ||
-                            (memcg_id && *memcg_id != lookup_swap_cgroup_id(targ_entry))))
+                            (memcg_id && *memcg_id != __swap_cgroup_get(ci, ci_off))))
                        return -EBUSY;
-               targ_entry.val++;
        } while (++ci_off < ci_end);
 
        return 0;
index 8415ffbe2b9cf0ba0a35bd4c5b3631f72ac452eb..b4e1100f8296a648c620498edbdb75d8b6d112cc 100644 (file)
@@ -11,6 +11,11 @@ struct swap_table {
        atomic_long_t entries[SWAPFILE_CLUSTER];
 };
 
+/* For storing memcg private id */
+struct swap_memcg_table {
+       unsigned short id[SWAPFILE_CLUSTER];
+};
+
 #define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE)
 
 /*
@@ -247,4 +252,63 @@ static inline unsigned long swap_table_get(struct swap_cluster_info *ci,
 
        return swp_tb;
 }
+
+#ifdef CONFIG_MEMCG
+static inline void __swap_cgroup_set(struct swap_cluster_info *ci,
+               unsigned int ci_off, unsigned long nr, unsigned short id)
+{
+       lockdep_assert_held(&ci->lock);
+       VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER);
+       if (WARN_ON_ONCE(!ci->memcg_table))
+               return;
+       do {
+               ci->memcg_table->id[ci_off++] = id;
+       } while (--nr);
+}
+
+static inline unsigned short __swap_cgroup_get(struct swap_cluster_info *ci,
+                                              unsigned int ci_off)
+{
+       lockdep_assert_held(&ci->lock);
+       VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER);
+       if (unlikely(!ci->memcg_table))
+               return 0;
+       return ci->memcg_table->id[ci_off];
+}
+
+static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci,
+                                                unsigned int ci_off,
+                                                unsigned long nr)
+{
+       unsigned short old = __swap_cgroup_get(ci, ci_off);
+
+       if (!old)
+               return 0;
+       do {
+               VM_WARN_ON_ONCE(ci->memcg_table->id[ci_off] != old);
+               ci->memcg_table->id[ci_off++] = 0;
+       } while (--nr);
+
+       return old;
+}
+#else
+static inline void __swap_cgroup_set(struct swap_cluster_info *ci,
+               unsigned int ci_off, unsigned long nr, unsigned short id)
+{
+}
+
+static inline unsigned short __swap_cgroup_get(struct swap_cluster_info *ci,
+                                              unsigned int ci_off)
+{
+       return 0;
+}
+
+static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci,
+                                                unsigned int ci_off,
+                                                unsigned long nr)
+{
+       return 0;
+}
+#endif
+
 #endif
index 2ddabc0f3a888c4ee9bacff7cc13b8d6c1646dd2..bd141eb9ef10a73124ce8fb0379f4148a1443d0a 100644 (file)
@@ -423,7 +423,12 @@ static void swap_cluster_free_table(struct swap_cluster_info *ci)
 {
        struct swap_table *table;
 
-       table = (struct swap_table *)rcu_dereference_protected(ci->table, true);
+#ifdef CONFIG_MEMCG
+       kfree(ci->memcg_table);
+       ci->memcg_table = NULL;
+#endif
+
+       table = (struct swap_table *)rcu_access_pointer(ci->table);
        if (!table)
                return;
 
@@ -441,6 +446,7 @@ static int swap_cluster_alloc_table(struct swap_cluster_info *ci, gfp_t gfp)
 {
        struct swap_table *table = NULL;
        struct folio *folio;
+       int ret = 0;
 
        /* The cluster must be empty and not on any list during allocation. */
        VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));
@@ -458,7 +464,19 @@ static int swap_cluster_alloc_table(struct swap_cluster_info *ci, gfp_t gfp)
                return -ENOMEM;
 
        rcu_assign_pointer(ci->table, table);
-       return 0;
+
+#ifdef CONFIG_MEMCG
+       if (!mem_cgroup_disabled()) {
+               VM_WARN_ON_ONCE(ci->memcg_table);
+               ci->memcg_table = kzalloc_obj(*ci->memcg_table, gfp);
+               if (!ci->memcg_table)
+                       ret = -ENOMEM;
+       }
+#endif
+       if (ret)
+               swap_cluster_free_table(ci);
+
+       return ret;
 }
 
 /*
@@ -483,6 +501,7 @@ static void swap_cluster_assert_empty(struct swap_cluster_info *ci,
                        bad_slots++;
                else
                        WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
+               WARN_ON_ONCE(__swap_cgroup_get(ci, ci_off));
        } while (++ci_off < ci_end);
 
        WARN_ON_ONCE(bad_slots != (swapoff ? ci->count : 0));
@@ -1887,12 +1906,10 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
                                 unsigned int ci_start, unsigned int nr_pages)
 {
        unsigned long old_tb;
-       unsigned int type = si->type;
        unsigned short batch_id = 0, id_cur;
        unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages;
        unsigned long ci_head = cluster_offset(si, ci);
        unsigned int batch_off = ci_off;
-       swp_entry_t entry;
 
        VM_WARN_ON(ci->count < nr_pages);
 
@@ -1910,21 +1927,17 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
                 * Uncharge swap slots by memcg in batches. Consecutive
                 * slots with the same cgroup id are uncharged together.
                 */
-               entry = swp_entry(type, ci_head + ci_off);
-               id_cur = lookup_swap_cgroup_id(entry);
+               id_cur = __swap_cgroup_clear(ci, ci_off, 1);
                if (batch_id != id_cur) {
                        if (batch_id)
-                               mem_cgroup_uncharge_swap(swp_entry(type, ci_head + batch_off),
-                                                        ci_off - batch_off);
+                               mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
                        batch_id = id_cur;
                        batch_off = ci_off;
                }
        } while (++ci_off < ci_end);
 
-       if (batch_id) {
-               mem_cgroup_uncharge_swap(swp_entry(type, ci_head + batch_off),
-                                        ci_off - batch_off);
-       }
+       if (batch_id)
+               mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
 
        swap_range_free(si, ci_head + ci_start, nr_pages);
        swap_cluster_assert_empty(ci, ci_start, nr_pages, false);
index 3231af682fa739c1d7c2132f5cb28591e426a608..3c856a78c0a59eba4f4b62e532aec73b6e07a5d4 100644 (file)
@@ -739,7 +739,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
 
                if (reclaimed && !mapping_exiting(mapping))
                        shadow = workingset_eviction(folio, target_memcg);
-               __memcg1_swapout(folio);
+               __memcg1_swapout(folio, ci);
                __swap_cache_del_folio(ci, folio, swap, shadow);
                swap_cluster_unlock_irq(ci);
        } else {