mm/memcg, swap: store cgroup id in cluster table directly

author Kairui Song <kasong@tencent.com>
Sun, 17 May 2026 15:39:49 +0000 (23:39 +0800)

committer Andrew Morton <akpm@linux-foundation.org>
Tue, 2 Jun 2026 22:22:23 +0000 (15:22 -0700)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

index a013f37f24aa05ac704764ef3f3d00d17ff7162a..8f2662db166b48648ecb6f2269ed7b0f174d2c13 100644 (file)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -29,6 +29,7 @@ struct obj_cgroup;
  struct page;
  struct mm_struct;
  struct kmem_cache;
+struct swap_cluster_info;
  
  /* Cgroup-specific page state, on top of universal node page state */
  enum memcg_stat_item {
@@ -1899,9 +1900,6 @@ static inline void mem_cgroup_exit_user_fault(void)
         current->in_user_fault = 0;
  }
  
-void __memcg1_swapout(struct folio *folio);
-void memcg1_swapin(struct folio *folio);
-
  #else /* CONFIG_MEMCG_V1 */
  static inline
  unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
@@ -1929,14 +1927,23 @@ static inline void mem_cgroup_exit_user_fault(void)
  {
  }
  
-static inline void __memcg1_swapout(struct folio *folio)
+#endif /* CONFIG_MEMCG_V1 */
+
+#if defined(CONFIG_MEMCG_V1) && defined(CONFIG_SWAP)
+
+void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci);
+void memcg1_swapin(struct folio *folio);
+
+#else
+
+static inline void __memcg1_swapout(struct folio *folio,
+               struct swap_cluster_info *ci)
  {
  }
  
  static inline void memcg1_swapin(struct folio *folio)
  {
  }
-
-#endif /* CONFIG_MEMCG_V1 */
+#endif
  
  #endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h

index f907d3df52d0f0b13d531e311e4a8c8878e86a39..200e7c345f26abeaa0c768cf298047f8d9d163e2 100644 (file)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -579,12 +579,12 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio)
         return __mem_cgroup_try_charge_swap(folio);
  }
  
-extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
-static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+extern void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages);
+static inline void mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages)
  {
         if (mem_cgroup_disabled())
                 return;
-       __mem_cgroup_uncharge_swap(entry, nr_pages);
+       __mem_cgroup_uncharge_swap(id, nr_pages);
  }
  
  extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
@@ -595,7 +595,7 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio)
         return 0;
  }
  
-static inline void mem_cgroup_uncharge_swap(swp_entry_t entry,
+static inline void mem_cgroup_uncharge_swap(unsigned short id,
                                             unsigned int nr_pages)
  {
  }
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c

index 36c507d81dc514d75c737cb8980e8060415fbd7a..517b21236672bcac1f5b749bc81a88618a33ddc6 100644 (file)
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -14,6 +14,7 @@
  
  #include "internal.h"
  #include "swap.h"
+#include "swap_table.h"
  #include "memcontrol-v1.h"
  
  /*
@@ -603,17 +604,19 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
         local_irq_restore(flags);
  }
  
+#ifdef CONFIG_SWAP
  /**
   * __memcg1_swapout - transfer a memsw charge to swap
   * @folio: folio whose memsw charge to transfer
+ * @ci: the locked swap cluster holding the swap entries
   *
   * Transfer the memsw charge of @folio to the swap entry stored in
   * folio->swap.
   *
- * Context: folio must be isolated, unmapped, locked and is just about
- * to be freed, and caller must disable IRQs.
+ * Context: folio must be isolated, unmapped, locked and is just about to
+ * be freed, and caller must disable IRQs and hold the swap cluster lock.
   */
-void __memcg1_swapout(struct folio *folio)
+void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci)
  {
         struct mem_cgroup *memcg, *swap_memcg;
         struct obj_cgroup *objcg;
@@ -646,7 +649,8 @@ void __memcg1_swapout(struct folio *folio)
         swap_memcg = mem_cgroup_private_id_get_online(memcg, nr_entries);
         mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
  
-       swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), folio->swap);
+       __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_entries,
+                         mem_cgroup_private_id(swap_memcg));
  
         folio_unqueue_deferred_split(folio);
         folio->memcg_data = 0;
@@ -661,8 +665,7 @@ void __memcg1_swapout(struct folio *folio)
         }
  
         /*
-        * Interrupts should be disabled here because the caller holds the
-        * i_pages lock which is taken with interrupts-off. It is
+        * The caller must hold the swap cluster lock with IRQ off. It is
          * important here to have the interrupts disabled because it is the
          * only synchronisation we have for updating the per-CPU variables.
          */
@@ -677,7 +680,7 @@ void __memcg1_swapout(struct folio *folio)
  }
  
  /**
- * memcg1_swapin - uncharge swap slot
+ * memcg1_swapin - uncharge swap slot on swapin
   * @folio: folio being swapped in
   *
   * Call this function after successfully adding the charged
@@ -687,6 +690,10 @@ void __memcg1_swapout(struct folio *folio)
   */
  void memcg1_swapin(struct folio *folio)
  {
+       struct swap_cluster_info *ci;
+       unsigned long nr_pages;
+       unsigned short id;
+
         VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
         VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
  
@@ -702,15 +709,22 @@ void memcg1_swapin(struct folio *folio)
          * correspond 1:1 to page and swap slot lifetimes: we charge the
          * page to memory here, and uncharge swap when the slot is freed.
          */
-       if (do_memsw_account()) {
-               /*
-                * The swap entry might not get freed for a long time,
-                * let's not wait for it.  The page already received a
-                * memory+swap charge, drop the swap entry duplicate.
-                */
-               mem_cgroup_uncharge_swap(folio->swap, folio_nr_pages(folio));
-       }
+       if (!do_memsw_account())
+               return;
+
+       /*
+        * The swap entry might not get freed for a long time,
+        * let's not wait for it.  The page already received a
+        * memory+swap charge, drop the swap entry duplicate.
+        */
+       nr_pages = folio_nr_pages(folio);
+       ci = swap_cluster_get_and_lock(folio);
+       id = __swap_cgroup_clear(ci, swp_cluster_offset(folio->swap),
+                                nr_pages);
+       swap_cluster_unlock(ci);
+       mem_cgroup_uncharge_swap(id, nr_pages);
  }
+#endif
  
  void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
                            unsigned long nr_memory, int nid)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 1b58b314cb186884d0a8937a6f126e74c207d717..beecfc6f376dc5347068a6b84711a32f3c56fc2a 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -64,6 +64,7 @@
  #include <linux/sched/isolation.h>
  #include <linux/kmemleak.h>
  #include "internal.h"
+#include "swap_table.h"
  #include <net/sock.h>
  #include <net/ip.h>
  #include "slab.h"
@@ -5479,6 +5480,7 @@ int __init mem_cgroup_init(void)
  int __mem_cgroup_try_charge_swap(struct folio *folio)
  {
         unsigned int nr_pages = folio_nr_pages(folio);
+       struct swap_cluster_info *ci;
         struct page_counter *counter;
         struct mem_cgroup *memcg;
         struct obj_cgroup *objcg;
@@ -5512,22 +5514,23 @@ int __mem_cgroup_try_charge_swap(struct folio *folio)
         }
         mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
  
-       swap_cgroup_record(folio, mem_cgroup_private_id(memcg), folio->swap);
+       ci = swap_cluster_get_and_lock(folio);
+       __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_pages,
+                         mem_cgroup_private_id(memcg));
+       swap_cluster_unlock(ci);
  
         return 0;
  }
  
  /**
   * __mem_cgroup_uncharge_swap - uncharge swap space
- * @entry: swap entry to uncharge
+ * @id: cgroup id to uncharge
   * @nr_pages: the amount of swap space to uncharge
   */
-void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages)
  {
         struct mem_cgroup *memcg;
-       unsigned short id;
  
-       id = swap_cgroup_clear(entry, nr_pages);
         rcu_read_lock();
         memcg = mem_cgroup_from_private_id(id);
         if (memcg) {
diff --git a/mm/swap.h b/mm/swap.h

index 8e57e943162461311e4c7772138e9daa7d17f36c..5b2f095fff6e2c3998d6f75cfac6a9107bd5b1c8 100644 (file)
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -5,6 +5,7 @@
  #include <linux/atomic.h> /* for atomic_long_t */
  struct mempolicy;
  struct swap_iocb;
+struct swap_memcg_table;
  
  extern int page_cluster;
  
@@ -38,6 +39,9 @@ struct swap_cluster_info {
         u8 order;
         atomic_long_t __rcu *table;     /* Swap table entries, see mm/swap_table.h */
         unsigned int *extend_table;     /* For large swap count, protected by ci->lock */
+#ifdef CONFIG_MEMCG
+       struct swap_memcg_table *memcg_table;   /* Swap table entries' cgroup record */
+#endif
         struct list_head list;
  };
  
diff --git a/mm/swap_state.c b/mm/swap_state.c

index bdd949ae004485a0e0a95df062db3a0c8a0bdb71..873cb3f26337463e262e590a7036bc79685c10da 100644 (file)
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -179,21 +179,19 @@ static int __swap_cache_add_check(struct swap_cluster_info *ci,
         if (shadowp && swp_tb_is_shadow(old_tb))
                 *shadowp = swp_tb_to_shadow(old_tb);
         if (memcg_id)
-               *memcg_id = lookup_swap_cgroup_id(targ_entry);
+               *memcg_id = __swap_cgroup_get(ci, ci_off);
  
         if (nr == 1)
                 return 0;
  
-       targ_entry.val = round_down(targ_entry.val, nr);
         ci_off = round_down(ci_off, nr);
         ci_end = ci_off + nr;
         do {
                 old_tb = __swap_table_get(ci, ci_off);
                 if (unlikely(swp_tb_is_folio(old_tb) ||
                              !__swp_tb_get_count(old_tb) ||
-                            (memcg_id && *memcg_id != lookup_swap_cgroup_id(targ_entry))))
+                            (memcg_id && *memcg_id != __swap_cgroup_get(ci, ci_off))))
                         return -EBUSY;
-               targ_entry.val++;
         } while (++ci_off < ci_end);
  
         return 0;
diff --git a/mm/swap_table.h b/mm/swap_table.h

index 8415ffbe2b9cf0ba0a35bd4c5b3631f72ac452eb..b4e1100f8296a648c620498edbdb75d8b6d112cc 100644 (file)
--- a/mm/swap_table.h
+++ b/mm/swap_table.h
@@ -11,6 +11,11 @@ struct swap_table {
         atomic_long_t entries[SWAPFILE_CLUSTER];
  };
  
+/* For storing memcg private id */
+struct swap_memcg_table {
+       unsigned short id[SWAPFILE_CLUSTER];
+};
+
  #define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE)
  
  /*
@@ -247,4 +252,63 @@ static inline unsigned long swap_table_get(struct swap_cluster_info *ci,
  
         return swp_tb;
  }
+
+#ifdef CONFIG_MEMCG
+static inline void __swap_cgroup_set(struct swap_cluster_info *ci,
+               unsigned int ci_off, unsigned long nr, unsigned short id)
+{
+       lockdep_assert_held(&ci->lock);
+       VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER);
+       if (WARN_ON_ONCE(!ci->memcg_table))
+               return;
+       do {
+               ci->memcg_table->id[ci_off++] = id;
+       } while (--nr);
+}
+
+static inline unsigned short __swap_cgroup_get(struct swap_cluster_info *ci,
+                                              unsigned int ci_off)
+{
+       lockdep_assert_held(&ci->lock);
+       VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER);
+       if (unlikely(!ci->memcg_table))
+               return 0;
+       return ci->memcg_table->id[ci_off];
+}
+
+static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci,
+                                                unsigned int ci_off,
+                                                unsigned long nr)
+{
+       unsigned short old = __swap_cgroup_get(ci, ci_off);
+
+       if (!old)
+               return 0;
+       do {
+               VM_WARN_ON_ONCE(ci->memcg_table->id[ci_off] != old);
+               ci->memcg_table->id[ci_off++] = 0;
+       } while (--nr);
+
+       return old;
+}
+#else
+static inline void __swap_cgroup_set(struct swap_cluster_info *ci,
+               unsigned int ci_off, unsigned long nr, unsigned short id)
+{
+}
+
+static inline unsigned short __swap_cgroup_get(struct swap_cluster_info *ci,
+                                              unsigned int ci_off)
+{
+       return 0;
+}
+
+static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci,
+                                                unsigned int ci_off,
+                                                unsigned long nr)
+{
+       return 0;
+}
+#endif
+
  #endif
diff --git a/mm/swapfile.c b/mm/swapfile.c

index 2ddabc0f3a888c4ee9bacff7cc13b8d6c1646dd2..bd141eb9ef10a73124ce8fb0379f4148a1443d0a 100644 (file)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -423,7 +423,12 @@ static void swap_cluster_free_table(struct swap_cluster_info *ci)
  {
         struct swap_table *table;
  
-       table = (struct swap_table *)rcu_dereference_protected(ci->table, true);
+#ifdef CONFIG_MEMCG
+       kfree(ci->memcg_table);
+       ci->memcg_table = NULL;
+#endif
+
+       table = (struct swap_table *)rcu_access_pointer(ci->table);
         if (!table)
                 return;
  
@@ -441,6 +446,7 @@ static int swap_cluster_alloc_table(struct swap_cluster_info *ci, gfp_t gfp)
  {
         struct swap_table *table = NULL;
         struct folio *folio;
+       int ret = 0;
  
         /* The cluster must be empty and not on any list during allocation. */
         VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));
@@ -458,7 +464,19 @@ static int swap_cluster_alloc_table(struct swap_cluster_info *ci, gfp_t gfp)
                 return -ENOMEM;
  
         rcu_assign_pointer(ci->table, table);
-       return 0;
+
+#ifdef CONFIG_MEMCG
+       if (!mem_cgroup_disabled()) {
+               VM_WARN_ON_ONCE(ci->memcg_table);
+               ci->memcg_table = kzalloc_obj(*ci->memcg_table, gfp);
+               if (!ci->memcg_table)
+                       ret = -ENOMEM;
+       }
+#endif
+       if (ret)
+               swap_cluster_free_table(ci);
+
+       return ret;
  }
  
  /*
@@ -483,6 +501,7 @@ static void swap_cluster_assert_empty(struct swap_cluster_info *ci,
                         bad_slots++;
                 else
                         WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
+               WARN_ON_ONCE(__swap_cgroup_get(ci, ci_off));
         } while (++ci_off < ci_end);
  
         WARN_ON_ONCE(bad_slots != (swapoff ? ci->count : 0));
@@ -1887,12 +1906,10 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
                                  unsigned int ci_start, unsigned int nr_pages)
  {
         unsigned long old_tb;
-       unsigned int type = si->type;
         unsigned short batch_id = 0, id_cur;
         unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages;
         unsigned long ci_head = cluster_offset(si, ci);
         unsigned int batch_off = ci_off;
-       swp_entry_t entry;
  
         VM_WARN_ON(ci->count < nr_pages);
  
@@ -1910,21 +1927,17 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
                  * Uncharge swap slots by memcg in batches. Consecutive
                  * slots with the same cgroup id are uncharged together.
                  */
-               entry = swp_entry(type, ci_head + ci_off);
-               id_cur = lookup_swap_cgroup_id(entry);
+               id_cur = __swap_cgroup_clear(ci, ci_off, 1);
                 if (batch_id != id_cur) {
                         if (batch_id)
-                               mem_cgroup_uncharge_swap(swp_entry(type, ci_head + batch_off),
-                                                        ci_off - batch_off);
+                               mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
                         batch_id = id_cur;
                         batch_off = ci_off;
                 }
         } while (++ci_off < ci_end);
  
-       if (batch_id) {
-               mem_cgroup_uncharge_swap(swp_entry(type, ci_head + batch_off),
-                                        ci_off - batch_off);
-       }
+       if (batch_id)
+               mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
  
         swap_range_free(si, ci_head + ci_start, nr_pages);
         swap_cluster_assert_empty(ci, ci_start, nr_pages, false);
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 3231af682fa739c1d7c2132f5cb28591e426a608..3c856a78c0a59eba4f4b62e532aec73b6e07a5d4 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -739,7 +739,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
  
                 if (reclaimed && !mapping_exiting(mapping))
                         shadow = workingset_eviction(folio, target_memcg);
-               __memcg1_swapout(folio);
+               __memcg1_swapout(folio, ci);
                 __swap_cache_del_folio(ci, folio, swap, shadow);
                 swap_cluster_unlock_irq(ci);
         } else {
author	Kairui Song <kasong@tencent.com>
	Sun, 17 May 2026 15:39:49 +0000 (23:39 +0800)
committer	Andrew Morton <akpm@linux-foundation.org>
	Tue, 2 Jun 2026 22:22:23 +0000 (15:22 -0700)
include/linux/memcontrol.h		patch | blob | blame | history
include/linux/swap.h		patch | blob | blame | history
mm/memcontrol-v1.c		patch | blob | blame | history
mm/memcontrol.c		patch | blob | blame | history
mm/swap.h		patch | blob | blame | history
mm/swap_state.c		patch | blob | blame | history
mm/swap_table.h		patch | blob | blame | history
mm/swapfile.c		patch | blob | blame | history
mm/vmscan.c		patch | blob | blame | history