git.ipfire.org Git - thirdparty/linux.git/commitdiff
mm: switch deferred split shrinker to list_lru
author: Johannes Weiner <hannes@cmpxchg.org>
Wed, 27 May 2026 20:45:16 +0000 (16:45 -0400)
committer: Andrew Morton <akpm@linux-foundation.org>
Tue, 9 Jun 2026 01:21:25 +0000 (18:21 -0700)
The deferred split queue handles cgroups in a suboptimal fashion.  The
queue is per-NUMA node or per-cgroup, not the intersection.  That means on
a cgrouped system, a node-restricted allocation entering reclaim can end
up splitting large pages on other nodes:

        alloc/unmap
          deferred_split_folio()
            list_add_tail(memcg->split_queue)
            set_shrinker_bit(memcg, node, deferred_shrinker_id)

        for_each_zone_zonelist_nodemask(restricted_nodes)
          mem_cgroup_iter()
            shrink_slab(node, memcg)
              shrink_slab_memcg(node, memcg)
                if test_shrinker_bit(memcg, node, deferred_shrinker_id)
                  deferred_split_scan()
                    walks memcg->split_queue

The shrinker bit adds an imperfect guard rail.  As soon as the cgroup has
a single large page on the node of interest, all large pages owned by that
memcg, including those on other nodes, will be split.

list_lru properly sets up per-node, per-cgroup lists.  As a bonus, it
streamlines a lot of the list operations and reclaim walks.  It's used
widely by other major shrinkers already.  Convert the deferred split queue
as well.

The list_lru per-memcg heads are instantiated on demand when the first
object of interest is allocated for a cgroup, by calling
folio_memcg_alloc_deferred().  Add calls to where splittable pages are
created: anon faults, swapin faults, khugepaged collapse.

These calls create all possible node heads for the cgroup at once, so the
migration code (between nodes) doesn't need any special care.

[akpm@linux-foundation.org: fix build with CONFIG_TRANSPARENT_HUGEPAGE=n]
Link: https://lore.kernel.org/202605281620.lc3rtkBm-lkp@intel.com
[hannes@cmpxchg.org: fix cgroup.memory=nokmem handling]
Link: https://lore.kernel.org/ah9PGv12mqai84ES@cmpxchg.org
Link: https://lore.kernel.org/20260527204757.2544958-10-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
Tested-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
Acked-by: Usama Arif <usama.arif@linux.dev>
Reviewed-by: Kairui Song <kasong@tencent.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: David Hildenbrand (Arm) <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam R. Howlett <liam@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nico Pache <npache@redhat.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/huge_mm.h
include/linux/memcontrol.h
include/linux/mmzone.h
mm/huge_memory.c
mm/internal.h
mm/khugepaged.c
mm/memcontrol.c
mm/memory.c
mm/mm_init.c
mm/swap_state.c

index 58382e97a66d3f97f8848f70836bb5c161300f74..c0d223d0c556c8caab5f32608efa7a152e10a050 100644 (file)
@@ -439,10 +439,10 @@ static inline int split_huge_page(struct page *page)
 {
        return split_huge_page_to_list_to_order(page, NULL, 0);
 }
+
+int folio_memcg_alloc_deferred(struct folio *folio);
+
 void deferred_split_folio(struct folio *folio, bool partially_mapped);
-#ifdef CONFIG_MEMCG
-void reparent_deferred_split_queue(struct mem_cgroup *memcg);
-#endif
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long address, bool freeze);
@@ -679,8 +679,15 @@ static inline int try_folio_split_to_order(struct folio *folio,
        return -EINVAL;
 }
 
-static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
-static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
+static inline int folio_memcg_alloc_deferred(struct folio *folio)
+{
+       return 0;
+}
+
+static inline void deferred_split_folio(struct folio *folio, bool partially_mapped)
+{
+}
+
 #define split_huge_pmd(__vma, __pmd, __address)        \
        do { } while (0)
 
index 8f2662db166b48648ecb6f2269ed7b0f174d2c13..e1f46a0016fcfd0269552795ca47ef696f59ab1f 100644 (file)
@@ -278,10 +278,6 @@ struct mem_cgroup {
        struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
 #endif
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       struct deferred_split deferred_split_queue;
-#endif
-
 #ifdef CONFIG_LRU_GEN_WALKS_MMU
        /* per-memcg mm_struct list */
        struct lru_gen_mm_list mm_list;
index 1331a7b93f33c67c6e07df1fd8c5e4504dc28e80..8e449f524f266a8ca95d39a6ffc641bde4b7f07c 100644 (file)
@@ -1431,14 +1431,6 @@ struct zonelist {
  */
 extern struct page *mem_map;
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-struct deferred_split {
-       spinlock_t split_queue_lock;
-       struct list_head split_queue;
-       unsigned long split_queue_len;
-};
-#endif
-
 #ifdef CONFIG_MEMORY_FAILURE
 /*
  * Per NUMA node memory failure handling statistics.
@@ -1564,10 +1556,6 @@ typedef struct pglist_data {
        unsigned long first_deferred_pfn;
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       struct deferred_split deferred_split_queue;
-#endif
-
 #ifdef CONFIG_NUMA_BALANCING
        /* start time in ms of current promote rate limit period */
        unsigned int nbp_rl_start;
index 1f14c5c48b4a43db6870063720c7552dc8c3ccc6..6927f66b2eb2c43e5b8061382c683ffe385dcf27 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/list_lru.h>
 #include <linux/shrinker.h>
 #include <linux/mm_inline.h>
 #include <linux/swapops.h>
@@ -67,6 +68,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 
+static struct lock_class_key deferred_split_key;
+static struct list_lru deferred_split_lru;
 static struct shrinker *deferred_split_shrinker;
 static unsigned long deferred_split_count(struct shrinker *shrink,
                                          struct shrink_control *sc);
@@ -932,15 +935,28 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
 }
 #endif /* CONFIG_SYSFS */
 
+int folio_memcg_alloc_deferred(struct folio *folio)
+{
+       if (mem_cgroup_disabled())
+               return 0;
+       return folio_memcg_list_lru_alloc(folio, &deferred_split_lru, GFP_KERNEL);
+}
+
 static int __init thp_shrinker_init(void)
 {
        deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
-                                                SHRINKER_MEMCG_AWARE |
-                                                SHRINKER_NONSLAB,
+                                                SHRINKER_MEMCG_AWARE,
                                                 "thp-deferred_split");
        if (!deferred_split_shrinker)
                return -ENOMEM;
 
+       if (list_lru_init_memcg_key(&deferred_split_lru,
+                                   deferred_split_shrinker,
+                                   &deferred_split_key)) {
+               shrinker_free(deferred_split_shrinker);
+               return -ENOMEM;
+       }
+
        deferred_split_shrinker->count_objects = deferred_split_count;
        deferred_split_shrinker->scan_objects = deferred_split_scan;
        shrinker_register(deferred_split_shrinker);
@@ -962,6 +978,7 @@ static int __init thp_shrinker_init(void)
        huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
        if (!huge_zero_folio_shrinker) {
                shrinker_free(deferred_split_shrinker);
+               list_lru_destroy(&deferred_split_lru);
                return -ENOMEM;
        }
 
@@ -976,6 +993,7 @@ static void __init thp_shrinker_exit(void)
 {
        shrinker_free(huge_zero_folio_shrinker);
        shrinker_free(deferred_split_shrinker);
+       list_lru_destroy(&deferred_split_lru);
 }
 
 static int __init hugepage_init(void)
@@ -1155,119 +1173,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
        return pmd;
 }
 
-static struct deferred_split *split_queue_node(int nid)
-{
-       struct pglist_data *pgdata = NODE_DATA(nid);
-
-       return &pgdata->deferred_split_queue;
-}
-
-#ifdef CONFIG_MEMCG
-static inline
-struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
-                                          struct deferred_split *queue)
-{
-       if (mem_cgroup_disabled())
-               return NULL;
-       if (split_queue_node(folio_nid(folio)) == queue)
-               return NULL;
-       return container_of(queue, struct mem_cgroup, deferred_split_queue);
-}
-
-static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
-{
-       return memcg ? &memcg->deferred_split_queue : split_queue_node(nid);
-}
-#else
-static inline
-struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
-                                          struct deferred_split *queue)
-{
-       return NULL;
-}
-
-static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
-{
-       return split_queue_node(nid);
-}
-#endif
-
-static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg)
-{
-       struct deferred_split *queue;
-
-retry:
-       queue = memcg_split_queue(nid, memcg);
-       spin_lock(&queue->split_queue_lock);
-       /*
-        * There is a period between setting memcg to dying and reparenting
-        * deferred split queue, and during this period the THPs in the deferred
-        * split queue will be hidden from the shrinker side.
-        */
-       if (unlikely(memcg_is_dying(memcg))) {
-               spin_unlock(&queue->split_queue_lock);
-               memcg = parent_mem_cgroup(memcg);
-               goto retry;
-       }
-
-       return queue;
-}
-
-static struct deferred_split *
-split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags)
-{
-       struct deferred_split *queue;
-
-retry:
-       queue = memcg_split_queue(nid, memcg);
-       spin_lock_irqsave(&queue->split_queue_lock, *flags);
-       if (unlikely(memcg_is_dying(memcg))) {
-               spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
-               memcg = parent_mem_cgroup(memcg);
-               goto retry;
-       }
-
-       return queue;
-}
-
-static struct deferred_split *folio_split_queue_lock(struct folio *folio)
-{
-       struct deferred_split *queue;
-
-       rcu_read_lock();
-       queue = split_queue_lock(folio_nid(folio), folio_memcg(folio));
-       /*
-        * The memcg destruction path is acquiring the split queue lock for
-        * reparenting. Once you have it locked, it's safe to drop the rcu lock.
-        */
-       rcu_read_unlock();
-
-       return queue;
-}
-
-static struct deferred_split *
-folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
-{
-       struct deferred_split *queue;
-
-       rcu_read_lock();
-       queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
-       rcu_read_unlock();
-
-       return queue;
-}
-
-static inline void split_queue_unlock(struct deferred_split *queue)
-{
-       spin_unlock(&queue->split_queue_lock);
-}
-
-static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
-                                                unsigned long flags)
-{
-       spin_unlock_irqrestore(&queue->split_queue_lock, flags);
-}
-
 static inline bool is_transparent_hugepage(const struct folio *folio)
 {
        if (!folio_test_large(folio))
@@ -1368,6 +1273,14 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
                count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
                return NULL;
        }
+
+       if (folio_memcg_alloc_deferred(folio)) {
+               folio_put(folio);
+               count_vm_event(THP_FAULT_FALLBACK);
+               count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+               return NULL;
+       }
+
        folio_throttle_swaprate(folio, gfp);
 
        /*
@@ -3903,34 +3816,43 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
        struct folio *end_folio = folio_next(folio);
        struct folio *new_folio, *next;
        int old_order = folio_order(folio);
+       struct list_lru_one *lru;
+       bool dequeue_deferred;
        int ret = 0;
-       struct deferred_split *ds_queue;
 
        VM_WARN_ON_ONCE(!mapping && end);
-       /* Prevent deferred_split_scan() touching ->_refcount */
-       ds_queue = folio_split_queue_lock(folio);
+       /*
+        * If this folio can be on the deferred split queue, lock out
+        * the shrinker before freezing the ref. If the shrinker sees
+        * a 0-ref folio, it assumes it beat folio_put() to the list
+        * lock and must clean up the LRU state - the same dequeue we
+        * will do below as part of the split.
+        */
+       dequeue_deferred = folio_test_anon(folio) && old_order > 1;
+       if (dequeue_deferred) {
+               struct mem_cgroup *memcg;
+
+               rcu_read_lock();
+               memcg = folio_memcg(folio);
+               lru = list_lru_lock(&deferred_split_lru,
+                                   folio_nid(folio), &memcg);
+       }
        if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
                struct swap_cluster_info *ci = NULL;
                struct lruvec *lruvec;
 
-               if (old_order > 1) {
-                       if (!list_empty(&folio->_deferred_list)) {
-                               ds_queue->split_queue_len--;
-                               /*
-                                * Reinitialize page_deferred_list after removing the
-                                * page from the split_queue, otherwise a subsequent
-                                * split will see list corruption when checking the
-                                * page_deferred_list.
-                                */
-                               list_del_init(&folio->_deferred_list);
-                       }
+               if (dequeue_deferred) {
+                       __list_lru_del(&deferred_split_lru, lru,
+                                      &folio->_deferred_list, folio_nid(folio));
                        if (folio_test_partially_mapped(folio)) {
                                folio_clear_partially_mapped(folio);
                                mod_mthp_stat(old_order,
                                        MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
                        }
+                       list_lru_unlock(lru);
+                       rcu_read_unlock();
                }
-               split_queue_unlock(ds_queue);
+
                if (mapping) {
                        int nr = folio_nr_pages(folio);
 
@@ -4031,7 +3953,10 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
                if (ci)
                        swap_cluster_unlock(ci);
        } else {
-               split_queue_unlock(ds_queue);
+               if (dequeue_deferred) {
+                       list_lru_unlock(lru);
+                       rcu_read_unlock();
+               }
                return -EAGAIN;
        }
 
@@ -4397,33 +4322,37 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
  * queueing THP splits, and that list is (racily observed to be) non-empty.
  *
  * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
- * zero: because even when split_queue_lock is held, a non-empty _deferred_list
- * might be in use on deferred_split_scan()'s unlocked on-stack list.
+ * zero: because even when the list_lru lock is held, a non-empty
+ * _deferred_list might be in use on deferred_split_scan()'s unlocked
+ * on-stack list.
  *
- * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
- * therefore important to unqueue deferred split before changing folio memcg.
+ * The list_lru sublist is determined by folio's memcg: it is therefore
+ * important to unqueue deferred split before changing folio memcg.
  */
 bool __folio_unqueue_deferred_split(struct folio *folio)
 {
-       struct deferred_split *ds_queue;
+       struct mem_cgroup *memcg;
+       struct list_lru_one *lru;
+       int nid = folio_nid(folio);
        unsigned long flags;
        bool unqueued = false;
 
        WARN_ON_ONCE(folio_ref_count(folio));
        WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
 
-       ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
-       if (!list_empty(&folio->_deferred_list)) {
-               ds_queue->split_queue_len--;
+       rcu_read_lock();
+       memcg = folio_memcg(folio);
+       lru = list_lru_lock_irqsave(&deferred_split_lru, nid, &memcg, &flags);
+       if (__list_lru_del(&deferred_split_lru, lru, &folio->_deferred_list, nid)) {
                if (folio_test_partially_mapped(folio)) {
                        folio_clear_partially_mapped(folio);
                        mod_mthp_stat(folio_order(folio),
                                      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
                }
-               list_del_init(&folio->_deferred_list);
                unqueued = true;
        }
-       split_queue_unlock_irqrestore(ds_queue, flags);
+       list_lru_unlock_irqrestore(lru, &flags);
+       rcu_read_unlock();
 
        return unqueued;        /* useful for debug warnings */
 }
@@ -4431,7 +4360,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
 /* partially_mapped=false won't clear PG_partially_mapped folio flag */
 void deferred_split_folio(struct folio *folio, bool partially_mapped)
 {
-       struct deferred_split *ds_queue;
+       struct list_lru_one *lru;
+       int nid;
+       struct mem_cgroup *memcg;
        unsigned long flags;
 
        /*
@@ -4454,7 +4385,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
        if (folio_test_swapcache(folio))
                return;
 
-       ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
+       nid = folio_nid(folio);
+
+       rcu_read_lock();
+       memcg = folio_memcg(folio);
+       lru = list_lru_lock_irqsave(&deferred_split_lru, nid, &memcg, &flags);
        if (partially_mapped) {
                if (!folio_test_partially_mapped(folio)) {
                        folio_set_partially_mapped(folio);
@@ -4462,36 +4397,20 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
                                count_vm_event(THP_DEFERRED_SPLIT_PAGE);
                        count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
                        mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
-
                }
        } else {
                /* partially mapped folios cannot become non-partially mapped */
                VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
        }
-       if (list_empty(&folio->_deferred_list)) {
-               struct mem_cgroup *memcg;
-
-               memcg = folio_split_queue_memcg(folio, ds_queue);
-               list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
-               ds_queue->split_queue_len++;
-               if (memcg)
-                       set_shrinker_bit(memcg, folio_nid(folio),
-                                        shrinker_id(deferred_split_shrinker));
-       }
-       split_queue_unlock_irqrestore(ds_queue, flags);
+       __list_lru_add(&deferred_split_lru, lru, &folio->_deferred_list, nid, memcg);
+       list_lru_unlock_irqrestore(lru, &flags);
+       rcu_read_unlock();
 }
 
 static unsigned long deferred_split_count(struct shrinker *shrink,
                struct shrink_control *sc)
 {
-       struct pglist_data *pgdata = NODE_DATA(sc->nid);
-       struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
-
-#ifdef CONFIG_MEMCG
-       if (sc->memcg)
-               ds_queue = &sc->memcg->deferred_split_queue;
-#endif
-       return READ_ONCE(ds_queue->split_queue_len);
+       return list_lru_shrink_count(&deferred_split_lru, sc);
 }
 
 static bool thp_underused(struct folio *folio)
@@ -4521,45 +4440,49 @@ static bool thp_underused(struct folio *folio)
        return false;
 }
 
+static enum lru_status deferred_split_isolate(struct list_head *item,
+                                             struct list_lru_one *lru,
+                                             void *cb_arg)
+{
+       struct folio *folio = container_of(item, struct folio, _deferred_list);
+       struct list_head *freeable = cb_arg;
+
+       if (folio_try_get(folio)) {
+               list_lru_isolate_move(lru, item, freeable);
+               return LRU_REMOVED;
+       }
+
+       /*
+        * We lost race with folio_put(). Read folio state before the
+        * isolate: folio_unqueue_deferred_split() checks list_empty()
+        * locklessly, so once removed the folio can be freed any time.
+        */
+       if (folio_test_partially_mapped(folio)) {
+               folio_clear_partially_mapped(folio);
+               mod_mthp_stat(folio_order(folio),
+                             MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
+       }
+       list_lru_isolate(lru, item);
+       return LRU_REMOVED;
+}
+
 static unsigned long deferred_split_scan(struct shrinker *shrink,
                struct shrink_control *sc)
 {
-       struct deferred_split *ds_queue;
-       unsigned long flags;
+       LIST_HEAD(dispose);
        struct folio *folio, *next;
-       int split = 0, i;
-       struct folio_batch fbatch;
-
-       folio_batch_init(&fbatch);
+       int split = 0;
+       unsigned long isolated;
 
-retry:
-       ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
-       /* Take pin on all head pages to avoid freeing them under us */
-       list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
-                                                       _deferred_list) {
-               if (folio_try_get(folio)) {
-                       folio_batch_add(&fbatch, folio);
-               } else if (folio_test_partially_mapped(folio)) {
-                       /* We lost race with folio_put() */
-                       folio_clear_partially_mapped(folio);
-                       mod_mthp_stat(folio_order(folio),
-                                     MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
-               }
-               list_del_init(&folio->_deferred_list);
-               ds_queue->split_queue_len--;
-               if (!--sc->nr_to_scan)
-                       break;
-               if (!folio_batch_space(&fbatch))
-                       break;
-       }
-       split_queue_unlock_irqrestore(ds_queue, flags);
+       isolated = list_lru_shrink_walk_irq(&deferred_split_lru, sc,
+                                           deferred_split_isolate, &dispose);
 
-       for (i = 0; i < folio_batch_count(&fbatch); i++) {
+       list_for_each_entry_safe(folio, next, &dispose, _deferred_list) {
                bool did_split = false;
                bool underused = false;
-               struct deferred_split *fqueue;
 
-               folio = fbatch.folios[i];
+               list_del_init(&folio->_deferred_list);
+
                if (!folio_test_partially_mapped(folio)) {
                        /*
                         * See try_to_map_unused_to_zeropage(): we cannot
@@ -4588,63 +4511,23 @@ next:
                 * underused, then consider it used and don't add it back to
                 * split_queue.
                 */
-               if (did_split || !folio_test_partially_mapped(folio))
-                       continue;
+               if (!did_split && folio_test_partially_mapped(folio)) {
 requeue:
-               /*
-                * Add back partially mapped folios, or underused folios that
-                * we could not lock this round.
-                */
-               fqueue = folio_split_queue_lock_irqsave(folio, &flags);
-               if (list_empty(&folio->_deferred_list)) {
-                       list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
-                       fqueue->split_queue_len++;
+                       rcu_read_lock();
+                       list_lru_add_irq(&deferred_split_lru,
+                                        &folio->_deferred_list,
+                                        folio_nid(folio),
+                                        folio_memcg(folio));
+                       rcu_read_unlock();
                }
-               split_queue_unlock_irqrestore(fqueue, flags);
-       }
-       folios_put(&fbatch);
-
-       if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) {
-               cond_resched();
-               goto retry;
+               folio_put(folio);
        }
 
-       /*
-        * Stop shrinker if we didn't split any page, but the queue is empty.
-        * This can happen if pages were freed under us.
-        */
-       if (!split && list_empty(&ds_queue->split_queue))
+       if (!split && !isolated)
                return SHRINK_STOP;
        return split;
 }
 
-#ifdef CONFIG_MEMCG
-void reparent_deferred_split_queue(struct mem_cgroup *memcg)
-{
-       struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-       struct deferred_split *ds_queue = &memcg->deferred_split_queue;
-       struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
-       int nid;
-
-       spin_lock_irq(&ds_queue->split_queue_lock);
-       spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
-
-       if (!ds_queue->split_queue_len)
-               goto unlock;
-
-       list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
-       parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
-       ds_queue->split_queue_len = 0;
-
-       for_each_node(nid)
-               set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
-
-unlock:
-       spin_unlock(&parent_ds_queue->split_queue_lock);
-       spin_unlock_irq(&ds_queue->split_queue_lock);
-}
-#endif
-
 #ifdef CONFIG_DEBUG_FS
 static void split_huge_pages_all(void)
 {
index 5602393054f3e78ed4804510a7bb3ec7d63bc542..181e79f1d6a2075b3aff26adaef63b2d6a0fe3e5 100644 (file)
@@ -852,7 +852,7 @@ static inline bool folio_unqueue_deferred_split(struct folio *folio)
        /*
         * At this point, there is no one trying to add the folio to
         * deferred_list. If folio is not in deferred_list, it's safe
-        * to check without acquiring the split_queue_lock.
+        * to check without acquiring the list_lru lock.
         */
        if (data_race(list_empty(&folio->_deferred_list)))
                return false;
index a4b97ec8ce56c64d954d3c1f14762db93d2874fc..73e262cb30dd32cd35c190c11434bb2af6a47805 100644 (file)
@@ -1123,6 +1123,11 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
        if (result != SCAN_SUCCEED)
                goto out_nolock;
 
+       if (folio_memcg_alloc_deferred(folio)) {
+               result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+               goto out_nolock;
+       }
+
        mmap_read_lock(mm);
        result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
        if (result != SCAN_SUCCEED) {
index e24114a4493a3564b67d2149af403ef343483f16..56cd4af082326b88b4a4a4643d0c99e1c4e447e5 100644 (file)
@@ -4143,11 +4143,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
                memcg->cgwb_frn[i].done =
                        __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
-       INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
-       memcg->deferred_split_queue.split_queue_len = 0;
 #endif
        lru_gen_init_memcg(memcg);
        return memcg;
@@ -4299,11 +4294,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
        zswap_memcg_offline_cleanup(memcg);
 
        memcg_offline_kmem(memcg);
-       reparent_deferred_split_queue(memcg);
        /*
-        * The reparenting of objcg must be after the reparenting of the
-        * list_lru and deferred_split_queue above, which ensures that they will
-        * not mistakenly get the parent list_lru and deferred_split_queue.
+        * The reparenting of objcg must be after the reparenting of
+        * the list_lru in memcg_offline_kmem(), which ensures that
+        * they will not mistakenly get the parent list_lru.
         */
        memcg_reparent_objcgs(memcg);
        reparent_shrinker_deferred(memcg);
index 1d8e09d9b3c931a57b82b8199bbdd54ff3aebf09..56be920c56d742d79ab8982bdad90227a7686f24 100644 (file)
@@ -5222,6 +5222,10 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
                        folio_put(folio);
                        goto next;
                }
+               if (order > 1 && folio_memcg_alloc_deferred(folio)) {
+                       folio_put(folio);
+                       goto fallback;
+               }
                folio_throttle_swaprate(folio, gfp);
                /*
                 * When a folio is not zeroed during allocation
index db5568cf36e12b6fe52854b274fc331d9b36cac3..c0a7f1cf6fef5c75cfe4db07249609bbb0e53eca 100644 (file)
@@ -1373,19 +1373,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
        pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
 }
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void pgdat_init_split_queue(struct pglist_data *pgdat)
-{
-       struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
-
-       spin_lock_init(&ds_queue->split_queue_lock);
-       INIT_LIST_HEAD(&ds_queue->split_queue);
-       ds_queue->split_queue_len = 0;
-}
-#else
-static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
-#endif
-
 #ifdef CONFIG_COMPACTION
 static void pgdat_init_kcompactd(struct pglist_data *pgdat)
 {
@@ -1401,8 +1388,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
 
        pgdat_resize_init(pgdat);
        pgdat_kswapd_lock_init(pgdat);
-
-       pgdat_init_split_queue(pgdat);
        pgdat_init_kcompactd(pgdat);
 
        init_waitqueue_head(&pgdat->kswapd_wait);
index 04f5ce992401ef5c98c04de1b9d1e09f4629ffc7..9c3a5cf9977860ffeb648ab99fbd842a9c4911b7 100644 (file)
@@ -465,6 +465,16 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
                return ERR_PTR(-ENOMEM);
        }
 
+       if (order > 1 && folio_memcg_alloc_deferred(folio)) {
+               spin_lock(&ci->lock);
+               __swap_cache_do_del_folio(ci, folio, entry, shadow);
+               spin_unlock(&ci->lock);
+               folio_unlock(folio);
+               /* nr_pages refs from swap cache, 1 from allocation */
+               folio_put_refs(folio, nr_pages + 1);
+               return ERR_PTR(-ENOMEM);
+       }
+
        /* memsw uncharges swap when folio is added to swap cache */
        memcg1_swapin(folio);
        if (shadow)