mm, swap: unify large folio allocation

author Kairui Song <kasong@tencent.com>

Sun, 17 May 2026 15:39:44 +0000 (23:39 +0800)

committer Andrew Morton <akpm@linux-foundation.org>

Tue, 2 Jun 2026 22:22:21 +0000 (15:22 -0700)
author Kairui Song <kasong@tencent.com>
Sun, 17 May 2026 15:39:44 +0000 (23:39 +0800)
committer Andrew Morton <akpm@linux-foundation.org>
Tue, 2 Jun 2026 22:22:21 +0000 (15:22 -0700)
diff --git a/mm/memory.c b/mm/memory.c

index 0c9d9c2cbf0e043a0299fbf04e300ab2a4d899b0..da891bcce59c7e16073a09d96e901398978b99ec 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4609,26 +4609,6 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
         return VM_FAULT_SIGBUS;
  }
  
-static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
-{
-       struct vm_area_struct *vma = vmf->vma;
-       struct folio *folio;
-       softleaf_t entry;
-
-       folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
-       if (!folio)
-               return NULL;
-
-       entry = softleaf_from_pte(vmf->orig_pte);
-       if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
-                                          GFP_KERNEL, entry)) {
-               folio_put(folio);
-               return NULL;
-       }
-
-       return folio;
-}
-
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  /*
   * Check if the PTEs within a range are contiguous swap entries
@@ -4658,8 +4638,6 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
          */
         if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
                 return false;
-       if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
-               return false;
  
         return true;
  }
@@ -4687,16 +4665,14 @@ static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
         return orders;
  }
  
-static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
  {
         struct vm_area_struct *vma = vmf->vma;
         unsigned long orders;
-       struct folio *folio;
         unsigned long addr;
         softleaf_t entry;
         spinlock_t *ptl;
         pte_t *pte;
-       gfp_t gfp;
         int order;
  
         /*
@@ -4704,7 +4680,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
          * maintain the uffd semantics.
          */
         if (unlikely(userfaultfd_armed(vma)))
-               goto fallback;
+               return 0;
  
         /*
          * A large swapped out folio could be partially or fully in zswap. We
@@ -4712,7 +4688,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
          * folio.
          */
         if (!zswap_never_enabled())
-               goto fallback;
+               return 0;
  
         entry = softleaf_from_pte(vmf->orig_pte);
         /*
@@ -4726,12 +4702,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
                                           vmf->address, orders);
  
         if (!orders)
-               goto fallback;
+               return 0;
  
         pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
                                   vmf->address & PMD_MASK, &ptl);
         if (unlikely(!pte))
-               goto fallback;
+               return 0;
  
         /*
          * For do_swap_page, find the highest order where the aligned range is
@@ -4747,29 +4723,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
  
         pte_unmap_unlock(pte, ptl);
  
-       /* Try allocating the highest of the remaining orders. */
-       gfp = vma_thp_gfp_mask(vma);
-       while (orders) {
-               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
-               folio = vma_alloc_folio(gfp, order, vma, addr);
-               if (folio) {
-                       if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
-                                                           gfp, entry))
-                               return folio;
-                       count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
-                       folio_put(folio);
-               }
-               count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
-               order = next_order(&orders, order);
-       }
-
-fallback:
-       return __alloc_swap_folio(vmf);
+       return orders;
  }
  #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
  {
-       return __alloc_swap_folio(vmf);
+       return 0;
  }
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  
@@ -4875,23 +4834,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
         if (folio)
                 swap_update_readahead(folio, vma, vmf->address);
         if (!folio) {
-               if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
-                       folio = alloc_swap_folio(vmf);
-                       if (folio) {
-                               /*
-                                * folio is charged, so swapin can only fail due
-                                * to raced swapin and return NULL.
-                                */
-                               swapcache = swapin_folio(entry, folio);
-                               if (swapcache != folio)
-                                       folio_put(folio);
-                               folio = swapcache;
-                       }
-               } else {
+               /* Swapin bypasses readahead for SWP_SYNCHRONOUS_IO devices */
+               if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
+                       folio = swapin_sync(entry, GFP_HIGHUSER_MOVABLE,
+                                           thp_swapin_suitable_orders(vmf) | BIT(0),
+                                           vmf, NULL, 0);
+               else
                         folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
-               }
  
-               if (!folio) {
+               if (IS_ERR_OR_NULL(folio)) {
                         /*
                          * Back out if somebody else faulted in this pte
                          * while we released the pte lock.
@@ -4901,6 +4852,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                         if (likely(vmf->pte &&
                                    pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                                 ret = VM_FAULT_OOM;
+                       folio = NULL;
                         goto unlock;
                 }
  
diff --git a/mm/shmem.c b/mm/shmem.c

index 6edb23b41bac9582629d8646709f11a9dbbd6d55..77a3e28e51601e97f61e1988721fe3d7a3586f8d 100644 (file)
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -159,7 +159,7 @@ static unsigned long shmem_default_max_inodes(void)
  
  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
                         struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
-                       struct vm_area_struct *vma, vm_fault_t *fault_type);
+                       struct vm_fault *vmf, vm_fault_t *fault_type);
  
  static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
  {
@@ -2017,68 +2017,32 @@ unlock:
  }
  
  static struct folio *shmem_swap_alloc_folio(struct inode *inode,
-               struct vm_area_struct *vma, pgoff_t index,
+               struct vm_fault *vmf, pgoff_t index,
                 swp_entry_t entry, int order, gfp_t gfp)
  {
+       pgoff_t ilx;
+       struct folio *folio;
+       struct mempolicy *mpol;
         struct shmem_inode_info *info = SHMEM_I(inode);
-       struct folio *new, *swapcache;
-       int nr_pages = 1 << order;
-       gfp_t alloc_gfp = gfp;
-
-       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
-               if (WARN_ON_ONCE(order))
-                       return ERR_PTR(-EINVAL);
-       } else if (order) {
-               /*
-                * If uffd is active for the vma, we need per-page fault
-                * fidelity to maintain the uffd semantics, then fallback
-                * to swapin order-0 folio, as well as for zswap case.
-                * Any existing sub folio in the swap cache also blocks
-                * mTHP swapin.
-                */
-               if ((vma && unlikely(userfaultfd_armed(vma))) ||
-                    !zswap_never_enabled() ||
-                    non_swapcache_batch(entry, nr_pages) != nr_pages)
-                       goto fallback;
  
-               alloc_gfp = thp_shmem_limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
-       }
-retry:
-       new = shmem_alloc_folio(alloc_gfp, order, info, index);
-       if (!new) {
-               new = ERR_PTR(-ENOMEM);
-               goto fallback;
-       }
+       if ((vmf && unlikely(userfaultfd_armed(vmf->vma))) ||
+            !zswap_never_enabled())
+               order = 0;
  
-       if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
-                                          alloc_gfp, entry)) {
-               folio_put(new);
-               new = ERR_PTR(-ENOMEM);
-               goto fallback;
-       }
+again:
+       mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
+       folio = swapin_sync(entry, gfp, BIT(order), vmf, mpol, ilx);
+       mpol_cond_put(mpol);
  
-       swapcache = swapin_folio(entry, new);
-       if (swapcache != new) {
-               folio_put(new);
-               if (!swapcache) {
-                       /*
-                        * The new folio is charged already, swapin can
-                        * only fail due to another raced swapin.
-                        */
-                       new = ERR_PTR(-EEXIST);
-                       goto fallback;
-               }
+       if (!IS_ERR(folio))
+               return folio;
+
+       if (order) {
+               order = 0;
+               goto again;
         }
-       return swapcache;
-fallback:
-       /* Order 0 swapin failed, nothing to fallback to, abort */
-       if (!order)
-               return new;
-       entry.val += index - round_down(index, nr_pages);
-       alloc_gfp = gfp;
-       nr_pages = 1;
-       order = 0;
-       goto retry;
+
+       return folio;
  }
  
  /*
@@ -2265,11 +2229,12 @@ unlock:
   */
  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
                              struct folio **foliop, enum sgp_type sgp,
-                            gfp_t gfp, struct vm_area_struct *vma,
+                            gfp_t gfp, struct vm_fault *vmf,
                              vm_fault_t *fault_type)
  {
         struct address_space *mapping = inode->i_mapping;
-       struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
+       struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
+       struct mm_struct *fault_mm = vmf ? vmf->vma->vm_mm : NULL;
         struct shmem_inode_info *info = SHMEM_I(inode);
         swp_entry_t swap;
         softleaf_t index_entry;
@@ -2310,20 +2275,19 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
         if (!folio) {
                 if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
                         /* Direct swapin skipping swap cache & readahead */
-                       folio = shmem_swap_alloc_folio(inode, vma, index,
-                                                      index_entry, order, gfp);
-                       if (IS_ERR(folio)) {
-                               error = PTR_ERR(folio);
-                               folio = NULL;
-                               goto failed;
-                       }
+                       folio = shmem_swap_alloc_folio(inode, vmf, index,
+                                                      swap, order, gfp);
                 } else {
                         /* Cached swapin only supports order 0 folio */
                         folio = shmem_swapin_cluster(swap, gfp, info, index);
-                       if (!folio) {
+               }
+               if (IS_ERR_OR_NULL(folio)) {
+                       if (IS_ERR(folio))
+                               error = PTR_ERR(folio);
+                       else
                                 error = -ENOMEM;
-                               goto failed;
-                       }
+                       folio = NULL;
+                       goto failed;
                 }
                 if (fault_type) {
                         *fault_type |= VM_FAULT_MAJOR;
@@ -2471,7 +2435,7 @@ repeat:
  
         if (xa_is_value(folio)) {
                 error = shmem_swapin_folio(inode, index, &folio,
-                                          sgp, gfp, vma, fault_type);
+                                          sgp, gfp, vmf, fault_type);
                 if (error == -EEXIST)
                         goto repeat;
  
diff --git a/mm/swap.h b/mm/swap.h

index 6774af10a94338c5ea4a0fcb863c32bc2d0ae693..8e57e943162461311e4c7772138e9daa7d17f36c 100644 (file)
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -300,7 +300,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
                 struct mempolicy *mpol, pgoff_t ilx);
  struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
                 struct vm_fault *vmf);
-struct folio *swapin_folio(swp_entry_t entry, struct folio *folio);
+struct folio *swapin_sync(swp_entry_t entry, gfp_t flag, unsigned long orders,
+                          struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx);
  void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
                            unsigned long addr);
  
@@ -334,24 +335,6 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
                 return find_next_bit(sis->zeromap, end, start) - start;
  }
  
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
-       int i;
-
-       /*
-        * While allocating a large folio and doing mTHP swapin, we need to
-        * ensure all entries are not cached, otherwise, the mTHP folio will
-        * be in conflict with the folio in swap cache.
-        */
-       for (i = 0; i < max_nr; i++) {
-               if (swap_cache_has_folio(entry))
-                       return i;
-               entry.val++;
-       }
-
-       return i;
-}
-
  #else /* CONFIG_SWAP */
  struct swap_iocb;
  static inline struct swap_cluster_info *swap_cluster_lock(
@@ -433,7 +416,9 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
         return NULL;
  }
  
-static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
+static inline struct folio *swapin_sync(
+       swp_entry_t entry, gfp_t flag, unsigned long orders,
+       struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx)
  {
         return NULL;
  }
@@ -493,10 +478,5 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
  {
         return 0;
  }
-
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
-       return 0;
-}
  #endif /* CONFIG_SWAP */
  #endif /* _MM_SWAP_H */
diff --git a/mm/swap_state.c b/mm/swap_state.c

index 0adb0565bbb10b134085e89175543df63c6f0056..98c8691826fb51746120d582a27a83461e963142 100644 (file)
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -238,43 +238,6 @@ void __swap_cache_add_folio(struct swap_cluster_info *ci,
         lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
  }
  
-/**
- * swap_cache_add_folio - Add a folio into the swap cache.
- * @folio: The folio to be added.
- * @entry: The swap entry corresponding to the folio.
- * @shadowp: If a shadow is found, return the shadow.
- *
- * Add a folio into the swap cache. Will return error if any slot is no
- * longer a valid swapped out slot or already occupied by another folio.
- *
- * Context: Caller must ensure @entry is valid and protect the swap device
- * with reference count or locks.
- */
-static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
-                               void **shadowp)
-{
-       int err;
-       void *shadow = NULL;
-       struct swap_info_struct *si;
-       struct swap_cluster_info *ci;
-       unsigned long nr_pages = folio_nr_pages(folio);
-
-       si = __swap_entry_to_info(entry);
-       ci = swap_cluster_lock(si, swp_offset(entry));
-       err = __swap_cache_add_check(ci, entry, nr_pages, &shadow);
-       if (err) {
-               swap_cluster_unlock(ci);
-               return err;
-       }
-
-       __swap_cache_add_folio(ci, folio, entry);
-       swap_cluster_unlock(ci);
-       if (shadowp)
-               *shadowp = shadow;
-
-       return 0;
-}
-
  static void __swap_cache_do_del_folio(struct swap_cluster_info *ci,
                                       struct folio *folio,
                                       swp_entry_t entry, void *shadow)
@@ -650,51 +613,6 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
         }
  }
  
-/**
- * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cache.
- * @entry: swap entry to be bound to the folio.
- * @folio: folio to be added.
- * @gfp: memory allocation flags for charge, can be 0 if @charged if true.
- * @charged: if the folio is already charged.
- *
- * Update the swap_map and add folio as swap cache, typically before swapin.
- * All swap slots covered by the folio must have a non-zero swap count.
- *
- * Context: Caller must protect the swap device with reference count or locks.
- * Return: 0 if success, error code if failed.
- */
-static int __swap_cache_prepare_and_add(swp_entry_t entry,
-                                       struct folio *folio,
-                                       gfp_t gfp, bool charged)
-{
-       void *shadow;
-       int ret;
-
-       __folio_set_locked(folio);
-       __folio_set_swapbacked(folio);
-
-       if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) {
-               ret = -ENOMEM;
-               goto failed;
-       }
-
-       ret = swap_cache_add_folio(folio, entry, &shadow);
-       if (ret)
-               goto failed;
-
-       memcg1_swapin(entry, folio_nr_pages(folio));
-       if (shadow)
-               workingset_refault(folio, shadow);
-
-       /* Caller will initiate read into locked folio */
-       folio_add_lru(folio);
-       return 0;
-
-failed:
-       folio_unlock(folio);
-       return ret;
-}
-
  static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
                                            struct mempolicy *mpol, pgoff_t ilx,
                                            struct swap_iocb **plug, bool readahead)
@@ -705,7 +623,6 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
                 folio = swap_cache_get_folio(entry);
                 if (folio)
                         return folio;
-
                 folio = swap_cache_alloc_folio(entry, gfp, BIT(0), NULL, mpol, ilx);
         } while (PTR_ERR(folio) == -EEXIST);
  
@@ -722,49 +639,37 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
  }
  
  /**
- * swapin_folio - swap-in one or multiple entries skipping readahead.
- * @entry: starting swap entry to swap in
- * @folio: a new allocated and charged folio
+ * swapin_sync - swap-in one or multiple entries skipping readahead.
+ * @entry: swap entry indicating the target slot
+ * @gfp: memory allocation flags
+ * @orders: bitmask of candidate allocation orders (bit N allows order N)
+ * @vmf: fault information, may be NULL
+ * @mpol: NUMA memory allocation policy to be applied
+ * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
   *
- * Reads @entry into @folio, @folio will be added to the swap cache.
- * If @folio is a large folio, the @entry will be rounded down to align
- * with the folio size.
+ * This allocates a folio suitable for given @orders, or returns the
+ * existing folio in the swap cache for @entry. This initiates the IO, too,
+ * if needed. @entry is rounded down if @orders allow large allocation.
   *
- * Return: returns pointer to @folio on success. If folio is a large folio
- * and this raced with another swapin, NULL will be returned to allow fallback
- * to order 0. Else, if another folio was already added to the swap cache,
- * return that swap cache folio instead.
+ * Context: Caller must ensure @entry is valid and pin the swap device with refcount.
+ * Return: The folio on success, or an ERR_PTR() encoded error on failure.
   */
-struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
+struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp, unsigned long orders,
+                          struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx)
  {
-       int ret;
-       struct folio *swapcache;
-       pgoff_t offset = swp_offset(entry);
-       unsigned long nr_pages = folio_nr_pages(folio);
-
-       entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
-       for (;;) {
-               ret = __swap_cache_prepare_and_add(entry, folio, 0, true);
-               if (!ret) {
-                       swap_read_folio(folio, NULL);
-                       break;
-               }
+       struct folio *folio;
  
-               /*
-                * Large order allocation needs special handling on
-                * race: if a smaller folio exists in cache, swapin needs
-                * to fall back to order 0, and doing a swap cache lookup
-                * might return a folio that is irrelevant to the faulting
-                * entry because @entry is aligned down. Just return NULL.
-                */
-               if (ret != -EEXIST || nr_pages > 1)
-                       return NULL;
+       do {
+               folio = swap_cache_get_folio(entry);
+               if (folio)
+                       return folio;
+               folio = swap_cache_alloc_folio(entry, gfp, orders, vmf, mpol, ilx);
+       } while (PTR_ERR(folio) == -EEXIST);
  
-               swapcache = swap_cache_get_folio(entry);
-               if (swapcache)
-                       return swapcache;
-       }
+       if (IS_ERR(folio))
+               return folio;
  
+       swap_read_folio(folio, NULL);
         return folio;
  }
  
diff --git a/mm/swapfile.c b/mm/swapfile.c

index ee515a6fbccd4dad0d790bec2641671dd4b522b7..4ffd491cacca52880dd18dbced24ca80df0f136d 100644 (file)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1853,8 +1853,7 @@ void folio_put_swap(struct folio *folio, struct page *subpage)
   *   do_swap_page()
   *     ...                             swapoff+swapon
   *     swap_cache_alloc_folio()
- *       swap_cache_add_folio()
- *         // check swap_map
+ *       // check swap_map
   *     // verify PTE not changed
   *
   * In __swap_duplicate(), the swap_map need to be checked before
author	Kairui Song <kasong@tencent.com>
	Sun, 17 May 2026 15:39:44 +0000 (23:39 +0800)
committer	Andrew Morton <akpm@linux-foundation.org>
	Tue, 2 Jun 2026 22:22:21 +0000 (15:22 -0700)
mm/memory.c		patch \| blob \| blame \| history
mm/shmem.c		patch \| blob \| blame \| history
mm/swap.h		patch \| blob \| blame \| history
mm/swap_state.c		patch \| blob \| blame \| history
mm/swapfile.c		patch \| blob \| blame \| history