}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
+/* Sanity check that a folio is fully exclusive */
+static void check_swap_exclusive(struct folio *folio, swp_entry_t entry,
+ unsigned int nr_pages)
+{
+ /* Called with the PT lock and folio lock held, so the swap count is stable */
+ do {
+ VM_WARN_ON_ONCE_FOLIO(__swap_count(entry) != 1, folio);
+ entry.val++;
+ } while (--nr_pages);
+}
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct folio *swapcache, *folio = NULL;
- DECLARE_WAITQUEUE(wait, current);
+ struct folio *swapcache = NULL, *folio;
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
- bool need_clear_cache = false;
bool exclusive = false;
softleaf_t entry;
pte_t pte;
vm_fault_t ret = 0;
- void *shadow = NULL;
int nr_pages;
unsigned long page_idx;
unsigned long address;
folio = swap_cache_get_folio(entry);
if (folio)
swap_update_readahead(folio, vma, vmf->address);
- swapcache = folio;
-
if (!folio) {
- if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
- __swap_count(entry) == 1) {
- /* skip swapcache */
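+ /* Synchronous IO device: swap the folio in directly, without readahead */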
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
folio = alloc_swap_folio(vmf);
if (folio) {
- __folio_set_locked(folio);
- __folio_set_swapbacked(folio);
-
- nr_pages = folio_nr_pages(folio);
- if (folio_test_large(folio))
- entry.val = ALIGN_DOWN(entry.val, nr_pages);
/*
- * Prevent parallel swapin from proceeding with
- * the cache flag. Otherwise, another thread
- * may finish swapin first, free the entry, and
- * swapout reusing the same entry. It's
- * undetectable as pte_same() returns true due
- * to entry reuse.
+ * The folio is charged, so swapin can only fail due to a
+ * raced swapin, in which case it returns NULL.
*/
- if (swapcache_prepare(entry, nr_pages)) {
- /*
- * Relax a bit to prevent rapid
- * repeated page faults.
- */
- add_wait_queue(&swapcache_wq, &wait);
- schedule_timeout_uninterruptible(1);
- remove_wait_queue(&swapcache_wq, &wait);
- goto out_page;
- }
- need_clear_cache = true;
-
- memcg1_swapin(entry, nr_pages);
-
- shadow = swap_cache_get_shadow(entry);
- if (shadow)
- workingset_refault(folio, shadow);
-
- folio_add_lru(folio);
-
- /* To provide entry to swap_read_folio() */
- folio->swap = entry;
- swap_read_folio(folio, NULL);
- folio->private = NULL;
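+ /*
+ * If we lost the swapin race, drop the folio we allocated
+ * and use whatever swapin_folio() returned instead.
+ */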
+ swapcache = swapin_folio(entry, folio);
+ if (swapcache != folio)
+ folio_put(folio);
+ folio = swapcache;
}
} else {
- folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
- vmf);
- swapcache = folio;
+ folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
}
if (!folio) {
count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
}
+ swapcache = folio;
ret |= folio_lock_or_retry(folio, vmf);
if (ret & VM_FAULT_RETRY)
goto out_release;
goto out_nomap;
}
- /* allocated large folios for SWP_SYNCHRONOUS_IO */
- if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
- unsigned long nr = folio_nr_pages(folio);
- unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
- unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
- pte_t *folio_ptep = vmf->pte - idx;
- pte_t folio_pte = ptep_get(folio_ptep);
-
- if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
- swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
- goto out_nomap;
-
- page_idx = idx;
- address = folio_start;
- ptep = folio_ptep;
- goto check_folio;
- }
-
nr_pages = 1;
page_idx = 0;
address = vmf->address;
BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
+ /*
+ * If a large folio already belongs to an anon mapping, we can
+ * simply go ahead and map it partially.
+ * If not, the failed large swapin check above means the page table
+ * has changed, so subpages may have been charged to the wrong cgroup,
+ * or the folio may even belong to shmem. We have to free it and fall back.
+ * Nothing should have touched it yet: both anon and shmem check that a
+ * large folio is fully usable before using it.
+ *
+ * This check will be removed once folio allocation is unified in the
+ * swap cache layer, where allocating a folio stabilizes the swap entries.
+ */
+ if (!folio_test_anon(folio) && folio_test_large(folio) &&
+ nr_pages != folio_nr_pages(folio)) {
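+ /* A dirty folio here means something touched it unexpectedly; warn and keep it in the swap cache */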
+ if (!WARN_ON_ONCE(folio_test_dirty(folio)))
+ swap_cache_del_folio(folio);
+ goto out_nomap;
+ }
+
/*
* Check under PT lock (to protect against concurrent fork() sharing
* the swap entry concurrently) for certainly exclusive pages.
*/
if (!folio_test_ksm(folio)) {
+ /*
+ * The can_swapin_thp check above ensures that all PTEs have
+ * the same exclusiveness, so checking just one PTE is enough.
+ */
exclusive = pte_swp_exclusive(vmf->orig_pte);
+ if (exclusive)
+ check_swap_exclusive(folio, entry, nr_pages);
if (folio != swapcache) {
/*
* We have a fresh page that is not exposed to the
vmf->orig_pte = pte_advance_pfn(pte, page_idx);
/* ksm created a completely new copy */
- if (unlikely(folio != swapcache && swapcache)) {
+ if (unlikely(folio != swapcache)) {
folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
folio_add_lru_vma(folio, vma);
} else if (!folio_test_anon(folio)) {
/*
- * We currently only expect small !anon folios which are either
- * fully exclusive or fully shared, or new allocated large
- * folios which are fully exclusive. If we ever get large
- * folios within swapcache here, we have to be careful.
+ * We currently only expect !anon folios that are fully
+ * mappable. See the comment after can_swapin_thp above.
*/
- VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
- VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_nr_pages(folio) != nr_pages, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
} else {
folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address,
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
- /* Clear the swap cache pin for direct swapin after PTL unlock */
- if (need_clear_cache) {
- swapcache_clear(si, entry, nr_pages);
- if (waitqueue_active(&swapcache_wq))
- wake_up(&swapcache_wq);
- }
if (si)
put_swap_device(si);
return ret;
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
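+ /*
+ * Try to free the swap entries and drop the folio from the
+ * swap cache if nothing else is using them.
+ */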
+ if (folio_test_swapcache(folio))
+ folio_free_swap(folio);
folio_unlock(folio);
out_release:
folio_put(folio);
folio_unlock(swapcache);
folio_put(swapcache);
}
- if (need_clear_cache) {
- swapcache_clear(si, entry, nr_pages);
- if (waitqueue_active(&swapcache_wq))
- wake_up(&swapcache_wq);
- }
if (si)
put_swap_device(si);
return ret;