return folio_entry.val == round_down(entry.val, nr_pages);
}
+/* Temporary internal helpers */
+void __swapcache_set_cached(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry);
+void __swapcache_clear_cached(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr);
+
/*
* All swap cache helpers below require the caller to ensure the swap entries
* used are valid and stabilize the device by any of the following ways:
*/
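+/*
+ * For example, a caller that has not locked a swap cache folio can
+ * stabilize the device with a temporary reference (illustrative sketch
+ * only, not part of this patch):
+ *
+ *   si = get_swap_device(entry);
+ *   if (si) {
+ *       folio = swap_cache_get_folio(entry);
+ *       put_swap_device(si);
+ *   }
+ */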
struct folio *swap_cache_get_folio(swp_entry_t entry);
void *swap_cache_get_shadow(swp_entry_t entry);
-void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow);
+int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
+ void **shadow, bool alloc);
void swap_cache_del_folio(struct folio *folio);
struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags,
struct mempolicy *mpol, pgoff_t ilx,
return NULL;
}
-static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow)
+static inline int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
+ void **shadow, bool alloc)
{
+ return -ENOENT;
}
static inline void swap_cache_del_folio(struct folio *folio)
* @entry: The swap entry corresponding to the folio.
* @gfp: gfp_mask for XArray node allocation.
* @shadowp: If a shadow is found, return the shadow.
+ * @alloc: Whether the caller is the swap allocator. The allocator has
+ * already pinned the slots with SWAP_HAS_CACHE, so the swap map update
+ * is skipped here.
*
* Context: Caller must ensure @entry is valid and protect the swap device
* with reference count or locks.
- * The caller also needs to update the corresponding swap_map slots with
- * SWAP_HAS_CACHE bit to avoid race or conflict.
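+ *
+ * Return: 0 on success, -EEXIST if any slot already holds a folio,
+ * -ENOENT if the swap entries are no longer valid.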
*/
-void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp)
+int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
+ void **shadowp, bool alloc)
{
+ int err;
void *shadow = NULL;
+ struct swap_info_struct *si;
unsigned long old_tb, new_tb;
struct swap_cluster_info *ci;
- unsigned int ci_start, ci_off, ci_end;
+ unsigned int ci_start, ci_off, ci_end, offset;
unsigned long nr_pages = folio_nr_pages(folio);
VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);
+ si = __swap_entry_to_info(entry);
new_tb = folio_to_swp_tb(folio);
ci_start = swp_cluster_offset(entry);
ci_end = ci_start + nr_pages;
ci_off = ci_start;
- ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
+ offset = swp_offset(entry);
+ ci = swap_cluster_lock(si, offset);
+ if (unlikely(!ci->table)) {
+ err = -ENOENT;
+ goto failed;
+ }
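+ /*
+ * First pass: verify every slot before touching anything. Bail
+ * out if a folio is already present, or if the entries have no
+ * swap count left (unless the allocator already pinned them).
+ */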
do {
- old_tb = __swap_table_xchg(ci, ci_off, new_tb);
- WARN_ON_ONCE(swp_tb_is_folio(old_tb));
+ old_tb = __swap_table_get(ci, ci_off);
+ if (unlikely(swp_tb_is_folio(old_tb))) {
+ err = -EEXIST;
+ goto failed;
+ }
+ if (!alloc && unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) {
+ err = -ENOENT;
+ goto failed;
+ }
if (swp_tb_is_shadow(old_tb))
shadow = swp_tb_to_shadow(old_tb);
+ offset++;
+ } while (++ci_off < ci_end);
+
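+ /* Second pass: nothing can fail now, install the folio in every slot */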
+ ci_off = ci_start;
+ offset = swp_offset(entry);
+ do {
+ /*
+ * The slots still need to be pinned with SWAP_HAS_CACHE
+ * because the swap allocator relies on it.
+ */
+ if (!alloc)
+ __swapcache_set_cached(si, ci, swp_entry(swp_type(entry), offset));
+ __swap_table_set(ci, ci_off, new_tb);
+ offset++;
} while (++ci_off < ci_end);
folio_ref_add(folio, nr_pages);
if (shadowp)
*shadowp = shadow;
+ return 0;
+
+failed:
+ swap_cluster_unlock(ci);
+ return err;
}
/**
void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
swp_entry_t entry, void *shadow)
{
+ struct swap_info_struct *si;
unsigned long old_tb, new_tb;
unsigned int ci_start, ci_off, ci_end;
unsigned long nr_pages = folio_nr_pages(folio);
VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio);
+ si = __swap_entry_to_info(entry);
new_tb = shadow_swp_to_tb(shadow);
ci_start = swp_cluster_offset(entry);
ci_end = ci_start + nr_pages;
folio_clear_swapcache(folio);
node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);
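+ /* Drop the SWAP_HAS_CACHE pins; this may free the entries entirely */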
+ __swapcache_clear_cached(si, ci, entry, nr_pages);
}
/**
__swap_cache_del_folio(ci, folio, entry, NULL);
swap_cluster_unlock(ci);
- put_swap_folio(folio, entry);
folio_ref_sub(folio, folio_nr_pages(folio));
}
gfp_t gfp, bool charged,
bool skip_if_exists)
{
- struct folio *swapcache;
+ struct folio *swapcache = NULL;
void *shadow;
int ret;
- /*
- * Check and pin the swap map with SWAP_HAS_CACHE, then add the folio
- * into the swap cache. Loop with a schedule delay if raced with
- * another process setting SWAP_HAS_CACHE. This hackish loop will
- * be fixed very soon.
- */
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
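+ /*
+ * Try to add the folio to the swap cache. On -EEXIST another
+ * folio already owns the entry, so fall back to looking it up;
+ * any other failure means the entry is no longer valid.
+ */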
for (;;) {
- ret = swapcache_prepare(entry, folio_nr_pages(folio));
+ ret = swap_cache_add_folio(folio, entry, &shadow, false);
if (!ret)
break;
/*
- * The skip_if_exists is for protecting against a recursive
- * call to this helper on the same entry waiting forever
- * here because SWAP_HAS_CACHE is set but the folio is not
- * in the swap cache yet. This can happen today if
- * mem_cgroup_swapin_charge_folio() below triggers reclaim
- * through zswap, which may call this helper again in the
- * writeback path.
- *
- * Large order allocation also needs special handling on
+ * Large order allocation needs special handling on
* race: if a smaller folio exists in cache, swapin needs
* to fallback to order 0, and doing a swap cache lookup
* might return a folio that is irrelevant to the faulting
* entry because @entry is aligned down. Just return NULL.
*/
if (ret != -EEXIST || skip_if_exists || folio_test_large(folio))
- return NULL;
+ goto failed;
- /*
- * Check the swap cache again, we can only arrive
- * here because swapcache_prepare returns -EEXIST.
- */
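+ /*
+ * We can only get here on -EEXIST: another folio owns the
+ * entry. Look it up; if it is gone already, retry the insert.
+ */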
swapcache = swap_cache_get_folio(entry);
if (swapcache)
- return swapcache;
-
- /*
- * We might race against __swap_cache_del_folio(), and
- * stumble across a swap_map entry whose SWAP_HAS_CACHE
- * has not yet been cleared. Or race against another
- * swap_cache_alloc_folio(), which has set SWAP_HAS_CACHE
- * in swap_map, but not yet added its folio to swap cache.
- */
- schedule_timeout_uninterruptible(1);
+ goto failed;
}
- __folio_set_locked(folio);
- __folio_set_swapbacked(folio);
-
if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) {
- put_swap_folio(folio, entry);
- folio_unlock(folio);
- return NULL;
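+ /* The folio is already in the swap cache, remove it on charge failure */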
+ swap_cache_del_folio(folio);
+ goto failed;
}
- swap_cache_add_folio(folio, entry, &shadow);
memcg1_swapin(entry, folio_nr_pages(folio));
if (shadow)
workingset_refault(folio, shadow);
/* Caller will initiate read into locked folio */
folio_add_lru(folio);
return folio;
+
+failed:
+ folio_unlock(folio);
+ return swapcache;
}
/**
if (!entry.val)
return -ENOMEM;
- swap_cache_add_folio(folio, entry, NULL);
+ /*
+ * The allocator has already pinned the slots with SWAP_HAS_CACHE,
+ * so this insertion should never fail.
+ */
+ WARN_ON_ONCE(swap_cache_add_folio(folio, entry, NULL, true));
return 0;
* do_swap_page()
* ... swapoff+swapon
* swap_cache_alloc_folio()
- * swapcache_prepare()
- * __swap_duplicate()
- * // check swap_map
+ * swap_cache_add_folio()
+ * // check swap_map
* // verify PTE not changed
*
* In __swap_duplicate(), the swap_map need to be checked before
return err;
}
-/*
- * @entry: first swap entry from which we allocate nr swap cache.
- *
- * Called when allocating swap cache for existing swap entries,
- * This can return error codes. Returns 0 at success.
- * -EEXIST means there is a swap cache.
- * Note: return code is different from swap_duplicate().
- */
-int swapcache_prepare(swp_entry_t entry, int nr)
+/* Mark the swap map slot as SWAP_HAS_CACHE; the caller must hold the cluster lock */
+void __swapcache_set_cached(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry)
+{
+ WARN_ON(swap_dup_entries(si, ci, swp_offset(entry), SWAP_HAS_CACHE, 1));
+}
+
+/* Clear SWAP_HAS_CACHE in the swap map; the caller must hold the cluster lock */
+void __swapcache_clear_cached(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr)
{
- return __swap_duplicate(entry, SWAP_HAS_CACHE, nr);
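+ /*
+ * If the cache was the only remaining user, free the entries
+ * entirely; otherwise just drop the SWAP_HAS_CACHE reference
+ * on each slot.
+ */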
+ if (swap_only_has_cache(si, swp_offset(entry), nr)) {
+ swap_entries_free(si, ci, entry, nr);
+ } else {
+ for (int i = 0; i < nr; i++, entry.val++)
+ swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
+ }
}
/*