#include <linux/swap_cgroup.h>
#include "swap_table.h"
#include "internal.h"
+#include "swap_table.h"
#include "swap.h"
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
-static void swap_entries_free(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- unsigned long start, unsigned int nr_pages);
static void swap_range_alloc(struct swap_info_struct *si,
unsigned int nr_entries);
static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr);
static void swap_put_entry_locked(struct swap_info_struct *si,
struct swap_cluster_info *ci,
- unsigned long offset,
- unsigned char usage);
+ unsigned long offset);
static bool folio_swapcache_freeable(struct folio *folio);
static void move_cluster(struct swap_info_struct *si,
struct swap_cluster_info *ci, struct list_head *list,
return swap_type_to_info(swp_type(entry));
}
-static inline unsigned char swap_count(unsigned char ent)
-{
- return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
-}
-
/*
* Use the second highest bit of inuse_pages counter as the indicator
* if one swap device is on the available plist, so the atomic can
#define TTRS_FULL 0x4
static bool swap_only_has_cache(struct swap_info_struct *si,
- unsigned long offset, int nr_pages)
+ struct swap_cluster_info *ci,
+ unsigned long offset, int nr_pages)
{
+ unsigned int ci_off = offset % SWAPFILE_CLUSTER;
unsigned char *map = si->swap_map + offset;
unsigned char *map_end = map + nr_pages;
+ unsigned long swp_tb;
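+ /* Each slot must have a swap cache folio in the table and a zero map count. */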
do {
- VM_BUG_ON(!(*map & SWAP_HAS_CACHE));
- if (*map != SWAP_HAS_CACHE)
+ swp_tb = __swap_table_get(ci, ci_off);
+ VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb));
+ if (*map)
return false;
+ ++ci_off;
} while (++map < map_end);
return true;
goto out_unlock;
/*
- * It's safe to delete the folio from swap cache only if the folio's
- * swap_map is HAS_CACHE only, which means the slots have no page table
+ * It's safe to delete the folio from the swap cache only if it has a
+ * swap count of 0, which means the slots have no page table
* reference or pending writeback, and can't be allocated to others.
*/
ci = swap_cluster_lock(si, offset);
- need_reclaim = swap_only_has_cache(si, offset, nr_pages);
+ need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages);
swap_cluster_unlock(ci);
if (!need_reclaim)
goto out_unlock;
spin_unlock(&ci->lock);
do {
- if (swap_count(READ_ONCE(map[offset])))
+ if (READ_ONCE(map[offset]))
break;
swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
if (swp_tb_is_folio(swp_tb)) {
*/
for (offset = start; offset < end; offset++) {
swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
- if (swap_count(map[offset]) || !swp_tb_is_null(swp_tb))
+ if (map[offset] || !swp_tb_is_null(swp_tb))
return false;
}
return true;
do {
- if (swap_count(map[offset]))
+ if (map[offset])
return false;
swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
if (swp_tb_is_folio(swp_tb)) {
- WARN_ON_ONCE(!(map[offset] & SWAP_HAS_CACHE));
if (!vm_swap_full())
return false;
*need_reclaim = true;
if (likely(folio)) {
order = folio_order(folio);
nr_pages = 1 << order;
- /*
- * Pin the slot with SWAP_HAS_CACHE to satisfy swap_dup_entries.
- * This is the legacy allocation behavior, will drop it very soon.
- */
- memset(si->swap_map + offset, SWAP_HAS_CACHE, nr_pages);
__swap_cache_add_folio(ci, folio, swp_entry(si->type, offset));
} else if (IS_ENABLED(CONFIG_HIBERNATION)) {
order = 0;
to_scan--;
while (offset < end) {
- if (!swap_count(READ_ONCE(map[offset])) &&
- swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))) {
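+ /* Reclaim slots whose only user is a swap cache folio (zero map count). */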
+ if (!READ_ONCE(map[offset]) &&
+ swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))) {
spin_unlock(&ci->lock);
nr_reclaim = __try_to_reclaim_swap(si, offset,
TTRS_ANYWAY);
* Scanning only one fragment cluster is good enough. Order 0
* allocation will surely succeed, and large allocation
* failure is not critical. Scanning one cluster still
- * keeps the list rotated and reclaimed (for HAS_CACHE).
+ * keeps the list rotated and reclaimed (for clean swap cache).
*/
found = alloc_swap_scan_list(si, &si->frag_clusters[order], folio, false);
if (found)
do {
swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
count = si->swap_map[offset];
- VM_WARN_ON(swap_count(count) < 1 || count == SWAP_MAP_BAD);
- if (swap_count(count) == 1) {
+ VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD);
+ if (count == 1) {
/* count == 1 and non-cached slots will be batch freed. */
if (!swp_tb_is_folio(swp_tb)) {
if (!batch_start)
continue;
}
/* count will be 0 after put, slot can be reclaimed */
- VM_WARN_ON(!(count & SWAP_HAS_CACHE));
need_reclaim = true;
}
/*
* slots will be freed when folio is removed from swap cache
* (__swap_cache_del_folio).
*/
- swap_put_entry_locked(si, ci, offset, 1);
+ swap_put_entry_locked(si, ci, offset);
if (batch_start) {
swap_entries_free(si, ci, batch_start, offset - batch_start);
batch_start = SWAP_ENTRY_INVALID;
offset = swp_offset(entry);
if (offset >= si->max)
goto bad_offset;
- if (data_race(!si->swap_map[swp_offset(entry)]))
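+ /* An entry with a zero count may still be in use by a swap cache folio. */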
+ if (data_race(!si->swap_map[swp_offset(entry)]) &&
+ !swap_cache_has_folio(entry))
goto bad_free;
return si;
static void swap_put_entry_locked(struct swap_info_struct *si,
struct swap_cluster_info *ci,
- unsigned long offset,
- unsigned char usage)
+ unsigned long offset)
{
unsigned char count;
- unsigned char has_cache;
count = si->swap_map[offset];
-
- has_cache = count & SWAP_HAS_CACHE;
- count &= ~SWAP_HAS_CACHE;
-
- if (usage == SWAP_HAS_CACHE) {
- VM_BUG_ON(!has_cache);
- has_cache = 0;
- } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+ if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
if (count == COUNT_CONTINUED) {
if (swap_count_continued(si, offset, count))
count = SWAP_MAP_MAX | COUNT_CONTINUED;
count--;
}
- usage = count | has_cache;
- if (usage)
- WRITE_ONCE(si->swap_map[offset], usage);
- else
+ WRITE_ONCE(si->swap_map[offset], count);
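+ /*
+  * Free the slot only when no swap cache folio still pins it; otherwise
+  * it is freed when the folio is removed from the swap cache.
+  */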
+ if (!count &&
+     !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))
swap_entries_free(si, ci, offset, 1);
}
return NULL;
}
-/*
- * Check if it's the last ref of swap entry in the freeing path.
- */
-static inline bool __maybe_unused swap_is_last_ref(unsigned char count)
-{
- return (count == SWAP_HAS_CACHE) || (count == 1);
-}
-
/*
* Drop the last ref of swap entries; the caller has to ensure all entries
* belong to the same cgroup and cluster.
*/
-static void swap_entries_free(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- unsigned long offset, unsigned int nr_pages)
+void swap_entries_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned long offset, unsigned int nr_pages)
{
swp_entry_t entry = swp_entry(si->type, offset);
unsigned char *map = si->swap_map + offset;
ci->count -= nr_pages;
do {
- VM_BUG_ON(!swap_is_last_ref(*map));
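+ /* Freed slots may hold at most their last map reference (count 0 or 1). */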
+ VM_WARN_ON(*map > 1);
*map = 0;
} while (++map < map_end);
struct swap_info_struct *si = __swap_entry_to_info(entry);
pgoff_t offset = swp_offset(entry);
- return swap_count(si->swap_map[offset]);
+ return si->swap_map[offset];
}
/**
int count;
ci = swap_cluster_lock(si, offset);
- count = swap_count(si->swap_map[offset]);
+ count = si->swap_map[offset];
swap_cluster_unlock(ci);
return count && count != SWAP_MAP_BAD;
ci = swap_cluster_lock(si, offset);
- count = swap_count(si->swap_map[offset]);
+ count = si->swap_map[offset];
if (!(count & COUNT_CONTINUED))
goto out;
ci = swap_cluster_lock(si, offset);
if (nr_pages == 1) {
- if (swap_count(map[roffset]))
+ if (map[roffset])
ret = true;
goto unlock_out;
}
for (i = 0; i < nr_pages; i++) {
- if (swap_count(map[offset + i])) {
+ if (map[offset + i]) {
ret = true;
break;
}
return;
ci = swap_cluster_lock(si, offset);
- swap_put_entry_locked(si, ci, offset, 1);
+ swap_put_entry_locked(si, ci, offset);
swap_cluster_unlock(ci);
/* In theory readahead might add it to the swap cache by accident */
unsigned int prev)
{
unsigned int i;
+ unsigned long swp_tb;
unsigned char count;
/*
*/
for (i = prev + 1; i < si->max; i++) {
count = READ_ONCE(si->swap_map[i]);
- if (count && swap_count(count) != SWAP_MAP_BAD)
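+ /* The slot is still in use if it has a swap count or a swap cache folio. */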
+ swp_tb = swap_table_get(__swap_offset_to_cluster(si, i),
+ i % SWAPFILE_CLUSTER);
+ if (count == SWAP_MAP_BAD)
+ continue;
+ if (count || swp_tb_is_folio(swp_tb))
break;
if ((i % LATENCY_LIMIT) == 0)
cond_resched();
* Returns error code in the following cases.
* - success -> 0
* - swp_entry is invalid -> EINVAL
- * - swap-cache reference is requested but there is already one. -> EEXIST
- * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference is requested but the entry is not used. -> ENOENT
* - swap-mapped reference requested but needs continued swap count. -> ENOMEM
*/
static int swap_dup_entries(struct swap_info_struct *si,
unsigned char usage, int nr)
{
int i;
- unsigned char count, has_cache;
+ unsigned char count;
for (i = 0; i < nr; i++) {
count = si->swap_map[offset + i];
-
/*
* For swap out, the allocator never allocates bad slots. For swap in,
* readahead is guarded by swap_entry_swapped.
*/
- if (WARN_ON(swap_count(count) == SWAP_MAP_BAD))
+ if (WARN_ON(count == SWAP_MAP_BAD))
return -ENOENT;
-
- has_cache = count & SWAP_HAS_CACHE;
- count &= ~SWAP_HAS_CACHE;
-
- if (!count && !has_cache) {
+ /*
+  * Swap count duplication must be guarded either by the swap cache folio
+  * (from folio_dup_swap) or by an external lock on an existing entry
+  * (from swap_dup_entry_direct).
+  */
+ if (WARN_ON(!count &&
+ !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))))
return -ENOENT;
- } else if (usage == SWAP_HAS_CACHE) {
- if (has_cache)
- return -EEXIST;
- } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) {
+ if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX))
return -EINVAL;
- }
}
for (i = 0; i < nr; i++) {
count = si->swap_map[offset + i];
- has_cache = count & SWAP_HAS_CACHE;
- count &= ~SWAP_HAS_CACHE;
-
- if (usage == SWAP_HAS_CACHE)
- has_cache = SWAP_HAS_CACHE;
- else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+ if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
count += usage;
else if (swap_count_continued(si, offset + i, count))
count = COUNT_CONTINUED;
return -ENOMEM;
}
- WRITE_ONCE(si->swap_map[offset + i], count | has_cache);
+ WRITE_ONCE(si->swap_map[offset + i], count);
}
return 0;
return err;
}
-/* Mark the swap map as HAS_CACHE, caller need to hold the cluster lock */
-void __swapcache_set_cached(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- swp_entry_t entry)
-{
- WARN_ON(swap_dup_entries(si, ci, swp_offset(entry), SWAP_HAS_CACHE, 1));
-}
-
-/* Clear the swap map as !HAS_CACHE, caller need to hold the cluster lock */
-void __swapcache_clear_cached(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- swp_entry_t entry, unsigned int nr)
-{
- if (swap_only_has_cache(si, swp_offset(entry), nr)) {
- swap_entries_free(si, ci, swp_offset(entry), nr);
- } else {
- for (int i = 0; i < nr; i++, entry.val++)
- swap_put_entry_locked(si, ci, swp_offset(entry), SWAP_HAS_CACHE);
- }
-}
-
/*
* add_swap_count_continuation - called when a swap count is duplicated
* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
ci = swap_cluster_lock(si, offset);
- count = swap_count(si->swap_map[offset]);
+ count = si->swap_map[offset];
if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
/*