#include "swap_table.h"
#include "swap.h"
-static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
- unsigned char);
-static void free_swap_count_continuations(struct swap_info_struct *);
static void swap_range_alloc(struct swap_info_struct *si,
unsigned int nr_entries);
-static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr);
-static void swap_put_entry_locked(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- unsigned long offset);
static bool folio_swapcache_freeable(struct folio *folio);
static void move_cluster(struct swap_info_struct *si,
struct swap_cluster_info *ci, struct list_head *list,
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL 0x4
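+/*
+ * Check whether the slots are only pinned by the swap cache: each slot
+ * must be cached by a folio, and none may still hold a swap count.
+ */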
-static bool swap_only_has_cache(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
+static bool swap_only_has_cache(struct swap_cluster_info *ci,
unsigned long offset, int nr_pages)
{
unsigned int ci_off = offset % SWAPFILE_CLUSTER;
- unsigned char *map = si->swap_map + offset;
- unsigned char *map_end = map + nr_pages;
+ unsigned int ci_end = ci_off + nr_pages;
unsigned long swp_tb;
do {
swp_tb = __swap_table_get(ci, ci_off);
VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb));
- if (*map)
+ if (swp_tb_get_count(swp_tb))
return false;
- ++ci_off;
- } while (++map < map_end);
+ } while (++ci_off < ci_end);
return true;
}
* reference or pending writeback, and can't be allocated to others.
*/
ci = swap_cluster_lock(si, offset);
- need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages);
+ need_reclaim = swap_only_has_cache(ci, offset, nr_pages);
swap_cluster_unlock(ci);
if (!need_reclaim)
goto out_unlock;
} while (++ci_off < ci_end);
WARN_ON_ONCE(bad_slots != (swapoff ? ci->count : 0));
+ WARN_ON_ONCE(nr == SWAPFILE_CLUSTER && ci->extend_table);
}
static void swap_cluster_free_table(struct swap_cluster_info *ci)
pr_warn("Duplicated bad slot offset %d\n", offset);
ret = -EINVAL;
} else {
+		/* Mark the slot bad in the swap table (assumes a bad_to_swp_tb() helper) */
+		__swap_table_set(ci, offset % SWAPFILE_CLUSTER, bad_to_swp_tb());
ci->count++;
}
spin_unlock(&ci->lock);
{
unsigned int nr_pages = 1 << order;
unsigned long offset = start, end = start + nr_pages;
- unsigned char *map = si->swap_map;
unsigned long swp_tb;
spin_unlock(&ci->lock);
do {
- if (READ_ONCE(map[offset]))
- break;
swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
- if (swp_tb_is_folio(swp_tb)) {
+ if (swp_tb_get_count(swp_tb))
+ break;
+		if (swp_tb_is_folio(swp_tb)) {
			if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0)
				break;
		}
} while (++offset < end);
spin_lock(&ci->lock);
*/
for (offset = start; offset < end; offset++) {
swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
- if (map[offset] || !swp_tb_is_null(swp_tb))
+ if (!swp_tb_is_null(swp_tb))
return false;
}
unsigned long offset, unsigned int nr_pages,
bool *need_reclaim)
{
- unsigned long end = offset + nr_pages;
- unsigned char *map = si->swap_map;
+ unsigned int ci_off = offset % SWAPFILE_CLUSTER;
+ unsigned int ci_end = ci_off + nr_pages;
unsigned long swp_tb;
- if (cluster_is_empty(ci))
- return true;
-
do {
- if (map[offset])
- return false;
- swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
- if (swp_tb_is_folio(swp_tb)) {
+ swp_tb = __swap_table_get(ci, ci_off);
+ if (swp_tb_is_null(swp_tb))
+ continue;
+ if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) {
if (!vm_swap_full())
return false;
*need_reclaim = true;
- } else {
- /* A entry with no count and no cache must be null */
- VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
+ continue;
}
- } while (++offset < end);
+		/* A slot with zero count can only be NULL or a folio */
+ VM_WARN_ON(!swp_tb_get_count(swp_tb));
+ return false;
+ } while (++ci_off < ci_end);
return true;
}
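+/*
+ * Allocate a range of slots in one cluster: bind the slots to @folio in
+ * the swap cache, or set up one slot with a fake shadow count for
+ * hibernation if @folio is NULL.
+ */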
-static bool cluster_alloc_range(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- struct folio *folio,
- unsigned int offset)
+static bool __swap_cluster_alloc_entries(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ struct folio *folio,
+ unsigned int ci_off)
{
- unsigned long nr_pages;
unsigned int order;
+ unsigned long nr_pages;
lockdep_assert_held(&ci->lock);
if (likely(folio)) {
order = folio_order(folio);
nr_pages = 1 << order;
- swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false);
- __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset));
+ swap_cluster_assert_empty(ci, ci_off, nr_pages, false);
+ __swap_cache_add_folio(ci, folio, swp_entry(si->type,
+ ci_off + cluster_offset(si, ci)));
} else if (IS_ENABLED(CONFIG_HIBERNATION)) {
order = 0;
nr_pages = 1;
- WARN_ON_ONCE(si->swap_map[offset]);
- si->swap_map[offset] = 1;
- swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, 1, false);
+ swap_cluster_assert_empty(ci, ci_off, 1, false);
+		/* Set a fake shadow as a placeholder */
+ __swap_table_set(ci, ci_off, shadow_to_swp_tb(NULL, 1));
} else {
/* Allocation without folio is only possible with hibernation */
WARN_ON_ONCE(1);
if (!ret)
continue;
}
- if (!cluster_alloc_range(si, ci, folio, offset))
+ if (!__swap_cluster_alloc_entries(si, ci, folio, offset % SWAPFILE_CLUSTER))
break;
found = offset;
offset += nr_pages;
long to_scan = 1;
unsigned long offset, end;
struct swap_cluster_info *ci;
- unsigned char *map = si->swap_map;
+ unsigned long swp_tb;
int nr_reclaim;
if (force)
to_scan--;
while (offset < end) {
- if (!READ_ONCE(map[offset]) &&
- swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) {
+ swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
+ if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) {
spin_unlock(&ci->lock);
nr_reclaim = __try_to_reclaim_swap(si, offset,
TTRS_ANYWAY);
return false;
}
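+/*
+ * Allocate the overflow count table of a cluster. Allocations may race:
+ * only the first table is installed and the loser's copy is freed.
+ */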
+static int swap_extend_table_alloc(struct swap_info_struct *si,
+ struct swap_cluster_info *ci, gfp_t gfp)
+{
+ void *table;
+
+	table = kcalloc(SWAPFILE_CLUSTER, sizeof(ci->extend_table[0]), gfp);
+ if (!table)
+ return -ENOMEM;
+
+ spin_lock(&ci->lock);
+ if (!ci->extend_table)
+ ci->extend_table = table;
+ else
+ kfree(table);
+ spin_unlock(&ci->lock);
+ return 0;
+}
+
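+/*
+ * Allocate the extend table of the cluster containing @entry, typically
+ * with a less restrictive gfp mask after swap_dup_entry_direct() failed
+ * with -ENOMEM. Returns 0 if the device is gone: a raced swapoff is
+ * acceptable here.
+ */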
+int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp)
+{
+ int ret;
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+
+ si = get_swap_device(entry);
+ if (!si)
+ return 0;
+
+ ci = __swap_offset_to_cluster(si, offset);
+ ret = swap_extend_table_alloc(si, ci, gfp);
+
+ put_swap_device(si);
+ return ret;
+}
+
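+/* Free a cluster's overflow table once no slot holds an overflowed count */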
+static void swap_extend_table_try_free(struct swap_cluster_info *ci)
+{
+ unsigned long i;
+ bool can_free = true;
+
+ if (!ci->extend_table)
+ return;
+
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ if (ci->extend_table[i])
+ can_free = false;
+ }
+
+ if (can_free) {
+ kfree(ci->extend_table);
+ ci->extend_table = NULL;
+ }
+}
+
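+/*
+ * Slot swap counts live in the swap table. Once a count reaches
+ * SWP_TB_COUNT_MAX it overflows into ci->extend_table: the swap table
+ * keeps reporting SWP_TB_COUNT_MAX while extend_table[ci_off] holds the
+ * real count, starting at SWP_TB_COUNT_MAX.
+ */
+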
+/* Decrease the swap count of one slot, without freeing it */
+static void __swap_cluster_put_entry(struct swap_cluster_info *ci,
+ unsigned int ci_off)
+{
+ int count;
+ unsigned long swp_tb;
+
+ lockdep_assert_held(&ci->lock);
+ swp_tb = __swap_table_get(ci, ci_off);
+ count = __swp_tb_get_count(swp_tb);
+
+ VM_WARN_ON_ONCE(count <= 0);
+ VM_WARN_ON_ONCE(count > SWP_TB_COUNT_MAX);
+
+ if (count == SWP_TB_COUNT_MAX) {
+ count = ci->extend_table[ci_off];
+		/* The overflowed count starts at SWP_TB_COUNT_MAX */
+ VM_WARN_ON_ONCE(count < SWP_TB_COUNT_MAX);
+ count--;
+ if (count == (SWP_TB_COUNT_MAX - 1)) {
+ ci->extend_table[ci_off] = 0;
+ __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count));
+ swap_extend_table_try_free(ci);
+ } else {
+ ci->extend_table[ci_off] = count;
+ }
+ } else {
+ __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, --count));
+ }
+}
+
/**
- * swap_put_entries_cluster - Decrease the swap count of a set of slots.
+ * swap_put_entries_cluster - Decrease the swap count of slots within one cluster
* @si: The swap device.
- * @start: start offset of slots.
+ * @offset: start offset of slots.
* @nr: number of slots.
- * @reclaim_cache: if true, also reclaim the swap cache.
+ * @reclaim_cache: if true, also reclaim the swap cache if slots are freed.
*
* This helper decreases the swap count of a set of slots and tries to
* batch free them. Also reclaims the swap cache if @reclaim_cache is true.
- * Context: The caller must ensure that all slots belong to the same
- * cluster and their swap count doesn't go underflow.
+ *
+ * Context: The specified slots must be pinned by an existing swap count or
+ * a swap cache reference, so they won't be released until this helper returns.
*/
static void swap_put_entries_cluster(struct swap_info_struct *si,
- unsigned long start, int nr,
+ pgoff_t offset, int nr,
bool reclaim_cache)
{
- unsigned long offset = start, end = start + nr;
- unsigned long batch_start = SWAP_ENTRY_INVALID;
struct swap_cluster_info *ci;
+ unsigned int ci_off, ci_end;
+ pgoff_t end = offset + nr;
bool need_reclaim = false;
unsigned int nr_reclaimed;
unsigned long swp_tb;
- unsigned int count;
+ int ci_batch = -1;
ci = swap_cluster_lock(si, offset);
+ ci_off = offset % SWAPFILE_CLUSTER;
+ ci_end = ci_off + nr;
do {
- swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
- count = si->swap_map[offset];
- VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD);
- if (count == 1) {
+ swp_tb = __swap_table_get(ci, ci_off);
+ if (swp_tb_get_count(swp_tb) == 1) {
/* count == 1 and non-cached slots will be batch freed. */
if (!swp_tb_is_folio(swp_tb)) {
- if (!batch_start)
- batch_start = offset;
+ if (ci_batch == -1)
+ ci_batch = ci_off;
continue;
}
/* count will be 0 after put, slot can be reclaimed */
* slots will be freed when folio is removed from swap cache
* (__swap_cache_del_folio).
*/
- swap_put_entry_locked(si, ci, offset);
- if (batch_start) {
- swap_entries_free(si, ci, batch_start, offset - batch_start);
- batch_start = SWAP_ENTRY_INVALID;
+ __swap_cluster_put_entry(ci, ci_off);
+ if (ci_batch != -1) {
+ __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch);
+ ci_batch = -1;
}
- } while (++offset < end);
+ } while (++ci_off < ci_end);
- if (batch_start)
- swap_entries_free(si, ci, batch_start, offset - batch_start);
+ if (ci_batch != -1)
+ __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch);
swap_cluster_unlock(ci);
if (!need_reclaim || !reclaim_cache)
return;
- offset = start;
do {
nr_reclaimed = __try_to_reclaim_swap(si, offset,
TTRS_UNMAPPED | TTRS_FULL);
} while (offset < end);
}
+/* Increase the swap count of one slot. */
+static int __swap_cluster_dup_entry(struct swap_cluster_info *ci,
+ unsigned int ci_off)
+{
+ int count;
+ unsigned long swp_tb;
+
+ lockdep_assert_held(&ci->lock);
+ swp_tb = __swap_table_get(ci, ci_off);
+ /* Bad or special slots can't be handled */
+ if (WARN_ON_ONCE(swp_tb_is_bad(swp_tb)))
+ return -EINVAL;
+ count = __swp_tb_get_count(swp_tb);
+ /* Must be either cached or have a count already */
+ if (WARN_ON_ONCE(!count && !swp_tb_is_folio(swp_tb)))
+ return -ENOENT;
+
+ if (likely(count < (SWP_TB_COUNT_MAX - 1))) {
+ __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count + 1));
+ VM_WARN_ON_ONCE(ci->extend_table && ci->extend_table[ci_off]);
+ } else if (count == (SWP_TB_COUNT_MAX - 1)) {
+ if (ci->extend_table) {
+ VM_WARN_ON_ONCE(ci->extend_table[ci_off]);
+ ci->extend_table[ci_off] = SWP_TB_COUNT_MAX;
+ __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, SWP_TB_COUNT_MAX));
+ } else {
+ return -ENOMEM;
+ }
+ } else if (count == SWP_TB_COUNT_MAX) {
+ VM_WARN_ON_ONCE(ci->extend_table[ci_off] >=
+ type_max(typeof(ci->extend_table[0])));
+ ++ci->extend_table[ci_off];
+ } else {
+ /* Never happens unless counting went wrong */
+ WARN_ON_ONCE(1);
+ }
+
+ return 0;
+}
+
+/**
+ * swap_dup_entries_cluster - Increase the swap count of slots within one cluster.
+ * @si: The swap device.
+ * @offset: start offset of slots.
+ * @nr: number of slots.
+ *
+ * Context: The specified slots must be pinned by an existing swap count or
+ * a swap cache reference, so they won't be released until this helper returns.
+ * Return: 0 on success. -ENOMEM if the swap count maxed out (SWP_TB_COUNT_MAX)
+ * and the extend table could not be allocated, -EINVAL if any entry is a bad
+ * entry, -ENOENT if any slot is not currently in use.
+ */
+static int swap_dup_entries_cluster(struct swap_info_struct *si,
+ pgoff_t offset, int nr)
+{
+ int err;
+ struct swap_cluster_info *ci;
+ unsigned int ci_start, ci_off, ci_end;
+
+ ci_start = offset % SWAPFILE_CLUSTER;
+ ci_end = ci_start + nr;
+ ci_off = ci_start;
+ ci = swap_cluster_lock(si, offset);
+restart:
+ do {
+ err = __swap_cluster_dup_entry(ci, ci_off);
+ if (unlikely(err)) {
+ if (err == -ENOMEM) {
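+				/*
+				 * The count overflowed but the cluster has no
+				 * extend table yet: drop the lock, allocate one
+				 * atomically, then retry the failed slot.
+				 */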
+ spin_unlock(&ci->lock);
+ err = swap_extend_table_alloc(si, ci, GFP_ATOMIC);
+ spin_lock(&ci->lock);
+ if (!err)
+ goto restart;
+ }
+ goto failed;
+ }
+ } while (++ci_off < ci_end);
+ swap_cluster_unlock(ci);
+ return 0;
+failed:
+ while (ci_off-- > ci_start)
+ __swap_cluster_put_entry(ci, ci_off);
+ swap_extend_table_try_free(ci);
+ swap_cluster_unlock(ci);
+ return err;
+}
+
/**
* folio_alloc_swap - allocate swap space for a folio
* @folio: folio we want to move to swap
* Context: Caller must ensure the folio is locked and in the swap cache.
* NOTE: The caller also has to ensure there is no raced call to
* swap_put_entries_direct on its swap entry before this helper returns, or
- * the swap map may underflow. Currently, we only accept @subpage == NULL
- * for shmem due to the limitation of swap continuation: shmem always
- * duplicates the swap entry only once, so there is no such issue for it.
+ * the swap count may underflow.
*/
int folio_dup_swap(struct folio *folio, struct page *subpage)
{
- int err = 0;
swp_entry_t entry = folio->swap;
unsigned long nr_pages = folio_nr_pages(folio);
nr_pages = 1;
}
- while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM)
- err = add_swap_count_continuation(entry, GFP_ATOMIC);
-
- return err;
+ return swap_dup_entries_cluster(swap_entry_to_info(entry),
+ swp_offset(entry), nr_pages);
}
/**
swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false);
}
-static void swap_put_entry_locked(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- unsigned long offset)
-{
- unsigned char count;
-
- count = si->swap_map[offset];
- if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
- if (count == COUNT_CONTINUED) {
- if (swap_count_continued(si, offset, count))
- count = SWAP_MAP_MAX | COUNT_CONTINUED;
- else
- count = SWAP_MAP_MAX;
- } else
- count--;
- }
-
- WRITE_ONCE(si->swap_map[offset], count);
- if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))
- swap_entries_free(si, ci, offset, 1);
-}
-
/*
* When we get a swap entry, if there aren't some other ways to
* prevent swapoff, such as the folio in swap cache is locked, RCU
}
/*
- * Drop the last ref of swap entries, caller have to ensure all entries
- * belong to the same cgroup and cluster.
+ * Free a set of swap slots whose swap count has dropped to zero, or will
+ * drop to zero once the last ref is put (saving one __swap_cluster_put_entry
+ * call).
*/
-void swap_entries_free(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- unsigned long offset, unsigned int nr_pages)
+void __swap_cluster_free_entries(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned int ci_start, unsigned int nr_pages)
{
- swp_entry_t entry = swp_entry(si->type, offset);
- unsigned char *map = si->swap_map + offset;
- unsigned char *map_end = map + nr_pages;
+ unsigned long old_tb;
+ unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages;
+ unsigned long offset = cluster_offset(si, ci) + ci_start;
- /* It should never free entries across different clusters */
- VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1));
- VM_BUG_ON(cluster_is_empty(ci));
- VM_BUG_ON(ci->count < nr_pages);
+ VM_WARN_ON(ci->count < nr_pages);
ci->count -= nr_pages;
do {
- VM_WARN_ON(*map > 1);
- *map = 0;
- } while (++map < map_end);
+ old_tb = __swap_table_get(ci, ci_off);
+		/* Freed with the last ref held, or after the swap cache is dropped */
+ VM_WARN_ON(!swp_tb_is_shadow(old_tb) || __swp_tb_get_count(old_tb) > 1);
+ __swap_table_set(ci, ci_off, null_to_swp_tb());
+ } while (++ci_off < ci_end);
- mem_cgroup_uncharge_swap(entry, nr_pages);
+ mem_cgroup_uncharge_swap(swp_entry(si->type, offset), nr_pages);
swap_range_free(si, offset, nr_pages);
- swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false);
+ swap_cluster_assert_empty(ci, ci_start, nr_pages, false);
if (!ci->count)
free_cluster(si, ci);
int __swap_count(swp_entry_t entry)
{
- struct swap_info_struct *si = __swap_entry_to_info(entry);
- pgoff_t offset = swp_offset(entry);
+ struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
+ unsigned int ci_off = swp_cluster_offset(entry);
- return si->swap_map[offset];
+ return swp_tb_get_count(__swap_table_get(ci, ci_off));
}
/**
{
pgoff_t offset = swp_offset(entry);
struct swap_cluster_info *ci;
- int count;
+ unsigned long swp_tb;
ci = swap_cluster_lock(si, offset);
- count = si->swap_map[offset];
+ swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
swap_cluster_unlock(ci);
- return count && count != SWAP_MAP_BAD;
+ return swp_tb_get_count(swp_tb) > 0;
}
/*
* How many references to @entry are currently swapped out?
- * This considers COUNT_CONTINUED so it returns exact answer.
+ * This reads the extend table when needed, so it returns the exact count.
*/
int swp_swapcount(swp_entry_t entry)
{
- int count, tmp_count, n;
struct swap_info_struct *si;
struct swap_cluster_info *ci;
- struct page *page;
- pgoff_t offset;
- unsigned char *map;
+ unsigned long swp_tb;
+ int count;
si = get_swap_device(entry);
if (!si)
return 0;
- offset = swp_offset(entry);
-
- ci = swap_cluster_lock(si, offset);
-
- count = si->swap_map[offset];
- if (!(count & COUNT_CONTINUED))
- goto out;
-
- count &= ~COUNT_CONTINUED;
- n = SWAP_MAP_MAX + 1;
-
- page = vmalloc_to_page(si->swap_map + offset);
- offset &= ~PAGE_MASK;
- VM_BUG_ON(page_private(page) != SWP_CONTINUED);
-
- do {
- page = list_next_entry(page, lru);
- map = kmap_local_page(page);
- tmp_count = map[offset];
- kunmap_local(map);
-
- count += (tmp_count & ~COUNT_CONTINUED) * n;
- n *= (SWAP_CONT_MAX + 1);
- } while (tmp_count & COUNT_CONTINUED);
-out:
+ ci = swap_cluster_lock(si, swp_offset(entry));
+ swp_tb = __swap_table_get(ci, swp_cluster_offset(entry));
+ count = swp_tb_get_count(swp_tb);
+ if (count == SWP_TB_COUNT_MAX)
+ count = ci->extend_table[swp_cluster_offset(entry)];
swap_cluster_unlock(ci);
put_swap_device(si);
- return count;
+
+	/* A bad slot reports a negative count, treat it as zero */
+	return count < 0 ? 0 : count;
}
static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
swp_entry_t entry, int order)
{
struct swap_cluster_info *ci;
- unsigned char *map = si->swap_map;
unsigned int nr_pages = 1 << order;
unsigned long roffset = swp_offset(entry);
unsigned long offset = round_down(roffset, nr_pages);
+ unsigned int ci_off;
int i;
bool ret = false;
ci = swap_cluster_lock(si, offset);
if (nr_pages == 1) {
- if (map[roffset])
+ ci_off = roffset % SWAPFILE_CLUSTER;
+ if (swp_tb_get_count(__swap_table_get(ci, ci_off)))
ret = true;
goto unlock_out;
}
for (i = 0; i < nr_pages; i++) {
- if (map[offset + i]) {
+ ci_off = (offset + i) % SWAPFILE_CLUSTER;
+ if (swp_tb_get_count(__swap_table_get(ci, ci_off))) {
ret = true;
break;
}
return;
ci = swap_cluster_lock(si, offset);
- swap_put_entry_locked(si, ci, offset);
+ __swap_cluster_put_entry(ci, offset % SWAPFILE_CLUSTER);
+ __swap_cluster_free_entries(si, ci, offset % SWAPFILE_CLUSTER, 1);
swap_cluster_unlock(ci);
/* In theory readahead might add it to the swap cache by accident */
unsigned int type)
{
pte_t *pte = NULL;
- struct swap_info_struct *si;
- si = swap_info[type];
do {
struct folio *folio;
- unsigned long offset;
- unsigned char swp_count;
+ unsigned long swp_tb;
softleaf_t entry;
int ret;
pte_t ptent;
if (swp_type(entry) != type)
continue;
- offset = swp_offset(entry);
pte_unmap(pte);
pte = NULL;
&vmf);
}
if (!folio) {
- swp_count = READ_ONCE(si->swap_map[offset]);
- if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
+ swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
+ swp_cluster_offset(entry));
+ if (swp_tb_get_count(swp_tb) <= 0)
continue;
return -ENOMEM;
}
}
/*
- * Scan swap_map from current position to next entry still in use.
+ * Scan the swap table from the current position to the next entry still in use.
* Return 0 if there are no inuse entries after prev till end of
* the map.
*/
{
unsigned int i;
unsigned long swp_tb;
- unsigned char count;
/*
* No need for swap_lock here: we're just looking
* allocations from this area (while holding swap_lock).
*/
for (i = prev + 1; i < si->max; i++) {
- count = READ_ONCE(si->swap_map[i]);
swp_tb = swap_table_get(__swap_offset_to_cluster(si, i),
i % SWAPFILE_CLUSTER);
- if (count == SWAP_MAP_BAD)
- continue;
- if (count || swp_tb_is_folio(swp_tb))
+ if (!swp_tb_is_null(swp_tb) && !swp_tb_is_bad(swp_tb))
break;
if ((i % LATENCY_LIMIT) == 0)
cond_resched();
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
- unsigned char *swap_map;
unsigned long *zeromap;
struct swap_cluster_info *cluster_info;
struct file *swap_file, *victim;
flush_percpu_swap_cluster(p);
destroy_swap_extents(p, p->swap_file);
- if (p->flags & SWP_CONTINUED)
- free_swap_count_continuations(p);
if (!(p->flags & SWP_SOLIDSTATE))
atomic_dec(&nr_rotate_swap);
swap_file = p->swap_file;
p->swap_file = NULL;
- swap_map = p->swap_map;
- p->swap_map = NULL;
zeromap = p->zeromap;
p->zeromap = NULL;
maxpages = p->max;
mutex_unlock(&swapon_mutex);
kfree(p->global_cluster);
p->global_cluster = NULL;
- vfree(swap_map);
kvfree(zeromap);
free_swap_cluster_info(cluster_info, maxpages);
/* Destroy swap account information */
kvfree(defer);
}
spin_lock_init(&p->lock);
- spin_lock_init(&p->cont_lock);
atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
init_completion(&p->comp);
return maxpages;
}
-static int setup_swap_map(struct swap_info_struct *si,
- union swap_header *swap_header,
- unsigned long maxpages)
-{
- unsigned char *swap_map;
-
- swap_map = vzalloc(maxpages);
- si->swap_map = swap_map;
- if (!swap_map)
- return -ENOMEM;
- return 0;
-}
-
static int setup_swap_clusters_info(struct swap_info_struct *si,
union swap_header *swap_header,
unsigned long maxpages)
maxpages = si->max;
- /* Setup the swap map and apply bad block */
- error = setup_swap_map(si, swap_header, maxpages);
- if (error)
- goto bad_swap_unlock_inode;
-
/* Set up the swap cluster info */
error = setup_swap_clusters_info(si, swap_header, maxpages);
if (error)
inode = NULL;
destroy_swap_extents(si, swap_file);
swap_cgroup_swapoff(si->type);
- vfree(si->swap_map);
- si->swap_map = NULL;
free_swap_cluster_info(si->cluster_info, si->max);
si->cluster_info = NULL;
kvfree(si->zeromap);
spin_unlock(&swap_lock);
}
-/*
- * Verify that nr swap entries are valid and increment their swap map counts.
- *
- * Returns error code in following case.
- * - success -> 0
- * - swp_entry is invalid -> EINVAL
- * - swap-mapped reference is requested but the entry is not used. -> ENOENT
- * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
- */
-static int swap_dup_entries(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- unsigned long offset,
- unsigned char usage, int nr)
-{
- int i;
- unsigned char count;
-
- for (i = 0; i < nr; i++) {
- count = si->swap_map[offset + i];
- /*
- * For swapin out, allocator never allocates bad slots. for
- * swapin, readahead is guarded by swap_entry_swapped.
- */
- if (WARN_ON(count == SWAP_MAP_BAD))
- return -ENOENT;
- /*
- * Swap count duplication must be guarded by either swap cache folio (from
- * folio_dup_swap) or external lock of existing entry (from swap_dup_entry_direct).
- */
- if (WARN_ON(!count &&
- !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))))
- return -ENOENT;
- if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX))
- return -EINVAL;
- }
-
- for (i = 0; i < nr; i++) {
- count = si->swap_map[offset + i];
- if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
- count += usage;
- else if (swap_count_continued(si, offset + i, count))
- count = COUNT_CONTINUED;
- else {
- /*
- * Don't need to rollback changes, because if
- * usage == 1, there must be nr == 1.
- */
- return -ENOMEM;
- }
-
- WRITE_ONCE(si->swap_map[offset + i], count);
- }
-
- return 0;
-}
-
-static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
-{
- int err;
- struct swap_info_struct *si;
- struct swap_cluster_info *ci;
- unsigned long offset = swp_offset(entry);
-
- si = swap_entry_to_info(entry);
- if (WARN_ON_ONCE(!si)) {
- pr_err("%s%08lx\n", Bad_file, entry.val);
- return -EINVAL;
- }
-
- VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
- ci = swap_cluster_lock(si, offset);
- err = swap_dup_entries(si, ci, offset, usage, nr);
- swap_cluster_unlock(ci);
- return err;
-}
-
/*
* swap_dup_entry_direct() - Increase reference count of a swap entry by one.
* @entry: first swap entry from which we want to increase the refcount.
*
- * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
- * but could not be atomically allocated. Returns 0, just as if it succeeded,
- * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
- * might occur if a page table entry has got corrupted.
+ * Returns 0 for success, or -ENOMEM if the extend table is required
+ * but could not be atomically allocated. Returns -EINVAL if the swap
+ * entry is invalid, or -ENOENT if it is not in use, which might occur
+ * if a page table entry has got corrupted.
*
* Context: Caller must ensure there is no race condition on the reference
* owner. e.g., locking the PTL of a PTE containing the entry being increased.
*/
int swap_dup_entry_direct(swp_entry_t entry)
-{
- int err = 0;
- while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM)
- err = add_swap_count_continuation(entry, GFP_ATOMIC);
- return err;
-}
-
-/*
- * add_swap_count_continuation - called when a swap count is duplicated
- * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
- * page of the original vmalloc'ed swap_map, to hold the continuation count
- * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
- * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
- *
- * These continuation pages are seldom referenced: the common paths all work
- * on the original swap_map, only referring to a continuation page when the
- * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
- *
- * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
- * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
- * can be called after dropping locks.
- */
-int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
struct swap_info_struct *si;
- struct swap_cluster_info *ci;
- struct page *head;
- struct page *page;
- struct page *list_page;
- pgoff_t offset;
- unsigned char count;
- int ret = 0;
-
- /*
- * When debugging, it's easier to use __GFP_ZERO here; but it's better
- * for latency not to zero a page while GFP_ATOMIC and holding locks.
- */
- page = alloc_page(gfp_mask | __GFP_HIGHMEM);
-
- si = get_swap_device(entry);
- if (!si) {
- /*
- * An acceptable race has occurred since the failing
- * __swap_duplicate(): the swap device may be swapoff
- */
- goto outer;
- }
-
- offset = swp_offset(entry);
-
- ci = swap_cluster_lock(si, offset);
-
- count = si->swap_map[offset];
-
- if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
- /*
- * The higher the swap count, the more likely it is that tasks
- * will race to add swap count continuation: we need to avoid
- * over-provisioning.
- */
- goto out;
- }
-
- if (!page) {
- ret = -ENOMEM;
- goto out;
- }
-
- head = vmalloc_to_page(si->swap_map + offset);
- offset &= ~PAGE_MASK;
-
- spin_lock(&si->cont_lock);
- /*
- * Page allocation does not initialize the page's lru field,
- * but it does always reset its private field.
- */
- if (!page_private(head)) {
- BUG_ON(count & COUNT_CONTINUED);
- INIT_LIST_HEAD(&head->lru);
- set_page_private(head, SWP_CONTINUED);
- si->flags |= SWP_CONTINUED;
- }
-
- list_for_each_entry(list_page, &head->lru, lru) {
- unsigned char *map;
-
- /*
- * If the previous map said no continuation, but we've found
- * a continuation page, free our allocation and use this one.
- */
- if (!(count & COUNT_CONTINUED))
- goto out_unlock_cont;
-
- map = kmap_local_page(list_page) + offset;
- count = *map;
- kunmap_local(map);
-
- /*
- * If this continuation count now has some space in it,
- * free our allocation and use this one.
- */
- if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
- goto out_unlock_cont;
- }
- list_add_tail(&page->lru, &head->lru);
- page = NULL; /* now it's attached, don't free it */
-out_unlock_cont:
- spin_unlock(&si->cont_lock);
-out:
- swap_cluster_unlock(ci);
- put_swap_device(si);
-outer:
- if (page)
- __free_page(page);
- return ret;
-}
-
-/*
- * swap_count_continued - when the original swap_map count is incremented
- * from SWAP_MAP_MAX, check if there is already a continuation page to carry
- * into, carry if so, or else fail until a new continuation page is allocated;
- * when the original swap_map count is decremented from 0 with continuation,
- * borrow from the continuation and report whether it still holds more.
- * Called while __swap_duplicate() or caller of swap_put_entry_locked()
- * holds cluster lock.
- */
-static bool swap_count_continued(struct swap_info_struct *si,
- pgoff_t offset, unsigned char count)
-{
- struct page *head;
- struct page *page;
- unsigned char *map;
- bool ret;
-
- head = vmalloc_to_page(si->swap_map + offset);
- if (page_private(head) != SWP_CONTINUED) {
- BUG_ON(count & COUNT_CONTINUED);
- return false; /* need to add count continuation */
- }
-
- spin_lock(&si->cont_lock);
- offset &= ~PAGE_MASK;
- page = list_next_entry(head, lru);
- map = kmap_local_page(page) + offset;
-
- if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
- goto init_map; /* jump over SWAP_CONT_MAX checks */
-
- if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
- /*
- * Think of how you add 1 to 999
- */
- while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
- kunmap_local(map);
- page = list_next_entry(page, lru);
- BUG_ON(page == head);
- map = kmap_local_page(page) + offset;
- }
- if (*map == SWAP_CONT_MAX) {
- kunmap_local(map);
- page = list_next_entry(page, lru);
- if (page == head) {
- ret = false; /* add count continuation */
- goto out;
- }
- map = kmap_local_page(page) + offset;
-init_map: *map = 0; /* we didn't zero the page */
- }
- *map += 1;
- kunmap_local(map);
- while ((page = list_prev_entry(page, lru)) != head) {
- map = kmap_local_page(page) + offset;
- *map = COUNT_CONTINUED;
- kunmap_local(map);
- }
- ret = true; /* incremented */
-
- } else { /* decrementing */
- /*
- * Think of how you subtract 1 from 1000
- */
- BUG_ON(count != COUNT_CONTINUED);
- while (*map == COUNT_CONTINUED) {
- kunmap_local(map);
- page = list_next_entry(page, lru);
- BUG_ON(page == head);
- map = kmap_local_page(page) + offset;
- }
- BUG_ON(*map == 0);
- *map -= 1;
- if (*map == 0)
- count = 0;
- kunmap_local(map);
- while ((page = list_prev_entry(page, lru)) != head) {
- map = kmap_local_page(page) + offset;
- *map = SWAP_CONT_MAX | count;
- count = COUNT_CONTINUED;
- kunmap_local(map);
- }
- ret = count == COUNT_CONTINUED;
+
+	si = swap_entry_to_info(entry);
+ if (WARN_ON_ONCE(!si)) {
+ pr_err("%s%08lx\n", Bad_file, entry.val);
+ return -EINVAL;
}
-out:
- spin_unlock(&si->cont_lock);
- return ret;
-}
-/*
- * free_swap_count_continuations - swapoff free all the continuation pages
- * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
- */
-static void free_swap_count_continuations(struct swap_info_struct *si)
-{
- pgoff_t offset;
-
- for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
- struct page *head;
- head = vmalloc_to_page(si->swap_map + offset);
- if (page_private(head)) {
- struct page *page, *next;
-
- list_for_each_entry_safe(page, next, &head->lru, lru) {
- list_del(&page->lru);
- __free_page(page);
- }
- }
- }
+ return swap_dup_entries_cluster(si, swp_offset(entry), 1);
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)