mm/shmem, swap: rework swap entry and index calculation for large swapin

author Kairui Song <kasong@tencent.com>
Mon, 28 Jul 2025 07:53:05 +0000 (15:53 +0800)

committer Andrew Morton <akpm@linux-foundation.org>
Sat, 2 Aug 2025 19:06:13 +0000 (12:06 -0700)

diff --git a/mm/shmem.c b/mm/shmem.c

index 0de37d0145243942f802388181ff75c1507352e7..33d30ee5bc8471cf183d231b50f81118bd8ba390 100644 (file)
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2302,7 +2302,7 @@ unlock:
         if (xas_error(&xas))
                 return xas_error(&xas);
  
-       return entry_order;
+       return 0;
  }
  
  /*
@@ -2323,7 +2323,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
         struct swap_info_struct *si;
         struct folio *folio = NULL;
         bool skip_swapcache = false;
-       int error, nr_pages, order, split_order;
+       int error, nr_pages, order;
         pgoff_t offset;
  
         VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
@@ -2331,11 +2331,11 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
         swap = index_entry;
         *foliop = NULL;
  
-       if (is_poisoned_swp_entry(swap))
+       if (is_poisoned_swp_entry(index_entry))
                 return -EIO;
  
-       si = get_swap_device(swap);
-       order = shmem_confirm_swap(mapping, index, swap);
+       si = get_swap_device(index_entry);
+       order = shmem_confirm_swap(mapping, index, index_entry);
         if (unlikely(!si)) {
                 if (order < 0)
                         return -EEXIST;
@@ -2347,6 +2347,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
                 return -EEXIST;
         }
  
+       /* index may point to the middle of a large entry, get the sub entry */
+       if (order) {
+               offset = index - round_down(index, 1 << order);
+               swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
+       }
+
         /* Look it up and read it in.. */
         folio = swap_cache_get_folio(swap, NULL, 0);
         if (!folio) {
@@ -2359,7 +2365,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
  
                 if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
                         /* Direct swapin skipping swap cache & readahead */
-                       folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
+                       folio = shmem_swap_alloc_folio(inode, vma, index,
+                                                      index_entry, order, gfp);
                         if (IS_ERR(folio)) {
                                 error = PTR_ERR(folio);
                                 folio = NULL;
@@ -2367,16 +2374,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
                         }
                         skip_swapcache = true;
                 } else {
-                       /*
-                        * Cached swapin only supports order 0 folio, it is
-                        * necessary to recalculate the new swap entry based on
-                        * the offset, as the swapin index might be unalgined.
-                        */
-                       if (order) {
-                               offset = index - round_down(index, 1 << order);
-                               swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
-                       }
-
+                       /* Cached swapin only supports order 0 folio */
                         folio = shmem_swapin_cluster(swap, gfp, info, index);
                         if (!folio) {
                                 error = -ENOMEM;
@@ -2384,6 +2382,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
                         }
                 }
         }
+
         if (order > folio_order(folio)) {
                 /*
                  * Swapin may get smaller folios due to various reasons:
@@ -2393,24 +2392,25 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
                  * large swap entries. In such cases, we should split the
                  * large swap entry to prevent possible data corruption.
                  */
-               split_order = shmem_split_large_entry(inode, index, index_entry, gfp);
-               if (split_order < 0) {
-                       error = split_order;
+               error = shmem_split_large_entry(inode, index, index_entry, gfp);
+               if (error)
                         goto failed_nolock;
-               }
+       }
  
-               /*
-                * If the large swap entry has already been split, it is
-                * necessary to recalculate the new swap entry based on
-                * the old order alignment.
-                */
-               if (split_order > 0) {
-                       offset = index - round_down(index, 1 << split_order);
-                       swap = swp_entry(swp_type(swap), swp_offset(index_entry) + offset);
-               }
-       } else if (order < folio_order(folio)) {
-               swap.val = round_down(swap.val, 1 << folio_order(folio));
-               index = round_down(index, 1 << folio_order(folio));
+       /*
+        * If the folio is large, round down swap and index by folio size.
+        * No matter what race occurs, the swap layer ensures we either get
+        * a valid folio that has its swap entry aligned by size, or a
+        * temporarily invalid one which we'll abort very soon and retry.
+        *
+        * shmem_add_to_page_cache ensures the whole range contains expected
+        * entries and prevents any corruption, so any race split is fine
+        * too, it will succeed as long as the entries are still there.
+        */
+       nr_pages = folio_nr_pages(folio);
+       if (nr_pages > 1) {
+               swap.val = round_down(swap.val, nr_pages);
+               index = round_down(index, nr_pages);
         }
  
         /*
@@ -2446,8 +2446,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
                         goto failed;
         }
  
-       error = shmem_add_to_page_cache(folio, mapping,
-                                       round_down(index, nr_pages),
+       error = shmem_add_to_page_cache(folio, mapping, index,
                                         swp_to_radix_entry(swap), gfp);
         if (error)
                 goto failed;
author	Kairui Song <kasong@tencent.com>
	Mon, 28 Jul 2025 07:53:05 +0000 (15:53 +0800)
committer	Andrew Morton <akpm@linux-foundation.org>
	Sat, 2 Aug 2025 19:06:13 +0000 (12:06 -0700)