mm,hugetlb: change mechanism to detect a COW on private mapping

author Oscar Salvador <osalvador@suse.de>

Mon, 30 Jun 2025 14:42:08 +0000 (16:42 +0200)

committer Andrew Morton <akpm@linux-foundation.org>

Sun, 13 Jul 2025 23:38:22 +0000 (16:38 -0700)
author Oscar Salvador <osalvador@suse.de>
Mon, 30 Jun 2025 14:42:08 +0000 (16:42 +0200)
committer Andrew Morton <akpm@linux-foundation.org>
Sun, 13 Jul 2025 23:38:22 +0000 (16:38 -0700)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index db53ead8ac4340a7e8d5f19d0ac41751b941b1ea..cf5d5ad5bbe9dedfe76d6506e04f558d955b86ae 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6131,8 +6131,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
   * cannot race with other handlers or page migration.
   * Keep the pte_same checks anyway to make transition from the mutex easier.
   */
-static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
-                      struct vm_fault *vmf)
+static vm_fault_t hugetlb_wp(struct vm_fault *vmf)
  {
         struct vm_area_struct *vma = vmf->vma;
         struct mm_struct *mm = vma->vm_mm;
@@ -6194,16 +6193,17 @@ retry_avoidcopy:
                        PageAnonExclusive(&old_folio->page), &old_folio->page);
  
         /*
-        * If the process that created a MAP_PRIVATE mapping is about to
-        * perform a COW due to a shared page count, attempt to satisfy
-        * the allocation without using the existing reserves. The pagecache
-        * page is used to determine if the reserve at this address was
-        * consumed or not. If reserves were used, a partial faulted mapping
-        * at the time of fork() could consume its reserves on COW instead
-        * of the full address range.
+        * If the process that created a MAP_PRIVATE mapping is about to perform
+        * a COW due to a shared page count, attempt to satisfy the allocation
+        * without using the existing reserves.
+        * In order to determine whether this is a COW on a MAP_PRIVATE mapping it
+        * is enough to check whether the old_folio is anonymous. This means that
+        * the reserve for this address was consumed. If reserves were used, a
+        * partial faulted mapping at the time of fork() could consume its reserves
+        * on COW instead of the full address range.
          */
         if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
-                       old_folio != pagecache_folio)
+           folio_test_anon(old_folio))
                 cow_from_owner = true;
  
         folio_get(old_folio);
@@ -6582,7 +6582,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
         hugetlb_count_add(pages_per_huge_page(h), mm);
         if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                 /* Optimization, do the COW without a second fault */
-               ret = hugetlb_wp(folio, vmf);
+               ret = hugetlb_wp(vmf);
         }
  
         spin_unlock(vmf->ptl);
@@ -6650,10 +6650,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         vm_fault_t ret;
         u32 hash;
         struct folio *folio = NULL;
-       struct folio *pagecache_folio = NULL;
         struct hstate *h = hstate_vma(vma);
         struct address_space *mapping;
-       int need_wait_lock = 0;
+       bool need_wait_lock = false;
         struct vm_fault vmf = {
                 .vma = vma,
                 .address = address & huge_page_mask(h),
@@ -6748,8 +6747,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
          * If we are going to COW/unshare the mapping later, we examine the
          * pending reservations for this page now. This will ensure that any
          * allocations necessary to record that reservation occur outside the
-        * spinlock. Also lookup the pagecache page now as it is used to
-        * determine if a reservation has been consumed.
+        * spinlock.
          */
         if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
             !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
@@ -6759,11 +6757,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 }
                 /* Just decrements count, does not deallocate */
                 vma_end_reservation(h, vma, vmf.address);
-
-               pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
-                                                            vmf.pgoff);
-               if (IS_ERR(pagecache_folio))
-                       pagecache_folio = NULL;
         }
  
         vmf.ptl = huge_pte_lock(h, mm, vmf.pte);
@@ -6777,10 +6770,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
             (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
                 if (!userfaultfd_wp_async(vma)) {
                         spin_unlock(vmf.ptl);
-                       if (pagecache_folio) {
-                               folio_unlock(pagecache_folio);
-                               folio_put(pagecache_folio);
-                       }
                         hugetlb_vma_unlock_read(vma);
                         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                         return handle_userfault(&vmf, VM_UFFD_WP);
@@ -6792,24 +6781,19 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 /* Fallthrough to CoW */
         }
  
-       /*
-        * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and
-        * pagecache_folio, so here we need take the former one
-        * when folio != pagecache_folio or !pagecache_folio.
-        */
-       folio = page_folio(pte_page(vmf.orig_pte));
-       if (folio != pagecache_folio)
-               if (!folio_trylock(folio)) {
-                       need_wait_lock = 1;
-                       goto out_ptl;
-               }
-
-       folio_get(folio);
-
         if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
                 if (!huge_pte_write(vmf.orig_pte)) {
-                       ret = hugetlb_wp(pagecache_folio, &vmf);
-                       goto out_put_page;
+                       /* hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) */
+                       folio = page_folio(pte_page(vmf.orig_pte));
+                       if (!folio_trylock(folio)) {
+                               need_wait_lock = true;
+                               goto out_ptl;
+                       }
+                       folio_get(folio);
+                       ret = hugetlb_wp(&vmf);
+                       folio_unlock(folio);
+                       folio_put(folio);
+                       goto out_ptl;
                 } else if (likely(flags & FAULT_FLAG_WRITE)) {
                         vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
                 }
@@ -6818,17 +6802,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
                                                 flags & FAULT_FLAG_WRITE))
                 update_mmu_cache(vma, vmf.address, vmf.pte);
-out_put_page:
-       if (folio != pagecache_folio)
-               folio_unlock(folio);
-       folio_put(folio);
  out_ptl:
         spin_unlock(vmf.ptl);
-
-       if (pagecache_folio) {
-               folio_unlock(pagecache_folio);
-               folio_put(pagecache_folio);
-       }
  out_mutex:
         hugetlb_vma_unlock_read(vma);
  
@@ -6841,11 +6816,16 @@ out_mutex:
  
         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
         /*
-        * Generally it's safe to hold refcount during waiting page lock. But
-        * here we just wait to defer the next page fault to avoid busy loop and
-        * the page is not used after unlocked before returning from the current
-        * page fault. So we are safe from accessing freed page, even if we wait
-        * here without taking refcount.
+        * hugetlb_wp() drops all the locks except the folio lock before trying to
+        * unmap the folio from other processes. During that window, if another
+        * process mapping that folio faults in, it will take the mutex and then
+        * it will wait on folio_lock, causing an ABBA deadlock.
+        * Use trylock instead and bail out if we fail.
+        *
+        * Ideally, we should hold a refcount on the folio we wait for, but we do
+        * not want to use the folio after it becomes unlocked, but rather just
+        * wait for it to become unlocked, so hopefully the next fault succeeds on
+        * the trylock.
          */
         if (need_wait_lock)
                 folio_wait_locked(folio);
author	Oscar Salvador <osalvador@suse.de>
	Mon, 30 Jun 2025 14:42:08 +0000 (16:42 +0200)
committer	Andrew Morton <akpm@linux-foundation.org>
	Sun, 13 Jul 2025 23:38:22 +0000 (16:38 -0700)