int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
-void huge_pmd_set_accessed(struct vm_fault *vmf);
+bool huge_pmd_set_accessed(struct vm_fault *vmf);
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
struct vm_area_struct *vma);
EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+/**
+ * touch_pmd - Mark page table pmd entry as accessed and dirty (for write)
+ * @vma: The VMA covering @addr
+ * @addr: The virtual address
+ * @pmd: pmd pointer into the page table mapping @addr
+ * @write: Whether it's a write access
+ *
+ * Return: whether the pmd entry is changed
+ */
+bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, bool write)
{
- pmd_t _pmd;
+ pmd_t entry;
- _pmd = pmd_mkyoung(*pmd);
+ entry = pmd_mkyoung(*pmd);
if (write)
- _pmd = pmd_mkdirty(_pmd);
+ entry = pmd_mkdirty(entry);
if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
- pmd, _pmd, write))
+ pmd, entry, write)) {
update_mmu_cache_pmd(vma, addr, pmd);
+ return true;
+ }
+
+ return false;
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-void huge_pmd_set_accessed(struct vm_fault *vmf)
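+/**
+ * huge_pmd_set_accessed - Mark the pmd entry mapping @vmf->address as accessed
+ * @vmf: The page fault descriptor; the caller holds vmf->ptl, the pmd lock
+ *
+ * The entry is also marked dirty for a write fault.
+ *
+ * Return: whether the pmd entry is changed
+ */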
+bool huge_pmd_set_accessed(struct vm_fault *vmf)
{
bool write = vmf->flags & FAULT_FLAG_WRITE;
- vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
- goto unlock;
-
- touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
+ return false;
-unlock:
- spin_unlock(vmf->ptl);
+ return touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
}
static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
return VM_FAULT_FALLBACK;
}
+/*
+ * A page fault may be spurious because of racy access to the page
+ * table. For example, if a non-populated virtual page is accessed on
+ * two CPUs simultaneously, a page fault is triggered on both CPUs.
+ * One CPU (say CPU A) may then find no reason for its page fault,
+ * because the other CPU (say CPU B) has already changed the page
+ * table before the PTE is checked on CPU A. Most of the time such
+ * spurious page faults can be ignored safely. However, if the page
+ * fault is for a write access, a stale read-only TLB entry may still
+ * exist on the local CPU and needs to be flushed on some
+ * architectures. Doing that is called spurious page fault fixing.
+ *
+ * Note: most architectures use the defaults, where
+ * flush_tlb_fix_spurious_fault() is defined as flush_tlb_page() and
+ * flush_tlb_fix_spurious_fault_pmd() is defined as a NOP.
+ */
+static void fix_spurious_fault(struct vm_fault *vmf,
+ enum pgtable_level ptlevel)
+{
+ /* Skip spurious TLB flush for retried page fault */
+ if (vmf->flags & FAULT_FLAG_TRIED)
+ return;
+ /*
+ * This is needed only for protection faults but the arch code
+ * is not yet telling us if this is a protection fault or not.
+ * This still avoids useless tlb flushes for .text page faults
+ * with threads.
+ */
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ if (ptlevel == PGTABLE_LEVEL_PTE)
+ flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
+ vmf->pte);
+ else
+ flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address,
+ vmf->pmd);
+ }
+}
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
- vmf->flags & FAULT_FLAG_WRITE)) {
+ vmf->flags & FAULT_FLAG_WRITE))
update_mmu_cache_range(vmf, vmf->vma, vmf->address,
vmf->pte, 1);
- } else {
- /* Skip spurious TLB flush for retried page fault */
- if (vmf->flags & FAULT_FLAG_TRIED)
- goto unlock;
- /*
- * This is needed only for protection faults but the arch code
- * is not yet telling us if this is a protection fault or not.
- * This still avoids useless tlb flushes for .text page faults
- * with threads.
- */
- if (vmf->flags & FAULT_FLAG_WRITE)
- flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
- vmf->pte);
- }
+ else
+ fix_spurious_fault(vmf, PGTABLE_LEVEL_PTE);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- huge_pmd_set_accessed(&vmf);
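+ /*
+ * Take the pmd lock here so that huge_pmd_set_accessed() and any
+ * spurious-fault TLB flush run under the same lock, as is done
+ * at the pte level.
+ */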
+ vmf.ptl = pmd_lock(mm, vmf.pmd);
+ if (!huge_pmd_set_accessed(&vmf))
+ fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD);
+ spin_unlock(vmf.ptl);
return 0;
}
}