mm/hugetlb: unshare page tables during VMA split, not before

author Jann Horn <jannh@google.com>
Tue, 27 May 2025 21:23:53 +0000 (23:23 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 27 Jun 2025 10:04:21 +0000 (11:04 +0100)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h

index 90c66b9458c317f574d4197a2c73974817cf9a25..1c03935aa3d136f2ffc94db8a3a31fa6d5b59151 100644 (file)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -188,6 +188,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                 unsigned long address, unsigned long end, pgprot_t newprot);
  
  bool is_hugetlb_entry_migration(pte_t pte);
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
  
  #else /* !CONFIG_HUGETLB_PAGE */
  
@@ -369,6 +371,10 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
         return 0;
  }
  
+static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
+
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
+
  #endif /* !CONFIG_HUGETLB_PAGE */
  /*
   * hugepages at page global directory. If arch support
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 1be0d9a88e6c5f71dd89c7675a5f459e1312b842..0711f91f5c5ecb5263219611cddee55b6699beaa 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -97,7 +97,7 @@ static inline void ClearPageHugeFreed(struct page *head)
  /* Forward declaration */
  static int hugetlb_acct_memory(struct hstate *h, long delta);
  static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
-               unsigned long start, unsigned long end);
+               unsigned long start, unsigned long end, bool take_locks);
  
  static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
  {
@@ -3699,26 +3699,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
  {
         if (addr & ~(huge_page_mask(hstate_vma(vma))))
                 return -EINVAL;
+       return 0;
+}
  
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
         /*
          * PMD sharing is only possible for PUD_SIZE-aligned address ranges
          * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
          * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+        * This function is called in the middle of a VMA split operation, with
+        * MM, VMA and rmap all write-locked to prevent concurrent page table
+        * walks (except hardware and gup_fast()).
          */
+       mmap_assert_write_locked(vma->vm_mm);
+       i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
         if (addr & ~PUD_MASK) {
-               /*
-                * hugetlb_vm_op_split is called right before we attempt to
-                * split the VMA. We will need to unshare PMDs in the old and
-                * new VMAs, so let's unshare before we split.
-                */
                 unsigned long floor = addr & PUD_MASK;
                 unsigned long ceil = floor + PUD_SIZE;
  
-               if (floor >= vma->vm_start && ceil <= vma->vm_end)
-                       hugetlb_unshare_pmds(vma, floor, ceil);
+               if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+                       /*
+                        * Locking:
+                        * Use take_locks=false here.
+                        * The file rmap lock is already held.
+                        * The hugetlb VMA lock can't be taken when we already
+                        * hold the file rmap lock, and we don't need it because
+                        * its purpose is to synchronize against concurrent page
+                        * table walks, which are not possible thanks to the
+                        * locks held by our caller.
+                        */
+                       hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+               }
         }
-
-       return 0;
  }
  
  static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -5727,9 +5741,16 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
         }
  }
  
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
  static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
                                    unsigned long start,
-                                  unsigned long end)
+                                  unsigned long end,
+                                  bool take_locks)
  {
         struct hstate *h = hstate_vma(vma);
         unsigned long sz = huge_page_size(h);
@@ -5753,7 +5774,11 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                 start, end);
         mmu_notifier_invalidate_range_start(&range);
-       i_mmap_lock_write(vma->vm_file->f_mapping);
+       if (take_locks) {
+               i_mmap_lock_write(vma->vm_file->f_mapping);
+       } else {
+               i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+       }
         for (address = start; address < end; address += PUD_SIZE) {
                 ptep = huge_pte_offset(mm, address, sz);
                 if (!ptep)
@@ -5763,7 +5788,9 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
                 spin_unlock(ptl);
         }
         flush_hugetlb_tlb_range(vma, start, end);
-       i_mmap_unlock_write(vma->vm_file->f_mapping);
+       if (take_locks) {
+               i_mmap_unlock_write(vma->vm_file->f_mapping);
+       }
         /*
          * No need to call mmu_notifier_invalidate_range(), see
          * Documentation/mm/mmu_notifier.rst.
diff --git a/mm/mmap.c b/mm/mmap.c

index 9f76625a17439fd38f409c72d860ab16214cd061..8c188ed3738ac8e2088a58cd923fe4c891a9a0c1 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -832,7 +832,15 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
                 }
         }
  again:
+       /*
+        * Get rid of huge pages and shared page tables straddling the split
+        * boundary.
+        */
         vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
+       if (is_vm_hugetlb_page(orig_vma)) {
+               hugetlb_split(orig_vma, start);
+               hugetlb_split(orig_vma, end);
+       }
  
         if (file) {
                 mapping = file->f_mapping;
author	Jann Horn <jannh@google.com>
	Tue, 27 May 2025 21:23:53 +0000 (23:23 +0200)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Fri, 27 Jun 2025 10:04:21 +0000 (11:04 +0100)
include/linux/hugetlb.h		patch | blob | blame | history
mm/hugetlb.c		patch | blob | blame | history
mm/mmap.c		patch | blob | blame | history