git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.10-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 21 Jun 2025 09:30:56 +0000 (11:30 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 21 Jun 2025 09:42:16 +0000 (11:42 +0200)
added patches:
hugetlb-unshare-some-pmds-when-splitting-vmas.patch
mm-hugetlb-fix-huge_pmd_unshare-vs-gup-fast-race.patch
mm-hugetlb-independent-pmd-page-table-shared-count.patch
mm-hugetlb-unshare-page-tables-during-vma-split-not-before.patch

queue-5.10/hugetlb-unshare-some-pmds-when-splitting-vmas.patch [new file with mode: 0644]
queue-5.10/mm-hugetlb-fix-huge_pmd_unshare-vs-gup-fast-race.patch [new file with mode: 0644]
queue-5.10/mm-hugetlb-independent-pmd-page-table-shared-count.patch [new file with mode: 0644]
queue-5.10/mm-hugetlb-unshare-page-tables-during-vma-split-not-before.patch [new file with mode: 0644]
queue-5.10/series

diff --git a/queue-5.10/hugetlb-unshare-some-pmds-when-splitting-vmas.patch b/queue-5.10/hugetlb-unshare-some-pmds-when-splitting-vmas.patch
new file mode 100644 (file)
index 0000000..de3eb52
--- /dev/null
@@ -0,0 +1,131 @@
+From b30c14cd61025eeea2f2e8569606cd167ba9ad2d Mon Sep 17 00:00:00 2001
+From: James Houghton <jthoughton@google.com>
+Date: Wed, 4 Jan 2023 23:19:10 +0000
+Subject: hugetlb: unshare some PMDs when splitting VMAs
+
+From: James Houghton <jthoughton@google.com>
+
+commit b30c14cd61025eeea2f2e8569606cd167ba9ad2d upstream.
+
+PMD sharing can only be done in PUD_SIZE-aligned pieces of VMAs; however,
+it is possible that HugeTLB VMAs are split without unsharing the PMDs
+first.
+
+Without this fix, it is possible to hit the uffd-wp-related WARN_ON_ONCE
+in hugetlb_change_protection [1].  The key there is that
+hugetlb_unshare_all_pmds will not attempt to unshare PMDs in
+non-PUD_SIZE-aligned sections of the VMA.
+
+It might seem ideal to unshare in hugetlb_vm_op_open, but we need to
+unshare in both the new and old VMAs, so unsharing in hugetlb_vm_op_split
+seems natural.
+
+[1]: https://lore.kernel.org/linux-mm/CADrL8HVeOkj0QH5VZZbRzybNE8CG-tEGFshnA+bG9nMgcWtBSg@mail.gmail.com/
+
+Link: https://lkml.kernel.org/r/20230104231910.1464197-1-jthoughton@google.com
+Fixes: 6dfeaff93be1 ("hugetlb/userfaultfd: unshare all pmds for hugetlbfs when register wp")
+Signed-off-by: James Houghton <jthoughton@google.com>
+Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
+Acked-by: Peter Xu <peterx@redhat.com>
+Cc: Axel Rasmussen <axelrasmussen@google.com>
+Cc: Muchun Song <songmuchun@bytedance.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+[backport notes: I believe the "Fixes" tag is somewhat wrong - kernels
+before that commit already had an adjust_range_if_pmd_sharing_possible()
+that assumes that shared PMDs can't straddle page table boundaries.
+huge_pmd_unshare() takes a different parameter type]
+Signed-off-by: Jann Horn <jannh@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c |   65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 65 insertions(+)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -96,6 +96,8 @@ static inline void ClearPageHugeFreed(st
+ /* Forward declaration */
+ static int hugetlb_acct_memory(struct hstate *h, long delta);
++static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
++              unsigned long start, unsigned long end);
+ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+ {
+@@ -3697,6 +3699,25 @@ static int hugetlb_vm_op_split(struct vm
+ {
+       if (addr & ~(huge_page_mask(hstate_vma(vma))))
+               return -EINVAL;
++
++      /*
++       * PMD sharing is only possible for PUD_SIZE-aligned address ranges
++       * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
++       * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
++       */
++      if (addr & ~PUD_MASK) {
++              /*
++               * hugetlb_vm_op_split is called right before we attempt to
++               * split the VMA. We will need to unshare PMDs in the old and
++               * new VMAs, so let's unshare before we split.
++               */
++              unsigned long floor = addr & PUD_MASK;
++              unsigned long ceil = floor + PUD_SIZE;
++
++              if (floor >= vma->vm_start && ceil <= vma->vm_end)
++                      hugetlb_unshare_pmds(vma, floor, ceil);
++      }
++
+       return 0;
+ }
+@@ -5706,6 +5727,50 @@ void move_hugetlb_state(struct page *old
+       }
+ }
++static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
++                                 unsigned long start,
++                                 unsigned long end)
++{
++      struct hstate *h = hstate_vma(vma);
++      unsigned long sz = huge_page_size(h);
++      struct mm_struct *mm = vma->vm_mm;
++      struct mmu_notifier_range range;
++      unsigned long address;
++      spinlock_t *ptl;
++      pte_t *ptep;
++
++      if (!(vma->vm_flags & VM_MAYSHARE))
++              return;
++
++      if (start >= end)
++              return;
++
++      flush_cache_range(vma, start, end);
++      /*
++       * No need to call adjust_range_if_pmd_sharing_possible(), because
++       * we have already done the PUD_SIZE alignment.
++       */
++      mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
++                              start, end);
++      mmu_notifier_invalidate_range_start(&range);
++      i_mmap_lock_write(vma->vm_file->f_mapping);
++      for (address = start; address < end; address += PUD_SIZE) {
++              ptep = huge_pte_offset(mm, address, sz);
++              if (!ptep)
++                      continue;
++              ptl = huge_pte_lock(h, mm, ptep);
++              huge_pmd_unshare(mm, vma, &address, ptep);
++              spin_unlock(ptl);
++      }
++      flush_hugetlb_tlb_range(vma, start, end);
++      i_mmap_unlock_write(vma->vm_file->f_mapping);
++      /*
++       * No need to call mmu_notifier_invalidate_range(), see
++       * Documentation/mm/mmu_notifier.rst.
++       */
++      mmu_notifier_invalidate_range_end(&range);
++}
++
+ #ifdef CONFIG_CMA
+ static bool cma_reserve_called __initdata;
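
The PUD_SIZE alignment check introduced above reduces to simple mask arithmetic. A minimal userspace sketch of that check, assuming x86-64's 1 GiB PUD_SIZE and hypothetical VMA bounds (illustrative only, not the kernel code):

#include <stdio.h>

#define PUD_SIZE (1UL << 30)            /* 1 GiB on x86-64 with 4-level paging */
#define PUD_MASK (~(PUD_SIZE - 1))

int main(void)
{
	unsigned long vm_start = 0x40000000UL;   /* hypothetical hugetlb VMA */
	unsigned long vm_end   = 0x140000000UL;
	unsigned long addr     = 0xc0200000UL;   /* split point, 2 MiB aligned */

	if (addr & ~PUD_MASK) {                  /* split is not PUD_SIZE aligned */
		unsigned long floor = addr & PUD_MASK;
		unsigned long ceil  = floor + PUD_SIZE;

		if (floor >= vm_start && ceil <= vm_end)
			printf("unshare PMDs in [%#lx, %#lx)\n", floor, ceil);
	}
	return 0;
}

Only when the split point is not PUD_SIZE-aligned, and the surrounding PUD_SIZE interval lies entirely inside the VMA, is anything unshared.
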
diff --git a/queue-5.10/mm-hugetlb-fix-huge_pmd_unshare-vs-gup-fast-race.patch b/queue-5.10/mm-hugetlb-fix-huge_pmd_unshare-vs-gup-fast-race.patch
new file mode 100644 (file)
index 0000000..d54d5bd
--- /dev/null
@@ -0,0 +1,55 @@
+From 1013af4f585fccc4d3e5c5824d174de2257f7d6d Mon Sep 17 00:00:00 2001
+From: Jann Horn <jannh@google.com>
+Date: Tue, 27 May 2025 23:23:54 +0200
+Subject: mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race
+
+From: Jann Horn <jannh@google.com>
+
+commit 1013af4f585fccc4d3e5c5824d174de2257f7d6d upstream.
+
+huge_pmd_unshare() drops a reference on a page table that may have
+previously been shared across processes, potentially turning it into a
+normal page table used in another process in which unrelated VMAs can
+afterwards be installed.
+
+If this happens in the middle of a concurrent gup_fast(), gup_fast() could
+end up walking the page tables of another process.  While I don't see any
+way in which that immediately leads to kernel memory corruption, it is
+really weird and unexpected.
+
+Fix it with an explicit broadcast IPI through tlb_remove_table_sync_one(),
+just like we do in khugepaged when removing page tables for a THP
+collapse.
+
+Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-2-1329349bad1a@google.com
+Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-2-f4136f5ec58a@google.com
+Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
+Signed-off-by: Jann Horn <jannh@google.com>
+Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c |    7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -5488,6 +5488,13 @@ int huge_pmd_unshare(struct mm_struct *m
+               return 0;
+       pud_clear(pud);
++      /*
++       * Once our caller drops the rmap lock, some other process might be
++       * using this page table as a normal, non-hugetlb page table.
++       * Wait for pending gup_fast() in other threads to finish before letting
++       * that happen.
++       */
++      tlb_remove_table_sync_one();
+       atomic_dec(&virt_to_page(ptep)->pt_share_count);
+       mm_dec_nr_pmds(mm);
+       /*
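
The guarantee tlb_remove_table_sync_one() provides here can be pictured as: detach the page table, wait for every lockless walker that might still see it, and only then hand it over. Below is a rough userspace model of that ordering with made-up names; the kernel relies on a broadcast IPI and gup_fast() running with interrupts disabled, not on a walker count:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int pmd_table[512];              /* the possibly-shared page table */
static _Atomic(int *) pud_entry;        /* what lockless walkers dereference */
static atomic_int walkers_in_flight;

static void *gup_fast_like_walker(void *arg)
{
	(void)arg;
	atomic_fetch_add(&walkers_in_flight, 1);   /* "interrupts disabled" */
	int *table = atomic_load(&pud_entry);
	if (table)
		(void)table[0];                    /* walk the old table */
	atomic_fetch_sub(&walkers_in_flight, 1);
	return NULL;
}

int main(void)
{
	pthread_t t;

	atomic_store(&pud_entry, pmd_table);
	pthread_create(&t, NULL, gup_fast_like_walker, NULL);

	/* huge_pmd_unshare() side: */
	atomic_store(&pud_entry, NULL);            /* pud_clear() */
	while (atomic_load(&walkers_in_flight))    /* tlb_remove_table_sync_one() */
		;
	/* Only now is it safe to let another process reuse pmd_table. */
	puts("page table handed over safely");

	pthread_join(t, NULL);
	return 0;
}

The point is the ordering: the entry is cleared first, the wait comes second, and only after that may the page table be repurposed by an unrelated mapping.
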
diff --git a/queue-5.10/mm-hugetlb-independent-pmd-page-table-shared-count.patch b/queue-5.10/mm-hugetlb-independent-pmd-page-table-shared-count.patch
new file mode 100644 (file)
index 0000000..3beeede
--- /dev/null
@@ -0,0 +1,162 @@
+From 59d9094df3d79443937add8700b2ef1a866b1081 Mon Sep 17 00:00:00 2001
+From: Liu Shixin <liushixin2@huawei.com>
+Date: Mon, 16 Dec 2024 15:11:47 +0800
+Subject: mm: hugetlb: independent PMD page table shared count
+
+From: Liu Shixin <liushixin2@huawei.com>
+
+commit 59d9094df3d79443937add8700b2ef1a866b1081 upstream.
+
+The folio refcount may be increased unexpectedly through try_get_folio() by
+callers such as split_huge_pages.  In huge_pmd_unshare(), we use the refcount
+to check whether a pmd page table is shared.  The check is incorrect if
+the refcount is increased by the above caller, and this can cause the page
+table to be leaked:
+
+ BUG: Bad page state in process sh  pfn:109324
+ page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324
+ flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff)
+ page_type: f2(table)
+ raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000
+ raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000
+ page dumped because: nonzero mapcount
+ ...
+ CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G    B              6.13.0-rc2master+ #7
+ Tainted: [B]=BAD_PAGE
+ Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
+ Call trace:
+  show_stack+0x20/0x38 (C)
+  dump_stack_lvl+0x80/0xf8
+  dump_stack+0x18/0x28
+  bad_page+0x8c/0x130
+  free_page_is_bad_report+0xa4/0xb0
+  free_unref_page+0x3cc/0x620
+  __folio_put+0xf4/0x158
+  split_huge_pages_all+0x1e0/0x3e8
+  split_huge_pages_write+0x25c/0x2d8
+  full_proxy_write+0x64/0xd8
+  vfs_write+0xcc/0x280
+  ksys_write+0x70/0x110
+  __arm64_sys_write+0x24/0x38
+  invoke_syscall+0x50/0x120
+  el0_svc_common.constprop.0+0xc8/0xf0
+  do_el0_svc+0x24/0x38
+  el0_svc+0x34/0x128
+  el0t_64_sync_handler+0xc8/0xd0
+  el0t_64_sync+0x190/0x198
+
+The issue may be triggered by damon, offline_page, page_idle, etc, which
+will increase the refcount of page table.
+
+1. The page table itself will be discarded after reporting the
+   "nonzero mapcount".
+
+2. The HugeTLB page mapped by the page table is never freed, since we
+   treat the page table as shared and a shared page table will not be
+   unmapped.
+
+Fix it by introducing an independent PMD page table shared count.  As
+described by the comment, pt_index/pt_mm/pt_frag_refcount are used for
+s390 gmap, x86 pgds and powerpc, while shareable pmds only exist on
+x86/arm64/riscv, so we can reuse the field as pt_share_count.
+
+Link: https://lkml.kernel.org/r/20241216071147.3984217-1-liushixin2@huawei.com
+Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
+Signed-off-by: Liu Shixin <liushixin2@huawei.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Ken Chen <kenneth.w.chen@intel.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: Nanyong Sun <sunnanyong@huawei.com>
+Cc: Jane Chu <jane.chu@oracle.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+[backport note: struct ptdesc did not exist yet, stuff it equivalently
+into struct page instead]
+Signed-off-by: Jann Horn <jannh@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h       |    3 +++
+ include/linux/mm_types.h |    3 +++
+ mm/hugetlb.c             |   18 ++++++++----------
+ 3 files changed, 14 insertions(+), 10 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2318,6 +2318,9 @@ static inline bool pgtable_pmd_page_ctor
+       if (!pmd_ptlock_init(page))
+               return false;
+       __SetPageTable(page);
++#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
++      atomic_set(&page->pt_share_count, 0);
++#endif
+       inc_zone_page_state(page, NR_PAGETABLE);
+       return true;
+ }
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -151,6 +151,9 @@ struct page {
+                       union {
+                               struct mm_struct *pt_mm; /* x86 pgds only */
+                               atomic_t pt_frag_refcount; /* powerpc */
++#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
++                              atomic_t pt_share_count;
++#endif
+                       };
+ #if ALLOC_SPLIT_PTLOCKS
+                       spinlock_t *ptl;
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -5442,7 +5442,7 @@ pte_t *huge_pmd_share(struct mm_struct *
+                       spte = huge_pte_offset(svma->vm_mm, saddr,
+                                              vma_mmu_pagesize(svma));
+                       if (spte) {
+-                              get_page(virt_to_page(spte));
++                              atomic_inc(&virt_to_page(spte)->pt_share_count);
+                               break;
+                       }
+               }
+@@ -5457,7 +5457,7 @@ pte_t *huge_pmd_share(struct mm_struct *
+                               (pmd_t *)((unsigned long)spte & PAGE_MASK));
+               mm_inc_nr_pmds(mm);
+       } else {
+-              put_page(virt_to_page(spte));
++              atomic_dec(&virt_to_page(spte)->pt_share_count);
+       }
+       spin_unlock(ptl);
+ out:
+@@ -5468,11 +5468,7 @@ out:
+ /*
+  * unmap huge page backed by shared pte.
+  *
+- * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
+- * indicated by page_count > 1, unmap is achieved by clearing pud and
+- * decrementing the ref count. If count == 1, the pte page is not shared.
+- *
+- * Called with page table lock held and i_mmap_rwsem held in write mode.
++ * Called with page table lock held.
+  *
+  * returns: 1 successfully unmapped a shared pte page
+  *        0 the underlying pte page is not shared, or it is the last user
+@@ -5480,17 +5476,19 @@ out:
+ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+                                       unsigned long *addr, pte_t *ptep)
+ {
++      unsigned long sz = huge_page_size(hstate_vma(vma));
+       pgd_t *pgd = pgd_offset(mm, *addr);
+       p4d_t *p4d = p4d_offset(pgd, *addr);
+       pud_t *pud = pud_offset(p4d, *addr);
+       i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+-      BUG_ON(page_count(virt_to_page(ptep)) == 0);
+-      if (page_count(virt_to_page(ptep)) == 1)
++      if (sz != PMD_SIZE)
++              return 0;
++      if (!atomic_read(&virt_to_page(ptep)->pt_share_count))
+               return 0;
+       pud_clear(pud);
+-      put_page(virt_to_page(ptep));
++      atomic_dec(&virt_to_page(ptep)->pt_share_count);
+       mm_dec_nr_pmds(mm);
+       /*
+        * This update of passed address optimizes loops sequentially
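
The switch from page_count() to a dedicated counter can be illustrated with a small userspace sketch (hypothetical struct and helpers, not the kernel code): an unrelated reference no longer makes an unshared PMD table look shared.

#include <stdatomic.h>
#include <stdio.h>

struct pmd_table {
	atomic_int refcount;        /* generic page refcount; anyone may bump it */
	atomic_int pt_share_count;  /* only the share/unshare paths touch this */
};

static void share(struct pmd_table *pt)
{
	atomic_fetch_add(&pt->pt_share_count, 1);
}

/* Returns 1 if a sharer was dropped, 0 if the table was not shared. */
static int unshare(struct pmd_table *pt)
{
	if (!atomic_load(&pt->pt_share_count))
		return 0;
	atomic_fetch_sub(&pt->pt_share_count, 1);
	return 1;
}

int main(void)
{
	struct pmd_table pt;

	atomic_init(&pt.refcount, 1);
	atomic_init(&pt.pt_share_count, 0);

	share(&pt);                         /* a second process maps the table */
	atomic_fetch_add(&pt.refcount, 1);  /* unrelated try_get_folio()-style ref */

	printf("unshare -> %d\n", unshare(&pt));  /* 1: the real sharer is dropped */
	printf("unshare -> %d\n", unshare(&pt));  /* 0: extra refcount is ignored */
	return 0;
}

Under the old page_count() check, the extra reference in this example would have kept the table looking shared, which is exactly the leak described in the commit message.
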
diff --git a/queue-5.10/mm-hugetlb-unshare-page-tables-during-vma-split-not-before.patch b/queue-5.10/mm-hugetlb-unshare-page-tables-during-vma-split-not-before.patch
new file mode 100644 (file)
index 0000000..9a7ad11
--- /dev/null
@@ -0,0 +1,203 @@
+From 081056dc00a27bccb55ccc3c6f230a3d5fd3f7e0 Mon Sep 17 00:00:00 2001
+From: Jann Horn <jannh@google.com>
+Date: Tue, 27 May 2025 23:23:53 +0200
+Subject: mm/hugetlb: unshare page tables during VMA split, not before
+
+From: Jann Horn <jannh@google.com>
+
+commit 081056dc00a27bccb55ccc3c6f230a3d5fd3f7e0 upstream.
+
+Currently, __split_vma() triggers hugetlb page table unsharing through
+vm_ops->may_split().  This happens before the VMA lock and rmap locks are
+taken - which is too early; it allows racing VMA-locked page faults in our
+process and racing rmap walks from other processes to cause page tables to
+be shared again before we actually perform the split.
+
+Fix it by explicitly calling into the hugetlb unshare logic from
+__split_vma() in the same place where THP splitting also happens.  At that
+point, both the VMA and the rmap(s) are write-locked.
+
+An annoying detail is that we can now call into the helper
+hugetlb_unshare_pmds() from two different locking contexts:
+
+1. from hugetlb_split(), holding:
+    - mmap lock (exclusively)
+    - VMA lock
+    - file rmap lock (exclusively)
+2. hugetlb_unshare_all_pmds(), which I think is designed to be able to
+   call us with only the mmap lock held (in shared mode), but currently
+   only runs while holding mmap lock (exclusively) and VMA lock
+
+Backporting note:
+This commit fixes a racy protection that was introduced in commit
+b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that
+commit claimed to fix an issue introduced in 5.13, but it should actually
+also go all the way back.
+
+[jannh@google.com: v2]
+  Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a@google.com
+Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a@google.com
+Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a@google.com
+Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
+Signed-off-by: Jann Horn <jannh@google.com>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Reviewed-by: Oscar Salvador <osalvador@suse.de>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: <stable@vger.kernel.org>   [b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs]
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+[stable backport: code got moved around, VMA splitting is in
+__vma_adjust, hugetlb lock wasn't used back then]
+Signed-off-by: Jann Horn <jannh@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/hugetlb.h |    6 +++++
+ mm/hugetlb.c            |   53 ++++++++++++++++++++++++++++++++++++------------
+ mm/mmap.c               |    8 +++++++
+ 3 files changed, 54 insertions(+), 13 deletions(-)
+
+--- a/include/linux/hugetlb.h
++++ b/include/linux/hugetlb.h
+@@ -188,6 +188,8 @@ unsigned long hugetlb_change_protection(
+               unsigned long address, unsigned long end, pgprot_t newprot);
+ bool is_hugetlb_entry_migration(pte_t pte);
++void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
++void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
+ #else /* !CONFIG_HUGETLB_PAGE */
+@@ -369,6 +371,10 @@ static inline vm_fault_t hugetlb_fault(s
+       return 0;
+ }
++static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
++
++static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
++
+ #endif /* !CONFIG_HUGETLB_PAGE */
+ /*
+  * hugepages at page global directory. If arch support
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -97,7 +97,7 @@ static inline void ClearPageHugeFreed(st
+ /* Forward declaration */
+ static int hugetlb_acct_memory(struct hstate *h, long delta);
+ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+-              unsigned long start, unsigned long end);
++              unsigned long start, unsigned long end, bool take_locks);
+ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+ {
+@@ -3699,26 +3699,40 @@ static int hugetlb_vm_op_split(struct vm
+ {
+       if (addr & ~(huge_page_mask(hstate_vma(vma))))
+               return -EINVAL;
++      return 0;
++}
++void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
++{
+       /*
+        * PMD sharing is only possible for PUD_SIZE-aligned address ranges
+        * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
+        * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
++       * This function is called in the middle of a VMA split operation, with
++       * MM, VMA and rmap all write-locked to prevent concurrent page table
++       * walks (except hardware and gup_fast()).
+        */
++      mmap_assert_write_locked(vma->vm_mm);
++      i_mmap_assert_write_locked(vma->vm_file->f_mapping);
++
+       if (addr & ~PUD_MASK) {
+-              /*
+-               * hugetlb_vm_op_split is called right before we attempt to
+-               * split the VMA. We will need to unshare PMDs in the old and
+-               * new VMAs, so let's unshare before we split.
+-               */
+               unsigned long floor = addr & PUD_MASK;
+               unsigned long ceil = floor + PUD_SIZE;
+-              if (floor >= vma->vm_start && ceil <= vma->vm_end)
+-                      hugetlb_unshare_pmds(vma, floor, ceil);
++              if (floor >= vma->vm_start && ceil <= vma->vm_end) {
++                      /*
++                       * Locking:
++                       * Use take_locks=false here.
++                       * The file rmap lock is already held.
++                       * The hugetlb VMA lock can't be taken when we already
++                       * hold the file rmap lock, and we don't need it because
++                       * its purpose is to synchronize against concurrent page
++                       * table walks, which are not possible thanks to the
++                       * locks held by our caller.
++                       */
++                      hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
++              }
+       }
+-
+-      return 0;
+ }
+ static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
+@@ -5727,9 +5741,16 @@ void move_hugetlb_state(struct page *old
+       }
+ }
++/*
++ * If @take_locks is false, the caller must ensure that no concurrent page table
++ * access can happen (except for gup_fast() and hardware page walks).
++ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
++ * concurrent page fault handling) and the file rmap lock.
++ */
+ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+                                  unsigned long start,
+-                                 unsigned long end)
++                                 unsigned long end,
++                                 bool take_locks)
+ {
+       struct hstate *h = hstate_vma(vma);
+       unsigned long sz = huge_page_size(h);
+@@ -5753,7 +5774,11 @@ static void hugetlb_unshare_pmds(struct
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+                               start, end);
+       mmu_notifier_invalidate_range_start(&range);
+-      i_mmap_lock_write(vma->vm_file->f_mapping);
++      if (take_locks) {
++              i_mmap_lock_write(vma->vm_file->f_mapping);
++      } else {
++              i_mmap_assert_write_locked(vma->vm_file->f_mapping);
++      }
+       for (address = start; address < end; address += PUD_SIZE) {
+               ptep = huge_pte_offset(mm, address, sz);
+               if (!ptep)
+@@ -5763,7 +5788,9 @@ static void hugetlb_unshare_pmds(struct
+               spin_unlock(ptl);
+       }
+       flush_hugetlb_tlb_range(vma, start, end);
+-      i_mmap_unlock_write(vma->vm_file->f_mapping);
++      if (take_locks) {
++              i_mmap_unlock_write(vma->vm_file->f_mapping);
++      }
+       /*
+        * No need to call mmu_notifier_invalidate_range(), see
+        * Documentation/mm/mmu_notifier.rst.
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -832,7 +832,15 @@ int __vma_adjust(struct vm_area_struct *
+               }
+       }
+ again:
++      /*
++       * Get rid of huge pages and shared page tables straddling the split
++       * boundary.
++       */
+       vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
++      if (is_vm_hugetlb_page(orig_vma)) {
++              hugetlb_split(orig_vma, start);
++              hugetlb_split(orig_vma, end);
++      }
+       if (file) {
+               mapping = file->f_mapping;
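
The take_locks parameter added above is a conditional-locking pattern: one caller (hugetlb_unshare_all_pmds()) wants the helper to take the rmap lock itself, while the other (hugetlb_split()) already holds it. A trivial userspace sketch of the pattern, with made-up names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t rmap_lock = PTHREAD_MUTEX_INITIALIZER;

static void unshare_range(unsigned long start, unsigned long end, bool take_locks)
{
	if (take_locks)
		pthread_mutex_lock(&rmap_lock);
	/* ... unshare PMDs in [start, end) under rmap_lock ... */
	printf("unsharing [%#lx, %#lx)\n", start, end);
	if (take_locks)
		pthread_mutex_unlock(&rmap_lock);
}

int main(void)
{
	/* Like hugetlb_unshare_all_pmds(): the helper takes the lock itself. */
	unshare_range(0x40000000UL, 0x80000000UL, true);

	/* Like hugetlb_split(): the caller already holds the rmap lock. */
	pthread_mutex_lock(&rmap_lock);
	unshare_range(0xc0000000UL, 0x100000000UL, false);
	pthread_mutex_unlock(&rmap_lock);
	return 0;
}
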
diff --git a/queue-5.10/series b/queue-5.10/series
index 157d087387757ccd7af7c12db14db91654d3de6a..c83948c30a7ad93ab7a8c8b6d375703b0096e6fb 100644 (file)
--- a/queue-5.10/series
@@ -296,3 +296,7 @@ arm64-restrict-pagetable-teardown-to-avoid-false-warning.patch
 alsa-usb-audio-rename-alsa-kcontrol-pcm-and-pcm1-for-the-ktmicro-sound-card.patch
 alsa-hda-intel-add-thinkpad-e15-to-pm-deny-list.patch
 alsa-hda-realtek-enable-headset-mic-on-latitude-5420-rugged.patch
+hugetlb-unshare-some-pmds-when-splitting-vmas.patch
+mm-hugetlb-unshare-page-tables-during-vma-split-not-before.patch
+mm-hugetlb-independent-pmd-page-table-shared-count.patch
+mm-hugetlb-fix-huge_pmd_unshare-vs-gup-fast-race.patch