--- /dev/null
+From dff11abe280b47c21b804a8ace318e0638bb9a49 Mon Sep 17 00:00:00 2001
+From: Mike Kravetz <mike.kravetz@oracle.com>
+Date: Fri, 5 Oct 2018 15:51:33 -0700
+Subject: hugetlb: take PMD sharing into account when flushing tlb/caches
+
+From: Mike Kravetz <mike.kravetz@oracle.com>
+
+commit dff11abe280b47c21b804a8ace318e0638bb9a49 upstream.
+
+When fixing an issue with PMD sharing and migration, it was discovered via
+code inspection that other callers of huge_pmd_unshare potentially have an
+issue with cache and tlb flushing.
+
+Use the routine adjust_range_if_pmd_sharing_possible() to calculate worst
+case ranges for mmu notifiers. Ensure that this range is flushed if
+huge_pmd_unshare succeeds and unmaps a PUD_SIZE area.
+
+Link: http://lkml.kernel.org/r/20180823205917.16297-3-mike.kravetz@oracle.com
+Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Davidlohr Bueso <dave@stgolabs.net>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Jerome Glisse <jglisse@redhat.com>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: Nadav Amit <namit@vmware.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c | 53 ++++++++++++++++++++++++++++++++++++++++++++---------
+ 1 file changed, 44 insertions(+), 9 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -3393,14 +3393,19 @@ void __unmap_hugepage_range(struct mmu_g
+ struct page *page;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
+- const unsigned long mmun_start = start; /* For mmu_notifiers */
+- const unsigned long mmun_end = end; /* For mmu_notifiers */
++ unsigned long mmun_start = start; /* For mmu_notifiers */
++ unsigned long mmun_end = end; /* For mmu_notifiers */
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+ BUG_ON(start & ~huge_page_mask(h));
+ BUG_ON(end & ~huge_page_mask(h));
+
+ tlb_start_vma(tlb, vma);
++
++ /*
++ * If sharing possible, alert mmu notifiers of worst case.
++ */
++ adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end);
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ address = start;
+ for (; address < end; address += sz) {
+@@ -3411,6 +3416,10 @@ void __unmap_hugepage_range(struct mmu_g
+ ptl = huge_pte_lock(h, mm, ptep);
+ if (huge_pmd_unshare(mm, &address, ptep)) {
+ spin_unlock(ptl);
++ /*
++ * We just unmapped a page of PMDs by clearing a PUD.
++ * The caller's TLB flush range should cover this area.
++ */
+ continue;
+ }
+
+@@ -3493,12 +3502,23 @@ void unmap_hugepage_range(struct vm_area
+ {
+ struct mm_struct *mm;
+ struct mmu_gather tlb;
++ unsigned long tlb_start = start;
++ unsigned long tlb_end = end;
++
++ /*
++ * If shared PMDs were possibly used within this vma range, adjust
++ * start/end for worst case tlb flushing.
++ * Note that we can not be sure if PMDs are shared until we try to
++ * unmap pages. However, we want to make sure TLB flushing covers
++ * the largest possible range.
++ */
++ adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end);
+
+ mm = vma->vm_mm;
+
+- tlb_gather_mmu(&tlb, mm, start, end);
++ tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
+ __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+- tlb_finish_mmu(&tlb, start, end);
++ tlb_finish_mmu(&tlb, tlb_start, tlb_end);
+ }
+
+ /*
+@@ -4186,11 +4206,21 @@ unsigned long hugetlb_change_protection(
+ pte_t pte;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long pages = 0;
++ unsigned long f_start = start;
++ unsigned long f_end = end;
++ bool shared_pmd = false;
++
++ /*
++ * In the case of shared PMDs, the area to flush could be beyond
++ * start/end. Set f_start/f_end to cover the maximum possible
++ * range if PMD sharing is possible.
++ */
++ adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end);
+
+ BUG_ON(address >= end);
+- flush_cache_range(vma, address, end);
++ flush_cache_range(vma, f_start, f_end);
+
+- mmu_notifier_invalidate_range_start(mm, start, end);
++ mmu_notifier_invalidate_range_start(mm, f_start, f_end);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ for (; address < end; address += huge_page_size(h)) {
+ spinlock_t *ptl;
+@@ -4201,6 +4231,7 @@ unsigned long hugetlb_change_protection(
+ if (huge_pmd_unshare(mm, &address, ptep)) {
+ pages++;
+ spin_unlock(ptl);
++ shared_pmd = true;
+ continue;
+ }
+ pte = huge_ptep_get(ptep);
+@@ -4235,12 +4266,16 @@ unsigned long hugetlb_change_protection(
+ * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
+ * may have cleared our pud entry and done put_page on the page table:
+ * once we release i_mmap_rwsem, another task can do the final put_page
+- * and that page table be reused and filled with junk.
++ * and that page table be reused and filled with junk. If we actually
++ * did unshare a page of pmds, flush the range corresponding to the pud.
+ */
+- flush_hugetlb_tlb_range(vma, start, end);
++ if (shared_pmd)
++ flush_hugetlb_tlb_range(vma, f_start, f_end);
++ else
++ flush_hugetlb_tlb_range(vma, start, end);
+ mmu_notifier_invalidate_range(mm, start, end);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+- mmu_notifier_invalidate_range_end(mm, start, end);
++ mmu_notifier_invalidate_range_end(mm, f_start, f_end);
+
+ return pages << h->order;
+ }
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
---
- include/asm-generic/tlb.h | 17 +++++++++++++++++
- mm/hugetlb.c | 19 +++++++++++++++++++
- 2 files changed, 36 insertions(+)
+ arch/arm/include/asm/tlb.h | 8 ++++++++
+ arch/ia64/include/asm/tlb.h | 10 ++++++++++
+ arch/s390/include/asm/tlb.h | 14 ++++++++++++++
+ arch/sh/include/asm/tlb.h | 9 +++++++++
+ arch/um/include/asm/tlb.h | 12 ++++++++++++
+ include/asm-generic/tlb.h | 2 ++
+ mm/hugetlb.c | 19 +++++++++++++++++++
+ mm/memory.c | 16 ++++++++++++++++
+ 8 files changed, 90 insertions(+)
---- a/include/asm-generic/tlb.h
-+++ b/include/asm-generic/tlb.h
-@@ -202,6 +202,23 @@ static inline bool __tlb_remove_pte_page
- #define tlb_end_vma __tlb_end_vma
- #endif
+--- a/arch/arm/include/asm/tlb.h
++++ b/arch/arm/include/asm/tlb.h
+@@ -278,6 +278,14 @@ tlb_remove_pmd_tlb_entry(struct mmu_gath
+ tlb_add_flush(tlb, addr);
+ }
+
++static inline void
++tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address,
++ unsigned long size)
++{
++ tlb_add_flush(tlb, address);
++ tlb_add_flush(tlb, address + size - PMD_SIZE);
++}
++
+ #define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr)
+ #define pmd_free_tlb(tlb, pmdp, addr) __pmd_free_tlb(tlb, pmdp, addr)
+ #define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp)
+--- a/arch/ia64/include/asm/tlb.h
++++ b/arch/ia64/include/asm/tlb.h
+@@ -272,6 +272,16 @@ __tlb_remove_tlb_entry (struct mmu_gathe
+ tlb->end_addr = address + PAGE_SIZE;
+ }
+
++static inline void
++tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address,
++ unsigned long size)
++{
++ if (tlb->start_addr > address)
++ tlb->start_addr = address;
++ if (tlb->end_addr < address + size)
++ tlb->end_addr = address + size;
++}
++
+ #define tlb_migrate_finish(mm) platform_tlb_migrate_finish(mm)
+
+ #define tlb_start_vma(tlb, vma) do { } while (0)
+--- a/arch/s390/include/asm/tlb.h
++++ b/arch/s390/include/asm/tlb.h
+@@ -116,6 +116,20 @@ static inline void tlb_remove_page_size(
+ return tlb_remove_page(tlb, page);
+ }
+static inline void tlb_flush_pmd_range(struct mmu_gather *tlb,
+ unsigned long address, unsigned long size)
+{
-+ if (tlb->page_size != 0 && tlb->page_size != PMD_SIZE)
-+ tlb_flush_mmu(tlb);
-+
-+ tlb->page_size = PMD_SIZE;
-+ tlb->start = min(tlb->start, address);
-+ tlb->end = max(tlb->end, address + size);
+ /*
-+ * Track the last address with which we adjusted the range. This
-+ * will be used later to adjust again after a mmu_flush due to
-+ * failed __tlb_remove_page
++ * the range might exceed the original range that was provided to
++ * tlb_gather_mmu(), so we need to update it despite the fact it is
++ * usually not updated.
+ */
-+ tlb->addr = address + size - PMD_SIZE;
++ if (tlb->start > address)
++ tlb->start = address;
++ if (tlb->end < address + size)
++ tlb->end = address + size;
++}
++
+ /*
+ * pte_free_tlb frees a pte table and clears the CRSTE for the
+ * page table from the tlb.
+--- a/arch/sh/include/asm/tlb.h
++++ b/arch/sh/include/asm/tlb.h
+@@ -115,6 +115,15 @@ static inline bool __tlb_remove_page_siz
+ return __tlb_remove_page(tlb, page);
+ }
+
++static inline void tlb_flush_pmd_range(struct mmu_gather *tlb,
++ unsigned long address, unsigned long size)
++{
++ if (tlb->start > address)
++ tlb->start = address;
++ if (tlb->end < address + size)
++ tlb->end = address + size;
++}
++
+ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
+ struct page *page)
+ {
+--- a/arch/um/include/asm/tlb.h
++++ b/arch/um/include/asm/tlb.h
+@@ -128,6 +128,18 @@ static inline void tlb_remove_page_size(
+ return tlb_remove_page(tlb, page);
+ }
+
++static inline void
++tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address,
++ unsigned long size)
++{
++ tlb->need_flush = 1;
++
++ if (tlb->start > address)
++ tlb->start = address;
++ if (tlb->end < address + size)
++ tlb->end = address + size;
+}
+
- #ifndef __tlb_remove_tlb_entry
- #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
- #endif
+ /**
+ * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation.
+ *
+--- a/include/asm-generic/tlb.h
++++ b/include/asm-generic/tlb.h
+@@ -123,6 +123,8 @@ void tlb_finish_mmu(struct mmu_gather *t
+ unsigned long end);
+ extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
+ int page_size);
++void tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address,
++ unsigned long size);
+
+ static inline void __tlb_adjust_range(struct mmu_gather *tlb,
+ unsigned long address)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3395,6 +3395,7 @@ void __unmap_hugepage_range(struct mmu_g
unsigned long sz = huge_page_size(h);
- const unsigned long mmun_start = start; /* For mmu_notifiers */
- const unsigned long mmun_end = end; /* For mmu_notifiers */
+ unsigned long mmun_start = start; /* For mmu_notifiers */
+ unsigned long mmun_end = end; /* For mmu_notifiers */
+ bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
-@@ -3411,6 +3412,8 @@ void __unmap_hugepage_range(struct mmu_g
- ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, &address, ptep)) {
+@@ -3426,6 +3427,8 @@ void __unmap_hugepage_range(struct mmu_g
+ pte = huge_ptep_get(ptep);
+ if (huge_pte_none(pte)) {
spin_unlock(ptl);
+ tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
+ force_flush = true;
continue;
}
-@@ -3467,6 +3470,22 @@ void __unmap_hugepage_range(struct mmu_g
+@@ -3476,6 +3479,22 @@ void __unmap_hugepage_range(struct mmu_g
}
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
tlb_end_vma(tlb, vma);
}
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -320,6 +320,22 @@ bool __tlb_remove_page_size(struct mmu_g
+ return false;
+ }
+
++void tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address,
++ unsigned long size)
++{
++ if (tlb->page_size != 0 && tlb->page_size != PMD_SIZE)
++ tlb_flush_mmu(tlb);
++
++ tlb->page_size = PMD_SIZE;
++ tlb->start = min(tlb->start, address);
++ tlb->end = max(tlb->end, address + size);
++ /*
++ * Track the last address with which we adjusted the range. This
++ * will be used later to adjust again after a mmu_flush due to
++ * failed __tlb_remove_page
++ */
++ tlb->addr = address + size - PMD_SIZE;
++}
+ #endif /* HAVE_GENERIC_MMU_GATHER */
+
+ #ifdef CONFIG_HAVE_RCU_TABLE_FREE