git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.4-stable patches
author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Mon, 28 Jun 2021 11:37:49 +0000 (13:37 +0200)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Mon, 28 Jun 2021 11:37:49 +0000 (13:37 +0200)
added patches:
mm-futex-fix-shared-futex-pgoff-on-shmem-huge-page.patch
mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch
mm-thp-replace-debug_vm-bug-with-vm_warn-when-unmap-fails-for-split.patch
mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch
mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch

queue-5.4/mm-futex-fix-shared-futex-pgoff-on-shmem-huge-page.patch [new file with mode: 0644]
queue-5.4/mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch [new file with mode: 0644]
queue-5.4/mm-thp-replace-debug_vm-bug-with-vm_warn-when-unmap-fails-for-split.patch [new file with mode: 0644]
queue-5.4/mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch [new file with mode: 0644]
queue-5.4/mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch [new file with mode: 0644]
queue-5.4/series

diff --git a/queue-5.4/mm-futex-fix-shared-futex-pgoff-on-shmem-huge-page.patch b/queue-5.4/mm-futex-fix-shared-futex-pgoff-on-shmem-huge-page.patch
new file mode 100644 (file)
index 0000000..ee847d0
--- /dev/null
@@ -0,0 +1,154 @@
+From foo@baz Mon Jun 28 01:37:10 PM CEST 2021
+From: Hugh Dickins <hughd@google.com>
+Date: Thu, 24 Jun 2021 18:39:52 -0700
+Subject: mm, futex: fix shared futex pgoff on shmem huge page
+
+From: Hugh Dickins <hughd@google.com>
+
+[ Upstream commit fe19bd3dae3d15d2fbfdb3de8839a6ea0fe94264 ]
+
+If more than one futex is placed on a shmem huge page, it can happen
+that waking the second wakes the first instead, and leaves the second
+waiting: the key's shared.pgoff is wrong.
+
+When 3.11 commit 13d60f4b6ab5 ("futex: Take hugepages into account when
+generating futex_key") went in, the only shared huge pages came from
+hugetlbfs, and the code added to deal with its exceptional page->index
+was put into hugetlb source.  That was then missed when 4.8 added shmem
+huge pages.
+
+page_to_pgoff() is what others use for this nowadays: except that, as
+currently written, it gives the right answer on hugetlbfs head, but
+nonsense on hugetlbfs tails.  Fix that by calling hugetlbfs-specific
+hugetlb_basepage_index() on PageHuge tails as well as on head.
+
+Yes, it's unconventional to declare hugetlb_basepage_index() there in
+pagemap.h, rather than in hugetlb.h; but I do not expect anything but
+page_to_pgoff() ever to need it.
+
+[akpm@linux-foundation.org: give hugetlb_basepage_index() prototype the correct scope]
+
+Link: https://lkml.kernel.org/r/b17d946b-d09-326e-b42a-52884c36df32@google.com
+Fixes: 800d8c63b2e9 ("shmem: add huge pages support")
+Reported-by: Neel Natu <neelnatu@google.com>
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Acked-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
+Cc: Zhang Yi <wetpzy@gmail.com>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Darren Hart <dvhart@infradead.org>
+Cc: Davidlohr Bueso <dave@stgolabs.net>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+
+Note on stable backport: leave redundant #include <linux/hugetlb.h>
+in kernel/futex.c, to avoid conflict over the header files included.
+
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/hugetlb.h |   16 ----------------
+ include/linux/pagemap.h |   13 +++++++------
+ kernel/futex.c          |    2 +-
+ mm/hugetlb.c            |    5 +----
+ 4 files changed, 9 insertions(+), 27 deletions(-)
+
+--- a/include/linux/hugetlb.h
++++ b/include/linux/hugetlb.h
+@@ -469,17 +469,6 @@ static inline int hstate_index(struct hs
+       return h - hstates;
+ }
+-pgoff_t __basepage_index(struct page *page);
+-
+-/* Return page->index in PAGE_SIZE units */
+-static inline pgoff_t basepage_index(struct page *page)
+-{
+-      if (!PageCompound(page))
+-              return page->index;
+-
+-      return __basepage_index(page);
+-}
+-
+ extern int dissolve_free_huge_page(struct page *page);
+ extern int dissolve_free_huge_pages(unsigned long start_pfn,
+                                   unsigned long end_pfn);
+@@ -695,11 +684,6 @@ static inline int hstate_index(struct hs
+       return 0;
+ }
+-static inline pgoff_t basepage_index(struct page *page)
+-{
+-      return page->index;
+-}
+-
+ static inline int dissolve_free_huge_page(struct page *page)
+ {
+       return 0;
+--- a/include/linux/pagemap.h
++++ b/include/linux/pagemap.h
+@@ -397,7 +397,7 @@ static inline struct page *read_mapping_
+ }
+ /*
+- * Get index of the page with in radix-tree
++ * Get index of the page within radix-tree (but not for hugetlb pages).
+  * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
+  */
+ static inline pgoff_t page_to_index(struct page *page)
+@@ -416,15 +416,16 @@ static inline pgoff_t page_to_index(stru
+       return pgoff;
+ }
++extern pgoff_t hugetlb_basepage_index(struct page *page);
++
+ /*
+- * Get the offset in PAGE_SIZE.
+- * (TODO: hugepage should have ->index in PAGE_SIZE)
++ * Get the offset in PAGE_SIZE (even for hugetlb pages).
++ * (TODO: hugetlb pages should have ->index in PAGE_SIZE)
+  */
+ static inline pgoff_t page_to_pgoff(struct page *page)
+ {
+-      if (unlikely(PageHeadHuge(page)))
+-              return page->index << compound_order(page);
+-
++      if (unlikely(PageHuge(page)))
++              return hugetlb_basepage_index(page);
+       return page_to_index(page);
+ }
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -737,7 +737,7 @@ again:
+               key->both.offset |= FUT_OFF_INODE; /* inode-based key */
+               key->shared.i_seq = get_inode_sequence_number(inode);
+-              key->shared.pgoff = basepage_index(tail);
++              key->shared.pgoff = page_to_pgoff(tail);
+               rcu_read_unlock();
+       }
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -1461,15 +1461,12 @@ int PageHeadHuge(struct page *page_head)
+       return get_compound_page_dtor(page_head) == free_huge_page;
+ }
+-pgoff_t __basepage_index(struct page *page)
++pgoff_t hugetlb_basepage_index(struct page *page)
+ {
+       struct page *page_head = compound_head(page);
+       pgoff_t index = page_index(page_head);
+       unsigned long compound_idx;
+-      if (!PageHuge(page_head))
+-              return page_index(page);
+-
+       if (compound_order(page_head) >= MAX_ORDER)
+               compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
+       else
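
As an aside on the patch above: the fix amounts to computing the futex
key's pgoff in PAGE_SIZE units for every base page of a compound page,
head or tail.  A minimal userspace sketch of that calculation, modelling
only the shmem-THP case (where the head's ->index is already in
PAGE_SIZE units); the names here are made up and this is not kernel
code:

#include <stdio.h>

/*
 * Model of the base-page offset the fixed page_to_pgoff() yields for a
 * subpage of a shmem huge page: the head's index plus the subpage's
 * position within the compound page.
 */
static unsigned long basepage_pgoff(unsigned long head_index,
				    unsigned long subpage_in_compound)
{
	return head_index + subpage_in_compound;
}

int main(void)
{
	/* two futexes on the same 2MB shmem huge page, one base page apart */
	unsigned long head_index = 512;	/* huge page starts 2MB into the file */

	printf("futex A pgoff = %lu\n", basepage_pgoff(head_index, 0));
	printf("futex B pgoff = %lu\n", basepage_pgoff(head_index, 1));
	/* distinct pgoffs mean distinct futex keys, so waking B no longer wakes A */
	return 0;
}
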
diff --git a/queue-5.4/mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch b/queue-5.4/mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch
new file mode 100644 (file)
index 0000000..fd576b3
--- /dev/null
@@ -0,0 +1,251 @@
+From foo@baz Mon Jun 28 01:37:10 PM CEST 2021
+From: Hugh Dickins <hughd@google.com>
+Date: Tue, 15 Jun 2021 18:23:56 -0700
+Subject: mm/thp: fix vma_address() if virtual address below file offset
+
+From: Hugh Dickins <hughd@google.com>
+
+[ Upstream commit 494334e43c16d63b878536a26505397fce6ff3a2 ]
+
+Running certain tests with a DEBUG_VM kernel would crash within hours,
+on the total_mapcount BUG() in split_huge_page_to_list(), while trying
+to free up some memory by punching a hole in a shmem huge page: split's
+try_to_unmap() was unable to find all the mappings of the page (which,
+on a !DEBUG_VM kernel, would then keep the huge page pinned in memory).
+
+When that BUG() was changed to a WARN(), it would later crash on the
+VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma) in
+mm/internal.h:vma_address(), used by rmap_walk_file() for
+try_to_unmap().
+
+vma_address() is usually correct, but there's a wraparound case when the
+vm_start address is unusually low, but vm_pgoff not so low:
+vma_address() chooses max(start, vma->vm_start), but that decides on the
+wrong address, because start has become almost ULONG_MAX.
+
+Rewrite vma_address() to be more careful about vm_pgoff; move the
+VM_BUG_ON_VMA() out of it, returning -EFAULT for errors, so that it can
+be safely used from page_mapped_in_vma() and page_address_in_vma() too.
+
+Add vma_address_end() to apply similar care to end address calculation,
+in page_vma_mapped_walk() and page_mkclean_one() and try_to_unmap_one();
+though it raises a question of whether callers would do better to supply
+pvmw->end to page_vma_mapped_walk() - I chose not, for a smaller patch.
+
+An irritation is that their apparent generality breaks down on KSM
+pages, which cannot be located by the page->index that page_to_pgoff()
+uses: as commit 4b0ece6fa016 ("mm: migrate: fix remove_migration_pte()
+for ksm pages") once discovered.  I dithered over the best thing to do
+about that, and have ended up with a VM_BUG_ON_PAGE(PageKsm) in both
+vma_address() and vma_address_end(); though the only place in danger of
+using it on them was try_to_unmap_one().
+
+Sidenote: vma_address() and vma_address_end() now use compound_nr() on a
+head page, instead of thp_size(): to make the right calculation on a
+hugetlbfs page, whether or not THPs are configured.  try_to_unmap() is
+used on hugetlbfs pages, but perhaps the wrong calculation never
+mattered.
+
+Link: https://lkml.kernel.org/r/caf1c1a3-7cfb-7f8f-1beb-ba816e932825@google.com
+Fixes: a8fa41ad2f6f ("mm, rmap: check all VMAs that PTE-mapped THP can be part of")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Jan Kara <jack@suse.cz>
+Cc: Jue Wang <juew@google.com>
+Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
+Cc: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Ralph Campbell <rcampbell@nvidia.com>
+Cc: Shakeel Butt <shakeelb@google.com>
+Cc: Wang Yugui <wangyugui@e16-tech.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Zi Yan <ziy@nvidia.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+
+Note on stable backport: fixed up conflicts on intervening thp_size().
+
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/internal.h        |   51 ++++++++++++++++++++++++++++++++++++++-------------
+ mm/page_vma_mapped.c |   16 ++++++----------
+ mm/rmap.c            |   16 ++++++++--------
+ 3 files changed, 52 insertions(+), 31 deletions(-)
+
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -339,27 +339,52 @@ static inline void mlock_migrate_page(st
+ extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+ /*
+- * At what user virtual address is page expected in @vma?
++ * At what user virtual address is page expected in vma?
++ * Returns -EFAULT if all of the page is outside the range of vma.
++ * If page is a compound head, the entire compound page is considered.
+  */
+ static inline unsigned long
+-__vma_address(struct page *page, struct vm_area_struct *vma)
++vma_address(struct page *page, struct vm_area_struct *vma)
+ {
+-      pgoff_t pgoff = page_to_pgoff(page);
+-      return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
++      pgoff_t pgoff;
++      unsigned long address;
++
++      VM_BUG_ON_PAGE(PageKsm(page), page);    /* KSM page->index unusable */
++      pgoff = page_to_pgoff(page);
++      if (pgoff >= vma->vm_pgoff) {
++              address = vma->vm_start +
++                      ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
++              /* Check for address beyond vma (or wrapped through 0?) */
++              if (address < vma->vm_start || address >= vma->vm_end)
++                      address = -EFAULT;
++      } else if (PageHead(page) &&
++                 pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
++              /* Test above avoids possibility of wrap to 0 on 32-bit */
++              address = vma->vm_start;
++      } else {
++              address = -EFAULT;
++      }
++      return address;
+ }
++/*
++ * Then at what user virtual address will none of the page be found in vma?
++ * Assumes that vma_address() already returned a good starting address.
++ * If page is a compound head, the entire compound page is considered.
++ */
+ static inline unsigned long
+-vma_address(struct page *page, struct vm_area_struct *vma)
++vma_address_end(struct page *page, struct vm_area_struct *vma)
+ {
+-      unsigned long start, end;
+-
+-      start = __vma_address(page, vma);
+-      end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1);
+-
+-      /* page should be within @vma mapping range */
+-      VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma);
++      pgoff_t pgoff;
++      unsigned long address;
+-      return max(start, vma->vm_start);
++      VM_BUG_ON_PAGE(PageKsm(page), page);    /* KSM page->index unusable */
++      pgoff = page_to_pgoff(page) + compound_nr(page);
++      address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
++      /* Check for address beyond vma (or wrapped through 0?) */
++      if (address < vma->vm_start || address > vma->vm_end)
++              address = vma->vm_end;
++      return address;
+ }
+ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+--- a/mm/page_vma_mapped.c
++++ b/mm/page_vma_mapped.c
+@@ -223,18 +223,18 @@ restart:
+       if (!map_pte(pvmw))
+               goto next_pte;
+       while (1) {
++              unsigned long end;
++
+               if (check_pte(pvmw))
+                       return true;
+ next_pte:
+               /* Seek to next pte only makes sense for THP */
+               if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
+                       return not_found(pvmw);
++              end = vma_address_end(pvmw->page, pvmw->vma);
+               do {
+                       pvmw->address += PAGE_SIZE;
+-                      if (pvmw->address >= pvmw->vma->vm_end ||
+-                          pvmw->address >=
+-                                      __vma_address(pvmw->page, pvmw->vma) +
+-                                      hpage_nr_pages(pvmw->page) * PAGE_SIZE)
++                      if (pvmw->address >= end)
+                               return not_found(pvmw);
+                       /* Did we cross page table boundary? */
+                       if (pvmw->address % PMD_SIZE == 0) {
+@@ -272,14 +272,10 @@ int page_mapped_in_vma(struct page *page
+               .vma = vma,
+               .flags = PVMW_SYNC,
+       };
+-      unsigned long start, end;
+-
+-      start = __vma_address(page, vma);
+-      end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1);
+-      if (unlikely(end < vma->vm_start || start >= vma->vm_end))
++      pvmw.address = vma_address(page, vma);
++      if (pvmw.address == -EFAULT)
+               return 0;
+-      pvmw.address = max(start, vma->vm_start);
+       if (!page_vma_mapped_walk(&pvmw))
+               return 0;
+       page_vma_mapped_walk_done(&pvmw);
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -687,7 +687,6 @@ static bool should_defer_flush(struct mm
+  */
+ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
+ {
+-      unsigned long address;
+       if (PageAnon(page)) {
+               struct anon_vma *page__anon_vma = page_anon_vma(page);
+               /*
+@@ -702,10 +701,8 @@ unsigned long page_address_in_vma(struct
+                       return -EFAULT;
+       } else
+               return -EFAULT;
+-      address = __vma_address(page, vma);
+-      if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+-              return -EFAULT;
+-      return address;
++
++      return vma_address(page, vma);
+ }
+ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
+@@ -899,7 +896,7 @@ static bool page_mkclean_one(struct page
+        */
+       mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+                               0, vma, vma->vm_mm, address,
+-                              min(vma->vm_end, address + page_size(page)));
++                              vma_address_end(page, vma));
+       mmu_notifier_invalidate_range_start(&range);
+       while (page_vma_mapped_walk(&pvmw)) {
+@@ -1383,9 +1380,10 @@ static bool try_to_unmap_one(struct page
+        * Note that the page can not be free in this function as call of
+        * try_to_unmap() must hold a reference on the page.
+        */
++      range.end = PageKsm(page) ?
++                      address + PAGE_SIZE : vma_address_end(page, vma);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+-                              address,
+-                              min(vma->vm_end, address + page_size(page)));
++                              address, range.end);
+       if (PageHuge(page)) {
+               /*
+                * If sharing is possible, start and end will be adjusted
+@@ -1848,6 +1846,7 @@ static void rmap_walk_anon(struct page *
+               struct vm_area_struct *vma = avc->vma;
+               unsigned long address = vma_address(page, vma);
++              VM_BUG_ON_VMA(address == -EFAULT, vma);
+               cond_resched();
+               if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+@@ -1902,6 +1901,7 @@ static void rmap_walk_file(struct page *
+                       pgoff_start, pgoff_end) {
+               unsigned long address = vma_address(page, vma);
++              VM_BUG_ON_VMA(address == -EFAULT, vma);
+               cond_resched();
+               if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
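
A small userspace model of the rewritten vma_address() above may help in
following the arithmetic; the numbers are invented and the PageHead test
is modelled by nr > 1 - this is a sketch, not the kernel function:

#include <stdio.h>

#define PAGE_SHIFT	12
#define EFAULT		14

/*
 * Model of the rewritten vma_address(): decide from the pgoff
 * relationship first, so (pgoff - vm_pgoff) can never go "negative" and
 * wrap, which is what tripped the old max(start, vm_start) logic.
 */
static unsigned long model_vma_address(unsigned long pgoff, unsigned long nr,
				       unsigned long vm_start,
				       unsigned long vm_end,
				       unsigned long vm_pgoff)
{
	unsigned long address;

	if (pgoff >= vm_pgoff) {
		address = vm_start + ((pgoff - vm_pgoff) << PAGE_SHIFT);
		if (address < vm_start || address >= vm_end)
			address = -EFAULT;	/* page is beyond this vma */
	} else if (nr > 1 && pgoff + nr - 1 >= vm_pgoff) {
		address = vm_start;		/* head below vma, tail inside */
	} else {
		address = -EFAULT;		/* page entirely below this vma */
	}
	return address;
}

int main(void)
{
	/*
	 * A 512-page (2MB) compound page whose head maps below the vma's
	 * file offset: the old code wrapped and returned nonsense, the
	 * rewritten logic clamps to vm_start.
	 */
	unsigned long addr = model_vma_address(0x10, 512, 0x1000, 0x200000, 0x100);

	printf("vma_address = %#lx\n", addr);
	return 0;
}
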
diff --git a/queue-5.4/mm-thp-replace-debug_vm-bug-with-vm_warn-when-unmap-fails-for-split.patch b/queue-5.4/mm-thp-replace-debug_vm-bug-with-vm_warn-when-unmap-fails-for-split.patch
new file mode 100644 (file)
index 0000000..754e949
--- /dev/null
@@ -0,0 +1,118 @@
+From foo@baz Mon Jun 28 01:37:10 PM CEST 2021
+From: Yang Shi <shy828301@gmail.com>
+Date: Tue, 15 Jun 2021 18:24:07 -0700
+Subject: mm: thp: replace DEBUG_VM BUG with VM_WARN when unmap fails for split
+
+From: Yang Shi <shy828301@gmail.com>
+
+[ Upstream commit 504e070dc08f757bccaed6d05c0f53ecbfac8a23 ]
+
+When debugging the bug reported by Wang Yugui [1], try_to_unmap() may
+fail, but the first VM_BUG_ON_PAGE() only checks page_mapcount(), so it
+may miss the failure when the head page is unmapped while another
+subpage is still mapped.  The second DEBUG_VM BUG(), which checks the
+total mapcount, would then catch it.  This may cause some confusion.
+
+As this is not a fatal issue, consolidate the two DEBUG_VM checks
+into one VM_WARN_ON_ONCE_PAGE().
+
+[1] https://lore.kernel.org/linux-mm/20210412180659.B9E3.409509F4@e16-tech.com/
+
+Link: https://lkml.kernel.org/r/d0f0db68-98b8-ebfb-16dc-f29df24cf012@google.com
+Signed-off-by: Yang Shi <shy828301@gmail.com>
+Reviewed-by: Zi Yan <ziy@nvidia.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Jan Kara <jack@suse.cz>
+Cc: Jue Wang <juew@google.com>
+Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
+Cc: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Ralph Campbell <rcampbell@nvidia.com>
+Cc: Shakeel Butt <shakeelb@google.com>
+Cc: Wang Yugui <wangyugui@e16-tech.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+
+Note on stable backport: fixed up variables in split_huge_page_to_list(),
+and fixed up the conflict on ttu_flags in unmap_page().
+
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/huge_memory.c |   24 +++++++-----------------
+ 1 file changed, 7 insertions(+), 17 deletions(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -2450,15 +2450,15 @@ static void unmap_page(struct page *page
+ {
+       enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
+               TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC;
+-      bool unmap_success;
+       VM_BUG_ON_PAGE(!PageHead(page), page);
+       if (PageAnon(page))
+               ttu_flags |= TTU_SPLIT_FREEZE;
+-      unmap_success = try_to_unmap(page, ttu_flags);
+-      VM_BUG_ON_PAGE(!unmap_success, page);
++      try_to_unmap(page, ttu_flags);
++
++      VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
+ }
+ static void remap_page(struct page *page)
+@@ -2737,7 +2737,7 @@ int split_huge_page_to_list(struct page
+       struct deferred_split *ds_queue = get_deferred_split_queue(page);
+       struct anon_vma *anon_vma = NULL;
+       struct address_space *mapping = NULL;
+-      int count, mapcount, extra_pins, ret;
++      int extra_pins, ret;
+       bool mlocked;
+       unsigned long flags;
+       pgoff_t end;
+@@ -2799,7 +2799,6 @@ int split_huge_page_to_list(struct page
+       mlocked = PageMlocked(page);
+       unmap_page(head);
+-      VM_BUG_ON_PAGE(compound_mapcount(head), head);
+       /* Make sure the page is not on per-CPU pagevec as it takes pin */
+       if (mlocked)
+@@ -2822,9 +2821,7 @@ int split_huge_page_to_list(struct page
+       /* Prevent deferred_split_scan() touching ->_refcount */
+       spin_lock(&ds_queue->split_queue_lock);
+-      count = page_count(head);
+-      mapcount = total_mapcount(head);
+-      if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
++      if (page_ref_freeze(head, 1 + extra_pins)) {
+               if (!list_empty(page_deferred_list(head))) {
+                       ds_queue->split_queue_len--;
+                       list_del(page_deferred_list(head));
+@@ -2845,16 +2842,9 @@ int split_huge_page_to_list(struct page
+               } else
+                       ret = 0;
+       } else {
+-              if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
+-                      pr_alert("total_mapcount: %u, page_count(): %u\n",
+-                                      mapcount, count);
+-                      if (PageTail(page))
+-                              dump_page(head, NULL);
+-                      dump_page(page, "total_mapcount(head) > 0");
+-                      BUG();
+-              }
+               spin_unlock(&ds_queue->split_queue_lock);
+-fail:         if (mapping)
++fail:
++              if (mapping)
+                       xa_unlock(&mapping->i_pages);
+               spin_unlock_irqrestore(&pgdata->lru_lock, flags);
+               remap_page(head);
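
To see why the single page_mapped() warning is the better check, a toy
model of a compound page's mapping state (hypothetical structures, not
the kernel's real accounting) shows how a head-only mapcount check
misses a still-mapped subpage while a whole-page check catches it:

#include <stdbool.h>
#include <stdio.h>

#define NR_SUBPAGES	512	/* 2MB THP / 4kB base pages */

/* Toy stand-in for a compound page's mapping state. */
struct toy_thp {
	int subpage_mapcount[NR_SUBPAGES];	/* PTE mappings per subpage */
	int compound_mapcount;			/* PMD mappings of the whole THP */
};

/* Roughly what page_mapped() asks of a compound page: any mapping left? */
static bool toy_page_mapped(const struct toy_thp *p)
{
	if (p->compound_mapcount)
		return true;
	for (int i = 0; i < NR_SUBPAGES; i++)
		if (p->subpage_mapcount[i])
			return true;
	return false;
}

int main(void)
{
	struct toy_thp thp = { { 0 } };

	thp.subpage_mapcount[7] = 1;	/* head unmapped, one tail still mapped */

	printf("head-only check sees mapcount %d\n", thp.subpage_mapcount[0]);
	printf("whole-page check sees mapped = %d\n", toy_page_mapped(&thp));
	return 0;
}
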
diff --git a/queue-5.4/mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch b/queue-5.4/mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch
new file mode 100644 (file)
index 0000000..e0569a4
--- /dev/null
@@ -0,0 +1,148 @@
+From foo@baz Mon Jun 28 01:37:10 PM CEST 2021
+From: Hugh Dickins <hughd@google.com>
+Date: Tue, 15 Jun 2021 18:23:53 -0700
+Subject: mm/thp: try_to_unmap() use TTU_SYNC for safe splitting
+
+From: Hugh Dickins <hughd@google.com>
+
+[ Upstream commit 732ed55823fc3ad998d43b86bf771887bcc5ec67 ]
+
+Stressing huge tmpfs often crashed on unmap_page()'s VM_BUG_ON_PAGE
+(!unmap_success): with dump_page() showing mapcount:1, but then its raw
+struct page output showing _mapcount ffffffff i.e.  mapcount 0.
+
+And even if that particular VM_BUG_ON_PAGE(!unmap_success) is removed,
+it is immediately followed by a VM_BUG_ON_PAGE(compound_mapcount(head)),
+and further down an IS_ENABLED(CONFIG_DEBUG_VM) total_mapcount BUG():
+all indicative of some mapcount difficulty in development here perhaps.
+But the !CONFIG_DEBUG_VM path handles the failures correctly and
+silently.
+
+I believe the problem is that once a racing unmap has cleared pte or
+pmd, try_to_unmap_one() may skip taking the page table lock, and emerge
+from try_to_unmap() before the racing task has reached decrementing
+mapcount.
+
+Instead of abandoning the unsafe VM_BUG_ON_PAGE(), and the ones that
+follow, use PVMW_SYNC in try_to_unmap_one() in this case: adding
+TTU_SYNC to the options, and passing that from unmap_page().
+
+When CONFIG_DEBUG_VM, or for non-debug too? Consensus is to do the same
+for both: the slight overhead added should rarely matter, except perhaps
+if splitting sparsely-populated multiply-mapped shmem.  Once confident
+that bugs are fixed, TTU_SYNC here can be removed, and the race
+tolerated.
+
+Link: https://lkml.kernel.org/r/c1e95853-8bcd-d8fd-55fa-e7f2488e78f@google.com
+Fixes: fec89c109f3a ("thp: rewrite freeze_page()/unfreeze_page() with generic rmap walkers")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Jan Kara <jack@suse.cz>
+Cc: Jue Wang <juew@google.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
+Cc: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Ralph Campbell <rcampbell@nvidia.com>
+Cc: Shakeel Butt <shakeelb@google.com>
+Cc: Wang Yugui <wangyugui@e16-tech.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Zi Yan <ziy@nvidia.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+
+Note on stable backport: upstream TTU_SYNC 0x10 takes the value which
+5.11 commit 013339df116c ("mm/rmap: always do TTU_IGNORE_ACCESS") freed.
+It is very tempting to backport that commit (as 5.10 already did) and
+make no change here; but on reflection, good as that commit is, I'm
+reluctant to include any possible side-effect of it in this series.
+
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/rmap.h |    3 ++-
+ mm/huge_memory.c     |    2 +-
+ mm/page_vma_mapped.c |   11 +++++++++++
+ mm/rmap.c            |   17 ++++++++++++++++-
+ 4 files changed, 30 insertions(+), 3 deletions(-)
+
+--- a/include/linux/rmap.h
++++ b/include/linux/rmap.h
+@@ -98,7 +98,8 @@ enum ttu_flags {
+                                        * do a final flush if necessary */
+       TTU_RMAP_LOCKED         = 0x80, /* do not grab rmap lock:
+                                        * caller holds it */
+-      TTU_SPLIT_FREEZE        = 0x100,                /* freeze pte under splitting thp */
++      TTU_SPLIT_FREEZE        = 0x100, /* freeze pte under splitting thp */
++      TTU_SYNC                = 0x200, /* avoid racy checks with PVMW_SYNC */
+ };
+ #ifdef CONFIG_MMU
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -2449,7 +2449,7 @@ void vma_adjust_trans_huge(struct vm_are
+ static void unmap_page(struct page *page)
+ {
+       enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
+-              TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
++              TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC;
+       bool unmap_success;
+       VM_BUG_ON_PAGE(!PageHead(page), page);
+--- a/mm/page_vma_mapped.c
++++ b/mm/page_vma_mapped.c
+@@ -207,6 +207,17 @@ restart:
+                       pvmw->ptl = NULL;
+               }
+       } else if (!pmd_present(pmde)) {
++              /*
++               * If PVMW_SYNC, take and drop THP pmd lock so that we
++               * cannot return prematurely, while zap_huge_pmd() has
++               * cleared *pmd but not decremented compound_mapcount().
++               */
++              if ((pvmw->flags & PVMW_SYNC) &&
++                  PageTransCompound(pvmw->page)) {
++                      spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
++
++                      spin_unlock(ptl);
++              }
+               return false;
+       }
+       if (!map_pte(pvmw))
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -1353,6 +1353,15 @@ static bool try_to_unmap_one(struct page
+       struct mmu_notifier_range range;
+       enum ttu_flags flags = (enum ttu_flags)arg;
++      /*
++       * When racing against e.g. zap_pte_range() on another cpu,
++       * in between its ptep_get_and_clear_full() and page_remove_rmap(),
++       * try_to_unmap() may return false when it is about to become true,
++       * if page table locking is skipped: use TTU_SYNC to wait for that.
++       */
++      if (flags & TTU_SYNC)
++              pvmw.flags = PVMW_SYNC;
++
+       /* munlock has nothing to gain from examining un-locked vmas */
+       if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
+               return true;
+@@ -1731,7 +1740,13 @@ bool try_to_unmap(struct page *page, enu
+       else
+               rmap_walk(page, &rwc);
+-      return !page_mapcount(page) ? true : false;
++      /*
++       * When racing against e.g. zap_pte_range() on another cpu,
++       * in between its ptep_get_and_clear_full() and page_remove_rmap(),
++       * try_to_unmap() may return false when it is about to become true,
++       * if page table locking is skipped: use TTU_SYNC to wait for that.
++       */
++      return !page_mapcount(page);
+ }
+ static int page_not_mapped(struct page *page)
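
The serialization TTU_SYNC buys can be pictured with an ordinary mutex:
briefly taking and dropping the lock that the racing zapper holds
guarantees the unmapper observes the zapper's finished state.  A
pthreads analogy (the mutex stands in for the pmd/pte spinlock; this is
an illustration, not kernel code; build with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;	/* "page table lock" */
static int pte_present = 1;
static int mapcount = 1;

static void *unmapper(void *arg)
{
	(void)arg;
	if (!pte_present) {
		/*
		 * PVMW_SYNC behaviour: take and drop the lock so we cannot
		 * return while the zapper is between clearing the entry and
		 * dropping the mapcount.
		 */
		pthread_mutex_lock(&ptl);
		pthread_mutex_unlock(&ptl);
	}
	printf("unmap_success = %d\n", mapcount == 0);	/* now a stable answer */
	return NULL;
}

int main(void)
{
	pthread_t t;

	/* Zapper: clears the entry, finishes the rmap work a little later,
	 * all under the lock - exactly the window the race exploits. */
	pthread_mutex_lock(&ptl);
	pte_present = 0;
	pthread_create(&t, NULL, unmapper, NULL);
	usleep(100 * 1000);		/* still mid-zap... */
	mapcount = 0;
	pthread_mutex_unlock(&ptl);

	pthread_join(t, NULL);
	return 0;
}

Here the main thread plays zap_pte_range(): it clears the entry first
and only later drops the mapcount, all under the lock, which is the
window the new comments in try_to_unmap() describe.
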
diff --git a/queue-5.4/mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch b/queue-5.4/mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch
new file mode 100644 (file)
index 0000000..112073c
--- /dev/null
@@ -0,0 +1,255 @@
+From foo@baz Mon Jun 28 01:37:10 PM CEST 2021
+From: Hugh Dickins <hughd@google.com>
+Date: Tue, 15 Jun 2021 18:24:03 -0700
+Subject: mm/thp: unmap_mapping_page() to fix THP truncate_cleanup_page()
+
+From: Hugh Dickins <hughd@google.com>
+
+[ Upstream commit 22061a1ffabdb9c3385de159c5db7aac3a4df1cc ]
+
+There is a race between THP unmapping and truncation, when truncate sees
+pmd_none() and skips the entry, after munmap's zap_huge_pmd() cleared
+it, but before its page_remove_rmap() gets to decrement
+compound_mapcount: generating false "BUG: Bad page cache" reports that
+the page is still mapped when deleted.  This commit fixes that, but not
+in the way I hoped.
+
+The first attempt used try_to_unmap(page, TTU_SYNC|TTU_IGNORE_MLOCK)
+instead of unmap_mapping_range() in truncate_cleanup_page(): it has
+often been an annoyance that we usually call unmap_mapping_range() with
+no pages locked, but there apply it to a single locked page.
+try_to_unmap() looks more suitable for a single locked page.
+
+However, try_to_unmap_one() contains a VM_BUG_ON_PAGE(!pvmw.pte,page):
+it is used to insert THP migration entries, but not used to unmap THPs.
+Copy zap_huge_pmd() and add THP handling now? Perhaps, but their TLB
+needs are different, I'm too ignorant of the DAX cases, and couldn't
+decide how far to go for anon+swap.  Set that aside.
+
+The second attempt took a different tack: make no change in truncate.c,
+but modify zap_huge_pmd() to insert an invalidated huge pmd instead of
+clearing it initially, then pmd_clear() between page_remove_rmap() and
+unlocking at the end.  Nice.  But powerpc blows that approach out of the
+water, with its serialize_against_pte_lookup(), and interesting pgtable
+usage.  It would need serious help to get working on powerpc (with a
+minor optimization issue on s390 too).  Set that aside.
+
+Just add an "if (page_mapped(page)) synchronize_rcu();" or other such
+delay, after unmapping in truncate_cleanup_page()? Perhaps, but though
+that's likely to reduce or eliminate the number of incidents, it would
+give less assurance of whether we had identified the problem correctly.
+
+This successful iteration introduces "unmap_mapping_page(page)" instead
+of try_to_unmap(), and goes the usual unmap_mapping_range_tree() route,
+with an addition to details.  Then zap_pmd_range() watches for this
+case, and does spin_unlock(pmd_lock) if so - just like
+page_vma_mapped_walk() now does in the PVMW_SYNC case.  Not pretty, but
+safe.
+
+Note that unmap_mapping_page() is doing a VM_BUG_ON(!PageLocked) to
+assert its interface; but currently that's only used to make sure that
+page->mapping is stable, and zap_pmd_range() doesn't care if the page is
+locked or not.  Along these lines, in invalidate_inode_pages2_range()
+move the initial unmap_mapping_range() out from under page lock, before
+then calling unmap_mapping_page() under page lock if still mapped.
+
+Link: https://lkml.kernel.org/r/a2a4a148-cdd8-942c-4ef8-51b77f643dbe@google.com
+Fixes: fc127da085c2 ("truncate: handle file thp")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Reviewed-by: Yang Shi <shy828301@gmail.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Jan Kara <jack@suse.cz>
+Cc: Jue Wang <juew@google.com>
+Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
+Cc: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Ralph Campbell <rcampbell@nvidia.com>
+Cc: Shakeel Butt <shakeelb@google.com>
+Cc: Wang Yugui <wangyugui@e16-tech.com>
+Cc: Zi Yan <ziy@nvidia.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+
+Note on stable backport: fixed up call to truncate_cleanup_page()
+in truncate_inode_pages_range().  Use hpage_nr_pages() in
+unmap_mapping_page().
+
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h |    3 +++
+ mm/memory.c        |   41 +++++++++++++++++++++++++++++++++++++++++
+ mm/truncate.c      |   43 +++++++++++++++++++------------------------
+ 3 files changed, 63 insertions(+), 24 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -1459,6 +1459,7 @@ struct zap_details {
+       struct address_space *check_mapping;    /* Check page->mapping if set */
+       pgoff_t first_index;                    /* Lowest page->index to unmap */
+       pgoff_t last_index;                     /* Highest page->index to unmap */
++      struct page *single_page;               /* Locked page to be unmapped */
+ };
+ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+@@ -1505,6 +1506,7 @@ extern vm_fault_t handle_mm_fault(struct
+ extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+                           unsigned long address, unsigned int fault_flags,
+                           bool *unlocked);
++void unmap_mapping_page(struct page *page);
+ void unmap_mapping_pages(struct address_space *mapping,
+               pgoff_t start, pgoff_t nr, bool even_cows);
+ void unmap_mapping_range(struct address_space *mapping,
+@@ -1525,6 +1527,7 @@ static inline int fixup_user_fault(struc
+       BUG();
+       return -EFAULT;
+ }
++static inline void unmap_mapping_page(struct page *page) { }
+ static inline void unmap_mapping_pages(struct address_space *mapping,
+               pgoff_t start, pgoff_t nr, bool even_cows) { }
+ static inline void unmap_mapping_range(struct address_space *mapping,
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -1165,7 +1165,18 @@ static inline unsigned long zap_pmd_rang
+                       else if (zap_huge_pmd(tlb, vma, pmd, addr))
+                               goto next;
+                       /* fall through */
++              } else if (details && details->single_page &&
++                         PageTransCompound(details->single_page) &&
++                         next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
++                      spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
++                      /*
++                       * Take and drop THP pmd lock so that we cannot return
++                       * prematurely, while zap_huge_pmd() has cleared *pmd,
++                       * but not yet decremented compound_mapcount().
++                       */
++                      spin_unlock(ptl);
+               }
++
+               /*
+                * Here there can be other concurrent MADV_DONTNEED or
+                * trans huge page faults running, and if the pmd is
+@@ -2770,6 +2781,36 @@ static inline void unmap_mapping_range_t
+ }
+ /**
++ * unmap_mapping_page() - Unmap single page from processes.
++ * @page: The locked page to be unmapped.
++ *
++ * Unmap this page from any userspace process which still has it mmaped.
++ * Typically, for efficiency, the range of nearby pages has already been
++ * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
++ * truncation or invalidation holds the lock on a page, it may find that
++ * the page has been remapped again: and then uses unmap_mapping_page()
++ * to unmap it finally.
++ */
++void unmap_mapping_page(struct page *page)
++{
++      struct address_space *mapping = page->mapping;
++      struct zap_details details = { };
++
++      VM_BUG_ON(!PageLocked(page));
++      VM_BUG_ON(PageTail(page));
++
++      details.check_mapping = mapping;
++      details.first_index = page->index;
++      details.last_index = page->index + hpage_nr_pages(page) - 1;
++      details.single_page = page;
++
++      i_mmap_lock_write(mapping);
++      if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
++              unmap_mapping_range_tree(&mapping->i_mmap, &details);
++      i_mmap_unlock_write(mapping);
++}
++
++/**
+  * unmap_mapping_pages() - Unmap pages from processes.
+  * @mapping: The address space containing pages to be unmapped.
+  * @start: Index of first page to be unmapped.
+--- a/mm/truncate.c
++++ b/mm/truncate.c
+@@ -173,13 +173,10 @@ void do_invalidatepage(struct page *page
+  * its lock, b) when a concurrent invalidate_mapping_pages got there first and
+  * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
+  */
+-static void
+-truncate_cleanup_page(struct address_space *mapping, struct page *page)
++static void truncate_cleanup_page(struct page *page)
+ {
+-      if (page_mapped(page)) {
+-              pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1;
+-              unmap_mapping_pages(mapping, page->index, nr, false);
+-      }
++      if (page_mapped(page))
++              unmap_mapping_page(page);
+       if (page_has_private(page))
+               do_invalidatepage(page, 0, PAGE_SIZE);
+@@ -224,7 +221,7 @@ int truncate_inode_page(struct address_s
+       if (page->mapping != mapping)
+               return -EIO;
+-      truncate_cleanup_page(mapping, page);
++      truncate_cleanup_page(page);
+       delete_from_page_cache(page);
+       return 0;
+ }
+@@ -362,7 +359,7 @@ void truncate_inode_pages_range(struct a
+                       pagevec_add(&locked_pvec, page);
+               }
+               for (i = 0; i < pagevec_count(&locked_pvec); i++)
+-                      truncate_cleanup_page(mapping, locked_pvec.pages[i]);
++                      truncate_cleanup_page(locked_pvec.pages[i]);
+               delete_from_page_cache_batch(mapping, &locked_pvec);
+               for (i = 0; i < pagevec_count(&locked_pvec); i++)
+                       unlock_page(locked_pvec.pages[i]);
+@@ -715,6 +712,16 @@ int invalidate_inode_pages2_range(struct
+                               continue;
+                       }
++                      if (!did_range_unmap && page_mapped(page)) {
++                              /*
++                               * If page is mapped, before taking its lock,
++                               * zap the rest of the file in one hit.
++                               */
++                              unmap_mapping_pages(mapping, index,
++                                              (1 + end - index), false);
++                              did_range_unmap = 1;
++                      }
++
+                       lock_page(page);
+                       WARN_ON(page_to_index(page) != index);
+                       if (page->mapping != mapping) {
+@@ -722,23 +729,11 @@ int invalidate_inode_pages2_range(struct
+                               continue;
+                       }
+                       wait_on_page_writeback(page);
+-                      if (page_mapped(page)) {
+-                              if (!did_range_unmap) {
+-                                      /*
+-                                       * Zap the rest of the file in one hit.
+-                                       */
+-                                      unmap_mapping_pages(mapping, index,
+-                                              (1 + end - index), false);
+-                                      did_range_unmap = 1;
+-                              } else {
+-                                      /*
+-                                       * Just zap this page
+-                                       */
+-                                      unmap_mapping_pages(mapping, index,
+-                                                              1, false);
+-                              }
+-                      }
++
++                      if (page_mapped(page))
++                              unmap_mapping_page(page);
+                       BUG_ON(page_mapped(page));
++
+                       ret2 = do_launder_page(mapping, page);
+                       if (ret2 == 0) {
+                               if (!invalidate_complete_page2(mapping, page))
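
For reference, the zap_details that unmap_mapping_page() fills in can be
mirrored in a few lines of userspace C (hypothetical struct and helper
names; 512 base pages model a 2MB THP):

#include <stdio.h>

/* Userspace mirror of the zap_details fields the new helper fills in. */
struct zap_details_model {
	const void	*check_mapping;	/* only zap pages of this mapping */
	unsigned long	first_index;	/* lowest page->index to unmap */
	unsigned long	last_index;	/* highest page->index to unmap */
	const void	*single_page;	/* the one locked page to be unmapped */
};

static struct zap_details_model
single_page_details(const void *mapping, const void *page,
		    unsigned long index, unsigned long nr_base_pages)
{
	struct zap_details_model d = {
		.check_mapping	= mapping,
		.first_index	= index,
		.last_index	= index + nr_base_pages - 1,
		.single_page	= page,
	};
	return d;
}

int main(void)
{
	int mapping, page;	/* dummies standing in for kernel objects */
	struct zap_details_model d =
		single_page_details(&mapping, &page, 1024, 512);

	/* a 2MB shmem THP at file offset 4MB covers indices [1024, 1535] */
	printf("zap indices [%lu, %lu]\n", d.first_index, d.last_index);
	return 0;
}
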
diff --git a/queue-5.4/series b/queue-5.4/series
index 7e49bce1b81ab8f1e8fe60c8fc4fdd6ebc004972..7bc9b7e04a098541fd4090d405f630ac0d467c65 100644 (file)
--- a/queue-5.4/series
@@ -42,3 +42,8 @@ kvm-do-not-allow-mapping-valid-but-non-reference-counted-pages.patch
 i2c-robotfuzz-osif-fix-control-request-directions.patch
 kthread_worker-split-code-for-canceling-the-delayed-work-timer.patch
 kthread-prevent-deadlock-when-kthread_mod_delayed_work-races-with-kthread_cancel_delayed_work_sync.patch
+mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch
+mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch
+mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch
+mm-thp-replace-debug_vm-bug-with-vm_warn-when-unmap-fails-for-split.patch
+mm-futex-fix-shared-futex-pgoff-on-shmem-huge-page.patch