From: Greg Kroah-Hartman Date: Mon, 28 Jun 2021 11:37:49 +0000 (+0200) Subject: 5.4-stable patches X-Git-Tag: v5.12.14~11 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=90477a624999ffe274575718246803470b25d556;p=thirdparty%2Fkernel%2Fstable-queue.git 5.4-stable patches added patches: mm-futex-fix-shared-futex-pgoff-on-shmem-huge-page.patch mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch mm-thp-replace-debug_vm-bug-with-vm_warn-when-unmap-fails-for-split.patch mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch --- diff --git a/queue-5.4/mm-futex-fix-shared-futex-pgoff-on-shmem-huge-page.patch b/queue-5.4/mm-futex-fix-shared-futex-pgoff-on-shmem-huge-page.patch new file mode 100644 index 00000000000..ee847d04609 --- /dev/null +++ b/queue-5.4/mm-futex-fix-shared-futex-pgoff-on-shmem-huge-page.patch @@ -0,0 +1,154 @@ +From foo@baz Mon Jun 28 01:37:10 PM CEST 2021 +From: Hugh Dickins +Date: Thu, 24 Jun 2021 18:39:52 -0700 +Subject: mm, futex: fix shared futex pgoff on shmem huge page + +From: Hugh Dickins + +[ Upstream commit fe19bd3dae3d15d2fbfdb3de8839a6ea0fe94264 ] + +If more than one futex is placed on a shmem huge page, it can happen +that waking the second wakes the first instead, and leaves the second +waiting: the key's shared.pgoff is wrong. + +When 3.11 commit 13d60f4b6ab5 ("futex: Take hugepages into account when +generating futex_key"), the only shared huge pages came from hugetlbfs, +and the code added to deal with its exceptional page->index was put into +hugetlb source. Then that was missed when 4.8 added shmem huge pages. + +page_to_pgoff() is what others use for this nowadays: except that, as +currently written, it gives the right answer on hugetlbfs head, but +nonsense on hugetlbfs tails. Fix that by calling hugetlbfs-specific +hugetlb_basepage_index() on PageHuge tails as well as on head. + +Yes, it's unconventional to declare hugetlb_basepage_index() there in +pagemap.h, rather than in hugetlb.h; but I do not expect anything but +page_to_pgoff() ever to need it. + +[akpm@linux-foundation.org: give hugetlb_basepage_index() prototype the correct scope] + +Link: https://lkml.kernel.org/r/b17d946b-d09-326e-b42a-52884c36df32@google.com +Fixes: 800d8c63b2e9 ("shmem: add huge pages support") +Reported-by: Neel Natu +Signed-off-by: Hugh Dickins +Reviewed-by: Matthew Wilcox (Oracle) +Acked-by: Thomas Gleixner +Cc: "Kirill A. Shutemov" +Cc: Zhang Yi +Cc: Mel Gorman +Cc: Mike Kravetz +Cc: Ingo Molnar +Cc: Peter Zijlstra +Cc: Darren Hart +Cc: Davidlohr Bueso +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds + +Note on stable backport: leave redundant #include +in kernel/futex.c, to avoid conflict over the header files included. 
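For reference, a minimal userspace sketch of the index arithmetic that page_to_pgoff() now hands off to hugetlb_basepage_index(): a hugetlbfs head page keeps page->index in huge-page units, so the base-page pgoff of any subpage is the head index scaled by the compound order, plus the subpage's position within the compound page. The function, parameter names and values below are illustrative only, not kernel code.

#include <stdio.h>

/*
 * Userspace sketch of hugetlb_basepage_index()'s arithmetic; the names
 * here are made up for illustration, not kernel API.
 */
static unsigned long hugetlb_base_pgoff(unsigned long head_index,
					unsigned int compound_order,
					unsigned long subpage_idx)
{
	/* head_index is in huge-page units on hugetlbfs */
	return (head_index << compound_order) + subpage_idx;
}

int main(void)
{
	/*
	 * 2MB huge page of 4KB base pages (order 9), futex word on the
	 * 4th base page of the 2nd huge page of the file: pgoff 515.
	 */
	printf("shared.pgoff = %lu\n", hugetlb_base_pgoff(1, 9, 3));
	return 0;
}

Shmem huge pages, by contrast, already keep page->index in PAGE_SIZE units, which is why the patched page_to_pgoff() only takes this path for PageHuge() pages and otherwise falls through to page_to_index().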
+ +Signed-off-by: Hugh Dickins +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/hugetlb.h | 16 ---------------- + include/linux/pagemap.h | 13 +++++++------ + kernel/futex.c | 2 +- + mm/hugetlb.c | 5 +---- + 4 files changed, 9 insertions(+), 27 deletions(-) + +--- a/include/linux/hugetlb.h ++++ b/include/linux/hugetlb.h +@@ -469,17 +469,6 @@ static inline int hstate_index(struct hs + return h - hstates; + } + +-pgoff_t __basepage_index(struct page *page); +- +-/* Return page->index in PAGE_SIZE units */ +-static inline pgoff_t basepage_index(struct page *page) +-{ +- if (!PageCompound(page)) +- return page->index; +- +- return __basepage_index(page); +-} +- + extern int dissolve_free_huge_page(struct page *page); + extern int dissolve_free_huge_pages(unsigned long start_pfn, + unsigned long end_pfn); +@@ -695,11 +684,6 @@ static inline int hstate_index(struct hs + return 0; + } + +-static inline pgoff_t basepage_index(struct page *page) +-{ +- return page->index; +-} +- + static inline int dissolve_free_huge_page(struct page *page) + { + return 0; +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -397,7 +397,7 @@ static inline struct page *read_mapping_ + } + + /* +- * Get index of the page with in radix-tree ++ * Get index of the page within radix-tree (but not for hugetlb pages). + * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE) + */ + static inline pgoff_t page_to_index(struct page *page) +@@ -416,15 +416,16 @@ static inline pgoff_t page_to_index(stru + return pgoff; + } + ++extern pgoff_t hugetlb_basepage_index(struct page *page); ++ + /* +- * Get the offset in PAGE_SIZE. +- * (TODO: hugepage should have ->index in PAGE_SIZE) ++ * Get the offset in PAGE_SIZE (even for hugetlb pages). ++ * (TODO: hugetlb pages should have ->index in PAGE_SIZE) + */ + static inline pgoff_t page_to_pgoff(struct page *page) + { +- if (unlikely(PageHeadHuge(page))) +- return page->index << compound_order(page); +- ++ if (unlikely(PageHuge(page))) ++ return hugetlb_basepage_index(page); + return page_to_index(page); + } + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -737,7 +737,7 @@ again: + + key->both.offset |= FUT_OFF_INODE; /* inode-based key */ + key->shared.i_seq = get_inode_sequence_number(inode); +- key->shared.pgoff = basepage_index(tail); ++ key->shared.pgoff = page_to_pgoff(tail); + rcu_read_unlock(); + } + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -1461,15 +1461,12 @@ int PageHeadHuge(struct page *page_head) + return get_compound_page_dtor(page_head) == free_huge_page; + } + +-pgoff_t __basepage_index(struct page *page) ++pgoff_t hugetlb_basepage_index(struct page *page) + { + struct page *page_head = compound_head(page); + pgoff_t index = page_index(page_head); + unsigned long compound_idx; + +- if (!PageHuge(page_head)) +- return page_index(page); +- + if (compound_order(page_head) >= MAX_ORDER) + compound_idx = page_to_pfn(page) - page_to_pfn(page_head); + else diff --git a/queue-5.4/mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch b/queue-5.4/mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch new file mode 100644 index 00000000000..fd576b3494b --- /dev/null +++ b/queue-5.4/mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch @@ -0,0 +1,251 @@ +From foo@baz Mon Jun 28 01:37:10 PM CEST 2021 +From: Hugh Dickins +Date: Tue, 15 Jun 2021 18:23:56 -0700 +Subject: mm/thp: fix vma_address() if virtual address below file offset + +From: Hugh Dickins + +[ Upstream commit 494334e43c16d63b878536a26505397fce6ff3a2 ] 
+ +Running certain tests with a DEBUG_VM kernel would crash within hours, +on the total_mapcount BUG() in split_huge_page_to_list(), while trying +to free up some memory by punching a hole in a shmem huge page: split's +try_to_unmap() was unable to find all the mappings of the page (which, +on a !DEBUG_VM kernel, would then keep the huge page pinned in memory). + +When that BUG() was changed to a WARN(), it would later crash on the +VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma) in +mm/internal.h:vma_address(), used by rmap_walk_file() for +try_to_unmap(). + +vma_address() is usually correct, but there's a wraparound case when the +vm_start address is unusually low, but vm_pgoff not so low: +vma_address() chooses max(start, vma->vm_start), but that decides on the +wrong address, because start has become almost ULONG_MAX. + +Rewrite vma_address() to be more careful about vm_pgoff; move the +VM_BUG_ON_VMA() out of it, returning -EFAULT for errors, so that it can +be safely used from page_mapped_in_vma() and page_address_in_vma() too. + +Add vma_address_end() to apply similar care to end address calculation, +in page_vma_mapped_walk() and page_mkclean_one() and try_to_unmap_one(); +though it raises a question of whether callers would do better to supply +pvmw->end to page_vma_mapped_walk() - I chose not, for a smaller patch. + +An irritation is that their apparent generality breaks down on KSM +pages, which cannot be located by the page->index that page_to_pgoff() +uses: as commit 4b0ece6fa016 ("mm: migrate: fix remove_migration_pte() +for ksm pages") once discovered. I dithered over the best thing to do +about that, and have ended up with a VM_BUG_ON_PAGE(PageKsm) in both +vma_address() and vma_address_end(); though the only place in danger of +using it on them was try_to_unmap_one(). + +Sidenote: vma_address() and vma_address_end() now use compound_nr() on a +head page, instead of thp_size(): to make the right calculation on a +hugetlbfs page, whether or not THPs are configured. try_to_unmap() is +used on hugetlbfs pages, but perhaps the wrong calculation never +mattered. + +Link: https://lkml.kernel.org/r/caf1c1a3-7cfb-7f8f-1beb-ba816e932825@google.com +Fixes: a8fa41ad2f6f ("mm, rmap: check all VMAs that PTE-mapped THP can be part of") +Signed-off-by: Hugh Dickins +Acked-by: Kirill A. Shutemov +Cc: Alistair Popple +Cc: Jan Kara +Cc: Jue Wang +Cc: "Matthew Wilcox (Oracle)" +Cc: Miaohe Lin +Cc: Minchan Kim +Cc: Naoya Horiguchi +Cc: Oscar Salvador +Cc: Peter Xu +Cc: Ralph Campbell +Cc: Shakeel Butt +Cc: Wang Yugui +Cc: Yang Shi +Cc: Zi Yan +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds + +Note on stable backport: fixed up conflicts on intervening thp_size(). + +Signed-off-by: Hugh Dickins +Signed-off-by: Greg Kroah-Hartman +--- + mm/internal.h | 51 ++++++++++++++++++++++++++++++++++++++------------- + mm/page_vma_mapped.c | 16 ++++++---------- + mm/rmap.c | 16 ++++++++-------- + 3 files changed, 52 insertions(+), 31 deletions(-) + +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -339,27 +339,52 @@ static inline void mlock_migrate_page(st + extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); + + /* +- * At what user virtual address is page expected in @vma? ++ * At what user virtual address is page expected in vma? ++ * Returns -EFAULT if all of the page is outside the range of vma. ++ * If page is a compound head, the entire compound page is considered. 
+ */ + static inline unsigned long +-__vma_address(struct page *page, struct vm_area_struct *vma) ++vma_address(struct page *page, struct vm_area_struct *vma) + { +- pgoff_t pgoff = page_to_pgoff(page); +- return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); ++ pgoff_t pgoff; ++ unsigned long address; ++ ++ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ ++ pgoff = page_to_pgoff(page); ++ if (pgoff >= vma->vm_pgoff) { ++ address = vma->vm_start + ++ ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); ++ /* Check for address beyond vma (or wrapped through 0?) */ ++ if (address < vma->vm_start || address >= vma->vm_end) ++ address = -EFAULT; ++ } else if (PageHead(page) && ++ pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) { ++ /* Test above avoids possibility of wrap to 0 on 32-bit */ ++ address = vma->vm_start; ++ } else { ++ address = -EFAULT; ++ } ++ return address; + } + ++/* ++ * Then at what user virtual address will none of the page be found in vma? ++ * Assumes that vma_address() already returned a good starting address. ++ * If page is a compound head, the entire compound page is considered. ++ */ + static inline unsigned long +-vma_address(struct page *page, struct vm_area_struct *vma) ++vma_address_end(struct page *page, struct vm_area_struct *vma) + { +- unsigned long start, end; +- +- start = __vma_address(page, vma); +- end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1); +- +- /* page should be within @vma mapping range */ +- VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma); ++ pgoff_t pgoff; ++ unsigned long address; + +- return max(start, vma->vm_start); ++ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ ++ pgoff = page_to_pgoff(page) + compound_nr(page); ++ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); ++ /* Check for address beyond vma (or wrapped through 0?) */ ++ if (address < vma->vm_start || address > vma->vm_end) ++ address = vma->vm_end; ++ return address; + } + + static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, +--- a/mm/page_vma_mapped.c ++++ b/mm/page_vma_mapped.c +@@ -223,18 +223,18 @@ restart: + if (!map_pte(pvmw)) + goto next_pte; + while (1) { ++ unsigned long end; ++ + if (check_pte(pvmw)) + return true; + next_pte: + /* Seek to next pte only makes sense for THP */ + if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) + return not_found(pvmw); ++ end = vma_address_end(pvmw->page, pvmw->vma); + do { + pvmw->address += PAGE_SIZE; +- if (pvmw->address >= pvmw->vma->vm_end || +- pvmw->address >= +- __vma_address(pvmw->page, pvmw->vma) + +- hpage_nr_pages(pvmw->page) * PAGE_SIZE) ++ if (pvmw->address >= end) + return not_found(pvmw); + /* Did we cross page table boundary? 
*/ + if (pvmw->address % PMD_SIZE == 0) { +@@ -272,14 +272,10 @@ int page_mapped_in_vma(struct page *page + .vma = vma, + .flags = PVMW_SYNC, + }; +- unsigned long start, end; +- +- start = __vma_address(page, vma); +- end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1); + +- if (unlikely(end < vma->vm_start || start >= vma->vm_end)) ++ pvmw.address = vma_address(page, vma); ++ if (pvmw.address == -EFAULT) + return 0; +- pvmw.address = max(start, vma->vm_start); + if (!page_vma_mapped_walk(&pvmw)) + return 0; + page_vma_mapped_walk_done(&pvmw); +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -687,7 +687,6 @@ static bool should_defer_flush(struct mm + */ + unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) + { +- unsigned long address; + if (PageAnon(page)) { + struct anon_vma *page__anon_vma = page_anon_vma(page); + /* +@@ -702,10 +701,8 @@ unsigned long page_address_in_vma(struct + return -EFAULT; + } else + return -EFAULT; +- address = __vma_address(page, vma); +- if (unlikely(address < vma->vm_start || address >= vma->vm_end)) +- return -EFAULT; +- return address; ++ ++ return vma_address(page, vma); + } + + pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) +@@ -899,7 +896,7 @@ static bool page_mkclean_one(struct page + */ + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, + 0, vma, vma->vm_mm, address, +- min(vma->vm_end, address + page_size(page))); ++ vma_address_end(page, vma)); + mmu_notifier_invalidate_range_start(&range); + + while (page_vma_mapped_walk(&pvmw)) { +@@ -1383,9 +1380,10 @@ static bool try_to_unmap_one(struct page + * Note that the page can not be free in this function as call of + * try_to_unmap() must hold a reference on the page. + */ ++ range.end = PageKsm(page) ? ++ address + PAGE_SIZE : vma_address_end(page, vma); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, +- address, +- min(vma->vm_end, address + page_size(page))); ++ address, range.end); + if (PageHuge(page)) { + /* + * If sharing is possible, start and end will be adjusted +@@ -1848,6 +1846,7 @@ static void rmap_walk_anon(struct page * + struct vm_area_struct *vma = avc->vma; + unsigned long address = vma_address(page, vma); + ++ VM_BUG_ON_VMA(address == -EFAULT, vma); + cond_resched(); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) +@@ -1902,6 +1901,7 @@ static void rmap_walk_file(struct page * + pgoff_start, pgoff_end) { + unsigned long address = vma_address(page, vma); + ++ VM_BUG_ON_VMA(address == -EFAULT, vma); + cond_resched(); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) diff --git a/queue-5.4/mm-thp-replace-debug_vm-bug-with-vm_warn-when-unmap-fails-for-split.patch b/queue-5.4/mm-thp-replace-debug_vm-bug-with-vm_warn-when-unmap-fails-for-split.patch new file mode 100644 index 00000000000..754e9498166 --- /dev/null +++ b/queue-5.4/mm-thp-replace-debug_vm-bug-with-vm_warn-when-unmap-fails-for-split.patch @@ -0,0 +1,118 @@ +From foo@baz Mon Jun 28 01:37:10 PM CEST 2021 +From: Yang Shi +Date: Tue, 15 Jun 2021 18:24:07 -0700 +Subject: mm: thp: replace DEBUG_VM BUG with VM_WARN when unmap fails for split + +From: Yang Shi + +[ Upstream commit 504e070dc08f757bccaed6d05c0f53ecbfac8a23 ] + +When debugging the bug reported by Wang Yugui [1], try_to_unmap() may +fail, but the first VM_BUG_ON_PAGE() just checks page_mapcount() however +it may miss the failure when head page is unmapped but other subpage is +mapped. Then the second DEBUG_VM BUG() that check total mapcount would +catch it. 
This may incur some confusion. + +As this is not a fatal issue, so consolidate the two DEBUG_VM checks +into one VM_WARN_ON_ONCE_PAGE(). + +[1] https://lore.kernel.org/linux-mm/20210412180659.B9E3.409509F4@e16-tech.com/ + +Link: https://lkml.kernel.org/r/d0f0db68-98b8-ebfb-16dc-f29df24cf012@google.com +Signed-off-by: Yang Shi +Reviewed-by: Zi Yan +Acked-by: Kirill A. Shutemov +Signed-off-by: Hugh Dickins +Cc: Alistair Popple +Cc: Jan Kara +Cc: Jue Wang +Cc: "Matthew Wilcox (Oracle)" +Cc: Miaohe Lin +Cc: Minchan Kim +Cc: Naoya Horiguchi +Cc: Oscar Salvador +Cc: Peter Xu +Cc: Ralph Campbell +Cc: Shakeel Butt +Cc: Wang Yugui +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds + +Note on stable backport: fixed up variables in split_huge_page_to_list(), +and fixed up the conflict on ttu_flags in unmap_page(). + +Signed-off-by: Hugh Dickins +Signed-off-by: Greg Kroah-Hartman +--- + mm/huge_memory.c | 24 +++++++----------------- + 1 file changed, 7 insertions(+), 17 deletions(-) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2450,15 +2450,15 @@ static void unmap_page(struct page *page + { + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | + TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC; +- bool unmap_success; + + VM_BUG_ON_PAGE(!PageHead(page), page); + + if (PageAnon(page)) + ttu_flags |= TTU_SPLIT_FREEZE; + +- unmap_success = try_to_unmap(page, ttu_flags); +- VM_BUG_ON_PAGE(!unmap_success, page); ++ try_to_unmap(page, ttu_flags); ++ ++ VM_WARN_ON_ONCE_PAGE(page_mapped(page), page); + } + + static void remap_page(struct page *page) +@@ -2737,7 +2737,7 @@ int split_huge_page_to_list(struct page + struct deferred_split *ds_queue = get_deferred_split_queue(page); + struct anon_vma *anon_vma = NULL; + struct address_space *mapping = NULL; +- int count, mapcount, extra_pins, ret; ++ int extra_pins, ret; + bool mlocked; + unsigned long flags; + pgoff_t end; +@@ -2799,7 +2799,6 @@ int split_huge_page_to_list(struct page + + mlocked = PageMlocked(page); + unmap_page(head); +- VM_BUG_ON_PAGE(compound_mapcount(head), head); + + /* Make sure the page is not on per-CPU pagevec as it takes pin */ + if (mlocked) +@@ -2822,9 +2821,7 @@ int split_huge_page_to_list(struct page + + /* Prevent deferred_split_scan() touching ->_refcount */ + spin_lock(&ds_queue->split_queue_lock); +- count = page_count(head); +- mapcount = total_mapcount(head); +- if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { ++ if (page_ref_freeze(head, 1 + extra_pins)) { + if (!list_empty(page_deferred_list(head))) { + ds_queue->split_queue_len--; + list_del(page_deferred_list(head)); +@@ -2845,16 +2842,9 @@ int split_huge_page_to_list(struct page + } else + ret = 0; + } else { +- if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { +- pr_alert("total_mapcount: %u, page_count(): %u\n", +- mapcount, count); +- if (PageTail(page)) +- dump_page(head, NULL); +- dump_page(page, "total_mapcount(head) > 0"); +- BUG(); +- } + spin_unlock(&ds_queue->split_queue_lock); +-fail: if (mapping) ++fail: ++ if (mapping) + xa_unlock(&mapping->i_pages); + spin_unlock_irqrestore(&pgdata->lru_lock, flags); + remap_page(head); diff --git a/queue-5.4/mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch b/queue-5.4/mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch new file mode 100644 index 00000000000..e0569a44b44 --- /dev/null +++ b/queue-5.4/mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch @@ -0,0 +1,148 @@ +From foo@baz Mon Jun 28 01:37:10 PM CEST 2021 +From: Hugh Dickins +Date: Tue, 15 Jun 2021 
18:23:53 -0700 +Subject: mm/thp: try_to_unmap() use TTU_SYNC for safe splitting + +From: Hugh Dickins + +[ Upstream commit 732ed55823fc3ad998d43b86bf771887bcc5ec67 ] + +Stressing huge tmpfs often crashed on unmap_page()'s VM_BUG_ON_PAGE +(!unmap_success): with dump_page() showing mapcount:1, but then its raw +struct page output showing _mapcount ffffffff i.e. mapcount 0. + +And even if that particular VM_BUG_ON_PAGE(!unmap_success) is removed, +it is immediately followed by a VM_BUG_ON_PAGE(compound_mapcount(head)), +and further down an IS_ENABLED(CONFIG_DEBUG_VM) total_mapcount BUG(): +all indicative of some mapcount difficulty in development here perhaps. +But the !CONFIG_DEBUG_VM path handles the failures correctly and +silently. + +I believe the problem is that once a racing unmap has cleared pte or +pmd, try_to_unmap_one() may skip taking the page table lock, and emerge +from try_to_unmap() before the racing task has reached decrementing +mapcount. + +Instead of abandoning the unsafe VM_BUG_ON_PAGE(), and the ones that +follow, use PVMW_SYNC in try_to_unmap_one() in this case: adding +TTU_SYNC to the options, and passing that from unmap_page(). + +When CONFIG_DEBUG_VM, or for non-debug too? Consensus is to do the same +for both: the slight overhead added should rarely matter, except perhaps +if splitting sparsely-populated multiply-mapped shmem. Once confident +that bugs are fixed, TTU_SYNC here can be removed, and the race +tolerated. + +Link: https://lkml.kernel.org/r/c1e95853-8bcd-d8fd-55fa-e7f2488e78f@google.com +Fixes: fec89c109f3a ("thp: rewrite freeze_page()/unfreeze_page() with generic rmap walkers") +Signed-off-by: Hugh Dickins +Cc: Alistair Popple +Cc: Jan Kara +Cc: Jue Wang +Cc: Kirill A. Shutemov +Cc: "Matthew Wilcox (Oracle)" +Cc: Miaohe Lin +Cc: Minchan Kim +Cc: Naoya Horiguchi +Cc: Oscar Salvador +Cc: Peter Xu +Cc: Ralph Campbell +Cc: Shakeel Butt +Cc: Wang Yugui +Cc: Yang Shi +Cc: Zi Yan +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds + +Note on stable backport: upstream TTU_SYNC 0x10 takes the value which +5.11 commit 013339df116c ("mm/rmap: always do TTU_IGNORE_ACCESS") freed. +It is very tempting to backport that commit (as 5.10 already did) and +make no change here; but on reflection, good as that commit is, I'm +reluctant to include any possible side-effect of it in this series. 
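As an aside, a userspace analogy (pthreads, illustrative names only, not kernel code) of what the TTU_SYNC/PVMW_SYNC take-and-drop buys: the zapper clears the entry and decrements the mapcount inside a single critical section, so a walker that notices the cleared entry without taking the lock can briefly take and drop that same lock to be sure the decrement has also happened before it reports on the mapcount.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* All names below are stand-ins for illustration, not kernel API. */
static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool pmd_is_present = true;
static atomic_int mapcount = 1;

static void *zapper(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&ptl);
	pmd_is_present = false;		/* like pmdp_huge_get_and_clear_full() */
	mapcount--;			/* like page_remove_rmap() */
	pthread_mutex_unlock(&ptl);
	return NULL;
}

static int walker(bool sync)
{
	if (!pmd_is_present && sync) {
		/*
		 * TTU_SYNC analogue: take and drop the lock so the zapper's
		 * critical section must have completed, rather than returning
		 * in the window between its two updates.
		 */
		pthread_mutex_lock(&ptl);
		pthread_mutex_unlock(&ptl);
	}
	return mapcount;	/* without sync, this may still read 1 */
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, zapper, NULL);
	printf("mapcount seen by walker: %d\n", walker(true));
	pthread_join(t, NULL);
	return 0;
}

The printed value depends on scheduling; the point is only the take-and-drop, which has the same shape as the pmd_lock()/spin_unlock() pair that page_vma_mapped_walk() now performs under PVMW_SYNC.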
+ +Signed-off-by: Hugh Dickins +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/rmap.h | 3 ++- + mm/huge_memory.c | 2 +- + mm/page_vma_mapped.c | 11 +++++++++++ + mm/rmap.c | 17 ++++++++++++++++- + 4 files changed, 30 insertions(+), 3 deletions(-) + +--- a/include/linux/rmap.h ++++ b/include/linux/rmap.h +@@ -98,7 +98,8 @@ enum ttu_flags { + * do a final flush if necessary */ + TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock: + * caller holds it */ +- TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */ ++ TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */ ++ TTU_SYNC = 0x200, /* avoid racy checks with PVMW_SYNC */ + }; + + #ifdef CONFIG_MMU +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2449,7 +2449,7 @@ void vma_adjust_trans_huge(struct vm_are + static void unmap_page(struct page *page) + { + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | +- TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; ++ TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC; + bool unmap_success; + + VM_BUG_ON_PAGE(!PageHead(page), page); +--- a/mm/page_vma_mapped.c ++++ b/mm/page_vma_mapped.c +@@ -207,6 +207,17 @@ restart: + pvmw->ptl = NULL; + } + } else if (!pmd_present(pmde)) { ++ /* ++ * If PVMW_SYNC, take and drop THP pmd lock so that we ++ * cannot return prematurely, while zap_huge_pmd() has ++ * cleared *pmd but not decremented compound_mapcount(). ++ */ ++ if ((pvmw->flags & PVMW_SYNC) && ++ PageTransCompound(pvmw->page)) { ++ spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); ++ ++ spin_unlock(ptl); ++ } + return false; + } + if (!map_pte(pvmw)) +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -1353,6 +1353,15 @@ static bool try_to_unmap_one(struct page + struct mmu_notifier_range range; + enum ttu_flags flags = (enum ttu_flags)arg; + ++ /* ++ * When racing against e.g. zap_pte_range() on another cpu, ++ * in between its ptep_get_and_clear_full() and page_remove_rmap(), ++ * try_to_unmap() may return false when it is about to become true, ++ * if page table locking is skipped: use TTU_SYNC to wait for that. ++ */ ++ if (flags & TTU_SYNC) ++ pvmw.flags = PVMW_SYNC; ++ + /* munlock has nothing to gain from examining un-locked vmas */ + if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) + return true; +@@ -1731,7 +1740,13 @@ bool try_to_unmap(struct page *page, enu + else + rmap_walk(page, &rwc); + +- return !page_mapcount(page) ? true : false; ++ /* ++ * When racing against e.g. zap_pte_range() on another cpu, ++ * in between its ptep_get_and_clear_full() and page_remove_rmap(), ++ * try_to_unmap() may return false when it is about to become true, ++ * if page table locking is skipped: use TTU_SYNC to wait for that. 
++ */ ++ return !page_mapcount(page); + } + + static int page_not_mapped(struct page *page) diff --git a/queue-5.4/mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch b/queue-5.4/mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch new file mode 100644 index 00000000000..112073ce64c --- /dev/null +++ b/queue-5.4/mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch @@ -0,0 +1,255 @@ +From foo@baz Mon Jun 28 01:37:10 PM CEST 2021 +From: Hugh Dickins +Date: Tue, 15 Jun 2021 18:24:03 -0700 +Subject: mm/thp: unmap_mapping_page() to fix THP truncate_cleanup_page() + +From: Hugh Dickins + +[ Upstream commit 22061a1ffabdb9c3385de159c5db7aac3a4df1cc ] + +There is a race between THP unmapping and truncation, when truncate sees +pmd_none() and skips the entry, after munmap's zap_huge_pmd() cleared +it, but before its page_remove_rmap() gets to decrement +compound_mapcount: generating false "BUG: Bad page cache" reports that +the page is still mapped when deleted. This commit fixes that, but not +in the way I hoped. + +The first attempt used try_to_unmap(page, TTU_SYNC|TTU_IGNORE_MLOCK) +instead of unmap_mapping_range() in truncate_cleanup_page(): it has +often been an annoyance that we usually call unmap_mapping_range() with +no pages locked, but there apply it to a single locked page. +try_to_unmap() looks more suitable for a single locked page. + +However, try_to_unmap_one() contains a VM_BUG_ON_PAGE(!pvmw.pte,page): +it is used to insert THP migration entries, but not used to unmap THPs. +Copy zap_huge_pmd() and add THP handling now? Perhaps, but their TLB +needs are different, I'm too ignorant of the DAX cases, and couldn't +decide how far to go for anon+swap. Set that aside. + +The second attempt took a different tack: make no change in truncate.c, +but modify zap_huge_pmd() to insert an invalidated huge pmd instead of +clearing it initially, then pmd_clear() between page_remove_rmap() and +unlocking at the end. Nice. But powerpc blows that approach out of the +water, with its serialize_against_pte_lookup(), and interesting pgtable +usage. It would need serious help to get working on powerpc (with a +minor optimization issue on s390 too). Set that aside. + +Just add an "if (page_mapped(page)) synchronize_rcu();" or other such +delay, after unmapping in truncate_cleanup_page()? Perhaps, but though +that's likely to reduce or eliminate the number of incidents, it would +give less assurance of whether we had identified the problem correctly. + +This successful iteration introduces "unmap_mapping_page(page)" instead +of try_to_unmap(), and goes the usual unmap_mapping_range_tree() route, +with an addition to details. Then zap_pmd_range() watches for this +case, and does spin_unlock(pmd_lock) if so - just like +page_vma_mapped_walk() now does in the PVMW_SYNC case. Not pretty, but +safe. + +Note that unmap_mapping_page() is doing a VM_BUG_ON(!PageLocked) to +assert its interface; but currently that's only used to make sure that +page->mapping is stable, and zap_pmd_range() doesn't care if the page is +locked or not. Along these lines, in invalidate_inode_pages2_range() +move the initial unmap_mapping_range() out from under page lock, before +then calling unmap_mapping_page() under page lock if still mapped. + +Link: https://lkml.kernel.org/r/a2a4a148-cdd8-942c-4ef8-51b77f643dbe@google.com +Fixes: fc127da085c2 ("truncate: handle file thp") +Signed-off-by: Hugh Dickins +Acked-by: Kirill A. 
Shutemov +Reviewed-by: Yang Shi +Cc: Alistair Popple +Cc: Jan Kara +Cc: Jue Wang +Cc: "Matthew Wilcox (Oracle)" +Cc: Miaohe Lin +Cc: Minchan Kim +Cc: Naoya Horiguchi +Cc: Oscar Salvador +Cc: Peter Xu +Cc: Ralph Campbell +Cc: Shakeel Butt +Cc: Wang Yugui +Cc: Zi Yan +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds + +Note on stable backport: fixed up call to truncate_cleanup_page() +in truncate_inode_pages_range(). Use hpage_nr_pages() in +unmap_mapping_page(). + +Signed-off-by: Hugh Dickins +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/mm.h | 3 +++ + mm/memory.c | 41 +++++++++++++++++++++++++++++++++++++++++ + mm/truncate.c | 43 +++++++++++++++++++------------------------ + 3 files changed, 63 insertions(+), 24 deletions(-) + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1459,6 +1459,7 @@ struct zap_details { + struct address_space *check_mapping; /* Check page->mapping if set */ + pgoff_t first_index; /* Lowest page->index to unmap */ + pgoff_t last_index; /* Highest page->index to unmap */ ++ struct page *single_page; /* Locked page to be unmapped */ + }; + + struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, +@@ -1505,6 +1506,7 @@ extern vm_fault_t handle_mm_fault(struct + extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags, + bool *unlocked); ++void unmap_mapping_page(struct page *page); + void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows); + void unmap_mapping_range(struct address_space *mapping, +@@ -1525,6 +1527,7 @@ static inline int fixup_user_fault(struc + BUG(); + return -EFAULT; + } ++static inline void unmap_mapping_page(struct page *page) { } + static inline void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows) { } + static inline void unmap_mapping_range(struct address_space *mapping, +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1165,7 +1165,18 @@ static inline unsigned long zap_pmd_rang + else if (zap_huge_pmd(tlb, vma, pmd, addr)) + goto next; + /* fall through */ ++ } else if (details && details->single_page && ++ PageTransCompound(details->single_page) && ++ next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { ++ spinlock_t *ptl = pmd_lock(tlb->mm, pmd); ++ /* ++ * Take and drop THP pmd lock so that we cannot return ++ * prematurely, while zap_huge_pmd() has cleared *pmd, ++ * but not yet decremented compound_mapcount(). ++ */ ++ spin_unlock(ptl); + } ++ + /* + * Here there can be other concurrent MADV_DONTNEED or + * trans huge page faults running, and if the pmd is +@@ -2770,6 +2781,36 @@ static inline void unmap_mapping_range_t + } + + /** ++ * unmap_mapping_page() - Unmap single page from processes. ++ * @page: The locked page to be unmapped. ++ * ++ * Unmap this page from any userspace process which still has it mmaped. ++ * Typically, for efficiency, the range of nearby pages has already been ++ * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once ++ * truncation or invalidation holds the lock on a page, it may find that ++ * the page has been remapped again: and then uses unmap_mapping_page() ++ * to unmap it finally. 
++ */ ++void unmap_mapping_page(struct page *page) ++{ ++ struct address_space *mapping = page->mapping; ++ struct zap_details details = { }; ++ ++ VM_BUG_ON(!PageLocked(page)); ++ VM_BUG_ON(PageTail(page)); ++ ++ details.check_mapping = mapping; ++ details.first_index = page->index; ++ details.last_index = page->index + hpage_nr_pages(page) - 1; ++ details.single_page = page; ++ ++ i_mmap_lock_write(mapping); ++ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) ++ unmap_mapping_range_tree(&mapping->i_mmap, &details); ++ i_mmap_unlock_write(mapping); ++} ++ ++/** + * unmap_mapping_pages() - Unmap pages from processes. + * @mapping: The address space containing pages to be unmapped. + * @start: Index of first page to be unmapped. +--- a/mm/truncate.c ++++ b/mm/truncate.c +@@ -173,13 +173,10 @@ void do_invalidatepage(struct page *page + * its lock, b) when a concurrent invalidate_mapping_pages got there first and + * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. + */ +-static void +-truncate_cleanup_page(struct address_space *mapping, struct page *page) ++static void truncate_cleanup_page(struct page *page) + { +- if (page_mapped(page)) { +- pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1; +- unmap_mapping_pages(mapping, page->index, nr, false); +- } ++ if (page_mapped(page)) ++ unmap_mapping_page(page); + + if (page_has_private(page)) + do_invalidatepage(page, 0, PAGE_SIZE); +@@ -224,7 +221,7 @@ int truncate_inode_page(struct address_s + if (page->mapping != mapping) + return -EIO; + +- truncate_cleanup_page(mapping, page); ++ truncate_cleanup_page(page); + delete_from_page_cache(page); + return 0; + } +@@ -362,7 +359,7 @@ void truncate_inode_pages_range(struct a + pagevec_add(&locked_pvec, page); + } + for (i = 0; i < pagevec_count(&locked_pvec); i++) +- truncate_cleanup_page(mapping, locked_pvec.pages[i]); ++ truncate_cleanup_page(locked_pvec.pages[i]); + delete_from_page_cache_batch(mapping, &locked_pvec); + for (i = 0; i < pagevec_count(&locked_pvec); i++) + unlock_page(locked_pvec.pages[i]); +@@ -715,6 +712,16 @@ int invalidate_inode_pages2_range(struct + continue; + } + ++ if (!did_range_unmap && page_mapped(page)) { ++ /* ++ * If page is mapped, before taking its lock, ++ * zap the rest of the file in one hit. ++ */ ++ unmap_mapping_pages(mapping, index, ++ (1 + end - index), false); ++ did_range_unmap = 1; ++ } ++ + lock_page(page); + WARN_ON(page_to_index(page) != index); + if (page->mapping != mapping) { +@@ -722,23 +729,11 @@ int invalidate_inode_pages2_range(struct + continue; + } + wait_on_page_writeback(page); +- if (page_mapped(page)) { +- if (!did_range_unmap) { +- /* +- * Zap the rest of the file in one hit. 
+- */ +- unmap_mapping_pages(mapping, index, +- (1 + end - index), false); +- did_range_unmap = 1; +- } else { +- /* +- * Just zap this page +- */ +- unmap_mapping_pages(mapping, index, +- 1, false); +- } +- } ++ ++ if (page_mapped(page)) ++ unmap_mapping_page(page); + BUG_ON(page_mapped(page)); ++ + ret2 = do_launder_page(mapping, page); + if (ret2 == 0) { + if (!invalidate_complete_page2(mapping, page)) diff --git a/queue-5.4/series b/queue-5.4/series index 7e49bce1b81..7bc9b7e04a0 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -42,3 +42,8 @@ kvm-do-not-allow-mapping-valid-but-non-reference-counted-pages.patch i2c-robotfuzz-osif-fix-control-request-directions.patch kthread_worker-split-code-for-canceling-the-delayed-work-timer.patch kthread-prevent-deadlock-when-kthread_mod_delayed_work-races-with-kthread_cancel_delayed_work_sync.patch +mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch +mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch +mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch +mm-thp-replace-debug_vm-bug-with-vm_warn-when-unmap-fails-for-split.patch +mm-futex-fix-shared-futex-pgoff-on-shmem-huge-page.patch
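Not part of any patch above, purely for illustration: the wraparound guarded against by mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch can be reproduced with plain unsigned arithmetic. The sketch below is userspace C with made-up field values (PAGE_SHIFT of 12 and 64-bit unsigned long assumed); vma_stub and both helpers are illustrative stand-ins for the kernel's __vma_address()/vma_address(), not the real code.

#include <errno.h>
#include <stdio.h>

#define PAGE_SHIFT 12

struct vma_stub {
	unsigned long vm_start, vm_end, vm_pgoff;
};

/* Old behaviour: when pgoff < vm_pgoff the subtraction wraps, "start"
 * lands near ULONG_MAX, and max(start, vm_start) picks the bogus value. */
static unsigned long old_vma_address(unsigned long pgoff, const struct vma_stub *v)
{
	unsigned long start = v->vm_start +
			      ((pgoff - v->vm_pgoff) << PAGE_SHIFT);
	return start > v->vm_start ? start : v->vm_start;
}

/* Patched behaviour: handle the compound-head-below-vma case explicitly,
 * and return -EFAULT instead of a wrapped address otherwise. */
static unsigned long new_vma_address(unsigned long pgoff, unsigned long compound_nr,
				     const struct vma_stub *v)
{
	unsigned long addr;

	if (pgoff >= v->vm_pgoff) {
		addr = v->vm_start + ((pgoff - v->vm_pgoff) << PAGE_SHIFT);
		if (addr < v->vm_start || addr >= v->vm_end)
			addr = -EFAULT;
	} else if (compound_nr > 1 && pgoff + compound_nr - 1 >= v->vm_pgoff) {
		/* head sits below the vma, but its tail pages overlap it */
		addr = v->vm_start;
	} else {
		addr = -EFAULT;
	}
	return addr;
}

int main(void)
{
	/* unusually low vm_start, mapping begins at file page 16 */
	struct vma_stub v = { .vm_start = 0x1000, .vm_end = 0x201000,
			      .vm_pgoff = 16 };
	unsigned long head_pgoff = 2;		/* THP head below vm_pgoff */
	unsigned long compound_nr = 512;	/* 2MB THP of 4KB pages */

	printf("old: %#lx (wrapped, far beyond vm_end)\n",
	       old_vma_address(head_pgoff, &v));
	printf("new: %#lx (clamped to vm_start)\n",
	       new_vma_address(head_pgoff, compound_nr, &v));
	return 0;
}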