From: Greg Kroah-Hartman Date: Wed, 5 Oct 2022 16:41:36 +0000 (+0200) Subject: 5.10-stable patches X-Git-Tag: v5.4.217~17 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=8f34825bb6a44095c13276fe56081df006365998;p=thirdparty%2Fkernel%2Fstable-queue.git 5.10-stable patches added patches: mm-gup-fix-the-fast-gup-race-against-thp-collapse.patch powerpc-64s-radix-don-t-need-to-broadcast-ipi-for-radix-pmd-collapse-flush.patch --- diff --git a/queue-5.10/mm-gup-fix-the-fast-gup-race-against-thp-collapse.patch b/queue-5.10/mm-gup-fix-the-fast-gup-race-against-thp-collapse.patch new file mode 100644 index 00000000000..d71809339b0 --- /dev/null +++ b/queue-5.10/mm-gup-fix-the-fast-gup-race-against-thp-collapse.patch @@ -0,0 +1,145 @@ +From 70cbc3cc78a997d8247b50389d37c4e1736019da Mon Sep 17 00:00:00 2001 +From: Yang Shi +Date: Wed, 7 Sep 2022 11:01:43 -0700 +Subject: mm: gup: fix the fast GUP race against THP collapse + +From: Yang Shi + +commit 70cbc3cc78a997d8247b50389d37c4e1736019da upstream. + +Since general RCU GUP fast was introduced in commit 2667f50e8b81 ("mm: +introduce a general RCU get_user_pages_fast()"), a TLB flush is no longer +sufficient to handle concurrent GUP-fast in all cases, it only handles +traditional IPI-based GUP-fast correctly. On architectures that send an +IPI broadcast on TLB flush, it works as expected. 
But on the +architectures that do not use IPI to broadcast TLB flush, it may have the +below race: + + CPU A CPU B +THP collapse fast GUP + gup_pmd_range() <-- see valid pmd + gup_pte_range() <-- work on pte +pmdp_collapse_flush() <-- clear pmd and flush +__collapse_huge_page_isolate() + check page pinned <-- before GUP bump refcount + pin the page + check PTE <-- no change +__collapse_huge_page_copy() + copy data to huge page + ptep_clear() +install huge pmd for the huge page + return the stale page +discard the stale page + +The race can be fixed by checking whether PMD is changed or not after +taking the page pin in fast GUP, just like what it does for PTE. If the +PMD is changed it means there may be parallel THP collapse, so GUP should +back off. + +Also update the stale comment about serializing against fast GUP in +khugepaged. + +Link: https://lkml.kernel.org/r/20220907180144.555485-1-shy828301@gmail.com +Fixes: 2667f50e8b81 ("mm: introduce a general RCU get_user_pages_fast()") +Acked-by: David Hildenbrand +Acked-by: Peter Xu +Signed-off-by: Yang Shi +Reviewed-by: John Hubbard +Cc: "Aneesh Kumar K.V" +Cc: Hugh Dickins +Cc: Jason Gunthorpe +Cc: "Kirill A. Shutemov" +Cc: Michael Ellerman +Cc: Nicholas Piggin +Cc: Christophe Leroy +Cc: <stable@vger.kernel.org> +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/gup.c | 34 ++++++++++++++++++++++++++++------ + mm/khugepaged.c | 10 ++++++---- + 2 files changed, 34 insertions(+), 10 deletions(-) + +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -2128,8 +2128,28 @@ static void __maybe_unused undo_dev_page + } + + #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL +-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, +- unsigned int flags, struct page **pages, int *nr) ++/* ++ * Fast-gup relies on pte change detection to avoid concurrent pgtable ++ * operations. ++ * ++ * To pin the page, fast-gup needs to do below in order: ++ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
++ * ++ * For the rest of pgtable operations where pgtable updates can be racy ++ * with fast-gup, we need to do (1) clear pte, then (2) check whether page ++ * is pinned. ++ * ++ * Above will work for all pte-level operations, including THP split. ++ * ++ * For THP collapse, it's a bit more complicated because fast-gup may be ++ * walking a pgtable page that is being freed (pte is still valid but pmd ++ * can be cleared already). To avoid race in such condition, we need to ++ * also check pmd here to make sure pmd doesn't change (corresponds to ++ * pmdp_collapse_flush() in the THP collapse code path). ++ */ ++static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, ++ unsigned long end, unsigned int flags, ++ struct page **pages, int *nr) + { + struct dev_pagemap *pgmap = NULL; + int nr_start = *nr, ret = 0; +@@ -2169,7 +2189,8 @@ static int gup_pte_range(pmd_t pmd, unsi + if (!head) + goto pte_unmap; + +- if (unlikely(pte_val(pte) != pte_val(*ptep))) { ++ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || ++ unlikely(pte_val(pte) != pte_val(*ptep))) { + put_compound_head(head, 1, flags); + goto pte_unmap; + } +@@ -2214,8 +2235,9 @@ pte_unmap: + * get_user_pages_fast_only implementation that can pin pages. Thus it's still + * useful to have gup_huge_pmd even if we can't operate on ptes. 
+ */ +-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, +- unsigned int flags, struct page **pages, int *nr) ++static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, ++ unsigned long end, unsigned int flags, ++ struct page **pages, int *nr) + { + return 0; + } +@@ -2522,7 +2544,7 @@ static int gup_pmd_range(pud_t *pudp, pu + if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, + PMD_SHIFT, next, flags, pages, nr)) + return 0; +- } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr)) ++ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr)) + return 0; + } while (pmdp++, addr = next, addr != end); + +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -1144,10 +1144,12 @@ static void collapse_huge_page(struct mm + + pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ + /* +- * After this gup_fast can't run anymore. This also removes +- * any huge TLB entry from the CPU so we won't allow +- * huge and small TLB entries for the same virtual address +- * to avoid the risk of CPU bugs in that area. ++ * This removes any huge TLB entry from the CPU so we won't allow ++ * huge and small TLB entries for the same virtual address to ++ * avoid the risk of CPU bugs in that area. ++ * ++ * Parallel fast GUP is fine since fast GUP will back off when ++ * it detects PMD is changed. 
+ */ + _pmd = pmdp_collapse_flush(vma, address, pmd); + spin_unlock(pmd_ptl); diff --git a/queue-5.10/powerpc-64s-radix-don-t-need-to-broadcast-ipi-for-radix-pmd-collapse-flush.patch b/queue-5.10/powerpc-64s-radix-don-t-need-to-broadcast-ipi-for-radix-pmd-collapse-flush.patch new file mode 100644 index 00000000000..a439f4c3380 --- /dev/null +++ b/queue-5.10/powerpc-64s-radix-don-t-need-to-broadcast-ipi-for-radix-pmd-collapse-flush.patch @@ -0,0 +1,55 @@ +From bedf03416913d88c796288f9dca109a53608c745 Mon Sep 17 00:00:00 2001 +From: Yang Shi +Date: Wed, 7 Sep 2022 11:01:44 -0700 +Subject: powerpc/64s/radix: don't need to broadcast IPI for radix pmd collapse flush + +From: Yang Shi + +commit bedf03416913d88c796288f9dca109a53608c745 upstream. + +The IPI broadcast is used to serialize against fast-GUP, but fast-GUP will +move to use RCU instead of disabling local interrupts in fast-GUP. Using +an IPI is the old-styled way of serializing against fast-GUP although it +still works as expected now. + +And fast-GUP now fixed the potential race with THP collapse by checking +whether PMD is changed or not. So IPI broadcast in radix pmd collapse +flush is not necessary anymore. But it is still needed for hash TLB. + +Link: https://lkml.kernel.org/r/20220907180144.555485-2-shy828301@gmail.com +Suggested-by: Aneesh Kumar K.V +Signed-off-by: Yang Shi +Acked-by: David Hildenbrand +Acked-by: Peter Xu +Cc: Christophe Leroy +Cc: Hugh Dickins +Cc: Jason Gunthorpe +Cc: John Hubbard +Cc: "Kirill A. 
Shutemov" +Cc: Michael Ellerman +Cc: Nicholas Piggin +Cc: <stable@vger.kernel.org> +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/mm/book3s64/radix_pgtable.c | 9 --------- + 1 file changed, 9 deletions(-) + +--- a/arch/powerpc/mm/book3s64/radix_pgtable.c ++++ b/arch/powerpc/mm/book3s64/radix_pgtable.c +@@ -997,15 +997,6 @@ pmd_t radix__pmdp_collapse_flush(struct + pmd = *pmdp; + pmd_clear(pmdp); + +- /* +- * pmdp collapse_flush need to ensure that there are no parallel gup +- * walk after this call. This is needed so that we can have stable +- * page ref count when collapsing a page. We don't allow a collapse page +- * if we have gup taken on the page. We can ensure that by sending IPI +- * because gup walk happens with IRQ disabled. +- */ +- serialize_against_pte_lookup(vma->vm_mm); +- + radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); + + return pmd; diff --git a/queue-5.10/series b/queue-5.10/series index 3d6353b1535..e61fb23bffb 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -1,2 +1,4 @@ makefile.extrawarn-move-wcast-function-type-strict-to-w-1.patch docs-update-mediator-information-in-coc-docs.patch +mm-gup-fix-the-fast-gup-race-against-thp-collapse.patch +powerpc-64s-radix-don-t-need-to-broadcast-ipi-for-radix-pmd-collapse-flush.patch