--- /dev/null
+From 3a5a8d343e1cf96eb9971b17cbd4b832ab19b8e7 Mon Sep 17 00:00:00 2001
+From: Ryan Roberts <ryan.roberts@arm.com>
+Date: Wed, 1 May 2024 15:33:10 +0100
+Subject: mm: fix race between __split_huge_pmd_locked() and GUP-fast
+
+From: Ryan Roberts <ryan.roberts@arm.com>
+
+commit 3a5a8d343e1cf96eb9971b17cbd4b832ab19b8e7 upstream.
+
+__split_huge_pmd_locked() can be called for a present THP, a devmap entry
+or a (non-present) migration entry. It calls pmdp_invalidate()
+unconditionally on the pmdp and only determines whether the entry was
+present based on the returned old pmd. This is a problem for the
+migration entry case because pmd_mkinvalid(), called by pmdp_invalidate(),
+must only be called for a present pmd.
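+
+Roughly, the problematic sequence in __split_huge_pmd_locked() was (a
+simplified excerpt; the real code is in the mm/huge_memory.c hunk below):
+
+	/* Runs even when *pmd is a (non-present) migration entry. */
+	old_pmd = pmdp_invalidate(vma, haddr, pmd);
+
+	/* Only now do we find out what kind of entry it was. */
+	pmd_migration = is_pmd_migration_entry(old_pmd);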
+
+On arm64 at least, pmd_mkinvalid() will mark the pmd such that any future
+call to pmd_present() will return true. Therefore any lockless pgtable
+walker could see the migration entry pmd in this state and start
+interpreting its fields as if it were present, leading to BadThings (TM).
+GUP-fast appears to be one such lockless pgtable walker.
+
+x86 does not suffer the above problem; instead, pmd_mkinvalid() will
+corrupt the offset field of the swap entry within the swap pte. See the
+link below for discussion of that problem.
+
+Fix all of this by only calling pmdp_invalidate() for a present pmd. For
+good measure, add a warning to all implementations of
+pmdp_invalidate[_ad](). I've manually reviewed all other
+pmdp_invalidate[_ad]() call sites and believe them all to be conformant.
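+
+In outline, the call site now classifies the entry before touching it, so
+pmdp_invalidate() only ever sees a present pmd, and each implementation
+gains a warning (simplified sketch; the exact changes are in the hunks
+below):
+
+	pmd_migration = is_pmd_migration_entry(*pmd);
+	if (unlikely(pmd_migration)) {
+		old_pmd = *pmd;		/* just read it, don't invalidate it */
+		...
+	} else {
+		old_pmd = pmdp_invalidate(vma, haddr, pmd);
+		...
+	}
+
+	/* and in each pmdp_invalidate[_ad]() implementation: */
+	VM_WARN_ON_ONCE(!pmd_present(*pmdp));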
+
+This is a theoretical bug found during code review. I don't have any test
+case to trigger it in practice.
+
+Link: https://lkml.kernel.org/r/20240501143310.1381675-1-ryan.roberts@arm.com
+Link: https://lore.kernel.org/all/0dd7827a-6334-439a-8fd0-43c98e6af22b@arm.com/
+Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: Zi Yan <ziy@nvidia.com>
+Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Andreas Larsson <andreas@gaisler.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
+Cc: Borislav Petkov (AMD) <bp@alien8.de>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
+Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: "David S. Miller" <davem@davemloft.net>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jonathan Corbet <corbet@lwn.net>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
+Cc: Nicholas Piggin <npiggin@gmail.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Sven Schnelle <svens@linux.ibm.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Will Deacon <will@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/mm/book3s64/pgtable.c | 1 +
+ arch/s390/include/asm/pgtable.h | 4 ++-
+ arch/sparc/mm/tlb.c | 1 +
+ mm/huge_memory.c | 49 +++++++++++++++++++------------------
+ mm/pgtable-generic.c | 5 +++
+ 5 files changed, 35 insertions(+), 25 deletions(-)
+
+--- a/arch/powerpc/mm/book3s64/pgtable.c
++++ b/arch/powerpc/mm/book3s64/pgtable.c
+@@ -107,6 +107,7 @@ pmd_t pmdp_invalidate(struct vm_area_str
+ {
+ unsigned long old_pmd;
+
++ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+ old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
+ flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ /*
+--- a/arch/s390/include/asm/pgtable.h
++++ b/arch/s390/include/asm/pgtable.h
+@@ -1609,8 +1609,10 @@ static inline pmd_t pmdp_huge_clear_flus
+ static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+ {
+- pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
++ pmd_t pmd;
+
++ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
++ pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
+ return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd);
+ }
+
+--- a/arch/sparc/mm/tlb.c
++++ b/arch/sparc/mm/tlb.c
+@@ -246,6 +246,7 @@ pmd_t pmdp_invalidate(struct vm_area_str
+ {
+ pmd_t old, entry;
+
++ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+ entry = __pmd(pmd_val(*pmdp) & ~_PAGE_VALID);
+ old = pmdp_establish(vma, address, pmdp, entry);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -2198,38 +2198,41 @@ static void __split_huge_pmd_locked(stru
+ return __split_huge_zero_page_pmd(vma, haddr, pmd);
+ }
+
+- /*
+- * Up to this point the pmd is present and huge and userland has the
+- * whole access to the hugepage during the split (which happens in
+- * place). If we overwrite the pmd with the not-huge version pointing
+- * to the pte here (which of course we could if all CPUs were bug
+- * free), userland could trigger a small page size TLB miss on the
+- * small sized TLB while the hugepage TLB entry is still established in
+- * the huge TLB. Some CPU doesn't like that.
+- * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
+- * 383 on page 93. Intel should be safe but is also warns that it's
+- * only safe if the permission and cache attributes of the two entries
+- * loaded in the two TLB is identical (which should be the case here).
+- * But it is generally safer to never allow small and huge TLB entries
+- * for the same virtual address to be loaded simultaneously. So instead
+- * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
+- * current pmd notpresent (atomically because here the pmd_trans_huge
+- * must remain set at all times on the pmd until the split is complete
+- * for this pmd), then we flush the SMP TLB and finally we write the
+- * non-huge version of the pmd entry with pmd_populate.
+- */
+- old_pmd = pmdp_invalidate(vma, haddr, pmd);
+-
+- pmd_migration = is_pmd_migration_entry(old_pmd);
++ pmd_migration = is_pmd_migration_entry(*pmd);
+ if (unlikely(pmd_migration)) {
+ swp_entry_t entry;
+
++ old_pmd = *pmd;
+ entry = pmd_to_swp_entry(old_pmd);
+ page = pfn_to_page(swp_offset(entry));
+ write = is_write_migration_entry(entry);
+ young = false;
+ soft_dirty = pmd_swp_soft_dirty(old_pmd);
+ } else {
++ /*
++ * Up to this point the pmd is present and huge and userland has
++ * the whole access to the hugepage during the split (which
++ * happens in place). If we overwrite the pmd with the not-huge
++ * version pointing to the pte here (which of course we could if
++ * all CPUs were bug free), userland could trigger a small page
++ * size TLB miss on the small sized TLB while the hugepage TLB
++ * entry is still established in the huge TLB. Some CPU doesn't
++ * like that. See
++ * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
++ * 383 on page 105. Intel should be safe but is also warns that
++ * it's only safe if the permission and cache attributes of the
++ * two entries loaded in the two TLB is identical (which should
++ * be the case here). But it is generally safer to never allow
++ * small and huge TLB entries for the same virtual address to be
++ * loaded simultaneously. So instead of doing "pmd_populate();
++ * flush_pmd_tlb_range();" we first mark the current pmd
++ * notpresent (atomically because here the pmd_trans_huge must
++ * remain set at all times on the pmd until the split is
++ * complete for this pmd), then we flush the SMP TLB and finally
++ * we write the non-huge version of the pmd entry with
++ * pmd_populate.
++ */
++ old_pmd = pmdp_invalidate(vma, haddr, pmd);
+ page = pmd_page(old_pmd);
+ if (pmd_dirty(old_pmd))
+ SetPageDirty(page);
+--- a/mm/pgtable-generic.c
++++ b/mm/pgtable-generic.c
+@@ -185,7 +185,10 @@ pgtable_t pgtable_trans_huge_withdraw(st
+ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+ {
+- pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp));
++ pmd_t old;
++
++ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
++ old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp));
+ flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return old;
+ }