--- /dev/null
+From 3a5a8d343e1cf96eb9971b17cbd4b832ab19b8e7 Mon Sep 17 00:00:00 2001
+From: Ryan Roberts <ryan.roberts@arm.com>
+Date: Wed, 1 May 2024 15:33:10 +0100
+Subject: mm: fix race between __split_huge_pmd_locked() and GUP-fast
+
+From: Ryan Roberts <ryan.roberts@arm.com>
+
+commit 3a5a8d343e1cf96eb9971b17cbd4b832ab19b8e7 upstream.
+
+__split_huge_pmd_locked() can be called for a present THP, a devmap entry
+or a (non-present) migration entry. It calls pmdp_invalidate()
+unconditionally on the pmdp and only determines whether the entry was
+present based on the returned old pmd. This is a problem for the
+migration entry case because pmd_mkinvalid(), called by pmdp_invalidate(),
+must only be called for a present pmd.
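+
+Roughly, the problematic sequence in __split_huge_pmd_locked() was (a
+simplified excerpt; the real code is in the mm/huge_memory.c hunk below):
+
+	/* Runs even when *pmd is a (non-present) migration entry. */
+	old_pmd = pmdp_invalidate(vma, haddr, pmd);
+
+	/* Only now do we find out what kind of entry it was. */
+	pmd_migration = is_pmd_migration_entry(old_pmd);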
+
+On arm64 at least, pmd_mkinvalid() will mark the pmd such that any future
+call to pmd_present() will return true. Therefore any lockless pgtable
+walker could see the migration entry pmd in this state and start
+interpreting its fields as if it were present, leading to BadThings (TM).
+GUP-fast appears to be one such lockless pgtable walker.
+
+x86 does not suffer the above problem; instead, pmd_mkinvalid() will
+corrupt the offset field of the swap entry within the swap pte. See the
+link below for discussion of that problem.
+
+Fix all of this by only calling pmdp_invalidate() for a present pmd. For
+good measure, add a warning to all implementations of
+pmdp_invalidate[_ad](). I've manually reviewed all other
+pmdp_invalidate[_ad]() call sites and believe them all to be conformant.
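+
+In outline, the call site now classifies the entry before touching it, so
+pmdp_invalidate() only ever sees a present pmd, and each implementation
+gains a warning (simplified sketch; the exact changes are in the hunks
+below):
+
+	pmd_migration = is_pmd_migration_entry(*pmd);
+	if (unlikely(pmd_migration)) {
+		old_pmd = *pmd;		/* just read it, don't invalidate it */
+		...
+	} else {
+		old_pmd = pmdp_invalidate(vma, haddr, pmd);
+		...
+	}
+
+	/* and in each pmdp_invalidate[_ad]() implementation: */
+	VM_WARN_ON_ONCE(!pmd_present(*pmdp));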
+
+This is a theoretical bug found during code review. I don't have any test
+case to trigger it in practice.
+
+Link: https://lkml.kernel.org/r/20240501143310.1381675-1-ryan.roberts@arm.com
+Link: https://lore.kernel.org/all/0dd7827a-6334-439a-8fd0-43c98e6af22b@arm.com/
+Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: Zi Yan <ziy@nvidia.com>
+Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Andreas Larsson <andreas@gaisler.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
+Cc: Borislav Petkov (AMD) <bp@alien8.de>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
+Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: "David S. Miller" <davem@davemloft.net>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jonathan Corbet <corbet@lwn.net>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
+Cc: Nicholas Piggin <npiggin@gmail.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Sven Schnelle <svens@linux.ibm.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Will Deacon <will@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/mm/book3s64/pgtable.c | 1 +
+ arch/s390/include/asm/pgtable.h | 4 ++-
+ arch/sparc/mm/tlb.c | 1 +
+ mm/huge_memory.c | 49 +++++++++++++++++++------------------
+ mm/pgtable-generic.c | 5 +++
+ 5 files changed, 35 insertions(+), 25 deletions(-)
+
+--- a/arch/powerpc/mm/book3s64/pgtable.c
++++ b/arch/powerpc/mm/book3s64/pgtable.c
+@@ -107,6 +107,7 @@ pmd_t pmdp_invalidate(struct vm_area_str
+ {
+ unsigned long old_pmd;
+
++ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+ old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
+ flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ /*
+--- a/arch/s390/include/asm/pgtable.h
++++ b/arch/s390/include/asm/pgtable.h
+@@ -1609,8 +1609,10 @@ static inline pmd_t pmdp_huge_clear_flus
+ static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+ {
+- pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
++ pmd_t pmd;
+
++ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
++ pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
+ return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd);
+ }
+
+--- a/arch/sparc/mm/tlb.c
++++ b/arch/sparc/mm/tlb.c
+@@ -246,6 +246,7 @@ pmd_t pmdp_invalidate(struct vm_area_str
+ {
+ pmd_t old, entry;
+
++ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+ entry = __pmd(pmd_val(*pmdp) & ~_PAGE_VALID);
+ old = pmdp_establish(vma, address, pmdp, entry);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -2198,38 +2198,41 @@ static void __split_huge_pmd_locked(stru
+ return __split_huge_zero_page_pmd(vma, haddr, pmd);
+ }
+
+- /*
+- * Up to this point the pmd is present and huge and userland has the
+- * whole access to the hugepage during the split (which happens in
+- * place). If we overwrite the pmd with the not-huge version pointing
+- * to the pte here (which of course we could if all CPUs were bug
+- * free), userland could trigger a small page size TLB miss on the
+- * small sized TLB while the hugepage TLB entry is still established in
+- * the huge TLB. Some CPU doesn't like that.
+- * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
+- * 383 on page 93. Intel should be safe but is also warns that it's
+- * only safe if the permission and cache attributes of the two entries
+- * loaded in the two TLB is identical (which should be the case here).
+- * But it is generally safer to never allow small and huge TLB entries
+- * for the same virtual address to be loaded simultaneously. So instead
+- * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
+- * current pmd notpresent (atomically because here the pmd_trans_huge
+- * must remain set at all times on the pmd until the split is complete
+- * for this pmd), then we flush the SMP TLB and finally we write the
+- * non-huge version of the pmd entry with pmd_populate.
+- */
+- old_pmd = pmdp_invalidate(vma, haddr, pmd);
+-
+- pmd_migration = is_pmd_migration_entry(old_pmd);
++ pmd_migration = is_pmd_migration_entry(*pmd);
+ if (unlikely(pmd_migration)) {
+ swp_entry_t entry;
+
++ old_pmd = *pmd;
+ entry = pmd_to_swp_entry(old_pmd);
+ page = pfn_to_page(swp_offset(entry));
+ write = is_write_migration_entry(entry);
+ young = false;
+ soft_dirty = pmd_swp_soft_dirty(old_pmd);
+ } else {
++ /*
++ * Up to this point the pmd is present and huge and userland has
++ * the whole access to the hugepage during the split (which
++ * happens in place). If we overwrite the pmd with the not-huge
++ * version pointing to the pte here (which of course we could if
++ * all CPUs were bug free), userland could trigger a small page
++ * size TLB miss on the small sized TLB while the hugepage TLB
++ * entry is still established in the huge TLB. Some CPU doesn't
++ * like that. See
++ * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
++ * 383 on page 105. Intel should be safe but is also warns that
++ * it's only safe if the permission and cache attributes of the
++ * two entries loaded in the two TLB is identical (which should
++ * be the case here). But it is generally safer to never allow
++ * small and huge TLB entries for the same virtual address to be
++ * loaded simultaneously. So instead of doing "pmd_populate();
++ * flush_pmd_tlb_range();" we first mark the current pmd
++ * notpresent (atomically because here the pmd_trans_huge must
++ * remain set at all times on the pmd until the split is
++ * complete for this pmd), then we flush the SMP TLB and finally
++ * we write the non-huge version of the pmd entry with
++ * pmd_populate.
++ */
++ old_pmd = pmdp_invalidate(vma, haddr, pmd);
+ page = pmd_page(old_pmd);
+ if (pmd_dirty(old_pmd))
+ SetPageDirty(page);
+--- a/mm/pgtable-generic.c
++++ b/mm/pgtable-generic.c
+@@ -185,7 +185,10 @@ pgtable_t pgtable_trans_huge_withdraw(st
+ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+ {
+- pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp));
++ pmd_t old;
++
++ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
++ old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp));
+ flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return old;
+ }