+++ /dev/null
-From 6f73cf81e6438c334ae03321c915e9d376501fd8 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Mon, 9 May 2022 18:20:50 -0700
-Subject: mm: avoid unnecessary flush on change_huge_pmd()
-
-From: Nadav Amit <namit@vmware.com>
-
-[ Upstream commit 4f83145721f362c2f4d312edc4755269a2069488 ]
-
-Calls to change_protection_range() on THP can trigger, at least on x86,
-two TLB flushes for one page: one immediately, when pmdp_invalidate() is
-called by change_huge_pmd(), and then another one later (that can be
-batched) when change_protection_range() finishes.
-
-The first TLB flush is only necessary to prevent the dirty bit (and, less
-importantly, the access bit) from changing while the PTE is modified.
-However, this is not needed on x86, since CPUs set the dirty bit
-atomically with an additional check that the PTE is (still) present. One
-caveat is Intel's Knights Landing, which has a bug and does not do so.
-
-Leverage this behavior to eliminate the unnecessary TLB flush in
-change_huge_pmd(). Introduce a new arch-specific pmdp_invalidate_ad()
-that only protects the access and dirty bits from further changes.
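-
-As an illustration only, a simplified sketch of the resulting call pattern in
-change_huge_pmd() (all identifiers as in the mm/huge_memory.c hunk below):
-
-	/*
-	 * No immediate TLB flush: once the PMD is non-present, hardware can
-	 * no longer set its access/dirty bits.
-	 */
-	oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
-	entry = pmd_modify(oldpmd, newprot);
-	set_pmd_at(mm, addr, pmd, entry);
-	/* Any flush the protection change itself needs can be batched later. */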
-
-Link: https://lkml.kernel.org/r/20220401180821.1986781-4-namit@vmware.com
-Signed-off-by: Nadav Amit <namit@vmware.com>
-Cc: Andrea Arcangeli <aarcange@redhat.com>
-Cc: Andrew Cooper <andrew.cooper3@citrix.com>
-Cc: Andy Lutomirski <luto@kernel.org>
-Cc: Dave Hansen <dave.hansen@linux.intel.com>
-Cc: Peter Xu <peterx@redhat.com>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: Will Deacon <will@kernel.org>
-Cc: Yu Zhao <yuzhao@google.com>
-Cc: Nick Piggin <npiggin@gmail.com>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Stable-dep-of: 3a5a8d343e1c ("mm: fix race between __split_huge_pmd_locked() and GUP-fast")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- arch/x86/include/asm/pgtable.h | 5 +++++
- arch/x86/mm/pgtable.c | 10 ++++++++++
- include/linux/pgtable.h | 20 ++++++++++++++++++++
- mm/huge_memory.c | 4 ++--
- mm/pgtable-generic.c | 8 ++++++++
- 5 files changed, 45 insertions(+), 2 deletions(-)
-
-diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
-index 448cd01eb3ecb..c04be133a6cd7 100644
---- a/arch/x86/include/asm/pgtable.h
-+++ b/arch/x86/include/asm/pgtable.h
-@@ -1146,6 +1146,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
- }
- }
- #endif
-+
-+#define __HAVE_ARCH_PMDP_INVALIDATE_AD
-+extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
-+ unsigned long address, pmd_t *pmdp);
-+
- /*
- * Page table pages are page-aligned. The lower half of the top
- * level is used for userspace and the top half for the kernel.
-diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
-index 3481b35cb4ec7..f16059e9a85e7 100644
---- a/arch/x86/mm/pgtable.c
-+++ b/arch/x86/mm/pgtable.c
-@@ -608,6 +608,16 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
-
- return young;
- }
-+
-+pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
-+ pmd_t *pmdp)
-+{
-+ /*
-+ * No flush is necessary. Once an invalid PTE is established, the PTE's
-+ * access and dirty bits cannot be updated.
-+ */
-+ return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
-+}
- #endif
-
- /**
-diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
-index d468efcf48f45..952969aa19ec1 100644
---- a/include/linux/pgtable.h
-+++ b/include/linux/pgtable.h
-@@ -562,6 +562,26 @@ extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp);
- #endif
-
-+#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
-+
-+/*
-+ * pmdp_invalidate_ad() invalidates the PMD while changing a transparent
-+ * hugepage mapping in the page tables. This function is similar to
-+ * pmdp_invalidate(), but should only be used if the access and dirty bits would
-+ * not be cleared by the software in the new PMD value. The function ensures
-+ * that hardware updates of the access and dirty bits are not lost.
-+ *
-+ * Doing so allows certain architectures to avoid a TLB flush in most cases.
-+ * Another TLB flush might still be necessary later if the PMD update itself
-+ * requires one (e.g., if protection was made stricter). Even then, the caller
-+ * may be able to batch these TLB flushes, so fewer flush operations are
-+ * needed.
-+ */
-+extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
-+ unsigned long address, pmd_t *pmdp);
-+#endif
-+
- #ifndef __HAVE_ARCH_PTE_SAME
- static inline int pte_same(pte_t pte_a, pte_t pte_b)
- {
-diff --git a/mm/huge_memory.c b/mm/huge_memory.c
-index 8ab6316d85391..265ef8d1393c5 100644
---- a/mm/huge_memory.c
-+++ b/mm/huge_memory.c
-@@ -1798,10 +1798,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
- * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
- * which may break userspace.
- *
-- * pmdp_invalidate() is required to make sure we don't miss
-+ * pmdp_invalidate_ad() is required to make sure we don't miss
- * dirty/young flags set by hardware.
- */
-- oldpmd = pmdp_invalidate(vma, addr, pmd);
-+ oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
-
- entry = pmd_modify(oldpmd, newprot);
- if (preserve_write)
-diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
-index 4e640baf97948..b0ce6c7391bf4 100644
---- a/mm/pgtable-generic.c
-+++ b/mm/pgtable-generic.c
-@@ -200,6 +200,14 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
- }
- #endif
-
-+#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
-+pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
-+ pmd_t *pmdp)
-+{
-+ return pmdp_invalidate(vma, address, pmdp);
-+}
-+#endif
-+
- #ifndef pmdp_collapse_flush
- pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
---
-2.43.0
-
+++ /dev/null
-From dca09ad288fc1dd6652c82f0aa90f993a357f4f8 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Wed, 1 May 2024 15:33:10 +0100
-Subject: mm: fix race between __split_huge_pmd_locked() and GUP-fast
-
-From: Ryan Roberts <ryan.roberts@arm.com>
-
-[ Upstream commit 3a5a8d343e1cf96eb9971b17cbd4b832ab19b8e7 ]
-
-__split_huge_pmd_locked() can be called for a present THP, devmap or
-(non-present) migration entry. It calls pmdp_invalidate() unconditionally
-on the pmdp and only determines if it is present or not based on the
-returned old pmd. This is a problem for the migration entry case because
-pmd_mkinvalid(), called by pmdp_invalidate(), must only be called for a
-present pmd.
-
-On arm64 at least, pmd_mkinvalid() will mark the pmd such that any future
-call to pmd_present() will return true. Therefore any lockless
-pgtable walker could see the migration entry pmd in this state and start
-interpreting the fields as if it were present, leading to BadThings (TM).
-GUP-fast appears to be one such lockless pgtable walker.
-
-x86 does not suffer the above problem, but instead pmd_mkinvalid() will
-corrupt the offset field of the swap entry within the swap pte. See link
-below for discussion of that problem.
-
-Fix all of this by only calling pmdp_invalidate() for a present pmd. And
-for good measure let's add a warning to all implementations of
-pmdp_invalidate[_ad](). I've manually reviewed all other
-pmdp_invalidate[_ad]() call sites and believe all others to be conformant.
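-
-For illustration, a simplified sketch of the guarded shape of
-__split_huge_pmd_locked() after this change (see the mm/huge_memory.c hunk
-below for the real code):
-
-	pmd_migration = is_pmd_migration_entry(*pmd);
-	if (unlikely(pmd_migration)) {
-		/* Non-present migration entry: never invalidate it. */
-		old_pmd = *pmd;
-	} else {
-		/* Present pmd: invalidate (and flush) as before. */
-		old_pmd = pmdp_invalidate(vma, haddr, pmd);
-	}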
-
-This is a theoretical bug found during code review. I don't have any test
-case to trigger it in practice.
-
-Link: https://lkml.kernel.org/r/20240501143310.1381675-1-ryan.roberts@arm.com
-Link: https://lore.kernel.org/all/0dd7827a-6334-439a-8fd0-43c98e6af22b@arm.com/
-Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
-Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
-Reviewed-by: Zi Yan <ziy@nvidia.com>
-Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
-Acked-by: David Hildenbrand <david@redhat.com>
-Cc: Andreas Larsson <andreas@gaisler.com>
-Cc: Andy Lutomirski <luto@kernel.org>
-Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
-Cc: Borislav Petkov (AMD) <bp@alien8.de>
-Cc: Catalin Marinas <catalin.marinas@arm.com>
-Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
-Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
-Cc: Dave Hansen <dave.hansen@linux.intel.com>
-Cc: "David S. Miller" <davem@davemloft.net>
-Cc: Ingo Molnar <mingo@redhat.com>
-Cc: Jonathan Corbet <corbet@lwn.net>
-Cc: Mark Rutland <mark.rutland@arm.com>
-Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
-Cc: Nicholas Piggin <npiggin@gmail.com>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Cc: Sven Schnelle <svens@linux.ibm.com>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: Will Deacon <will@kernel.org>
-Cc: <stable@vger.kernel.org>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- Documentation/vm/arch_pgtable_helpers.rst | 6 ++-
- arch/powerpc/mm/book3s64/pgtable.c | 1
- arch/s390/include/asm/pgtable.h | 4 +-
- arch/sparc/mm/tlb.c | 1
- arch/x86/mm/pgtable.c | 2 +
- mm/huge_memory.c | 49 +++++++++++++++---------------
- mm/pgtable-generic.c | 2 +
- 7 files changed, 39 insertions(+), 26 deletions(-)
-
---- a/Documentation/vm/arch_pgtable_helpers.rst
-+++ b/Documentation/vm/arch_pgtable_helpers.rst
-@@ -134,7 +134,8 @@ PMD Page Table Helpers
- +---------------------------+--------------------------------------------------+
- | pmd_swp_clear_soft_dirty | Clears a soft dirty swapped PMD |
- +---------------------------+--------------------------------------------------+
--| pmd_mkinvalid | Invalidates a mapped PMD [1] |
-+| pmd_mkinvalid | Invalidates a present PMD; do not call for |
-+| | non-present PMD [1] |
- +---------------------------+--------------------------------------------------+
- | pmd_set_huge | Creates a PMD huge mapping |
- +---------------------------+--------------------------------------------------+
-@@ -190,7 +191,8 @@ PUD Page Table Helpers
- +---------------------------+--------------------------------------------------+
- | pud_mkdevmap | Creates a ZONE_DEVICE mapped PUD |
- +---------------------------+--------------------------------------------------+
--| pud_mkinvalid | Invalidates a mapped PUD [1] |
-+| pud_mkinvalid | Invalidates a present PUD; do not call for |
-+| | non-present PUD [1] |
- +---------------------------+--------------------------------------------------+
- | pud_set_huge | Creates a PUD huge mapping |
- +---------------------------+--------------------------------------------------+
---- a/arch/powerpc/mm/book3s64/pgtable.c
-+++ b/arch/powerpc/mm/book3s64/pgtable.c
-@@ -115,6 +115,7 @@ pmd_t pmdp_invalidate(struct vm_area_str
- {
- unsigned long old_pmd;
-
-+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
- old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
- flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
- return __pmd(old_pmd);
---- a/arch/s390/include/asm/pgtable.h
-+++ b/arch/s390/include/asm/pgtable.h
-@@ -1625,8 +1625,10 @@ static inline pmd_t pmdp_huge_clear_flus
- static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp)
- {
-- pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
-+ pmd_t pmd;
-
-+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
-+ pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
- return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd);
- }
-
---- a/arch/sparc/mm/tlb.c
-+++ b/arch/sparc/mm/tlb.c
-@@ -245,6 +245,7 @@ pmd_t pmdp_invalidate(struct vm_area_str
- {
- pmd_t old, entry;
-
-+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
- entry = __pmd(pmd_val(*pmdp) & ~_PAGE_VALID);
- old = pmdp_establish(vma, address, pmdp, entry);
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
---- a/arch/x86/mm/pgtable.c
-+++ b/arch/x86/mm/pgtable.c
-@@ -612,6 +612,8 @@ int pmdp_clear_flush_young(struct vm_are
- pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
- {
-+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
-+
- /*
- * No flush is necessary. Once an invalid PTE is established, the PTE's
- * access and dirty bits cannot be updated.
---- a/mm/huge_memory.c
-+++ b/mm/huge_memory.c
-@@ -2024,32 +2024,11 @@ static void __split_huge_pmd_locked(stru
- return __split_huge_zero_page_pmd(vma, haddr, pmd);
- }
-
-- /*
-- * Up to this point the pmd is present and huge and userland has the
-- * whole access to the hugepage during the split (which happens in
-- * place). If we overwrite the pmd with the not-huge version pointing
-- * to the pte here (which of course we could if all CPUs were bug
-- * free), userland could trigger a small page size TLB miss on the
-- * small sized TLB while the hugepage TLB entry is still established in
-- * the huge TLB. Some CPUs don't like that.
-- * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
-- * 383 on page 105. Intel should be safe but it also warns that it's
-- * only safe if the permission and cache attributes of the two entries
-- * loaded in the two TLBs are identical (which should be the case here).
-- * But it is generally safer to never allow small and huge TLB entries
-- * for the same virtual address to be loaded simultaneously. So instead
-- * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
-- * current pmd notpresent (atomically because here the pmd_trans_huge
-- * must remain set at all times on the pmd until the split is complete
-- * for this pmd), then we flush the SMP TLB and finally we write the
-- * non-huge version of the pmd entry with pmd_populate.
-- */
-- old_pmd = pmdp_invalidate(vma, haddr, pmd);
--
-- pmd_migration = is_pmd_migration_entry(old_pmd);
-+ pmd_migration = is_pmd_migration_entry(*pmd);
- if (unlikely(pmd_migration)) {
- swp_entry_t entry;
-
-+ old_pmd = *pmd;
- entry = pmd_to_swp_entry(old_pmd);
- page = pfn_swap_entry_to_page(entry);
- write = is_writable_migration_entry(entry);
-@@ -2057,6 +2036,30 @@ static void __split_huge_pmd_locked(stru
- soft_dirty = pmd_swp_soft_dirty(old_pmd);
- uffd_wp = pmd_swp_uffd_wp(old_pmd);
- } else {
-+ /*
-+ * Up to this point the pmd is present and huge and userland has
-+ * the whole access to the hugepage during the split (which
-+ * happens in place). If we overwrite the pmd with the not-huge
-+ * version pointing to the pte here (which of course we could if
-+ * all CPUs were bug free), userland could trigger a small page
-+ * size TLB miss on the small sized TLB while the hugepage TLB
-+ * entry is still established in the huge TLB. Some CPUs don't
-+ * like that. See
-+ * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
-+ * 383 on page 105. Intel should be safe but it also warns that
-+ * it's only safe if the permission and cache attributes of the
-+ * two entries loaded in the two TLBs are identical (which should
-+ * be the case here). But it is generally safer to never allow
-+ * small and huge TLB entries for the same virtual address to be
-+ * loaded simultaneously. So instead of doing "pmd_populate();
-+ * flush_pmd_tlb_range();" we first mark the current pmd
-+ * notpresent (atomically because here the pmd_trans_huge must
-+ * remain set at all times on the pmd until the split is
-+ * complete for this pmd), then we flush the SMP TLB and finally
-+ * we write the non-huge version of the pmd entry with
-+ * pmd_populate.
-+ */
-+ old_pmd = pmdp_invalidate(vma, haddr, pmd);
- page = pmd_page(old_pmd);
- if (pmd_dirty(old_pmd))
- SetPageDirty(page);
---- a/mm/pgtable-generic.c
-+++ b/mm/pgtable-generic.c
-@@ -195,6 +195,7 @@ pmd_t pmdp_invalidate(struct vm_area_str
- pmd_t *pmdp)
- {
- pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
-+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
- flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
- return old;
- }
-@@ -204,6 +205,7 @@ pmd_t pmdp_invalidate(struct vm_area_str
- pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
- {
-+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
- return pmdp_invalidate(vma, address, pmdp);
- }
- #endif
+++ /dev/null
-From 2031c117202f5d2e11b95194e0012d36553e6e78 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Mon, 9 May 2022 18:20:50 -0700
-Subject: mm/mprotect: do not flush when not required architecturally
-
-From: Nadav Amit <namit@vmware.com>
-
-[ Upstream commit c9fe66560bf2dc7d109754414e309888cb8c9ba9 ]
-
-Currently, using mprotect() or uffd to unprotect a memory region causes a
-TLB flush. However, in such cases the PTE is often not modified (i.e., it
-remains RO) and therefore no TLB flush is needed.
-
-Add an arch-specific pte_needs_flush() which tells whether a TLB flush is
-needed based on the old PTE and the new one. Implement an x86
-pte_needs_flush().
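-
-For illustration, a simplified sketch of the resulting pattern in
-change_pte_range() (see the mm/mprotect.c hunk below):
-
-	ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
-	if (pte_needs_flush(oldpte, ptent))
-		tlb_flush_pte_range(tlb, addr, PAGE_SIZE);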
-
-Always flush the TLB when it is architecturally needed, even in cases where
-skipping the flush would only result in spurious page-faults.
-
-Even with this conservative approach, we can in the future further refine
-the checks to test whether a PTE is present by considering only the
-architectural _PAGE_PRESENT flag instead of {pte|pmd}_present(). For now,
-be careful and use the latter.
-
-Link: https://lkml.kernel.org/r/20220401180821.1986781-3-namit@vmware.com
-Signed-off-by: Nadav Amit <namit@vmware.com>
-Cc: Andrea Arcangeli <aarcange@redhat.com>
-Cc: Andy Lutomirski <luto@kernel.org>
-Cc: Dave Hansen <dave.hansen@linux.intel.com>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: Will Deacon <will@kernel.org>
-Cc: Yu Zhao <yuzhao@google.com>
-Cc: Nick Piggin <npiggin@gmail.com>
-Cc: Andrew Cooper <andrew.cooper3@citrix.com>
-Cc: Peter Xu <peterx@redhat.com>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Stable-dep-of: 3a5a8d343e1c ("mm: fix race between __split_huge_pmd_locked() and GUP-fast")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- arch/x86/include/asm/pgtable_types.h | 2 +
- arch/x86/include/asm/tlbflush.h | 97 ++++++++++++++++++++++++++++
- include/asm-generic/tlb.h | 14 ++++
- mm/huge_memory.c | 9 +--
- mm/mprotect.c | 3 +-
- 5 files changed, 120 insertions(+), 5 deletions(-)
-
-diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
-index 28e59576c75be..de9e3c635618e 100644
---- a/arch/x86/include/asm/pgtable_types.h
-+++ b/arch/x86/include/asm/pgtable_types.h
-@@ -110,9 +110,11 @@
- #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
- #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
- #define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
-+#define _PAGE_SOFTW4 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4)
- #else
- #define _PAGE_NX (_AT(pteval_t, 0))
- #define _PAGE_DEVMAP (_AT(pteval_t, 0))
-+#define _PAGE_SOFTW4 (_AT(pteval_t, 0))
- #endif
-
- #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
-diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
-index b587a9ee9cb25..8be1ff9081728 100644
---- a/arch/x86/include/asm/tlbflush.h
-+++ b/arch/x86/include/asm/tlbflush.h
-@@ -259,6 +259,103 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
-
- extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
-
-+static inline bool pte_flags_need_flush(unsigned long oldflags,
-+ unsigned long newflags,
-+ bool ignore_access)
-+{
-+ /*
-+ * Flags that require a flush when cleared but not when they are set.
-+ * Only include flags that would not trigger spurious page-faults.
-+ * Non-present entries are not cached. Hardware would set the
-+ * dirty/access bit if needed without a fault.
-+ */
-+ const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT |
-+ _PAGE_ACCESSED;
-+ const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 |
-+ _PAGE_SOFTW3 | _PAGE_SOFTW4;
-+ const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT |
-+ _PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT |
-+ _PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 |
-+ _PAGE_PKEY_BIT2 | _PAGE_PKEY_BIT3 | _PAGE_NX;
-+ unsigned long diff = oldflags ^ newflags;
-+
-+ BUILD_BUG_ON(flush_on_clear & software_flags);
-+ BUILD_BUG_ON(flush_on_clear & flush_on_change);
-+ BUILD_BUG_ON(flush_on_change & software_flags);
-+
-+ /* Ignore software flags */
-+ diff &= ~software_flags;
-+
-+ if (ignore_access)
-+ diff &= ~_PAGE_ACCESSED;
-+
-+ /*
-+ * Was any of the 'flush_on_clear' flags cleared between
-+ * 'oldflags' and 'newflags'?
-+ */
-+ if (diff & oldflags & flush_on_clear)
-+ return true;
-+
-+ /* Flush on modified flags. */
-+ if (diff & flush_on_change)
-+ return true;
-+
-+ /* Ensure there are no flags that were left behind */
-+ if (IS_ENABLED(CONFIG_DEBUG_VM) &&
-+ (diff & ~(flush_on_clear | software_flags | flush_on_change))) {
-+ VM_WARN_ON_ONCE(1);
-+ return true;
-+ }
-+
-+ return false;
-+}
-+
-+/*
-+ * pte_needs_flush() checks whether permissions were demoted and require a
-+ * flush. It should only be used for userspace PTEs.
-+ */
-+static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte)
-+{
-+ /* !PRESENT -> * ; no need for flush */
-+ if (!(pte_flags(oldpte) & _PAGE_PRESENT))
-+ return false;
-+
-+ /* PFN changed ; needs flush */
-+ if (pte_pfn(oldpte) != pte_pfn(newpte))
-+ return true;
-+
-+ /*
-+ * check PTE flags; ignore access-bit; see comment in
-+ * ptep_clear_flush_young().
-+ */
-+ return pte_flags_need_flush(pte_flags(oldpte), pte_flags(newpte),
-+ true);
-+}
-+#define pte_needs_flush pte_needs_flush
-+
-+/*
-+ * huge_pmd_needs_flush() checks whether permissions were demoted and require a
-+ * flush. It should only be used for userspace huge PMDs.
-+ */
-+static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
-+{
-+ /* !PRESENT -> * ; no need for flush */
-+ if (!(pmd_flags(oldpmd) & _PAGE_PRESENT))
-+ return false;
-+
-+ /* PFN changed ; needs flush */
-+ if (pmd_pfn(oldpmd) != pmd_pfn(newpmd))
-+ return true;
-+
-+ /*
-+ * check PMD flags; do not ignore access-bit; see
-+ * pmdp_clear_flush_young().
-+ */
-+ return pte_flags_need_flush(pmd_flags(oldpmd), pmd_flags(newpmd),
-+ false);
-+}
-+#define huge_pmd_needs_flush huge_pmd_needs_flush
-+
- #endif /* !MODULE */
-
- #endif /* _ASM_X86_TLBFLUSH_H */
-diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
-index c99710b3027a0..7afde1eff2398 100644
---- a/include/asm-generic/tlb.h
-+++ b/include/asm-generic/tlb.h
-@@ -662,6 +662,20 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
- } while (0)
- #endif
-
-+#ifndef pte_needs_flush
-+static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte)
-+{
-+ return true;
-+}
-+#endif
-+
-+#ifndef huge_pmd_needs_flush
-+static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
-+{
-+ return true;
-+}
-+#endif
-+
- #endif /* CONFIG_MMU */
-
- #endif /* _ASM_GENERIC__TLB_H */
-diff --git a/mm/huge_memory.c b/mm/huge_memory.c
-index 661dd29642ebc..8ab6316d85391 100644
---- a/mm/huge_memory.c
-+++ b/mm/huge_memory.c
-@@ -1726,7 +1726,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
- {
- struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
-- pmd_t entry;
-+ pmd_t oldpmd, entry;
- bool preserve_write;
- int ret;
- bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
-@@ -1801,9 +1801,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
- * pmdp_invalidate() is required to make sure we don't miss
- * dirty/young flags set by hardware.
- */
-- entry = pmdp_invalidate(vma, addr, pmd);
-+ oldpmd = pmdp_invalidate(vma, addr, pmd);
-
-- entry = pmd_modify(entry, newprot);
-+ entry = pmd_modify(oldpmd, newprot);
- if (preserve_write)
- entry = pmd_mk_savedwrite(entry);
- if (uffd_wp) {
-@@ -1820,7 +1820,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
- ret = HPAGE_PMD_NR;
- set_pmd_at(mm, addr, pmd, entry);
-
-- tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
-+ if (huge_pmd_needs_flush(oldpmd, entry))
-+ tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
-
- BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
- unlock:
-diff --git a/mm/mprotect.c b/mm/mprotect.c
-index fe1196be9ca28..09c5c448b9e7c 100644
---- a/mm/mprotect.c
-+++ b/mm/mprotect.c
-@@ -141,7 +141,8 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
- ptent = pte_mkwrite(ptent);
- }
- ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
-- tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
-+ if (pte_needs_flush(oldpte, ptent))
-+ tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
- pages++;
- } else if (is_swap_pte(oldpte)) {
- swp_entry_t entry = pte_to_swp_entry(oldpte);
---
-2.43.0
-
+++ /dev/null
-From 61cba6a6dc1cc6682b9aeff3aff3114f0ff30462 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Mon, 9 May 2022 18:20:50 -0700
-Subject: mm/mprotect: use mmu_gather
-
-From: Nadav Amit <namit@vmware.com>
-
-[ Upstream commit 4a18419f71cdf9155d2d2a6c79546f720978b990 ]
-
-Patch series "mm/mprotect: avoid unnecessary TLB flushes", v6.
-
-This patchset is intended to remove unnecessary TLB flushes during
-mprotect() syscalls. Once this patch-set makes it through, similar and
-further optimizations for MADV_COLD and userfaultfd would be possible.
-
-Basically, there are 3 optimizations in this patch-set:
-
-1. Use TLB batching infrastructure to batch flushes across VMAs and do
- better/fewer flushes. This would also be handy for later userfaultfd
- enhancements.
-
-2. Avoid unnecessary TLB flushes. This optimization is the one that
- provides most of the performance benefits. Unlike previous versions,
- we now only avoid flushes that would not result in spurious
- page-faults.
-
-3. Avoiding TLB flushes on change_huge_pmd() that are only needed to
- prevent the A/D bits from changing.
-
-Andrew asked for some benchmark numbers. I do not have an easy
-deterministic macrobenchmark in which it is easy to show the benefit. I
-therefore ran a microbenchmark: a loop that does the following on
-anonymous memory, just as a sanity check to see that time is saved by
-avoiding TLB flushes. The loop goes:
-
- mprotect(p, PAGE_SIZE, PROT_READ)
- mprotect(p, PAGE_SIZE, PROT_READ|PROT_WRITE)
- *p = 0; // make the page writable
-
-The test was run in KVM guest with 1 or 2 threads (the second thread was
-busy-looping). I measured the time (cycles) of each operation:
-
-                   1 thread              2 threads
-                  mmots  +patch         mmots  +patch
-PROT_READ          3494    2725 (-22%)   8630    7788 (-10%)
-PROT_READ|WRITE    3952    2724 (-31%)   9075    2865 (-68%)
-
-[ mmots = v5.17-rc6-mmots-2022-03-06-20-38 ]
-
-The exact numbers are really meaningless, but the benefit is clear. There
-are 2 interesting results though.
-
-(1) PROT_READ is cheaper, although one might expect it not to be affected.
-This is presumably due to the TLB miss that is saved.
-
-(2) Without the memory access (*p = 0), the speedup of the patch is even
-greater. In that scenario mprotect(PROT_READ) also avoids the TLB flush.
-As a result both operations on the patched kernel take roughly ~1500
-cycles (with either 1 or 2 threads), whereas on mmotm their cost is as
-high as presented in the table.
-
-This patch (of 3):
-
-change_pXX_range() currently does not use mmu_gather, but instead
-implements its own deferred TLB flush scheme. This both complicates the
-code, as developers need to be aware of different invalidation schemes,
-and prevents opportunities to avoid TLB flushes or perform them at finer
-granularity.
-
-The use of mmu_gather for modified PTEs has benefits in various scenarios
-even if pages are not released. For instance, if only a single page needs
-to be flushed out of a range of many pages, only that page would be
-flushed. If a THP page is flushed, on x86 a single TLB invlpg instruction
-can be used instead of 512 instructions (or a full TLB flush, which
-Linux would actually use by default). mprotect() over multiple VMAs
-requires a single flush.
-
-Use mmu_gather in change_pXX_range(). As the pages are not released, only
-record the flushed range using tlb_flush_pXX_range().
-
-Handle THP similarly and get rid of flush_cache_range() which becomes
-redundant since tlb_start_vma() calls it when needed.
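-
-For illustration, callers now wrap change_protection() in a gather/finish
-pair, roughly as follows (simplified sketch; see the mm/mempolicy.c and
-mm/userfaultfd.c hunks below):
-
-	struct mmu_gather tlb;
-
-	tlb_gather_mmu(&tlb, vma->vm_mm);
-	pages = change_protection(&tlb, vma, start, end, newprot, cp_flags);
-	tlb_finish_mmu(&tlb);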
-
-Link: https://lkml.kernel.org/r/20220401180821.1986781-1-namit@vmware.com
-Link: https://lkml.kernel.org/r/20220401180821.1986781-2-namit@vmware.com
-Signed-off-by: Nadav Amit <namit@vmware.com>
-Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Cc: Andrea Arcangeli <aarcange@redhat.com>
-Cc: Andrew Cooper <andrew.cooper3@citrix.com>
-Cc: Andy Lutomirski <luto@kernel.org>
-Cc: Dave Hansen <dave.hansen@linux.intel.com>
-Cc: Peter Xu <peterx@redhat.com>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: Will Deacon <will@kernel.org>
-Cc: Yu Zhao <yuzhao@google.com>
-Cc: Nick Piggin <npiggin@gmail.com>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Stable-dep-of: 3a5a8d343e1c ("mm: fix race between __split_huge_pmd_locked() and GUP-fast")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- fs/exec.c | 6 ++-
- include/linux/huge_mm.h | 5 ++-
- include/linux/mm.h | 5 ++-
- mm/huge_memory.c | 10 ++++-
- mm/mempolicy.c | 9 +++-
- mm/mprotect.c | 92 ++++++++++++++++++++++-------------------
- mm/userfaultfd.c | 6 ++-
- 7 files changed, 82 insertions(+), 51 deletions(-)
-
-diff --git a/fs/exec.c b/fs/exec.c
-index 03516b704d8a4..3cf38e5e8b733 100644
---- a/fs/exec.c
-+++ b/fs/exec.c
-@@ -758,6 +758,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
- unsigned long stack_size;
- unsigned long stack_expand;
- unsigned long rlim_stack;
-+ struct mmu_gather tlb;
-
- #ifdef CONFIG_STACK_GROWSUP
- /* Limit stack size */
-@@ -812,8 +813,11 @@ int setup_arg_pages(struct linux_binprm *bprm,
- vm_flags |= mm->def_flags;
- vm_flags |= VM_STACK_INCOMPLETE_SETUP;
-
-- ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
-+ tlb_gather_mmu(&tlb, mm);
-+ ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end,
- vm_flags);
-+ tlb_finish_mmu(&tlb);
-+
- if (ret)
- goto out_unlock;
- BUG_ON(prev != vma);
-diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
-index f123e15d966e8..6cb3e6fe11e7f 100644
---- a/include/linux/huge_mm.h
-+++ b/include/linux/huge_mm.h
-@@ -36,8 +36,9 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud,
- unsigned long addr);
- bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd);
--int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
-- pgprot_t newprot, unsigned long cp_flags);
-+int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
-+ pmd_t *pmd, unsigned long addr, pgprot_t newprot,
-+ unsigned long cp_flags);
- vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
- pgprot_t pgprot, bool write);
-
-diff --git a/include/linux/mm.h b/include/linux/mm.h
-index 5692055f202cb..e05c91ea5735d 100644
---- a/include/linux/mm.h
-+++ b/include/linux/mm.h
-@@ -1899,10 +1899,11 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
- #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \
- MM_CP_UFFD_WP_RESOLVE)
-
--extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
-+extern unsigned long change_protection(struct mmu_gather *tlb,
-+ struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgprot_t newprot,
- unsigned long cp_flags);
--extern int mprotect_fixup(struct vm_area_struct *vma,
-+extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
- struct vm_area_struct **pprev, unsigned long start,
- unsigned long end, unsigned long newflags);
-
-diff --git a/mm/huge_memory.c b/mm/huge_memory.c
-index 98ff57c8eda69..661dd29642ebc 100644
---- a/mm/huge_memory.c
-+++ b/mm/huge_memory.c
-@@ -1720,8 +1720,9 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- * or if prot_numa but THP migration is not supported
- * - HPAGE_PMD_NR if protections changed and TLB flush necessary
- */
--int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-- unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
-+int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
-+ pmd_t *pmd, unsigned long addr, pgprot_t newprot,
-+ unsigned long cp_flags)
- {
- struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
-@@ -1732,6 +1733,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
- bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
-
-+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
-+
- if (prot_numa && !thp_migration_supported())
- return 1;
-
-@@ -1816,6 +1819,9 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- }
- ret = HPAGE_PMD_NR;
- set_pmd_at(mm, addr, pmd, entry);
-+
-+ tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
-+
- BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
- unlock:
- spin_unlock(ptl);
-diff --git a/mm/mempolicy.c b/mm/mempolicy.c
-index 818753635e427..c05e979fd8695 100644
---- a/mm/mempolicy.c
-+++ b/mm/mempolicy.c
-@@ -104,6 +104,7 @@
- #include <linux/swapops.h>
-
- #include <asm/tlbflush.h>
-+#include <asm/tlb.h>
- #include <linux/uaccess.h>
-
- #include "internal.h"
-@@ -634,12 +635,18 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
- unsigned long change_prot_numa(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
- {
-+ struct mmu_gather tlb;
- int nr_updated;
-
-- nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
-+ tlb_gather_mmu(&tlb, vma->vm_mm);
-+
-+ nr_updated = change_protection(&tlb, vma, addr, end, PAGE_NONE,
-+ MM_CP_PROT_NUMA);
- if (nr_updated)
- count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
-
-+ tlb_finish_mmu(&tlb);
-+
- return nr_updated;
- }
- #else
-diff --git a/mm/mprotect.c b/mm/mprotect.c
-index ed18dc49533f6..fe1196be9ca28 100644
---- a/mm/mprotect.c
-+++ b/mm/mprotect.c
-@@ -32,12 +32,13 @@
- #include <asm/cacheflush.h>
- #include <asm/mmu_context.h>
- #include <asm/tlbflush.h>
-+#include <asm/tlb.h>
-
- #include "internal.h"
-
--static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-- unsigned long addr, unsigned long end, pgprot_t newprot,
-- unsigned long cp_flags)
-+static unsigned long change_pte_range(struct mmu_gather *tlb,
-+ struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
-+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
- {
- pte_t *pte, oldpte;
- spinlock_t *ptl;
-@@ -48,6 +49,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
- bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
-
-+ tlb_change_page_size(tlb, PAGE_SIZE);
-+
- /*
- * Can be called with only the mmap_lock for reading by
- * prot_numa so we must check the pmd isn't constantly
-@@ -138,6 +141,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- ptent = pte_mkwrite(ptent);
- }
- ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
-+ tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
- pages++;
- } else if (is_swap_pte(oldpte)) {
- swp_entry_t entry = pte_to_swp_entry(oldpte);
-@@ -219,9 +223,9 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
- return 0;
- }
-
--static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
-- pud_t *pud, unsigned long addr, unsigned long end,
-- pgprot_t newprot, unsigned long cp_flags)
-+static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
-+ struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
-+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
- {
- pmd_t *pmd;
- unsigned long next;
-@@ -261,8 +265,12 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
- if (next - addr != HPAGE_PMD_SIZE) {
- __split_huge_pmd(vma, pmd, addr, false, NULL);
- } else {
-- int nr_ptes = change_huge_pmd(vma, pmd, addr,
-- newprot, cp_flags);
-+ /*
-+ * change_huge_pmd() does not defer TLB flushes,
-+ * so no need to propagate the tlb argument.
-+ */
-+ int nr_ptes = change_huge_pmd(tlb, vma, pmd,
-+ addr, newprot, cp_flags);
-
- if (nr_ptes) {
- if (nr_ptes == HPAGE_PMD_NR) {
-@@ -276,8 +284,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
- }
- /* fall through, the trans huge pmd just split */
- }
-- this_pages = change_pte_range(vma, pmd, addr, next, newprot,
-- cp_flags);
-+ this_pages = change_pte_range(tlb, vma, pmd, addr, next,
-+ newprot, cp_flags);
- pages += this_pages;
- next:
- cond_resched();
-@@ -291,9 +299,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
- return pages;
- }
-
--static inline unsigned long change_pud_range(struct vm_area_struct *vma,
-- p4d_t *p4d, unsigned long addr, unsigned long end,
-- pgprot_t newprot, unsigned long cp_flags)
-+static inline unsigned long change_pud_range(struct mmu_gather *tlb,
-+ struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
-+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
- {
- pud_t *pud;
- unsigned long next;
-@@ -304,16 +312,16 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma,
- next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud))
- continue;
-- pages += change_pmd_range(vma, pud, addr, next, newprot,
-+ pages += change_pmd_range(tlb, vma, pud, addr, next, newprot,
- cp_flags);
- } while (pud++, addr = next, addr != end);
-
- return pages;
- }
-
--static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
-- pgd_t *pgd, unsigned long addr, unsigned long end,
-- pgprot_t newprot, unsigned long cp_flags)
-+static inline unsigned long change_p4d_range(struct mmu_gather *tlb,
-+ struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr,
-+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
- {
- p4d_t *p4d;
- unsigned long next;
-@@ -324,44 +332,40 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
- next = p4d_addr_end(addr, end);
- if (p4d_none_or_clear_bad(p4d))
- continue;
-- pages += change_pud_range(vma, p4d, addr, next, newprot,
-+ pages += change_pud_range(tlb, vma, p4d, addr, next, newprot,
- cp_flags);
- } while (p4d++, addr = next, addr != end);
-
- return pages;
- }
-
--static unsigned long change_protection_range(struct vm_area_struct *vma,
-- unsigned long addr, unsigned long end, pgprot_t newprot,
-- unsigned long cp_flags)
-+static unsigned long change_protection_range(struct mmu_gather *tlb,
-+ struct vm_area_struct *vma, unsigned long addr,
-+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
- {
- struct mm_struct *mm = vma->vm_mm;
- pgd_t *pgd;
- unsigned long next;
-- unsigned long start = addr;
- unsigned long pages = 0;
-
- BUG_ON(addr >= end);
- pgd = pgd_offset(mm, addr);
-- flush_cache_range(vma, addr, end);
-- inc_tlb_flush_pending(mm);
-+ tlb_start_vma(tlb, vma);
- do {
- next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
- continue;
-- pages += change_p4d_range(vma, pgd, addr, next, newprot,
-+ pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot,
- cp_flags);
- } while (pgd++, addr = next, addr != end);
-
-- /* Only flush the TLB if we actually modified any entries: */
-- if (pages)
-- flush_tlb_range(vma, start, end);
-- dec_tlb_flush_pending(mm);
-+ tlb_end_vma(tlb, vma);
-
- return pages;
- }
-
--unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
-+unsigned long change_protection(struct mmu_gather *tlb,
-+ struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgprot_t newprot,
- unsigned long cp_flags)
- {
-@@ -372,7 +376,7 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
- if (is_vm_hugetlb_page(vma))
- pages = hugetlb_change_protection(vma, start, end, newprot);
- else
-- pages = change_protection_range(vma, start, end, newprot,
-+ pages = change_protection_range(tlb, vma, start, end, newprot,
- cp_flags);
-
- return pages;
-@@ -406,8 +410,9 @@ static const struct mm_walk_ops prot_none_walk_ops = {
- };
-
- int
--mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
-- unsigned long start, unsigned long end, unsigned long newflags)
-+mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
-+ struct vm_area_struct **pprev, unsigned long start,
-+ unsigned long end, unsigned long newflags)
- {
- struct mm_struct *mm = vma->vm_mm;
- unsigned long oldflags = vma->vm_flags;
-@@ -494,7 +499,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
- dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
- vma_set_page_prot(vma);
-
-- change_protection(vma, start, end, vma->vm_page_prot,
-+ change_protection(tlb, vma, start, end, vma->vm_page_prot,
- dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
-
- /*
-@@ -528,6 +533,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
- const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
- const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
- (prot & PROT_READ);
-+ struct mmu_gather tlb;
-
- start = untagged_addr(start);
-
-@@ -584,6 +590,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
- if (start > vma->vm_start)
- prev = vma;
-
-+ tlb_gather_mmu(&tlb, current->mm);
- for (nstart = start ; ; ) {
- unsigned long mask_off_old_flags;
- unsigned long newflags;
-@@ -610,18 +617,18 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
- /* newflags >> 4 shift VM_MAY% in place of VM_% */
- if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
- error = -EACCES;
-- goto out;
-+ break;
- }
-
- /* Allow architectures to sanity-check the new flags */
- if (!arch_validate_flags(newflags)) {
- error = -EINVAL;
-- goto out;
-+ break;
- }
-
- error = security_file_mprotect(vma, reqprot, prot);
- if (error)
-- goto out;
-+ break;
-
- tmp = vma->vm_end;
- if (tmp > end)
-@@ -630,27 +637,28 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
- if (vma->vm_ops && vma->vm_ops->mprotect) {
- error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags);
- if (error)
-- goto out;
-+ break;
- }
-
-- error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
-+ error = mprotect_fixup(&tlb, vma, &prev, nstart, tmp, newflags);
- if (error)
-- goto out;
-+ break;
-
- nstart = tmp;
-
- if (nstart < prev->vm_end)
- nstart = prev->vm_end;
- if (nstart >= end)
-- goto out;
-+ break;
-
- vma = prev->vm_next;
- if (!vma || vma->vm_start != nstart) {
- error = -ENOMEM;
-- goto out;
-+ break;
- }
- prot = reqprot;
- }
-+ tlb_finish_mmu(&tlb);
- out:
- mmap_write_unlock(current->mm);
- return error;
-diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
-index 98a9d0ef2d917..eafdc112ac7aa 100644
---- a/mm/userfaultfd.c
-+++ b/mm/userfaultfd.c
-@@ -16,6 +16,7 @@
- #include <linux/hugetlb.h>
- #include <linux/shmem_fs.h>
- #include <asm/tlbflush.h>
-+#include <asm/tlb.h>
- #include "internal.h"
-
- static __always_inline
-@@ -698,6 +699,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
- atomic_t *mmap_changing)
- {
- struct vm_area_struct *dst_vma;
-+ struct mmu_gather tlb;
- pgprot_t newprot;
- int err;
-
-@@ -739,8 +741,10 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
- else
- newprot = vm_get_page_prot(dst_vma->vm_flags);
-
-- change_protection(dst_vma, start, start + len, newprot,
-+ tlb_gather_mmu(&tlb, dst_mm);
-+ change_protection(&tlb, dst_vma, start, start + len, newprot,
- enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
-+ tlb_finish_mmu(&tlb);
-
- err = 0;
- out_unlock:
---
-2.43.0
-
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
-@@ -2491,26 +2491,34 @@ static int sdhci_get_cd(struct mmc_host
+@@ -2487,26 +2487,34 @@ static int sdhci_get_cd(struct mmc_host
static int sdhci_check_ro(struct sdhci_host *host)
{
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
-@@ -2492,11 +2492,8 @@ static int sdhci_get_cd(struct mmc_host
+@@ -2488,11 +2488,8 @@ static int sdhci_get_cd(struct mmc_host
static int sdhci_check_ro(struct sdhci_host *host)
{
bool allow_invert = false;
if (host->flags & SDHCI_DEVICE_DEAD) {
is_readonly = 0;
} else if (host->ops->get_ro) {
-@@ -2511,8 +2508,6 @@ static int sdhci_check_ro(struct sdhci_h
+@@ -2507,8 +2504,6 @@ static int sdhci_check_ro(struct sdhci_h
allow_invert = true;
}
ftrace-fix-possible-use-after-free-issue-in-ftrace_l.patch
mmc-davinci_mmc-convert-to-platform-remove-callback-.patch
mmc-davinci-don-t-strip-remove-function-when-driver-.patch
-mm-mprotect-use-mmu_gather.patch
-mm-mprotect-do-not-flush-when-not-required-architect.patch
-mm-avoid-unnecessary-flush-on-change_huge_pmd.patch
-mm-fix-race-between-__split_huge_pmd_locked-and-gup-.patch
i2c-add-fwnode-apis.patch
i2c-acpi-unbind-mux-adapters-before-delete.patch
cma-factor-out-minimum-alignment-requirement.patch