The implementation of the lazy MMU mode is currently entirely
arch-specific; core code directly calls arch helpers:
arch_{enter,leave}_lazy_mmu_mode().
We are about to introduce support for nested lazy MMU sections. As things
stand we'd have to duplicate that logic in every arch implementing
lazy_mmu - adding to a fair amount of logic already duplicated across
lazy_mmu implementations.
This patch therefore introduces a new generic layer that calls the
existing arch_* helpers. Two pairs of calls are introduced:
* lazy_mmu_mode_enable() ... lazy_mmu_mode_disable()
This is the standard case where the mode is enabled for a given
block of code by surrounding it with enable() and disable()
calls.
* lazy_mmu_mode_pause() ... lazy_mmu_mode_resume()
This is for situations where the mode is temporarily disabled
by first calling pause() and then resume() (e.g. to prevent any
batching from occurring in a critical section).
The documentation in <linux/pgtable.h> will be updated in a subsequent
patch.
No functional change should be introduced at this stage. The
implementation of enable()/resume() and disable()/pause() is currently
identical, but nesting support will change that.
Most of the call sites have been updated using the following Coccinelle
script:
@@
@@
{
...
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
...
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
...
}
@@
@@
{
...
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_pause();
...
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_resume();
...
}
A couple of notes regarding x86:
* Xen is currently the only case where explicit handling is required
for lazy MMU when context-switching. This is purely an
implementation detail and using the generic lazy_mmu_mode_*
functions would cause trouble when nesting support is introduced,
because the generic functions must be called from the current task.
For that reason we still use arch_leave() and arch_enter() there.
* x86 calls arch_flush_lazy_mmu_mode() unconditionally in a few
places, but only defines it if PARAVIRT_XXL is selected, and we
are removing the fallback in <linux/pgtable.h>. Add a new fallback
definition to <asm/pgtable.h> to keep things building.
Link: https://lkml.kernel.org/r/20251215150323.2218608-8-kevin.brodsky@arm.com
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
return -EINVAL;
mutex_lock(&pgtable_split_lock);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
/*
* The split_kernel_leaf_mapping_locked() may sleep, it is not a
ret = split_kernel_leaf_mapping_locked(end);
}
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
mutex_unlock(&pgtable_split_lock);
return ret;
}
{
int ret;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
ret = walk_kernel_page_table_range_lockless(start, end,
&split_to_ptes_ops, NULL, &gfp);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
return ret;
}
if (WARN_ON_ONCE(ret))
return ret;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
/*
* The caller must ensure that the range we are operating on does not
*/
ret = walk_kernel_page_table_range_lockless(start, start + size,
&pageattr_ops, NULL, &data);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
return ret;
}
* way to do things but is fine for our needs here.
*/
local_irq_save(flags);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
for (; start < end; start += PAGE_SIZE) {
pte_t *ptep = find_init_mm_pte(start, &hugepage_shift);
unsigned long pte;
continue;
hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift);
}
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
local_irq_restore(flags);
}
* way to do things but is fine for our needs here.
*/
local_irq_save(flags);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
start_pte = pte_offset_map(pmd, addr);
if (!start_pte)
goto out;
}
pte_unmap(start_pte);
out:
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
local_irq_restore(flags);
}
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
for (; npages > 0; --npages) {
pte_update(mm, addr, pte, 0, 0, 0);
addr += PAGE_SIZE;
++pte;
}
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(pte - 1, ptl);
}
#define __pte(x) native_make_pte(x)
#define arch_end_context_switch(prev) do {} while(0)
+static inline void arch_flush_lazy_mmu_mode(void) {}
#endif /* CONFIG_PARAVIRT_XXL */
static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
return 0;
}
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
/* Fast path for performing exclusive WP */
if (flush_end)
flush_tlb_range(vma, start, addr);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(start_pte, ptl);
cond_resched();
*
* Nesting is not permitted and the mode cannot be used in interrupt context.
*/
-#ifndef CONFIG_ARCH_HAS_LAZY_MMU_MODE
-static inline void arch_enter_lazy_mmu_mode(void) {}
-static inline void arch_leave_lazy_mmu_mode(void) {}
-static inline void arch_flush_lazy_mmu_mode(void) {}
+#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE
+static inline void lazy_mmu_mode_enable(void)
+{
+ arch_enter_lazy_mmu_mode();
+}
+
+static inline void lazy_mmu_mode_disable(void)
+{
+ arch_leave_lazy_mmu_mode();
+}
+
+static inline void lazy_mmu_mode_pause(void)
+{
+ arch_leave_lazy_mmu_mode();
+}
+
+static inline void lazy_mmu_mode_resume(void)
+{
+ arch_enter_lazy_mmu_mode();
+}
+#else
+static inline void lazy_mmu_mode_enable(void) {}
+static inline void lazy_mmu_mode_disable(void) {}
+static inline void lazy_mmu_mode_pause(void) {}
+static inline void lazy_mmu_mode_resume(void) {}
#endif
#ifndef pte_batch_hint
pte_t pte;
int index;
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_pause();
index = PFN_DOWN(addr - data->start);
page = data->pages[index];
}
spin_unlock(&init_mm.page_table_lock);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_resume();
return 0;
}
pte_t pte;
int none;
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_pause();
spin_lock(&init_mm.page_table_lock);
pte = ptep_get(ptep);
if (likely(!none))
__free_page(pfn_to_page(pte_pfn(pte)));
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_resume();
return 0;
}
if (!start_pte)
return 0;
flush_tlb_batched_pending(mm);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
nr = 1;
ptent = ptep_get(pte);
if (++batch_count == SWAP_CLUSTER_MAX) {
batch_count = 0;
if (need_resched()) {
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(start_pte, ptl);
cond_resched();
goto restart;
if (!folio_trylock(folio))
continue;
folio_get(folio);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(start_pte, ptl);
start_pte = NULL;
err = split_folio(folio);
if (!start_pte)
break;
flush_tlb_batched_pending(mm);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
if (!err)
nr = 0;
continue;
}
if (start_pte) {
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(start_pte, ptl);
}
if (pageout)
if (!start_pte)
return 0;
flush_tlb_batched_pending(mm);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
nr = 1;
ptent = ptep_get(pte);
if (!folio_trylock(folio))
continue;
folio_get(folio);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(start_pte, ptl);
start_pte = NULL;
err = split_folio(folio);
if (!start_pte)
break;
flush_tlb_batched_pending(mm);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
if (!err)
nr = 0;
continue;
if (nr_swap)
add_mm_counter(mm, MM_SWAPENTS, nr_swap);
if (start_pte) {
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(start_pte, ptl);
}
cond_resched();
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
orig_dst_pte = dst_pte;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
nr = 1;
} while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(orig_src_pte, src_ptl);
add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
return addr;
flush_tlb_batched_pending(mm);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
bool any_skipped = false;
direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);
add_mm_rss_vec(mm, rss);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
/* Do the actual TLB flush before dropping ptl */
if (force_flush) {
mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
BUG_ON(!pte_none(ptep_get(pte)));
if (!pfn_modify_allowed(pfn, prot)) {
set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(mapped_pte, ptl);
return err;
}
return -EINVAL;
}
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
if (fn) {
do {
}
*mask |= PGTBL_PTE_MODIFIED;
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
if (mm != &init_mm)
pte_unmap_unlock(mapped_pte, ptl);
ptep = pte_offset_map_lock(mm, pmdp, start, &ptl);
if (!ptep)
goto again;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
ptep += (addr - start) / PAGE_SIZE;
for (; addr < end; addr += PAGE_SIZE, ptep++) {
if (folio_test_large(folio)) {
int ret;
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(ptep, ptl);
ret = migrate_vma_split_folio(folio,
migrate->fault_page);
if (folio && folio_test_large(folio)) {
int ret;
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(ptep, ptl);
ret = migrate_vma_split_folio(folio,
migrate->fault_page);
if (unmapped)
flush_tlb_range(walk->vma, start, end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(ptep - 1, ptl);
return 0;
is_private_single_threaded = vma_is_single_threaded_private(vma);
flush_tlb_batched_pending(vma->vm_mm);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
nr_ptes = 1;
oldpte = ptep_get(pte);
}
}
} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(pte - 1, ptl);
return pages;
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
flush_tlb_batched_pending(vma->vm_mm);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
}
}
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
if (force_flush)
flush_tlb_range(vma, old_end - len, old_end);
if (new_ptl != old_ptl)
/* It's safe to drop the reference now as the page-table is holding one. */
folio_put(*first_src_folio);
*first_src_folio = NULL;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
while (true) {
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
break;
}
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
if (src_addr > src_start)
flush_tlb_range(src_vma, src_start, src_addr);
if (!pte)
return -ENOMEM;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
if (unlikely(!pte_none(ptep_get(pte)))) {
pfn++;
} while (pte += PFN_DOWN(size), addr += size, addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
unsigned long size = PAGE_SIZE;
pte = pte_offset_kernel(pmd, addr);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
#ifdef CONFIG_HUGETLB_PAGE
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
} while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
*mask |= PGTBL_PTE_MODIFIED;
}
if (!pte)
return -ENOMEM;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
struct page *page = pages[*nr];
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
*mask |= PGTBL_PTE_MODIFIED;
return err;
return false;
}
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
restart:
for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
unsigned long pfn;
if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
goto restart;
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(pte, ptl);
return suitable_to_scan(total, young);
if (!spin_trylock(ptl))
goto done;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
unsigned long pfn;
walk_update_folio(walk, last, gen, dirty);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
spin_unlock(ptl);
done:
*first = -1;
}
}
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
pte -= (addr - start) / PAGE_SIZE;
walk_update_folio(walk, last, gen, dirty);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
/* feedback from rmap walkers to page table walkers */
if (mm_state && suitable_to_scan(i, young))