mm/pagewalk: split walk_page_range_novma() into kernel/user parts
author    Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
          Thu, 5 Jun 2025 13:51:04 +0000 (14:51 +0100)
committer Andrew Morton <akpm@linux-foundation.org>
          Thu, 10 Jul 2025 05:42:05 +0000 (22:42 -0700)
walk_page_range_novma() is rather confusing - it supports two modes, one
used often, the other used only for debugging.

The first mode is the common case of traversal of kernel page tables,
which is what nearly all callers use this for.

Secondly, it provides an unusual debugging interface that allows for the
traversal of page tables in a userland range of memory, even for memory
which is not described by a VMA.

It is far from certain that such page tables should even exist, but
perhaps this is precisely why it is useful as a debugging mechanism.

As a result, this mode is utilised by ptdump only.  Historically, things were
reversed - ptdump was the only user, and other parts of the kernel evolved
to use it for kernel page table walking.

Since we have some complicated and confusing locking rules for the novma
case, it makes sense to separate the two usages into their own functions.

Doing this also provides self-documentation as to the intent of the caller
- are they doing something rather unusual, or are they simply doing a
standard kernel page table walk?

We therefore establish two separate functions - walk_page_range_debug()
for this single usage, and walk_kernel_page_table_range() for general
kernel page table walking.
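
The new prototypes, as declared in include/linux/pagewalk.h and mm/internal.h
in the diff below, are:

    int walk_kernel_page_table_range(unsigned long start,
                    unsigned long end, const struct mm_walk_ops *ops,
                    pgd_t *pgd, void *private);

    int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
                              unsigned long end, const struct mm_walk_ops *ops,
                              pgd_t *pgd, void *private);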

The walk_page_range_debug() function is currently used to traverse both
userland and kernel mappings, so we maintain this behaviour; in the case
of kernel mappings being traversed, walk_page_range_debug() invokes
walk_kernel_page_table_range() internally.

We additionally make walk_page_range_debug() internal to mm.
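
As a rough illustration of what a converted kernel page table walking caller
looks like (a sketch only, modelled on the arch/loongarch/mm/pageattr.c hunk
below - the function name, ops and private pointer here are placeholders for
whatever the caller already uses):

    #include <linux/mm.h>
    #include <linux/pagewalk.h>

    /* Illustrative wrapper - not part of this patch. */
    static int walk_some_kernel_range(unsigned long start, unsigned long end,
                                      const struct mm_walk_ops *ops, void *priv)
    {
            int ret;

            /*
             * Kernel page tables are walked under the init_mm mmap lock;
             * walk_kernel_page_table_range() asserts that it is held.
             */
            mmap_write_lock(&init_mm);
            ret = walk_kernel_page_table_range(start, end, ops, NULL, priv);
            mmap_write_unlock(&init_mm);

            return ret;
    }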

Link: https://lkml.kernel.org/r/20250605135104.90720-1-lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Barry Song <baohua@kernel.org>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
Cc: WANG Xuerui <kernel@xen0n.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
arch/loongarch/mm/pageattr.c
arch/openrisc/kernel/dma.c
arch/riscv/mm/pageattr.c
include/linux/pagewalk.h
mm/hugetlb_vmemmap.c
mm/internal.h
mm/pagewalk.c
mm/ptdump.c

arch/loongarch/mm/pageattr.c
index 99165903908a4ea1fbfeb5b1faca6ab9414cabbd..f5e910b68229d3abf40da2be6352421375739e90 100644
@@ -118,7 +118,7 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask, pgp
                return 0;
 
        mmap_write_lock(&init_mm);
-       ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL, &masks);
+       ret = walk_kernel_page_table_range(start, end, &pageattr_ops, NULL, &masks);
        mmap_write_unlock(&init_mm);
 
        flush_tlb_kernel_range(start, end);
arch/openrisc/kernel/dma.c
index 3a7b5baaa45066cc5da411554b4b993cb57e0fbe..af932a4ad306458f887701e3acd9c8702845a972 100644
@@ -72,7 +72,7 @@ void *arch_dma_set_uncached(void *cpu_addr, size_t size)
         * them and setting the cache-inhibit bit.
         */
        mmap_write_lock(&init_mm);
-       error = walk_page_range_novma(&init_mm, va, va + size,
+       error = walk_kernel_page_table_range(va, va + size,
                        &set_nocache_walk_ops, NULL, NULL);
        mmap_write_unlock(&init_mm);
 
@@ -87,7 +87,7 @@ void arch_dma_clear_uncached(void *cpu_addr, size_t size)
 
        mmap_write_lock(&init_mm);
        /* walk_page_range shouldn't be able to fail here */
-       WARN_ON(walk_page_range_novma(&init_mm, va, va + size,
+       WARN_ON(walk_kernel_page_table_range(va, va + size,
                        &clear_nocache_walk_ops, NULL, NULL));
        mmap_write_unlock(&init_mm);
 }
arch/riscv/mm/pageattr.c
index d815448758a19cb0c314ab07736fb66230fa9a28..3f76db3d276992b832f3b5ab1aa4feac68198981 100644
@@ -299,7 +299,7 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
                        if (ret)
                                goto unlock;
 
-                       ret = walk_page_range_novma(&init_mm, lm_start, lm_end,
+                       ret = walk_kernel_page_table_range(lm_start, lm_end,
                                                    &pageattr_ops, NULL, &masks);
                        if (ret)
                                goto unlock;
@@ -317,13 +317,13 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
                if (ret)
                        goto unlock;
 
-               ret = walk_page_range_novma(&init_mm, lm_start, lm_end,
+               ret = walk_kernel_page_table_range(lm_start, lm_end,
                                            &pageattr_ops, NULL, &masks);
                if (ret)
                        goto unlock;
        }
 
-       ret =  walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL,
+       ret =  walk_kernel_page_table_range(start, end, &pageattr_ops, NULL,
                                     &masks);
 
 unlock:
@@ -335,7 +335,7 @@ unlock:
         */
        flush_tlb_all();
 #else
-       ret =  walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL,
+       ret =  walk_kernel_page_table_range(start, end, &pageattr_ops, NULL,
                                     &masks);
 
        mmap_write_unlock(&init_mm);
include/linux/pagewalk.h
index 9700a29f8afbc974a98f200ef7730abca4ac9657..8ac2f6d6d2a344d53376e83f4ee64e179fcbf8cf 100644
@@ -129,10 +129,9 @@ struct mm_walk {
 int walk_page_range(struct mm_struct *mm, unsigned long start,
                unsigned long end, const struct mm_walk_ops *ops,
                void *private);
-int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
-                         unsigned long end, const struct mm_walk_ops *ops,
-                         pgd_t *pgd,
-                         void *private);
+int walk_kernel_page_table_range(unsigned long start,
+               unsigned long end, const struct mm_walk_ops *ops,
+               pgd_t *pgd, void *private);
 int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
                        unsigned long end, const struct mm_walk_ops *ops,
                        void *private);
mm/hugetlb_vmemmap.c
index 27245e86df25008fb0eb50df5d6f9ef41c1f5b85..ba0fb1b6a5a8eb48ffb8bdc2708111ddc1901985 100644
@@ -166,7 +166,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
        VM_BUG_ON(!PAGE_ALIGNED(start | end));
 
        mmap_read_lock(&init_mm);
-       ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
+       ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
                                    NULL, walk);
        mmap_read_unlock(&init_mm);
        if (ret)
mm/internal.h
index f91688e2894fbcd2c5e1eb64f43a0f36cbaa2642..2c0d9f197d8111cff1ce6d25683ca6f7a689e203 100644
@@ -1604,6 +1604,9 @@ static inline void accept_page(struct page *page)
 int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
                unsigned long end, const struct mm_walk_ops *ops,
                void *private);
+int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
+                         unsigned long end, const struct mm_walk_ops *ops,
+                         pgd_t *pgd, void *private);
 
 /* pt_reclaim.c */
 bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval);
mm/pagewalk.c
index e478777c86e196d64a792d8d2180d1e85bdfce5f..ff5299eca687fca5a6aa6497aca7f97a744bd969 100644
@@ -585,8 +585,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
 }
 
 /**
- * walk_page_range_novma - walk a range of pagetables not backed by a vma
- * @mm:                mm_struct representing the target process of page table walk
+ * walk_kernel_page_table_range - walk a range of kernel pagetables.
  * @start:     start address of the virtual address range
  * @end:       end address of the virtual address range
  * @ops:       operation to call during the walk
@@ -596,17 +595,61 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
  * Similar to walk_page_range() but can walk any page tables even if they are
  * not backed by VMAs. Because 'unusual' entries may be walked this function
  * will also not lock the PTEs for the pte_entry() callback. This is useful for
- * walking the kernel pages tables or page tables for firmware.
+ * walking kernel pages tables or page tables for firmware.
  *
  * Note: Be careful to walk the kernel pages tables, the caller may be need to
  * take other effective approaches (mmap lock may be insufficient) to prevent
  * the intermediate kernel page tables belonging to the specified address range
  * from being freed (e.g. memory hot-remove).
  */
-int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
+int walk_kernel_page_table_range(unsigned long start, unsigned long end,
+               const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
+{
+       struct mm_struct *mm = &init_mm;
+       struct mm_walk walk = {
+               .ops            = ops,
+               .mm             = mm,
+               .pgd            = pgd,
+               .private        = private,
+               .no_vma         = true
+       };
+
+       if (start >= end)
+               return -EINVAL;
+       if (!check_ops_valid(ops))
+               return -EINVAL;
+
+       /*
+        * Kernel intermediate page tables are usually not freed, so the mmap
+        * read lock is sufficient. But there are some exceptions.
+        * E.g. memory hot-remove. In which case, the mmap lock is insufficient
+        * to prevent the intermediate kernel pages tables belonging to the
+        * specified address range from being freed. The caller should take
+        * other actions to prevent this race.
+        */
+       mmap_assert_locked(mm);
+
+       return walk_pgd_range(start, end, &walk);
+}
+
+/**
+ * walk_page_range_debug - walk a range of pagetables not backed by a vma
+ * @mm:                mm_struct representing the target process of page table walk
+ * @start:     start address of the virtual address range
+ * @end:       end address of the virtual address range
+ * @ops:       operation to call during the walk
+ * @pgd:       pgd to walk if different from mm->pgd
+ * @private:   private data for callbacks' usage
+ *
+ * Similar to walk_page_range() but can walk any page tables even if they are
+ * not backed by VMAs. Because 'unusual' entries may be walked this function
+ * will also not lock the PTEs for the pte_entry() callback.
+ *
+ * This is for debugging purposes ONLY.
+ */
+int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
                          unsigned long end, const struct mm_walk_ops *ops,
-                         pgd_t *pgd,
-                         void *private)
+                         pgd_t *pgd, void *private)
 {
        struct mm_walk walk = {
                .ops            = ops,
@@ -616,34 +659,24 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
                .no_vma         = true
        };
 
+       /* For convenience, we allow traversal of kernel mappings. */
+       if (mm == &init_mm)
+               return walk_kernel_page_table_range(start, end, ops,
+                                                   pgd, private);
        if (start >= end || !walk.mm)
                return -EINVAL;
        if (!check_ops_valid(ops))
                return -EINVAL;
 
        /*
-        * 1) For walking the user virtual address space:
-        *
         * The mmap lock protects the page walker from changes to the page
         * tables during the walk.  However a read lock is insufficient to
         * protect those areas which don't have a VMA as munmap() detaches
         * the VMAs before downgrading to a read lock and actually tearing
         * down PTEs/page tables. In which case, the mmap write lock should
-        * be hold.
-        *
-        * 2) For walking the kernel virtual address space:
-        *
-        * The kernel intermediate page tables usually do not be freed, so
-        * the mmap map read lock is sufficient. But there are some exceptions.
-        * E.g. memory hot-remove. In which case, the mmap lock is insufficient
-        * to prevent the intermediate kernel pages tables belonging to the
-        * specified address range from being freed. The caller should take
-        * other actions to prevent this race.
+        * be held.
         */
-       if (mm == &init_mm)
-               mmap_assert_locked(walk.mm);
-       else
-               mmap_assert_write_locked(walk.mm);
+       mmap_assert_write_locked(mm);
 
        return walk_pgd_range(start, end, &walk);
 }
mm/ptdump.c
index 9374f29cdc6f8af1eff1ff34fed95cd9a50bed5a..61a352aa12ed8ca0db4ed0c82cdb647b6fb50e41 100644
@@ -4,6 +4,7 @@
 #include <linux/debugfs.h>
 #include <linux/ptdump.h>
 #include <linux/kasan.h>
+#include "internal.h"
 
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 /*
@@ -177,7 +178,7 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
 
        mmap_write_lock(mm);
        while (range->start != range->end) {
-               walk_page_range_novma(mm, range->start, range->end,
+               walk_page_range_debug(mm, range->start, range->end,
                                      &ptdump_ops, pgd, st);
                range++;
        }