From: Greg Kroah-Hartman Date: Sat, 6 Sep 2025 19:07:11 +0000 (+0200) Subject: 6.16-stable patches X-Git-Tag: v5.4.299~46 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=82569794f3e6cd2f58ce52a12444efc205bf99fb;p=thirdparty%2Fkernel%2Fstable-queue.git 6.16-stable patches added patches: mm-fix-accounting-of-memmap-pages.patch mm-fix-possible-deadlock-in-kmemleak.patch mm-introduce-and-use-pgd-p4d-_populate_kernel.patch mm-move-page-table-sync-declarations-to-linux-pgtable.h.patch mm-slub-avoid-accessing-metadata-when-pointer-is-invalid-in-object_err.patch mm-slub-avoid-wake-up-kswapd-in-set_track_prepare.patch mm-userfaultfd-fix-kmap_local-lifo-ordering-for-config_highpte.patch rust-mm-mark-vmanew-as-transparent.patch x86-mm-64-define-arch_page_table_sync_mask-and-arch_sync_kernel_mappings.patch --- diff --git a/queue-6.16/mm-fix-accounting-of-memmap-pages.patch b/queue-6.16/mm-fix-accounting-of-memmap-pages.patch new file mode 100644 index 0000000000..543cafb1c8 --- /dev/null +++ b/queue-6.16/mm-fix-accounting-of-memmap-pages.patch @@ -0,0 +1,109 @@ +From c3576889d87b603cb66b417e08844a53c1077a37 Mon Sep 17 00:00:00 2001 +From: Sumanth Korikkar +Date: Thu, 7 Aug 2025 20:35:45 +0200 +Subject: mm: fix accounting of memmap pages + +From: Sumanth Korikkar + +commit c3576889d87b603cb66b417e08844a53c1077a37 upstream. + +For !CONFIG_SPARSEMEM_VMEMMAP, memmap page accounting is currently done +upfront in sparse_buffer_init(). However, sparse_buffer_alloc() may +return NULL in failure scenario. + +Also, memmap pages may be allocated either from the memblock allocator +during early boot or from the buddy allocator. When removed via +arch_remove_memory(), accounting of memmap pages must reflect the original +allocation source. + +To ensure correctness: +* Account memmap pages after successful allocation in sparse_init_nid() + and section_activate(). +* Account memmap pages in section_deactivate() based on allocation + source. 
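
For quick reference, the accounting rule described by the two bullets above boils down to the sketch below, condensed from the mm/sparse.c hunks that follow. memmap_pages_add() and memmap_boot_pages_add() are the real counters touched by this patch; memmap_nr_pages() and the two account_memmap_*() wrappers are names invented here purely for illustration.

/* Illustrative sketch only -- the real logic lives in mm/sparse.c. */
static unsigned long memmap_nr_pages(unsigned long nr_pages)
{
	return DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE);
}

/* Charge only after the memmap allocation actually succeeded. */
static void account_memmap_added(unsigned long nr_pages, bool early)
{
	if (early)	/* memmap came from memblock during early boot */
		memmap_boot_pages_add(memmap_nr_pages(nr_pages));
	else		/* memmap came from the buddy allocator (hotplug) */
		memmap_pages_add(memmap_nr_pages(nr_pages));
}

/* On removal, uncharge against the allocator that provided the memmap. */
static void account_memmap_removed(unsigned long nr_pages, bool early)
{
	if (early)
		memmap_boot_pages_add(-1L * memmap_nr_pages(nr_pages));
	else
		memmap_pages_add(-1L * memmap_nr_pages(nr_pages));
}
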
+ +Link: https://lkml.kernel.org/r/20250807183545.1424509-1-sumanthk@linux.ibm.com +Fixes: 15995a352474 ("mm: report per-page metadata information") +Signed-off-by: Sumanth Korikkar +Suggested-by: David Hildenbrand +Reviewed-by: Wei Yang +Cc: Alexander Gordeev +Cc: Gerald Schaefer +Cc: Heiko Carstens +Cc: Vasily Gorbik +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/sparse-vmemmap.c | 5 ----- + mm/sparse.c | 15 +++++++++------ + 2 files changed, 9 insertions(+), 11 deletions(-) + +--- a/mm/sparse-vmemmap.c ++++ b/mm/sparse-vmemmap.c +@@ -578,11 +578,6 @@ struct page * __meminit __populate_secti + if (r < 0) + return NULL; + +- if (system_state == SYSTEM_BOOTING) +- memmap_boot_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE)); +- else +- memmap_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE)); +- + return pfn_to_page(pfn); + } + +--- a/mm/sparse.c ++++ b/mm/sparse.c +@@ -454,9 +454,6 @@ static void __init sparse_buffer_init(un + */ + sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); + sparsemap_buf_end = sparsemap_buf + size; +-#ifndef CONFIG_SPARSEMEM_VMEMMAP +- memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE)); +-#endif + } + + static void __init sparse_buffer_fini(void) +@@ -567,6 +564,8 @@ static void __init sparse_init_nid(int n + sparse_buffer_fini(); + goto failed; + } ++ memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page), ++ PAGE_SIZE)); + sparse_init_early_section(nid, map, pnum, 0); + } + } +@@ -680,7 +679,6 @@ static void depopulate_section_memmap(un + unsigned long start = (unsigned long) pfn_to_page(pfn); + unsigned long end = start + nr_pages * sizeof(struct page); + +- memmap_pages_add(-1L * (DIV_ROUND_UP(end - start, PAGE_SIZE))); + vmemmap_free(start, end, altmap); + } + static void free_map_bootmem(struct page *memmap) +@@ -856,10 +854,14 @@ static void section_deactivate(unsigned + * The memmap of early sections is always fully populated. See + * section_activate() and pfn_valid() . + */ +- if (!section_is_early) ++ if (!section_is_early) { ++ memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE))); + depopulate_section_memmap(pfn, nr_pages, altmap); +- else if (memmap) ++ } else if (memmap) { ++ memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), ++ PAGE_SIZE))); + free_map_bootmem(memmap); ++ } + + if (empty) + ms->section_mem_map = (unsigned long)NULL; +@@ -904,6 +906,7 @@ static struct page * __meminit section_a + section_deactivate(pfn, nr_pages, altmap); + return ERR_PTR(-ENOMEM); + } ++ memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)); + + return memmap; + } diff --git a/queue-6.16/mm-fix-possible-deadlock-in-kmemleak.patch b/queue-6.16/mm-fix-possible-deadlock-in-kmemleak.patch new file mode 100644 index 0000000000..e600bb7601 --- /dev/null +++ b/queue-6.16/mm-fix-possible-deadlock-in-kmemleak.patch @@ -0,0 +1,115 @@ +From c873ccbb2f8db46ad9b4a989ea924b6d8f19abf1 Mon Sep 17 00:00:00 2001 +From: Gu Bowen +Date: Fri, 22 Aug 2025 15:35:41 +0800 +Subject: mm: fix possible deadlock in kmemleak + +From: Gu Bowen + +commit c873ccbb2f8db46ad9b4a989ea924b6d8f19abf1 upstream. + +There are some AA deadlock issues in kmemleak, similar to the situation +reported by Breno [1]. 
The deadlock path is as follows: + +mem_pool_alloc() + -> raw_spin_lock_irqsave(&kmemleak_lock, flags); + -> pr_warn() + -> netconsole subsystem + -> netpoll + -> __alloc_skb + -> __create_object + -> raw_spin_lock_irqsave(&kmemleak_lock, flags); + +To solve this problem, switch to printk_safe mode before printing warning +message, this will redirect all printk()-s to a special per-CPU buffer, +which will be flushed later from a safe context (irq work), and this +deadlock problem can be avoided. The proper API to use should be +printk_deferred_enter()/printk_deferred_exit() [2]. Another way is to +place the warn print after kmemleak is released. + +Link: https://lkml.kernel.org/r/20250822073541.1886469-1-gubowen5@huawei.com +Link: https://lore.kernel.org/all/20250731-kmemleak_lock-v1-1-728fd470198f@debian.org/#t [1] +Link: https://lore.kernel.org/all/5ca375cd-4a20-4807-b897-68b289626550@redhat.com/ [2] +Signed-off-by: Gu Bowen +Reviewed-by: Waiman Long +Reviewed-by: Catalin Marinas +Reviewed-by: Breno Leitao +Cc: Greg Kroah-Hartman +Cc: John Ogness +Cc: Lu Jialin +Cc: Petr Mladek +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/kmemleak.c | 27 ++++++++++++++++++++------- + 1 file changed, 20 insertions(+), 7 deletions(-) + +--- a/mm/kmemleak.c ++++ b/mm/kmemleak.c +@@ -437,9 +437,15 @@ static struct kmemleak_object *__lookup_ + else if (untagged_objp == untagged_ptr || alias) + return object; + else { ++ /* ++ * Printk deferring due to the kmemleak_lock held. ++ * This is done to avoid deadlock. ++ */ ++ printk_deferred_enter(); + kmemleak_warn("Found object by alias at 0x%08lx\n", + ptr); + dump_object_info(object); ++ printk_deferred_exit(); + break; + } + } +@@ -736,6 +742,11 @@ static int __link_object(struct kmemleak + else if (untagged_objp + parent->size <= untagged_ptr) + link = &parent->rb_node.rb_right; + else { ++ /* ++ * Printk deferring due to the kmemleak_lock held. ++ * This is done to avoid deadlock. ++ */ ++ printk_deferred_enter(); + kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n", + ptr); + /* +@@ -743,6 +754,7 @@ static int __link_object(struct kmemleak + * be freed while the kmemleak_lock is held. 
+ */ + dump_object_info(parent); ++ printk_deferred_exit(); + return -EEXIST; + } + } +@@ -856,13 +868,8 @@ static void delete_object_part(unsigned + + raw_spin_lock_irqsave(&kmemleak_lock, flags); + object = __find_and_remove_object(ptr, 1, objflags); +- if (!object) { +-#ifdef DEBUG +- kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", +- ptr, size); +-#endif ++ if (!object) + goto unlock; +- } + + /* + * Create one or two objects that may result from the memory block +@@ -882,8 +889,14 @@ static void delete_object_part(unsigned + + unlock: + raw_spin_unlock_irqrestore(&kmemleak_lock, flags); +- if (object) ++ if (object) { + __delete_object(object); ++ } else { ++#ifdef DEBUG ++ kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", ++ ptr, size); ++#endif ++ } + + out: + if (object_l) diff --git a/queue-6.16/mm-introduce-and-use-pgd-p4d-_populate_kernel.patch b/queue-6.16/mm-introduce-and-use-pgd-p4d-_populate_kernel.patch new file mode 100644 index 0000000000..1b30e71b5a --- /dev/null +++ b/queue-6.16/mm-introduce-and-use-pgd-p4d-_populate_kernel.patch @@ -0,0 +1,283 @@ +From f2d2f9598ebb0158a3fe17cda0106d7752e654a2 Mon Sep 17 00:00:00 2001 +From: Harry Yoo +Date: Mon, 18 Aug 2025 11:02:05 +0900 +Subject: mm: introduce and use {pgd,p4d}_populate_kernel() + +From: Harry Yoo + +commit f2d2f9598ebb0158a3fe17cda0106d7752e654a2 upstream. + +Introduce and use {pgd,p4d}_populate_kernel() in core MM code when +populating PGD and P4D entries for the kernel address space. These +helpers ensure proper synchronization of page tables when updating the +kernel portion of top-level page tables. + +Until now, the kernel has relied on each architecture to handle +synchronization of top-level page tables in an ad-hoc manner. For +example, see commit 9b861528a801 ("x86-64, mem: Update all PGDs for direct +mapping and vmemmap mapping changes"). + +However, this approach has proven fragile for following reasons: + + 1) It is easy to forget to perform the necessary page table + synchronization when introducing new changes. + For instance, commit 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory + savings for compound devmaps") overlooked the need to synchronize + page tables for the vmemmap area. + + 2) It is also easy to overlook that the vmemmap and direct mapping areas + must not be accessed before explicit page table synchronization. + For example, commit 8d400913c231 ("x86/vmemmap: handle unpopulated + sub-pmd ranges")) caused crashes by accessing the vmemmap area + before calling sync_global_pgds(). + +To address this, as suggested by Dave Hansen, introduce _kernel() variants +of the page table population helpers, which invoke architecture-specific +hooks to properly synchronize page tables. These are introduced in a new +header file, include/linux/pgalloc.h, so they can be called from common +code. + +They reuse existing infrastructure for vmalloc and ioremap. +Synchronization requirements are determined by ARCH_PAGE_TABLE_SYNC_MASK, +and the actual synchronization is performed by +arch_sync_kernel_mappings(). + +This change currently targets only x86_64, so only PGD and P4D level +helpers are introduced. Currently, these helpers are no-ops since no +architecture sets PGTBL_{PGD,P4D}_MODIFIED in ARCH_PAGE_TABLE_SYNC_MASK. + +In theory, PUD and PMD level helpers can be added later if needed by other +architectures. 
For now, 32-bit architectures (x86-32 and arm) only handle +PGTBL_PMD_MODIFIED, so p*d_populate_kernel() will never affect them unless +we introduce a PMD level helper. + +[harry.yoo@oracle.com: fix KASAN build error due to p*d_populate_kernel()] + Link: https://lkml.kernel.org/r/20250822020727.202749-1-harry.yoo@oracle.com +Link: https://lkml.kernel.org/r/20250818020206.4517-3-harry.yoo@oracle.com +Fixes: 8d400913c231 ("x86/vmemmap: handle unpopulated sub-pmd ranges") +Signed-off-by: Harry Yoo +Suggested-by: Dave Hansen +Acked-by: Kiryl Shutsemau +Reviewed-by: Mike Rapoport (Microsoft) +Reviewed-by: Lorenzo Stoakes +Acked-by: David Hildenbrand +Cc: Alexander Potapenko +Cc: Alistair Popple +Cc: Andrey Konovalov +Cc: Andrey Ryabinin +Cc: Andy Lutomirski +Cc: "Aneesh Kumar K.V" +Cc: Anshuman Khandual +Cc: Ard Biesheuvel +Cc: Arnd Bergmann +Cc: bibo mao +Cc: Borislav Betkov +Cc: Christoph Lameter (Ampere) +Cc: Dennis Zhou +Cc: Dev Jain +Cc: Dmitriy Vyukov +Cc: Gwan-gyeong Mun +Cc: Ingo Molnar +Cc: Jane Chu +Cc: Joao Martins +Cc: Joerg Roedel +Cc: John Hubbard +Cc: Kevin Brodsky +Cc: Liam Howlett +Cc: Michal Hocko +Cc: Oscar Salvador +Cc: Peter Xu +Cc: Peter Zijlstra +Cc: Qi Zheng +Cc: Ryan Roberts +Cc: Suren Baghdasaryan +Cc: Tejun Heo +Cc: Thomas Gleinxer +Cc: Thomas Huth +Cc: "Uladzislau Rezki (Sony)" +Cc: Vincenzo Frascino +Cc: Vlastimil Babka +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/pgalloc.h | 29 +++++++++++++++++++++++++++++ + include/linux/pgtable.h | 13 +++++++------ + mm/kasan/init.c | 12 ++++++------ + mm/percpu.c | 6 +++--- + mm/sparse-vmemmap.c | 6 +++--- + 5 files changed, 48 insertions(+), 18 deletions(-) + create mode 100644 include/linux/pgalloc.h + +--- /dev/null ++++ b/include/linux/pgalloc.h +@@ -0,0 +1,29 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _LINUX_PGALLOC_H ++#define _LINUX_PGALLOC_H ++ ++#include ++#include ++ ++/* ++ * {pgd,p4d}_populate_kernel() are defined as macros to allow ++ * compile-time optimization based on the configured page table levels. ++ * Without this, linking may fail because callers (e.g., KASAN) may rely ++ * on calls to these functions being optimized away when passing symbols ++ * that exist only for certain page table levels. ++ */ ++#define pgd_populate_kernel(addr, pgd, p4d) \ ++ do { \ ++ pgd_populate(&init_mm, pgd, p4d); \ ++ if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_PGD_MODIFIED) \ ++ arch_sync_kernel_mappings(addr, addr); \ ++ } while (0) ++ ++#define p4d_populate_kernel(addr, p4d, pud) \ ++ do { \ ++ p4d_populate(&init_mm, p4d, pud); \ ++ if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_P4D_MODIFIED) \ ++ arch_sync_kernel_mappings(addr, addr); \ ++ } while (0) ++ ++#endif /* _LINUX_PGALLOC_H */ +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -1697,8 +1697,8 @@ static inline int pmd_protnone(pmd_t pmd + + /* + * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values +- * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() +- * needs to be called. ++ * and let generic vmalloc, ioremap and page table update code know when ++ * arch_sync_kernel_mappings() needs to be called. + */ + #ifndef ARCH_PAGE_TABLE_SYNC_MASK + #define ARCH_PAGE_TABLE_SYNC_MASK 0 +@@ -1831,10 +1831,11 @@ static inline bool arch_has_pfn_modify_c + /* + * Page Table Modification bits for pgtbl_mod_mask. 
+ * +- * These are used by the p?d_alloc_track*() set of functions an in the generic +- * vmalloc/ioremap code to track at which page-table levels entries have been +- * modified. Based on that the code can better decide when vmalloc and ioremap +- * mapping changes need to be synchronized to other page-tables in the system. ++ * These are used by the p?d_alloc_track*() and p*d_populate_kernel() ++ * functions in the generic vmalloc, ioremap and page table update code ++ * to track at which page-table levels entries have been modified. ++ * Based on that the code can better decide when page table changes need ++ * to be synchronized to other page-tables in the system. + */ + #define __PGTBL_PGD_MODIFIED 0 + #define __PGTBL_P4D_MODIFIED 1 +--- a/mm/kasan/init.c ++++ b/mm/kasan/init.c +@@ -13,9 +13,9 @@ + #include + #include + #include ++#include + + #include +-#include + + #include "kasan.h" + +@@ -191,7 +191,7 @@ static int __ref zero_p4d_populate(pgd_t + pud_t *pud; + pmd_t *pmd; + +- p4d_populate(&init_mm, p4d, ++ p4d_populate_kernel(addr, p4d, + lm_alias(kasan_early_shadow_pud)); + pud = pud_offset(p4d, addr); + pud_populate(&init_mm, pud, +@@ -212,7 +212,7 @@ static int __ref zero_p4d_populate(pgd_t + } else { + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); + pud_init(p); +- p4d_populate(&init_mm, p4d, p); ++ p4d_populate_kernel(addr, p4d, p); + } + } + zero_pud_populate(p4d, addr, next); +@@ -251,10 +251,10 @@ int __ref kasan_populate_early_shadow(co + * puds,pmds, so pgd_populate(), pud_populate() + * is noops. + */ +- pgd_populate(&init_mm, pgd, ++ pgd_populate_kernel(addr, pgd, + lm_alias(kasan_early_shadow_p4d)); + p4d = p4d_offset(pgd, addr); +- p4d_populate(&init_mm, p4d, ++ p4d_populate_kernel(addr, p4d, + lm_alias(kasan_early_shadow_pud)); + pud = pud_offset(p4d, addr); + pud_populate(&init_mm, pud, +@@ -273,7 +273,7 @@ int __ref kasan_populate_early_shadow(co + if (!p) + return -ENOMEM; + } else { +- pgd_populate(&init_mm, pgd, ++ pgd_populate_kernel(addr, pgd, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + } +--- a/mm/percpu.c ++++ b/mm/percpu.c +@@ -3108,7 +3108,7 @@ out_free: + #endif /* BUILD_EMBED_FIRST_CHUNK */ + + #ifdef BUILD_PAGE_FIRST_CHUNK +-#include ++#include + + #ifndef P4D_TABLE_SIZE + #define P4D_TABLE_SIZE PAGE_SIZE +@@ -3134,13 +3134,13 @@ void __init __weak pcpu_populate_pte(uns + + if (pgd_none(*pgd)) { + p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE); +- pgd_populate(&init_mm, pgd, p4d); ++ pgd_populate_kernel(addr, pgd, p4d); + } + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE); +- p4d_populate(&init_mm, p4d, pud); ++ p4d_populate_kernel(addr, p4d, pud); + } + + pud = pud_offset(p4d, addr); +--- a/mm/sparse-vmemmap.c ++++ b/mm/sparse-vmemmap.c +@@ -27,9 +27,9 @@ + #include + #include + #include ++#include + + #include +-#include + #include + + #include "hugetlb_vmemmap.h" +@@ -229,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(p + if (!p) + return NULL; + pud_init(p); +- p4d_populate(&init_mm, p4d, p); ++ p4d_populate_kernel(addr, p4d, p); + } + return p4d; + } +@@ -241,7 +241,7 @@ pgd_t * __meminit vmemmap_pgd_populate(u + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); + if (!p) + return NULL; +- pgd_populate(&init_mm, pgd, p); ++ pgd_populate_kernel(addr, pgd, p); + } + return pgd; + } diff --git a/queue-6.16/mm-move-page-table-sync-declarations-to-linux-pgtable.h.patch b/queue-6.16/mm-move-page-table-sync-declarations-to-linux-pgtable.h.patch new file mode 
100644 index 0000000000..4c19cada34 --- /dev/null +++ b/queue-6.16/mm-move-page-table-sync-declarations-to-linux-pgtable.h.patch @@ -0,0 +1,216 @@ +From 7cc183f2e67d19b03ee5c13a6664b8c6cc37ff9d Mon Sep 17 00:00:00 2001 +From: Harry Yoo +Date: Mon, 18 Aug 2025 11:02:04 +0900 +Subject: mm: move page table sync declarations to linux/pgtable.h + +From: Harry Yoo + +commit 7cc183f2e67d19b03ee5c13a6664b8c6cc37ff9d upstream. + +During our internal testing, we started observing intermittent boot +failures when the machine uses 4-level paging and has a large amount of +persistent memory: + + BUG: unable to handle page fault for address: ffffe70000000034 + #PF: supervisor write access in kernel mode + #PF: error_code(0x0002) - not-present page + PGD 0 P4D 0 + Oops: 0002 [#1] SMP NOPTI + RIP: 0010:__init_single_page+0x9/0x6d + Call Trace: + + __init_zone_device_page+0x17/0x5d + memmap_init_zone_device+0x154/0x1bb + pagemap_range+0x2e0/0x40f + memremap_pages+0x10b/0x2f0 + devm_memremap_pages+0x1e/0x60 + dev_dax_probe+0xce/0x2ec [device_dax] + dax_bus_probe+0x6d/0xc9 + [... snip ...] + + +It turns out that the kernel panics while initializing vmemmap (struct +page array) when the vmemmap region spans two PGD entries, because the new +PGD entry is only installed in init_mm.pgd, but not in the page tables of +other tasks. + +And looking at __populate_section_memmap(): + if (vmemmap_can_optimize(altmap, pgmap)) + // does not sync top level page tables + r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap); + else + // sync top level page tables in x86 + r = vmemmap_populate(start, end, nid, altmap); + +In the normal path, vmemmap_populate() in arch/x86/mm/init_64.c +synchronizes the top level page table (See commit 9b861528a801 ("x86-64, +mem: Update all PGDs for direct mapping and vmemmap mapping changes")) so +that all tasks in the system can see the new vmemmap area. + +However, when vmemmap_can_optimize() returns true, the optimized path +skips synchronization of top-level page tables. This is because +vmemmap_populate_compound_pages() is implemented in core MM code, which +does not handle synchronization of the top-level page tables. Instead, +the core MM has historically relied on each architecture to perform this +synchronization manually. + +We're not the first party to encounter a crash caused by not-sync'd top +level page tables: earlier this year, Gwan-gyeong Mun attempted to address +the issue [1] [2] after hitting a kernel panic when x86 code accessed the +vmemmap area before the corresponding top-level entries were synced. At +that time, the issue was believed to be triggered only when struct page +was enlarged for debugging purposes, and the patch did not get further +updates. + +It turns out that current approach of relying on each arch to handle the +page table sync manually is fragile because 1) it's easy to forget to sync +the top level page table, and 2) it's also easy to overlook that the +kernel should not access the vmemmap and direct mapping areas before the +sync. + +# The solution: Make page table sync more code robust and harder to miss + +To address this, Dave Hansen suggested [3] [4] introducing +{pgd,p4d}_populate_kernel() for updating kernel portion of the page tables +and allow each architecture to explicitly perform synchronization when +installing top-level entries. With this approach, we no longer need to +worry about missing the sync step, reducing the risk of future +regressions. 
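
As a concrete example of the new interface, the abridged function below (based on the mm/sparse-vmemmap.c hunk in the companion {pgd,p4d}_populate_kernel() patch elsewhere in this queue) shows how a core-MM caller switches from the plain populate helper to the _kernel() variant; only the comment is added here for explanation.

pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
{
	pgd_t *pgd = pgd_offset_k(addr);

	if (pgd_none(*pgd)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);

		if (!p)
			return NULL;
		/*
		 * Was pgd_populate(&init_mm, pgd, p). The _kernel() variant
		 * additionally calls arch_sync_kernel_mappings() whenever the
		 * architecture sets PGTBL_PGD_MODIFIED in
		 * ARCH_PAGE_TABLE_SYNC_MASK.
		 */
		pgd_populate_kernel(addr, pgd, p);
	}

	return pgd;
}
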
+ +The new interface reuses existing ARCH_PAGE_TABLE_SYNC_MASK, +PGTBL_P*D_MODIFIED and arch_sync_kernel_mappings() facility used by +vmalloc and ioremap to synchronize page tables. + +pgd_populate_kernel() looks like this: +static inline void pgd_populate_kernel(unsigned long addr, pgd_t *pgd, + p4d_t *p4d) +{ + pgd_populate(&init_mm, pgd, p4d); + if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_PGD_MODIFIED) + arch_sync_kernel_mappings(addr, addr); +} + +It is worth noting that vmalloc() and apply_to_range() carefully +synchronizes page tables by calling p*d_alloc_track() and +arch_sync_kernel_mappings(), and thus they are not affected by this patch +series. + +This series was hugely inspired by Dave Hansen's suggestion and hence +added Suggested-by: Dave Hansen. + +Cc stable because lack of this series opens the door to intermittent +boot failures. + + +This patch (of 3): + +Move ARCH_PAGE_TABLE_SYNC_MASK and arch_sync_kernel_mappings() to +linux/pgtable.h so that they can be used outside of vmalloc and ioremap. + +Link: https://lkml.kernel.org/r/20250818020206.4517-1-harry.yoo@oracle.com +Link: https://lkml.kernel.org/r/20250818020206.4517-2-harry.yoo@oracle.com +Link: https://lore.kernel.org/linux-mm/20250220064105.808339-1-gwan-gyeong.mun@intel.com [1] +Link: https://lore.kernel.org/linux-mm/20250311114420.240341-1-gwan-gyeong.mun@intel.com [2] +Link: https://lore.kernel.org/linux-mm/d1da214c-53d3-45ac-a8b6-51821c5416e4@intel.com [3] +Link: https://lore.kernel.org/linux-mm/4d800744-7b88-41aa-9979-b245e8bf794b@intel.com [4] +Fixes: 8d400913c231 ("x86/vmemmap: handle unpopulated sub-pmd ranges") +Signed-off-by: Harry Yoo +Acked-by: Kiryl Shutsemau +Reviewed-by: Mike Rapoport (Microsoft) +Reviewed-by: "Uladzislau Rezki (Sony)" +Reviewed-by: Lorenzo Stoakes +Acked-by: David Hildenbrand +Cc: Alexander Potapenko +Cc: Alistair Popple +Cc: Andrey Konovalov +Cc: Andrey Ryabinin +Cc: Andy Lutomirski +Cc: "Aneesh Kumar K.V" +Cc: Anshuman Khandual +Cc: Ard Biesheuvel +Cc: Arnd Bergmann +Cc: bibo mao +Cc: Borislav Betkov +Cc: Christoph Lameter (Ampere) +Cc: Dennis Zhou +Cc: Dev Jain +Cc: Dmitriy Vyukov +Cc: Gwan-gyeong Mun +Cc: Ingo Molnar +Cc: Jane Chu +Cc: Joao Martins +Cc: Joerg Roedel +Cc: John Hubbard +Cc: Kevin Brodsky +Cc: Liam Howlett +Cc: Michal Hocko +Cc: Oscar Salvador +Cc: Peter Xu +Cc: Peter Zijlstra +Cc: Qi Zheng +Cc: Ryan Roberts +Cc: Suren Baghdasaryan +Cc: Tejun Heo +Cc: Thomas Gleinxer +Cc: Thomas Huth +Cc: Vincenzo Frascino +Cc: Vlastimil Babka +Cc: Dave Hansen +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/pgtable.h | 16 ++++++++++++++++ + include/linux/vmalloc.h | 16 ---------------- + 2 files changed, 16 insertions(+), 16 deletions(-) + +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -1695,6 +1695,22 @@ static inline int pmd_protnone(pmd_t pmd + } + #endif /* CONFIG_NUMA_BALANCING */ + ++/* ++ * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values ++ * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() ++ * needs to be called. ++ */ ++#ifndef ARCH_PAGE_TABLE_SYNC_MASK ++#define ARCH_PAGE_TABLE_SYNC_MASK 0 ++#endif ++ ++/* ++ * There is no default implementation for arch_sync_kernel_mappings(). It is ++ * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK ++ * is 0. 
++ */ ++void arch_sync_kernel_mappings(unsigned long start, unsigned long end); ++ + #endif /* CONFIG_MMU */ + + #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -220,22 +220,6 @@ int vmap_pages_range(unsigned long addr, + struct page **pages, unsigned int page_shift); + + /* +- * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values +- * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() +- * needs to be called. +- */ +-#ifndef ARCH_PAGE_TABLE_SYNC_MASK +-#define ARCH_PAGE_TABLE_SYNC_MASK 0 +-#endif +- +-/* +- * There is no default implementation for arch_sync_kernel_mappings(). It is +- * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK +- * is 0. +- */ +-void arch_sync_kernel_mappings(unsigned long start, unsigned long end); +- +-/* + * Lowlevel-APIs (not for driver use!) + */ + diff --git a/queue-6.16/mm-slub-avoid-accessing-metadata-when-pointer-is-invalid-in-object_err.patch b/queue-6.16/mm-slub-avoid-accessing-metadata-when-pointer-is-invalid-in-object_err.patch new file mode 100644 index 0000000000..76748a1a45 --- /dev/null +++ b/queue-6.16/mm-slub-avoid-accessing-metadata-when-pointer-is-invalid-in-object_err.patch @@ -0,0 +1,50 @@ +From b4efccec8d06ceb10a7d34d7b1c449c569d53770 Mon Sep 17 00:00:00 2001 +From: Li Qiong +Date: Mon, 4 Aug 2025 10:57:59 +0800 +Subject: mm/slub: avoid accessing metadata when pointer is invalid in object_err() + +From: Li Qiong + +commit b4efccec8d06ceb10a7d34d7b1c449c569d53770 upstream. + +object_err() reports details of an object for further debugging, such as +the freelist pointer, redzone, etc. However, if the pointer is invalid, +attempting to access object metadata can lead to a crash since it does +not point to a valid object. + +One known path to the crash is when alloc_consistency_checks() +determines the pointer to the allocated object is invalid because of a +freelist corruption, and calls object_err() to report it. The debug code +should report and handle the corruption gracefully and not crash in the +process. + +In case the pointer is NULL or check_valid_pointer() returns false for +the pointer, only print the pointer value and skip accessing metadata. + +Fixes: 81819f0fc828 ("SLUB core") +Cc: +Signed-off-by: Li Qiong +Reviewed-by: Harry Yoo +Reviewed-by: Matthew Wilcox (Oracle) +Signed-off-by: Vlastimil Babka +Signed-off-by: Greg Kroah-Hartman +--- + mm/slub.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -1104,7 +1104,12 @@ static void object_err(struct kmem_cache + return; + + slab_bug(s, reason); +- print_trailer(s, slab, object); ++ if (!object || !check_valid_pointer(s, slab, object)) { ++ print_slab_info(slab); ++ pr_err("Invalid pointer 0x%p\n", object); ++ } else { ++ print_trailer(s, slab, object); ++ } + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); diff --git a/queue-6.16/mm-slub-avoid-wake-up-kswapd-in-set_track_prepare.patch b/queue-6.16/mm-slub-avoid-wake-up-kswapd-in-set_track_prepare.patch new file mode 100644 index 0000000000..d8a3d6282f --- /dev/null +++ b/queue-6.16/mm-slub-avoid-wake-up-kswapd-in-set_track_prepare.patch @@ -0,0 +1,164 @@ +From 850470a8413a8a78e772c4f6bd9fe81ec6bd5b0f Mon Sep 17 00:00:00 2001 +From: yangshiguang +Date: Sat, 30 Aug 2025 10:09:46 +0800 +Subject: mm: slub: avoid wake up kswapd in set_track_prepare + +From: yangshiguang + +commit 850470a8413a8a78e772c4f6bd9fe81ec6bd5b0f upstream. 
+ +set_track_prepare() can incur lock recursion. +The issue is that it is called from hrtimer_start_range_ns +holding the per_cpu(hrtimer_bases)[n].lock, but when enabled +CONFIG_DEBUG_OBJECTS_TIMERS, may wake up kswapd in set_track_prepare, +and try to hold the per_cpu(hrtimer_bases)[n].lock. + +Avoid deadlock caused by implicitly waking up kswapd by passing in +allocation flags, which do not contain __GFP_KSWAPD_RECLAIM in the +debug_objects_fill_pool() case. Inside stack depot they are processed by +gfp_nested_mask(). +Since ___slab_alloc() has preemption disabled, we mask out +__GFP_DIRECT_RECLAIM from the flags there. + +The oops looks something like: + +BUG: spinlock recursion on CPU#3, swapper/3/0 + lock: 0xffffff8a4bf29c80, .magic: dead4ead, .owner: swapper/3/0, .owner_cpu: 3 +Hardware name: Qualcomm Technologies, Inc. Popsicle based on SM8850 (DT) +Call trace: +spin_bug+0x0 +_raw_spin_lock_irqsave+0x80 +hrtimer_try_to_cancel+0x94 +task_contending+0x10c +enqueue_dl_entity+0x2a4 +dl_server_start+0x74 +enqueue_task_fair+0x568 +enqueue_task+0xac +do_activate_task+0x14c +ttwu_do_activate+0xcc +try_to_wake_up+0x6c8 +default_wake_function+0x20 +autoremove_wake_function+0x1c +__wake_up+0xac +wakeup_kswapd+0x19c +wake_all_kswapds+0x78 +__alloc_pages_slowpath+0x1ac +__alloc_pages_noprof+0x298 +stack_depot_save_flags+0x6b0 +stack_depot_save+0x14 +set_track_prepare+0x5c +___slab_alloc+0xccc +__kmalloc_cache_noprof+0x470 +__set_page_owner+0x2bc +post_alloc_hook[jt]+0x1b8 +prep_new_page+0x28 +get_page_from_freelist+0x1edc +__alloc_pages_noprof+0x13c +alloc_slab_page+0x244 +allocate_slab+0x7c +___slab_alloc+0x8e8 +kmem_cache_alloc_noprof+0x450 +debug_objects_fill_pool+0x22c +debug_object_activate+0x40 +enqueue_hrtimer[jt]+0xdc +hrtimer_start_range_ns+0x5f8 +... 
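
For quick reference, the patched helper ends up looking like this (reassembled from the mm/slub.c hunks below); the leading comment summarizes what the two call sites pass in.

/*
 * set_track_prepare() now takes the allocation flags from its caller
 * instead of hard-coding GFP_NOWAIT: ___slab_alloc() passes
 * gfpflags & ~__GFP_DIRECT_RECLAIM (preemption is disabled there) and
 * free_to_partial_list() passes plain __GFP_NOWARN, so kswapd is never
 * woken while hrtimer (or other) locks are held.
 */
static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
{
	depot_stack_handle_t handle;
	unsigned long entries[TRACK_ADDRS_COUNT];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
	/* stack depot further restricts gfp_flags via gfp_nested_mask() */
	handle = stack_depot_save(entries, nr_entries, gfp_flags);

	return handle;
}
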
+ +Signed-off-by: yangshiguang +Fixes: 5cf909c553e9 ("mm/slub: use stackdepot to save stack trace in objects") +Cc: stable@vger.kernel.org +Signed-off-by: Vlastimil Babka +Signed-off-by: Greg Kroah-Hartman +--- + mm/slub.c | 30 ++++++++++++++++++++---------- + 1 file changed, 20 insertions(+), 10 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -926,19 +926,19 @@ static struct track *get_track(struct km + } + + #ifdef CONFIG_STACKDEPOT +-static noinline depot_stack_handle_t set_track_prepare(void) ++static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) + { + depot_stack_handle_t handle; + unsigned long entries[TRACK_ADDRS_COUNT]; + unsigned int nr_entries; + + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3); +- handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); ++ handle = stack_depot_save(entries, nr_entries, gfp_flags); + + return handle; + } + #else +-static inline depot_stack_handle_t set_track_prepare(void) ++static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) + { + return 0; + } +@@ -960,9 +960,9 @@ static void set_track_update(struct kmem + } + + static __always_inline void set_track(struct kmem_cache *s, void *object, +- enum track_item alloc, unsigned long addr) ++ enum track_item alloc, unsigned long addr, gfp_t gfp_flags) + { +- depot_stack_handle_t handle = set_track_prepare(); ++ depot_stack_handle_t handle = set_track_prepare(gfp_flags); + + set_track_update(s, object, alloc, addr, handle); + } +@@ -1890,9 +1890,9 @@ static inline bool free_debug_processing + static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} + static inline int check_object(struct kmem_cache *s, struct slab *slab, + void *object, u8 val) { return 1; } +-static inline depot_stack_handle_t set_track_prepare(void) { return 0; } ++static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; } + static inline void set_track(struct kmem_cache *s, void *object, +- enum track_item alloc, unsigned long addr) {} ++ enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {} + static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct slab *slab) {} + static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, +@@ -3849,9 +3849,14 @@ new_objects: + * For debug caches here we had to go through + * alloc_single_from_partial() so just store the + * tracking info and return the object. ++ * ++ * Due to disabled preemption we need to disallow ++ * blocking. The flags are further adjusted by ++ * gfp_nested_mask() in stack_depot itself. 
+ */ + if (s->flags & SLAB_STORE_USER) +- set_track(s, freelist, TRACK_ALLOC, addr); ++ set_track(s, freelist, TRACK_ALLOC, addr, ++ gfpflags & ~(__GFP_DIRECT_RECLAIM)); + + return freelist; + } +@@ -3883,7 +3888,8 @@ new_objects: + goto new_objects; + + if (s->flags & SLAB_STORE_USER) +- set_track(s, freelist, TRACK_ALLOC, addr); ++ set_track(s, freelist, TRACK_ALLOC, addr, ++ gfpflags & ~(__GFP_DIRECT_RECLAIM)); + + return freelist; + } +@@ -4394,8 +4400,12 @@ static noinline void free_to_partial_lis + unsigned long flags; + depot_stack_handle_t handle = 0; + ++ /* ++ * We cannot use GFP_NOWAIT as there are callsites where waking up ++ * kswapd could deadlock ++ */ + if (s->flags & SLAB_STORE_USER) +- handle = set_track_prepare(); ++ handle = set_track_prepare(__GFP_NOWARN); + + spin_lock_irqsave(&n->list_lock, flags); + diff --git a/queue-6.16/mm-userfaultfd-fix-kmap_local-lifo-ordering-for-config_highpte.patch b/queue-6.16/mm-userfaultfd-fix-kmap_local-lifo-ordering-for-config_highpte.patch new file mode 100644 index 0000000000..c0da8884a7 --- /dev/null +++ b/queue-6.16/mm-userfaultfd-fix-kmap_local-lifo-ordering-for-config_highpte.patch @@ -0,0 +1,61 @@ +From 9614d8bee66387501f48718fa306e17f2aa3f2f3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 31 Jul 2025 10:44:31 -0400 +Subject: mm/userfaultfd: fix kmap_local LIFO ordering for CONFIG_HIGHPTE + +From: Sasha Levin + +commit 9614d8bee66387501f48718fa306e17f2aa3f2f3 upstream. + +With CONFIG_HIGHPTE on 32-bit ARM, move_pages_pte() maps PTE pages using +kmap_local_page(), which requires unmapping in Last-In-First-Out order. + +The current code maps dst_pte first, then src_pte, but unmaps them in the +same order (dst_pte, src_pte), violating the LIFO requirement. This +causes the warning in kunmap_local_indexed(): + + WARNING: CPU: 0 PID: 604 at mm/highmem.c:622 kunmap_local_indexed+0x178/0x17c + addr \!= __fix_to_virt(FIX_KMAP_BEGIN + idx) + +Fix this by reversing the unmap order to respect LIFO ordering. + +This issue follows the same pattern as similar fixes: +- commit eca6828403b8 ("crypto: skcipher - fix mismatch between mapping and unmapping order") +- commit 8cf57c6df818 ("nilfs2: eliminate staggered calls to kunmap in nilfs_rename") + +Both of which addressed the same fundamental requirement that kmap_local +operations must follow LIFO ordering. + +Link: https://lkml.kernel.org/r/20250731144431.773923-1-sashal@kernel.org +Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI") +Signed-off-by: Sasha Levin +Acked-by: David Hildenbrand +Reviewed-by: Suren Baghdasaryan +Cc: Andrea Arcangeli +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/userfaultfd.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/mm/userfaultfd.c ++++ b/mm/userfaultfd.c +@@ -1453,10 +1453,15 @@ out: + folio_unlock(src_folio); + folio_put(src_folio); + } +- if (dst_pte) +- pte_unmap(dst_pte); ++ /* ++ * Unmap in reverse order (LIFO) to maintain proper kmap_local ++ * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte ++ * first, then src_pte, so we must unmap src_pte first, then dst_pte. 
++ */ + if (src_pte) + pte_unmap(src_pte); ++ if (dst_pte) ++ pte_unmap(dst_pte); + mmu_notifier_invalidate_range_end(&range); + if (si) + put_swap_device(si); diff --git a/queue-6.16/rust-mm-mark-vmanew-as-transparent.patch b/queue-6.16/rust-mm-mark-vmanew-as-transparent.patch new file mode 100644 index 0000000000..3277959431 --- /dev/null +++ b/queue-6.16/rust-mm-mark-vmanew-as-transparent.patch @@ -0,0 +1,54 @@ +From 5cc5e030bce2ec97ae5cdb2c1b94a98b1047b3fa Mon Sep 17 00:00:00 2001 +From: Baptiste Lepers +Date: Tue, 12 Aug 2025 15:26:56 +0200 +Subject: rust: mm: mark VmaNew as transparent +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Baptiste Lepers + +commit 5cc5e030bce2ec97ae5cdb2c1b94a98b1047b3fa upstream. + +Unsafe code in VmaNew's methods assumes that the type has the same layout +as the inner `bindings::vm_area_struct`. This is not guaranteed by the +default struct representation in Rust, but requires specifying the +`transparent` representation. + +Link: https://lkml.kernel.org/r/20250812132712.61007-1-baptiste.lepers@gmail.com +Fixes: dcb81aeab406 ("mm: rust: add VmaNew for f_ops->mmap()") +Signed-off-by: Baptiste Lepers +Reviewed-by: Alice Ryhl +Cc: Alex Gaynor +Cc: Andreas Hindborg +Cc: Björn Roy Baron +Cc: Boqun Feng +Cc: Danilo Krummrich +Cc: Gary Guo +Cc: Jann Horn +Cc: Liam Howlett +Cc: Lorenzo Stoakes +Cc: Miguel Ojeda +Cc: Trevor Gross +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + rust/kernel/mm/virt.rs | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs +index 6086ca981b06..a1bfa4e19293 100644 +--- a/rust/kernel/mm/virt.rs ++++ b/rust/kernel/mm/virt.rs +@@ -209,6 +209,7 @@ pub fn vm_insert_page(&self, address: usize, page: &Page) -> Result { + /// + /// For the duration of 'a, the referenced vma must be undergoing initialization in an + /// `f_ops->mmap()` hook. 
++#[repr(transparent)] + pub struct VmaNew { + vma: VmaRef, + } +-- +2.51.0 + diff --git a/queue-6.16/series b/queue-6.16/series index 67d1edd298..9c788c00dd 100644 --- a/queue-6.16/series +++ b/queue-6.16/series @@ -98,3 +98,12 @@ accel-ivpu-prevent-recovery-work-from-being-queued-during-device-removal.patch acpi-iort-fix-memory-leak-in-iort_rmr_alloc_sids.patch arm64-ftrace-fix-unreachable-plt-for-ftrace_caller-in-init_module-with-config_dynamic_ftrace.patch pcmcia-fix-a-null-pointer-dereference-in-__iodyn_find_io_region.patch +rust-mm-mark-vmanew-as-transparent.patch +mm-slub-avoid-accessing-metadata-when-pointer-is-invalid-in-object_err.patch +x86-mm-64-define-arch_page_table_sync_mask-and-arch_sync_kernel_mappings.patch +mm-userfaultfd-fix-kmap_local-lifo-ordering-for-config_highpte.patch +mm-fix-accounting-of-memmap-pages.patch +mm-move-page-table-sync-declarations-to-linux-pgtable.h.patch +mm-introduce-and-use-pgd-p4d-_populate_kernel.patch +mm-fix-possible-deadlock-in-kmemleak.patch +mm-slub-avoid-wake-up-kswapd-in-set_track_prepare.patch diff --git a/queue-6.16/x86-mm-64-define-arch_page_table_sync_mask-and-arch_sync_kernel_mappings.patch b/queue-6.16/x86-mm-64-define-arch_page_table_sync_mask-and-arch_sync_kernel_mappings.patch new file mode 100644 index 0000000000..ece7d97a82 --- /dev/null +++ b/queue-6.16/x86-mm-64-define-arch_page_table_sync_mask-and-arch_sync_kernel_mappings.patch @@ -0,0 +1,153 @@ +From 6659d027998083fbb6d42a165b0c90dc2e8ba989 Mon Sep 17 00:00:00 2001 +From: Harry Yoo +Date: Mon, 18 Aug 2025 11:02:06 +0900 +Subject: x86/mm/64: define ARCH_PAGE_TABLE_SYNC_MASK and arch_sync_kernel_mappings() + +From: Harry Yoo + +commit 6659d027998083fbb6d42a165b0c90dc2e8ba989 upstream. + +Define ARCH_PAGE_TABLE_SYNC_MASK and arch_sync_kernel_mappings() to ensure +page tables are properly synchronized when calling p*d_populate_kernel(). + +For 5-level paging, synchronization is performed via +pgd_populate_kernel(). In 4-level paging, pgd_populate() is a no-op, so +synchronization is instead performed at the P4D level via +p4d_populate_kernel(). + +This fixes intermittent boot failures on systems using 4-level paging and +a large amount of persistent memory: + + BUG: unable to handle page fault for address: ffffe70000000034 + #PF: supervisor write access in kernel mode + #PF: error_code(0x0002) - not-present page + PGD 0 P4D 0 + Oops: 0002 [#1] SMP NOPTI + RIP: 0010:__init_single_page+0x9/0x6d + Call Trace: + + __init_zone_device_page+0x17/0x5d + memmap_init_zone_device+0x154/0x1bb + pagemap_range+0x2e0/0x40f + memremap_pages+0x10b/0x2f0 + devm_memremap_pages+0x1e/0x60 + dev_dax_probe+0xce/0x2ec [device_dax] + dax_bus_probe+0x6d/0xc9 + [... snip ...] + + +It also fixes a crash in vmemmap_set_pmd() caused by accessing vmemmap +before sync_global_pgds() [1]: + + BUG: unable to handle page fault for address: ffffeb3ff1200000 + #PF: supervisor write access in kernel mode + #PF: error_code(0x0002) - not-present page + PGD 0 P4D 0 + Oops: Oops: 0002 [#1] PREEMPT SMP NOPTI + Tainted: [W]=WARN + RIP: 0010:vmemmap_set_pmd+0xff/0x230 + + vmemmap_populate_hugepages+0x176/0x180 + vmemmap_populate+0x34/0x80 + __populate_section_memmap+0x41/0x90 + sparse_add_section+0x121/0x3e0 + __add_pages+0xba/0x150 + add_pages+0x1d/0x70 + memremap_pages+0x3dc/0x810 + devm_memremap_pages+0x1c/0x60 + xe_devm_add+0x8b/0x100 [xe] + xe_tile_init_noalloc+0x6a/0x70 [xe] + xe_device_probe+0x48c/0x740 [xe] + [... snip ...] 
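
Putting this together with the {pgd,p4d}_populate_kernel() patch elsewhere in this queue: on an x86-64 kernel using 4-level paging, p4d_populate_kernel() effectively expands to the following; the wrapper function name is invented here purely to show the macro expansion.

/* Illustrative expansion of p4d_populate_kernel() on 4-level paging. */
static inline void p4d_populate_kernel_expanded(unsigned long addr,
						p4d_t *p4d, pud_t *pud)
{
	p4d_populate(&init_mm, p4d, pud);
	/*
	 * With !pgtable_l5_enabled(), ARCH_PAGE_TABLE_SYNC_MASK evaluates
	 * to PGTBL_P4D_MODIFIED, so the arch hook fires and propagates the
	 * new entry to every page table via sync_global_pgds().
	 */
	if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_P4D_MODIFIED)
		arch_sync_kernel_mappings(addr, addr);
}
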
+ +Link: https://lkml.kernel.org/r/20250818020206.4517-4-harry.yoo@oracle.com +Fixes: 8d400913c231 ("x86/vmemmap: handle unpopulated sub-pmd ranges") +Signed-off-by: Harry Yoo +Closes: https://lore.kernel.org/linux-mm/20250311114420.240341-1-gwan-gyeong.mun@intel.com [1] +Suggested-by: Dave Hansen +Acked-by: Kiryl Shutsemau +Reviewed-by: Mike Rapoport (Microsoft) +Reviewed-by: Lorenzo Stoakes +Acked-by: David Hildenbrand +Cc: Alexander Potapenko +Cc: Alistair Popple +Cc: Andrey Konovalov +Cc: Andrey Ryabinin +Cc: Andy Lutomirski +Cc: "Aneesh Kumar K.V" +Cc: Anshuman Khandual +Cc: Ard Biesheuvel +Cc: Arnd Bergmann +Cc: bibo mao +Cc: Borislav Betkov +Cc: Christoph Lameter (Ampere) +Cc: Dennis Zhou +Cc: Dev Jain +Cc: Dmitriy Vyukov +Cc: Ingo Molnar +Cc: Jane Chu +Cc: Joao Martins +Cc: Joerg Roedel +Cc: John Hubbard +Cc: Kevin Brodsky +Cc: Liam Howlett +Cc: Michal Hocko +Cc: Oscar Salvador +Cc: Peter Xu +Cc: Peter Zijlstra +Cc: Qi Zheng +Cc: Ryan Roberts +Cc: Suren Baghdasaryan +Cc: Tejun Heo +Cc: Thomas Gleinxer +Cc: Thomas Huth +Cc: "Uladzislau Rezki (Sony)" +Cc: Vincenzo Frascino +Cc: Vlastimil Babka +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/pgtable_64_types.h | 3 +++ + arch/x86/mm/init_64.c | 18 ++++++++++++++++++ + 2 files changed, 21 insertions(+) + +--- a/arch/x86/include/asm/pgtable_64_types.h ++++ b/arch/x86/include/asm/pgtable_64_types.h +@@ -36,6 +36,9 @@ static inline bool pgtable_l5_enabled(vo + #define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57) + #endif /* USE_EARLY_PGTABLE_L5 */ + ++#define ARCH_PAGE_TABLE_SYNC_MASK \ ++ (pgtable_l5_enabled() ? PGTBL_PGD_MODIFIED : PGTBL_P4D_MODIFIED) ++ + extern unsigned int pgdir_shift; + extern unsigned int ptrs_per_p4d; + +--- a/arch/x86/mm/init_64.c ++++ b/arch/x86/mm/init_64.c +@@ -224,6 +224,24 @@ static void sync_global_pgds(unsigned lo + } + + /* ++ * Make kernel mappings visible in all page tables in the system. ++ * This is necessary except when the init task populates kernel mappings ++ * during the boot process. In that case, all processes originating from ++ * the init task copies the kernel mappings, so there is no issue. ++ * Otherwise, missing synchronization could lead to kernel crashes due ++ * to missing page table entries for certain kernel mappings. ++ * ++ * Synchronization is performed at the top level, which is the PGD in ++ * 5-level paging systems. But in 4-level paging systems, however, ++ * pgd_populate() is a no-op, so synchronization is done at the P4D level. ++ * sync_global_pgds() handles this difference between paging levels. ++ */ ++void arch_sync_kernel_mappings(unsigned long start, unsigned long end) ++{ ++ sync_global_pgds(start, end); ++} ++ ++/* + * NOTE: This function is marked __ref because it calls __init function + * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. + */