git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.16-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Sat, 6 Sep 2025 19:07:11 +0000 (21:07 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Sat, 6 Sep 2025 19:07:11 +0000 (21:07 +0200)
added patches:
mm-fix-accounting-of-memmap-pages.patch
mm-fix-possible-deadlock-in-kmemleak.patch
mm-introduce-and-use-pgd-p4d-_populate_kernel.patch
mm-move-page-table-sync-declarations-to-linux-pgtable.h.patch
mm-slub-avoid-accessing-metadata-when-pointer-is-invalid-in-object_err.patch
mm-slub-avoid-wake-up-kswapd-in-set_track_prepare.patch
mm-userfaultfd-fix-kmap_local-lifo-ordering-for-config_highpte.patch
rust-mm-mark-vmanew-as-transparent.patch
x86-mm-64-define-arch_page_table_sync_mask-and-arch_sync_kernel_mappings.patch

queue-6.16/mm-fix-accounting-of-memmap-pages.patch [new file with mode: 0644]
queue-6.16/mm-fix-possible-deadlock-in-kmemleak.patch [new file with mode: 0644]
queue-6.16/mm-introduce-and-use-pgd-p4d-_populate_kernel.patch [new file with mode: 0644]
queue-6.16/mm-move-page-table-sync-declarations-to-linux-pgtable.h.patch [new file with mode: 0644]
queue-6.16/mm-slub-avoid-accessing-metadata-when-pointer-is-invalid-in-object_err.patch [new file with mode: 0644]
queue-6.16/mm-slub-avoid-wake-up-kswapd-in-set_track_prepare.patch [new file with mode: 0644]
queue-6.16/mm-userfaultfd-fix-kmap_local-lifo-ordering-for-config_highpte.patch [new file with mode: 0644]
queue-6.16/rust-mm-mark-vmanew-as-transparent.patch [new file with mode: 0644]
queue-6.16/series
queue-6.16/x86-mm-64-define-arch_page_table_sync_mask-and-arch_sync_kernel_mappings.patch [new file with mode: 0644]

diff --git a/queue-6.16/mm-fix-accounting-of-memmap-pages.patch b/queue-6.16/mm-fix-accounting-of-memmap-pages.patch
new file mode 100644 (file)
index 0000000..543cafb
--- /dev/null
@@ -0,0 +1,109 @@
+From c3576889d87b603cb66b417e08844a53c1077a37 Mon Sep 17 00:00:00 2001
+From: Sumanth Korikkar <sumanthk@linux.ibm.com>
+Date: Thu, 7 Aug 2025 20:35:45 +0200
+Subject: mm: fix accounting of memmap pages
+
+From: Sumanth Korikkar <sumanthk@linux.ibm.com>
+
+commit c3576889d87b603cb66b417e08844a53c1077a37 upstream.
+
+For !CONFIG_SPARSEMEM_VMEMMAP, memmap page accounting is currently done
+upfront in sparse_buffer_init().  However, sparse_buffer_alloc() may
+return NULL in a failure scenario.
+
+Also, memmap pages may be allocated either from the memblock allocator
+during early boot or from the buddy allocator.  When removed via
+arch_remove_memory(), accounting of memmap pages must reflect the original
+allocation source.
+
+To ensure correctness:
+* Account memmap pages after successful allocation in sparse_init_nid()
+  and section_activate().
+* Account memmap pages in section_deactivate() based on allocation
+  source.
+
+Link: https://lkml.kernel.org/r/20250807183545.1424509-1-sumanthk@linux.ibm.com
+Fixes: 15995a352474 ("mm: report per-page metadata information")
+Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
+Suggested-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
+Cc: Alexander Gordeev <agordeev@linux.ibm.com>
+Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
+Cc: Heiko Carstens <hca@linux.ibm.com>
+Cc: Vasily Gorbik <gor@linux.ibm.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/sparse-vmemmap.c |    5 -----
+ mm/sparse.c         |   15 +++++++++------
+ 2 files changed, 9 insertions(+), 11 deletions(-)
+
+--- a/mm/sparse-vmemmap.c
++++ b/mm/sparse-vmemmap.c
+@@ -578,11 +578,6 @@ struct page * __meminit __populate_secti
+       if (r < 0)
+               return NULL;
+-      if (system_state == SYSTEM_BOOTING)
+-              memmap_boot_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE));
+-      else
+-              memmap_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE));
+-
+       return pfn_to_page(pfn);
+ }
+--- a/mm/sparse.c
++++ b/mm/sparse.c
+@@ -454,9 +454,6 @@ static void __init sparse_buffer_init(un
+        */
+       sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
+       sparsemap_buf_end = sparsemap_buf + size;
+-#ifndef CONFIG_SPARSEMEM_VMEMMAP
+-      memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE));
+-#endif
+ }
+ static void __init sparse_buffer_fini(void)
+@@ -567,6 +564,8 @@ static void __init sparse_init_nid(int n
+                               sparse_buffer_fini();
+                               goto failed;
+                       }
++                      memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page),
++                                                         PAGE_SIZE));
+                       sparse_init_early_section(nid, map, pnum, 0);
+               }
+       }
+@@ -680,7 +679,6 @@ static void depopulate_section_memmap(un
+       unsigned long start = (unsigned long) pfn_to_page(pfn);
+       unsigned long end = start + nr_pages * sizeof(struct page);
+-      memmap_pages_add(-1L * (DIV_ROUND_UP(end - start, PAGE_SIZE)));
+       vmemmap_free(start, end, altmap);
+ }
+ static void free_map_bootmem(struct page *memmap)
+@@ -856,10 +854,14 @@ static void section_deactivate(unsigned
+        * The memmap of early sections is always fully populated. See
+        * section_activate() and pfn_valid() .
+        */
+-      if (!section_is_early)
++      if (!section_is_early) {
++              memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)));
+               depopulate_section_memmap(pfn, nr_pages, altmap);
+-      else if (memmap)
++      } else if (memmap) {
++              memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page),
++                                                        PAGE_SIZE)));
+               free_map_bootmem(memmap);
++      }
+       if (empty)
+               ms->section_mem_map = (unsigned long)NULL;
+@@ -904,6 +906,7 @@ static struct page * __meminit section_a
+               section_deactivate(pfn, nr_pages, altmap);
+               return ERR_PTR(-ENOMEM);
+       }
++      memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE));
+       return memmap;
+ }
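
For reference, a minimal userspace sketch of the per-section accounting arithmetic that the hunks above add to sparse_init_nid(), section_activate() and section_deactivate(); the constants are assumptions matching common x86_64 defaults (4 KiB pages, 64-byte struct page, 32768 pages per section), not values taken from the patch:

#include <stdio.h>

#define PAGE_SIZE          4096UL
#define STRUCT_PAGE_SIZE   64UL        /* assumed sizeof(struct page) */
#define PAGES_PER_SECTION  32768UL     /* assumed x86_64 default */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long bytes = PAGES_PER_SECTION * STRUCT_PAGE_SIZE;
        unsigned long memmap_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);

        /* Exactly this amount is added after a successful allocation and
         * subtracted again in section_deactivate(), via the boot or runtime
         * counter depending on which allocator provided the memmap. */
        printf("memmap pages per section: %lu\n", memmap_pages);
        return 0;
}
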
diff --git a/queue-6.16/mm-fix-possible-deadlock-in-kmemleak.patch b/queue-6.16/mm-fix-possible-deadlock-in-kmemleak.patch
new file mode 100644 (file)
index 0000000..e600bb7
--- /dev/null
@@ -0,0 +1,115 @@
+From c873ccbb2f8db46ad9b4a989ea924b6d8f19abf1 Mon Sep 17 00:00:00 2001
+From: Gu Bowen <gubowen5@huawei.com>
+Date: Fri, 22 Aug 2025 15:35:41 +0800
+Subject: mm: fix possible deadlock in kmemleak
+
+From: Gu Bowen <gubowen5@huawei.com>
+
+commit c873ccbb2f8db46ad9b4a989ea924b6d8f19abf1 upstream.
+
+There are some AA deadlock issues in kmemleak, similar to the situation
+reported by Breno [1].  The deadlock path is as follows:
+
+mem_pool_alloc()
+  -> raw_spin_lock_irqsave(&kmemleak_lock, flags);
+      -> pr_warn()
+          -> netconsole subsystem
+            -> netpoll
+                -> __alloc_skb
+                  -> __create_object
+                    -> raw_spin_lock_irqsave(&kmemleak_lock, flags);
+
+To solve this problem, switch to printk_safe mode before printing the
+warning message.  This redirects all printk()-s to a special per-CPU
+buffer, which is flushed later from a safe context (irq work), so the
+deadlock is avoided.  The proper API to use is
+printk_deferred_enter()/printk_deferred_exit() [2].  Another way is to
+place the warning print after the kmemleak_lock is released.
+
+Link: https://lkml.kernel.org/r/20250822073541.1886469-1-gubowen5@huawei.com
+Link: https://lore.kernel.org/all/20250731-kmemleak_lock-v1-1-728fd470198f@debian.org/#t [1]
+Link: https://lore.kernel.org/all/5ca375cd-4a20-4807-b897-68b289626550@redhat.com/ [2]
+Signed-off-by: Gu Bowen <gubowen5@huawei.com>
+Reviewed-by: Waiman Long <longman@redhat.com>
+Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
+Reviewed-by: Breno Leitao <leitao@debian.org>
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: John Ogness <john.ogness@linutronix.de>
+Cc: Lu Jialin <lujialin4@huawei.com>
+Cc: Petr Mladek <pmladek@suse.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/kmemleak.c |   27 ++++++++++++++++++++-------
+ 1 file changed, 20 insertions(+), 7 deletions(-)
+
+--- a/mm/kmemleak.c
++++ b/mm/kmemleak.c
+@@ -437,9 +437,15 @@ static struct kmemleak_object *__lookup_
+               else if (untagged_objp == untagged_ptr || alias)
+                       return object;
+               else {
++                      /*
++                       * Printk deferring due to the kmemleak_lock held.
++                       * This is done to avoid deadlock.
++                       */
++                      printk_deferred_enter();
+                       kmemleak_warn("Found object by alias at 0x%08lx\n",
+                                     ptr);
+                       dump_object_info(object);
++                      printk_deferred_exit();
+                       break;
+               }
+       }
+@@ -736,6 +742,11 @@ static int __link_object(struct kmemleak
+               else if (untagged_objp + parent->size <= untagged_ptr)
+                       link = &parent->rb_node.rb_right;
+               else {
++                      /*
++                       * Printk deferring due to the kmemleak_lock held.
++                       * This is done to avoid deadlock.
++                       */
++                      printk_deferred_enter();
+                       kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",
+                                     ptr);
+                       /*
+@@ -743,6 +754,7 @@ static int __link_object(struct kmemleak
+                        * be freed while the kmemleak_lock is held.
+                        */
+                       dump_object_info(parent);
++                      printk_deferred_exit();
+                       return -EEXIST;
+               }
+       }
+@@ -856,13 +868,8 @@ static void delete_object_part(unsigned
+       raw_spin_lock_irqsave(&kmemleak_lock, flags);
+       object = __find_and_remove_object(ptr, 1, objflags);
+-      if (!object) {
+-#ifdef DEBUG
+-              kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
+-                            ptr, size);
+-#endif
++      if (!object)
+               goto unlock;
+-      }
+       /*
+        * Create one or two objects that may result from the memory block
+@@ -882,8 +889,14 @@ static void delete_object_part(unsigned
+ unlock:
+       raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+-      if (object)
++      if (object) {
+               __delete_object(object);
++      } else {
++#ifdef DEBUG
++              kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
++                            ptr, size);
++#endif
++      }
+ out:
+       if (object_l)
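
For reference, a userspace analogy of the deferral pattern applied above; the pthread lock and message buffer are illustrative stand-ins, not the kernel's printk_deferred machinery:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t obj_lock = PTHREAD_MUTEX_INITIALIZER;
static char deferred_msg[128];

/* Record the warning only; emitting it right here could re-enter code
 * that itself needs obj_lock (the netconsole -> __alloc_skb ->
 * __create_object loop in the kernel case). */
static void warn_deferred(const char *msg)
{
        snprintf(deferred_msg, sizeof(deferred_msg), "%s", msg);
}

int main(void)
{
        pthread_mutex_lock(&obj_lock);
        warn_deferred("Found object by alias");
        pthread_mutex_unlock(&obj_lock);

        /* Flush from a context that no longer holds the lock. */
        if (deferred_msg[0])
                fprintf(stderr, "warning: %s\n", deferred_msg);
        return 0;
}
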
diff --git a/queue-6.16/mm-introduce-and-use-pgd-p4d-_populate_kernel.patch b/queue-6.16/mm-introduce-and-use-pgd-p4d-_populate_kernel.patch
new file mode 100644 (file)
index 0000000..1b30e71
--- /dev/null
@@ -0,0 +1,283 @@
+From f2d2f9598ebb0158a3fe17cda0106d7752e654a2 Mon Sep 17 00:00:00 2001
+From: Harry Yoo <harry.yoo@oracle.com>
+Date: Mon, 18 Aug 2025 11:02:05 +0900
+Subject: mm: introduce and use {pgd,p4d}_populate_kernel()
+
+From: Harry Yoo <harry.yoo@oracle.com>
+
+commit f2d2f9598ebb0158a3fe17cda0106d7752e654a2 upstream.
+
+Introduce and use {pgd,p4d}_populate_kernel() in core MM code when
+populating PGD and P4D entries for the kernel address space.  These
+helpers ensure proper synchronization of page tables when updating the
+kernel portion of top-level page tables.
+
+Until now, the kernel has relied on each architecture to handle
+synchronization of top-level page tables in an ad-hoc manner.  For
+example, see commit 9b861528a801 ("x86-64, mem: Update all PGDs for direct
+mapping and vmemmap mapping changes").
+
+However, this approach has proven fragile for following reasons:
+
+  1) It is easy to forget to perform the necessary page table
+     synchronization when introducing new changes.
+     For instance, commit 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory
+     savings for compound devmaps") overlooked the need to synchronize
+     page tables for the vmemmap area.
+
+  2) It is also easy to overlook that the vmemmap and direct mapping areas
+     must not be accessed before explicit page table synchronization.
+     For example, commit 8d400913c231 ("x86/vmemmap: handle unpopulated
+     sub-pmd ranges") caused crashes by accessing the vmemmap area
+     before calling sync_global_pgds().
+
+To address this, as suggested by Dave Hansen, introduce _kernel() variants
+of the page table population helpers, which invoke architecture-specific
+hooks to properly synchronize page tables.  These are introduced in a new
+header file, include/linux/pgalloc.h, so they can be called from common
+code.
+
+They reuse existing infrastructure for vmalloc and ioremap.
+Synchronization requirements are determined by ARCH_PAGE_TABLE_SYNC_MASK,
+and the actual synchronization is performed by
+arch_sync_kernel_mappings().
+
+This change currently targets only x86_64, so only PGD and P4D level
+helpers are introduced.  Currently, these helpers are no-ops since no
+architecture sets PGTBL_{PGD,P4D}_MODIFIED in ARCH_PAGE_TABLE_SYNC_MASK.
+
+In theory, PUD and PMD level helpers can be added later if needed by other
+architectures.  For now, 32-bit architectures (x86-32 and arm) only handle
+PGTBL_PMD_MODIFIED, so p*d_populate_kernel() will never affect them unless
+we introduce a PMD level helper.
+
+[harry.yoo@oracle.com: fix KASAN build error due to p*d_populate_kernel()]
+  Link: https://lkml.kernel.org/r/20250822020727.202749-1-harry.yoo@oracle.com
+Link: https://lkml.kernel.org/r/20250818020206.4517-3-harry.yoo@oracle.com
+Fixes: 8d400913c231 ("x86/vmemmap: handle unpopulated sub-pmd ranges")
+Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
+Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
+Acked-by: Kiryl Shutsemau <kas@kernel.org>
+Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andrey Konovalov <andreyknvl@gmail.com>
+Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
+Cc: Anshuman Khandual <anshuman.khandual@arm.com>
+Cc: Ard Biesheuvel <ardb@kernel.org>
+Cc: Arnd Bergmann <arnd@arndb.de>
+Cc: bibo mao <maobibo@loongson.cn>
+Cc: Borislav Betkov <bp@alien8.de>
+Cc: Christoph Lameter (Ampere) <cl@gentwo.org>
+Cc: Dennis Zhou <dennis@kernel.org>
+Cc: Dev Jain <dev.jain@arm.com>
+Cc: Dmitriy Vyukov <dvyukov@google.com>
+Cc: Gwan-gyeong Mun <gwan-gyeong.mun@intel.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jane Chu <jane.chu@oracle.com>
+Cc: Joao Martins <joao.m.martins@oracle.com>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Kevin Brodsky <kevin.brodsky@arm.com>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Qi Zheng <zhengqi.arch@bytedance.com>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Thomas Gleinxer <tglx@linutronix.de>
+Cc: Thomas Huth <thuth@redhat.com>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/pgalloc.h |   29 +++++++++++++++++++++++++++++
+ include/linux/pgtable.h |   13 +++++++------
+ mm/kasan/init.c         |   12 ++++++------
+ mm/percpu.c             |    6 +++---
+ mm/sparse-vmemmap.c     |    6 +++---
+ 5 files changed, 48 insertions(+), 18 deletions(-)
+ create mode 100644 include/linux/pgalloc.h
+
+--- /dev/null
++++ b/include/linux/pgalloc.h
+@@ -0,0 +1,29 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++#ifndef _LINUX_PGALLOC_H
++#define _LINUX_PGALLOC_H
++
++#include <linux/pgtable.h>
++#include <asm/pgalloc.h>
++
++/*
++ * {pgd,p4d}_populate_kernel() are defined as macros to allow
++ * compile-time optimization based on the configured page table levels.
++ * Without this, linking may fail because callers (e.g., KASAN) may rely
++ * on calls to these functions being optimized away when passing symbols
++ * that exist only for certain page table levels.
++ */
++#define pgd_populate_kernel(addr, pgd, p4d)                           \
++      do {                                                            \
++              pgd_populate(&init_mm, pgd, p4d);                       \
++              if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_PGD_MODIFIED)     \
++                      arch_sync_kernel_mappings(addr, addr);          \
++      } while (0)
++
++#define p4d_populate_kernel(addr, p4d, pud)                           \
++      do {                                                            \
++              p4d_populate(&init_mm, p4d, pud);                       \
++              if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_P4D_MODIFIED)     \
++                      arch_sync_kernel_mappings(addr, addr);          \
++      } while (0)
++
++#endif /* _LINUX_PGALLOC_H */
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -1697,8 +1697,8 @@ static inline int pmd_protnone(pmd_t pmd
+ /*
+  * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
+- * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
+- * needs to be called.
++ * and let generic vmalloc, ioremap and page table update code know when
++ * arch_sync_kernel_mappings() needs to be called.
+  */
+ #ifndef ARCH_PAGE_TABLE_SYNC_MASK
+ #define ARCH_PAGE_TABLE_SYNC_MASK 0
+@@ -1831,10 +1831,11 @@ static inline bool arch_has_pfn_modify_c
+ /*
+  * Page Table Modification bits for pgtbl_mod_mask.
+  *
+- * These are used by the p?d_alloc_track*() set of functions an in the generic
+- * vmalloc/ioremap code to track at which page-table levels entries have been
+- * modified. Based on that the code can better decide when vmalloc and ioremap
+- * mapping changes need to be synchronized to other page-tables in the system.
++ * These are used by the p?d_alloc_track*() and p*d_populate_kernel()
++ * functions in the generic vmalloc, ioremap and page table update code
++ * to track at which page-table levels entries have been modified.
++ * Based on that the code can better decide when page table changes need
++ * to be synchronized to other page-tables in the system.
+  */
+ #define               __PGTBL_PGD_MODIFIED    0
+ #define               __PGTBL_P4D_MODIFIED    1
+--- a/mm/kasan/init.c
++++ b/mm/kasan/init.c
+@@ -13,9 +13,9 @@
+ #include <linux/mm.h>
+ #include <linux/pfn.h>
+ #include <linux/slab.h>
++#include <linux/pgalloc.h>
+ #include <asm/page.h>
+-#include <asm/pgalloc.h>
+ #include "kasan.h"
+@@ -191,7 +191,7 @@ static int __ref zero_p4d_populate(pgd_t
+                       pud_t *pud;
+                       pmd_t *pmd;
+-                      p4d_populate(&init_mm, p4d,
++                      p4d_populate_kernel(addr, p4d,
+                                       lm_alias(kasan_early_shadow_pud));
+                       pud = pud_offset(p4d, addr);
+                       pud_populate(&init_mm, pud,
+@@ -212,7 +212,7 @@ static int __ref zero_p4d_populate(pgd_t
+                       } else {
+                               p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
+                               pud_init(p);
+-                              p4d_populate(&init_mm, p4d, p);
++                              p4d_populate_kernel(addr, p4d, p);
+                       }
+               }
+               zero_pud_populate(p4d, addr, next);
+@@ -251,10 +251,10 @@ int __ref kasan_populate_early_shadow(co
+                        * puds,pmds, so pgd_populate(), pud_populate()
+                        * is noops.
+                        */
+-                      pgd_populate(&init_mm, pgd,
++                      pgd_populate_kernel(addr, pgd,
+                                       lm_alias(kasan_early_shadow_p4d));
+                       p4d = p4d_offset(pgd, addr);
+-                      p4d_populate(&init_mm, p4d,
++                      p4d_populate_kernel(addr, p4d,
+                                       lm_alias(kasan_early_shadow_pud));
+                       pud = pud_offset(p4d, addr);
+                       pud_populate(&init_mm, pud,
+@@ -273,7 +273,7 @@ int __ref kasan_populate_early_shadow(co
+                               if (!p)
+                                       return -ENOMEM;
+                       } else {
+-                              pgd_populate(&init_mm, pgd,
++                              pgd_populate_kernel(addr, pgd,
+                                       early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+                       }
+               }
+--- a/mm/percpu.c
++++ b/mm/percpu.c
+@@ -3108,7 +3108,7 @@ out_free:
+ #endif /* BUILD_EMBED_FIRST_CHUNK */
+ #ifdef BUILD_PAGE_FIRST_CHUNK
+-#include <asm/pgalloc.h>
++#include <linux/pgalloc.h>
+ #ifndef P4D_TABLE_SIZE
+ #define P4D_TABLE_SIZE PAGE_SIZE
+@@ -3134,13 +3134,13 @@ void __init __weak pcpu_populate_pte(uns
+       if (pgd_none(*pgd)) {
+               p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
+-              pgd_populate(&init_mm, pgd, p4d);
++              pgd_populate_kernel(addr, pgd, p4d);
+       }
+       p4d = p4d_offset(pgd, addr);
+       if (p4d_none(*p4d)) {
+               pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
+-              p4d_populate(&init_mm, p4d, pud);
++              p4d_populate_kernel(addr, p4d, pud);
+       }
+       pud = pud_offset(p4d, addr);
+--- a/mm/sparse-vmemmap.c
++++ b/mm/sparse-vmemmap.c
+@@ -27,9 +27,9 @@
+ #include <linux/spinlock.h>
+ #include <linux/vmalloc.h>
+ #include <linux/sched.h>
++#include <linux/pgalloc.h>
+ #include <asm/dma.h>
+-#include <asm/pgalloc.h>
+ #include <asm/tlbflush.h>
+ #include "hugetlb_vmemmap.h"
+@@ -229,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(p
+               if (!p)
+                       return NULL;
+               pud_init(p);
+-              p4d_populate(&init_mm, p4d, p);
++              p4d_populate_kernel(addr, p4d, p);
+       }
+       return p4d;
+ }
+@@ -241,7 +241,7 @@ pgd_t * __meminit vmemmap_pgd_populate(u
+               void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
+               if (!p)
+                       return NULL;
+-              pgd_populate(&init_mm, pgd, p);
++              pgd_populate_kernel(addr, pgd, p);
+       }
+       return pgd;
+ }
diff --git a/queue-6.16/mm-move-page-table-sync-declarations-to-linux-pgtable.h.patch b/queue-6.16/mm-move-page-table-sync-declarations-to-linux-pgtable.h.patch
new file mode 100644 (file)
index 0000000..4c19cad
--- /dev/null
@@ -0,0 +1,216 @@
+From 7cc183f2e67d19b03ee5c13a6664b8c6cc37ff9d Mon Sep 17 00:00:00 2001
+From: Harry Yoo <harry.yoo@oracle.com>
+Date: Mon, 18 Aug 2025 11:02:04 +0900
+Subject: mm: move page table sync declarations to linux/pgtable.h
+
+From: Harry Yoo <harry.yoo@oracle.com>
+
+commit 7cc183f2e67d19b03ee5c13a6664b8c6cc37ff9d upstream.
+
+During our internal testing, we started observing intermittent boot
+failures when the machine uses 4-level paging and has a large amount of
+persistent memory:
+
+  BUG: unable to handle page fault for address: ffffe70000000034
+  #PF: supervisor write access in kernel mode
+  #PF: error_code(0x0002) - not-present page
+  PGD 0 P4D 0
+  Oops: 0002 [#1] SMP NOPTI
+  RIP: 0010:__init_single_page+0x9/0x6d
+  Call Trace:
+   <TASK>
+   __init_zone_device_page+0x17/0x5d
+   memmap_init_zone_device+0x154/0x1bb
+   pagemap_range+0x2e0/0x40f
+   memremap_pages+0x10b/0x2f0
+   devm_memremap_pages+0x1e/0x60
+   dev_dax_probe+0xce/0x2ec [device_dax]
+   dax_bus_probe+0x6d/0xc9
+   [... snip ...]
+   </TASK>
+
+It turns out that the kernel panics while initializing vmemmap (struct
+page array) when the vmemmap region spans two PGD entries, because the new
+PGD entry is only installed in init_mm.pgd, but not in the page tables of
+other tasks.
+
+And looking at __populate_section_memmap():
+  if (vmemmap_can_optimize(altmap, pgmap))
+          // does not sync top level page tables
+          r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
+  else
+          // sync top level page tables in x86
+          r = vmemmap_populate(start, end, nid, altmap);
+
+In the normal path, vmemmap_populate() in arch/x86/mm/init_64.c
+synchronizes the top level page table (See commit 9b861528a801 ("x86-64,
+mem: Update all PGDs for direct mapping and vmemmap mapping changes")) so
+that all tasks in the system can see the new vmemmap area.
+
+However, when vmemmap_can_optimize() returns true, the optimized path
+skips synchronization of top-level page tables.  This is because
+vmemmap_populate_compound_pages() is implemented in core MM code, which
+does not handle synchronization of the top-level page tables.  Instead,
+the core MM has historically relied on each architecture to perform this
+synchronization manually.
+
+We're not the first party to encounter a crash caused by not-sync'd top
+level page tables: earlier this year, Gwan-gyeong Mun attempted to address
+the issue [1] [2] after hitting a kernel panic when x86 code accessed the
+vmemmap area before the corresponding top-level entries were synced.  At
+that time, the issue was believed to be triggered only when struct page
+was enlarged for debugging purposes, and the patch did not get further
+updates.
+
+It turns out that current approach of relying on each arch to handle the
+page table sync manually is fragile because 1) it's easy to forget to sync
+the top level page table, and 2) it's also easy to overlook that the
+kernel should not access the vmemmap and direct mapping areas before the
+sync.
+
+# The solution: Make page table sync code more robust and harder to miss
+
+To address this, Dave Hansen suggested [3] [4] introducing
+{pgd,p4d}_populate_kernel() for updating kernel portion of the page tables
+and allow each architecture to explicitly perform synchronization when
+installing top-level entries.  With this approach, we no longer need to
+worry about missing the sync step, reducing the risk of future
+regressions.
+
+The new interface reuses existing ARCH_PAGE_TABLE_SYNC_MASK,
+PGTBL_P*D_MODIFIED and arch_sync_kernel_mappings() facility used by
+vmalloc and ioremap to synchronize page tables.
+
+pgd_populate_kernel() looks like this:
+static inline void pgd_populate_kernel(unsigned long addr, pgd_t *pgd,
+                                       p4d_t *p4d)
+{
+        pgd_populate(&init_mm, pgd, p4d);
+        if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_PGD_MODIFIED)
+                arch_sync_kernel_mappings(addr, addr);
+}
+
+It is worth noting that vmalloc() and apply_to_range() carefully
+synchronize page tables by calling p*d_alloc_track() and
+arch_sync_kernel_mappings(), and thus they are not affected by this patch
+series.
+
+This series was hugely inspired by Dave Hansen's suggestion and hence
+added Suggested-by: Dave Hansen.
+
+Cc stable because lack of this series opens the door to intermittent
+boot failures.
+
+
+This patch (of 3):
+
+Move ARCH_PAGE_TABLE_SYNC_MASK and arch_sync_kernel_mappings() to
+linux/pgtable.h so that they can be used outside of vmalloc and ioremap.
+
+Link: https://lkml.kernel.org/r/20250818020206.4517-1-harry.yoo@oracle.com
+Link: https://lkml.kernel.org/r/20250818020206.4517-2-harry.yoo@oracle.com
+Link: https://lore.kernel.org/linux-mm/20250220064105.808339-1-gwan-gyeong.mun@intel.com [1]
+Link: https://lore.kernel.org/linux-mm/20250311114420.240341-1-gwan-gyeong.mun@intel.com [2]
+Link: https://lore.kernel.org/linux-mm/d1da214c-53d3-45ac-a8b6-51821c5416e4@intel.com [3]
+Link: https://lore.kernel.org/linux-mm/4d800744-7b88-41aa-9979-b245e8bf794b@intel.com  [4]
+Fixes: 8d400913c231 ("x86/vmemmap: handle unpopulated sub-pmd ranges")
+Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
+Acked-by: Kiryl Shutsemau <kas@kernel.org>
+Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Reviewed-by: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andrey Konovalov <andreyknvl@gmail.com>
+Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
+Cc: Anshuman Khandual <anshuman.khandual@arm.com>
+Cc: Ard Biesheuvel <ardb@kernel.org>
+Cc: Arnd Bergmann <arnd@arndb.de>
+Cc: bibo mao <maobibo@loongson.cn>
+Cc: Borislav Betkov <bp@alien8.de>
+Cc: Christoph Lameter (Ampere) <cl@gentwo.org>
+Cc: Dennis Zhou <dennis@kernel.org>
+Cc: Dev Jain <dev.jain@arm.com>
+Cc: Dmitriy Vyukov <dvyukov@google.com>
+Cc: Gwan-gyeong Mun <gwan-gyeong.mun@intel.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jane Chu <jane.chu@oracle.com>
+Cc: Joao Martins <joao.m.martins@oracle.com>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Kevin Brodsky <kevin.brodsky@arm.com>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Qi Zheng <zhengqi.arch@bytedance.com>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Thomas Gleinxer <tglx@linutronix.de>
+Cc: Thomas Huth <thuth@redhat.com>
+Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/pgtable.h |   16 ++++++++++++++++
+ include/linux/vmalloc.h |   16 ----------------
+ 2 files changed, 16 insertions(+), 16 deletions(-)
+
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -1695,6 +1695,22 @@ static inline int pmd_protnone(pmd_t pmd
+ }
+ #endif /* CONFIG_NUMA_BALANCING */
++/*
++ * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
++ * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
++ * needs to be called.
++ */
++#ifndef ARCH_PAGE_TABLE_SYNC_MASK
++#define ARCH_PAGE_TABLE_SYNC_MASK 0
++#endif
++
++/*
++ * There is no default implementation for arch_sync_kernel_mappings(). It is
++ * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK
++ * is 0.
++ */
++void arch_sync_kernel_mappings(unsigned long start, unsigned long end);
++
+ #endif /* CONFIG_MMU */
+ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+--- a/include/linux/vmalloc.h
++++ b/include/linux/vmalloc.h
+@@ -220,22 +220,6 @@ int vmap_pages_range(unsigned long addr,
+                    struct page **pages, unsigned int page_shift);
+ /*
+- * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
+- * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
+- * needs to be called.
+- */
+-#ifndef ARCH_PAGE_TABLE_SYNC_MASK
+-#define ARCH_PAGE_TABLE_SYNC_MASK 0
+-#endif
+-
+-/*
+- * There is no default implementation for arch_sync_kernel_mappings(). It is
+- * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK
+- * is 0.
+- */
+-void arch_sync_kernel_mappings(unsigned long start, unsigned long end);
+-
+-/*
+  *    Lowlevel-APIs (not for driver use!)
+  */
diff --git a/queue-6.16/mm-slub-avoid-accessing-metadata-when-pointer-is-invalid-in-object_err.patch b/queue-6.16/mm-slub-avoid-accessing-metadata-when-pointer-is-invalid-in-object_err.patch
new file mode 100644 (file)
index 0000000..76748a1
--- /dev/null
@@ -0,0 +1,50 @@
+From b4efccec8d06ceb10a7d34d7b1c449c569d53770 Mon Sep 17 00:00:00 2001
+From: Li Qiong <liqiong@nfschina.com>
+Date: Mon, 4 Aug 2025 10:57:59 +0800
+Subject: mm/slub: avoid accessing metadata when pointer is invalid in object_err()
+
+From: Li Qiong <liqiong@nfschina.com>
+
+commit b4efccec8d06ceb10a7d34d7b1c449c569d53770 upstream.
+
+object_err() reports details of an object for further debugging, such as
+the freelist pointer, redzone, etc. However, if the pointer is invalid,
+attempting to access object metadata can lead to a crash since it does
+not point to a valid object.
+
+One known path to the crash is when alloc_consistency_checks()
+determines the pointer to the allocated object is invalid because of a
+freelist corruption, and calls object_err() to report it. The debug code
+should report and handle the corruption gracefully and not crash in the
+process.
+
+In case the pointer is NULL or check_valid_pointer() returns false for
+the pointer, only print the pointer value and skip accessing metadata.
+
+Fixes: 81819f0fc828 ("SLUB core")
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Li Qiong <liqiong@nfschina.com>
+Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
+Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/slub.c |    7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -1104,7 +1104,12 @@ static void object_err(struct kmem_cache
+               return;
+       slab_bug(s, reason);
+-      print_trailer(s, slab, object);
++      if (!object || !check_valid_pointer(s, slab, object)) {
++              print_slab_info(slab);
++              pr_err("Invalid pointer 0x%p\n", object);
++      } else {
++              print_trailer(s, slab, object);
++      }
+       add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+       WARN_ON(1);
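
For reference, a simplified userspace sketch of the defensive path added above; pointer_is_valid() and the fake slab are illustrative stand-ins for check_valid_pointer() and struct slab, not the real implementations:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct fake_slab { char payload[4096]; };

/* Stand-in for check_valid_pointer(): the object must lie inside the
 * slab's payload area before any metadata behind it may be read. */
static int pointer_is_valid(const struct fake_slab *slab, const void *obj)
{
        uintptr_t base = (uintptr_t)slab->payload;
        uintptr_t p = (uintptr_t)obj;

        return p >= base && p < base + sizeof(slab->payload);
}

static void report_object(const struct fake_slab *slab, const void *obj)
{
        if (!obj || !pointer_is_valid(slab, obj)) {
                fprintf(stderr, "Invalid pointer %p\n", obj);
                return;         /* never dereference metadata behind a bad pointer */
        }
        fprintf(stderr, "object %p is inside the slab, dumping trailer...\n", obj);
}

int main(void)
{
        struct fake_slab *slab = malloc(sizeof(*slab));

        report_object(slab, slab->payload + 128);   /* valid object */
        report_object(slab, (void *)0xdeadbeefUL);  /* corrupted freelist pointer */
        free(slab);
        return 0;
}
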
diff --git a/queue-6.16/mm-slub-avoid-wake-up-kswapd-in-set_track_prepare.patch b/queue-6.16/mm-slub-avoid-wake-up-kswapd-in-set_track_prepare.patch
new file mode 100644 (file)
index 0000000..d8a3d62
--- /dev/null
@@ -0,0 +1,164 @@
+From 850470a8413a8a78e772c4f6bd9fe81ec6bd5b0f Mon Sep 17 00:00:00 2001
+From: yangshiguang <yangshiguang@xiaomi.com>
+Date: Sat, 30 Aug 2025 10:09:46 +0800
+Subject: mm: slub: avoid wake up kswapd in set_track_prepare
+
+From: yangshiguang <yangshiguang@xiaomi.com>
+
+commit 850470a8413a8a78e772c4f6bd9fe81ec6bd5b0f upstream.
+
+set_track_prepare() can incur lock recursion.
+The issue is that it is called from hrtimer_start_range_ns
+holding the per_cpu(hrtimer_bases)[n].lock; but with
+CONFIG_DEBUG_OBJECTS_TIMERS enabled, it may wake up kswapd in
+set_track_prepare() and try to take the same lock again.
+
+Avoid deadlock caused by implicitly waking up kswapd by passing in
+allocation flags, which do not contain __GFP_KSWAPD_RECLAIM in the
+debug_objects_fill_pool() case. Inside stack depot they are processed by
+gfp_nested_mask().
+Since ___slab_alloc() has preemption disabled, we mask out
+__GFP_DIRECT_RECLAIM from the flags there.
+
+The oops looks something like:
+
+BUG: spinlock recursion on CPU#3, swapper/3/0
+ lock: 0xffffff8a4bf29c80, .magic: dead4ead, .owner: swapper/3/0, .owner_cpu: 3
+Hardware name: Qualcomm Technologies, Inc. Popsicle based on SM8850 (DT)
+Call trace:
+spin_bug+0x0
+_raw_spin_lock_irqsave+0x80
+hrtimer_try_to_cancel+0x94
+task_contending+0x10c
+enqueue_dl_entity+0x2a4
+dl_server_start+0x74
+enqueue_task_fair+0x568
+enqueue_task+0xac
+do_activate_task+0x14c
+ttwu_do_activate+0xcc
+try_to_wake_up+0x6c8
+default_wake_function+0x20
+autoremove_wake_function+0x1c
+__wake_up+0xac
+wakeup_kswapd+0x19c
+wake_all_kswapds+0x78
+__alloc_pages_slowpath+0x1ac
+__alloc_pages_noprof+0x298
+stack_depot_save_flags+0x6b0
+stack_depot_save+0x14
+set_track_prepare+0x5c
+___slab_alloc+0xccc
+__kmalloc_cache_noprof+0x470
+__set_page_owner+0x2bc
+post_alloc_hook[jt]+0x1b8
+prep_new_page+0x28
+get_page_from_freelist+0x1edc
+__alloc_pages_noprof+0x13c
+alloc_slab_page+0x244
+allocate_slab+0x7c
+___slab_alloc+0x8e8
+kmem_cache_alloc_noprof+0x450
+debug_objects_fill_pool+0x22c
+debug_object_activate+0x40
+enqueue_hrtimer[jt]+0xdc
+hrtimer_start_range_ns+0x5f8
+...
+
+Signed-off-by: yangshiguang <yangshiguang@xiaomi.com>
+Fixes: 5cf909c553e9 ("mm/slub: use stackdepot to save stack trace in objects")
+Cc: stable@vger.kernel.org
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/slub.c |   30 ++++++++++++++++++++----------
+ 1 file changed, 20 insertions(+), 10 deletions(-)
+
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -926,19 +926,19 @@ static struct track *get_track(struct km
+ }
+ #ifdef CONFIG_STACKDEPOT
+-static noinline depot_stack_handle_t set_track_prepare(void)
++static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
+ {
+       depot_stack_handle_t handle;
+       unsigned long entries[TRACK_ADDRS_COUNT];
+       unsigned int nr_entries;
+       nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
+-      handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
++      handle = stack_depot_save(entries, nr_entries, gfp_flags);
+       return handle;
+ }
+ #else
+-static inline depot_stack_handle_t set_track_prepare(void)
++static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
+ {
+       return 0;
+ }
+@@ -960,9 +960,9 @@ static void set_track_update(struct kmem
+ }
+ static __always_inline void set_track(struct kmem_cache *s, void *object,
+-                                    enum track_item alloc, unsigned long addr)
++                                    enum track_item alloc, unsigned long addr, gfp_t gfp_flags)
+ {
+-      depot_stack_handle_t handle = set_track_prepare();
++      depot_stack_handle_t handle = set_track_prepare(gfp_flags);
+       set_track_update(s, object, alloc, addr, handle);
+ }
+@@ -1890,9 +1890,9 @@ static inline bool free_debug_processing
+ static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
+ static inline int check_object(struct kmem_cache *s, struct slab *slab,
+                       void *object, u8 val) { return 1; }
+-static inline depot_stack_handle_t set_track_prepare(void) { return 0; }
++static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; }
+ static inline void set_track(struct kmem_cache *s, void *object,
+-                           enum track_item alloc, unsigned long addr) {}
++                           enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {}
+ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
+                                       struct slab *slab) {}
+ static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
+@@ -3849,9 +3849,14 @@ new_objects:
+                        * For debug caches here we had to go through
+                        * alloc_single_from_partial() so just store the
+                        * tracking info and return the object.
++                       *
++                       * Due to disabled preemption we need to disallow
++                       * blocking. The flags are further adjusted by
++                       * gfp_nested_mask() in stack_depot itself.
+                        */
+                       if (s->flags & SLAB_STORE_USER)
+-                              set_track(s, freelist, TRACK_ALLOC, addr);
++                              set_track(s, freelist, TRACK_ALLOC, addr,
++                                        gfpflags & ~(__GFP_DIRECT_RECLAIM));
+                       return freelist;
+               }
+@@ -3883,7 +3888,8 @@ new_objects:
+                       goto new_objects;
+               if (s->flags & SLAB_STORE_USER)
+-                      set_track(s, freelist, TRACK_ALLOC, addr);
++                      set_track(s, freelist, TRACK_ALLOC, addr,
++                                gfpflags & ~(__GFP_DIRECT_RECLAIM));
+               return freelist;
+       }
+@@ -4394,8 +4400,12 @@ static noinline void free_to_partial_lis
+       unsigned long flags;
+       depot_stack_handle_t handle = 0;
++      /*
++       * We cannot use GFP_NOWAIT as there are callsites where waking up
++       * kswapd could deadlock
++       */
+       if (s->flags & SLAB_STORE_USER)
+-              handle = set_track_prepare();
++              handle = set_track_prepare(__GFP_NOWARN);
+       spin_lock_irqsave(&n->list_lock, flags);
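
For reference, a userspace sketch of the flag-masking idea used above; the FAKE_GFP_* bit values are assumptions chosen for illustration, not the kernel's real GFP encoding:

#include <stdio.h>

#define FAKE_GFP_KSWAPD_RECLAIM  0x1u
#define FAKE_GFP_DIRECT_RECLAIM  0x2u
#define FAKE_GFP_NOWARN          0x4u
#define FAKE_GFP_KERNEL          (FAKE_GFP_KSWAPD_RECLAIM | FAKE_GFP_DIRECT_RECLAIM)

/* Preemption is disabled on the ___slab_alloc() path, so the bit that
 * allows blocking must be stripped before the flags reach stack depot. */
static unsigned int track_flags(unsigned int caller_flags)
{
        return caller_flags & ~FAKE_GFP_DIRECT_RECLAIM;
}

int main(void)
{
        unsigned int gfp = FAKE_GFP_KERNEL;

        printf("caller flags: 0x%x, flags passed down: 0x%x\n",
               gfp, track_flags(gfp));
        printf("free path uses: 0x%x (no kswapd wakeup, no warnings)\n",
               (unsigned int)FAKE_GFP_NOWARN);
        return 0;
}
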
diff --git a/queue-6.16/mm-userfaultfd-fix-kmap_local-lifo-ordering-for-config_highpte.patch b/queue-6.16/mm-userfaultfd-fix-kmap_local-lifo-ordering-for-config_highpte.patch
new file mode 100644 (file)
index 0000000..c0da888
--- /dev/null
@@ -0,0 +1,61 @@
+From 9614d8bee66387501f48718fa306e17f2aa3f2f3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 31 Jul 2025 10:44:31 -0400
+Subject: mm/userfaultfd: fix kmap_local LIFO ordering for CONFIG_HIGHPTE
+
+From: Sasha Levin <sashal@kernel.org>
+
+commit 9614d8bee66387501f48718fa306e17f2aa3f2f3 upstream.
+
+With CONFIG_HIGHPTE on 32-bit ARM, move_pages_pte() maps PTE pages using
+kmap_local_page(), which requires unmapping in Last-In-First-Out order.
+
+The current code maps dst_pte first, then src_pte, but unmaps them in the
+same order (dst_pte, src_pte), violating the LIFO requirement.  This
+causes the warning in kunmap_local_indexed():
+
+  WARNING: CPU: 0 PID: 604 at mm/highmem.c:622 kunmap_local_indexed+0x178/0x17c
+  addr != __fix_to_virt(FIX_KMAP_BEGIN + idx)
+
+Fix this by reversing the unmap order to respect LIFO ordering.
+
+This issue follows the same pattern as similar fixes:
+- commit eca6828403b8 ("crypto: skcipher - fix mismatch between mapping and unmapping order")
+- commit 8cf57c6df818 ("nilfs2: eliminate staggered calls to kunmap in nilfs_rename")
+
+Both of which addressed the same fundamental requirement that kmap_local
+operations must follow LIFO ordering.
+
+Link: https://lkml.kernel.org/r/20250731144431.773923-1-sashal@kernel.org
+Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Acked-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Suren Baghdasaryan <surenb@google.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/userfaultfd.c |    9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/mm/userfaultfd.c
++++ b/mm/userfaultfd.c
+@@ -1453,10 +1453,15 @@ out:
+               folio_unlock(src_folio);
+               folio_put(src_folio);
+       }
+-      if (dst_pte)
+-              pte_unmap(dst_pte);
++      /*
++       * Unmap in reverse order (LIFO) to maintain proper kmap_local
++       * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte
++       * first, then src_pte, so we must unmap src_pte first, then dst_pte.
++       */
+       if (src_pte)
+               pte_unmap(src_pte);
++      if (dst_pte)
++              pte_unmap(dst_pte);
+       mmu_notifier_invalidate_range_end(&range);
+       if (si)
+               put_swap_device(si);
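
For reference, a toy userspace model of the LIFO constraint that kmap_local_page() imposes; the stack counter is an illustrative stand-in for the per-CPU kmap index bookkeeping:

#include <assert.h>
#include <stdio.h>

static int kmap_top;    /* next free slot, grows on each map */

static int toy_kmap(const char *name)
{
        printf("map   %s -> slot %d\n", name, kmap_top);
        return kmap_top++;
}

static void toy_kunmap(const char *name, int slot)
{
        /* kunmap_local_indexed() warns when this does not hold. */
        assert(slot == kmap_top - 1 && "kmap_local unmapped out of LIFO order");
        kmap_top--;
        printf("unmap %s <- slot %d\n", name, slot);
}

int main(void)
{
        int dst = toy_kmap("dst_pte");      /* mapped first */
        int src = toy_kmap("src_pte");      /* mapped second */

        toy_kunmap("src_pte", src);         /* must be unmapped first (LIFO) */
        toy_kunmap("dst_pte", dst);
        return 0;
}
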
diff --git a/queue-6.16/rust-mm-mark-vmanew-as-transparent.patch b/queue-6.16/rust-mm-mark-vmanew-as-transparent.patch
new file mode 100644 (file)
index 0000000..3277959
--- /dev/null
@@ -0,0 +1,54 @@
+From 5cc5e030bce2ec97ae5cdb2c1b94a98b1047b3fa Mon Sep 17 00:00:00 2001
+From: Baptiste Lepers <baptiste.lepers@gmail.com>
+Date: Tue, 12 Aug 2025 15:26:56 +0200
+Subject: rust: mm: mark VmaNew as transparent
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Baptiste Lepers <baptiste.lepers@gmail.com>
+
+commit 5cc5e030bce2ec97ae5cdb2c1b94a98b1047b3fa upstream.
+
+Unsafe code in VmaNew's methods assumes that the type has the same layout
+as the inner `bindings::vm_area_struct`.  This is not guaranteed by the
+default struct representation in Rust, but requires specifying the
+`transparent` representation.
+
+Link: https://lkml.kernel.org/r/20250812132712.61007-1-baptiste.lepers@gmail.com
+Fixes: dcb81aeab406 ("mm: rust: add VmaNew for f_ops->mmap()")
+Signed-off-by: Baptiste Lepers <baptiste.lepers@gmail.com>
+Reviewed-by: Alice Ryhl <aliceryhl@google.com>
+Cc: Alex Gaynor <alex.gaynor@gmail.com>
+Cc: Andreas Hindborg <a.hindborg@kernel.org>
+Cc: Björn Roy Baron <bjorn3_gh@protonmail.com>
+Cc: Boqun Feng <boqun.feng@gmail.com>
+Cc: Danilo Krummrich <dakr@kernel.org>
+Cc: Gary Guo <gary@garyguo.net>
+Cc: Jann Horn <jannh@google.com>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Miguel Ojeda <ojeda@kernel.org>
+Cc: Trevor Gross <tmgross@umich.edu>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ rust/kernel/mm/virt.rs | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs
+index 6086ca981b06..a1bfa4e19293 100644
+--- a/rust/kernel/mm/virt.rs
++++ b/rust/kernel/mm/virt.rs
+@@ -209,6 +209,7 @@ pub fn vm_insert_page(&self, address: usize, page: &Page) -> Result {
+ ///
+ /// For the duration of 'a, the referenced vma must be undergoing initialization in an
+ /// `f_ops->mmap()` hook.
++#[repr(transparent)]
+ pub struct VmaNew {
+     vma: VmaRef,
+ }
+-- 
+2.51.0
+
diff --git a/queue-6.16/series b/queue-6.16/series
index 67d1edd298a691c3855494ea4a856d45610d2f4b..9c788c00ddde63ceb23eaa1a9772e9b167800bd6 100644 (file)
--- a/queue-6.16/series
@@ -98,3 +98,12 @@ accel-ivpu-prevent-recovery-work-from-being-queued-during-device-removal.patch
 acpi-iort-fix-memory-leak-in-iort_rmr_alloc_sids.patch
 arm64-ftrace-fix-unreachable-plt-for-ftrace_caller-in-init_module-with-config_dynamic_ftrace.patch
 pcmcia-fix-a-null-pointer-dereference-in-__iodyn_find_io_region.patch
+rust-mm-mark-vmanew-as-transparent.patch
+mm-slub-avoid-accessing-metadata-when-pointer-is-invalid-in-object_err.patch
+x86-mm-64-define-arch_page_table_sync_mask-and-arch_sync_kernel_mappings.patch
+mm-userfaultfd-fix-kmap_local-lifo-ordering-for-config_highpte.patch
+mm-fix-accounting-of-memmap-pages.patch
+mm-move-page-table-sync-declarations-to-linux-pgtable.h.patch
+mm-introduce-and-use-pgd-p4d-_populate_kernel.patch
+mm-fix-possible-deadlock-in-kmemleak.patch
+mm-slub-avoid-wake-up-kswapd-in-set_track_prepare.patch
diff --git a/queue-6.16/x86-mm-64-define-arch_page_table_sync_mask-and-arch_sync_kernel_mappings.patch b/queue-6.16/x86-mm-64-define-arch_page_table_sync_mask-and-arch_sync_kernel_mappings.patch
new file mode 100644 (file)
index 0000000..ece7d97
--- /dev/null
@@ -0,0 +1,153 @@
+From 6659d027998083fbb6d42a165b0c90dc2e8ba989 Mon Sep 17 00:00:00 2001
+From: Harry Yoo <harry.yoo@oracle.com>
+Date: Mon, 18 Aug 2025 11:02:06 +0900
+Subject: x86/mm/64: define ARCH_PAGE_TABLE_SYNC_MASK and arch_sync_kernel_mappings()
+
+From: Harry Yoo <harry.yoo@oracle.com>
+
+commit 6659d027998083fbb6d42a165b0c90dc2e8ba989 upstream.
+
+Define ARCH_PAGE_TABLE_SYNC_MASK and arch_sync_kernel_mappings() to ensure
+page tables are properly synchronized when calling p*d_populate_kernel().
+
+For 5-level paging, synchronization is performed via
+pgd_populate_kernel().  In 4-level paging, pgd_populate() is a no-op, so
+synchronization is instead performed at the P4D level via
+p4d_populate_kernel().
+
+This fixes intermittent boot failures on systems using 4-level paging and
+a large amount of persistent memory:
+
+  BUG: unable to handle page fault for address: ffffe70000000034
+  #PF: supervisor write access in kernel mode
+  #PF: error_code(0x0002) - not-present page
+  PGD 0 P4D 0
+  Oops: 0002 [#1] SMP NOPTI
+  RIP: 0010:__init_single_page+0x9/0x6d
+  Call Trace:
+   <TASK>
+   __init_zone_device_page+0x17/0x5d
+   memmap_init_zone_device+0x154/0x1bb
+   pagemap_range+0x2e0/0x40f
+   memremap_pages+0x10b/0x2f0
+   devm_memremap_pages+0x1e/0x60
+   dev_dax_probe+0xce/0x2ec [device_dax]
+   dax_bus_probe+0x6d/0xc9
+   [... snip ...]
+   </TASK>
+
+It also fixes a crash in vmemmap_set_pmd() caused by accessing vmemmap
+before sync_global_pgds() [1]:
+
+  BUG: unable to handle page fault for address: ffffeb3ff1200000
+  #PF: supervisor write access in kernel mode
+  #PF: error_code(0x0002) - not-present page
+  PGD 0 P4D 0
+  Oops: Oops: 0002 [#1] PREEMPT SMP NOPTI
+  Tainted: [W]=WARN
+  RIP: 0010:vmemmap_set_pmd+0xff/0x230
+   <TASK>
+   vmemmap_populate_hugepages+0x176/0x180
+   vmemmap_populate+0x34/0x80
+   __populate_section_memmap+0x41/0x90
+   sparse_add_section+0x121/0x3e0
+   __add_pages+0xba/0x150
+   add_pages+0x1d/0x70
+   memremap_pages+0x3dc/0x810
+   devm_memremap_pages+0x1c/0x60
+   xe_devm_add+0x8b/0x100 [xe]
+   xe_tile_init_noalloc+0x6a/0x70 [xe]
+   xe_device_probe+0x48c/0x740 [xe]
+   [... snip ...]
+
+Link: https://lkml.kernel.org/r/20250818020206.4517-4-harry.yoo@oracle.com
+Fixes: 8d400913c231 ("x86/vmemmap: handle unpopulated sub-pmd ranges")
+Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
+Closes: https://lore.kernel.org/linux-mm/20250311114420.240341-1-gwan-gyeong.mun@intel.com [1]
+Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
+Acked-by: Kiryl Shutsemau <kas@kernel.org>
+Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
+Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andrey Konovalov <andreyknvl@gmail.com>
+Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
+Cc: Anshuman Khandual <anshuman.khandual@arm.com>
+Cc: Ard Biesheuvel <ardb@kernel.org>
+Cc: Arnd Bergmann <arnd@arndb.de>
+Cc: bibo mao <maobibo@loongson.cn>
+Cc: Borislav Betkov <bp@alien8.de>
+Cc: Christoph Lameter (Ampere) <cl@gentwo.org>
+Cc: Dennis Zhou <dennis@kernel.org>
+Cc: Dev Jain <dev.jain@arm.com>
+Cc: Dmitriy Vyukov <dvyukov@google.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jane Chu <jane.chu@oracle.com>
+Cc: Joao Martins <joao.m.martins@oracle.com>
+Cc: Joerg Roedel <joro@8bytes.org>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Kevin Brodsky <kevin.brodsky@arm.com>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Qi Zheng <zhengqi.arch@bytedance.com>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Thomas Gleinxer <tglx@linutronix.de>
+Cc: Thomas Huth <thuth@redhat.com>
+Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/pgtable_64_types.h |    3 +++
+ arch/x86/mm/init_64.c                   |   18 ++++++++++++++++++
+ 2 files changed, 21 insertions(+)
+
+--- a/arch/x86/include/asm/pgtable_64_types.h
++++ b/arch/x86/include/asm/pgtable_64_types.h
+@@ -36,6 +36,9 @@ static inline bool pgtable_l5_enabled(vo
+ #define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)
+ #endif /* USE_EARLY_PGTABLE_L5 */
++#define ARCH_PAGE_TABLE_SYNC_MASK \
++      (pgtable_l5_enabled() ? PGTBL_PGD_MODIFIED : PGTBL_P4D_MODIFIED)
++
+ extern unsigned int pgdir_shift;
+ extern unsigned int ptrs_per_p4d;
+--- a/arch/x86/mm/init_64.c
++++ b/arch/x86/mm/init_64.c
+@@ -224,6 +224,24 @@ static void sync_global_pgds(unsigned lo
+ }
+ /*
++ * Make kernel mappings visible in all page tables in the system.
++ * This is necessary except when the init task populates kernel mappings
++ * during the boot process. In that case, all processes originating from
++ * the init task copy the kernel mappings, so there is no issue.
++ * Otherwise, missing synchronization could lead to kernel crashes due
++ * to missing page table entries for certain kernel mappings.
++ *
++ * Synchronization is performed at the top level, which is the PGD in
++ * 5-level paging systems. In 4-level paging systems, however,
++ * pgd_populate() is a no-op, so synchronization is done at the P4D level.
++ * sync_global_pgds() handles this difference between paging levels.
++ */
++void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
++{
++      sync_global_pgds(start, end);
++}
++
++/*
+  * NOTE: This function is marked __ref because it calls __init function
+  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
+  */
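
For reference, a userspace sketch of how the level-dependent sync mask gates synchronization in the p*d_populate_kernel() helpers introduced earlier in this series; constants and helpers are illustrative stand-ins, not the real x86 definitions:

#include <stdbool.h>
#include <stdio.h>

#define FAKE_PGTBL_PGD_MODIFIED 0x1u
#define FAKE_PGTBL_P4D_MODIFIED 0x2u

static bool l5_enabled;         /* stand-in for pgtable_l5_enabled() */

static unsigned int page_table_sync_mask(void)
{
        /* 5-level: the top level is the PGD. 4-level: pgd_populate() is a
         * no-op, so syncing must happen when the P4D entry is installed. */
        return l5_enabled ? FAKE_PGTBL_PGD_MODIFIED : FAKE_PGTBL_P4D_MODIFIED;
}

static void populate_kernel(const char *level, unsigned int modified_bit)
{
        printf("populate %s entry\n", level);
        if (page_table_sync_mask() & modified_bit)
                printf("  -> sync kernel mappings into all page tables\n");
}

int main(void)
{
        l5_enabled = false;                               /* 4-level paging */
        populate_kernel("PGD", FAKE_PGTBL_PGD_MODIFIED);  /* no sync here */
        populate_kernel("P4D", FAKE_PGTBL_P4D_MODIFIED);  /* sync happens here */
        return 0;
}
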