From: Greg Kroah-Hartman Date: Thu, 20 Nov 2025 15:59:03 +0000 (+0100) Subject: 6.17-stable patches X-Git-Tag: v6.6.117~35 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9e1621929d6df288e08c30086ed4d487f1c4f517;p=thirdparty%2Fkernel%2Fstable-queue.git 6.17-stable patches added patches: dma-mapping-benchmark-restore-padding-to-ensure-uabi-remained-consistent.patch gcov-add-support-for-gcc-15.patch kho-allocate-metadata-directly-from-the-buddy-allocator.patch kho-increase-metadata-bitmap-size-to-page_size.patch kho-warn-and-exit-when-unpreserved-page-wasn-t-preserved.patch kho-warn-and-fail-on-metadata-or-preserved-memory-in-scratch-area.patch ksm-use-range-walk-function-to-jump-over-holes-in-scan_get_next_rmap_item.patch ksmbd-close-accepted-socket-when-per-ip-limit-rejects-connection.patch kvm-arm64-make-all-32bit-id-registers-fully-writable.patch kvm-guest_memfd-remove-bindings-on-memslot-deletion-when-gmem-is-dying.patch kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch kvm-svm-mark-vmcb_lbr-dirty-when-msr_ia32_debugctlmsr-is-updated.patch kvm-vmx-fix-check-for-valid-gva-on-an-ept-violation.patch loongarch-consolidate-early_ioremap-ioremap_prot.patch loongarch-kvm-add-delay-until-timer-interrupt-injected.patch loongarch-kvm-fix-max-supported-vcpus-set-with-eiointc.patch loongarch-kvm-restore-guest-pmu-if-it-is-enabled.patch loongarch-let-pte-pmd-_modify-record-the-status-of-_page_dirty.patch loongarch-use-correct-accessor-to-read-fwpc-mwpc.patch maple_tree-fix-tracepoint-string-pointers.patch nfsd-add-missing-fattr4_word2_clone_blksize-from-supported-attributes.patch nfsd-fix-refcount-leak-in-nfsd_set_fh_dentry.patch nfsd-free-copynotify-stateid-in-nfs4_free_ol_stateid.patch strparser-fix-signed-unsigned-mismatch-bug.patch --- diff --git a/queue-6.17/dma-mapping-benchmark-restore-padding-to-ensure-uabi-remained-consistent.patch b/queue-6.17/dma-mapping-benchmark-restore-padding-to-ensure-uabi-remained-consistent.patch new file mode 100644 index 0000000000..e480e83d25 --- /dev/null +++ b/queue-6.17/dma-mapping-benchmark-restore-padding-to-ensure-uabi-remained-consistent.patch @@ -0,0 +1,41 @@ +From 23ee8a2563a0f24cf4964685ced23c32be444ab8 Mon Sep 17 00:00:00 2001 +From: Qinxin Xia +Date: Tue, 28 Oct 2025 20:08:59 +0800 +Subject: dma-mapping: benchmark: Restore padding to ensure uABI remained consistent + +From: Qinxin Xia + +commit 23ee8a2563a0f24cf4964685ced23c32be444ab8 upstream. + +The padding field in the structure was previously reserved to +maintain a stable interface for potential new fields, ensuring +compatibility with user-space shared data structures. +However,it was accidentally removed by tiantao in a prior commit, +which may lead to incompatibility between user space and the kernel. + +This patch reinstates the padding to restore the original structure +layout and preserve compatibility. 
+ +Fixes: 8ddde07a3d28 ("dma-mapping: benchmark: extract a common header file for map_benchmark definition") +Cc: stable@vger.kernel.org +Acked-by: Barry Song +Signed-off-by: Qinxin Xia +Reported-by: Barry Song +Closes: https://lore.kernel.org/lkml/CAGsJ_4waiZ2+NBJG+SCnbNk+nQ_ZF13_Q5FHJqZyxyJTcEop2A@mail.gmail.com/ +Reviewed-by: Jonathan Cameron +Signed-off-by: Marek Szyprowski +Link: https://lore.kernel.org/r/20251028120900.2265511-2-xiaqinxin@huawei.com +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/map_benchmark.h | 1 + + 1 file changed, 1 insertion(+) + +--- a/include/linux/map_benchmark.h ++++ b/include/linux/map_benchmark.h +@@ -27,5 +27,6 @@ struct map_benchmark { + __u32 dma_dir; /* DMA data direction */ + __u32 dma_trans_ns; /* time for DMA transmission in ns */ + __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ ++ __u8 expansion[76]; /* For future use */ + }; + #endif /* _KERNEL_DMA_BENCHMARK_H */ diff --git a/queue-6.17/gcov-add-support-for-gcc-15.patch b/queue-6.17/gcov-add-support-for-gcc-15.patch new file mode 100644 index 0000000000..a5b974af2b --- /dev/null +++ b/queue-6.17/gcov-add-support-for-gcc-15.patch @@ -0,0 +1,40 @@ +From ec4d11fc4b2dd4a2fa8c9d801ee9753b74623554 Mon Sep 17 00:00:00 2001 +From: Peter Oberparleiter +Date: Tue, 28 Oct 2025 12:51:25 +0100 +Subject: gcov: add support for GCC 15 + +From: Peter Oberparleiter + +commit ec4d11fc4b2dd4a2fa8c9d801ee9753b74623554 upstream. + +Using gcov on kernels compiled with GCC 15 results in truncated 16-byte +long .gcda files with no usable data. To fix this, update GCOV_COUNTERS +to match the value defined by GCC 15. + +Tested with GCC 14.3.0 and GCC 15.2.0. + +Link: https://lkml.kernel.org/r/20251028115125.1319410-1-oberpar@linux.ibm.com +Signed-off-by: Peter Oberparleiter +Reported-by: Matthieu Baerts +Closes: https://github.com/linux-test-project/lcov/issues/445 +Tested-by: Matthieu Baerts +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + kernel/gcov/gcc_4_7.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/kernel/gcov/gcc_4_7.c ++++ b/kernel/gcov/gcc_4_7.c +@@ -18,7 +18,9 @@ + #include + #include "gcov.h" + +-#if (__GNUC__ >= 14) ++#if (__GNUC__ >= 15) ++#define GCOV_COUNTERS 10 ++#elif (__GNUC__ >= 14) + #define GCOV_COUNTERS 9 + #elif (__GNUC__ >= 10) + #define GCOV_COUNTERS 8 diff --git a/queue-6.17/kho-allocate-metadata-directly-from-the-buddy-allocator.patch b/queue-6.17/kho-allocate-metadata-directly-from-the-buddy-allocator.patch new file mode 100644 index 0000000000..fc5c917e31 --- /dev/null +++ b/queue-6.17/kho-allocate-metadata-directly-from-the-buddy-allocator.patch @@ -0,0 +1,90 @@ +From fa759cd75bce5489eed34596daa53f721849a86f Mon Sep 17 00:00:00 2001 +From: Pasha Tatashin +Date: Mon, 20 Oct 2025 20:08:52 -0400 +Subject: kho: allocate metadata directly from the buddy allocator + +From: Pasha Tatashin + +commit fa759cd75bce5489eed34596daa53f721849a86f upstream. + +KHO allocates metadata for its preserved memory map using the slab +allocator via kzalloc(). This metadata is temporary and is used by the +next kernel during early boot to find preserved memory. + +A problem arises when KFENCE is enabled. kzalloc() calls can be randomly +intercepted by kfence_alloc(), which services the allocation from a +dedicated KFENCE memory pool. This pool is allocated early in boot via +memblock. + +When booting via KHO, the memblock allocator is restricted to a "scratch +area", forcing the KFENCE pool to be allocated within it. 
This creates a +conflict, as the scratch area is expected to be ephemeral and +overwriteable by a subsequent kexec. If KHO metadata is placed in this +KFENCE pool, it leads to memory corruption when the next kernel is loaded. + +To fix this, modify KHO to allocate its metadata directly from the buddy +allocator instead of slab. + +Link: https://lkml.kernel.org/r/20251021000852.2924827-4-pasha.tatashin@soleen.com +Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") +Signed-off-by: Pasha Tatashin +Reviewed-by: Pratyush Yadav +Reviewed-by: Mike Rapoport (Microsoft) +Reviewed-by: David Matlack +Cc: Alexander Graf +Cc: Christian Brauner +Cc: Jason Gunthorpe +Cc: Jonathan Corbet +Cc: Masahiro Yamada +Cc: Miguel Ojeda +Cc: Randy Dunlap +Cc: Samiullah Khawaja +Cc: Tejun Heo +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/gfp.h | 3 +++ + kernel/kexec_handover.c | 6 +++--- + 2 files changed, 6 insertions(+), 3 deletions(-) + +--- a/include/linux/gfp.h ++++ b/include/linux/gfp.h +@@ -7,6 +7,7 @@ + #include + #include + #include ++#include + #include + + struct vm_area_struct; +@@ -463,4 +464,6 @@ static inline struct folio *folio_alloc_ + /* This should be paired with folio_put() rather than free_contig_range(). */ + #define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__)) + ++DEFINE_FREE(free_page, void *, free_page((unsigned long)_T)) ++ + #endif /* __LINUX_GFP_H */ +--- a/kernel/kexec_handover.c ++++ b/kernel/kexec_handover.c +@@ -102,7 +102,7 @@ static void *xa_load_or_alloc(struct xar + if (res) + return res; + +- void *elm __free(kfree) = kzalloc(PAGE_SIZE, GFP_KERNEL); ++ void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL); + + if (!elm) + return ERR_PTR(-ENOMEM); +@@ -266,9 +266,9 @@ static_assert(sizeof(struct khoser_mem_c + static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, + unsigned long order) + { +- struct khoser_mem_chunk *chunk __free(kfree) = NULL; ++ struct khoser_mem_chunk *chunk __free(free_page) = NULL; + +- chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); ++ chunk = (void *)get_zeroed_page(GFP_KERNEL); + if (!chunk) + return ERR_PTR(-ENOMEM); + diff --git a/queue-6.17/kho-increase-metadata-bitmap-size-to-page_size.patch b/queue-6.17/kho-increase-metadata-bitmap-size-to-page_size.patch new file mode 100644 index 0000000000..26489be886 --- /dev/null +++ b/queue-6.17/kho-increase-metadata-bitmap-size-to-page_size.patch @@ -0,0 +1,113 @@ +From a2fff99f92dae9c0eaf0d75de3def70ec68dad92 Mon Sep 17 00:00:00 2001 +From: Pasha Tatashin +Date: Mon, 20 Oct 2025 20:08:51 -0400 +Subject: kho: increase metadata bitmap size to PAGE_SIZE + +From: Pasha Tatashin + +commit a2fff99f92dae9c0eaf0d75de3def70ec68dad92 upstream. + +KHO memory preservation metadata is preserved in 512 byte chunks which +requires their allocation from slab allocator. Slabs are not safe to be +used with KHO because of kfence, and because partial slabs may lead leaks +to the next kernel. Change the size to be PAGE_SIZE. + +The kfence specifically may cause memory corruption, where it randomly +provides slab objects that can be within the scratch area. The reason for +that is that kfence allocates its objects prior to KHO scratch is marked +as CMA region. + +While this change could potentially increase metadata overhead on systems +with sparsely preserved memory, this is being mitigated by ongoing work to +reduce sparseness during preservation via 1G guest pages. 
Furthermore, +this change aligns with future work on a stateless KHO, which will also +use page-sized bitmaps for its radix tree metadata. + +Link: https://lkml.kernel.org/r/20251021000852.2924827-3-pasha.tatashin@soleen.com +Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") +Signed-off-by: Pasha Tatashin +Reviewed-by: Mike Rapoport (Microsoft) +Reviewed-by: Pratyush Yadav +Cc: Alexander Graf +Cc: Christian Brauner +Cc: David Matlack +Cc: Jason Gunthorpe +Cc: Jonathan Corbet +Cc: Masahiro Yamada +Cc: Miguel Ojeda +Cc: Randy Dunlap +Cc: Samiullah Khawaja +Cc: Tejun Heo +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + kernel/kexec_handover.c | 21 +++++++++++---------- + 1 file changed, 11 insertions(+), 10 deletions(-) + +--- a/kernel/kexec_handover.c ++++ b/kernel/kexec_handover.c +@@ -52,10 +52,10 @@ early_param("kho", kho_parse_enable); + * Keep track of memory that is to be preserved across KHO. + * + * The serializing side uses two levels of xarrays to manage chunks of per-order +- * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a +- * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations +- * each bitmap will cover 16M of address space. Thus, for 16G of memory at most +- * 512K of bitmap memory will be needed for order 0. ++ * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order ++ * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0 ++ * allocations each bitmap will cover 128M of address space. Thus, for 16G of ++ * memory at most 512K of bitmap memory will be needed for order 0. + * + * This approach is fully incremental, as the serialization progresses folios + * can continue be aggregated to the tracker. The final step, immediately prior +@@ -63,12 +63,14 @@ early_param("kho", kho_parse_enable); + * successor kernel to parse. + */ + +-#define PRESERVE_BITS (512 * 8) ++#define PRESERVE_BITS (PAGE_SIZE * 8) + + struct kho_mem_phys_bits { + DECLARE_BITMAP(preserve, PRESERVE_BITS); + }; + ++static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE); ++ + struct kho_mem_phys { + /* + * Points to kho_mem_phys_bits, a sparse bitmap array. 
Each bit is sized +@@ -93,19 +95,19 @@ struct kho_serialization { + struct khoser_mem_chunk *preserved_mem_map; + }; + +-static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) ++static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) + { + void *res = xa_load(xa, index); + + if (res) + return res; + +- void *elm __free(kfree) = kzalloc(sz, GFP_KERNEL); ++ void *elm __free(kfree) = kzalloc(PAGE_SIZE, GFP_KERNEL); + + if (!elm) + return ERR_PTR(-ENOMEM); + +- if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), sz))) ++ if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE))) + return ERR_PTR(-EINVAL); + + res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); +@@ -175,8 +177,7 @@ static int __kho_preserve_order(struct k + } + } + +- bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS, +- sizeof(*bits)); ++ bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS); + if (IS_ERR(bits)) + return PTR_ERR(bits); + diff --git a/queue-6.17/kho-warn-and-exit-when-unpreserved-page-wasn-t-preserved.patch b/queue-6.17/kho-warn-and-exit-when-unpreserved-page-wasn-t-preserved.patch new file mode 100644 index 0000000000..cacb512e9a --- /dev/null +++ b/queue-6.17/kho-warn-and-exit-when-unpreserved-page-wasn-t-preserved.patch @@ -0,0 +1,50 @@ +From b05addf6f0596edb1f82ab4059438c7ef2d2686d Mon Sep 17 00:00:00 2001 +From: Pratyush Yadav +Date: Mon, 3 Nov 2025 19:02:32 +0100 +Subject: kho: warn and exit when unpreserved page wasn't preserved + +From: Pratyush Yadav + +commit b05addf6f0596edb1f82ab4059438c7ef2d2686d upstream. + +Calling __kho_unpreserve() on a pair of (pfn, end_pfn) that wasn't +preserved is a bug. Currently, if that is done, the physxa or bits can be +NULL. This results in a soft lockup since a NULL physxa or bits results +in redoing the loop without ever making any progress. + +Return when physxa or bits are not found, but WARN first to loudly +indicate invalid behaviour. + +Link: https://lkml.kernel.org/r/20251103180235.71409-3-pratyush@kernel.org +Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") +Signed-off-by: Pratyush Yadav +Reviewed-by: Mike Rapoport (Microsoft) +Cc: Alexander Graf +Cc: Baoquan He +Cc: Pasha Tatashin +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + kernel/kexec_handover.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/kernel/kexec_handover.c ++++ b/kernel/kexec_handover.c +@@ -131,12 +131,12 @@ static void __kho_unpreserve(struct kho_ + const unsigned long pfn_high = pfn >> order; + + physxa = xa_load(&track->orders, order); +- if (!physxa) +- continue; ++ if (WARN_ON_ONCE(!physxa)) ++ return; + + bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); +- if (!bits) +- continue; ++ if (WARN_ON_ONCE(!bits)) ++ return; + + clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); + diff --git a/queue-6.17/kho-warn-and-fail-on-metadata-or-preserved-memory-in-scratch-area.patch b/queue-6.17/kho-warn-and-fail-on-metadata-or-preserved-memory-in-scratch-area.patch new file mode 100644 index 0000000000..dd70c6ed30 --- /dev/null +++ b/queue-6.17/kho-warn-and-fail-on-metadata-or-preserved-memory-in-scratch-area.patch @@ -0,0 +1,318 @@ +From e38f65d317df1fd2dcafe614d9c537475ecf9992 Mon Sep 17 00:00:00 2001 +From: Pasha Tatashin +Date: Mon, 20 Oct 2025 20:08:50 -0400 +Subject: kho: warn and fail on metadata or preserved memory in scratch area + +From: Pasha Tatashin + +commit e38f65d317df1fd2dcafe614d9c537475ecf9992 upstream. 
+ +Patch series "KHO: kfence + KHO memory corruption fix", v3. + +This series fixes a memory corruption bug in KHO that occurs when KFENCE +is enabled. + +The root cause is that KHO metadata, allocated via kzalloc(), can be +randomly serviced by kfence_alloc(). When a kernel boots via KHO, the +early memblock allocator is restricted to a "scratch area". This forces +the KFENCE pool to be allocated within this scratch area, creating a +conflict. If KHO metadata is subsequently placed in this pool, it gets +corrupted during the next kexec operation. + +Google is using KHO and have had obscure crashes due to this memory +corruption, with stacks all over the place. I would prefer this fix to be +properly backported to stable so we can also automatically consume it once +we switch to the upstream KHO. + +Patch 1/3 introduces a debug-only feature (CONFIG_KEXEC_HANDOVER_DEBUG) +that adds checks to detect and fail any operation that attempts to place +KHO metadata or preserved memory within the scratch area. This serves as +a validation and diagnostic tool to confirm the problem without affecting +production builds. + +Patch 2/3 Increases bitmap to PAGE_SIZE, so buddy allocator can be used. + +Patch 3/3 Provides the fix by modifying KHO to allocate its metadata +directly from the buddy allocator instead of slab. This bypasses the +KFENCE interception entirely. + + +This patch (of 3): + +It is invalid for KHO metadata or preserved memory regions to be located +within the KHO scratch area, as this area is overwritten when the next +kernel is loaded, and used early in boot by the next kernel. This can +lead to memory corruption. + +Add checks to kho_preserve_* and KHO's internal metadata allocators +(xa_load_or_alloc, new_chunk) to verify that the physical address of the +memory does not overlap with any defined scratch region. If an overlap is +detected, the operation will fail and a WARN_ON is triggered. To avoid +performance overhead in production kernels, these checks are enabled only +when CONFIG_KEXEC_HANDOVER_DEBUG is selected. + +[rppt@kernel.org: fix KEXEC_HANDOVER_DEBUG Kconfig dependency] + Link: https://lkml.kernel.org/r/aQHUyyFtiNZhx8jo@kernel.org +[pasha.tatashin@soleen.com: build fix] + Link: https://lkml.kernel.org/r/CA+CK2bBnorfsTymKtv4rKvqGBHs=y=MjEMMRg_tE-RME6n-zUw@mail.gmail.com +Link: https://lkml.kernel.org/r/20251021000852.2924827-1-pasha.tatashin@soleen.com +Link: https://lkml.kernel.org/r/20251021000852.2924827-2-pasha.tatashin@soleen.com +Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") +Signed-off-by: Pasha Tatashin +Signed-off-by: Mike Rapoport +Reviewed-by: Mike Rapoport (Microsoft) +Reviewed-by: Pratyush Yadav +Cc: Alexander Graf +Cc: Christian Brauner +Cc: David Matlack +Cc: Jason Gunthorpe +Cc: Jonathan Corbet +Cc: Masahiro Yamada +Cc: Miguel Ojeda +Cc: Randy Dunlap +Cc: Samiullah Khawaja +Cc: Tejun Heo +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + kernel/Kconfig.kexec | 9 ++++++ + kernel/Makefile | 1 + kernel/kexec_handover.c | 57 ++++++++++++++++++++++++++------------- + kernel/kexec_handover_debug.c | 25 +++++++++++++++++ + kernel/kexec_handover_internal.h | 20 +++++++++++++ + 5 files changed, 93 insertions(+), 19 deletions(-) + create mode 100644 kernel/kexec_handover_debug.c + create mode 100644 kernel/kexec_handover_internal.h + +--- a/kernel/Kconfig.kexec ++++ b/kernel/Kconfig.kexec +@@ -109,6 +109,15 @@ config KEXEC_HANDOVER + to keep data or state alive across the kexec. 
For this to work, + both source and target kernels need to have this option enabled. + ++config KEXEC_HANDOVER_DEBUG ++ bool "Enable Kexec Handover debug checks" ++ depends on KEXEC_HANDOVER ++ help ++ This option enables extra sanity checks for the Kexec Handover ++ subsystem. Since, KHO performance is crucial in live update ++ scenarios and the extra code might be adding overhead it is ++ only optionally enabled. ++ + config CRASH_DUMP + bool "kernel crash dumps" + default ARCH_DEFAULT_CRASH_DUMP +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -82,6 +82,7 @@ obj-$(CONFIG_KEXEC) += kexec.o + obj-$(CONFIG_KEXEC_FILE) += kexec_file.o + obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o + obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o ++obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o + obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o + obj-$(CONFIG_COMPAT) += compat.o + obj-$(CONFIG_CGROUPS) += cgroup/ +--- a/kernel/kexec_handover.c ++++ b/kernel/kexec_handover.c +@@ -8,6 +8,7 @@ + + #define pr_fmt(fmt) "KHO: " fmt + ++#include + #include + #include + #include +@@ -21,6 +22,7 @@ + + #include + ++#include "kexec_handover_internal.h" + /* + * KHO is tightly coupled with mm init and needs access to some of mm + * internal APIs. +@@ -93,26 +95,26 @@ struct kho_serialization { + + static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) + { +- void *elm, *res; ++ void *res = xa_load(xa, index); + +- elm = xa_load(xa, index); +- if (elm) +- return elm; ++ if (res) ++ return res; ++ ++ void *elm __free(kfree) = kzalloc(sz, GFP_KERNEL); + +- elm = kzalloc(sz, GFP_KERNEL); + if (!elm) + return ERR_PTR(-ENOMEM); + ++ if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), sz))) ++ return ERR_PTR(-EINVAL); ++ + res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); + if (xa_is_err(res)) +- res = ERR_PTR(xa_err(res)); +- +- if (res) { +- kfree(elm); ++ return ERR_PTR(xa_err(res)); ++ else if (res) + return res; +- } + +- return elm; ++ return no_free_ptr(elm); + } + + static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, +@@ -263,15 +265,19 @@ static_assert(sizeof(struct khoser_mem_c + static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, + unsigned long order) + { +- struct khoser_mem_chunk *chunk; ++ struct khoser_mem_chunk *chunk __free(kfree) = NULL; + + chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!chunk) +- return NULL; ++ return ERR_PTR(-ENOMEM); ++ ++ if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE))) ++ return ERR_PTR(-EINVAL); ++ + chunk->hdr.order = order; + if (cur_chunk) + KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); +- return chunk; ++ return no_free_ptr(chunk); + } + + static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) +@@ -292,14 +298,17 @@ static int kho_mem_serialize(struct kho_ + struct khoser_mem_chunk *chunk = NULL; + struct kho_mem_phys *physxa; + unsigned long order; ++ int err = -ENOMEM; + + xa_for_each(&ser->track.orders, order, physxa) { + struct kho_mem_phys_bits *bits; + unsigned long phys; + + chunk = new_chunk(chunk, order); +- if (!chunk) ++ if (IS_ERR(chunk)) { ++ err = PTR_ERR(chunk); + goto err_free; ++ } + + if (!first_chunk) + first_chunk = chunk; +@@ -309,8 +318,10 @@ static int kho_mem_serialize(struct kho_ + + if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { + chunk = new_chunk(chunk, order); +- if (!chunk) ++ if (IS_ERR(chunk)) { ++ err = PTR_ERR(chunk); + goto err_free; ++ } + } + + elm = &chunk->bitmaps[chunk->hdr.num_elms]; +@@ -327,7 +338,7 @@ 
static int kho_mem_serialize(struct kho_ + + err_free: + kho_mem_ser_free(first_chunk); +- return -ENOMEM; ++ return err; + } + + static void __init deserialize_bitmap(unsigned int order, +@@ -380,8 +391,8 @@ static void __init kho_mem_deserialize(c + * area for early allocations that happen before page allocator is + * initialized. + */ +-static struct kho_scratch *kho_scratch; +-static unsigned int kho_scratch_cnt; ++struct kho_scratch *kho_scratch; ++unsigned int kho_scratch_cnt; + + /* + * The scratch areas are scaled by default as percent of memory allocated from +@@ -684,6 +695,9 @@ int kho_preserve_folio(struct folio *fol + if (kho_out.finalized) + return -EBUSY; + ++ if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) ++ return -EINVAL; ++ + return __kho_preserve_order(track, pfn, order); + } + EXPORT_SYMBOL_GPL(kho_preserve_folio); +@@ -713,6 +727,11 @@ int kho_preserve_phys(phys_addr_t phys, + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) + return -EINVAL; + ++ if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT, ++ nr_pages << PAGE_SHIFT))) { ++ return -EINVAL; ++ } ++ + while (pfn < end_pfn) { + const unsigned int order = + min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); +--- /dev/null ++++ b/kernel/kexec_handover_debug.c +@@ -0,0 +1,25 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * kexec_handover_debug.c - kexec handover optional debug functionality ++ * Copyright (C) 2025 Google LLC, Pasha Tatashin ++ */ ++ ++#define pr_fmt(fmt) "KHO: " fmt ++ ++#include "kexec_handover_internal.h" ++ ++bool kho_scratch_overlap(phys_addr_t phys, size_t size) ++{ ++ phys_addr_t scratch_start, scratch_end; ++ unsigned int i; ++ ++ for (i = 0; i < kho_scratch_cnt; i++) { ++ scratch_start = kho_scratch[i].addr; ++ scratch_end = kho_scratch[i].addr + kho_scratch[i].size; ++ ++ if (phys < scratch_end && (phys + size) > scratch_start) ++ return true; ++ } ++ ++ return false; ++} +--- /dev/null ++++ b/kernel/kexec_handover_internal.h +@@ -0,0 +1,20 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H ++#define LINUX_KEXEC_HANDOVER_INTERNAL_H ++ ++#include ++#include ++ ++extern struct kho_scratch *kho_scratch; ++extern unsigned int kho_scratch_cnt; ++ ++#ifdef CONFIG_KEXEC_HANDOVER_DEBUG ++bool kho_scratch_overlap(phys_addr_t phys, size_t size); ++#else ++static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size) ++{ ++ return false; ++} ++#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */ ++ ++#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */ diff --git a/queue-6.17/ksm-use-range-walk-function-to-jump-over-holes-in-scan_get_next_rmap_item.patch b/queue-6.17/ksm-use-range-walk-function-to-jump-over-holes-in-scan_get_next_rmap_item.patch new file mode 100644 index 0000000000..31d69e0adb --- /dev/null +++ b/queue-6.17/ksm-use-range-walk-function-to-jump-over-holes-in-scan_get_next_rmap_item.patch @@ -0,0 +1,212 @@ +From f5548c318d6520d4fa3c5ed6003eeb710763cbc5 Mon Sep 17 00:00:00 2001 +From: Pedro Demarchi Gomes +Date: Wed, 22 Oct 2025 12:30:59 -0300 +Subject: ksm: use range-walk function to jump over holes in scan_get_next_rmap_item + +From: Pedro Demarchi Gomes + +commit f5548c318d6520d4fa3c5ed6003eeb710763cbc5 upstream. + +Currently, scan_get_next_rmap_item() walks every page address in a VMA to +locate mergeable pages. This becomes highly inefficient when scanning +large virtual memory areas that contain mostly unmapped regions, causing +ksmd to use large amount of cpu without deduplicating much pages. 
+ +This patch replaces the per-address lookup with a range walk using +walk_page_range(). The range walker allows KSM to skip over entire +unmapped holes in a VMA, avoiding unnecessary lookups. This problem was +previously discussed in [1]. + +Consider the following test program which creates a 32 TiB mapping in the +virtual address space but only populates a single page: + +#include +#include +#include + +/* 32 TiB */ +const size_t size = 32ul * 1024 * 1024 * 1024 * 1024; + +int main() { + char *area = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0); + + if (area == MAP_FAILED) { + perror("mmap() failed\n"); + return -1; + } + + /* Populate a single page such that we get an anon_vma. */ + *area = 0; + + /* Enable KSM. */ + madvise(area, size, MADV_MERGEABLE); + pause(); + return 0; +} + +$ ./ksm-sparse & +$ echo 1 > /sys/kernel/mm/ksm/run + +Without this patch ksmd uses 100% of the cpu for a long time (more then 1 +hour in my test machine) scanning all the 32 TiB virtual address space +that contain only one mapped page. This makes ksmd essentially deadlocked +not able to deduplicate anything of value. With this patch ksmd walks +only the one mapped page and skips the rest of the 32 TiB virtual address +space, making the scan fast using little cpu. + +Link: https://lkml.kernel.org/r/20251023035841.41406-1-pedrodemargomes@gmail.com +Link: https://lkml.kernel.org/r/20251022153059.22763-1-pedrodemargomes@gmail.com +Link: https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/ [1] +Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging") +Signed-off-by: Pedro Demarchi Gomes +Co-developed-by: David Hildenbrand +Signed-off-by: David Hildenbrand +Reported-by: craftfever +Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io +Suggested-by: David Hildenbrand +Acked-by: David Hildenbrand +Cc: Chengming Zhou +Cc: xu xin +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/ksm.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ + 1 file changed, 104 insertions(+), 9 deletions(-) + +--- a/mm/ksm.c ++++ b/mm/ksm.c +@@ -2458,6 +2458,95 @@ static bool should_skip_rmap_item(struct + return true; + } + ++struct ksm_next_page_arg { ++ struct folio *folio; ++ struct page *page; ++ unsigned long addr; ++}; ++ ++static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end, ++ struct mm_walk *walk) ++{ ++ struct ksm_next_page_arg *private = walk->private; ++ struct vm_area_struct *vma = walk->vma; ++ pte_t *start_ptep = NULL, *ptep, pte; ++ struct mm_struct *mm = walk->mm; ++ struct folio *folio; ++ struct page *page; ++ spinlock_t *ptl; ++ pmd_t pmd; ++ ++ if (ksm_test_exit(mm)) ++ return 0; ++ ++ cond_resched(); ++ ++ pmd = pmdp_get_lockless(pmdp); ++ if (!pmd_present(pmd)) ++ return 0; ++ ++ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) { ++ ptl = pmd_lock(mm, pmdp); ++ pmd = pmdp_get(pmdp); ++ ++ if (!pmd_present(pmd)) { ++ goto not_found_unlock; ++ } else if (pmd_leaf(pmd)) { ++ page = vm_normal_page_pmd(vma, addr, pmd); ++ if (!page) ++ goto not_found_unlock; ++ folio = page_folio(page); ++ ++ if (folio_is_zone_device(folio) || !folio_test_anon(folio)) ++ goto not_found_unlock; ++ ++ page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT); ++ goto found_unlock; ++ } ++ spin_unlock(ptl); ++ } ++ ++ start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); ++ if (!start_ptep) ++ return 0; ++ ++ for (ptep = start_ptep; addr < end; ptep++, 
addr += PAGE_SIZE) { ++ pte = ptep_get(ptep); ++ ++ if (!pte_present(pte)) ++ continue; ++ ++ page = vm_normal_page(vma, addr, pte); ++ if (!page) ++ continue; ++ folio = page_folio(page); ++ ++ if (folio_is_zone_device(folio) || !folio_test_anon(folio)) ++ continue; ++ goto found_unlock; ++ } ++ ++not_found_unlock: ++ spin_unlock(ptl); ++ if (start_ptep) ++ pte_unmap(start_ptep); ++ return 0; ++found_unlock: ++ folio_get(folio); ++ spin_unlock(ptl); ++ if (start_ptep) ++ pte_unmap(start_ptep); ++ private->page = page; ++ private->folio = folio; ++ private->addr = addr; ++ return 1; ++} ++ ++static struct mm_walk_ops ksm_next_page_ops = { ++ .pmd_entry = ksm_next_page_pmd_entry, ++ .walk_lock = PGWALK_RDLOCK, ++}; ++ + static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) + { + struct mm_struct *mm; +@@ -2545,21 +2634,27 @@ next_mm: + ksm_scan.address = vma->vm_end; + + while (ksm_scan.address < vma->vm_end) { ++ struct ksm_next_page_arg ksm_next_page_arg; + struct page *tmp_page = NULL; +- struct folio_walk fw; + struct folio *folio; + + if (ksm_test_exit(mm)) + break; + +- folio = folio_walk_start(&fw, vma, ksm_scan.address, 0); +- if (folio) { +- if (!folio_is_zone_device(folio) && +- folio_test_anon(folio)) { +- folio_get(folio); +- tmp_page = fw.page; +- } +- folio_walk_end(&fw, vma); ++ int found; ++ ++ found = walk_page_range_vma(vma, ksm_scan.address, ++ vma->vm_end, ++ &ksm_next_page_ops, ++ &ksm_next_page_arg); ++ ++ if (found > 0) { ++ folio = ksm_next_page_arg.folio; ++ tmp_page = ksm_next_page_arg.page; ++ ksm_scan.address = ksm_next_page_arg.addr; ++ } else { ++ VM_WARN_ON_ONCE(found < 0); ++ ksm_scan.address = vma->vm_end - PAGE_SIZE; + } + + if (tmp_page) { diff --git a/queue-6.17/ksmbd-close-accepted-socket-when-per-ip-limit-rejects-connection.patch b/queue-6.17/ksmbd-close-accepted-socket-when-per-ip-limit-rejects-connection.patch new file mode 100644 index 0000000000..b38f7eb026 --- /dev/null +++ b/queue-6.17/ksmbd-close-accepted-socket-when-per-ip-limit-rejects-connection.patch @@ -0,0 +1,42 @@ +From 98a5fd31cbf72d46bf18e50b3ab0ce86d5f319a9 Mon Sep 17 00:00:00 2001 +From: Joshua Rogers +Date: Sat, 8 Nov 2025 22:59:23 +0800 +Subject: ksmbd: close accepted socket when per-IP limit rejects connection + +From: Joshua Rogers + +commit 98a5fd31cbf72d46bf18e50b3ab0ce86d5f319a9 upstream. + +When the per-IP connection limit is exceeded in ksmbd_kthread_fn(), +the code sets ret = -EAGAIN and continues the accept loop without +closing the just-accepted socket. That leaks one socket per rejected +attempt from a single IP and enables a trivial remote DoS. + +Release client_sk before continuing. + +This bug was found with ZeroPath. + +Cc: stable@vger.kernel.org +Signed-off-by: Joshua Rogers +Acked-by: Namjae Jeon +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman +--- + fs/smb/server/transport_tcp.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/fs/smb/server/transport_tcp.c ++++ b/fs/smb/server/transport_tcp.c +@@ -284,8 +284,11 @@ static int ksmbd_kthread_fn(void *p) + } + } + up_read(&conn_list_lock); +- if (ret == -EAGAIN) ++ if (ret == -EAGAIN) { ++ /* Per-IP limit hit: release the just-accepted socket. 
*/ ++ sock_release(client_sk); + continue; ++ } + + skip_max_ip_conns_limit: + if (server_conf.max_connections && diff --git a/queue-6.17/kvm-arm64-make-all-32bit-id-registers-fully-writable.patch b/queue-6.17/kvm-arm64-make-all-32bit-id-registers-fully-writable.patch new file mode 100644 index 0000000000..4765abbdac --- /dev/null +++ b/queue-6.17/kvm-arm64-make-all-32bit-id-registers-fully-writable.patch @@ -0,0 +1,123 @@ +From 3f9eacf4f0705876a5d6526d7d320ca91d7d7a16 Mon Sep 17 00:00:00 2001 +From: Marc Zyngier +Date: Thu, 30 Oct 2025 12:27:05 +0000 +Subject: KVM: arm64: Make all 32bit ID registers fully writable + +From: Marc Zyngier + +commit 3f9eacf4f0705876a5d6526d7d320ca91d7d7a16 upstream. + +32bit ID registers aren't getting much love these days, and are +often missed in updates. One of these updates broke restoring +a GICv2 guest on a GICv3 machine. + +Instead of performing a piecemeal fix, just bite the bullet +and make all 32bit ID regs fully writable. KVM itself never +relies on them for anything, and if the VMM wants to mess up +the guest, so be it. + +Fixes: 5cb57a1aff755 ("KVM: arm64: Zero ID_AA64PFR0_EL1.GIC when no GICv3 is presented to the guest") +Reported-by: Peter Maydell +Cc: stable@vger.kernel.org +Reviewed-by: Oliver Upton +Link: https://patch.msgid.link/20251030122707.2033690-2-maz@kernel.org +Signed-off-by: Marc Zyngier +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/sys_regs.c | 59 ++++++++++++++++++++++++---------------------- + 1 file changed, 31 insertions(+), 28 deletions(-) + +--- a/arch/arm64/kvm/sys_regs.c ++++ b/arch/arm64/kvm/sys_regs.c +@@ -2515,19 +2515,23 @@ static bool bad_redir_trap(struct kvm_vc + .val = 0, \ + } + +-/* sys_reg_desc initialiser for known cpufeature ID registers */ +-#define AA32_ID_SANITISED(name) { \ +- ID_DESC(name), \ +- .visibility = aa32_id_visibility, \ +- .val = 0, \ +-} +- + /* sys_reg_desc initialiser for writable ID registers */ + #define ID_WRITABLE(name, mask) { \ + ID_DESC(name), \ + .val = mask, \ + } + ++/* ++ * 32bit ID regs are fully writable when the guest is 32bit ++ * capable. Nothing in the KVM code should rely on 32bit features ++ * anyway, only 64bit, so let the VMM do its worse. 
++ */ ++#define AA32_ID_WRITABLE(name) { \ ++ ID_DESC(name), \ ++ .visibility = aa32_id_visibility, \ ++ .val = GENMASK(31, 0), \ ++} ++ + /* sys_reg_desc initialiser for cpufeature ID registers that need filtering */ + #define ID_FILTERED(sysreg, name, mask) { \ + ID_DESC(sysreg), \ +@@ -3039,40 +3043,39 @@ static const struct sys_reg_desc sys_reg + + /* AArch64 mappings of the AArch32 ID registers */ + /* CRm=1 */ +- AA32_ID_SANITISED(ID_PFR0_EL1), +- AA32_ID_SANITISED(ID_PFR1_EL1), ++ AA32_ID_WRITABLE(ID_PFR0_EL1), ++ AA32_ID_WRITABLE(ID_PFR1_EL1), + { SYS_DESC(SYS_ID_DFR0_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_dfr0_el1, + .visibility = aa32_id_visibility, + .reset = read_sanitised_id_dfr0_el1, +- .val = ID_DFR0_EL1_PerfMon_MASK | +- ID_DFR0_EL1_CopDbg_MASK, }, ++ .val = GENMASK(31, 0) }, + ID_HIDDEN(ID_AFR0_EL1), +- AA32_ID_SANITISED(ID_MMFR0_EL1), +- AA32_ID_SANITISED(ID_MMFR1_EL1), +- AA32_ID_SANITISED(ID_MMFR2_EL1), +- AA32_ID_SANITISED(ID_MMFR3_EL1), ++ AA32_ID_WRITABLE(ID_MMFR0_EL1), ++ AA32_ID_WRITABLE(ID_MMFR1_EL1), ++ AA32_ID_WRITABLE(ID_MMFR2_EL1), ++ AA32_ID_WRITABLE(ID_MMFR3_EL1), + + /* CRm=2 */ +- AA32_ID_SANITISED(ID_ISAR0_EL1), +- AA32_ID_SANITISED(ID_ISAR1_EL1), +- AA32_ID_SANITISED(ID_ISAR2_EL1), +- AA32_ID_SANITISED(ID_ISAR3_EL1), +- AA32_ID_SANITISED(ID_ISAR4_EL1), +- AA32_ID_SANITISED(ID_ISAR5_EL1), +- AA32_ID_SANITISED(ID_MMFR4_EL1), +- AA32_ID_SANITISED(ID_ISAR6_EL1), ++ AA32_ID_WRITABLE(ID_ISAR0_EL1), ++ AA32_ID_WRITABLE(ID_ISAR1_EL1), ++ AA32_ID_WRITABLE(ID_ISAR2_EL1), ++ AA32_ID_WRITABLE(ID_ISAR3_EL1), ++ AA32_ID_WRITABLE(ID_ISAR4_EL1), ++ AA32_ID_WRITABLE(ID_ISAR5_EL1), ++ AA32_ID_WRITABLE(ID_MMFR4_EL1), ++ AA32_ID_WRITABLE(ID_ISAR6_EL1), + + /* CRm=3 */ +- AA32_ID_SANITISED(MVFR0_EL1), +- AA32_ID_SANITISED(MVFR1_EL1), +- AA32_ID_SANITISED(MVFR2_EL1), ++ AA32_ID_WRITABLE(MVFR0_EL1), ++ AA32_ID_WRITABLE(MVFR1_EL1), ++ AA32_ID_WRITABLE(MVFR2_EL1), + ID_UNALLOCATED(3,3), +- AA32_ID_SANITISED(ID_PFR2_EL1), ++ AA32_ID_WRITABLE(ID_PFR2_EL1), + ID_HIDDEN(ID_DFR1_EL1), +- AA32_ID_SANITISED(ID_MMFR5_EL1), ++ AA32_ID_WRITABLE(ID_MMFR5_EL1), + ID_UNALLOCATED(3,7), + + /* AArch64 ID registers */ diff --git a/queue-6.17/kvm-guest_memfd-remove-bindings-on-memslot-deletion-when-gmem-is-dying.patch b/queue-6.17/kvm-guest_memfd-remove-bindings-on-memslot-deletion-when-gmem-is-dying.patch new file mode 100644 index 0000000000..6033c8ec67 --- /dev/null +++ b/queue-6.17/kvm-guest_memfd-remove-bindings-on-memslot-deletion-when-gmem-is-dying.patch @@ -0,0 +1,172 @@ +From ae431059e75d36170a5ae6b44cc4d06d43613215 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Mon, 3 Nov 2025 17:12:05 -0800 +Subject: KVM: guest_memfd: Remove bindings on memslot deletion when gmem is dying + +From: Sean Christopherson + +commit ae431059e75d36170a5ae6b44cc4d06d43613215 upstream. + +When unbinding a memslot from a guest_memfd instance, remove the bindings +even if the guest_memfd file is dying, i.e. even if its file refcount has +gone to zero. 
If the memslot is freed before the file is fully released, +nullifying the memslot side of the binding in kvm_gmem_release() will +write to freed memory, as detected by syzbot+KASAN: + + ================================================================== + BUG: KASAN: slab-use-after-free in kvm_gmem_release+0x176/0x440 virt/kvm/guest_memfd.c:353 + Write of size 8 at addr ffff88807befa508 by task syz.0.17/6022 + + CPU: 0 UID: 0 PID: 6022 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) + Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/02/2025 + Call Trace: + + dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 + print_address_description mm/kasan/report.c:378 [inline] + print_report+0xca/0x240 mm/kasan/report.c:482 + kasan_report+0x118/0x150 mm/kasan/report.c:595 + kvm_gmem_release+0x176/0x440 virt/kvm/guest_memfd.c:353 + __fput+0x44c/0xa70 fs/file_table.c:468 + task_work_run+0x1d4/0x260 kernel/task_work.c:227 + resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] + exit_to_user_mode_loop+0xe9/0x130 kernel/entry/common.c:43 + exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline] + syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline] + syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline] + do_syscall_64+0x2bd/0xfa0 arch/x86/entry/syscall_64.c:100 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + RIP: 0033:0x7fbeeff8efc9 + + + Allocated by task 6023: + kasan_save_stack mm/kasan/common.c:56 [inline] + kasan_save_track+0x3e/0x80 mm/kasan/common.c:77 + poison_kmalloc_redzone mm/kasan/common.c:397 [inline] + __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:414 + kasan_kmalloc include/linux/kasan.h:262 [inline] + __kmalloc_cache_noprof+0x3e2/0x700 mm/slub.c:5758 + kmalloc_noprof include/linux/slab.h:957 [inline] + kzalloc_noprof include/linux/slab.h:1094 [inline] + kvm_set_memory_region+0x747/0xb90 virt/kvm/kvm_main.c:2104 + kvm_vm_ioctl_set_memory_region+0x6f/0xd0 virt/kvm/kvm_main.c:2154 + kvm_vm_ioctl+0x957/0xc60 virt/kvm/kvm_main.c:5201 + vfs_ioctl fs/ioctl.c:51 [inline] + __do_sys_ioctl fs/ioctl.c:597 [inline] + __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:583 + do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] + do_syscall_64+0xfa/0xfa0 arch/x86/entry/syscall_64.c:94 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + + Freed by task 6023: + kasan_save_stack mm/kasan/common.c:56 [inline] + kasan_save_track+0x3e/0x80 mm/kasan/common.c:77 + kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:584 + poison_slab_object mm/kasan/common.c:252 [inline] + __kasan_slab_free+0x5c/0x80 mm/kasan/common.c:284 + kasan_slab_free include/linux/kasan.h:234 [inline] + slab_free_hook mm/slub.c:2533 [inline] + slab_free mm/slub.c:6622 [inline] + kfree+0x19a/0x6d0 mm/slub.c:6829 + kvm_set_memory_region+0x9c4/0xb90 virt/kvm/kvm_main.c:2130 + kvm_vm_ioctl_set_memory_region+0x6f/0xd0 virt/kvm/kvm_main.c:2154 + kvm_vm_ioctl+0x957/0xc60 virt/kvm/kvm_main.c:5201 + vfs_ioctl fs/ioctl.c:51 [inline] + __do_sys_ioctl fs/ioctl.c:597 [inline] + __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:583 + do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] + do_syscall_64+0xfa/0xfa0 arch/x86/entry/syscall_64.c:94 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + +Deliberately don't acquire filemap invalid lock when the file is dying as +the lifecycle of f_mapping is outside the purview of KVM. 
Dereferencing +the mapping is *probably* fine, but there's no need to invalidate anything +as memslot deletion is responsible for zapping SPTEs, and the only code +that can access the dying file is kvm_gmem_release(), whose core code is +mutually exclusive with unbinding. + +Note, the mutual exclusivity is also what makes it safe to access the +bindings on a dying gmem instance. Unbinding either runs with slots_lock +held, or after the last reference to the owning "struct kvm" is put, and +kvm_gmem_release() nullifies the slot pointer under slots_lock, and puts +its reference to the VM after that is done. + +Reported-by: syzbot+2479e53d0db9b32ae2aa@syzkaller.appspotmail.com +Closes: https://lore.kernel.org/all/68fa7a22.a70a0220.3bf6c6.008b.GAE@google.com +Tested-by: syzbot+2479e53d0db9b32ae2aa@syzkaller.appspotmail.com +Fixes: a7800aa80ea4 ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") +Cc: stable@vger.kernel.org +Cc: Hillf Danton +Reviewed-By: Vishal Annapurve +Link: https://patch.msgid.link/20251104011205.3853541-1-seanjc@google.com +Signed-off-by: Sean Christopherson +Signed-off-by: Greg Kroah-Hartman +--- + virt/kvm/guest_memfd.c | 45 ++++++++++++++++++++++++++++++++------------- + 1 file changed, 32 insertions(+), 13 deletions(-) + +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -523,31 +523,50 @@ err: + return r; + } + +-void kvm_gmem_unbind(struct kvm_memory_slot *slot) ++static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct kvm_gmem *gmem) + { + unsigned long start = slot->gmem.pgoff; + unsigned long end = start + slot->npages; +- struct kvm_gmem *gmem; ++ ++ xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL); ++ ++ /* ++ * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn() ++ * cannot see this memslot. ++ */ ++ WRITE_ONCE(slot->gmem.file, NULL); ++} ++ ++void kvm_gmem_unbind(struct kvm_memory_slot *slot) ++{ + struct file *file; + + /* +- * Nothing to do if the underlying file was already closed (or is being +- * closed right now), kvm_gmem_release() invalidates all bindings. ++ * Nothing to do if the underlying file was _already_ closed, as ++ * kvm_gmem_release() invalidates and nullifies all bindings. + */ +- file = kvm_gmem_get_file(slot); +- if (!file) ++ if (!slot->gmem.file) + return; + +- gmem = file->private_data; +- +- filemap_invalidate_lock(file->f_mapping); +- xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL); ++ file = kvm_gmem_get_file(slot); + + /* +- * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn() +- * cannot see this memslot. ++ * However, if the file is _being_ closed, then the bindings need to be ++ * removed as kvm_gmem_release() might not run until after the memslot ++ * is freed. Note, modifying the bindings is safe even though the file ++ * is dying as kvm_gmem_release() nullifies slot->gmem.file under ++ * slots_lock, and only puts its reference to KVM after destroying all ++ * bindings. I.e. reaching this point means kvm_gmem_release() hasn't ++ * yet destroyed the bindings or freed the gmem_file, and can't do so ++ * until the caller drops slots_lock. 
+ */ +- WRITE_ONCE(slot->gmem.file, NULL); ++ if (!file) { ++ __kvm_gmem_unbind(slot, slot->gmem.file->private_data); ++ return; ++ } ++ ++ filemap_invalidate_lock(file->f_mapping); ++ __kvm_gmem_unbind(slot, file->private_data); + filemap_invalidate_unlock(file->f_mapping); + + fput(file); diff --git a/queue-6.17/kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch b/queue-6.17/kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch new file mode 100644 index 0000000000..50dbb3b3e7 --- /dev/null +++ b/queue-6.17/kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch @@ -0,0 +1,98 @@ +From fbe5e5f030c22ae717ee422aaab0e00ea84fab5e Mon Sep 17 00:00:00 2001 +From: Yosry Ahmed +Date: Sat, 8 Nov 2025 00:45:20 +0000 +Subject: KVM: nSVM: Always recalculate LBR MSR intercepts in svm_update_lbrv() + +From: Yosry Ahmed + +commit fbe5e5f030c22ae717ee422aaab0e00ea84fab5e upstream. + +svm_update_lbrv() is called when MSR_IA32_DEBUGCTLMSR is updated, and on +nested transitions where LBRV is used. It checks whether LBRV enablement +needs to be changed in the current VMCB, and if it does, it also +recalculate intercepts to LBR MSRs. + +However, there are cases where intercepts need to be updated even when +LBRV enablement doesn't. Example scenario: +- L1 has MSR_IA32_DEBUGCTLMSR cleared. +- L1 runs L2 without LBR_CTL_ENABLE (no LBRV). +- L2 sets DEBUGCTLMSR_LBR in MSR_IA32_DEBUGCTLMSR, svm_update_lbrv() + sets LBR_CTL_ENABLE in VMCB02 and disables intercepts to LBR MSRs. +- L2 exits to L1, svm_update_lbrv() is not called on this transition. +- L1 clears MSR_IA32_DEBUGCTLMSR, svm_update_lbrv() finds that + LBR_CTL_ENABLE is already cleared in VMCB01 and does nothing. +- Intercepts remain disabled, L1 reads to LBR MSRs read the host MSRs. + +Fix it by always recalculating intercepts in svm_update_lbrv(). + +Fixes: 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running") +Cc: stable@vger.kernel.org +Signed-off-by: Yosry Ahmed +Link: https://patch.msgid.link/20251108004524.1600006-3-yosry.ahmed@linux.dev +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/svm.c | 29 +++++++++++++++++++---------- + 1 file changed, 19 insertions(+), 10 deletions(-) + +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -852,25 +852,29 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, + vmcb_mark_dirty(to_vmcb, VMCB_LBR); + } + +-void svm_enable_lbrv(struct kvm_vcpu *vcpu) ++static void __svm_enable_lbrv(struct kvm_vcpu *vcpu) + { + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; +- svm_recalc_lbr_msr_intercepts(vcpu); + + /* Move the LBR msrs to the vmcb02 so that the guest can see them. 
*/ + if (is_guest_mode(vcpu)) + svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); + } + +-static void svm_disable_lbrv(struct kvm_vcpu *vcpu) ++void svm_enable_lbrv(struct kvm_vcpu *vcpu) ++{ ++ __svm_enable_lbrv(vcpu); ++ svm_recalc_lbr_msr_intercepts(vcpu); ++} ++ ++static void __svm_disable_lbrv(struct kvm_vcpu *vcpu) + { + struct vcpu_svm *svm = to_svm(vcpu); + + KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); + svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; +- svm_recalc_lbr_msr_intercepts(vcpu); + + /* + * Move the LBR msrs back to the vmcb01 to avoid copying them +@@ -899,13 +903,18 @@ void svm_update_lbrv(struct kvm_vcpu *vc + (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); + +- if (enable_lbrv == current_enable_lbrv) +- return; ++ if (enable_lbrv && !current_enable_lbrv) ++ __svm_enable_lbrv(vcpu); ++ else if (!enable_lbrv && current_enable_lbrv) ++ __svm_disable_lbrv(vcpu); + +- if (enable_lbrv) +- svm_enable_lbrv(vcpu); +- else +- svm_disable_lbrv(vcpu); ++ /* ++ * During nested transitions, it is possible that the current VMCB has ++ * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa). ++ * In this case, even though LBR_CTL does not need an update, intercepts ++ * do, so always recalculate the intercepts here. ++ */ ++ svm_recalc_lbr_msr_intercepts(vcpu); + } + + void disable_nmi_singlestep(struct vcpu_svm *svm) diff --git a/queue-6.17/kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch b/queue-6.17/kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch new file mode 100644 index 0000000000..8ab5ee7e6a --- /dev/null +++ b/queue-6.17/kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch @@ -0,0 +1,191 @@ +From 8a4821412cf2c1429fffa07c012dd150f2edf78c Mon Sep 17 00:00:00 2001 +From: Yosry Ahmed +Date: Sat, 8 Nov 2025 00:45:21 +0000 +Subject: KVM: nSVM: Fix and simplify LBR virtualization handling with nested + +From: Yosry Ahmed + +commit 8a4821412cf2c1429fffa07c012dd150f2edf78c upstream. + +The current scheme for handling LBRV when nested is used is very +complicated, especially when L1 does not enable LBRV (i.e. does not set +LBR_CTL_ENABLE_MASK). + +To avoid copying LBRs between VMCB01 and VMCB02 on every nested +transition, the current implementation switches between using VMCB01 or +VMCB02 as the source of truth for the LBRs while L2 is running. If L2 +enables LBR, VMCB02 is used as the source of truth. When L2 disables +LBR, the LBRs are copied to VMCB01 and VMCB01 is used as the source of +truth. This introduces significant complexity, and incorrect behavior in +some cases. + +For example, on a nested #VMEXIT, the LBRs are only copied from VMCB02 +to VMCB01 if LBRV is enabled in VMCB01. This is because L2's writes to +MSR_IA32_DEBUGCTLMSR to enable LBR are intercepted and propagated to +VMCB01 instead of VMCB02. However, LBRV is only enabled in VMCB02 when +L2 is running. + +This means that if L2 enables LBR and exits to L1, the LBRs will not be +propagated from VMCB02 to VMCB01, because LBRV is disabled in VMCB01. + +There is no meaningful difference in CPUID rate in L2 when copying LBRs +on every nested transition vs. the current approach, so do the simple +and correct thing and always copy LBRs between VMCB01 and VMCB02 on +nested transitions (when LBRV is disabled by L1). Drop the conditional +LBRs copying in __svm_{enable/disable}_lbrv() as it is now unnecessary. 
+ +VMCB02 becomes the only source of truth for LBRs when L2 is running, +regardless of LBRV being enabled by L1, drop svm_get_lbr_vmcb() and use +svm->vmcb directly in its place. + +Fixes: 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running") +Cc: stable@vger.kernel.org +Signed-off-by: Yosry Ahmed +Link: https://patch.msgid.link/20251108004524.1600006-4-yosry.ahmed@linux.dev +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/nested.c | 20 +++++++------------- + arch/x86/kvm/svm/svm.c | 46 ++++++++++------------------------------------ + 2 files changed, 17 insertions(+), 49 deletions(-) + +--- a/arch/x86/kvm/svm/nested.c ++++ b/arch/x86/kvm/svm/nested.c +@@ -669,11 +669,10 @@ static void nested_vmcb02_prepare_save(s + */ + svm_copy_lbrs(vmcb02, vmcb12); + vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS; +- svm_update_lbrv(&svm->vcpu); +- +- } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { ++ } else { + svm_copy_lbrs(vmcb02, vmcb01); + } ++ svm_update_lbrv(&svm->vcpu); + } + + static inline bool is_evtinj_soft(u32 evtinj) +@@ -825,11 +824,7 @@ static void nested_vmcb02_prepare_contro + svm->soft_int_next_rip = vmcb12_rip; + } + +- vmcb02->control.virt_ext = vmcb01->control.virt_ext & +- LBR_CTL_ENABLE_MASK; +- if (guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV)) +- vmcb02->control.virt_ext |= +- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); ++ /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */ + + if (!nested_vmcb_needs_vls_intercept(svm)) + vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; +@@ -1169,13 +1164,12 @@ int nested_svm_vmexit(struct vcpu_svm *s + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + + if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && +- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { ++ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) + svm_copy_lbrs(vmcb12, vmcb02); +- svm_update_lbrv(vcpu); +- } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { ++ else + svm_copy_lbrs(vmcb01, vmcb02); +- svm_update_lbrv(vcpu); +- } ++ ++ svm_update_lbrv(vcpu); + + if (vnmi) { + if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK) +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -854,13 +854,7 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, + + static void __svm_enable_lbrv(struct kvm_vcpu *vcpu) + { +- struct vcpu_svm *svm = to_svm(vcpu); +- +- svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; +- +- /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ +- if (is_guest_mode(vcpu)) +- svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); ++ to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; + } + + void svm_enable_lbrv(struct kvm_vcpu *vcpu) +@@ -871,35 +865,15 @@ void svm_enable_lbrv(struct kvm_vcpu *vc + + static void __svm_disable_lbrv(struct kvm_vcpu *vcpu) + { +- struct vcpu_svm *svm = to_svm(vcpu); +- + KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); +- svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; +- +- /* +- * Move the LBR msrs back to the vmcb01 to avoid copying them +- * on nested guest entries. +- */ +- if (is_guest_mode(vcpu)) +- svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb); +-} +- +-static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm) +-{ +- /* +- * If LBR virtualization is disabled, the LBR MSRs are always kept in +- * vmcb01. If LBR virtualization is enabled and L1 is running VMs of +- * its own, the MSRs are moved between vmcb01 and vmcb02 as needed. 
+- */ +- return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb : +- svm->vmcb01.ptr; ++ to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; + } + + void svm_update_lbrv(struct kvm_vcpu *vcpu) + { + struct vcpu_svm *svm = to_svm(vcpu); + bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; +- bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) || ++ bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) || + (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); + +@@ -2785,19 +2759,19 @@ static int svm_get_msr(struct kvm_vcpu * + msr_info->data = svm->tsc_aux; + break; + case MSR_IA32_DEBUGCTLMSR: +- msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl; ++ msr_info->data = svm->vmcb->save.dbgctl; + break; + case MSR_IA32_LASTBRANCHFROMIP: +- msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from; ++ msr_info->data = svm->vmcb->save.br_from; + break; + case MSR_IA32_LASTBRANCHTOIP: +- msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to; ++ msr_info->data = svm->vmcb->save.br_to; + break; + case MSR_IA32_LASTINTFROMIP: +- msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from; ++ msr_info->data = svm->vmcb->save.last_excp_from; + break; + case MSR_IA32_LASTINTTOIP: +- msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to; ++ msr_info->data = svm->vmcb->save.last_excp_to; + break; + case MSR_VM_HSAVE_PA: + msr_info->data = svm->nested.hsave_msr; +@@ -3053,10 +3027,10 @@ static int svm_set_msr(struct kvm_vcpu * + if (data & DEBUGCTL_RESERVED_BITS) + return 1; + +- if (svm_get_lbr_vmcb(svm)->save.dbgctl == data) ++ if (svm->vmcb->save.dbgctl == data) + break; + +- svm_get_lbr_vmcb(svm)->save.dbgctl = data; ++ svm->vmcb->save.dbgctl = data; + vmcb_mark_dirty(svm->vmcb, VMCB_LBR); + svm_update_lbrv(vcpu); + break; diff --git a/queue-6.17/kvm-svm-mark-vmcb_lbr-dirty-when-msr_ia32_debugctlmsr-is-updated.patch b/queue-6.17/kvm-svm-mark-vmcb_lbr-dirty-when-msr_ia32_debugctlmsr-is-updated.patch new file mode 100644 index 0000000000..3fa97327a9 --- /dev/null +++ b/queue-6.17/kvm-svm-mark-vmcb_lbr-dirty-when-msr_ia32_debugctlmsr-is-updated.patch @@ -0,0 +1,47 @@ +From dc55b3c3f61246e483e50c85d8d5366f9567e188 Mon Sep 17 00:00:00 2001 +From: Yosry Ahmed +Date: Sat, 8 Nov 2025 00:45:19 +0000 +Subject: KVM: SVM: Mark VMCB_LBR dirty when MSR_IA32_DEBUGCTLMSR is updated + +From: Yosry Ahmed + +commit dc55b3c3f61246e483e50c85d8d5366f9567e188 upstream. + +The APM lists the DbgCtlMsr field as being tracked by the VMCB_LBR clean +bit. Always clear the bit when MSR_IA32_DEBUGCTLMSR is updated. + +The history is complicated, it was correctly cleared for L1 before +commit 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when +L2 is running"). At that point svm_set_msr() started to rely on +svm_update_lbrv() to clear the bit, but when nested virtualization +is enabled the latter does not always clear it even if MSR_IA32_DEBUGCTLMSR +changed. Go back to clearing it directly in svm_set_msr(). 
+ +Fixes: 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running") +Reported-by: Matteo Rizzo +Reported-by: evn@google.com +Co-developed-by: Jim Mattson +Signed-off-by: Jim Mattson +Signed-off-by: Yosry Ahmed +Link: https://patch.msgid.link/20251108004524.1600006-2-yosry.ahmed@linux.dev +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/svm.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -3044,7 +3044,11 @@ static int svm_set_msr(struct kvm_vcpu * + if (data & DEBUGCTL_RESERVED_BITS) + return 1; + ++ if (svm_get_lbr_vmcb(svm)->save.dbgctl == data) ++ break; ++ + svm_get_lbr_vmcb(svm)->save.dbgctl = data; ++ vmcb_mark_dirty(svm->vmcb, VMCB_LBR); + svm_update_lbrv(vcpu); + break; + case MSR_VM_HSAVE_PA: diff --git a/queue-6.17/kvm-vmx-fix-check-for-valid-gva-on-an-ept-violation.patch b/queue-6.17/kvm-vmx-fix-check-for-valid-gva-on-an-ept-violation.patch new file mode 100644 index 0000000000..b128fe9714 --- /dev/null +++ b/queue-6.17/kvm-vmx-fix-check-for-valid-gva-on-an-ept-violation.patch @@ -0,0 +1,35 @@ +From d0164c161923ac303bd843e04ebe95cfd03c6e19 Mon Sep 17 00:00:00 2001 +From: Sukrit Bhatnagar +Date: Thu, 6 Nov 2025 14:28:51 +0900 +Subject: KVM: VMX: Fix check for valid GVA on an EPT violation + +From: Sukrit Bhatnagar + +commit d0164c161923ac303bd843e04ebe95cfd03c6e19 upstream. + +On an EPT violation, bit 7 of the exit qualification is set if the +guest linear-address is valid. The derived page fault error code +should not be checked for this bit. + +Fixes: f3009482512e ("KVM: VMX: Set PFERR_GUEST_{FINAL,PAGE}_MASK if and only if the GVA is valid") +Cc: stable@vger.kernel.org +Signed-off-by: Sukrit Bhatnagar +Reviewed-by: Xiaoyao Li +Link: https://patch.msgid.link/20251106052853.3071088-1-Sukrit.Bhatnagar@sony.com +Signed-off-by: Sean Christopherson +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx/common.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kvm/vmx/common.h ++++ b/arch/x86/kvm/vmx/common.h +@@ -98,7 +98,7 @@ static inline int __vmx_handle_ept_viola + error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) + ? PFERR_PRESENT_MASK : 0; + +- if (error_code & EPT_VIOLATION_GVA_IS_VALID) ++ if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID) + error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? + PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; + diff --git a/queue-6.17/loongarch-consolidate-early_ioremap-ioremap_prot.patch b/queue-6.17/loongarch-consolidate-early_ioremap-ioremap_prot.patch new file mode 100644 index 0000000000..d37c12ff11 --- /dev/null +++ b/queue-6.17/loongarch-consolidate-early_ioremap-ioremap_prot.patch @@ -0,0 +1,57 @@ +From 43a9e6a10bdde32445ad2725f568e08a94e51dc9 Mon Sep 17 00:00:00 2001 +From: Huacai Chen +Date: Sun, 9 Nov 2025 16:02:00 +0800 +Subject: LoongArch: Consolidate early_ioremap()/ioremap_prot() + +From: Huacai Chen + +commit 43a9e6a10bdde32445ad2725f568e08a94e51dc9 upstream. + +1. Use phys_addr_t instead of u64, which can work for both 32/64 bits. +2. Check whether the input physical address is above TO_PHYS_MASK (and + return NULL if yes) for the DMW version. + +Note: In theory early_ioremap() also need the TO_PHYS_MASK checking, but +the UEFI BIOS pass some DMW virtual addresses. 
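+
+A standalone sketch of the new bounds check: anything above the
+physical-address mask cannot be a valid physical address, so the
+DMW-style ioremap path must return NULL rather than produce a bogus
+mapping (the 48-bit mask below is only an example value):
+
+  #include <stdio.h>
+
+  /* Illustrative mask; the real TO_PHYS_MASK comes from the LoongArch
+   * address-space definitions.
+   */
+  #define PHYS_MASK ((1ULL << 48) - 1)
+
+  static const char *map_check(unsigned long long phys)
+  {
+          if (phys > PHYS_MASK)
+                  return "rejected (not a valid physical address)";
+          return "mapped through the direct window";
+  }
+
+  int main(void)
+  {
+          printf("%#llx: %s\n", 0x90000000ULL, map_check(0x90000000ULL));
+          printf("%#llx: %s\n", PHYS_MASK + 1, map_check(PHYS_MASK + 1));
+          return 0;
+  }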
+ +Cc: stable@vger.kernel.org +Signed-off-by: Jiaxun Yang +Signed-off-by: Huacai Chen +Signed-off-by: Greg Kroah-Hartman +--- + arch/loongarch/include/asm/io.h | 5 ++++- + arch/loongarch/mm/ioremap.c | 2 +- + 2 files changed, 5 insertions(+), 2 deletions(-) + +--- a/arch/loongarch/include/asm/io.h ++++ b/arch/loongarch/include/asm/io.h +@@ -14,7 +14,7 @@ + #include + #include + +-extern void __init __iomem *early_ioremap(u64 phys_addr, unsigned long size); ++extern void __init __iomem *early_ioremap(phys_addr_t phys_addr, unsigned long size); + extern void __init early_iounmap(void __iomem *addr, unsigned long size); + + #define early_memremap early_ioremap +@@ -25,6 +25,9 @@ extern void __init early_iounmap(void __ + static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size, + pgprot_t prot) + { ++ if (offset > TO_PHYS_MASK) ++ return NULL; ++ + switch (pgprot_val(prot) & _CACHE_MASK) { + case _CACHE_CC: + return (void __iomem *)(unsigned long)(CACHE_BASE + offset); +--- a/arch/loongarch/mm/ioremap.c ++++ b/arch/loongarch/mm/ioremap.c +@@ -6,7 +6,7 @@ + #include + #include + +-void __init __iomem *early_ioremap(u64 phys_addr, unsigned long size) ++void __init __iomem *early_ioremap(phys_addr_t phys_addr, unsigned long size) + { + return ((void __iomem *)TO_CACHE(phys_addr)); + } diff --git a/queue-6.17/loongarch-kvm-add-delay-until-timer-interrupt-injected.patch b/queue-6.17/loongarch-kvm-add-delay-until-timer-interrupt-injected.patch new file mode 100644 index 0000000000..269b5a96ae --- /dev/null +++ b/queue-6.17/loongarch-kvm-add-delay-until-timer-interrupt-injected.patch @@ -0,0 +1,46 @@ +From d3c9515e4f9d10ccb113adb4809db5cc31e7ef65 Mon Sep 17 00:00:00 2001 +From: Bibo Mao +Date: Sun, 9 Nov 2025 16:02:09 +0800 +Subject: LoongArch: KVM: Add delay until timer interrupt injected + +From: Bibo Mao + +commit d3c9515e4f9d10ccb113adb4809db5cc31e7ef65 upstream. + +When timer is fired in oneshot mode, CSR.TVAL will stop with value -1 +rather than 0. However when the register CSR.TVAL is restored, it will +continue to count down rather than stop there. + +Now the method is to write 0 to CSR.TVAL, wait to count down for 1 cycle +at least, which is 10ns with a timer freq 100MHz, and then retore timer +interrupt status. Here add 2 cycles delay to assure that timer interrupt +is injected. + +With this patch, timer selftest case passes to run always. 
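+
+For reference, the cycle/time conversion behind the numbers above as a
+standalone calculation (100 MHz is only the example frequency quoted in
+the text; the real constant-timer frequency is discovered from the
+hardware at boot):
+
+  #include <stdio.h>
+
+  int main(void)
+  {
+          const double freq_hz = 100e6;           /* example: 100 MHz timer */
+          const double ns_per_cycle = 1e9 / freq_hz;
+
+          printf("1 cycle  = %.0f ns\n", ns_per_cycle);       /* 10 ns */
+          printf("2 cycles = %.0f ns\n", 2 * ns_per_cycle);   /* 20 ns */
+          return 0;
+  }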
+ +Cc: stable@vger.kernel.org +Signed-off-by: Bibo Mao +Signed-off-by: Huacai Chen +Signed-off-by: Greg Kroah-Hartman +--- + arch/loongarch/kvm/timer.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/arch/loongarch/kvm/timer.c ++++ b/arch/loongarch/kvm/timer.c +@@ -4,6 +4,7 @@ + */ + + #include ++#include + #include + #include + +@@ -95,6 +96,7 @@ void kvm_restore_timer(struct kvm_vcpu * + * and set CSR TVAL with -1 + */ + write_gcsr_timertick(0); ++ __delay(2); /* Wait cycles until timer interrupt injected */ + + /* + * Writing CSR_TINTCLR_TI to LOONGARCH_CSR_TINTCLR will clear diff --git a/queue-6.17/loongarch-kvm-fix-max-supported-vcpus-set-with-eiointc.patch b/queue-6.17/loongarch-kvm-fix-max-supported-vcpus-set-with-eiointc.patch new file mode 100644 index 0000000000..45fc68a61c --- /dev/null +++ b/queue-6.17/loongarch-kvm-fix-max-supported-vcpus-set-with-eiointc.patch @@ -0,0 +1,40 @@ +From 237e74bfa261fb0cf75bd08c9be0c5094018ee20 Mon Sep 17 00:00:00 2001 +From: Bibo Mao +Date: Sun, 9 Nov 2025 16:02:09 +0800 +Subject: LoongArch: KVM: Fix max supported vCPUs set with EIOINTC + +From: Bibo Mao + +commit 237e74bfa261fb0cf75bd08c9be0c5094018ee20 upstream. + +VM fails to boot with 256 vCPUs, the detailed command is + + qemu-system-loongarch64 -smp 256 + +and there is an error reported as follows: + + KVM_LOONGARCH_EXTIOI_INIT_NUM_CPU failed: Invalid argument + +There is typo issue in function kvm_eiointc_ctrl_access() when set +max supported vCPUs. + +Cc: stable@vger.kernel.org +Fixes: 47256c4c8b1b ("LoongArch: KVM: Avoid copy_*_user() with lock hold in kvm_eiointc_ctrl_access()") +Signed-off-by: Bibo Mao +Signed-off-by: Huacai Chen +Signed-off-by: Greg Kroah-Hartman +--- + arch/loongarch/kvm/intc/eiointc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/loongarch/kvm/intc/eiointc.c ++++ b/arch/loongarch/kvm/intc/eiointc.c +@@ -439,7 +439,7 @@ static int kvm_eiointc_ctrl_access(struc + spin_lock_irqsave(&s->lock, flags); + switch (type) { + case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU: +- if (val >= EIOINTC_ROUTE_MAX_VCPUS) ++ if (val > EIOINTC_ROUTE_MAX_VCPUS) + ret = -EINVAL; + else + s->num_cpu = val; diff --git a/queue-6.17/loongarch-kvm-restore-guest-pmu-if-it-is-enabled.patch b/queue-6.17/loongarch-kvm-restore-guest-pmu-if-it-is-enabled.patch new file mode 100644 index 0000000000..c0c203726f --- /dev/null +++ b/queue-6.17/loongarch-kvm-restore-guest-pmu-if-it-is-enabled.patch @@ -0,0 +1,49 @@ +From 5001bcf86edf2de02f025a0f789bcac37fa040e6 Mon Sep 17 00:00:00 2001 +From: Bibo Mao +Date: Sun, 9 Nov 2025 16:02:09 +0800 +Subject: LoongArch: KVM: Restore guest PMU if it is enabled + +From: Bibo Mao + +commit 5001bcf86edf2de02f025a0f789bcac37fa040e6 upstream. + +On LoongArch system, guest PMU hardware is shared by guest and host but +PMU interrupt is separated. PMU is pass-through to VM, and there is PMU +context switch when exit to host and return to guest. + +There is optimiation to check whether PMU is enabled by guest. If not, +it is not necessary to return to guest. However, if it is enabled, PMU +context for guest need switch on. Now KVM_REQ_PMU notification is set +on vCPU context switch, but it is missing if there is no vCPU context +switch while PMU is used by guest VM, so fix it. 
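+
+A minimal userspace model of the decision being fixed, with invented
+names standing in for KVM_LARCH_PMU and KVM_REQ_PMU: on guest exit, if
+any guest PERFCTRL value still has an enable bit set, a guest PMU
+restore must be requested before the next entry.
+
+  #include <stdbool.h>
+  #include <stdint.h>
+  #include <stdio.h>
+
+  #define PMU_EVENT_ENABLED 0x7ULL    /* illustrative PERFCTRL enable bits */
+
+  struct vcpu_model {
+          uint64_t perfctrl[4];       /* shadow of guest PERFCTRL0..3 */
+          bool     pmu_loaded;        /* stands in for KVM_LARCH_PMU */
+          bool     restore_requested; /* stands in for KVM_REQ_PMU */
+  };
+
+  static void lose_pmu(struct vcpu_model *v)
+  {
+          uint64_t val = 0;
+          int i;
+
+          for (i = 0; i < 4; i++)
+                  val |= v->perfctrl[i];
+
+          if (!(val & PMU_EVENT_ENABLED))
+                  v->pmu_loaded = false;       /* guest stopped using the PMU */
+          else
+                  v->restore_requested = true; /* reload guest PMU on re-entry */
+  }
+
+  int main(void)
+  {
+          struct vcpu_model v = { .perfctrl = { 0x1 }, .pmu_loaded = true };
+
+          lose_pmu(&v);
+          printf("loaded=%d restore_requested=%d\n",
+                 v.pmu_loaded, v.restore_requested);
+          return 0;
+  }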
+ +Cc: +Fixes: f4e40ea9f78f ("LoongArch: KVM: Add PMU support for guest") +Signed-off-by: Bibo Mao +Signed-off-by: Huacai Chen +Signed-off-by: Greg Kroah-Hartman +--- + arch/loongarch/kvm/vcpu.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/arch/loongarch/kvm/vcpu.c ++++ b/arch/loongarch/kvm/vcpu.c +@@ -133,6 +133,9 @@ static void kvm_lose_pmu(struct kvm_vcpu + * Clear KVM_LARCH_PMU if the guest is not using PMU CSRs when + * exiting the guest, so that the next time trap into the guest. + * We don't need to deal with PMU CSRs contexts. ++ * ++ * Otherwise set the request bit KVM_REQ_PMU to restore guest PMU ++ * before entering guest VM + */ + val = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0); + val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1); +@@ -140,6 +143,8 @@ static void kvm_lose_pmu(struct kvm_vcpu + val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3); + if (!(val & KVM_PMU_EVENT_ENABLED)) + vcpu->arch.aux_inuse &= ~KVM_LARCH_PMU; ++ else ++ kvm_make_request(KVM_REQ_PMU, vcpu); + + kvm_restore_host_pmu(vcpu); + } diff --git a/queue-6.17/loongarch-let-pte-pmd-_modify-record-the-status-of-_page_dirty.patch b/queue-6.17/loongarch-let-pte-pmd-_modify-record-the-status-of-_page_dirty.patch new file mode 100644 index 0000000000..e74f20350e --- /dev/null +++ b/queue-6.17/loongarch-let-pte-pmd-_modify-record-the-status-of-_page_dirty.patch @@ -0,0 +1,56 @@ +From a073d637c8cfbfbab39b7272226a3fbf3b887580 Mon Sep 17 00:00:00 2001 +From: Tianyang Zhang +Date: Sun, 9 Nov 2025 16:02:01 +0800 +Subject: LoongArch: Let {pte,pmd}_modify() record the status of _PAGE_DIRTY + +From: Tianyang Zhang + +commit a073d637c8cfbfbab39b7272226a3fbf3b887580 upstream. + +Now if the PTE/PMD is dirty with _PAGE_DIRTY but without _PAGE_MODIFIED, +after {pte,pmd}_modify() we lose _PAGE_DIRTY, then {pte,pmd}_dirty() +return false and lead to data loss. This can happen in certain scenarios +such as HW PTW doesn't set _PAGE_MODIFIED automatically, so here we need +_PAGE_MODIFIED to record the dirty status (_PAGE_DIRTY). + +The new modification involves checking whether the original PTE/PMD has +the _PAGE_DIRTY flag. If it exists, the _PAGE_MODIFIED bit is also set, +ensuring that the {pte,pmd}_dirty() interface can always return accurate +information. 
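+
+A self-contained model of the failure mode and the fix; the bit values
+and the change mask are simplified stand-ins for the real definitions
+in pgtable-bits.h:
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  #define P_DIRTY    (1u << 0)  /* hardware-maintained dirty bit */
+  #define P_MODIFIED (1u << 1)  /* software dirty bit, kept across modify() */
+  #define CHG_MASK   P_MODIFIED /* preserved bits (pfn etc. omitted) */
+
+  static uint32_t pte_modify_model(uint32_t pte, uint32_t newprot)
+  {
+          if (pte & P_DIRTY)    /* the fix: record dirtiness in the sw bit */
+                  pte |= P_MODIFIED;
+
+          return (pte & CHG_MASK) | (newprot & ~CHG_MASK);
+  }
+
+  static int pte_dirty_model(uint32_t pte)
+  {
+          return !!(pte & (P_DIRTY | P_MODIFIED));
+  }
+
+  int main(void)
+  {
+          uint32_t pte = P_DIRTY; /* dirty, but the software bit not yet set */
+
+          /* Without the P_DIRTY check above this prints 0 and the dirty
+           * state is lost across the protection change.
+           */
+          printf("dirty after modify: %d\n",
+                 pte_dirty_model(pte_modify_model(pte, 0)));
+          return 0;
+  }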
+ +Cc: stable@vger.kernel.org +Co-developed-by: Liupu Wang +Signed-off-by: Liupu Wang +Signed-off-by: Tianyang Zhang +Signed-off-by: Greg Kroah-Hartman +--- + arch/loongarch/include/asm/pgtable.h | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/arch/loongarch/include/asm/pgtable.h ++++ b/arch/loongarch/include/asm/pgtable.h +@@ -424,6 +424,9 @@ static inline unsigned long pte_accessib + + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) + { ++ if (pte_val(pte) & _PAGE_DIRTY) ++ pte_val(pte) |= _PAGE_MODIFIED; ++ + return __pte((pte_val(pte) & _PAGE_CHG_MASK) | + (pgprot_val(newprot) & ~_PAGE_CHG_MASK)); + } +@@ -547,9 +550,11 @@ static inline struct page *pmd_page(pmd_ + + static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) + { +- pmd_val(pmd) = (pmd_val(pmd) & _HPAGE_CHG_MASK) | +- (pgprot_val(newprot) & ~_HPAGE_CHG_MASK); +- return pmd; ++ if (pmd_val(pmd) & _PAGE_DIRTY) ++ pmd_val(pmd) |= _PAGE_MODIFIED; ++ ++ return __pmd((pmd_val(pmd) & _HPAGE_CHG_MASK) | ++ (pgprot_val(newprot) & ~_HPAGE_CHG_MASK)); + } + + static inline pmd_t pmd_mkinvalid(pmd_t pmd) diff --git a/queue-6.17/loongarch-use-correct-accessor-to-read-fwpc-mwpc.patch b/queue-6.17/loongarch-use-correct-accessor-to-read-fwpc-mwpc.patch new file mode 100644 index 0000000000..c5e3504f51 --- /dev/null +++ b/queue-6.17/loongarch-use-correct-accessor-to-read-fwpc-mwpc.patch @@ -0,0 +1,38 @@ +From eeeeaafa62ea0cd4b86390f657dc0aea73bff4f5 Mon Sep 17 00:00:00 2001 +From: Huacai Chen +Date: Sun, 9 Nov 2025 16:02:01 +0800 +Subject: LoongArch: Use correct accessor to read FWPC/MWPC + +From: Huacai Chen + +commit eeeeaafa62ea0cd4b86390f657dc0aea73bff4f5 upstream. + +CSR.FWPC and CSR.MWPC are 32bit registers, so use csr_read32() rather +than csr_read64() to read the values of FWPC/MWPC. + +Cc: stable@vger.kernel.org +Fixes: edffa33c7bb5a73 ("LoongArch: Add hardware breakpoints/watchpoints support") +Signed-off-by: Huacai Chen +Signed-off-by: Greg Kroah-Hartman +--- + arch/loongarch/include/asm/hw_breakpoint.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/loongarch/include/asm/hw_breakpoint.h ++++ b/arch/loongarch/include/asm/hw_breakpoint.h +@@ -134,13 +134,13 @@ static inline void hw_breakpoint_thread_ + /* Determine number of BRP registers available. */ + static inline int get_num_brps(void) + { +- return csr_read64(LOONGARCH_CSR_FWPC) & CSR_FWPC_NUM; ++ return csr_read32(LOONGARCH_CSR_FWPC) & CSR_FWPC_NUM; + } + + /* Determine number of WRP registers available. */ + static inline int get_num_wrps(void) + { +- return csr_read64(LOONGARCH_CSR_MWPC) & CSR_MWPC_NUM; ++ return csr_read32(LOONGARCH_CSR_MWPC) & CSR_MWPC_NUM; + } + + #endif /* __KERNEL__ */ diff --git a/queue-6.17/maple_tree-fix-tracepoint-string-pointers.patch b/queue-6.17/maple_tree-fix-tracepoint-string-pointers.patch new file mode 100644 index 0000000000..9c1da762d5 --- /dev/null +++ b/queue-6.17/maple_tree-fix-tracepoint-string-pointers.patch @@ -0,0 +1,174 @@ +From 91a54090026f84ceffaa12ac53c99b9f162946f6 Mon Sep 17 00:00:00 2001 +From: Martin Kaiser +Date: Thu, 30 Oct 2025 16:55:05 +0100 +Subject: maple_tree: fix tracepoint string pointers + +From: Martin Kaiser + +commit 91a54090026f84ceffaa12ac53c99b9f162946f6 upstream. + +maple_tree tracepoints contain pointers to function names. Such a pointer +is saved when a tracepoint logs an event. There's no guarantee that it's +still valid when the event is parsed later and the pointer is dereferenced. + +The kernel warns about these unsafe pointers. 
+ + event 'ma_read' has unsafe pointer field 'fn' + WARNING: kernel/trace/trace.c:3779 at ignore_event+0x1da/0x1e4 + +Mark the function names as tracepoint_string() to fix the events. + +One case that doesn't work without my patch would be trace-cmd record +to save the binary ringbuffer and trace-cmd report to parse it in +userspace. The address of __func__ can't be dereferenced from +userspace but tracepoint_string will add an entry to +/sys/kernel/tracing/printk_formats + +Link: https://lkml.kernel.org/r/20251030155537.87972-1-martin@kaiser.cx +Fixes: 54a611b60590 ("Maple Tree: add new data structure") +Signed-off-by: Martin Kaiser +Acked-by: Liam R. Howlett +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + lib/maple_tree.c | 30 ++++++++++++++++-------------- + 1 file changed, 16 insertions(+), 14 deletions(-) + +--- a/lib/maple_tree.c ++++ b/lib/maple_tree.c +@@ -64,6 +64,8 @@ + #define CREATE_TRACE_POINTS + #include + ++#define TP_FCT tracepoint_string(__func__) ++ + /* + * Kernel pointer hashing renders much of the maple tree dump useless as tagged + * pointers get hashed to arbitrary values. +@@ -2976,7 +2978,7 @@ static inline void mas_rebalance(struct + MA_STATE(l_mas, mas->tree, mas->index, mas->last); + MA_STATE(r_mas, mas->tree, mas->index, mas->last); + +- trace_ma_op(__func__, mas); ++ trace_ma_op(TP_FCT, mas); + + /* + * Rebalancing occurs if a node is insufficient. Data is rebalanced +@@ -3337,7 +3339,7 @@ static void mas_split(struct ma_state *m + MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last); + MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); + +- trace_ma_op(__func__, mas); ++ trace_ma_op(TP_FCT, mas); + + mast.l = &l_mas; + mast.r = &r_mas; +@@ -3512,7 +3514,7 @@ static bool mas_is_span_wr(struct ma_wr_ + return false; + } + +- trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry); ++ trace_ma_write(TP_FCT, wr_mas->mas, wr_mas->r_max, entry); + return true; + } + +@@ -3756,7 +3758,7 @@ static noinline void mas_wr_spanning_sto + * of data may happen. + */ + mas = wr_mas->mas; +- trace_ma_op(__func__, mas); ++ trace_ma_op(TP_FCT, mas); + + if (unlikely(!mas->index && mas->last == ULONG_MAX)) + return mas_new_root(mas, wr_mas->entry); +@@ -3894,7 +3896,7 @@ done: + } else { + memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); + } +- trace_ma_write(__func__, mas, 0, wr_mas->entry); ++ trace_ma_write(TP_FCT, mas, 0, wr_mas->entry); + mas_update_gap(mas); + mas->end = new_end; + return; +@@ -3938,7 +3940,7 @@ static inline void mas_wr_slot_store(str + mas->offset++; /* Keep mas accurate. */ + } + +- trace_ma_write(__func__, mas, 0, wr_mas->entry); ++ trace_ma_write(TP_FCT, mas, 0, wr_mas->entry); + /* + * Only update gap when the new entry is empty or there is an empty + * entry in the original two ranges. 
+@@ -4059,7 +4061,7 @@ static inline void mas_wr_append(struct + mas_update_gap(mas); + + mas->end = new_end; +- trace_ma_write(__func__, mas, new_end, wr_mas->entry); ++ trace_ma_write(TP_FCT, mas, new_end, wr_mas->entry); + return; + } + +@@ -4073,7 +4075,7 @@ static void mas_wr_bnode(struct ma_wr_st + { + struct maple_big_node b_node; + +- trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); ++ trace_ma_write(TP_FCT, wr_mas->mas, 0, wr_mas->entry); + memset(&b_node, 0, sizeof(struct maple_big_node)); + mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); + mas_commit_b_node(wr_mas, &b_node); +@@ -5405,7 +5407,7 @@ void *mas_store(struct ma_state *mas, vo + int request; + MA_WR_STATE(wr_mas, mas, entry); + +- trace_ma_write(__func__, mas, 0, entry); ++ trace_ma_write(TP_FCT, mas, 0, entry); + #ifdef CONFIG_DEBUG_MAPLE_TREE + if (MAS_WARN_ON(mas, mas->index > mas->last)) + pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last, +@@ -5506,7 +5508,7 @@ void mas_store_prealloc(struct ma_state + } + + store: +- trace_ma_write(__func__, mas, 0, entry); ++ trace_ma_write(TP_FCT, mas, 0, entry); + mas_wr_store_entry(&wr_mas); + MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); + mas_destroy(mas); +@@ -6319,7 +6321,7 @@ void *mtree_load(struct maple_tree *mt, + MA_STATE(mas, mt, index, index); + void *entry; + +- trace_ma_read(__func__, &mas); ++ trace_ma_read(TP_FCT, &mas); + rcu_read_lock(); + retry: + entry = mas_start(&mas); +@@ -6362,7 +6364,7 @@ int mtree_store_range(struct maple_tree + MA_STATE(mas, mt, index, last); + int ret = 0; + +- trace_ma_write(__func__, &mas, 0, entry); ++ trace_ma_write(TP_FCT, &mas, 0, entry); + if (WARN_ON_ONCE(xa_is_advanced(entry))) + return -EINVAL; + +@@ -6585,7 +6587,7 @@ void *mtree_erase(struct maple_tree *mt, + void *entry = NULL; + + MA_STATE(mas, mt, index, index); +- trace_ma_op(__func__, &mas); ++ trace_ma_op(TP_FCT, &mas); + + mtree_lock(mt); + entry = mas_erase(&mas); +@@ -6923,7 +6925,7 @@ void *mt_find(struct maple_tree *mt, uns + unsigned long copy = *index; + #endif + +- trace_ma_read(__func__, &mas); ++ trace_ma_read(TP_FCT, &mas); + + if ((*index) > max) + return NULL; diff --git a/queue-6.17/nfsd-add-missing-fattr4_word2_clone_blksize-from-supported-attributes.patch b/queue-6.17/nfsd-add-missing-fattr4_word2_clone_blksize-from-supported-attributes.patch new file mode 100644 index 0000000000..08f12defda --- /dev/null +++ b/queue-6.17/nfsd-add-missing-fattr4_word2_clone_blksize-from-supported-attributes.patch @@ -0,0 +1,32 @@ +From 4d3dbc2386fe051e44efad663e0ec828b98ab53f Mon Sep 17 00:00:00 2001 +From: Olga Kornievskaia +Date: Thu, 9 Oct 2025 16:37:59 -0400 +Subject: nfsd: add missing FATTR4_WORD2_CLONE_BLKSIZE from supported attributes + +From: Olga Kornievskaia + +commit 4d3dbc2386fe051e44efad663e0ec828b98ab53f upstream. + +RFC 7862 Section 4.1.2 says that if the server supports CLONE it MUST +support clone_blksize attribute. 
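+
+The supported attributes are advertised to clients as bitmask words; a
+tiny sketch of what the one-line fix changes (the bit value below is
+illustrative, the real FATTR4_WORD2_CLONE_BLKSIZE constant comes from
+the NFSv4.2 XDR definitions):
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  #define WORD2_CLONE_BLKSIZE (1u << 13)  /* illustrative bit value */
+
+  int main(void)
+  {
+          uint32_t supported_word2 = 0;            /* before the fix */
+
+          printf("advertised: %d\n",
+                 !!(supported_word2 & WORD2_CLONE_BLKSIZE));
+
+          supported_word2 |= WORD2_CLONE_BLKSIZE;  /* what the fix adds */
+          printf("advertised: %d\n",
+                 !!(supported_word2 & WORD2_CLONE_BLKSIZE));
+          return 0;
+  }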
+ +Fixes: d6ca7d2643ee ("NFSD: Implement FATTR4_CLONE_BLKSIZE attribute") +Cc: stable@vger.kernel.org +Signed-off-by: Olga Kornievskaia +Reviewed-by: Jeff Layton +Signed-off-by: Chuck Lever +Signed-off-by: Greg Kroah-Hartman +--- + fs/nfsd/nfsd.h | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/nfsd/nfsd.h ++++ b/fs/nfsd/nfsd.h +@@ -455,6 +455,7 @@ enum { + #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ + (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ + FATTR4_WORD2_MODE_UMASK | \ ++ FATTR4_WORD2_CLONE_BLKSIZE | \ + NFSD4_2_SECURITY_ATTRS | \ + FATTR4_WORD2_XATTR_SUPPORT | \ + FATTR4_WORD2_TIME_DELEG_ACCESS | \ diff --git a/queue-6.17/nfsd-fix-refcount-leak-in-nfsd_set_fh_dentry.patch b/queue-6.17/nfsd-fix-refcount-leak-in-nfsd_set_fh_dentry.patch new file mode 100644 index 0000000000..8d29c09d81 --- /dev/null +++ b/queue-6.17/nfsd-fix-refcount-leak-in-nfsd_set_fh_dentry.patch @@ -0,0 +1,60 @@ +From 8a7348a9ed70bda1c1f51d3f1815bcbdf9f3b38c Mon Sep 17 00:00:00 2001 +From: NeilBrown +Date: Wed, 8 Oct 2025 09:52:25 -0400 +Subject: nfsd: fix refcount leak in nfsd_set_fh_dentry() + +From: NeilBrown + +commit 8a7348a9ed70bda1c1f51d3f1815bcbdf9f3b38c upstream. + +nfsd exports a "pseudo root filesystem" which is used by NFSv4 to find +the various exported filesystems using LOOKUP requests from a known root +filehandle. NFSv3 uses the MOUNT protocol to find those exported +filesystems and so is not given access to the pseudo root filesystem. + +If a v3 (or v2) client uses a filehandle from that filesystem, +nfsd_set_fh_dentry() will report an error, but still stores the export +in "struct svc_fh" even though it also drops the reference (exp_put()). +This means that when fh_put() is called an extra reference will be dropped +which can lead to use-after-free and possible denial of service. + +Normal NFS usage will not provide a pseudo-root filehandle to a v3 +client. This bug can only be triggered by the client synthesising an +incorrect filehandle. + +To fix this we move the assignments to the svc_fh later, after all +possible error cases have been detected. + +Reported-and-tested-by: tianshuo han +Fixes: ef7f6c4904d0 ("nfsd: move V4ROOT version check to nfsd_set_fh_dentry()") +Signed-off-by: NeilBrown +Reviewed-by: Jeff Layton +Cc: stable@vger.kernel.org +Signed-off-by: Chuck Lever +Signed-off-by: Greg Kroah-Hartman +--- + fs/nfsd/nfsfh.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/fs/nfsd/nfsfh.c ++++ b/fs/nfsd/nfsfh.c +@@ -269,9 +269,6 @@ static __be32 nfsd_set_fh_dentry(struct + dentry); + } + +- fhp->fh_dentry = dentry; +- fhp->fh_export = exp; +- + switch (fhp->fh_maxsize) { + case NFS4_FHSIZE: + if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOATOMIC_ATTR) +@@ -293,6 +290,9 @@ static __be32 nfsd_set_fh_dentry(struct + goto out; + } + ++ fhp->fh_dentry = dentry; ++ fhp->fh_export = exp; ++ + return 0; + out: + exp_put(exp); diff --git a/queue-6.17/nfsd-free-copynotify-stateid-in-nfs4_free_ol_stateid.patch b/queue-6.17/nfsd-free-copynotify-stateid-in-nfs4_free_ol_stateid.patch new file mode 100644 index 0000000000..5b513402e1 --- /dev/null +++ b/queue-6.17/nfsd-free-copynotify-stateid-in-nfs4_free_ol_stateid.patch @@ -0,0 +1,85 @@ +From 4aa17144d5abc3c756883e3a010246f0dba8b468 Mon Sep 17 00:00:00 2001 +From: Olga Kornievskaia +Date: Tue, 14 Oct 2025 13:59:59 -0400 +Subject: NFSD: free copynotify stateid in nfs4_free_ol_stateid() + +From: Olga Kornievskaia + +commit 4aa17144d5abc3c756883e3a010246f0dba8b468 upstream. 
+ +Typically copynotify stateid is freed either when parent's stateid +is being close/freed or in nfsd4_laundromat if the stateid hasn't +been used in a lease period. + +However, in case when the server got an OPEN (which created +a parent stateid), followed by a COPY_NOTIFY using that stateid, +followed by a client reboot. New client instance while doing +CREATE_SESSION would force expire previous state of this client. +It leads to the open state being freed thru release_openowner-> +nfs4_free_ol_stateid() and it finds that it still has copynotify +stateid associated with it. We currently print a warning and is +triggerred + +WARNING: CPU: 1 PID: 8858 at fs/nfsd/nfs4state.c:1550 nfs4_free_ol_stateid+0xb0/0x100 [nfsd] + +This patch, instead, frees the associated copynotify stateid here. + +If the parent stateid is freed (without freeing the copynotify +stateids associated with it), it leads to the list corruption +when laundromat ends up freeing the copynotify state later. + +[ 1626.839430] Internal error: Oops - BUG: 00000000f2000800 [#1] SMP +[ 1626.842828] Modules linked in: nfnetlink_queue nfnetlink_log bluetooth cfg80211 rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd nfs_acl lockd grace nfs_localio ext4 crc16 mbcache jbd2 overlay uinput snd_seq_dummy snd_hrtimer qrtr rfkill vfat fat uvcvideo snd_hda_codec_generic videobuf2_vmalloc videobuf2_memops snd_hda_intel uvc snd_intel_dspcfg videobuf2_v4l2 videobuf2_common snd_hda_codec snd_hda_core videodev snd_hwdep snd_seq mc snd_seq_device snd_pcm snd_timer snd soundcore sg loop auth_rpcgss vsock_loopback vmw_vsock_virtio_transport_common vmw_vsock_vmci_transport vmw_vmci vsock xfs 8021q garp stp llc mrp nvme ghash_ce e1000e nvme_core sr_mod nvme_keyring nvme_auth cdrom vmwgfx drm_ttm_helper ttm sunrpc dm_mirror dm_region_hash dm_log iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi fuse dm_multipath dm_mod nfnetlink +[ 1626.855594] CPU: 2 UID: 0 PID: 199 Comm: kworker/u24:33 Kdump: loaded Tainted: G B W 6.17.0-rc7+ #22 PREEMPT(voluntary) +[ 1626.857075] Tainted: [B]=BAD_PAGE, [W]=WARN +[ 1626.857573] Hardware name: VMware, Inc. 
VMware20,1/VBSA, BIOS VMW201.00V.24006586.BA64.2406042154 06/04/2024 +[ 1626.858724] Workqueue: nfsd4 laundromat_main [nfsd] +[ 1626.859304] pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) +[ 1626.860010] pc : __list_del_entry_valid_or_report+0x148/0x200 +[ 1626.860601] lr : __list_del_entry_valid_or_report+0x148/0x200 +[ 1626.861182] sp : ffff8000881d7a40 +[ 1626.861521] x29: ffff8000881d7a40 x28: 0000000000000018 x27: ffff0000c2a98200 +[ 1626.862260] x26: 0000000000000600 x25: 0000000000000000 x24: ffff8000881d7b20 +[ 1626.862986] x23: ffff0000c2a981e8 x22: 1fffe00012410e7d x21: ffff0000920873e8 +[ 1626.863701] x20: ffff0000920873e8 x19: ffff000086f22998 x18: 0000000000000000 +[ 1626.864421] x17: 20747562202c3839 x16: 3932326636383030 x15: 3030666666662065 +[ 1626.865092] x14: 6220646c756f6873 x13: 0000000000000001 x12: ffff60004fd9e4a3 +[ 1626.865713] x11: 1fffe0004fd9e4a2 x10: ffff60004fd9e4a2 x9 : dfff800000000000 +[ 1626.866320] x8 : 00009fffb0261b5e x7 : ffff00027ecf2513 x6 : 0000000000000001 +[ 1626.866938] x5 : ffff00027ecf2510 x4 : ffff60004fd9e4a3 x3 : 0000000000000000 +[ 1626.867553] x2 : 0000000000000000 x1 : ffff000096069640 x0 : 000000000000006d +[ 1626.868167] Call trace: +[ 1626.868382] __list_del_entry_valid_or_report+0x148/0x200 (P) +[ 1626.868876] _free_cpntf_state_locked+0xd0/0x268 [nfsd] +[ 1626.869368] nfs4_laundromat+0x6f8/0x1058 [nfsd] +[ 1626.869813] laundromat_main+0x24/0x60 [nfsd] +[ 1626.870231] process_one_work+0x584/0x1050 +[ 1626.870595] worker_thread+0x4c4/0xc60 +[ 1626.870893] kthread+0x2f8/0x398 +[ 1626.871146] ret_from_fork+0x10/0x20 +[ 1626.871422] Code: aa1303e1 aa1403e3 910e8000 97bc55d7 (d4210000) +[ 1626.871892] SMP: stopping secondary CPUs + +Reported-by: rtm@csail.mit.edu +Closes: https://lore.kernel.org/linux-nfs/d8f064c1-a26f-4eed-b4f0-1f7f608f415f@oracle.com/T/#t +Fixes: 624322f1adc5 ("NFSD add COPY_NOTIFY operation") +Cc: stable@vger.kernel.org +Signed-off-by: Olga Kornievskaia +Signed-off-by: Chuck Lever +Signed-off-by: Greg Kroah-Hartman +--- + fs/nfsd/nfs4state.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/nfsd/nfs4state.c ++++ b/fs/nfsd/nfs4state.c +@@ -1505,7 +1505,8 @@ static void nfs4_free_ol_stateid(struct + release_all_access(stp); + if (stp->st_stateowner) + nfs4_put_stateowner(stp->st_stateowner); +- WARN_ON(!list_empty(&stid->sc_cp_list)); ++ if (!list_empty(&stid->sc_cp_list)) ++ nfs4_free_cpntf_statelist(stid->sc_client->net, stid); + kmem_cache_free(stateid_slab, stid); + } + diff --git a/queue-6.17/series b/queue-6.17/series index 9a49d0e730..49ea7ee5ed 100644 --- a/queue-6.17/series +++ b/queue-6.17/series @@ -146,3 +146,28 @@ arm64-dts-rockchip-drop-reset-from-rk3576-i2c9-node.patch pwm-adp5585-correct-mismatched-pwm-chip-info.patch hid-playstation-fix-memory-leak-in-dualshock4_get_ca.patch hid-uclogic-fix-potential-memory-leak-in-error-path.patch +loongarch-kvm-restore-guest-pmu-if-it-is-enabled.patch +loongarch-kvm-add-delay-until-timer-interrupt-injected.patch +loongarch-kvm-fix-max-supported-vcpus-set-with-eiointc.patch +kvm-guest_memfd-remove-bindings-on-memslot-deletion-when-gmem-is-dying.patch +kvm-arm64-make-all-32bit-id-registers-fully-writable.patch +kvm-svm-mark-vmcb_lbr-dirty-when-msr_ia32_debugctlmsr-is-updated.patch +kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch +kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch +kvm-vmx-fix-check-for-valid-gva-on-an-ept-violation.patch +nfsd-fix-refcount-leak-in-nfsd_set_fh_dentry.patch 
+nfsd-add-missing-fattr4_word2_clone_blksize-from-supported-attributes.patch +nfsd-free-copynotify-stateid-in-nfs4_free_ol_stateid.patch +gcov-add-support-for-gcc-15.patch +ksmbd-close-accepted-socket-when-per-ip-limit-rejects-connection.patch +ksm-use-range-walk-function-to-jump-over-holes-in-scan_get_next_rmap_item.patch +kho-warn-and-fail-on-metadata-or-preserved-memory-in-scratch-area.patch +kho-increase-metadata-bitmap-size-to-page_size.patch +kho-allocate-metadata-directly-from-the-buddy-allocator.patch +kho-warn-and-exit-when-unpreserved-page-wasn-t-preserved.patch +strparser-fix-signed-unsigned-mismatch-bug.patch +dma-mapping-benchmark-restore-padding-to-ensure-uabi-remained-consistent.patch +maple_tree-fix-tracepoint-string-pointers.patch +loongarch-consolidate-early_ioremap-ioremap_prot.patch +loongarch-use-correct-accessor-to-read-fwpc-mwpc.patch +loongarch-let-pte-pmd-_modify-record-the-status-of-_page_dirty.patch diff --git a/queue-6.17/strparser-fix-signed-unsigned-mismatch-bug.patch b/queue-6.17/strparser-fix-signed-unsigned-mismatch-bug.patch new file mode 100644 index 0000000000..036a265db9 --- /dev/null +++ b/queue-6.17/strparser-fix-signed-unsigned-mismatch-bug.patch @@ -0,0 +1,47 @@ +From 4da4e4bde1c453ac5cc2dce5def81d504ae257ee Mon Sep 17 00:00:00 2001 +From: Nate Karstens +Date: Thu, 6 Nov 2025 16:28:33 -0600 +Subject: strparser: Fix signed/unsigned mismatch bug + +From: Nate Karstens + +commit 4da4e4bde1c453ac5cc2dce5def81d504ae257ee upstream. + +The `len` member of the sk_buff is an unsigned int. This is cast to +`ssize_t` (a signed type) for the first sk_buff in the comparison, +but not the second sk_buff. On 32-bit systems, this can result in +an integer underflow for certain values because unsigned arithmetic +is being used. + +This appears to be an oversight: if the intention was to use unsigned +arithmetic, then the first cast would have been omitted. The change +ensures both len values are cast to `ssize_t`. + +The underflow causes an issue with ktls when multiple TLS PDUs are +included in a single TCP segment. The mainline kernel does not use +strparser for ktls anymore, but this is still useful for other +features that still use strparser, and for backporting. + +Signed-off-by: Nate Karstens +Cc: stable@vger.kernel.org +Fixes: 43a0c6751a32 ("strparser: Stream parser for messages") +Reviewed-by: Jacob Keller +Reviewed-by: Sabrina Dubroca +Link: https://patch.msgid.link/20251106222835.1871628-1-nate.karstens@garmin.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/strparser/strparser.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/strparser/strparser.c ++++ b/net/strparser/strparser.c +@@ -238,7 +238,7 @@ static int __strp_recv(read_descriptor_t + strp_parser_err(strp, -EMSGSIZE, desc); + break; + } else if (len <= (ssize_t)head->len - +- skb->len - stm->strp.offset) { ++ (ssize_t)skb->len - stm->strp.offset) { + /* Length must be into new skb (and also + * greater than zero) + */