From: Sasha Levin Date: Fri, 26 Jun 2026 17:54:11 +0000 (-0400) Subject: Fixes for all trees X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=aa8832b481848a331779261dd63ac665aeafa38f;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for all trees Signed-off-by: Sasha Levin --- diff --git a/staging-5.15/revert-ptp-add-testptp-mask-test.patch b/staging-5.15/revert-ptp-add-testptp-mask-test.patch new file mode 100644 index 0000000000..d18d802717 --- /dev/null +++ b/staging-5.15/revert-ptp-add-testptp-mask-test.patch @@ -0,0 +1,93 @@ +From 492be12d4a21f5882cc0a4f4f87162a9ff3a6cd5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 25 Jun 2026 14:31:52 +0200 +Subject: Revert "ptp: add testptp mask test" + +From: Petr Machata + +This reverts commit 8510559c0fa1e228b18fcf77cfbcf5b970793a8a, which is +commit 26285e689c6cd2cf3849568c83b2ebe53f467143 upstream. + +The reverted commit extends the selftest to test timestamp event queue mask +manipulation in testptp. It exercises masks PTP_MASK_CLEAR_ALL and +PTP_MASK_EN_SINGLE, introduced in commit c5a445b1e934 ("ptp: support event +queue reader channel masks"), which is not on this stable branch. The test +case thus cannot be built against this tree's own UAPI headers. + +The reverted commit was introduced to resolve a missing dependency of +commit bef3a83a9a67 ("testptp: Add option to open PHC in readonly mode"), +which is 76868642e427 upstream. The only conflict between the two is the +getopt string, and there is otherwise no direct dependency between the two. + +This patch therefore reverts the cited commit, with hand-resolving the +getopt string to include 'r' (as introduced by bef3a83a9a67), but not +'F' (introduced by 8510559c0fa1). 
+ +Reported-by: Yong Wang +Signed-off-by: Petr Machata +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/ptp/testptp.c | 19 +------------------ + 1 file changed, 1 insertion(+), 18 deletions(-) + +diff --git a/tools/testing/selftests/ptp/testptp.c b/tools/testing/selftests/ptp/testptp.c +index d78d52f028ab52..84e86898f4b409 100644 +--- a/tools/testing/selftests/ptp/testptp.c ++++ b/tools/testing/selftests/ptp/testptp.c +@@ -121,7 +121,6 @@ static void usage(char *progname) + " -d name device to open\n" + " -e val read 'val' external time stamp events\n" + " -f val adjust the ptp clock frequency by 'val' ppb\n" +- " -F chan Enable single channel mask and keep device open for debugfs verification.\n" + " -g get the ptp clock time\n" + " -h prints this message\n" + " -i val index for event/trigger\n" +@@ -190,7 +189,6 @@ int main(int argc, char *argv[]) + int seconds = 0; + int readonly = 0; + int settime = 0; +- int channel = -1; + + int64_t t1, t2, tp; + int64_t interval, offset; +@@ -200,7 +198,7 @@ int main(int argc, char *argv[]) + + progname = strrchr(argv[0], '/'); + progname = progname ? 1+progname : argv[0]; +- while (EOF != (c = getopt(argc, argv, "cd:e:f:F:ghH:i:k:lL:n:o:p:P:rsSt:T:w:x:Xz"))) { ++ while (EOF != (c = getopt(argc, argv, "cd:e:f:ghH:i:k:lL:n:o:p:P:rsSt:T:w:x:Xz"))) { + switch (c) { + case 'c': + capabilities = 1; +@@ -214,9 +212,6 @@ int main(int argc, char *argv[]) + case 'f': + adjfreq = atoi(optarg); + break; +- case 'F': +- channel = atoi(optarg); +- break; + case 'g': + gettime = 1; + break; +@@ -616,18 +611,6 @@ int main(int argc, char *argv[]) + free(xts); + } + +- if (channel >= 0) { +- if (ioctl(fd, PTP_MASK_CLEAR_ALL)) { +- perror("PTP_MASK_CLEAR_ALL"); +- } else if (ioctl(fd, PTP_MASK_EN_SINGLE, (unsigned int *)&channel)) { +- perror("PTP_MASK_EN_SINGLE"); +- } else { +- printf("Channel %d exclusively enabled. 
Check on debugfs.\n", channel); +- printf("Press any key to continue\n."); +- getchar(); +- } +- } +- + close(fd); + return 0; + } +-- +2.53.0 + diff --git a/staging-5.15/revert-selftest-ptp-update-ptp-selftest-to-exercise-.patch b/staging-5.15/revert-selftest-ptp-update-ptp-selftest-to-exercise-.patch new file mode 100644 index 0000000000..e0b64b72ff --- /dev/null +++ b/staging-5.15/revert-selftest-ptp-update-ptp-selftest-to-exercise-.patch @@ -0,0 +1,147 @@ +From fc21c6a729f8798693461df430e637220889e2a6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 25 Jun 2026 14:31:51 +0200 +Subject: Revert "selftest/ptp: update ptp selftest to exercise the gettimex + options" + +From: Petr Machata + +This reverts commit 6b32d042aa8255e964ebed860e24adccb204fcbc, which is +commit 3d07b691ee707c00afaf365440975e81bb96cd9b upstream. + +The cited commit allows testptp to set a configurable clock_id. That is +done via a PTP_SYS_OFFSET_EXTENDED ioctl call, whose argument is struct +ptp_sys_offset_extended, where the clock_id is set. However, this Linux +version does not support the ptp_sys_offset_extended.clockid field, and +the test case cannot be built against this tree's own UAPI headers. + +The reverted commit was introduced to resolve a missing dependency of +commit bef3a83a9a67 ("testptp: Add option to open PHC in readonly mode"), +which is 76868642e427 upstream. My suspicion is that the only conflict +between the two is the getopt string, and there is otherwise no direct +dependency between the two. + +This patch therefore reverts the cited commit, with hand-resolving the +getopt string to include 'r' (as introduced by bef3a83a9a67), but not +'y' (introduced by 6b32d042aa82). 
+ +Reported-by: Yong Wang +Signed-off-by: Petr Machata +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/ptp/testptp.c | 62 +++------------------------ + 1 file changed, 5 insertions(+), 57 deletions(-) + +diff --git a/tools/testing/selftests/ptp/testptp.c b/tools/testing/selftests/ptp/testptp.c +index 89b4f43a7ba459..d78d52f028ab52 100644 +--- a/tools/testing/selftests/ptp/testptp.c ++++ b/tools/testing/selftests/ptp/testptp.c +@@ -147,7 +147,6 @@ static void usage(char *progname) + " -T val set the ptp clock time to 'val' seconds\n" + " -x val get an extended ptp clock time with the desired number of samples (up to %d)\n" + " -X get a ptp clock cross timestamp\n" +- " -y val pre/post tstamp timebase to use {realtime|monotonic|monotonic-raw}\n" + " -z test combinations of rising/falling external time stamp flags\n", + progname, PTP_MAX_SAMPLES); + } +@@ -192,7 +191,6 @@ int main(int argc, char *argv[]) + int readonly = 0; + int settime = 0; + int channel = -1; +- clockid_t ext_clockid = CLOCK_REALTIME; + + int64_t t1, t2, tp; + int64_t interval, offset; +@@ -202,7 +200,7 @@ int main(int argc, char *argv[]) + + progname = strrchr(argv[0], '/'); + progname = progname ? 
1+progname : argv[0]; +- while (EOF != (c = getopt(argc, argv, "cd:e:f:F:ghH:i:k:lL:n:o:p:P:rsSt:T:w:x:Xy:z"))) { ++ while (EOF != (c = getopt(argc, argv, "cd:e:f:F:ghH:i:k:lL:n:o:p:P:rsSt:T:w:x:Xz"))) { + switch (c) { + case 'c': + capabilities = 1; +@@ -285,21 +283,6 @@ int main(int argc, char *argv[]) + case 'X': + getcross = 1; + break; +- case 'y': +- if (!strcasecmp(optarg, "realtime")) +- ext_clockid = CLOCK_REALTIME; +- else if (!strcasecmp(optarg, "monotonic")) +- ext_clockid = CLOCK_MONOTONIC; +- else if (!strcasecmp(optarg, "monotonic-raw")) +- ext_clockid = CLOCK_MONOTONIC_RAW; +- else { +- fprintf(stderr, +- "type needs to be realtime, monotonic or monotonic-raw; was given %s\n", +- optarg); +- return -1; +- } +- break; +- + case 'z': + flagtest = 1; + break; +@@ -590,7 +573,6 @@ int main(int argc, char *argv[]) + } + + soe->n_samples = getextended; +- soe->clockid = ext_clockid; + + if (ioctl(fd, PTP_SYS_OFFSET_EXTENDED, soe)) { + perror("PTP_SYS_OFFSET_EXTENDED"); +@@ -599,46 +581,12 @@ int main(int argc, char *argv[]) + getextended); + + for (i = 0; i < getextended; i++) { +- switch (ext_clockid) { +- case CLOCK_REALTIME: +- printf("sample #%2d: real time before: %lld.%09u\n", +- i, soe->ts[i][0].sec, +- soe->ts[i][0].nsec); +- break; +- case CLOCK_MONOTONIC: +- printf("sample #%2d: monotonic time before: %lld.%09u\n", +- i, soe->ts[i][0].sec, +- soe->ts[i][0].nsec); +- break; +- case CLOCK_MONOTONIC_RAW: +- printf("sample #%2d: monotonic-raw time before: %lld.%09u\n", +- i, soe->ts[i][0].sec, +- soe->ts[i][0].nsec); +- break; +- default: +- break; +- } ++ printf("sample #%2d: system time before: %lld.%09u\n", ++ i, soe->ts[i][0].sec, soe->ts[i][0].nsec); + printf(" phc time: %lld.%09u\n", + soe->ts[i][1].sec, soe->ts[i][1].nsec); +- switch (ext_clockid) { +- case CLOCK_REALTIME: +- printf(" real time after: %lld.%09u\n", +- soe->ts[i][2].sec, +- soe->ts[i][2].nsec); +- break; +- case CLOCK_MONOTONIC: +- printf(" monotonic time after: %lld.%09u\n", 
+- soe->ts[i][2].sec, +- soe->ts[i][2].nsec); +- break; +- case CLOCK_MONOTONIC_RAW: +- printf(" monotonic-raw time after: %lld.%09u\n", +- soe->ts[i][2].sec, +- soe->ts[i][2].nsec); +- break; +- default: +- break; +- } ++ printf(" system time after: %lld.%09u\n", ++ soe->ts[i][2].sec, soe->ts[i][2].nsec); + } + } + +-- +2.53.0 + diff --git a/staging-5.15/series b/staging-5.15/series new file mode 100644 index 0000000000..d7c0dfd243 --- /dev/null +++ b/staging-5.15/series @@ -0,0 +1,2 @@ +revert-selftest-ptp-update-ptp-selftest-to-exercise-.patch +revert-ptp-add-testptp-mask-test.patch diff --git a/staging-6.1/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch b/staging-6.1/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch new file mode 100644 index 0000000000..ef50850447 --- /dev/null +++ b/staging-6.1/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch @@ -0,0 +1,71 @@ +From 820505951b2c8f7274028e90218cc6f94419f324 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 13:24:36 +0200 +Subject: KVM: x86: Fix shadow paging use-after-free due to unexpected role + +From: Paolo Bonzini + +commit 81ccda30b4e83d8f5cc4fd50503c44e3a33abfeb upstream. + +Commit 0cb2af2ea66ad ("KVM: x86: Fix shadow paging use-after-free due +to unexpected GFN") fixed a shadow paging mismatch between stored and +computed GFNs; the bug could be triggered by changing a PDE mapping from +outside the guest, and then deleting a memslot. The rmap_remove() +call would miss entries created after the PDE change because the GFN +of the leaf SPTE does not match the GFN of the struct kvm_mmu_page. + +A similar hole however remains if the modified PDE points to a non-leaf +page. In this case the gfn can be made to match, but the role does not +match: the original large 2MB page creates a kvm_mmu_page with direct=1, +while the new 4KB needs a kvm_mmu_page with direct=0. However, +kvm_mmu_get_child_sp() does not compare the role, and therefore reuses +the page. 
+ +The next step is installing a leaf (4KB) SPTE on the new path which +records an rmap entry under the gfn resolved by the walk. But when +that child is zapped its parent kvm_mmu_page has direct=1 and +kvm_mmu_page_get_gfn() computes the gfn for the 4KB page as +sp->gfn + index instead of using sp->shadowed_translation[] (or sp->gfns[] +in older kernels). It therefore fails to remove the recorded entry. + +When the memslot is dropped the shadow page is freed but the rmap +entry survives, as in the scenario that was already fixed. Code that +later walks that gfn (dirty logging, MMU notifier invalidation, and +so on) dereferences an sptep that lies in the freed page, causing the +use-after-free. + +Fixes: 2032a93d66fa ("KVM: MMU: Don't allocate gfns page for direct mmu pages") +Reported-by: Hyunwoo Kim +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/mmu.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index 58d67e5ab2c583..9edfc812423766 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -2239,13 +2239,15 @@ static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, + u64 *sptep, gfn_t gfn, + bool direct, unsigned int access) + { +- union kvm_mmu_page_role role; ++ union kvm_mmu_page_role role = kvm_mmu_child_role(sptep, direct, access); + +- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep) && +- spte_to_child_sp(*sptep) && spte_to_child_sp(*sptep)->gfn == gfn) ++ if (is_shadow_present_pte(*sptep) && ++ !is_large_pte(*sptep) && ++ spte_to_child_sp(*sptep) && ++ spte_to_child_sp(*sptep)->gfn == gfn && ++ spte_to_child_sp(*sptep)->role.word == role.word) + return ERR_PTR(-EEXIST); + +- role = kvm_mmu_child_role(sptep, direct, access); + return kvm_mmu_get_shadow_page(vcpu, gfn, role); + } + +-- +2.53.0 + diff --git a/staging-6.1/kvm-x86-mmu-ensure-hugepage-is-in-by-slot-before-che.patch 
b/staging-6.1/kvm-x86-mmu-ensure-hugepage-is-in-by-slot-before-che.patch new file mode 100644 index 0000000000..1bfffceb2b --- /dev/null +++ b/staging-6.1/kvm-x86-mmu-ensure-hugepage-is-in-by-slot-before-che.patch @@ -0,0 +1,136 @@ +From 521ac6fe6afdb644708691075886d39b2211c6a7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 13:24:37 +0200 +Subject: KVM: x86/mmu: Ensure hugepage is in by slot before checking max + mapping level + +From: Sean Christopherson + +commit ef057cbf825e03b63f6edf5980f96abf3c53089d upstream. + +When recovering hugepages in the shadow MMU, verify that the base gfn of +the shadow page is actually contained within the target memslot, *before* +querying the max mapping level given the shadow page's gfn. Failure to +pre-check the validity of the gfn can lead to an out-of-bounds access to +the slot's lpage_info (which typically manifests as a host #PF because the +lpage_info is vmalloc'd) if the guest creates a hugepage mapping (in its +PTEs) that extends "below" the bounds of a memslot. + +When faulting in memory for a guest, and the size of the guest mapping is +greater than KVM's (current) max mapping, then KVM will create a "direct" +shadow page (direct in that there are no gPTEs to shadow, and so the target +gfn is a direct calculation given the base gfn of the shadow page). The +hugepage recovery flow looks for such direct shadow pages, as forcing 4KiB +mappings when dirty logging generates the guest > host mapping size case. +When the 4KiB restriction is lifted, then KVM can replace the shadow page +with a hugepage. + +But if KVM originally used a smaller mapping than the guest because the +range of memory covered by the guest hugepage exceeds the bounds of a +memslot, then KVM will link a direct shadow page with a gfn that is outside +the bounds of the memslot being used to fault in memory. 
The rmap entry +added for the leaf mapping is correct and within bounds, but the gfn of the +leaf SPTE's parent shadow page will be out of bounds. + + BUG: unable to handle page fault for address: ffffc90000806ffc + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + PGD 100000067 P4D 100000067 PUD 1002a7067 PMD 10612f067 PTE 0 + Oops: Oops: 0000 [#1] SMP + CPU: 13 UID: 1000 PID: 757 Comm: mmu_stress_test Not tainted 7.1.0-rc1-48ce1e26eace-x86_pir_to_irr_comments-vm #341 PREEMPT + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 + RIP: 0010:kvm_mmu_max_mapping_level+0x79/0x2b0 [kvm] + Call Trace: + + kvm_mmu_recover_huge_pages+0x21b/0x320 [kvm] + kvm_set_memslot+0x1ee/0x590 [kvm] + kvm_set_memory_region.part.0+0x3a1/0x4d0 [kvm] + kvm_vm_ioctl+0x9bf/0x15d0 [kvm] + __x64_sys_ioctl+0x8a/0xd0 + do_syscall_64+0xb7/0xbb0 + entry_SYSCALL_64_after_hwframe+0x4b/0x53 + RIP: 0033:0x7f21c0f1a9bf + + +Don't bother pre-checking the bounds of the potential hugepage, i.e. don't +check that e.g. sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level + 1) is also +within the memslot, as the checks performed by kvm_mmu_max_mapping_level() +are a superset of the basic bounds checks. I.e. pre-checking the full +range would be a dubious micro-optimization. 
+ +Fixes: 9eba50f8d7fc ("KVM: x86/mmu: Consult max mapping level when zapping collapsible SPTEs") +Cc: stable@vger.kernel.org +Cc: David Matlack +Cc: James Houghton +Cc: Alexander Bulekov +Cc: Fred Griffoul +Cc: Alexander Graf +Cc: David Woodhouse +Cc: Filippo Sironi +Cc: Ivan Orlov +Signed-off-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/mmu.c | 18 ++++++++++++------ + include/linux/kvm_host.h | 7 ++++++- + 2 files changed, 18 insertions(+), 7 deletions(-) + +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index 9edfc812423766..7785da8f0ad339 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -6417,13 +6417,19 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, + sp = sptep_to_sp(sptep); + + /* +- * We cannot do huge page mapping for indirect shadow pages, +- * which are found on the last rmap (level = 1) when not using +- * tdp; such shadow pages are synced with the page table in +- * the guest, and the guest page table is using 4K page size +- * mapping if the indirect sp has level = 1. ++ * Direct shadow page can be replaced by a hugepage if the host ++ * mapping level allows it and the memslot maps all of the host ++ * hugepage. Note! If the memslot maps only part of the ++ * hugepage, sp->gfn may be below slot->base_gfn, and querying ++ * the max mapping level would cause an out-of-bounds lpage_info ++ * access. So the gfn bounds check *must* be done first. ++ * ++ * Indirect shadow pages are created when the guest page tables ++ * are using 4K pages. Since the host mapping is always ++ * constrained by the page size in the guest, indirect shadow ++ * pages are never collapsible. 
+ */ +- if (sp->role.direct && ++ if (sp->role.direct && is_gfn_in_memslot(slot, sp->gfn) && + sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, + PG_LEVEL_NUM)) { + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 04fac4cdcfbcef..624380c4fef809 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -1621,6 +1621,11 @@ int kvm_request_irq_source_id(struct kvm *kvm); + void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); + bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); + ++static inline bool is_gfn_in_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) ++{ ++ return gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages; ++} ++ + /* + * Returns a pointer to the memslot if it contains gfn. + * Otherwise returns NULL. +@@ -1631,7 +1636,7 @@ try_get_memslot(struct kvm_memory_slot *slot, gfn_t gfn) + if (!slot) + return NULL; + +- if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages) ++ if (is_gfn_in_memslot(slot, gfn)) + return slot; + else + return NULL; +-- +2.53.0 + diff --git a/staging-6.1/mm-mglru-skip-special-vmas-in-lru_gen_look_around.patch b/staging-6.1/mm-mglru-skip-special-vmas-in-lru_gen_look_around.patch new file mode 100644 index 0000000000..888089a4b9 --- /dev/null +++ b/staging-6.1/mm-mglru-skip-special-vmas-in-lru_gen_look_around.patch @@ -0,0 +1,80 @@ +From dfb507cfad4d73f87e54272ee099e1b9c8fd6f03 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 25 Jun 2026 23:27:51 +0200 +Subject: mm/mglru: skip special VMAs in lru_gen_look_around() + +From: Yu Zhao + +[ Upstream commit c28ac3c7eb945fee6e20f47d576af68fdff1392a ] + +Special VMAs like VM_PFNMAP can contain anon pages from COW. There isn't +much profit in doing lookaround on them. Besides, they can trigger the +pte_special() warning in get_pte_pfn(). + +Skip them in lru_gen_look_around(). 
+ +Link: https://lkml.kernel.org/r/20231223045647.1566043-1-yuzhao@google.com +Fixes: 018ee47f1489 ("mm: multi-gen LRU: exploit locality in rmap") +Signed-off-by: Yu Zhao +Reported-by: syzbot+03fd9b3f71641f0ebf2d@syzkaller.appspotmail.com +Closes: https://lore.kernel.org/000000000000f9ff00060d14c256@google.com/ +Cc: +Signed-off-by: Andrew Morton +[fix conflicts with variable declarations and vma pointer usage] +Signed-off-by: Jakov Novak +Signed-off-by: Sasha Levin +--- + mm/vmscan.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 1f7a90ecc7007d..f6f8c18dc45f57 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -4622,6 +4622,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + struct lru_gen_mm_walk *walk; + int young = 0; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; ++ struct vm_area_struct *vma = pvmw->vma; + struct folio *folio = pfn_folio(pvmw->pfn); + struct mem_cgroup *memcg = folio_memcg(folio); + struct pglist_data *pgdat = folio_pgdat(folio); +@@ -4635,11 +4636,15 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + if (spin_is_contended(pvmw->ptl)) + return; + ++ /* exclude special VMAs containing anon pages from COW */ ++ if (vma->vm_flags & VM_SPECIAL) ++ return; ++ + /* avoid taking the LRU lock under the PTL when possible */ + walk = current->reclaim_state ? 
current->reclaim_state->mm_walk : NULL; + +- start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); +- end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; ++ start = max(pvmw->address & PMD_MASK, vma->vm_start); ++ end = min(pvmw->address | ~PMD_MASK, vma->vm_end - 1) + 1; + + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { + if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) +@@ -4660,7 +4665,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn; + +- pfn = get_pte_pfn(pte[i], pvmw->vma, addr); ++ pfn = get_pte_pfn(pte[i], vma, addr); + if (pfn == -1) + continue; + +@@ -4671,7 +4676,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + if (!folio) + continue; + +- if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) ++ if (!ptep_test_and_clear_young(vma, addr, pte + i)) + VM_WARN_ON_ONCE(true); + + young++; +-- +2.53.0 + diff --git a/staging-6.1/perf-bench-avoid-ndebug-warning.patch b/staging-6.1/perf-bench-avoid-ndebug-warning.patch new file mode 100644 index 0000000000..0b252e8313 --- /dev/null +++ b/staging-6.1/perf-bench-avoid-ndebug-warning.patch @@ -0,0 +1,69 @@ +From 217f6fbbb3c73e8855307cf712ab66b736fac020 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 25 Jun 2026 13:32:21 +0000 +Subject: perf bench: Avoid NDEBUG warning + +From: Ian Rogers + +[ Upstream commit d1babea9c38282b58a6f822ab95027cba3165a42 ] + +With NDEBUG set the asserts are compiled out. This yields +"unused-but-set-variable" variables. Move these variables behind +NDEBUG to avoid the warning. 
+ +Signed-off-by: Ian Rogers +Cc: Adrian Hunter +Cc: Alexander Shishkin +Cc: Ingo Molnar +Cc: Jiri Olsa +Cc: Mark Rutland +Cc: Namhyung Kim +Cc: Paolo Bonzini +Cc: Peter Zijlstra +Cc: Sean Christopherson +Link: https://lore.kernel.org/r/20230330183827.1412303-1-irogers@google.com +Signed-off-by: Arnaldo Carvalho de Melo +Stable-dep-of: 616b14b47a86 ("perf build: Conditionally define NDEBUG") +Signed-off-by: Simon Liebold +Signed-off-by: Sasha Levin +--- + tools/perf/bench/find-bit-bench.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/tools/perf/bench/find-bit-bench.c b/tools/perf/bench/find-bit-bench.c +index 22b5cfe9702370..80f051f9c20fd9 100644 +--- a/tools/perf/bench/find-bit-bench.c ++++ b/tools/perf/bench/find-bit-bench.c +@@ -61,7 +61,6 @@ static int do_for_each_set_bit(unsigned int num_bits) + double time_average, time_stddev; + unsigned int bit, i, j; + unsigned int set_bits, skip; +- unsigned int old; + + init_stats(&fb_time_stats); + init_stats(&tb_time_stats); +@@ -73,7 +72,10 @@ static int do_for_each_set_bit(unsigned int num_bits) + set_bit(i, to_test); + + for (i = 0; i < outer_iterations; i++) { +- old = accumulator; ++#ifndef NDEBUG ++ unsigned int old = accumulator; ++#endif ++ + gettimeofday(&start, NULL); + for (j = 0; j < inner_iterations; j++) { + for_each_set_bit(bit, to_test, num_bits) +@@ -85,7 +87,9 @@ static int do_for_each_set_bit(unsigned int num_bits) + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&fb_time_stats, runtime_us); + ++#ifndef NDEBUG + old = accumulator; ++#endif + gettimeofday(&start, NULL); + for (j = 0; j < inner_iterations; j++) { + for (bit = 0; bit < num_bits; bit++) { +-- +2.53.0 + diff --git a/staging-6.1/perf-block-range-move-debug-code-behind-ifndef-ndebu.patch b/staging-6.1/perf-block-range-move-debug-code-behind-ifndef-ndebu.patch new file mode 100644 index 0000000000..e899b34aac --- /dev/null +++ 
b/staging-6.1/perf-block-range-move-debug-code-behind-ifndef-ndebu.patch @@ -0,0 +1,50 @@ +From cab8b53bc49101e02d2ad8320e9a1828d9ae2624 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 25 Jun 2026 13:32:22 +0000 +Subject: perf block-range: Move debug code behind ifndef NDEBUG + +From: Ian Rogers + +[ Upstream commit 984a785f25e5b5db5fa673130b60dca6ca794406 ] + +Make good on a comment and avoid an unused-but-set-variable warning. + +Signed-off-by: Ian Rogers +Cc: Adrian Hunter +Cc: Alexander Shishkin +Cc: Ingo Molnar +Cc: Jiri Olsa +Cc: Mark Rutland +Cc: Namhyung Kim +Cc: Paolo Bonzini +Cc: Peter Zijlstra +Cc: Sean Christopherson +Link: https://lore.kernel.org/r/20230330183827.1412303-1-irogers@google.com +Signed-off-by: Arnaldo Carvalho de Melo +Stable-dep-of: 616b14b47a86 ("perf build: Conditionally define NDEBUG") +Signed-off-by: Simon Liebold +Signed-off-by: Sasha Levin +--- + tools/perf/util/block-range.c | 6 +----- + 1 file changed, 1 insertion(+), 5 deletions(-) + +diff --git a/tools/perf/util/block-range.c b/tools/perf/util/block-range.c +index 1be43265750137..680e92774d0cde 100644 +--- a/tools/perf/util/block-range.c ++++ b/tools/perf/util/block-range.c +@@ -11,11 +11,7 @@ struct { + + static void block_range__debug(void) + { +- /* +- * XXX still paranoid for now; see if we can make this depend on +- * DEBUG=1 builds. 
+- */ +-#if 1 ++#ifndef NDEBUG + struct rb_node *rb; + u64 old = 0; /* NULL isn't executable */ + +-- +2.53.0 + diff --git a/staging-6.1/revert-ptp-add-testptp-mask-test.patch b/staging-6.1/revert-ptp-add-testptp-mask-test.patch new file mode 100644 index 0000000000..5c76729525 --- /dev/null +++ b/staging-6.1/revert-ptp-add-testptp-mask-test.patch @@ -0,0 +1,93 @@ +From 6ac4e808e2168de2361ad605d0e16df87c2fb2d8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 25 Jun 2026 14:31:21 +0200 +Subject: Revert "ptp: add testptp mask test" + +From: Petr Machata + +This reverts commit c1c50689799d0343598ab6ccb7209819bcef248d, which is +commit 26285e689c6cd2cf3849568c83b2ebe53f467143 upstream. + +The reverted commit extends the selftest to test timestamp event queue mask +manipulation in testptp. It exercises masks PTP_MASK_CLEAR_ALL and +PTP_MASK_EN_SINGLE, introduced in commit c5a445b1e934 ("ptp: support event +queue reader channel masks"), which is not on this stable branch. The test +case thus cannot be built against this tree's own UAPI headers. + +The reverted commit was introduced to resolve a missing dependency of +commit c6dc458227a3 ("testptp: Add option to open PHC in readonly mode"), +which is 76868642e427 upstream. The only conflict between the two is the +getopt string, and there is otherwise no direct dependency between the two. + +This patch therefore reverts the cited commit, with hand-resolving the +getopt string to include 'r' (as introduced by c6dc458227a3), but not +'F' (introduced by c1c50689799d). 
+ +Reported-by: Yong Wang +Signed-off-by: Petr Machata +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/ptp/testptp.c | 19 +------------------ + 1 file changed, 1 insertion(+), 18 deletions(-) + +diff --git a/tools/testing/selftests/ptp/testptp.c b/tools/testing/selftests/ptp/testptp.c +index 7030bae8e5e07e..14b975594c88e7 100644 +--- a/tools/testing/selftests/ptp/testptp.c ++++ b/tools/testing/selftests/ptp/testptp.c +@@ -121,7 +121,6 @@ static void usage(char *progname) + " -d name device to open\n" + " -e val read 'val' external time stamp events\n" + " -f val adjust the ptp clock frequency by 'val' ppb\n" +- " -F chan Enable single channel mask and keep device open for debugfs verification.\n" + " -g get the ptp clock time\n" + " -h prints this message\n" + " -i val index for event/trigger\n" +@@ -190,7 +189,6 @@ int main(int argc, char *argv[]) + int seconds = 0; + int readonly = 0; + int settime = 0; +- int channel = -1; + + int64_t t1, t2, tp; + int64_t interval, offset; +@@ -200,7 +198,7 @@ int main(int argc, char *argv[]) + + progname = strrchr(argv[0], '/'); + progname = progname ? 1+progname : argv[0]; +- while (EOF != (c = getopt(argc, argv, "cd:e:f:F:ghH:i:k:lL:n:o:p:P:rsSt:T:w:x:Xz"))) { ++ while (EOF != (c = getopt(argc, argv, "cd:e:f:ghH:i:k:lL:n:o:p:P:rsSt:T:w:x:Xz"))) { + switch (c) { + case 'c': + capabilities = 1; +@@ -214,9 +212,6 @@ int main(int argc, char *argv[]) + case 'f': + adjfreq = atoi(optarg); + break; +- case 'F': +- channel = atoi(optarg); +- break; + case 'g': + gettime = 1; + break; +@@ -616,18 +611,6 @@ int main(int argc, char *argv[]) + free(xts); + } + +- if (channel >= 0) { +- if (ioctl(fd, PTP_MASK_CLEAR_ALL)) { +- perror("PTP_MASK_CLEAR_ALL"); +- } else if (ioctl(fd, PTP_MASK_EN_SINGLE, (unsigned int *)&channel)) { +- perror("PTP_MASK_EN_SINGLE"); +- } else { +- printf("Channel %d exclusively enabled. 
Check on debugfs.\n", channel); +- printf("Press any key to continue\n."); +- getchar(); +- } +- } +- + close(fd); + return 0; + } +-- +2.53.0 + diff --git a/staging-6.1/series b/staging-6.1/series new file mode 100644 index 0000000000..1cebfc9e2e --- /dev/null +++ b/staging-6.1/series @@ -0,0 +1,6 @@ +perf-bench-avoid-ndebug-warning.patch +perf-block-range-move-debug-code-behind-ifndef-ndebu.patch +kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch +kvm-x86-mmu-ensure-hugepage-is-in-by-slot-before-che.patch +revert-ptp-add-testptp-mask-test.patch +mm-mglru-skip-special-vmas-in-lru_gen_look_around.patch diff --git a/staging-6.12/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch b/staging-6.12/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch new file mode 100644 index 0000000000..ccf3a53145 --- /dev/null +++ b/staging-6.12/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch @@ -0,0 +1,71 @@ +From e5fafa4b1a7e592a8bcd2ff828db676f68f4dc81 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 13:24:04 +0200 +Subject: KVM: x86: Fix shadow paging use-after-free due to unexpected role + +From: Paolo Bonzini + +commit 81ccda30b4e83d8f5cc4fd50503c44e3a33abfeb upstream. + +Commit 0cb2af2ea66ad ("KVM: x86: Fix shadow paging use-after-free due +to unexpected GFN") fixed a shadow paging mismatch between stored and +computed GFNs; the bug could be triggered by changing a PDE mapping from +outside the guest, and then deleting a memslot. The rmap_remove() +call would miss entries created after the PDE change because the GFN +of the leaf SPTE does not match the GFN of the struct kvm_mmu_page. + +A similar hole however remains if the modified PDE points to a non-leaf +page. In this case the gfn can be made to match, but the role does not +match: the original large 2MB page creates a kvm_mmu_page with direct=1, +while the new 4KB needs a kvm_mmu_page with direct=0. 
However, +kvm_mmu_get_child_sp() does not compare the role, and therefore reuses +the page. + +The next step is installing a leaf (4KB) SPTE on the new path which +records an rmap entry under the gfn resolved by the walk. But when +that child is zapped its parent kvm_mmu_page has direct=1 and +kvm_mmu_page_get_gfn() computes the gfn for the 4KB page as +sp->gfn + index instead of using sp->shadowed_translation[] (or sp->gfns[] +in older kernels). It therefore fails to remove the recorded entry. + +When the memslot is dropped the shadow page is freed but the rmap +entry survives, as in the scenario that was already fixed. Code that +later walks that gfn (dirty logging, MMU notifier invalidation, and +so on) dereferences an sptep that lies in the freed page, causing the +use-after-free. + +Fixes: 2032a93d66fa ("KVM: MMU: Don't allocate gfns page for direct mmu pages") +Reported-by: Hyunwoo Kim +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/mmu.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index d288c60ae200ba..a67d013fff4d91 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -2329,13 +2329,15 @@ static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, + u64 *sptep, gfn_t gfn, + bool direct, unsigned int access) + { +- union kvm_mmu_page_role role; ++ union kvm_mmu_page_role role = kvm_mmu_child_role(sptep, direct, access); + +- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep) && +- spte_to_child_sp(*sptep) && spte_to_child_sp(*sptep)->gfn == gfn) ++ if (is_shadow_present_pte(*sptep) && ++ !is_large_pte(*sptep) && ++ spte_to_child_sp(*sptep) && ++ spte_to_child_sp(*sptep)->gfn == gfn && ++ spte_to_child_sp(*sptep)->role.word == role.word) + return ERR_PTR(-EEXIST); + +- role = kvm_mmu_child_role(sptep, direct, access); + return kvm_mmu_get_shadow_page(vcpu, gfn, role); + } + +-- +2.53.0 + diff --git 
a/staging-6.12/kvm-x86-mmu-ensure-hugepage-is-in-by-slot-before-che.patch b/staging-6.12/kvm-x86-mmu-ensure-hugepage-is-in-by-slot-before-che.patch new file mode 100644 index 0000000000..d5b0c78b0f --- /dev/null +++ b/staging-6.12/kvm-x86-mmu-ensure-hugepage-is-in-by-slot-before-che.patch @@ -0,0 +1,136 @@ +From 49df398f6a3114653555bd0876213293dc761a65 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 13:24:05 +0200 +Subject: KVM: x86/mmu: Ensure hugepage is in by slot before checking max + mapping level + +From: Sean Christopherson + +commit ef057cbf825e03b63f6edf5980f96abf3c53089d upstream. + +When recovering hugepages in the shadow MMU, verify that the base gfn of +the shadow page is actually contained within the target memslot, *before* +querying the max mapping level given the shadow page's gfn. Failure to +pre-check the validity of the gfn can lead to an out-of-bounds access to +the slot's lpage_info (which typically manifests as a host #PF because the +lpage_info is vmalloc'd) if the guest creates a hugepage mapping (in its +PTEs) that extends "below" the bounds of a memslot. + +When faulting in memory for a guest, and the size of the guest mapping is +greater than KVM's (current) max mapping, then KVM will create a "direct" +shadow page (direct in that there are no gPTEs to shadow, and so the target +gfn is a direct calculation given the base gfn of the shadow page). The +hugepage recovery flow looks for such direct shadow pages, as forcing 4KiB +mappings when dirty logging generates the guest > host mapping size case. +When the 4KiB restriction is lifted, then KVM can replace the shadow page +with a hugepage. + +But if KVM originally used a smaller mapping than the guest because the +range of memory covered by the guest hugepage exceeds the bounds of a +memslot, then KVM will link a direct shadow page with a gfn that is outside +the bounds of the memslot being used to fault in memory. 
The rmap entry +added for the leaf mapping is correct and within bounds, but the gfn of the +leaf SPTE's parent shadow page will be out of bounds. + + BUG: unable to handle page fault for address: ffffc90000806ffc + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + PGD 100000067 P4D 100000067 PUD 1002a7067 PMD 10612f067 PTE 0 + Oops: Oops: 0000 [#1] SMP + CPU: 13 UID: 1000 PID: 757 Comm: mmu_stress_test Not tainted 7.1.0-rc1-48ce1e26eace-x86_pir_to_irr_comments-vm #341 PREEMPT + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 + RIP: 0010:kvm_mmu_max_mapping_level+0x79/0x2b0 [kvm] + Call Trace: + + kvm_mmu_recover_huge_pages+0x21b/0x320 [kvm] + kvm_set_memslot+0x1ee/0x590 [kvm] + kvm_set_memory_region.part.0+0x3a1/0x4d0 [kvm] + kvm_vm_ioctl+0x9bf/0x15d0 [kvm] + __x64_sys_ioctl+0x8a/0xd0 + do_syscall_64+0xb7/0xbb0 + entry_SYSCALL_64_after_hwframe+0x4b/0x53 + RIP: 0033:0x7f21c0f1a9bf + + +Don't bother pre-checking the bounds of the potential hugepage, i.e. don't +check that e.g. sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level + 1) is also +within the memslot, as the checks performed by kvm_mmu_max_mapping_level() +are a superset of the basic bounds checks. I.e. pre-checking the full +range would be a dubious micro-optimization. 
+ +Fixes: 9eba50f8d7fc ("KVM: x86/mmu: Consult max mapping level when zapping collapsible SPTEs") +Cc: stable@vger.kernel.org +Cc: David Matlack +Cc: James Houghton +Cc: Alexander Bulekov +Cc: Fred Griffoul +Cc: Alexander Graf +Cc: David Woodhouse +Cc: Filippo Sironi +Cc: Ivan Orlov +Signed-off-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/mmu.c | 18 ++++++++++++------ + include/linux/kvm_host.h | 7 ++++++- + 2 files changed, 18 insertions(+), 7 deletions(-) + +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index a67d013fff4d91..aab26f90c28551 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -6952,13 +6952,19 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, + sp = sptep_to_sp(sptep); + + /* +- * We cannot do huge page mapping for indirect shadow pages, +- * which are found on the last rmap (level = 1) when not using +- * tdp; such shadow pages are synced with the page table in +- * the guest, and the guest page table is using 4K page size +- * mapping if the indirect sp has level = 1. ++ * Direct shadow page can be replaced by a hugepage if the host ++ * mapping level allows it and the memslot maps all of the host ++ * hugepage. Note! If the memslot maps only part of the ++ * hugepage, sp->gfn may be below slot->base_gfn, and querying ++ * the max mapping level would cause an out-of-bounds lpage_info ++ * access. So the gfn bounds check *must* be done first. ++ * ++ * Indirect shadow pages are created when the guest page tables ++ * are using 4K pages. Since the host mapping is always ++ * constrained by the page size in the guest, indirect shadow ++ * pages are never collapsible. 
+ */ +- if (sp->role.direct && ++ if (sp->role.direct && is_gfn_in_memslot(slot, sp->gfn) && + sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, + PG_LEVEL_NUM)) { + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 04b81e2166d5dc..b4235e99f0a9d0 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -1745,6 +1745,11 @@ int kvm_request_irq_source_id(struct kvm *kvm); + void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); + bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); + ++static inline bool is_gfn_in_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) ++{ ++ return gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages; ++} ++ + /* + * Returns a pointer to the memslot if it contains gfn. + * Otherwise returns NULL. +@@ -1755,7 +1760,7 @@ try_get_memslot(struct kvm_memory_slot *slot, gfn_t gfn) + if (!slot) + return NULL; + +- if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages) ++ if (is_gfn_in_memslot(slot, gfn)) + return slot; + else + return NULL; +-- +2.53.0 + diff --git a/staging-6.12/series b/staging-6.12/series new file mode 100644 index 0000000000..c98117dad8 --- /dev/null +++ b/staging-6.12/series @@ -0,0 +1,2 @@ +kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch +kvm-x86-mmu-ensure-hugepage-is-in-by-slot-before-che.patch diff --git a/staging-6.18/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch b/staging-6.18/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch new file mode 100644 index 0000000000..d9d40a5753 --- /dev/null +++ b/staging-6.18/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch @@ -0,0 +1,71 @@ +From 5172dc086d930aaf62c5b03b49199990dd1a0dcc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 13:23:15 +0200 +Subject: KVM: x86: Fix shadow paging use-after-free due to unexpected role + +From: Paolo Bonzini + +commit 
81ccda30b4e83d8f5cc4fd50503c44e3a33abfeb upstream. + +Commit 0cb2af2ea66ad ("KVM: x86: Fix shadow paging use-after-free due +to unexpected GFN") fixed a shadow paging mismatch between stored and +computed GFNs; the bug could be triggered by changing a PDE mapping from +outside the guest, and then deleting a memslot. The rmap_remove() +call would miss entries created after the PDE change because the GFN +of the leaf SPTE does not match the GFN of the struct kvm_mmu_page. + +A similar hole however remains if the modified PDE points to a non-leaf +page. In this case the gfn can be made to match, but the role does not +match: the original large 2MB page creates a kvm_mmu_page with direct=1, +while the new 4KB needs a kvm_mmu_page with direct=0. However, +kvm_mmu_get_child_sp() does not compare the role, and therefore reuses +the page. + +The next step is installing a leaf (4KB) SPTE on the new path which +records an rmap entry under the gfn resolved by the walk. But when +that child is zapped its parent kvm_mmu_page has direct=1 and +kvm_mmu_page_get_gfn() computes the gfn for the 4KB page as +sp->gfn + index instead of using sp->shadowed_translation[] (or sp->gfns[] +in older kernels). It therefore fails to remove the recorded entry. + +When the memslot is dropped the shadow page is freed but the rmap +entry survives, as in the scenario that was already fixed. Code that +later walks that gfn (dirty logging, MMU notifier invalidation, and +so on) dereferences an sptep that lies in the freed page, causing the +use-after-free. 
+ +Fixes: 2032a93d66fa ("KVM: MMU: Don't allocate gfns page for direct mmu pages") +Reported-by: Hyunwoo Kim +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/mmu.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index 0bd0cb8992c9fd..541e199feb9981 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -2453,13 +2453,15 @@ static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, + u64 *sptep, gfn_t gfn, + bool direct, unsigned int access) + { +- union kvm_mmu_page_role role; ++ union kvm_mmu_page_role role = kvm_mmu_child_role(sptep, direct, access); + +- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep) && +- spte_to_child_sp(*sptep) && spte_to_child_sp(*sptep)->gfn == gfn) ++ if (is_shadow_present_pte(*sptep) && ++ !is_large_pte(*sptep) && ++ spte_to_child_sp(*sptep) && ++ spte_to_child_sp(*sptep)->gfn == gfn && ++ spte_to_child_sp(*sptep)->role.word == role.word) + return ERR_PTR(-EEXIST); + +- role = kvm_mmu_child_role(sptep, direct, access); + return kvm_mmu_get_shadow_page(vcpu, gfn, role); + } + +-- +2.53.0 + diff --git a/staging-6.18/lsm-add-backing_file-lsm-hooks.patch b/staging-6.18/lsm-add-backing_file-lsm-hooks.patch new file mode 100644 index 0000000000..ff9eb00b1b --- /dev/null +++ b/staging-6.18/lsm-add-backing_file-lsm-hooks.patch @@ -0,0 +1,560 @@ +From d0a894862fd2a3c4c2557c6e2b341d2193d8ff9f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 15:50:34 +0800 +Subject: lsm: add backing_file LSM hooks + +From: Paul Moore + +[ Upstream commit 6af36aeb147a06dea47c49859cd6ca5659aeb987 ] + +Stacked filesystems such as overlayfs do not currently provide the +necessary mechanisms for LSMs to properly enforce access controls on the +mmap() and mprotect() operations. 
In order to resolve this gap, a LSM +security blob is being added to the backing_file struct and the following +new LSM hooks are being created: + + security_backing_file_alloc() + security_backing_file_free() + security_mmap_backing_file() + +The first two hooks are to manage the lifecycle of the LSM security blob +in the backing_file struct, while the third provides a new mmap() access +control point for the underlying backing file. It is also expected that +LSMs will likely want to update their security_file_mprotect() callback +to address issues with their mprotect() controls, but that does not +require a change to the security_file_mprotect() LSM hook. + +There are three other small changes to support these new LSM hooks: +* Pass the user file associated with a backing file down to +alloc_empty_backing_file() so it can be included in the +security_backing_file_alloc() hook. +* Add getter and setter functions for the backing_file struct LSM blob +as the backing_file struct remains private to fs/file_table.c. +* Constify the file struct field in the LSM common_audit_data struct to +better support LSMs that need to pass a const file struct pointer into +the common LSM audit code. + +Thanks to Arnd Bergmann for identifying the missing EXPORT_SYMBOL_GPL() +and supplying a fixup. + +Cc: stable@vger.kernel.org +Cc: linux-fsdevel@vger.kernel.org +Cc: linux-unionfs@vger.kernel.org +Cc: linux-erofs@lists.ozlabs.org +Reviewed-by: Amir Goldstein +Reviewed-by: Serge Hallyn +Reviewed-by: Christian Brauner +Signed-off-by: Paul Moore +[Mainline declares lsm_backing_file_cache in security/lsm.h. Linux 6.18.y +does not have security/lsm_init.c or security/lsm.h; the cache variable +is defined locally as static struct kmem_cache *lsm_backing_file_cache in +security/security.c.] 
+Signed-off-by: Cai Xinchen +Signed-off-by: Sasha Levin +--- + fs/backing-file.c | 17 ++++-- + fs/file_table.c | 27 +++++++-- + fs/fuse/passthrough.c | 2 +- + fs/internal.h | 3 +- + fs/overlayfs/dir.c | 2 +- + fs/overlayfs/file.c | 2 +- + include/linux/backing-file.h | 4 +- + include/linux/fs.h | 13 ++++ + include/linux/lsm_audit.h | 2 +- + include/linux/lsm_hook_defs.h | 5 ++ + include/linux/lsm_hooks.h | 1 + + include/linux/security.h | 22 +++++++ + security/security.c | 109 ++++++++++++++++++++++++++++++++++ + 13 files changed, 194 insertions(+), 15 deletions(-) + +diff --git a/fs/backing-file.c b/fs/backing-file.c +index 15a7f80310848d..e049a627d78fb9 100644 +--- a/fs/backing-file.c ++++ b/fs/backing-file.c +@@ -12,6 +12,7 @@ + #include + #include + #include ++#include + + #include "internal.h" + +@@ -29,14 +30,15 @@ + * returned file into a container structure that also stores the stacked + * file's path, which can be retrieved using backing_file_user_path(). + */ +-struct file *backing_file_open(const struct path *user_path, int flags, ++struct file *backing_file_open(const struct file *user_file, int flags, + const struct path *real_path, + const struct cred *cred) + { ++ const struct path *user_path = &user_file->f_path; + struct file *f; + int error; + +- f = alloc_empty_backing_file(flags, cred); ++ f = alloc_empty_backing_file(flags, cred, user_file); + if (IS_ERR(f)) + return f; + +@@ -52,15 +54,16 @@ struct file *backing_file_open(const struct path *user_path, int flags, + } + EXPORT_SYMBOL_GPL(backing_file_open); + +-struct file *backing_tmpfile_open(const struct path *user_path, int flags, ++struct file *backing_tmpfile_open(const struct file *user_file, int flags, + const struct path *real_parentpath, + umode_t mode, const struct cred *cred) + { + struct mnt_idmap *real_idmap = mnt_idmap(real_parentpath->mnt); ++ const struct path *user_path = &user_file->f_path; + struct file *f; + int error; + +- f = alloc_empty_backing_file(flags, cred); ++ f = 
alloc_empty_backing_file(flags, cred, user_file); + if (IS_ERR(f)) + return f; + +@@ -339,6 +342,12 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma, + vma_set_file(vma, file); + + old_cred = override_creds(ctx->cred); ++ ret = security_mmap_backing_file(vma, file, user_file); ++ if (ret) { ++ revert_creds(old_cred); ++ return ret; ++ } ++ + ret = vfs_mmap(vma->vm_file, vma); + revert_creds(old_cred); + +diff --git a/fs/file_table.c b/fs/file_table.c +index 762f03dcbcd778..987e01da993894 100644 +--- a/fs/file_table.c ++++ b/fs/file_table.c +@@ -50,6 +50,9 @@ struct backing_file { + struct path user_path; + freeptr_t bf_freeptr; + }; ++#ifdef CONFIG_SECURITY ++ void *security; ++#endif + }; + + #define backing_file(f) container_of(f, struct backing_file, file) +@@ -66,8 +69,21 @@ void backing_file_set_user_path(struct file *f, const struct path *path) + } + EXPORT_SYMBOL_GPL(backing_file_set_user_path); + ++#ifdef CONFIG_SECURITY ++void *backing_file_security(const struct file *f) ++{ ++ return backing_file(f)->security; ++} ++ ++void backing_file_set_security(struct file *f, void *security) ++{ ++ backing_file(f)->security = security; ++} ++#endif /* CONFIG_SECURITY */ ++ + static inline void backing_file_free(struct backing_file *ff) + { ++ security_backing_file_free(&ff->file); + path_put(&ff->user_path); + kmem_cache_free(bfilp_cachep, ff); + } +@@ -288,10 +304,12 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) + return f; + } + +-static int init_backing_file(struct backing_file *ff) ++static int init_backing_file(struct backing_file *ff, ++ const struct file *user_file) + { + memset(&ff->user_path, 0, sizeof(ff->user_path)); +- return 0; ++ backing_file_set_security(&ff->file, NULL); ++ return security_backing_file_alloc(&ff->file, user_file); + } + + /* +@@ -301,7 +319,8 @@ static int init_backing_file(struct backing_file *ff) + * This is only for kernel internal use, and the allocate file must not be + * 
installed into file tables or such. + */ +-struct file *alloc_empty_backing_file(int flags, const struct cred *cred) ++struct file *alloc_empty_backing_file(int flags, const struct cred *cred, ++ const struct file *user_file) + { + struct backing_file *ff; + int error; +@@ -318,7 +337,7 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) + + /* The f_mode flags must be set before fput(). */ + ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT; +- error = init_backing_file(ff); ++ error = init_backing_file(ff, user_file); + if (unlikely(error)) { + fput(&ff->file); + return ERR_PTR(error); +diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c +index 72de97c03d0eeb..f2d08ac2459b7e 100644 +--- a/fs/fuse/passthrough.c ++++ b/fs/fuse/passthrough.c +@@ -167,7 +167,7 @@ struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id) + goto out; + + /* Allocate backing file per fuse file to store fuse path */ +- backing_file = backing_file_open(&file->f_path, file->f_flags, ++ backing_file = backing_file_open(file, file->f_flags, + &fb->file->f_path, fb->cred); + err = PTR_ERR(backing_file); + if (IS_ERR(backing_file)) { +diff --git a/fs/internal.h b/fs/internal.h +index 9b2b4d11688023..51107fd515145b 100644 +--- a/fs/internal.h ++++ b/fs/internal.h +@@ -100,7 +100,8 @@ extern void chroot_fs_refs(const struct path *, const struct path *); + */ + struct file *alloc_empty_file(int flags, const struct cred *cred); + struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); +-struct file *alloc_empty_backing_file(int flags, const struct cred *cred); ++struct file *alloc_empty_backing_file(int flags, const struct cred *cred, ++ const struct file *user_file); + void backing_file_set_user_path(struct file *f, const struct path *path); + + static inline void file_put_write_access(struct file *file) +diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c +index a5e9ddf3023b39..e924321b64025e 100644 +--- 
a/fs/overlayfs/dir.c ++++ b/fs/overlayfs/dir.c +@@ -1355,7 +1355,7 @@ static int ovl_create_tmpfile(struct file *file, struct dentry *dentry, + } + + ovl_path_upper(dentry->d_parent, &realparentpath); +- realfile = backing_tmpfile_open(&file->f_path, flags, &realparentpath, ++ realfile = backing_tmpfile_open(file, flags, &realparentpath, + mode, current_cred()); + err = PTR_ERR_OR_ZERO(realfile); + pr_debug("tmpfile/open(%pd2, 0%o) = %i\n", realparentpath.dentry, mode, err); +diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c +index 7ab2c9daffd017..3fedfdddfa7584 100644 +--- a/fs/overlayfs/file.c ++++ b/fs/overlayfs/file.c +@@ -48,7 +48,7 @@ static struct file *ovl_open_realfile(const struct file *file, + if (!inode_owner_or_capable(real_idmap, realinode)) + flags &= ~O_NOATIME; + +- realfile = backing_file_open(file_user_path(file), ++ realfile = backing_file_open(file, + flags, realpath, current_cred()); + } + ovl_revert_creds(old_cred); +diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h +index 1476a6ed1bfd77..c939cd222730c4 100644 +--- a/include/linux/backing-file.h ++++ b/include/linux/backing-file.h +@@ -18,10 +18,10 @@ struct backing_file_ctx { + void (*end_write)(struct kiocb *iocb, ssize_t); + }; + +-struct file *backing_file_open(const struct path *user_path, int flags, ++struct file *backing_file_open(const struct file *user_file, int flags, + const struct path *real_path, + const struct cred *cred); +-struct file *backing_tmpfile_open(const struct path *user_path, int flags, ++struct file *backing_tmpfile_open(const struct file *user_file, int flags, + const struct path *real_parentpath, + umode_t mode, const struct cred *cred); + ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 014cb04eefbe6c..f3e798184a58e8 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -2890,6 +2890,19 @@ struct file *dentry_create(const struct path *path, 
int flags, umode_t mode, + const struct cred *cred); + const struct path *backing_file_user_path(const struct file *f); + ++#ifdef CONFIG_SECURITY ++void *backing_file_security(const struct file *f); ++void backing_file_set_security(struct file *f, void *security); ++#else ++static inline void *backing_file_security(const struct file *f) ++{ ++ return NULL; ++} ++static inline void backing_file_set_security(struct file *f, void *security) ++{ ++} ++#endif /* CONFIG_SECURITY */ ++ + /* + * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file + * stored in ->vm_file is a backing file whose f_inode is on the underlying +diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h +index 382c56a97bba1d..584db296e43b20 100644 +--- a/include/linux/lsm_audit.h ++++ b/include/linux/lsm_audit.h +@@ -94,7 +94,7 @@ struct common_audit_data { + #endif + char *kmod_name; + struct lsm_ioctlop_audit *op; +- struct file *file; ++ const struct file *file; + struct lsm_ibpkey_audit *ibpkey; + struct lsm_ibendport_audit *ibendport; + int reason; +diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h +index 8c42b4bde09c07..b4958167e38196 100644 +--- a/include/linux/lsm_hook_defs.h ++++ b/include/linux/lsm_hook_defs.h +@@ -191,6 +191,9 @@ LSM_HOOK(int, 0, file_permission, struct file *file, int mask) + LSM_HOOK(int, 0, file_alloc_security, struct file *file) + LSM_HOOK(void, LSM_RET_VOID, file_release, struct file *file) + LSM_HOOK(void, LSM_RET_VOID, file_free_security, struct file *file) ++LSM_HOOK(int, 0, backing_file_alloc, struct file *backing_file, ++ const struct file *user_file) ++LSM_HOOK(void, LSM_RET_VOID, backing_file_free, struct file *backing_file) + LSM_HOOK(int, 0, file_ioctl, struct file *file, unsigned int cmd, + unsigned long arg) + LSM_HOOK(int, 0, file_ioctl_compat, struct file *file, unsigned int cmd, +@@ -198,6 +201,8 @@ LSM_HOOK(int, 0, file_ioctl_compat, struct file *file, unsigned int cmd, + LSM_HOOK(int, 0, 
mmap_addr, unsigned long addr) + LSM_HOOK(int, 0, mmap_file, struct file *file, unsigned long reqprot, + unsigned long prot, unsigned long flags) ++LSM_HOOK(int, 0, mmap_backing_file, struct vm_area_struct *vma, ++ struct file *backing_file, struct file *user_file) + LSM_HOOK(int, 0, file_mprotect, struct vm_area_struct *vma, + unsigned long reqprot, unsigned long prot) + LSM_HOOK(int, 0, file_lock, struct file *file, unsigned int cmd) +diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h +index 79ec5a2bdcca7a..ea4b0f5ca7f0ff 100644 +--- a/include/linux/lsm_hooks.h ++++ b/include/linux/lsm_hooks.h +@@ -104,6 +104,7 @@ struct security_hook_list { + struct lsm_blob_sizes { + int lbs_cred; + int lbs_file; ++ int lbs_backing_file; + int lbs_ib; + int lbs_inode; + int lbs_sock; +diff --git a/include/linux/security.h b/include/linux/security.h +index b64598e5d65d75..e540253624268d 100644 +--- a/include/linux/security.h ++++ b/include/linux/security.h +@@ -473,11 +473,17 @@ int security_file_permission(struct file *file, int mask); + int security_file_alloc(struct file *file); + void security_file_release(struct file *file); + void security_file_free(struct file *file); ++int security_backing_file_alloc(struct file *backing_file, ++ const struct file *user_file); ++void security_backing_file_free(struct file *backing_file); + int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + int security_file_ioctl_compat(struct file *file, unsigned int cmd, + unsigned long arg); + int security_mmap_file(struct file *file, unsigned long prot, + unsigned long flags); ++int security_mmap_backing_file(struct vm_area_struct *vma, ++ struct file *backing_file, ++ struct file *user_file); + int security_mmap_addr(unsigned long addr); + int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, + unsigned long prot); +@@ -1142,6 +1148,15 @@ static inline void security_file_release(struct file *file) + static inline void 
security_file_free(struct file *file) + { } + ++static inline int security_backing_file_alloc(struct file *backing_file, ++ const struct file *user_file) ++{ ++ return 0; ++} ++ ++static inline void security_backing_file_free(struct file *backing_file) ++{ } ++ + static inline int security_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) + { +@@ -1161,6 +1176,13 @@ static inline int security_mmap_file(struct file *file, unsigned long prot, + return 0; + } + ++static inline int security_mmap_backing_file(struct vm_area_struct *vma, ++ struct file *backing_file, ++ struct file *user_file) ++{ ++ return 0; ++} ++ + static inline int security_mmap_addr(unsigned long addr) + { + return cap_mmap_addr(addr); +diff --git a/security/security.c b/security/security.c +index 603c3c6d5635d8..9285909908ab8a 100644 +--- a/security/security.c ++++ b/security/security.c +@@ -94,6 +94,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = { + static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain); + + static struct kmem_cache *lsm_file_cache; ++static struct kmem_cache *lsm_backing_file_cache; + static struct kmem_cache *lsm_inode_cache; + + char *lsm_names; +@@ -265,6 +266,7 @@ static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed) + + lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred); + lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file); ++ lsm_set_blob_size(&needed->lbs_backing_file, &blob_sizes.lbs_backing_file); + lsm_set_blob_size(&needed->lbs_ib, &blob_sizes.lbs_ib); + /* + * The inode blob gets an rcu_head in addition to +@@ -470,6 +472,7 @@ static void __init ordered_lsm_init(void) + + init_debug("cred blob size = %d\n", blob_sizes.lbs_cred); + init_debug("file blob size = %d\n", blob_sizes.lbs_file); ++ init_debug("lsm_backing_file_cache = %d\n", blob_sizes.lbs_backing_file); + init_debug("ib blob size = %d\n", blob_sizes.lbs_ib); + init_debug("inode blob size = %d\n", blob_sizes.lbs_inode); + 
init_debug("ipc blob size = %d\n", blob_sizes.lbs_ipc); +@@ -495,6 +498,11 @@ static void __init ordered_lsm_init(void) + lsm_file_cache = kmem_cache_create("lsm_file_cache", + blob_sizes.lbs_file, 0, + SLAB_PANIC, NULL); ++ if (blob_sizes.lbs_backing_file) ++ lsm_backing_file_cache = kmem_cache_create( ++ "lsm_backing_file_cache", ++ blob_sizes.lbs_backing_file, ++ 0, SLAB_PANIC, NULL); + if (blob_sizes.lbs_inode) + lsm_inode_cache = kmem_cache_create("lsm_inode_cache", + blob_sizes.lbs_inode, 0, +@@ -671,6 +679,30 @@ int unregister_blocking_lsm_notifier(struct notifier_block *nb) + } + EXPORT_SYMBOL(unregister_blocking_lsm_notifier); + ++/** ++ * lsm_backing_file_alloc - allocate a composite backing file blob ++ * @backing_file: the backing file ++ * ++ * Allocate the backing file blob for all the modules. ++ * ++ * Returns 0, or -ENOMEM if memory can't be allocated. ++ */ ++static int lsm_backing_file_alloc(struct file *backing_file) ++{ ++ void *blob; ++ ++ if (!lsm_backing_file_cache) { ++ backing_file_set_security(backing_file, NULL); ++ return 0; ++ } ++ ++ blob = kmem_cache_zalloc(lsm_backing_file_cache, GFP_KERNEL); ++ backing_file_set_security(backing_file, blob); ++ if (!blob) ++ return -ENOMEM; ++ return 0; ++} ++ + /** + * lsm_blob_alloc - allocate a composite blob + * @dest: the destination for the blob +@@ -2965,6 +2997,57 @@ void security_file_free(struct file *file) + } + } + ++/** ++ * security_backing_file_alloc() - Allocate and setup a backing file blob ++ * @backing_file: the backing file ++ * @user_file: the associated user visible file ++ * ++ * Allocate a backing file LSM blob and perform any necessary initialization of ++ * the LSM blob. There will be some operations where the LSM will not have ++ * access to @user_file after this point, so any important state associated ++ * with @user_file that is important to the LSM should be captured in the ++ * backing file's LSM blob. 
++ * ++ * LSM's should avoid taking a reference to @user_file in this hook as it will ++ * result in problems later when the system attempts to drop/put the file ++ * references due to a circular dependency. ++ * ++ * Return: Return 0 if the hook is successful, negative values otherwise. ++ */ ++int security_backing_file_alloc(struct file *backing_file, ++ const struct file *user_file) ++{ ++ int rc; ++ ++ rc = lsm_backing_file_alloc(backing_file); ++ if (rc) ++ return rc; ++ rc = call_int_hook(backing_file_alloc, backing_file, user_file); ++ if (unlikely(rc)) ++ security_backing_file_free(backing_file); ++ ++ return rc; ++} ++ ++/** ++ * security_backing_file_free() - Free a backing file blob ++ * @backing_file: the backing file ++ * ++ * Free any LSM state associate with a backing file's LSM blob, including the ++ * blob itself. ++ */ ++void security_backing_file_free(struct file *backing_file) ++{ ++ void *blob = backing_file_security(backing_file); ++ ++ call_void_hook(backing_file_free, backing_file); ++ ++ if (blob) { ++ backing_file_set_security(backing_file, NULL); ++ kmem_cache_free(lsm_backing_file_cache, blob); ++ } ++} ++ + /** + * security_file_ioctl() - Check if an ioctl is allowed + * @file: associated file +@@ -3053,6 +3136,32 @@ int security_mmap_file(struct file *file, unsigned long prot, + flags); + } + ++/** ++ * security_mmap_backing_file - Check if mmap'ing a backing file is allowed ++ * @vma: the vm_area_struct for the mmap'd region ++ * @backing_file: the backing file being mmap'd ++ * @user_file: the user file being mmap'd ++ * ++ * Check permissions for a mmap operation on a stacked filesystem. This hook ++ * is called after the security_mmap_file() and is responsible for authorizing ++ * the mmap on @backing_file. It is important to note that the mmap operation ++ * on @user_file has already been authorized and the @vma->vm_file has been ++ * set to @backing_file. ++ * ++ * Return: Returns 0 if permission is granted. 
++ */ ++int security_mmap_backing_file(struct vm_area_struct *vma, ++ struct file *backing_file, ++ struct file *user_file) ++{ ++ /* recommended by the stackable filesystem devs */ ++ if (WARN_ON_ONCE(!(backing_file->f_mode & FMODE_BACKING))) ++ return -EIO; ++ ++ return call_int_hook(mmap_backing_file, vma, backing_file, user_file); ++} ++EXPORT_SYMBOL_GPL(security_mmap_backing_file); ++ + /** + * security_mmap_addr() - Check if mmap'ing an address is allowed + * @addr: address +-- +2.53.0 + diff --git a/staging-6.18/selinux-fix-overlayfs-mmap-and-mprotect-access-check.patch b/staging-6.18/selinux-fix-overlayfs-mmap-and-mprotect-access-check.patch new file mode 100644 index 0000000000..162343b762 --- /dev/null +++ b/staging-6.18/selinux-fix-overlayfs-mmap-and-mprotect-access-check.patch @@ -0,0 +1,426 @@ +From 57ddf33f22f73696788287ee86f64be3332ed730 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 15:50:35 +0800 +Subject: selinux: fix overlayfs mmap() and mprotect() access checks + +From: Paul Moore + +[ Upstream commit 82544d36b1729153c8aeb179e84750f0c085d3b1 ] + +The existing SELinux security model for overlayfs is to allow access if +the current task is able to access the top level file (the "user" file) +and the mounter's credentials are sufficient to access the lower +level file (the "backing" file). Unfortunately, the current code does +not properly enforce these access controls for both mmap() and mprotect() +operations on overlayfs filesystems. + +This patch makes use of the newly created security_mmap_backing_file() +LSM hook to provide the missing backing file enforcement for mmap() +operations, and leverages the backing file API and new LSM blob to +provide the necessary information to properly enforce the mprotect() +access controls. 
+ +Cc: stable@vger.kernel.org +Acked-by: Amir Goldstein +Signed-off-by: Paul Moore +Signed-off-by: Cai Xinchen +Signed-off-by: Sasha Levin +--- + security/selinux/hooks.c | 242 ++++++++++++++++++++++-------- + security/selinux/include/objsec.h | 11 ++ + 2 files changed, 189 insertions(+), 64 deletions(-) + +diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c +index 3da3017ad2ca06..f96ee8f372e3b2 100644 +--- a/security/selinux/hooks.c ++++ b/security/selinux/hooks.c +@@ -1739,49 +1739,72 @@ static inline int file_path_has_perm(const struct cred *cred, + static int bpf_fd_pass(const struct file *file, u32 sid); + #endif + +-/* Check whether a task can use an open file descriptor to +- access an inode in a given way. Check access to the +- descriptor itself, and then use dentry_has_perm to +- check a particular permission to the file. +- Access to the descriptor is implicitly granted if it +- has the same SID as the process. If av is zero, then +- access to the file is not checked, e.g. for cases +- where only the descriptor is affected like seek. 
*/ +-static int file_has_perm(const struct cred *cred, +- struct file *file, +- u32 av) ++static int __file_has_perm(const struct cred *cred, const struct file *file, ++ u32 av, bool bf_user_file) ++ + { +- struct file_security_struct *fsec = selinux_file(file); +- struct inode *inode = file_inode(file); + struct common_audit_data ad; +- u32 sid = cred_sid(cred); ++ struct inode *inode; ++ u32 ssid = cred_sid(cred); ++ u32 tsid_fd; + int rc; + +- ad.type = LSM_AUDIT_DATA_FILE; +- ad.u.file = file; ++ if (bf_user_file) { ++ struct backing_file_security_struct *bfsec; ++ const struct path *path; + +- if (sid != fsec->sid) { +- rc = avc_has_perm(sid, fsec->sid, +- SECCLASS_FD, +- FD__USE, +- &ad); ++ if (WARN_ON(!(file->f_mode & FMODE_BACKING))) ++ return -EIO; ++ ++ bfsec = selinux_backing_file(file); ++ path = backing_file_user_path(file); ++ tsid_fd = bfsec->uf_sid; ++ inode = d_inode(path->dentry); ++ ++ ad.type = LSM_AUDIT_DATA_PATH; ++ ad.u.path = *path; ++ } else { ++ struct file_security_struct *fsec = selinux_file(file); ++ ++ tsid_fd = fsec->sid; ++ inode = file_inode(file); ++ ++ ad.type = LSM_AUDIT_DATA_FILE; ++ ad.u.file = file; ++ } ++ ++ if (ssid != tsid_fd) { ++ rc = avc_has_perm(ssid, tsid_fd, SECCLASS_FD, FD__USE, &ad); + if (rc) +- goto out; ++ return rc; + } + + #ifdef CONFIG_BPF_SYSCALL +- rc = bpf_fd_pass(file, cred_sid(cred)); ++ /* regardless of backing vs user file, use the underlying file here */ ++ rc = bpf_fd_pass(file, ssid); + if (rc) + return rc; + #endif + + /* av is zero if only checking access to the descriptor. */ +- rc = 0; + if (av) +- rc = inode_has_perm(cred, inode, av, &ad); ++ return inode_has_perm(cred, inode, av, &ad); + +-out: +- return rc; ++ return 0; ++} ++ ++/* Check whether a task can use an open file descriptor to ++ access an inode in a given way. Check access to the ++ descriptor itself, and then use dentry_has_perm to ++ check a particular permission to the file. 
++ Access to the descriptor is implicitly granted if it ++ has the same SID as the process. If av is zero, then ++ access to the file is not checked, e.g. for cases ++ where only the descriptor is affected like seek. */ ++static inline int file_has_perm(const struct cred *cred, ++ const struct file *file, u32 av) ++{ ++ return __file_has_perm(cred, file, av, false); + } + + /* +@@ -3799,6 +3822,17 @@ static int selinux_file_alloc_security(struct file *file) + return 0; + } + ++static int selinux_backing_file_alloc(struct file *backing_file, ++ const struct file *user_file) ++{ ++ struct backing_file_security_struct *bfsec; ++ ++ bfsec = selinux_backing_file(backing_file); ++ bfsec->uf_sid = selinux_file(user_file)->sid; ++ ++ return 0; ++} ++ + /* + * Check whether a task has the ioctl permission and cmd + * operation to an inode. +@@ -3916,42 +3950,55 @@ static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd, + + static int default_noexec __ro_after_init; + +-static int file_map_prot_check(struct file *file, unsigned long prot, int shared) ++static int __file_map_prot_check(const struct cred *cred, ++ const struct file *file, unsigned long prot, ++ bool shared, bool bf_user_file) + { +- const struct cred *cred = current_cred(); +- u32 sid = cred_sid(cred); +- int rc = 0; ++ struct inode *inode = NULL; ++ bool prot_exec = prot & PROT_EXEC; ++ bool prot_write = prot & PROT_WRITE; ++ ++ if (file) { ++ if (bf_user_file) ++ inode = d_inode(backing_file_user_path(file)->dentry); ++ else ++ inode = file_inode(file); ++ } ++ ++ if (default_noexec && prot_exec && ++ (!file || IS_PRIVATE(inode) || (!shared && prot_write))) { ++ int rc; ++ u32 sid = cred_sid(cred); + +- if (default_noexec && +- (prot & PROT_EXEC) && (!file || IS_PRIVATE(file_inode(file)) || +- (!shared && (prot & PROT_WRITE)))) { + /* +- * We are making executable an anonymous mapping or a +- * private file mapping that will also be writable. +- * This has an additional check. 
++ * We are making executable an anonymous mapping or a private ++ * file mapping that will also be writable. + */ +- rc = avc_has_perm(sid, sid, SECCLASS_PROCESS, +- PROCESS__EXECMEM, NULL); ++ rc = avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__EXECMEM, ++ NULL); + if (rc) +- goto error; ++ return rc; + } + + if (file) { +- /* read access is always possible with a mapping */ ++ /* "read" always possible, "write" only if shared */ + u32 av = FILE__READ; +- +- /* write access only matters if the mapping is shared */ +- if (shared && (prot & PROT_WRITE)) ++ if (shared && prot_write) + av |= FILE__WRITE; +- +- if (prot & PROT_EXEC) ++ if (prot_exec) + av |= FILE__EXECUTE; + +- return file_has_perm(cred, file, av); ++ return __file_has_perm(cred, file, av, bf_user_file); + } + +-error: +- return rc; ++ return 0; ++} ++ ++static inline int file_map_prot_check(const struct cred *cred, ++ const struct file *file, ++ unsigned long prot, bool shared) ++{ ++ return __file_map_prot_check(cred, file, prot, shared, false); + } + + static int selinux_mmap_addr(unsigned long addr) +@@ -3967,36 +4014,80 @@ static int selinux_mmap_addr(unsigned long addr) + return rc; + } + +-static int selinux_mmap_file(struct file *file, +- unsigned long reqprot __always_unused, +- unsigned long prot, unsigned long flags) ++static int selinux_mmap_file_common(const struct cred *cred, struct file *file, ++ unsigned long prot, bool shared) + { +- struct common_audit_data ad; +- int rc; +- + if (file) { ++ int rc; ++ struct common_audit_data ad; ++ + ad.type = LSM_AUDIT_DATA_FILE; + ad.u.file = file; +- rc = inode_has_perm(current_cred(), file_inode(file), +- FILE__MAP, &ad); ++ rc = inode_has_perm(cred, file_inode(file), FILE__MAP, &ad); + if (rc) + return rc; + } + +- return file_map_prot_check(file, prot, +- (flags & MAP_TYPE) == MAP_SHARED); ++ return file_map_prot_check(cred, file, prot, shared); ++} ++ ++static int selinux_mmap_file(struct file *file, ++ unsigned long reqprot 
__always_unused, ++ unsigned long prot, unsigned long flags) ++{ ++ return selinux_mmap_file_common(current_cred(), file, prot, ++ (flags & MAP_TYPE) == MAP_SHARED); ++} ++ ++/** ++ * selinux_mmap_backing_file - Check mmap permissions on a backing file ++ * @vma: memory region ++ * @backing_file: stacked filesystem backing file ++ * @user_file: user visible file ++ * ++ * This is called after selinux_mmap_file() on stacked filesystems, and it ++ * is this function's responsibility to verify access to @backing_file and ++ * setup the SELinux state for possible later use in the mprotect() code path. ++ * ++ * By the time this function is called, mmap() access to @user_file has already ++ * been authorized and @vma->vm_file has been set to point to @backing_file. ++ * ++ * Return zero on success, negative values otherwise. ++ */ ++static int selinux_mmap_backing_file(struct vm_area_struct *vma, ++ struct file *backing_file, ++ struct file *user_file __always_unused) ++{ ++ unsigned long prot = 0; ++ ++ /* translate vma->vm_flags perms into PROT perms */ ++ if (vma->vm_flags & VM_READ) ++ prot |= PROT_READ; ++ if (vma->vm_flags & VM_WRITE) ++ prot |= PROT_WRITE; ++ if (vma->vm_flags & VM_EXEC) ++ prot |= PROT_EXEC; ++ ++ return selinux_mmap_file_common(backing_file->f_cred, backing_file, ++ prot, vma->vm_flags & VM_SHARED); + } + + static int selinux_file_mprotect(struct vm_area_struct *vma, + unsigned long reqprot __always_unused, + unsigned long prot) + { ++ int rc; + const struct cred *cred = current_cred(); + u32 sid = cred_sid(cred); ++ const struct file *file = vma->vm_file; ++ bool backing_file; ++ bool shared = vma->vm_flags & VM_SHARED; ++ ++ /* check if we need to trigger the "backing files are awful" mode */ ++ backing_file = file && (file->f_mode & FMODE_BACKING); + + if (default_noexec && + (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) { +- int rc = 0; + /* + * We don't use the vma_is_initial_heap() helper as it has + * a history of problems and is 
currently broken on systems +@@ -4010,11 +4101,15 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, + vma->vm_end <= vma->vm_mm->brk) { + rc = avc_has_perm(sid, sid, SECCLASS_PROCESS, + PROCESS__EXECHEAP, NULL); +- } else if (!vma->vm_file && (vma_is_initial_stack(vma) || ++ if (rc) ++ return rc; ++ } else if (!file && (vma_is_initial_stack(vma) || + vma_is_stack_for_current(vma))) { + rc = avc_has_perm(sid, sid, SECCLASS_PROCESS, + PROCESS__EXECSTACK, NULL); +- } else if (vma->vm_file && vma->anon_vma) { ++ if (rc) ++ return rc; ++ } else if (file && vma->anon_vma) { + /* + * We are making executable a file mapping that has + * had some COW done. Since pages might have been +@@ -4022,13 +4117,29 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, + * modified content. This typically should only + * occur for text relocations. + */ +- rc = file_has_perm(cred, vma->vm_file, FILE__EXECMOD); ++ rc = __file_has_perm(cred, file, FILE__EXECMOD, ++ backing_file); ++ if (rc) ++ return rc; ++ if (backing_file) { ++ rc = file_has_perm(file->f_cred, file, ++ FILE__EXECMOD); ++ if (rc) ++ return rc; ++ } + } ++ } ++ ++ rc = __file_map_prot_check(cred, file, prot, shared, backing_file); ++ if (rc) ++ return rc; ++ if (backing_file) { ++ rc = file_map_prot_check(file->f_cred, file, prot, shared); + if (rc) + return rc; + } + +- return file_map_prot_check(vma->vm_file, prot, vma->vm_flags&VM_SHARED); ++ return 0; + } + + static int selinux_file_lock(struct file *file, unsigned int cmd) +@@ -7140,6 +7251,7 @@ struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = { + .lbs_cred = sizeof(struct cred_security_struct), + .lbs_task = sizeof(struct task_security_struct), + .lbs_file = sizeof(struct file_security_struct), ++ .lbs_backing_file = sizeof(struct backing_file_security_struct), + .lbs_inode = sizeof(struct inode_security_struct), + .lbs_ipc = sizeof(struct ipc_security_struct), + .lbs_key = sizeof(struct key_security_struct), +@@ -7363,9 +7475,11 
@@ static struct security_hook_list selinux_hooks[] __ro_after_init = { + + LSM_HOOK_INIT(file_permission, selinux_file_permission), + LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security), ++ LSM_HOOK_INIT(backing_file_alloc, selinux_backing_file_alloc), + LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl), + LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat), + LSM_HOOK_INIT(mmap_file, selinux_mmap_file), ++ LSM_HOOK_INIT(mmap_backing_file, selinux_mmap_backing_file), + LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr), + LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect), + LSM_HOOK_INIT(file_lock, selinux_file_lock), +diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h +index 816fde5a5896c1..fcb46793898f5e 100644 +--- a/security/selinux/include/objsec.h ++++ b/security/selinux/include/objsec.h +@@ -86,6 +86,10 @@ struct file_security_struct { + u32 pseqno; /* Policy seqno at the time of file open */ + }; + ++struct backing_file_security_struct { ++ u32 uf_sid; /* associated user file fsec->sid */ ++}; ++ + struct superblock_security_struct { + u32 sid; /* SID of file system superblock */ + u32 def_sid; /* default SID for labeling */ +@@ -190,6 +194,13 @@ static inline struct file_security_struct *selinux_file(const struct file *file) + return file->f_security + selinux_blob_sizes.lbs_file; + } + ++static inline struct backing_file_security_struct * ++selinux_backing_file(const struct file *backing_file) ++{ ++ void *blob = backing_file_security(backing_file); ++ return blob + selinux_blob_sizes.lbs_backing_file; ++} ++ + static inline struct inode_security_struct * + selinux_inode(const struct inode *inode) + { +-- +2.53.0 + diff --git a/staging-6.18/series b/staging-6.18/series new file mode 100644 index 0000000000..d00f57e334 --- /dev/null +++ b/staging-6.18/series @@ -0,0 +1,3 @@ +kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch +lsm-add-backing_file-lsm-hooks.patch 
+selinux-fix-overlayfs-mmap-and-mprotect-access-check.patch diff --git a/staging-6.6/eventpoll-drop-vestigial-__-prefix-from-ep_remove_-f.patch b/staging-6.6/eventpoll-drop-vestigial-__-prefix-from-ep_remove_-f.patch new file mode 100644 index 0000000000..7aaa092a20 --- /dev/null +++ b/staging-6.6/eventpoll-drop-vestigial-__-prefix-from-ep_remove_-f.patch @@ -0,0 +1,70 @@ +From 01314afe47db7d0abc07cfbc9a851f92ed98f451 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 12:14:00 +0800 +Subject: eventpoll: drop vestigial __ prefix from ep_remove_{file,epi}() + +From: Christian Brauner + +[ Upstream commit 0feaf644f7180c4a91b6b405a881afbfd958f1cf ] + +With __ep_remove() gone, the double-underscore on __ep_remove_file() +and __ep_remove_epi() no longer contrasts with a __-less parent and +just reads as noise. Rename both to ep_remove_file() and +ep_remove_epi(). No functional change. + +Signed-off-by: Christian Brauner (Amutable) +Stable-dep-of: a6dc643c6931 ("eventpoll: fix ep_remove struct eventpoll / struct file UAF") +Signed-off-by: Quentin Schulz +Signed-off-by: Sasha Levin +--- + fs/eventpoll.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/fs/eventpoll.c b/fs/eventpoll.c +index 766716c2fd92a0..0a54a42263575f 100644 +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -719,7 +719,7 @@ static void ep_free(struct eventpoll *ep) + * Called with &file->f_lock held, + * returns with it released + */ +-static void __ep_remove_file(struct eventpoll *ep, struct epitem *epi, ++static void ep_remove_file(struct eventpoll *ep, struct epitem *epi, + struct file *file) + { + struct epitems_head *to_free = NULL; +@@ -743,7 +743,7 @@ static void __ep_remove_file(struct eventpoll *ep, struct epitem *epi, + free_ephead(to_free); + } + +-static bool __ep_remove_epi(struct eventpoll *ep, struct epitem *epi) ++static bool ep_remove_epi(struct eventpoll *ep, struct epitem *epi) + { + lockdep_assert_held(&ep->mtx); + +@@ -789,9 +789,9 @@ 
static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) + spin_unlock(&file->f_lock); + return; + } +- __ep_remove_file(ep, epi, file); ++ ep_remove_file(ep, epi, file); + +- if (__ep_remove_epi(ep, epi)) ++ if (ep_remove_epi(ep, epi)) + WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); + } + +@@ -1013,8 +1013,8 @@ void eventpoll_release_file(struct file *file) + ep_unregister_pollwait(ep, epi); + + spin_lock(&file->f_lock); +- __ep_remove_file(ep, epi, file); +- dispose = __ep_remove_epi(ep, epi); ++ ep_remove_file(ep, epi, file); ++ dispose = ep_remove_epi(ep, epi); + + mutex_unlock(&ep->mtx); + +-- +2.53.0 + diff --git a/staging-6.6/eventpoll-fix-ep_remove-struct-eventpoll-struct-file.patch b/staging-6.6/eventpoll-fix-ep_remove-struct-eventpoll-struct-file.patch new file mode 100644 index 0000000000..62c162856f --- /dev/null +++ b/staging-6.6/eventpoll-fix-ep_remove-struct-eventpoll-struct-file.patch @@ -0,0 +1,101 @@ +From 23c26dd42ccf99e2d0574dfc93569898693ac979 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 12:14:03 +0800 +Subject: eventpoll: fix ep_remove struct eventpoll / struct file UAF + +From: Christian Brauner + +[ Upstream commit a6dc643c69311677c574a0f17a3f4d66a5f3744b ] + +ep_remove() (via ep_remove_file()) cleared file->f_ep under +file->f_lock but then kept using @file inside the critical section +(is_file_epoll(), hlist_del_rcu() through the head, spin_unlock). +A concurrent __fput() taking the eventpoll_release() fastpath in +that window observed the transient NULL, skipped +eventpoll_release_file() and ran to f_op->release / file_free(). + +For the epoll-watches-epoll case, f_op->release is +ep_eventpoll_release() -> ep_clear_and_put() -> ep_free(), which +kfree()s the watched struct eventpoll. Its embedded ->refs +hlist_head is exactly where epi->fllink.pprev points, so the +subsequent hlist_del_rcu()'s "*pprev = next" scribbles into freed +kmalloc-192 memory. 
+ +In addition, struct file is SLAB_TYPESAFE_BY_RCU, so the slot +backing @file could be recycled by alloc_empty_file() -- +reinitializing f_lock and f_ep -- while ep_remove() is still +nominally inside that lock. The upshot is an attacker-controllable +kmem_cache_free() against the wrong slab cache. + +Pin @file via epi_fget() at the top of ep_remove() and gate the +critical section on the pin succeeding. With the pin held @file +cannot reach refcount zero, which holds __fput() off and +transitively keeps the watched struct eventpoll alive across the +hlist_del_rcu() and the f_lock use, closing both UAFs. + +If the pin fails @file has already reached refcount zero and its +__fput() is in flight. Because we bailed before clearing f_ep, +that path takes the eventpoll_release() slow path into +eventpoll_release_file() and blocks on ep->mtx until the waiter +side's ep_clear_and_put() drops it. The bailed epi's share of +ep->refcount stays intact, so the trailing ep_refcount_dec_and_test() +in ep_clear_and_put() cannot free the eventpoll out from under +eventpoll_release_file(); the orphaned epi is then cleaned up +there. + +A successful pin also proves we are not racing +eventpoll_release_file() on this epi, so drop the now-redundant +re-check of epi->dying under f_lock. The cheap lockless +READ_ONCE(epi->dying) fast-path bailout stays. 
+ +Fixes: 58c9b016e128 ("epoll: use refcount to reduce ep_mutex contention") +Reported-by: Jaeyoung Chung +Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-6-2470f9eec0f5@kernel.org +Signed-off-by: Christian Brauner (Amutable) +(cherry picked from commit a6dc643c69311677c574a0f17a3f4d66a5f3744b) +Signed-off-by: Wentao Guan +Signed-off-by: Sasha Levin +--- + fs/eventpoll.c | 16 ++++++++++------ + 1 file changed, 10 insertions(+), 6 deletions(-) + +diff --git a/fs/eventpoll.c b/fs/eventpoll.c +index fc4668a403c9d3..0e09bddea16a5f 100644 +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -801,22 +801,26 @@ static bool ep_remove_epi(struct eventpoll *ep, struct epitem *epi) + */ + static void ep_remove(struct eventpoll *ep, struct epitem *epi) + { +- struct file *file = epi->ffd.file; ++ struct file *file __free(fput) = NULL; + + lockdep_assert_irqs_enabled(); + lockdep_assert_held(&ep->mtx); + + ep_unregister_pollwait(ep, epi); + +- /* sync with eventpoll_release_file() */ ++ /* cheap sync with eventpoll_release_file() */ + if (unlikely(READ_ONCE(epi->dying))) + return; + +- spin_lock(&file->f_lock); +- if (epi->dying) { +- spin_unlock(&file->f_lock); ++ /* ++ * If we manage to grab a reference it means we're not in ++ * eventpoll_release_file() and aren't going to be. 
++ */ ++ file = epi_fget(epi); ++ if (!file) + return; +- } ++ ++ spin_lock(&file->f_lock); + ep_remove_file(ep, epi, file); + + if (ep_remove_epi(ep, epi)) +-- +2.53.0 + diff --git a/staging-6.6/eventpoll-kill-__ep_remove.patch b/staging-6.6/eventpoll-kill-__ep_remove.patch new file mode 100644 index 0000000000..481c41c995 --- /dev/null +++ b/staging-6.6/eventpoll-kill-__ep_remove.patch @@ -0,0 +1,133 @@ +From 703f69e2424badc5012bd77ea32f68a917a59063 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 12:13:59 +0800 +Subject: eventpoll: kill __ep_remove() + +From: Christian Brauner + +[ Upstream commit e9e5cd40d7c403e19f21d0f7b8b8ba3a76b58330 ] + +Remove the boolean conditional in __ep_remove() and restructure the code +so the check for racing with eventpoll_release_file() are only done in +the ep_remove_safe() path where they belong. + +Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-3-2470f9eec0f5@kernel.org +Signed-off-by: Christian Brauner (Amutable) +Stable-dep-of: a6dc643c6931 ("eventpoll: fix ep_remove struct eventpoll / struct file UAF") +Signed-off-by: Quentin Schulz +Signed-off-by: Sasha Levin +--- + fs/eventpoll.c | 67 ++++++++++++++++++++++---------------------------- + 1 file changed, 30 insertions(+), 37 deletions(-) + +diff --git a/fs/eventpoll.c b/fs/eventpoll.c +index ae9cb82764482c..766716c2fd92a0 100644 +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -715,49 +715,18 @@ static void ep_free(struct eventpoll *ep) + kfree_rcu(ep, rcu); + } + +-static void __ep_remove_file(struct eventpoll *ep, struct epitem *epi, struct file *file); +-static bool __ep_remove_epi(struct eventpoll *ep, struct epitem *epi); +- +-/* +- * Removes a "struct epitem" from the eventpoll RB tree and deallocates +- * all the associated resources. Must be called with "mtx" held. +- * If the dying flag is set, do the removal only if force is true. 
+- * This prevents ep_clear_and_put() from dropping all the ep references +- * while running concurrently with eventpoll_release_file(). +- * Returns true if the eventpoll can be disposed. +- */ +-static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) +-{ +- struct file *file = epi->ffd.file; +- +- lockdep_assert_irqs_enabled(); +- +- /* +- * Removes poll wait queue hooks. +- */ +- ep_unregister_pollwait(ep, epi); +- +- /* Remove the current item from the list of epoll hooks */ +- spin_lock(&file->f_lock); +- if (epi->dying && !force) { +- spin_unlock(&file->f_lock); +- return false; +- } +- +- __ep_remove_file(ep, epi, file); +- return __ep_remove_epi(ep, epi); +-} +- + /* + * Called with &file->f_lock held, + * returns with it released + */ +-static void __ep_remove_file(struct eventpoll *ep, struct epitem *epi, struct file *file) ++static void __ep_remove_file(struct eventpoll *ep, struct epitem *epi, ++ struct file *file) + { + struct epitems_head *to_free = NULL; + struct hlist_head *head = file->f_ep; + + lockdep_assert_held(&ep->mtx); ++ lockdep_assert_held(&file->f_lock); + + if (hlist_is_singular_node(&epi->fllink, head)) { + /* See eventpoll_release() for details. 
*/ +@@ -804,7 +773,25 @@ static bool __ep_remove_epi(struct eventpoll *ep, struct epitem *epi) + */ + static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) + { +- if (__ep_remove(ep, epi, false)) ++ struct file *file = epi->ffd.file; ++ ++ lockdep_assert_irqs_enabled(); ++ lockdep_assert_held(&ep->mtx); ++ ++ ep_unregister_pollwait(ep, epi); ++ ++ /* sync with eventpoll_release_file() */ ++ if (unlikely(READ_ONCE(epi->dying))) ++ return; ++ ++ spin_lock(&file->f_lock); ++ if (epi->dying) { ++ spin_unlock(&file->f_lock); ++ return; ++ } ++ __ep_remove_file(ep, epi, file); ++ ++ if (__ep_remove_epi(ep, epi)) + WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); + } + +@@ -1013,7 +1000,7 @@ void eventpoll_release_file(struct file *file) + spin_lock(&file->f_lock); + if (file->f_ep && file->f_ep->first) { + epi = hlist_entry(file->f_ep->first, struct epitem, fllink); +- epi->dying = true; ++ WRITE_ONCE(epi->dying, true); + spin_unlock(&file->f_lock); + + /* +@@ -1022,7 +1009,13 @@ void eventpoll_release_file(struct file *file) + */ + ep = epi->ep; + mutex_lock(&ep->mtx); +- dispose = __ep_remove(ep, epi, true); ++ ++ ep_unregister_pollwait(ep, epi); ++ ++ spin_lock(&file->f_lock); ++ __ep_remove_file(ep, epi, file); ++ dispose = __ep_remove_epi(ep, epi); ++ + mutex_unlock(&ep->mtx); + + if (dispose && ep_refcount_dec_and_test(ep)) +-- +2.53.0 + diff --git a/staging-6.6/eventpoll-move-epi_fget-up.patch b/staging-6.6/eventpoll-move-epi_fget-up.patch new file mode 100644 index 0000000000..ad08de5df7 --- /dev/null +++ b/staging-6.6/eventpoll-move-epi_fget-up.patch @@ -0,0 +1,101 @@ +From ca3b789c9d69a0caf13143935a98832cf0e8f59f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 12:14:02 +0800 +Subject: eventpoll: move epi_fget() up + +From: Christian Brauner + +[ Upstream commit 86e87059e6d1fd5115a31949726450ed03c1073b ] + +We'll need it when removing files so move it up. No functional change. 
+ +Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-5-2470f9eec0f5@kernel.org +Signed-off-by: Christian Brauner (Amutable) +Stable-dep-of: a6dc643c6931 ("eventpoll: fix ep_remove struct eventpoll / struct file UAF") +[file_ref_get(&file->f_ref) from original commit left as + atomic_long_inc_not_zero(&file->f_count) due to v6.12.y missing commit + 90ee6ed776c0 ("fs: port files to file_ref") and its dependent commit + 08ef26ea9ab3 ("fs: add file_ref")] +Signed-off-by: Quentin Schulz +Signed-off-by: Sasha Levin +--- + fs/eventpoll.c | 56 +++++++++++++++++++++++++------------------------- + 1 file changed, 28 insertions(+), 28 deletions(-) + +diff --git a/fs/eventpoll.c b/fs/eventpoll.c +index db5d7c1d726c83..fc4668a403c9d3 100644 +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -715,6 +715,34 @@ static void ep_free(struct eventpoll *ep) + kfree_rcu(ep, rcu); + } + ++/* ++ * The ffd.file pointer may be in the process of being torn down due to ++ * being closed, but we may not have finished eventpoll_release() yet. ++ * ++ * Normally, even with the atomic_long_inc_not_zero, the file may have ++ * been free'd and then gotten re-allocated to something else (since ++ * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU). ++ * ++ * But for epoll, users hold the ep->mtx mutex, and as such any file in ++ * the process of being free'd will block in eventpoll_release_file() ++ * and thus the underlying file allocation will not be free'd, and the ++ * file re-use cannot happen. ++ * ++ * For the same reason we can avoid a rcu_read_lock() around the ++ * operation - 'ffd.file' cannot go away even if the refcount has ++ * reached zero (but we must still not call out to ->poll() functions ++ * etc). 
++ */ ++static struct file *epi_fget(const struct epitem *epi) ++{ ++ struct file *file; ++ ++ file = epi->ffd.file; ++ if (!atomic_long_inc_not_zero(&file->f_count)) ++ file = NULL; ++ return file; ++} ++ + /* + * Called with &file->f_lock held, + * returns with it released +@@ -886,34 +914,6 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep + return res; + } + +-/* +- * The ffd.file pointer may be in the process of being torn down due to +- * being closed, but we may not have finished eventpoll_release() yet. +- * +- * Normally, even with the atomic_long_inc_not_zero, the file may have +- * been free'd and then gotten re-allocated to something else (since +- * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU). +- * +- * But for epoll, users hold the ep->mtx mutex, and as such any file in +- * the process of being free'd will block in eventpoll_release_file() +- * and thus the underlying file allocation will not be free'd, and the +- * file re-use cannot happen. +- * +- * For the same reason we can avoid a rcu_read_lock() around the +- * operation - 'ffd.file' cannot go away even if the refcount has +- * reached zero (but we must still not call out to ->poll() functions +- * etc). 
+- */ +-static struct file *epi_fget(const struct epitem *epi) +-{ +- struct file *file; +- +- file = epi->ffd.file; +- if (!atomic_long_inc_not_zero(&file->f_count)) +- file = NULL; +- return file; +-} +- + /* + * Differs from ep_eventpoll_poll() in that internal callers already have + * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() +-- +2.53.0 + diff --git a/staging-6.6/eventpoll-rename-ep_remove_safe-back-to-ep_remove.patch b/staging-6.6/eventpoll-rename-ep_remove_safe-back-to-ep_remove.patch new file mode 100644 index 0000000000..bcfe9b1625 --- /dev/null +++ b/staging-6.6/eventpoll-rename-ep_remove_safe-back-to-ep_remove.patch @@ -0,0 +1,97 @@ +From 177b4d0273b7f3dc4ecb1d9005d98b7e840e1229 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 12:14:01 +0800 +Subject: eventpoll: rename ep_remove_safe() back to ep_remove() + +From: Christian Brauner + +[ Upstream commit 0bade234723e40e4937be912e105785d6a51464e ] + +The current name is just confusing and doesn't clarify anything. 
+ +Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-4-2470f9eec0f5@kernel.org +Signed-off-by: Christian Brauner (Amutable) +Stable-dep-of: a6dc643c6931 ("eventpoll: fix ep_remove struct eventpoll / struct file UAF") +Signed-off-by: Quentin Schulz +Signed-off-by: Sasha Levin +--- + fs/eventpoll.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/fs/eventpoll.c b/fs/eventpoll.c +index 0a54a42263575f..db5d7c1d726c83 100644 +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -771,7 +771,7 @@ static bool ep_remove_epi(struct eventpoll *ep, struct epitem *epi) + /* + * ep_remove variant for callers owing an additional reference to the ep + */ +-static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) ++static void ep_remove(struct eventpoll *ep, struct epitem *epi) + { + struct file *file = epi->ffd.file; + +@@ -818,7 +818,7 @@ static void ep_clear_and_put(struct eventpoll *ep) + + /* + * Walks through the whole tree and try to free each "struct epitem". +- * Note that ep_remove_safe() will not remove the epitem in case of a ++ * Note that ep_remove() will not remove the epitem in case of a + * racing eventpoll_release_file(); the latter will do the removal. + * At this point we are sure no poll callbacks will be lingering around. + * Since we still own a reference to the eventpoll struct, the loop can't +@@ -827,7 +827,7 @@ static void ep_clear_and_put(struct eventpoll *ep) + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) { + next = rb_next(rbp); + epi = rb_entry(rbp, struct epitem, rbn); +- ep_remove_safe(ep, epi); ++ ep_remove(ep, epi); + cond_resched(); + } + +@@ -1497,21 +1497,21 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, + mutex_unlock(&tep->mtx); + + /* +- * ep_remove_safe() calls in the later error paths can't lead to ++ * ep_remove() calls in the later error paths can't lead to + * ep_free() as the ep file itself still holds an ep reference. 
+ */ + ep_get(ep); + + /* now check if we've created too many backpaths */ + if (unlikely(full_check && reverse_path_check())) { +- ep_remove_safe(ep, epi); ++ ep_remove(ep, epi); + return -EINVAL; + } + + if (epi->event.events & EPOLLWAKEUP) { + error = ep_create_wakeup_source(epi); + if (error) { +- ep_remove_safe(ep, epi); ++ ep_remove(ep, epi); + return error; + } + } +@@ -1535,7 +1535,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, + * high memory pressure. + */ + if (unlikely(!epq.epi)) { +- ep_remove_safe(ep, epi); ++ ep_remove(ep, epi); + return -ENOMEM; + } + +@@ -2227,7 +2227,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, + * The eventpoll itself is still alive: the refcount + * can't go to zero here. + */ +- ep_remove_safe(ep, epi); ++ ep_remove(ep, epi); + error = 0; + } else { + error = -ENOENT; +-- +2.53.0 + diff --git a/staging-6.6/eventpoll-split-__ep_remove.patch b/staging-6.6/eventpoll-split-__ep_remove.patch new file mode 100644 index 0000000000..ea1ab21fc6 --- /dev/null +++ b/staging-6.6/eventpoll-split-__ep_remove.patch @@ -0,0 +1,83 @@ +From efc3e478b693c4904b75b800ca605fb623b6510e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 12:13:58 +0800 +Subject: eventpoll: split __ep_remove() + +From: Christian Brauner + +[ Upstream commit 0f7bdfd413000985de09fc39eb9efa1e091a3ce0 ] + +Split __ep_remove() to delineate file removal from epoll item removal. 
+ +Suggested-by: Linus Torvalds +Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-2-2470f9eec0f5@kernel.org +Signed-off-by: Christian Brauner (Amutable) +Stable-dep-of: a6dc643c6931 ("eventpoll: fix ep_remove struct eventpoll / struct file UAF") +Signed-off-by: Quentin Schulz +Signed-off-by: Sasha Levin +--- + fs/eventpoll.c | 27 +++++++++++++++++++++++---- + 1 file changed, 23 insertions(+), 4 deletions(-) + +diff --git a/fs/eventpoll.c b/fs/eventpoll.c +index 4f05d12a05031a..ae9cb82764482c 100644 +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -715,6 +715,9 @@ static void ep_free(struct eventpoll *ep) + kfree_rcu(ep, rcu); + } + ++static void __ep_remove_file(struct eventpoll *ep, struct epitem *epi, struct file *file); ++static bool __ep_remove_epi(struct eventpoll *ep, struct epitem *epi); ++ + /* + * Removes a "struct epitem" from the eventpoll RB tree and deallocates + * all the associated resources. Must be called with "mtx" held. +@@ -726,8 +729,6 @@ static void ep_free(struct eventpoll *ep) + static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) + { + struct file *file = epi->ffd.file; +- struct epitems_head *to_free; +- struct hlist_head *head; + + lockdep_assert_irqs_enabled(); + +@@ -743,8 +744,21 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) + return false; + } + +- to_free = NULL; +- head = file->f_ep; ++ __ep_remove_file(ep, epi, file); ++ return __ep_remove_epi(ep, epi); ++} ++ ++/* ++ * Called with &file->f_lock held, ++ * returns with it released ++ */ ++static void __ep_remove_file(struct eventpoll *ep, struct epitem *epi, struct file *file) ++{ ++ struct epitems_head *to_free = NULL; ++ struct hlist_head *head = file->f_ep; ++ ++ lockdep_assert_held(&ep->mtx); ++ + if (hlist_is_singular_node(&epi->fllink, head)) { + /* See eventpoll_release() for details. 
*/ + WRITE_ONCE(file->f_ep, NULL); +@@ -758,6 +772,11 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) + hlist_del_rcu(&epi->fllink); + spin_unlock(&file->f_lock); + free_ephead(to_free); ++} ++ ++static bool __ep_remove_epi(struct eventpoll *ep, struct epitem *epi) ++{ ++ lockdep_assert_held(&ep->mtx); + + rb_erase_cached(&epi->rbn, &ep->rbr); + +-- +2.53.0 + diff --git a/staging-6.6/eventpoll-use-hlist_is_singular_node-in-__ep_remove.patch b/staging-6.6/eventpoll-use-hlist_is_singular_node-in-__ep_remove.patch new file mode 100644 index 0000000000..b804965e4e --- /dev/null +++ b/staging-6.6/eventpoll-use-hlist_is_singular_node-in-__ep_remove.patch @@ -0,0 +1,38 @@ +From 7de44d9622e8a472637993fbec6b4869a01d0a1f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 12:13:57 +0800 +Subject: eventpoll: use hlist_is_singular_node() in __ep_remove() + +From: Christian Brauner + +[ Upstream commit 3d9fd0abc94d8cd430cc7cd7d37ce5e5aae2cd2b ] + +Replace the open-coded "epi is the only entry in file->f_ep" check +with hlist_is_singular_node(). Same semantics, and the helper avoids +the head-cacheline access in the common false case. + +Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-1-2470f9eec0f5@kernel.org +Signed-off-by: Christian Brauner (Amutable) +Stable-dep-of: a6dc643c6931 ("eventpoll: fix ep_remove struct eventpoll / struct file UAF") +Signed-off-by: Quentin Schulz +Signed-off-by: Sasha Levin +--- + fs/eventpoll.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/eventpoll.c b/fs/eventpoll.c +index 8a556560a5b2f2..4f05d12a05031a 100644 +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -745,7 +745,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) + + to_free = NULL; + head = file->f_ep; +- if (head->first == &epi->fllink && !epi->fllink.next) { ++ if (hlist_is_singular_node(&epi->fllink, head)) { + /* See eventpoll_release() for details. 
*/ + WRITE_ONCE(file->f_ep, NULL); + if (!is_file_epoll(file)) { +-- +2.53.0 + diff --git a/staging-6.6/file-add-fput-cleanup-helper.patch b/staging-6.6/file-add-fput-cleanup-helper.patch new file mode 100644 index 0000000000..6615104326 --- /dev/null +++ b/staging-6.6/file-add-fput-cleanup-helper.patch @@ -0,0 +1,45 @@ +From 931684e9bb54f5fbfb5a4e7270852d54b8efbf32 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 12:13:56 +0800 +Subject: file: add fput() cleanup helper + +From: Christian Brauner + +[ Upstream commit 257b1c2c78c25643526609dee0c15f1544eb3252 ] + +Add a simple helper to put a file reference. + +Link: https://lore.kernel.org/r/20240719-work-mount-namespace-v1-4-834113cab0d2@kernel.org +Reviewed-by: Josef Bacik +Reviewed-by: Jeff Layton +Signed-off-by: Christian Brauner +(cherry picked from commit 257b1c2c78c25643526609dee0c15f1544eb3252) +Signed-off-by: Wentao Guan +Signed-off-by: Sasha Levin +--- + include/linux/file.h | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/include/linux/file.h b/include/linux/file.h +index 6e9099d2934368..221ba0888107a0 100644 +--- a/include/linux/file.h ++++ b/include/linux/file.h +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + + struct file; + +@@ -93,6 +94,7 @@ extern void put_unused_fd(unsigned int fd); + + DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), + get_unused_fd_flags(flags), unsigned flags) ++DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T)) + + extern void fd_install(unsigned int fd, struct file *file); + +-- +2.53.0 + diff --git a/staging-6.6/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch b/staging-6.6/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch new file mode 100644 index 0000000000..073241bab4 --- /dev/null +++ b/staging-6.6/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch @@ -0,0 +1,71 @@ +From b0d2e80edac5f4cf1d3626e24167f5606e36d335 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 
Jun 2026 13:24:24 +0200 +Subject: KVM: x86: Fix shadow paging use-after-free due to unexpected role + +From: Paolo Bonzini + +commit 81ccda30b4e83d8f5cc4fd50503c44e3a33abfeb upstream. + +Commit 0cb2af2ea66ad ("KVM: x86: Fix shadow paging use-after-free due +to unexpected GFN") fixed a shadow paging mismatch between stored and +computed GFNs; the bug could be triggered by changing a PDE mapping from +outside the guest, and then deleting a memslot. The rmap_remove() +call would miss entries created after the PDE change because the GFN +of the leaf SPTE does not match the GFN of the struct kvm_mmu_page. + +A similar hole however remains if the modified PDE points to a non-leaf +page. In this case the gfn can be made to match, but the role does not +match: the original large 2MB page creates a kvm_mmu_page with direct=1, +while the new 4KB needs a kvm_mmu_page with direct=0. However, +kvm_mmu_get_child_sp() does not compare the role, and therefore reuses +the page. + +The next step is installing a leaf (4KB) SPTE on the new path which +records an rmap entry under the gfn resolved by the walk. But when +that child is zapped its parent kvm_mmu_page has direct=1 and +kvm_mmu_page_get_gfn() computes the gfn for the 4KB page as +sp->gfn + index instead of using sp->shadowed_translation[] (or sp->gfns[] +in older kernels). It therefore fails to remove the recorded entry. + +When the memslot is dropped the shadow page is freed but the rmap +entry survives, as in the scenario that was already fixed. Code that +later walks that gfn (dirty logging, MMU notifier invalidation, and +so on) dereferences an sptep that lies in the freed page, causing the +use-after-free. 
+ +Fixes: 2032a93d66fa ("KVM: MMU: Don't allocate gfns page for direct mmu pages") +Reported-by: Hyunwoo Kim +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/mmu.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index 774bc26b8235e3..8e9ba7eaeaf3a0 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -2337,13 +2337,15 @@ static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, + u64 *sptep, gfn_t gfn, + bool direct, unsigned int access) + { +- union kvm_mmu_page_role role; ++ union kvm_mmu_page_role role = kvm_mmu_child_role(sptep, direct, access); + +- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep) && +- spte_to_child_sp(*sptep) && spte_to_child_sp(*sptep)->gfn == gfn) ++ if (is_shadow_present_pte(*sptep) && ++ !is_large_pte(*sptep) && ++ spte_to_child_sp(*sptep) && ++ spte_to_child_sp(*sptep)->gfn == gfn && ++ spte_to_child_sp(*sptep)->role.word == role.word) + return ERR_PTR(-EEXIST); + +- role = kvm_mmu_child_role(sptep, direct, access); + return kvm_mmu_get_shadow_page(vcpu, gfn, role); + } + +-- +2.53.0 + diff --git a/staging-6.6/kvm-x86-mmu-ensure-hugepage-is-in-by-slot-before-che.patch b/staging-6.6/kvm-x86-mmu-ensure-hugepage-is-in-by-slot-before-che.patch new file mode 100644 index 0000000000..428795b252 --- /dev/null +++ b/staging-6.6/kvm-x86-mmu-ensure-hugepage-is-in-by-slot-before-che.patch @@ -0,0 +1,136 @@ +From 84d6c1556b8cbc87fbc8b37bbeb34e49a55c77ed Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 13:24:25 +0200 +Subject: KVM: x86/mmu: Ensure hugepage is in by slot before checking max + mapping level + +From: Sean Christopherson + +commit ef057cbf825e03b63f6edf5980f96abf3c53089d upstream. 
+ +When recovering hugepages in the shadow MMU, verify that the base gfn of +the shadow page is actually contained within the target memslot, *before* +querying the max mapping level given the shadow page's gfn. Failure to +pre-check the validity of the gfn can lead to an out-of-bounds access to +the slot's lpage_info (which typically manifests as a host #PF because the +lpage_info is vmalloc'd) if the guest creates a hugepage mapping (in its +PTEs) that extends "below" the bounds of a memslot. + +When faulting in memory for a guest, and the size of the guest mapping is +greater than KVM's (current) max mapping, then KVM will create a "direct" +shadow page (direct in that there are no gPTEs to shadow, and so the target +gfn is a direct calculation given the base gfn of the shadow page). The +hugepage recovery flow looks for such direct shadow pages, as forcing 4KiB +mappings when dirty logging generates the guest > host mapping size case. +When the 4KiB restriction is lifted, then KVM can replace the shadow page +with a hugepage. + +But if KVM originally used a smaller mapping than the guest because the +range of memory covered by the guest hugepage exceeds the bounds of a +memslot, then KVM will link a direct shadow page with a gfn that is outside +the bounds of the memslot being used to fault in memory. The rmap entry +added for the leaf mapping is correct and within bounds, but the gfn of the +leaf SPTE's parent shadow page will be out of bounds. 
+ + BUG: unable to handle page fault for address: ffffc90000806ffc + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + PGD 100000067 P4D 100000067 PUD 1002a7067 PMD 10612f067 PTE 0 + Oops: Oops: 0000 [#1] SMP + CPU: 13 UID: 1000 PID: 757 Comm: mmu_stress_test Not tainted 7.1.0-rc1-48ce1e26eace-x86_pir_to_irr_comments-vm #341 PREEMPT + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 + RIP: 0010:kvm_mmu_max_mapping_level+0x79/0x2b0 [kvm] + Call Trace: + + kvm_mmu_recover_huge_pages+0x21b/0x320 [kvm] + kvm_set_memslot+0x1ee/0x590 [kvm] + kvm_set_memory_region.part.0+0x3a1/0x4d0 [kvm] + kvm_vm_ioctl+0x9bf/0x15d0 [kvm] + __x64_sys_ioctl+0x8a/0xd0 + do_syscall_64+0xb7/0xbb0 + entry_SYSCALL_64_after_hwframe+0x4b/0x53 + RIP: 0033:0x7f21c0f1a9bf + + +Don't bother pre-checking the bounds of the potential hugepage, i.e. don't +check that e.g. sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level + 1) is also +within the memslot, as the checks performed by kvm_mmu_max_mapping_level() +are a superset of the basic bounds checks. I.e. pre-checking the full +range would be a dubious micro-optimization. 
+ +Fixes: 9eba50f8d7fc ("KVM: x86/mmu: Consult max mapping level when zapping collapsible SPTEs") +Cc: stable@vger.kernel.org +Cc: David Matlack +Cc: James Houghton +Cc: Alexander Bulekov +Cc: Fred Griffoul +Cc: Alexander Graf +Cc: David Woodhouse +Cc: Filippo Sironi +Cc: Ivan Orlov +Signed-off-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/mmu.c | 18 ++++++++++++------ + include/linux/kvm_host.h | 7 ++++++- + 2 files changed, 18 insertions(+), 7 deletions(-) + +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index 8e9ba7eaeaf3a0..2453524ea4a1fc 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -6582,13 +6582,19 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, + sp = sptep_to_sp(sptep); + + /* +- * We cannot do huge page mapping for indirect shadow pages, +- * which are found on the last rmap (level = 1) when not using +- * tdp; such shadow pages are synced with the page table in +- * the guest, and the guest page table is using 4K page size +- * mapping if the indirect sp has level = 1. ++ * Direct shadow page can be replaced by a hugepage if the host ++ * mapping level allows it and the memslot maps all of the host ++ * hugepage. Note! If the memslot maps only part of the ++ * hugepage, sp->gfn may be below slot->base_gfn, and querying ++ * the max mapping level would cause an out-of-bounds lpage_info ++ * access. So the gfn bounds check *must* be done first. ++ * ++ * Indirect shadow pages are created when the guest page tables ++ * are using 4K pages. Since the host mapping is always ++ * constrained by the page size in the guest, indirect shadow ++ * pages are never collapsible. 
+ */ +- if (sp->role.direct && ++ if (sp->role.direct && is_gfn_in_memslot(slot, sp->gfn) && + sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, + PG_LEVEL_NUM)) { + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index ab09b08967bba4..57c7b4009f5758 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -1650,6 +1650,11 @@ int kvm_request_irq_source_id(struct kvm *kvm); + void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); + bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); + ++static inline bool is_gfn_in_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) ++{ ++ return gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages; ++} ++ + /* + * Returns a pointer to the memslot if it contains gfn. + * Otherwise returns NULL. +@@ -1660,7 +1665,7 @@ try_get_memslot(struct kvm_memory_slot *slot, gfn_t gfn) + if (!slot) + return NULL; + +- if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages) ++ if (is_gfn_in_memslot(slot, gfn)) + return slot; + else + return NULL; +-- +2.53.0 + diff --git a/staging-6.6/revert-ptp-add-testptp-mask-test.patch b/staging-6.6/revert-ptp-add-testptp-mask-test.patch new file mode 100644 index 0000000000..86db7285f6 --- /dev/null +++ b/staging-6.6/revert-ptp-add-testptp-mask-test.patch @@ -0,0 +1,93 @@ +From 47bc6f0db8f685814a14df5e3d28ed384b5107df Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 25 Jun 2026 14:30:39 +0200 +Subject: Revert "ptp: add testptp mask test" + +From: Petr Machata + +This reverts commit 59ac47a0275fcd5a7637c3d5da20b0905563c7f5, which is +commit 26285e689c6cd2cf3849568c83b2ebe53f467143 upstream. + +The reverted commit extends the selftest to test timestamp event queue mask +manipulation in testptp. 
It exercises masks PTP_MASK_CLEAR_ALL and +PTP_MASK_EN_SINGLE, introduced in commit c5a445b1e934 ("ptp: support event +queue reader channel masks"), which is not on this stable branch. The test +case thus cannot be built against this tree's own UAPI headers. + +The reverted commit was introduced to resolve a missing dependency of +commit 8d9f22c570ba ("testptp: Add option to open PHC in readonly mode"), +which is 76868642e427 upstream. The only conflict between the two is the +getopt string, and there is otherwise no direct dependency between the two. + +This patch therefore reverts the cited commit, with hand-resolving the +getopt string to include 'r' (as introduced by c6dc458227a3), but not +'F' (introduced by c1c50689799d). + +Reported-by: Yong Wang +Signed-off-by: Petr Machata +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/ptp/testptp.c | 19 +------------------ + 1 file changed, 1 insertion(+), 18 deletions(-) + +diff --git a/tools/testing/selftests/ptp/testptp.c b/tools/testing/selftests/ptp/testptp.c +index e0aed424fe42d5..8f05212f82329a 100644 +--- a/tools/testing/selftests/ptp/testptp.c ++++ b/tools/testing/selftests/ptp/testptp.c +@@ -121,7 +121,6 @@ static void usage(char *progname) + " -d name device to open\n" + " -e val read 'val' external time stamp events\n" + " -f val adjust the ptp clock frequency by 'val' ppb\n" +- " -F chan Enable single channel mask and keep device open for debugfs verification.\n" + " -g get the ptp clock time\n" + " -h prints this message\n" + " -i val index for event/trigger\n" +@@ -190,7 +189,6 @@ int main(int argc, char *argv[]) + int seconds = 0; + int readonly = 0; + int settime = 0; +- int channel = -1; + + int64_t t1, t2, tp; + int64_t interval, offset; +@@ -200,7 +198,7 @@ int main(int argc, char *argv[]) + + progname = strrchr(argv[0], '/'); + progname = progname ? 
1+progname : argv[0]; +- while (EOF != (c = getopt(argc, argv, "cd:e:f:F:ghH:i:k:lL:n:o:p:P:rsSt:T:w:x:Xz"))) { ++ while (EOF != (c = getopt(argc, argv, "cd:e:f:ghH:i:k:lL:n:o:p:P:rsSt:T:w:x:Xz"))) { + switch (c) { + case 'c': + capabilities = 1; +@@ -214,9 +212,6 @@ int main(int argc, char *argv[]) + case 'f': + adjfreq = atoi(optarg); + break; +- case 'F': +- channel = atoi(optarg); +- break; + case 'g': + gettime = 1; + break; +@@ -618,18 +613,6 @@ int main(int argc, char *argv[]) + free(xts); + } + +- if (channel >= 0) { +- if (ioctl(fd, PTP_MASK_CLEAR_ALL)) { +- perror("PTP_MASK_CLEAR_ALL"); +- } else if (ioctl(fd, PTP_MASK_EN_SINGLE, (unsigned int *)&channel)) { +- perror("PTP_MASK_EN_SINGLE"); +- } else { +- printf("Channel %d exclusively enabled. Check on debugfs.\n", channel); +- printf("Press any key to continue\n."); +- getchar(); +- } +- } +- + close(fd); + return 0; + } +-- +2.53.0 + diff --git a/staging-6.6/series b/staging-6.6/series new file mode 100644 index 0000000000..f434d9665f --- /dev/null +++ b/staging-6.6/series @@ -0,0 +1,11 @@ +file-add-fput-cleanup-helper.patch +eventpoll-use-hlist_is_singular_node-in-__ep_remove.patch +eventpoll-split-__ep_remove.patch +eventpoll-kill-__ep_remove.patch +eventpoll-drop-vestigial-__-prefix-from-ep_remove_-f.patch +eventpoll-rename-ep_remove_safe-back-to-ep_remove.patch +eventpoll-move-epi_fget-up.patch +eventpoll-fix-ep_remove-struct-eventpoll-struct-file.patch +kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch +kvm-x86-mmu-ensure-hugepage-is-in-by-slot-before-che.patch +revert-ptp-add-testptp-mask-test.patch diff --git a/staging-7.0/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch b/staging-7.0/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch new file mode 100644 index 0000000000..49ef7c0fa9 --- /dev/null +++ b/staging-7.0/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch @@ -0,0 +1,71 @@ +From b0d49a30728d6df9f36ca076f38160317e784a1c Mon Sep 17 00:00:00 2001 +From: 
Sasha Levin +Date: Fri, 26 Jun 2026 13:22:50 +0200 +Subject: KVM: x86: Fix shadow paging use-after-free due to unexpected role + +From: Paolo Bonzini + +commit 81ccda30b4e83d8f5cc4fd50503c44e3a33abfeb upstream. + +Commit 0cb2af2ea66ad ("KVM: x86: Fix shadow paging use-after-free due +to unexpected GFN") fixed a shadow paging mismatch between stored and +computed GFNs; the bug could be triggered by changing a PDE mapping from +outside the guest, and then deleting a memslot. The rmap_remove() +call would miss entries created after the PDE change because the GFN +of the leaf SPTE does not match the GFN of the struct kvm_mmu_page. + +A similar hole however remains if the modified PDE points to a non-leaf +page. In this case the gfn can be made to match, but the role does not +match: the original large 2MB page creates a kvm_mmu_page with direct=1, +while the new 4KB needs a kvm_mmu_page with direct=0. However, +kvm_mmu_get_child_sp() does not compare the role, and therefore reuses +the page. + +The next step is installing a leaf (4KB) SPTE on the new path which +records an rmap entry under the gfn resolved by the walk. But when +that child is zapped its parent kvm_mmu_page has direct=1 and +kvm_mmu_page_get_gfn() computes the gfn for the 4KB page as +sp->gfn + index instead of using sp->shadowed_translation[] (or sp->gfns[] +in older kernels). It therefore fails to remove the recorded entry. + +When the memslot is dropped the shadow page is freed but the rmap +entry survives, as in the scenario that was already fixed. Code that +later walks that gfn (dirty logging, MMU notifier invalidation, and +so on) dereferences an sptep that lies in the freed page, causing the +use-after-free. 
+ +Fixes: 2032a93d66fa ("KVM: MMU: Don't allocate gfns page for direct mmu pages") +Reported-by: Hyunwoo Kim +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/mmu.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index 729240bc00a269..3e1218abbbb757 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -2453,13 +2453,15 @@ static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, + u64 *sptep, gfn_t gfn, + bool direct, unsigned int access) + { +- union kvm_mmu_page_role role; ++ union kvm_mmu_page_role role = kvm_mmu_child_role(sptep, direct, access); + +- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep) && +- spte_to_child_sp(*sptep) && spte_to_child_sp(*sptep)->gfn == gfn) ++ if (is_shadow_present_pte(*sptep) && ++ !is_large_pte(*sptep) && ++ spte_to_child_sp(*sptep) && ++ spte_to_child_sp(*sptep)->gfn == gfn && ++ spte_to_child_sp(*sptep)->role.word == role.word) + return ERR_PTR(-EEXIST); + +- role = kvm_mmu_child_role(sptep, direct, access); + return kvm_mmu_get_shadow_page(vcpu, gfn, role); + } + +-- +2.53.0 + diff --git a/staging-7.0/series b/staging-7.0/series new file mode 100644 index 0000000000..a385adea48 --- /dev/null +++ b/staging-7.0/series @@ -0,0 +1 @@ +kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch diff --git a/staging-7.1/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch b/staging-7.1/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch new file mode 100644 index 0000000000..5cd276a7bb --- /dev/null +++ b/staging-7.1/kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch @@ -0,0 +1,71 @@ +From 0559cf5a1599f89cb2255b2eef4d1ad721059e30 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 26 Jun 2026 13:22:32 +0200 +Subject: KVM: x86: Fix shadow paging use-after-free due to unexpected role + +From: Paolo Bonzini + +commit 81ccda30b4e83d8f5cc4fd50503c44e3a33abfeb 
upstream. + +Commit 0cb2af2ea66ad ("KVM: x86: Fix shadow paging use-after-free due +to unexpected GFN") fixed a shadow paging mismatch between stored and +computed GFNs; the bug could be triggered by changing a PDE mapping from +outside the guest, and then deleting a memslot. The rmap_remove() +call would miss entries created after the PDE change because the GFN +of the leaf SPTE does not match the GFN of the struct kvm_mmu_page. + +A similar hole however remains if the modified PDE points to a non-leaf +page. In this case the gfn can be made to match, but the role does not +match: the original large 2MB page creates a kvm_mmu_page with direct=1, +while the new 4KB needs a kvm_mmu_page with direct=0. However, +kvm_mmu_get_child_sp() does not compare the role, and therefore reuses +the page. + +The next step is installing a leaf (4KB) SPTE on the new path which +records an rmap entry under the gfn resolved by the walk. But when +that child is zapped its parent kvm_mmu_page has direct=1 and +kvm_mmu_page_get_gfn() computes the gfn for the 4KB page as +sp->gfn + index instead of using sp->shadowed_translation[] (or sp->gfns[] +in older kernels). It therefore fails to remove the recorded entry. + +When the memslot is dropped the shadow page is freed but the rmap +entry survives, as in the scenario that was already fixed. Code that +later walks that gfn (dirty logging, MMU notifier invalidation, and +so on) dereferences an sptep that lies in the freed page, causing the +use-after-free. 
+ +Fixes: 2032a93d66fa ("KVM: MMU: Don't allocate gfns page for direct mmu pages") +Reported-by: Hyunwoo Kim +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/mmu.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index f0144ae8d891d3..bb204d3c66b7e9 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -2453,13 +2453,15 @@ static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, + u64 *sptep, gfn_t gfn, + bool direct, unsigned int access) + { +- union kvm_mmu_page_role role; ++ union kvm_mmu_page_role role = kvm_mmu_child_role(sptep, direct, access); + +- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep) && +- spte_to_child_sp(*sptep) && spte_to_child_sp(*sptep)->gfn == gfn) ++ if (is_shadow_present_pte(*sptep) && ++ !is_large_pte(*sptep) && ++ spte_to_child_sp(*sptep) && ++ spte_to_child_sp(*sptep)->gfn == gfn && ++ spte_to_child_sp(*sptep)->role.word == role.word) + return ERR_PTR(-EEXIST); + +- role = kvm_mmu_child_role(sptep, direct, access); + return kvm_mmu_get_shadow_page(vcpu, gfn, role); + } + +-- +2.53.0 + diff --git a/staging-7.1/series b/staging-7.1/series new file mode 100644 index 0000000000..a385adea48 --- /dev/null +++ b/staging-7.1/series @@ -0,0 +1 @@ +kvm-x86-fix-shadow-paging-use-after-free-due-to-unex.patch