git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for all trees
author Sasha Levin <sashal@kernel.org>
Sat, 16 Aug 2025 20:59:40 +0000 (16:59 -0400)
committer Sasha Levin <sashal@kernel.org>
Sat, 16 Aug 2025 20:59:40 +0000 (16:59 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
143 files changed:
queue-5.10/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch [new file with mode: 0644]
queue-5.10/intel_idle-allow-loading-acpi-tables-for-any-family.patch [new file with mode: 0644]
queue-5.10/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch [new file with mode: 0644]
queue-5.10/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch [new file with mode: 0644]
queue-5.10/series
queue-5.10/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch [new file with mode: 0644]
queue-5.15/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch [new file with mode: 0644]
queue-5.15/intel_idle-allow-loading-acpi-tables-for-any-family.patch [new file with mode: 0644]
queue-5.15/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch [new file with mode: 0644]
queue-5.15/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch [new file with mode: 0644]
queue-5.15/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch [new file with mode: 0644]
queue-5.15/series
queue-5.15/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch [new file with mode: 0644]
queue-5.4/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch [new file with mode: 0644]
queue-5.4/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch [new file with mode: 0644]
queue-5.4/series
queue-5.4/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch [new file with mode: 0644]
queue-6.1/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch [new file with mode: 0644]
queue-6.1/kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch [new file with mode: 0644]
queue-6.1/kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch [new file with mode: 0644]
queue-6.1/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch [new file with mode: 0644]
queue-6.1/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch [new file with mode: 0644]
queue-6.1/kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch [new file with mode: 0644]
queue-6.1/kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch [new file with mode: 0644]
queue-6.1/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch [new file with mode: 0644]
queue-6.1/kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch [new file with mode: 0644]
queue-6.1/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch [new file with mode: 0644]
queue-6.1/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch [new file with mode: 0644]
queue-6.1/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch [new file with mode: 0644]
queue-6.1/kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch [new file with mode: 0644]
queue-6.1/kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch [new file with mode: 0644]
queue-6.1/kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch [new file with mode: 0644]
queue-6.1/kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch [new file with mode: 0644]
queue-6.1/kvm-x86-pmu-gate-all-unimplemented-msr-prints-on-rep.patch [new file with mode: 0644]
queue-6.1/kvm-x86-re-split-x2apic-icr-into-icr-icr2-for-amd-x2.patch [new file with mode: 0644]
queue-6.1/kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch [new file with mode: 0644]
queue-6.1/kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch [new file with mode: 0644]
queue-6.1/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch [new file with mode: 0644]
queue-6.1/series
queue-6.1/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch [new file with mode: 0644]
queue-6.12/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch [new file with mode: 0644]
queue-6.12/habanalabs-fix-uaf-in-export_dmabuf.patch [new file with mode: 0644]
queue-6.12/intel_idle-allow-loading-acpi-tables-for-any-family.patch [new file with mode: 0644]
queue-6.12/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch [new file with mode: 0644]
queue-6.12/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch [new file with mode: 0644]
queue-6.12/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch [new file with mode: 0644]
queue-6.12/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch [new file with mode: 0644]
queue-6.12/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch [new file with mode: 0644]
queue-6.12/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch [new file with mode: 0644]
queue-6.12/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch [new file with mode: 0644]
queue-6.12/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch [new file with mode: 0644]
queue-6.12/net-kcm-fix-race-condition-in-kcm_unattach.patch [new file with mode: 0644]
queue-6.12/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch [new file with mode: 0644]
queue-6.12/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch [new file with mode: 0644]
queue-6.12/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch [new file with mode: 0644]
queue-6.12/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch [new file with mode: 0644]
queue-6.12/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch [new file with mode: 0644]
queue-6.12/series
queue-6.12/tls-handle-data-disappearing-from-under-the-tls-ulp.patch [new file with mode: 0644]
queue-6.12/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch [new file with mode: 0644]
queue-6.15/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch [new file with mode: 0644]
queue-6.15/erofs-fix-block-count-report-when-48-bit-layout-is-o.patch [new file with mode: 0644]
queue-6.15/habanalabs-fix-uaf-in-export_dmabuf.patch [new file with mode: 0644]
queue-6.15/hamradio-ignore-ops-locked-netdevs.patch [new file with mode: 0644]
queue-6.15/intel_idle-allow-loading-acpi-tables-for-any-family.patch [new file with mode: 0644]
queue-6.15/ipvs-fix-estimator-kthreads-preferred-affinity.patch [new file with mode: 0644]
queue-6.15/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch [new file with mode: 0644]
queue-6.15/net-hibmcge-fix-rtnl-deadlock-issue.patch [new file with mode: 0644]
queue-6.15/net-hibmcge-fix-the-division-by-zero-issue.patch [new file with mode: 0644]
queue-6.15/net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch [new file with mode: 0644]
queue-6.15/net-kcm-fix-race-condition-in-kcm_unattach.patch [new file with mode: 0644]
queue-6.15/net-lapbether-ignore-ops-locked-netdevs.patch [new file with mode: 0644]
queue-6.15/net-page_pool-allow-enabling-recycling-late-fix-fals.patch [new file with mode: 0644]
queue-6.15/net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch [new file with mode: 0644]
queue-6.15/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch [new file with mode: 0644]
queue-6.15/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch [new file with mode: 0644]
queue-6.15/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch [new file with mode: 0644]
queue-6.15/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch [new file with mode: 0644]
queue-6.15/riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch [new file with mode: 0644]
queue-6.15/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch [new file with mode: 0644]
queue-6.15/series
queue-6.15/tls-handle-data-disappearing-from-under-the-tls-ulp.patch [new file with mode: 0644]
queue-6.15/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch [new file with mode: 0644]
queue-6.15/xfrm-restore-gso-for-sw-crypto.patch [new file with mode: 0644]
queue-6.16/bnxt-fill-data-page-pool-with-frags-if-page_size-bnx.patch [new file with mode: 0644]
queue-6.16/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch [new file with mode: 0644]
queue-6.16/erofs-fix-block-count-report-when-48-bit-layout-is-o.patch [new file with mode: 0644]
queue-6.16/habanalabs-fix-uaf-in-export_dmabuf.patch [new file with mode: 0644]
queue-6.16/hamradio-ignore-ops-locked-netdevs.patch [new file with mode: 0644]
queue-6.16/intel_idle-allow-loading-acpi-tables-for-any-family.patch [new file with mode: 0644]
queue-6.16/ipvs-fix-estimator-kthreads-preferred-affinity.patch [new file with mode: 0644]
queue-6.16/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch [new file with mode: 0644]
queue-6.16/net-hibmcge-fix-rtnl-deadlock-issue.patch [new file with mode: 0644]
queue-6.16/net-hibmcge-fix-the-division-by-zero-issue.patch [new file with mode: 0644]
queue-6.16/net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch [new file with mode: 0644]
queue-6.16/net-kcm-fix-race-condition-in-kcm_unattach.patch [new file with mode: 0644]
queue-6.16/net-lapbether-ignore-ops-locked-netdevs.patch [new file with mode: 0644]
queue-6.16/net-mdiobus-release-reset_gpio-in-mdiobus_unregister.patch [new file with mode: 0644]
queue-6.16/net-page_pool-allow-enabling-recycling-late-fix-fals.patch [new file with mode: 0644]
queue-6.16/net-phy-nxp-c45-tja11xx-fix-the-phy-id-mismatch-issu.patch [new file with mode: 0644]
queue-6.16/net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch [new file with mode: 0644]
queue-6.16/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch [new file with mode: 0644]
queue-6.16/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch [new file with mode: 0644]
queue-6.16/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch [new file with mode: 0644]
queue-6.16/netfilter-ctnetlink-remove-refcounting-in-expectatio.patch [new file with mode: 0644]
queue-6.16/netfilter-nf_tables-reject-duplicate-device-on-updat.patch [new file with mode: 0644]
queue-6.16/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch [new file with mode: 0644]
queue-6.16/riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch [new file with mode: 0644]
queue-6.16/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch [new file with mode: 0644]
queue-6.16/series
queue-6.16/tls-handle-data-disappearing-from-under-the-tls-ulp.patch [new file with mode: 0644]
queue-6.16/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch [new file with mode: 0644]
queue-6.16/xfrm-bring-back-device-check-in-validate_xmit_xfrm.patch [new file with mode: 0644]
queue-6.16/xfrm-flush-all-states-in-xfrm_state_fini.patch [new file with mode: 0644]
queue-6.16/xfrm-restore-gso-for-sw-crypto.patch [new file with mode: 0644]
queue-6.6/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch [new file with mode: 0644]
queue-6.6/intel_idle-allow-loading-acpi-tables-for-any-family.patch [new file with mode: 0644]
queue-6.6/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch [new file with mode: 0644]
queue-6.6/kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch [new file with mode: 0644]
queue-6.6/kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch [new file with mode: 0644]
queue-6.6/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch [new file with mode: 0644]
queue-6.6/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch [new file with mode: 0644]
queue-6.6/kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch [new file with mode: 0644]
queue-6.6/kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch [new file with mode: 0644]
queue-6.6/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch [new file with mode: 0644]
queue-6.6/kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch [new file with mode: 0644]
queue-6.6/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch [new file with mode: 0644]
queue-6.6/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch [new file with mode: 0644]
queue-6.6/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch [new file with mode: 0644]
queue-6.6/kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch [new file with mode: 0644]
queue-6.6/kvm-x86-hyper-v-skip-non-canonical-addresses-during-.patch [new file with mode: 0644]
queue-6.6/kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch [new file with mode: 0644]
queue-6.6/kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch [new file with mode: 0644]
queue-6.6/kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch [new file with mode: 0644]
queue-6.6/kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch [new file with mode: 0644]
queue-6.6/kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch [new file with mode: 0644]
queue-6.6/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch [new file with mode: 0644]
queue-6.6/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch [new file with mode: 0644]
queue-6.6/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch [new file with mode: 0644]
queue-6.6/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch [new file with mode: 0644]
queue-6.6/series
queue-6.6/tls-handle-data-disappearing-from-under-the-tls-ulp.patch [new file with mode: 0644]
queue-6.6/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch [new file with mode: 0644]

diff --git a/queue-5.10/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch b/queue-5.10/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
new file mode 100644 (file)
index 0000000..74b04a4
--- /dev/null
@@ -0,0 +1,91 @@
+From cfccde6ceaa234284620cfaca692e6585dba91b7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Aug 2025 17:03:11 +0200
+Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data
+
+From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ]
+
+Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid
+discarding useful information") caused the number of wakeup interrupts
+to increase on an idle system [1], which was not expected to happen
+after merely allowing shallower idle states to be selected by the
+governor in some cases.
+
+However, on the system in question, all of the idle states deeper than
+WFI are rejected by the driver due to a firmware issue [2].  This causes
+the governor to only consider the recent interval duriation data
+corresponding to attempts to enter WFI that are successful and the
+recent invervals table is filled with values lower than the scheduler
+tick period.  Consequently, the governor predicts an idle duration
+below the scheduler tick period length and avoids stopping the tick
+more often which leads to the observed symptom.
+
+Address it by modifying the governor to update the recent intervals
+table also when entering the previously selected idle state fails, so
+it knows that the short idle intervals might have been the minority
+had the selected idle states been actually entered every time.
+
+Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information")
+Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1]
+Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2]
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Tested-by: Christian Loehle <christian.loehle@arm.com>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Christian Loehle <christian.loehle@arm.com>
+Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
+index a95cc8f024fd..d34463f96848 100644
+--- a/drivers/cpuidle/governors/menu.c
++++ b/drivers/cpuidle/governors/menu.c
+@@ -158,6 +158,14 @@ static inline int performance_multiplier(unsigned long nr_iowaiters)
+ static DEFINE_PER_CPU(struct menu_device, menu_devices);
++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us)
++{
++      /* Update the repeating-pattern data. */
++      data->intervals[data->interval_ptr++] = interval_us;
++      if (data->interval_ptr >= INTERVALS)
++              data->interval_ptr = 0;
++}
++
+ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
+ /*
+@@ -288,6 +296,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+       if (data->needs_update) {
+               menu_update(drv, dev);
+               data->needs_update = 0;
++      } else if (!dev->last_residency_ns) {
++              /*
++               * This happens when the driver rejects the previously selected
++               * idle state and returns an error, so update the recent
++               * intervals table to prevent invalid information from being
++               * used going forward.
++               */
++              menu_update_intervals(data, UINT_MAX);
+       }
+       /* determine the expected residency time, round up */
+@@ -537,10 +553,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+       data->correction_factor[data->bucket] = new_factor;
+-      /* update the repeating-pattern data */
+-      data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
+-      if (data->interval_ptr >= INTERVALS)
+-              data->interval_ptr = 0;
++      menu_update_intervals(data, ktime_to_us(measured_ns));
+ }
+ /**
+-- 
+2.50.1
+
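For readers unfamiliar with the menu governor's interval history, the following is a minimal standalone C sketch of the ring-buffer pattern the new helper captures, and of why a rejected state is recorded as UINT_MAX. struct interval_history, record_interval() and note_rejected_entry() are illustrative names; only INTERVALS mirrors the kernel constant referenced by the patch.

    /*
     * Illustrative sketch only: record each observed idle interval in a
     * small ring buffer, and poison the history with UINT_MAX when the
     * selected state could not be entered, so that a run of short
     * successful WFI intervals cannot dominate the next prediction.
     */
    #include <limits.h>

    #define INTERVALS 8

    struct interval_history {
        unsigned int intervals[INTERVALS];
        int interval_ptr;
    };

    /* Record one observed idle interval, in microseconds. */
    static void record_interval(struct interval_history *h, unsigned int us)
    {
        h->intervals[h->interval_ptr++] = us;
        if (h->interval_ptr >= INTERVALS)
            h->interval_ptr = 0;
    }

    /* The driver rejected the chosen state: store a "very long" interval. */
    static void note_rejected_entry(struct interval_history *h)
    {
        record_interval(h, UINT_MAX);
    }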
diff --git a/queue-5.10/intel_idle-allow-loading-acpi-tables-for-any-family.patch b/queue-5.10/intel_idle-allow-loading-acpi-tables-for-any-family.patch
new file mode 100644 (file)
index 0000000..2883299
--- /dev/null
@@ -0,0 +1,41 @@
+From 55146a8e555eb6bcaf596bf8b7455a06175b4760 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 15:37:14 -0400
+Subject: intel_idle: Allow loading ACPI tables for any family
+
+From: Len Brown <len.brown@intel.com>
+
+[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ]
+
+There is no reason to limit intel_idle's loading of ACPI tables to
+family 6.  Upcoming Intel processors are not in family 6.
+
+Below "Fixes" really means "applies cleanly until".
+That syntax commit didn't change the previous logic,
+but shows this patch applies back 5 years.
+
+Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros")
+Signed-off-by: Len Brown <len.brown@intel.com>
+Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/idle/intel_idle.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
+index 1cead368f961..f6a2211ca4ef 100644
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -1154,7 +1154,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
+ };
+ static const struct x86_cpu_id intel_mwait_ids[] __initconst = {
+-      X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL),
++      X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL),
+       {}
+ };
+-- 
+2.50.1
+
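As a rough sketch of what the widened match entry buys, the table below matches any Intel family that advertises MWAIT and is consulted through x86_match_cpu(); the table and helper names are hypothetical, while the macro, X86_FAMILY_ANY and x86_match_cpu() are the kernel API the patch relies on.

    #include <linux/init.h>
    #include <asm/cpu_device_id.h>

    /* Hypothetical table: any Intel family, as long as MWAIT is present. */
    static const struct x86_cpu_id mwait_any_family_ids[] __initconst = {
        X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY,
                                     X86_FEATURE_MWAIT, NULL),
        {}
    };

    static bool __init cpu_has_mwait_idle(void)
    {
        /* x86_match_cpu() returns the matching entry or NULL. */
        return x86_match_cpu(mwait_any_family_ids) != NULL;
    }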
diff --git a/queue-5.10/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-5.10/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
new file mode 100644 (file)
index 0000000..bc69d6c
--- /dev/null
@@ -0,0 +1,129 @@
+From f713f56980c58c0297138ace9f7b483378f6bd73 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+      if (res < 0) {
+                nf_conntrack_get(&ct->ct_general); // HERE
+                cb->args[1] = (unsigned long)ct;
+                ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, then the refcount of ct was already incremented.
+This 2nd increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+keeps cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
+
+This can be reproduced by running conntrack_resize.sh selftest in a loop.
+It takes ~20 minutes for me on a preemptible kernel on average before
+I see a runaway kworker spinning in nf_conntrack_cleanup_net_list.
+
+One fix would be to change this to:
+        if (res < 0) {
+               if (ct != last)
+                       nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
+A followup patch will do the same for ctnetlink_exp_dump_table,
+it looks to me as if this has the same problem and like
+ctnetlink_dump_table, we only need a 'skip hint', not the actual
+object so we can apply the same cookie strategy there as well.
+
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index b2b06033ef2c..f622fcad3f50 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -839,8 +839,6 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+-      if (cb->args[1])
+-              nf_ct_put((struct nf_conn *)cb->args[1]);
+       kfree(cb->data);
+       return 0;
+ }
+@@ -1112,19 +1110,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+       return 0;
+ }
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++      unsigned long id = nf_ct_get_id(ct);
++
++      return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+       unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
+       struct net *net = sock_net(skb->sk);
+-      struct nf_conn *ct, *last;
++      unsigned long last_id = cb->args[1];
+       struct nf_conntrack_tuple_hash *h;
+       struct hlist_nulls_node *n;
+       struct nf_conn *nf_ct_evict[8];
++      struct nf_conn *ct;
+       int res, i;
+       spinlock_t *lockp;
+-      last = (struct nf_conn *)cb->args[1];
+       i = 0;
+       local_bh_disable();
+@@ -1160,7 +1165,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                               continue;
+                       if (cb->args[1]) {
+-                              if (ct != last)
++                              if (ctnetlink_get_id(ct) != last_id)
+                                       continue;
+                               cb->args[1] = 0;
+                       }
+@@ -1173,8 +1178,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                                           NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+                                           ct, true, flags);
+                       if (res < 0) {
+-                              nf_conntrack_get(&ct->ct_general);
+-                              cb->args[1] = (unsigned long)ct;
++                              cb->args[1] = ctnetlink_get_id(ct);
+                               spin_unlock(lockp);
+                               goto out;
+                       }
+@@ -1187,12 +1191,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+       }
+ out:
+       local_bh_enable();
+-      if (last) {
++      if (last_id) {
+               /* nf ct hash resize happened, now clear the leftover. */
+-              if ((struct nf_conn *)cb->args[1] == last)
++              if (cb->args[1] == last_id)
+                       cb->args[1] = 0;
+-
+-              nf_ct_put(last);
+       }
+       while (i) {
+-- 
+2.50.1
+
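The fix above replaces a refcounted pointer kept across dump chunks with a plain identifier stored in cb->args[1]. Below is a compact userspace analogue of that resume-by-cookie pattern; everything in it (struct entry, dump_chunk(), the sample table) is made up for illustration, and only the cookie idea mirrors the patch.

    #include <stdio.h>

    struct entry { unsigned long id; const char *name; };

    static struct entry table[] = {
        { 101, "a" }, { 102, "b" }, { 103, "c" }, { 104, "d" },
    };

    /* Emit up to 'budget' entries, resuming at the entry named by *cookie. */
    static void dump_chunk(unsigned long *cookie, int budget)
    {
        int skipping = (*cookie != 0);

        for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
            if (skipping) {
                if (table[i].id != *cookie)
                    continue;       /* still skipping already-dumped entries */
                skipping = 0;       /* resume point found, emit it below */
                *cookie = 0;
            }
            if (budget-- <= 0) {
                *cookie = table[i].id;  /* out of room: remember, no refcount */
                return;
            }
            printf("%lu %s\n", table[i].id, table[i].name);
        }
    }

    int main(void)
    {
        unsigned long cookie = 0;

        do {
            dump_chunk(&cookie, 2);
        } while (cookie);
        return 0;
    }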
diff --git a/queue-5.10/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch b/queue-5.10/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
new file mode 100644 (file)
index 0000000..5c69f7a
--- /dev/null
@@ -0,0 +1,73 @@
+From 3e634f70881c39e2c08fb3e91544b90694df00bc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:40:11 -0400
+Subject: sctp: linearize cloned gso packets in sctp_rcv
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ]
+
+A cloned head skb still shares these frag skbs in fraglist with the
+original head skb. It's not safe to access these frag skbs.
+
+syzbot reported two use-of-uninitialized-memory bugs caused by this:
+
+  BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+   sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+   sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998
+   sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88
+   sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331
+   sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122
+   __release_sock+0x1da/0x330 net/core/sock.c:3106
+   release_sock+0x6b/0x250 net/core/sock.c:3660
+   sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360
+   sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885
+   sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031
+   inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851
+   sock_sendmsg_nosec net/socket.c:718 [inline]
+
+and
+
+  BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+   sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+   sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88
+   sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331
+   sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148
+   __release_sock+0x1d3/0x330 net/core/sock.c:3213
+   release_sock+0x6b/0x270 net/core/sock.c:3767
+   sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367
+   sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886
+   sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032
+   inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851
+   sock_sendmsg_nosec net/socket.c:712 [inline]
+
+This patch fixes it by linearizing cloned gso packets in sctp_rcv().
+
+Fixes: 90017accff61 ("sctp: Add GSO support")
+Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com
+Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sctp/input.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sctp/input.c b/net/sctp/input.c
+index 8fe1a74f0618..079b1bfc7d31 100644
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -114,7 +114,7 @@ int sctp_rcv(struct sk_buff *skb)
+        * it's better to just linearize it otherwise crc computing
+        * takes longer.
+        */
+-      if ((!is_gso && skb_linearize(skb)) ||
++      if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+           !pskb_may_pull(skb, sizeof(struct sctphdr)))
+               goto discard_it;
+-- 
+2.50.1
+
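A kernel-flavoured sketch of the guard the one-line change installs on the receive path follows; sanitize_gso_input() is a hypothetical wrapper, while skb_cloned(), skb_linearize() and pskb_may_pull() are the same helpers the patched condition uses.

    #include <linux/errno.h>
    #include <linux/sctp.h>
    #include <linux/skbuff.h>

    /* Sketch: flatten anything whose fraglist may be shared with a clone. */
    static int sanitize_gso_input(struct sk_buff *skb, bool is_gso)
    {
        /*
         * A cloned GSO head still shares the frag skbs in its fraglist
         * with the original, so they must not be walked in place:
         * linearize the clone first.  Non-GSO input is linearized as
         * before.
         */
        if ((!is_gso || skb_cloned(skb)) && skb_linearize(skb))
            return -ENOMEM;

        if (!pskb_may_pull(skb, sizeof(struct sctphdr)))
            return -EINVAL;

        return 0;
    }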
diff --git a/queue-5.10/series b/queue-5.10/series
index f16f2f202e65b46c654b8b6eff66e3c6704449b9..c167be5b1b28c2d82fa481e19cddb3d119c041e2 100644 (file)
@@ -203,3 +203,8 @@ fs-prevent-file-descriptor-table-allocations-exceeding-int_max.patch
 documentation-acpi-fix-parent-device-references.patch
 acpi-processor-perflib-fix-initial-_ppc-limit-application.patch
 acpi-processor-perflib-move-problematic-pr-performance-check.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
+sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
+intel_idle-allow-loading-acpi-tables-for-any-family.patch
+cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
diff --git a/queue-5.10/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-5.10/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
new file mode 100644 (file)
index 0000000..35b776b
--- /dev/null
@@ -0,0 +1,51 @@
+From 703e70d1d8e2e3fa7a948735d5f6cd1cc8ce9e8d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, so that we can't rely on offloads because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
+
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index 73beaa7e2d70..5d4413fe4195 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -58,7 +58,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+       remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+       skb->remcsum_offload = remcsum;
+-      need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++      need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+       /* Try to offload checksum if possible */
+       offload_csum = !!(need_csum &&
+                         !need_ipsec &&
+-- 
+2.50.1
+
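A trimmed sketch of the decision the patched line feeds into: checksum offload is only attempted when neither the dst's xfrm bundle nor a secpath indicates IPsec. tunnel_may_offload_csum() is hypothetical and the device-feature test is simplified here to NETIF_F_HW_CSUM, whereas skb_dst(), dst_xfrm() and skb_sec_path() are the real accessors used by the fix.

    #include <linux/netdevice.h>
    #include <linux/skbuff.h>
    #include <net/dst.h>
    #include <net/xfrm.h>

    static bool tunnel_may_offload_csum(const struct sk_buff *skb, bool need_csum)
    {
        bool need_ipsec;

        /*
         * IPsec may be visible either through the attached dst (xfrm
         * bundle) or, after the dst has been dropped on the xmit path,
         * through the secpath carried by the skb itself.
         */
        need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) ||
                     skb_sec_path(skb);

        return need_csum && !need_ipsec &&
               (skb->dev->features & NETIF_F_HW_CSUM);
    }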
diff --git a/queue-5.15/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch b/queue-5.15/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
new file mode 100644 (file)
index 0000000..5a7ef11
--- /dev/null
@@ -0,0 +1,91 @@
+From 05efdd270d75536fbf901a5eae7145a45a532748 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Aug 2025 17:03:11 +0200
+Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data
+
+From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ]
+
+Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid
+discarding useful information") caused the number of wakeup interrupts
+to increase on an idle system [1], which was not expected to happen
+after merely allowing shallower idle states to be selected by the
+governor in some cases.
+
+However, on the system in question, all of the idle states deeper than
+WFI are rejected by the driver due to a firmware issue [2].  This causes
+the governor to only consider the recent interval duration data
+corresponding to attempts to enter WFI that are successful and the
+recent intervals table is filled with values lower than the scheduler
+tick period.  Consequently, the governor predicts an idle duration
+below the scheduler tick period length and avoids stopping the tick
+more often which leads to the observed symptom.
+
+Address it by modifying the governor to update the recent intervals
+table also when entering the previously selected idle state fails, so
+it knows that the short idle intervals might have been the minority
+had the selected idle states been actually entered every time.
+
+Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information")
+Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1]
+Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2]
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Tested-by: Christian Loehle <christian.loehle@arm.com>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Christian Loehle <christian.loehle@arm.com>
+Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
+index e1e2721beb75..246b4a1b664a 100644
+--- a/drivers/cpuidle/governors/menu.c
++++ b/drivers/cpuidle/governors/menu.c
+@@ -158,6 +158,14 @@ static inline int performance_multiplier(unsigned int nr_iowaiters)
+ static DEFINE_PER_CPU(struct menu_device, menu_devices);
++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us)
++{
++      /* Update the repeating-pattern data. */
++      data->intervals[data->interval_ptr++] = interval_us;
++      if (data->interval_ptr >= INTERVALS)
++              data->interval_ptr = 0;
++}
++
+ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
+ /*
+@@ -288,6 +296,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+       if (data->needs_update) {
+               menu_update(drv, dev);
+               data->needs_update = 0;
++      } else if (!dev->last_residency_ns) {
++              /*
++               * This happens when the driver rejects the previously selected
++               * idle state and returns an error, so update the recent
++               * intervals table to prevent invalid information from being
++               * used going forward.
++               */
++              menu_update_intervals(data, UINT_MAX);
+       }
+       /* determine the expected residency time, round up */
+@@ -542,10 +558,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+       data->correction_factor[data->bucket] = new_factor;
+-      /* update the repeating-pattern data */
+-      data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
+-      if (data->interval_ptr >= INTERVALS)
+-              data->interval_ptr = 0;
++      menu_update_intervals(data, ktime_to_us(measured_ns));
+ }
+ /**
+-- 
+2.50.1
+
diff --git a/queue-5.15/intel_idle-allow-loading-acpi-tables-for-any-family.patch b/queue-5.15/intel_idle-allow-loading-acpi-tables-for-any-family.patch
new file mode 100644 (file)
index 0000000..05a4204
--- /dev/null
@@ -0,0 +1,41 @@
+From 1ee55ceeeb4fb9720509f1f18eb551a41c5568c1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 15:37:14 -0400
+Subject: intel_idle: Allow loading ACPI tables for any family
+
+From: Len Brown <len.brown@intel.com>
+
+[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ]
+
+There is no reason to limit intel_idle's loading of ACPI tables to
+family 6.  Upcoming Intel processors are not in family 6.
+
+Below "Fixes" really means "applies cleanly until".
+That syntax commit didn't change the previous logic,
+but shows this patch applies back 5 years.
+
+Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros")
+Signed-off-by: Len Brown <len.brown@intel.com>
+Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/idle/intel_idle.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
+index 359272ce8e29..96002f35405e 100644
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -1194,7 +1194,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
+ };
+ static const struct x86_cpu_id intel_mwait_ids[] __initconst = {
+-      X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL),
++      X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL),
+       {}
+ };
+-- 
+2.50.1
+
diff --git a/queue-5.15/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-5.15/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
new file mode 100644 (file)
index 0000000..892daa8
--- /dev/null
@@ -0,0 +1,129 @@
+From 93145a29f5f86a93148422799f8ec6667e0b6f50 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+      if (res < 0) {
+                nf_conntrack_get(&ct->ct_general); // HERE
+                cb->args[1] = (unsigned long)ct;
+                ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, then the refcount of ct was already incremented.
+This 2nd increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+keeps cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
+
+This can be reproduced by running conntrack_resize.sh selftest in a loop.
+It takes ~20 minutes for me on a preemptible kernel on average before
+I see a runaway kworker spinning in nf_conntrack_cleanup_net_list.
+
+One fix would be to change this to:
+        if (res < 0) {
+               if (ct != last)
+                       nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
+A followup patch will do the same for ctnetlink_exp_dump_table,
+it looks to me as if this has the same problem and like
+ctnetlink_dump_table, we only need a 'skip hint', not the actual
+object so we can apply the same cookie strategy there as well.
+
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 585103c16a8a..50f7531221c3 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -848,8 +848,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+-      if (cb->args[1])
+-              nf_ct_put((struct nf_conn *)cb->args[1]);
+       kfree(cb->data);
+       return 0;
+ }
+@@ -1164,19 +1162,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+       return 0;
+ }
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++      unsigned long id = nf_ct_get_id(ct);
++
++      return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+       unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
+       struct net *net = sock_net(skb->sk);
+-      struct nf_conn *ct, *last;
++      unsigned long last_id = cb->args[1];
+       struct nf_conntrack_tuple_hash *h;
+       struct hlist_nulls_node *n;
+       struct nf_conn *nf_ct_evict[8];
++      struct nf_conn *ct;
+       int res, i;
+       spinlock_t *lockp;
+-      last = (struct nf_conn *)cb->args[1];
+       i = 0;
+       local_bh_disable();
+@@ -1211,7 +1216,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                               continue;
+                       if (cb->args[1]) {
+-                              if (ct != last)
++                              if (ctnetlink_get_id(ct) != last_id)
+                                       continue;
+                               cb->args[1] = 0;
+                       }
+@@ -1224,8 +1229,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                                           NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+                                           ct, true, flags);
+                       if (res < 0) {
+-                              nf_conntrack_get(&ct->ct_general);
+-                              cb->args[1] = (unsigned long)ct;
++                              cb->args[1] = ctnetlink_get_id(ct);
+                               spin_unlock(lockp);
+                               goto out;
+                       }
+@@ -1238,12 +1242,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+       }
+ out:
+       local_bh_enable();
+-      if (last) {
++      if (last_id) {
+               /* nf ct hash resize happened, now clear the leftover. */
+-              if ((struct nf_conn *)cb->args[1] == last)
++              if (cb->args[1] == last_id)
+                       cb->args[1] = 0;
+-
+-              nf_ct_put(last);
+       }
+       while (i) {
+-- 
+2.50.1
+
diff --git a/queue-5.15/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch b/queue-5.15/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch
new file mode 100644 (file)
index 0000000..8483738
--- /dev/null
@@ -0,0 +1,103 @@
+From 167cea59f060ec2d6f527a724186f5e6a9a3f4d6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 28 Jul 2025 15:26:49 +0900
+Subject: ptp: prevent possible ABBA deadlock in ptp_clock_freerun()
+
+From: Jeongjun Park <aha310510@gmail.com>
+
+[ Upstream commit 2efe41234dbd0a83fdb7cd38226c2f70039a2cd3 ]
+
+syzbot reported the following ABBA deadlock:
+
+       CPU0                           CPU1
+       ----                           ----
+  n_vclocks_store()
+    lock(&ptp->n_vclocks_mux) [1]
+        (physical clock)
+                                     pc_clock_adjtime()
+                                       lock(&clk->rwsem) [2]
+                                        (physical clock)
+                                       ...
+                                       ptp_clock_freerun()
+                                         ptp_vclock_in_use()
+                                           lock(&ptp->n_vclocks_mux) [3]
+                                              (physical clock)
+    ptp_clock_unregister()
+      posix_clock_unregister()
+        lock(&clk->rwsem) [4]
+          (virtual clock)
+
+Since ptp virtual clock is registered only under ptp physical clock, both
+ptp_clock and posix_clock must be physical clocks for ptp_vclock_in_use()
+to lock &ptp->n_vclocks_mux and check ptp->n_vclocks.
+
+However, when unregistering vclocks in n_vclocks_store(), the lock taken
+on ptp->n_vclocks_mux is a physical clock lock, but the clk->rwsem of
+ptp_clock_unregister() called through device_for_each_child_reverse()
+is a virtual clock lock.
+
+Therefore, clk->rwsem used in CPU0 and clk->rwsem used in CPU1 are
+different locks, but in lockdep, a false positive occurs because the
+possibility of deadlock is determined through lock-class.
+
+To solve this, lock subclass annotation must be added to the posix_clock
+rwsem of the vclock.
+
+Reported-by: syzbot+7cfb66a237c4a5fb22ad@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=7cfb66a237c4a5fb22ad
+Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion")
+Signed-off-by: Jeongjun Park <aha310510@gmail.com>
+Acked-by: Richard Cochran <richardcochran@gmail.com>
+Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Link: https://patch.msgid.link/20250728062649.469882-1-aha310510@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/ptp/ptp_private.h | 5 +++++
+ drivers/ptp/ptp_vclock.c  | 7 +++++++
+ 2 files changed, 12 insertions(+)
+
+diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
+index b8d3df8a393a..bf823b8c3c8f 100644
+--- a/drivers/ptp/ptp_private.h
++++ b/drivers/ptp/ptp_private.h
+@@ -20,6 +20,11 @@
+ #define PTP_BUF_TIMESTAMPS 30
+ #define PTP_DEFAULT_MAX_VCLOCKS 20
++enum {
++      PTP_LOCK_PHYSICAL = 0,
++      PTP_LOCK_VIRTUAL,
++};
++
+ struct timestamp_event_queue {
+       struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS];
+       int head;
+diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
+index ab1d233173e1..6a14c39c4508 100644
+--- a/drivers/ptp/ptp_vclock.c
++++ b/drivers/ptp/ptp_vclock.c
+@@ -81,6 +81,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
+       return PTP_VCLOCK_REFRESH_INTERVAL;
+ }
++static void ptp_vclock_set_subclass(struct ptp_clock *ptp)
++{
++      lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL);
++}
++
+ static const struct ptp_clock_info ptp_vclock_info = {
+       .owner          = THIS_MODULE,
+       .name           = "ptp virtual clock",
+@@ -137,6 +142,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock)
+               return NULL;
+       }
++      ptp_vclock_set_subclass(vclock->clock);
++
+       timecounter_init(&vclock->tc, &vclock->cc, 0);
+       ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL);
+-- 
+2.50.1
+
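Below is a minimal sketch of the lockdep-subclass technique the fix applies: two rwsems of the same lock class are placed in different subclasses, so nesting the virtual clock's rwsem under the physical clock's is no longer reported as a possible ABBA deadlock. clock_obj and init_virtual_clock() are illustrative; lockdep_set_subclass() and init_rwsem() are the real kernel primitives.

    #include <linux/lockdep.h>
    #include <linux/rwsem.h>

    enum {
        CLOCK_LOCK_PHYSICAL = 0,    /* default subclass */
        CLOCK_LOCK_VIRTUAL,
    };

    struct clock_obj {
        struct rw_semaphore rwsem;
    };

    static void init_virtual_clock(struct clock_obj *vclk)
    {
        init_rwsem(&vclk->rwsem);
        /*
         * Mark this instance as the "inner" (virtual) lock, so lockdep
         * tracks it separately from physical-clock rwsems of the same
         * lock class.
         */
        lockdep_set_subclass(&vclk->rwsem, CLOCK_LOCK_VIRTUAL);
    }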
diff --git a/queue-5.15/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch b/queue-5.15/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
new file mode 100644 (file)
index 0000000..d7d882e
--- /dev/null
@@ -0,0 +1,73 @@
+From ecc53eb08e436c50345fff6ff4f2d84eddc7ffc3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:40:11 -0400
+Subject: sctp: linearize cloned gso packets in sctp_rcv
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ]
+
+A cloned head skb still shares these frag skbs in fraglist with the
+original head skb. It's not safe to access these frag skbs.
+
+syzbot reported two use-of-uninitialized-memory bugs caused by this:
+
+  BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+   sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+   sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998
+   sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88
+   sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331
+   sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122
+   __release_sock+0x1da/0x330 net/core/sock.c:3106
+   release_sock+0x6b/0x250 net/core/sock.c:3660
+   sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360
+   sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885
+   sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031
+   inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851
+   sock_sendmsg_nosec net/socket.c:718 [inline]
+
+and
+
+  BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+   sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+   sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88
+   sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331
+   sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148
+   __release_sock+0x1d3/0x330 net/core/sock.c:3213
+   release_sock+0x6b/0x270 net/core/sock.c:3767
+   sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367
+   sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886
+   sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032
+   inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851
+   sock_sendmsg_nosec net/socket.c:712 [inline]
+
+This patch fixes it by linearizing cloned gso packets in sctp_rcv().
+
+Fixes: 90017accff61 ("sctp: Add GSO support")
+Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com
+Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sctp/input.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sctp/input.c b/net/sctp/input.c
+index 4ee9374dcfb9..182898cb754a 100644
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -114,7 +114,7 @@ int sctp_rcv(struct sk_buff *skb)
+        * it's better to just linearize it otherwise crc computing
+        * takes longer.
+        */
+-      if ((!is_gso && skb_linearize(skb)) ||
++      if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+           !pskb_may_pull(skb, sizeof(struct sctphdr)))
+               goto discard_it;
+-- 
+2.50.1
+
diff --git a/queue-5.15/series b/queue-5.15/series
index 059ff2464709d77eef8be9ab665cb7014fffdb11..83a81a15be958140f1e4c4c0220bdabe32f889c8 100644 (file)
@@ -271,3 +271,9 @@ eventpoll-fix-semi-unbounded-recursion.patch
 documentation-acpi-fix-parent-device-references.patch
 acpi-processor-perflib-fix-initial-_ppc-limit-application.patch
 acpi-processor-perflib-move-problematic-pr-performance-check.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
+sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
+intel_idle-allow-loading-acpi-tables-for-any-family.patch
+cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
+ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch
diff --git a/queue-5.15/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-5.15/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
new file mode 100644 (file)
index 0000000..4033349
--- /dev/null
@@ -0,0 +1,51 @@
+From 92e0e5246675bee6f45ac39d6c8c1ff8e588dd53 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, so that we can't rely on offloads because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
+
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index 612da8ec1081..8f47d07c49fb 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -59,7 +59,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+       remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+       skb->remcsum_offload = remcsum;
+-      need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++      need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+       /* Try to offload checksum if possible */
+       offload_csum = !!(need_csum &&
+                         !need_ipsec &&
+-- 
+2.50.1
+
diff --git a/queue-5.4/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-5.4/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
new file mode 100644 (file)
index 0000000..7529f30
--- /dev/null
@@ -0,0 +1,128 @@
+From 4ad31ff02bc58329fdb26bd716b8c3ab15ba0533 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+      if (res < 0) {
+                nf_conntrack_get(&ct->ct_general); // HERE
+                cb->args[1] = (unsigned long)ct;
+                ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, then the refcount of ct was already incremented.
+This 2nd increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+keeps cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
+
+This can be reproduced by running conntrack_resize.sh selftest in a loop.
+It takes ~20 minutes for me on a preemptible kernel on average before
+I see a runaway kworker spinning in nf_conntrack_cleanup_net_list.
+
+One fix would be to change this to:
+        if (res < 0) {
+               if (ct != last)
+                       nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
+A followup patch will do the same for ctnetlink_exp_dump_table,
+it looks to me as if this has the same problem and like
+ctnetlink_dump_table, we only need a 'skip hint', not the actual
+object so we can apply the same cookie strategy there as well.
+
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index eeb000e41ad7..5d6f9b375c0f 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -808,8 +808,6 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+-      if (cb->args[1])
+-              nf_ct_put((struct nf_conn *)cb->args[1]);
+       kfree(cb->data);
+       return 0;
+ }
+@@ -890,18 +888,25 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+       return 0;
+ }
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++      unsigned long id = nf_ct_get_id(ct);
++
++      return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+       struct net *net = sock_net(skb->sk);
+-      struct nf_conn *ct, *last;
++      unsigned long last_id = cb->args[1];
+       struct nf_conntrack_tuple_hash *h;
+       struct hlist_nulls_node *n;
+       struct nf_conn *nf_ct_evict[8];
++      struct nf_conn *ct;
+       int res, i;
+       spinlock_t *lockp;
+-      last = (struct nf_conn *)cb->args[1];
+       i = 0;
+       local_bh_disable();
+@@ -936,7 +941,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                               continue;
+                       if (cb->args[1]) {
+-                              if (ct != last)
++                              if (ctnetlink_get_id(ct) != last_id)
+                                       continue;
+                               cb->args[1] = 0;
+                       }
+@@ -951,8 +956,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                                           ct);
+                       rcu_read_unlock();
+                       if (res < 0) {
+-                              nf_conntrack_get(&ct->ct_general);
+-                              cb->args[1] = (unsigned long)ct;
++                              cb->args[1] = ctnetlink_get_id(ct);
+                               spin_unlock(lockp);
+                               goto out;
+                       }
+@@ -965,12 +969,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+       }
+ out:
+       local_bh_enable();
+-      if (last) {
++      if (last_id) {
+               /* nf ct hash resize happened, now clear the leftover. */
+-              if ((struct nf_conn *)cb->args[1] == last)
++              if (cb->args[1] == last_id)
+                       cb->args[1] = 0;
+-
+-              nf_ct_put(last);
+       }
+       while (i) {
+-- 
+2.50.1
+
diff --git a/queue-5.4/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch b/queue-5.4/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
new file mode 100644 (file)
index 0000000..758f201
--- /dev/null
@@ -0,0 +1,73 @@
+From d22304d70bb1c9842e7607c98275fa7c2316a3a9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:40:11 -0400
+Subject: sctp: linearize cloned gso packets in sctp_rcv
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ]
+
+A cloned head skb still shares the frag skbs in its fraglist with the
+original head skb, so it's not safe to access those shared frag skbs.
+
+syzbot reported two use-of-uninitialized-memory bugs caused by this:
+
+  BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+   sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+   sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998
+   sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88
+   sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331
+   sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122
+   __release_sock+0x1da/0x330 net/core/sock.c:3106
+   release_sock+0x6b/0x250 net/core/sock.c:3660
+   sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360
+   sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885
+   sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031
+   inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851
+   sock_sendmsg_nosec net/socket.c:718 [inline]
+
+and
+
+  BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+   sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+   sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88
+   sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331
+   sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148
+   __release_sock+0x1d3/0x330 net/core/sock.c:3213
+   release_sock+0x6b/0x270 net/core/sock.c:3767
+   sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367
+   sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886
+   sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032
+   inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851
+   sock_sendmsg_nosec net/socket.c:712 [inline]
+
+This patch fixes it by linearizing cloned gso packets in sctp_rcv().
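+
+In other words (a sketch of the one-line change in the diff below), a GSO
+skb is also linearized when it is cloned, since the clone shares the
+fraglist with the original head skb:
+
+        /* linearize non-GSO skbs, and cloned GSO skbs whose fraglist
+         * is shared with the original head skb
+         */
+        if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+            !pskb_may_pull(skb, sizeof(struct sctphdr)))
+                goto discard_it;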
+
+Fixes: 90017accff61 ("sctp: Add GSO support")
+Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com
+Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sctp/input.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sctp/input.c b/net/sctp/input.c
+index b1d3e342ac83..9013257cf3df 100644
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -114,7 +114,7 @@ int sctp_rcv(struct sk_buff *skb)
+        * it's better to just linearize it otherwise crc computing
+        * takes longer.
+        */
+-      if ((!is_gso && skb_linearize(skb)) ||
++      if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+           !pskb_may_pull(skb, sizeof(struct sctphdr)))
+               goto discard_it;
+-- 
+2.50.1
+
index 9124675c6417b91bc81c69288a71aadf0052fd54..7973eb6c30a8bec87ef66d8e506721c9a2c408c8 100644 (file)
@@ -165,3 +165,6 @@ fs-prevent-file-descriptor-table-allocations-exceeding-int_max.patch
 documentation-acpi-fix-parent-device-references.patch
 acpi-processor-perflib-fix-initial-_ppc-limit-application.patch
 acpi-processor-perflib-move-problematic-pr-performance-check.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
+sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
diff --git a/queue-5.4/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-5.4/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
new file mode 100644 (file)
index 0000000..b59c330
--- /dev/null
@@ -0,0 +1,51 @@
+From 35a73e471818ebb5d92b5a51ddfbc30c777fb59c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, since we can't rely on offloads there because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
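+
+Roughly, the evaluation in __skb_udp_tunnel_segment() then becomes (the
+skb_sec_path() part is what this patch adds; see the diff below):
+
+        /* dst may already have been dropped at GSO time, but the
+         * secpath survives on this callpath
+         */
+        need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) ||
+                     skb_sec_path(skb);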
+
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index 6505a6fd245a..7e025f3517b8 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -58,7 +58,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+       remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+       skb->remcsum_offload = remcsum;
+-      need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++      need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+       /* Try to offload checksum if possible */
+       offload_csum = !!(need_csum &&
+                         !need_ipsec &&
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch b/queue-6.1/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch
new file mode 100644 (file)
index 0000000..3e26d9f
--- /dev/null
@@ -0,0 +1,117 @@
+From 9e1075bdd03cf356ae89ba3b703080b5c4fa2278 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:12:03 -0700
+Subject: KVM: nVMX: Check vmcs12->guest_ia32_debugctl on nested VM-Enter
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 095686e6fcb4150f0a55b1a25987fad3d8af58d6 ]
+
+Add a consistency check for L2's guest_ia32_debugctl, as KVM only supports
+a subset of hardware functionality, i.e. KVM can't rely on hardware to
+detect illegal/unsupported values.  Failure to check the vmcs12 value
+would allow the guest to load any hardware-supported value while running L2.
+
+Take care to exempt BTF and LBR from the validity check in order to match
+KVM's behavior for writes via WRMSR, but without clobbering vmcs12.  Even
+if VM_EXIT_SAVE_DEBUG_CONTROLS is set in vmcs12, L1 can reasonably expect
+that vmcs12->guest_ia32_debugctl will not be modified if writes to the MSR
+are being intercepted.
+
+Arguably, KVM _should_ update vmcs12 if VM_EXIT_SAVE_DEBUG_CONTROLS is set
+*and* writes to MSR_IA32_DEBUGCTLMSR are not being intercepted by L1, but
+that would incur non-trivial complexity and wouldn't change the fact that
+KVM's handling of DEBUGCTL is blatantly broken.  I.e. the extra complexity
+is not worth carrying.
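+
+The consistency check itself is small; roughly (mirroring the nested.c
+hunk below, with host_initiated == false so the check reflects what the
+guest itself is allowed to load):
+
+        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
+            CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))
+                return -EINVAL;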
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Co-developed-by: Sean Christopherson <seanjc@google.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-7-seanjc@google.com
+Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/nested.c | 12 ++++++++++--
+ arch/x86/kvm/vmx/vmx.c    |  5 ++---
+ arch/x86/kvm/vmx/vmx.h    |  3 +++
+ 3 files changed, 15 insertions(+), 5 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index d55f7edc0860..da129e12cff9 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2532,7 +2532,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+       if (vmx->nested.nested_run_pending &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+               kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+-              vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
++              vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
++                                                vmx_get_supported_debugctl(vcpu, false));
+       } else {
+               kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+               vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
+@@ -3022,7 +3023,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
+               return -EINVAL;
+       if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
+-          CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
++          (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
++           CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
+               return -EINVAL;
+       if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
+@@ -4374,6 +4376,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+               (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
+               (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
++      /*
++       * Note!  Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
++       * Writes to DEBUGCTL that aren't intercepted by L1 are immediately
++       * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
++       * vmcs02 doesn't strictly track vmcs12.
++       */
+       if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
+               kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 6517b9d929bf..0b37e21d55b1 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2052,7 +2052,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
+       return (unsigned long)data;
+ }
+-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+ {
+       u64 debugctl = 0;
+@@ -2071,8 +2071,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+       return debugctl;
+ }
+-static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data,
+-                                bool host_initiated)
++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+ {
+       u64 invalid;
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index ddbe73958d7f..99e3f46de2ec 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -442,6 +442,9 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
+ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
++
+ /*
+  * Note, early Intel manuals have the write-low and read-high bitmap offsets
+  * the wrong way round.  The bitmaps control MSRs 0x00000000-0x00001fff and
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch b/queue-6.1/kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch
new file mode 100644 (file)
index 0000000..f57b6ee
--- /dev/null
@@ -0,0 +1,156 @@
+From 2fbc005722e5d1985ef69a071a4a889ff1cb6120 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:48 -0700
+Subject: KVM: nVMX: Defer SVI update to vmcs01 on EOI when L2 is active w/o
+ VID
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Chao Gao <chao.gao@intel.com>
+
+[ Upstream commit 04bc93cf49d16d01753b95ddb5d4f230b809a991 ]
+
+If KVM emulates an EOI for L1's virtual APIC while L2 is active, defer
+updating GUEST_INTERRUPT_STATUS.SVI, i.e. the VMCS's cache of the highest
+in-service IRQ, until L1 is active, as vmcs01, not vmcs02, needs to track
+vISR.  The missed SVI update for vmcs01 can result in L1 interrupts being
+incorrectly blocked, e.g. if there is a pending interrupt with lower
+priority than the interrupt that was EOI'd.
+
+This bug only affects use cases where L1's vAPIC is effectively passed
+through to L2, e.g. in a pKVM scenario where L2 is L1's deprivileged host,
+as KVM will only emulate an EOI for L1's vAPIC if Virtual Interrupt
+Delivery (VID) is disabled in vmcs12, and L1 isn't intercepting L2 accesses
+to its (virtual) APIC page (or if x2APIC is enabled, the EOI MSR).
+
+WARN() if KVM updates L1's ISR while L2 is active with VID enabled, as an
+EOI from L2 is supposed to affect L2's vAPIC, but still defer the update,
+to try to keep L1 alive.  Specifically, KVM forwards all APICv-related
+VM-Exits to L1 via nested_vmx_l1_wants_exit():
+
+       case EXIT_REASON_APIC_ACCESS:
+       case EXIT_REASON_APIC_WRITE:
+       case EXIT_REASON_EOI_INDUCED:
+               /*
+                * The controls for "virtualize APIC accesses," "APIC-
+                * register virtualization," and "virtual-interrupt
+                * delivery" only come from vmcs12.
+                */
+               return true;
+
+Fixes: c7c9c56ca26f ("x86, apicv: add virtual interrupt delivery support")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/kvm/20230312180048.1778187-1-jason.cj.chen@intel.com
+Reported-by: Markku Ahvenjärvi <mankku@gmail.com>
+Closes: https://lore.kernel.org/all/20240920080012.74405-1-mankku@gmail.com
+Cc: Janne Karhunen <janne.karhunen@gmail.com>
+Signed-off-by: Chao Gao <chao.gao@intel.com>
+[sean: drop request, handle in VMX, write changelog]
+Tested-by: Chao Gao <chao.gao@intel.com>
+Link: https://lore.kernel.org/r/20241128000010.4051275-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve minor syntactic conflict in lapic.h, account for lack of
+       kvm_x86_call(), drop sanity check due to lack of wants_to_run]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/lapic.c      | 11 +++++++++++
+ arch/x86/kvm/lapic.h      |  1 +
+ arch/x86/kvm/vmx/nested.c |  5 +++++
+ arch/x86/kvm/vmx/vmx.c    | 16 ++++++++++++++++
+ arch/x86/kvm/vmx/vmx.h    |  1 +
+ 5 files changed, 34 insertions(+)
+
+diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
+index 3d65d6a023c9..9aae76b74417 100644
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -640,6 +640,17 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
+       }
+ }
++void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu)
++{
++      struct kvm_lapic *apic = vcpu->arch.apic;
++
++      if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active)
++              return;
++
++      static_call(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
++}
++EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr);
++
+ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
+ {
+       /* This may race with setting of irr in __apic_accept_irq() and
+diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
+index a5ac4a5a5179..e5d2dc58fcf8 100644
+--- a/arch/x86/kvm/lapic.h
++++ b/arch/x86/kvm/lapic.h
+@@ -122,6 +122,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+ enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu);
++void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu);
+ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index 8052f8b7d8e1..d55f7edc0860 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -4839,6 +4839,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+               kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+       }
++      if (vmx->nested.update_vmcs01_hwapic_isr) {
++              vmx->nested.update_vmcs01_hwapic_isr = false;
++              kvm_apic_update_hwapic_isr(vcpu);
++      }
++
+       if ((vm_exit_reason != -1) &&
+           (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
+               vmx->nested.need_vmcs12_to_shadow_sync = true;
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 721ba6ddb121..7b87fbc69b21 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6713,6 +6713,22 @@ static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+       u16 status;
+       u8 old;
++      /*
++       * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
++       * is only relevant for if and only if Virtual Interrupt Delivery is
++       * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's
++       * vAPIC, not L1's vAPIC.  KVM must update vmcs01 on the next nested
++       * VM-Exit, otherwise L1 with run with a stale SVI.
++       */
++      if (is_guest_mode(vcpu)) {
++              /*
++               * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
++               * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
++               */
++              to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
++              return;
++      }
++
+       if (max_isr == -1)
+               max_isr = 0;
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 9e0bb98b116d..8b4b149bd9c1 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -189,6 +189,7 @@ struct nested_vmx {
+       bool reload_vmcs01_apic_access_page;
+       bool update_vmcs01_cpu_dirty_logging;
+       bool update_vmcs01_apicv_status;
++      bool update_vmcs01_hwapic_isr;
+       /*
+        * Enlightened VMCS has been enabled. It does not mean that L1 has to
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch b/queue-6.1/kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch
new file mode 100644 (file)
index 0000000..aff2530
--- /dev/null
@@ -0,0 +1,123 @@
+From 7a3ebf358c60cdf6f7ef1c175053ec17e59945c3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:45 -0700
+Subject: KVM: SVM: Set RFLAGS.IF=1 in C code, to get VMRUN out of the STI
+ shadow
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit be45bc4eff33d9a7dae84a2150f242a91a617402 ]
+
+Enable/disable local IRQs, i.e. set/clear RFLAGS.IF, in the common
+svm_vcpu_enter_exit() just after/before guest_state_{enter,exit}_irqoff()
+so that VMRUN is not executed in an STI shadow.  AMD CPUs have a quirk
+(some would say "bug"), where the STI shadow bleeds into the guest's
+intr_state field if a #VMEXIT occurs during injection of an event, i.e. if
+the VMRUN doesn't complete before the subsequent #VMEXIT.
+
+The spurious "interrupts masked" state is relatively benign, as it only
+occurs during event injection and is transient.  Because KVM is already
+injecting an event, the guest can't be in HLT, and if KVM is querying IRQ
+blocking for injection, then KVM would need to force an immediate exit
+anyways since injecting multiple events is impossible.
+
+However, because KVM copies int_state verbatim from vmcb02 to vmcb12, the
+spurious STI shadow is visible to L1 when running a nested VM, which can
+trip sanity checks, e.g. in VMware's VMM.
+
+Hoist the STI+CLI all the way to C code, as the aforementioned calls to
+guest_state_{enter,exit}_irqoff() already inform lockdep that IRQs are
+enabled/disabled, and taking a fault on VMRUN with RFLAGS.IF=1 is already
+possible.  I.e. if there's kernel code that is confused by running with
+RFLAGS.IF=1, then it's already a problem.  In practice, since GIF=0 also
+blocks NMIs, the only change in exposure to non-KVM code (relative to
+surrounding VMRUN with STI+CLI) is exception handling code, and except for
+the kvm_rebooting=1 case, all exception in the core VM-Enter/VM-Exit path
+are fatal.
+
+Use the "raw" variants to enable/disable IRQs to avoid tracing in the
+"no instrumentation" code; the guest state helpers also take care of
+tracing IRQ state.
+
+Opportunistically document why KVM needs to do STI in the first place.
+
+Reported-by: Doug Covelli <doug.covelli@broadcom.com>
+Closes: https://lore.kernel.org/all/CADH9ctBs1YPmE4aCfGPNBwA10cA8RuAk2gO7542DjMZgs4uzJQ@mail.gmail.com
+Fixes: f14eec0a3203 ("KVM: SVM: move more vmentry code to assembly")
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Link: https://lore.kernel.org/r/20250224165442.2338294-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve minor syntactic conflict in __svm_sev_es_vcpu_run()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/svm/svm.c     | 14 ++++++++++++++
+ arch/x86/kvm/svm/vmenter.S |  9 +--------
+ 2 files changed, 15 insertions(+), 8 deletions(-)
+
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index b6bbd0dc4e65..c95a84afc35f 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -3982,6 +3982,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+       guest_state_enter_irqoff();
++      /*
++       * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
++       * VMRUN controls whether or not physical IRQs are masked (KVM always
++       * runs with V_INTR_MASKING_MASK).  Toggle RFLAGS.IF here to avoid the
++       * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
++       * into guest state if delivery of an event during VMRUN triggers a
++       * #VMEXIT, and the guest_state transitions already tell lockdep that
++       * IRQs are being enabled/disabled.  Note!  GIF=0 for the entirety of
++       * this path, so IRQs aren't actually unmasked while running host code.
++       */
++      raw_local_irq_enable();
++
+       amd_clear_divider();
+       if (sev_es_guest(vcpu->kvm))
+@@ -3989,6 +4001,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+       else
+               __svm_vcpu_run(svm, spec_ctrl_intercepted);
++      raw_local_irq_disable();
++
+       guest_state_exit_irqoff();
+ }
+diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
+index 42824f9b06a2..48b72625cc45 100644
+--- a/arch/x86/kvm/svm/vmenter.S
++++ b/arch/x86/kvm/svm/vmenter.S
+@@ -170,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run)
+       VM_CLEAR_CPU_BUFFERS
+       /* Enter guest mode */
+-      sti
+-
+ 3:    vmrun %_ASM_AX
+ 4:
+-      cli
+-
+       /* Pop @svm to RAX while it's the only available register. */
+       pop %_ASM_AX
+@@ -343,11 +339,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
+       VM_CLEAR_CPU_BUFFERS
+       /* Enter guest mode */
+-      sti
+-
+ 1:    vmrun %_ASM_AX
+-
+-2:    cli
++2:
+       /* Pop @svm to RDI, guest registers have been saved already. */
+       pop %_ASM_DI
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch b/queue-6.1/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch
new file mode 100644 (file)
index 0000000..eb43264
--- /dev/null
@@ -0,0 +1,63 @@
+From a0343421f8ed3cfa76b9719e3d3f1d575d5dd176 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:12:01 -0700
+Subject: KVM: VMX: Allow guest to set DEBUGCTL.RTM_DEBUG if RTM is supported
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 17ec2f965344ee3fd6620bef7ef68792f4ac3af0 ]
+
+Let the guest set DEBUGCTL.RTM_DEBUG if RTM is supported according to the
+guest CPUID model, as debug support is supposed to be available if RTM is
+supported, and there are no known downsides to letting the guest debug RTM
+aborts.
+
+Note, there are no known bug reports related to RTM_DEBUG, the primary
+motivation is to reduce the probability of breaking existing guests when a
+future change adds a missing consistency check on vmcs12.GUEST_DEBUGCTL
+(KVM currently lets L2 run with whatever hardware supports; whoops).
+
+Note #2, KVM already emulates DR6.RTM, and doesn't restrict access to
+DR7.RTM.
+
+Fixes: 83c529151ab0 ("KVM: x86: expose Intel cpu new features (HLE, RTM) to guest")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-5-seanjc@google.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/msr-index.h | 1 +
+ arch/x86/kvm/vmx/vmx.c           | 4 ++++
+ 2 files changed, 5 insertions(+)
+
+diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
+index 727947ed5e5e..afd65c815043 100644
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -379,6 +379,7 @@
+ #define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI     (1UL << 12)
+ #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
+ #define DEBUGCTLMSR_FREEZE_IN_SMM     (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
++#define DEBUGCTLMSR_RTM_DEBUG         BIT(15)
+ #define MSR_PEBS_FRONTEND             0x000003f7
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 60d1ff3fca45..9445def2b3d2 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2064,6 +2064,10 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+           (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
+               debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
++      if (boot_cpu_has(X86_FEATURE_RTM) &&
++          (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_RTM)))
++              debugctl |= DEBUGCTLMSR_RTM_DEBUG;
++
+       return debugctl;
+ }
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch b/queue-6.1/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch
new file mode 100644 (file)
index 0000000..3177e70
--- /dev/null
@@ -0,0 +1,90 @@
+From ada33297c8f7efa38a5100d5dde191508fc0254b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:12:02 -0700
+Subject: KVM: VMX: Extract checking of guest's DEBUGCTL into helper
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 8a4351ac302cd8c19729ba2636acfd0467c22ae8 ]
+
+Move VMX's logic to check DEBUGCTL values into a standalone helper so that
+the code can be used by nested VM-Enter to apply the same logic to the
+value being loaded from vmcs12.
+
+KVM needs to explicitly check vmcs12->guest_ia32_debugctl on nested
+VM-Enter, as hardware may support features that KVM does not, i.e. relying
+on hardware to detect invalid guest state will result in false negatives.
+Unfortunately, that means applying KVM's funky suppression of BTF and LBR
+to vmcs12 so as not to break existing guests.
+
+No functional change intended.
+
+Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-6-seanjc@google.com
+Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 29 +++++++++++++++++------------
+ 1 file changed, 17 insertions(+), 12 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 9445def2b3d2..6517b9d929bf 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2071,6 +2071,19 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+       return debugctl;
+ }
++static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data,
++                                bool host_initiated)
++{
++      u64 invalid;
++
++      invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
++      if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
++              kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
++              invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
++      }
++      return !invalid;
++}
++
+ /*
+  * Writes msr value into the appropriate "register".
+  * Returns 0 on success, non-0 otherwise.
+@@ -2139,19 +2152,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+               }
+               vmcs_writel(GUEST_SYSENTER_ESP, data);
+               break;
+-      case MSR_IA32_DEBUGCTLMSR: {
+-              u64 invalid;
+-
+-              invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+-              if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
+-                      kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
+-                      data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+-                      invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+-              }
+-
+-              if (invalid)
++      case MSR_IA32_DEBUGCTLMSR:
++              if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
+                       return 1;
++              data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
++
+               if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
+                                               VM_EXIT_SAVE_DEBUG_CONTROLS)
+                       get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+@@ -2161,7 +2167,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+                   (data & DEBUGCTLMSR_LBR))
+                       intel_pmu_create_guest_lbr_event(vcpu);
+               return 0;
+-      }
+       case MSR_IA32_BNDCFGS:
+               if (!kvm_mpx_supported() ||
+                   (!msr_info->host_initiated &&
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch b/queue-6.1/kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch
new file mode 100644 (file)
index 0000000..0771918
--- /dev/null
@@ -0,0 +1,56 @@
+From 318a7d25fb1c4671eb3c0e5ead8980801321fd3a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:55 -0700
+Subject: KVM: VMX: Handle forced exit due to preemption timer in fastpath
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 11776aa0cfa7d007ad1799b1553bdcbd830e5010 ]
+
+Handle VMX preemption timer VM-Exits due to KVM forcing an exit in the
+exit fastpath, i.e. avoid calling back into handle_preemption_timer() for
+the same exit.  There is no work to be done for forced exits, as the name
+suggests the goal is purely to get control back in KVM.
+
+In addition to shaving a few cycles, this will allow cleanly separating
+handle_fastpath_preemption_timer() from handle_preemption_timer(), e.g.
+it's not immediately obvious why _apparently_ calling
+handle_fastpath_preemption_timer() twice on a "slow" exit is necessary:
+the "slow" call is necessary to handle exits from L2, which are excluded
+from the fastpath by vmx_vcpu_run().
+
+Link: https://lore.kernel.org/r/20240110012705.506918-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 96bbccd9477c..c804ad001a79 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -5941,12 +5941,15 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+       if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
+               return EXIT_FASTPATH_REENTER_GUEST;
+-      if (!vmx->req_immediate_exit) {
+-              kvm_lapic_expired_hv_timer(vcpu);
+-              return EXIT_FASTPATH_REENTER_GUEST;
+-      }
++      /*
++       * If the timer expired because KVM used it to force an immediate exit,
++       * then mission accomplished.
++       */
++      if (vmx->req_immediate_exit)
++              return EXIT_FASTPATH_EXIT_HANDLED;
+-      return EXIT_FASTPATH_NONE;
++      kvm_lapic_expired_hv_timer(vcpu);
++      return EXIT_FASTPATH_REENTER_GUEST;
+ }
+ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch b/queue-6.1/kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch
new file mode 100644 (file)
index 0000000..f71c1ee
--- /dev/null
@@ -0,0 +1,74 @@
+From d21ac42171b150d6870e91a395f72845982311ff Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:57 -0700
+Subject: KVM: VMX: Handle KVM-induced preemption timer exits in fastpath for
+ L2
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 7b3d1bbf8d68d76fb21210932a5e8ed8ea80dbcc ]
+
+Eat VMX preemption timer exits in the fastpath regardless of whether L1 or
+L2 is active.  The VM-Exit is 100% KVM-induced, i.e. there is nothing
+directly related to the exit that KVM needs to do on behalf of the guest,
+thus there is no reason to wait until the slow path to do nothing.
+
+Opportunistically add comments explaining why preemption timer exits for
+emulating the guest's APIC timer need to go down the slow path.
+
+Link: https://lore.kernel.org/r/20240110012705.506918-6-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 22 ++++++++++++++++++++--
+ 1 file changed, 20 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 18ceed9046a9..4db9d41d988c 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -5948,13 +5948,26 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+       if (vmx->req_immediate_exit)
+               return EXIT_FASTPATH_EXIT_HANDLED;
++      /*
++       * If L2 is active, go down the slow path as emulating the guest timer
++       * expiration likely requires synthesizing a nested VM-Exit.
++       */
++      if (is_guest_mode(vcpu))
++              return EXIT_FASTPATH_NONE;
++
+       kvm_lapic_expired_hv_timer(vcpu);
+       return EXIT_FASTPATH_REENTER_GUEST;
+ }
+ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+ {
+-      handle_fastpath_preemption_timer(vcpu);
++      /*
++       * This non-fastpath handler is reached if and only if the preemption
++       * timer was being used to emulate a guest timer while L2 is active.
++       * All other scenarios are supposed to be handled in the fastpath.
++       */
++      WARN_ON_ONCE(!is_guest_mode(vcpu));
++      kvm_lapic_expired_hv_timer(vcpu);
+       return 1;
+ }
+@@ -7138,7 +7151,12 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+ {
+-      if (is_guest_mode(vcpu))
++      /*
++       * If L2 is active, some VMX preemption timer exits can be handled in
++       * the fastpath even, all other exits must use the slow path.
++       */
++      if (is_guest_mode(vcpu) &&
++          to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
+               return EXIT_FASTPATH_NONE;
+       switch (to_vmx(vcpu)->exit_reason.basic) {
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch b/queue-6.1/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch
new file mode 100644 (file)
index 0000000..94781bd
--- /dev/null
@@ -0,0 +1,191 @@
+From 94d2d32566130542daf6fc1a32f0c8b615def9bd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:12:05 -0700
+Subject: KVM: VMX: Preserve host's DEBUGCTLMSR_FREEZE_IN_SMM while running the
+ guest
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 6b1dd26544d045f6a79e8c73572c0c0db3ef3c1a ]
+
+Set/clear DEBUGCTLMSR_FREEZE_IN_SMM in GUEST_IA32_DEBUGCTL based on the
+host's pre-VM-Enter value, i.e. preserve the host's FREEZE_IN_SMM setting
+while running the guest.  When running with the "default treatment of SMIs"
+in effect (the only mode KVM supports), SMIs do not generate a VM-Exit that
+is visible to host (non-SMM) software, and instead transitions directly
+from VMX non-root to SMM.  And critically, DEBUGCTL isn't context switched
+by hardware on SMI or RSM, i.e. SMM will run with whatever value was
+resident in hardware at the time of the SMI.
+
+Failure to preserve FREEZE_IN_SMM results in the PMU unexpectedly counting
+events while the CPU is executing in SMM, which can pollute profiling and
+potentially leak information into the guest.
+
+Check for changes in FREEZE_IN_SMM prior to every entry into KVM's inner
+run loop, as the bit can be toggled in IRQ context via IPI callback (SMP
+function call), by way of /sys/devices/cpu/freeze_on_smi.
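+
+The reload itself is cheap: if the host-owned FREEZE_IN_SMM bit did not
+change, nothing is rewritten.  A rough sketch of vmx_reload_guest_debugctl()
+as added below:
+
+        u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+        /* only rewrite GUEST_IA32_DEBUGCTL if FREEZE_IN_SMM changed */
+        if (!((val ^ vcpu->arch.host_debugctl) & DEBUGCTLMSR_FREEZE_IN_SMM))
+                return;
+
+        vmx_guest_debugctl_write(vcpu, val & ~DEBUGCTLMSR_FREEZE_IN_SMM);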
+
+Add a field in kvm_x86_ops to communicate which DEBUGCTL bits need to be
+preserved, as FREEZE_IN_SMM is only supported and defined for Intel CPUs,
+i.e. explicitly checking FREEZE_IN_SMM in common x86 is at best weird, and
+at worst could lead to undesirable behavior in the future if AMD CPUs ever
+happened to pick up a collision with the bit.
+
+Exempt TDX vCPUs, i.e. protected guests, from the check, as the TDX Module
+owns and controls GUEST_IA32_DEBUGCTL.
+
+WARN in SVM if KVM_RUN_LOAD_DEBUGCTL is set, mostly to document that the
+lack of handling isn't a KVM bug (TDX already WARNs on any run_flag).
+
+Lastly, explicitly reload GUEST_IA32_DEBUGCTL on a VM-Fail that is missed
+by KVM but detected by hardware, i.e. in nested_vmx_restore_host_state().
+Doing so avoids the need to track host_debugctl on a per-VMCS basis, as
+GUEST_IA32_DEBUGCTL is unconditionally written by prepare_vmcs02() and
+load_vmcs12_host_state().  For the VM-Fail case, even though KVM won't
+have actually entered the guest, vcpu_enter_guest() will have run with
+vmcs02 active and thus could result in vmcs01 being run with a stale value.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Co-developed-by: Sean Christopherson <seanjc@google.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-9-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: move vmx/main.c change to vmx/vmx.c]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h |  7 +++++++
+ arch/x86/kvm/vmx/nested.c       |  3 +++
+ arch/x86/kvm/vmx/vmx.c          |  5 +++++
+ arch/x86/kvm/vmx/vmx.h          | 15 ++++++++++++++-
+ arch/x86/kvm/x86.c              | 14 ++++++++++++--
+ 5 files changed, 41 insertions(+), 3 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index c8fc4f2acf69..d0229323ca63 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1459,6 +1459,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+ enum kvm_x86_run_flags {
+       KVM_RUN_FORCE_IMMEDIATE_EXIT    = BIT(0),
+       KVM_RUN_LOAD_GUEST_DR6          = BIT(1),
++      KVM_RUN_LOAD_DEBUGCTL           = BIT(2),
+ };
+ struct kvm_x86_ops {
+@@ -1484,6 +1485,12 @@ struct kvm_x86_ops {
+       void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+       void (*vcpu_put)(struct kvm_vcpu *vcpu);
++      /*
++       * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to
++       * match the host's value even while the guest is active.
++       */
++      const u64 HOST_OWNED_DEBUGCTL;
++
+       void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
+       int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+       int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index a220770644e1..2c3cf4351c4c 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -4627,6 +4627,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+                       WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+       }
++      /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
++      vmx_reload_guest_debugctl(vcpu);
++
+       /*
+        * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+        * handle a variety of side effects to KVM's software model.
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index e470a294b22d..3fef4e14abc6 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7258,6 +7258,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+       if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+               set_debugreg(vcpu->arch.dr6, 6);
++      if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
++              vmx_reload_guest_debugctl(vcpu);
++
+       /*
+        * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
+        * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+@@ -8197,6 +8200,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+       .vcpu_load = vmx_vcpu_load,
+       .vcpu_put = vmx_vcpu_put,
++      .HOST_OWNED_DEBUGCTL = DEBUGCTLMSR_FREEZE_IN_SMM,
++
+       .update_exception_bitmap = vmx_update_exception_bitmap,
+       .get_msr_feature = vmx_get_msr_feature,
+       .get_msr = vmx_get_msr,
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index b7ae263cde7b..dc6f06326648 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -447,12 +447,25 @@ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+ static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
+ {
++      WARN_ON_ONCE(val & DEBUGCTLMSR_FREEZE_IN_SMM);
++
++      val |= vcpu->arch.host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM;
+       vmcs_write64(GUEST_IA32_DEBUGCTL, val);
+ }
+ static inline u64 vmx_guest_debugctl_read(void)
+ {
+-      return vmcs_read64(GUEST_IA32_DEBUGCTL);
++      return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~DEBUGCTLMSR_FREEZE_IN_SMM;
++}
++
++static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu)
++{
++      u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL);
++
++      if (!((val ^ vcpu->arch.host_debugctl) & DEBUGCTLMSR_FREEZE_IN_SMM))
++              return;
++
++      vmx_guest_debugctl_write(vcpu, val & ~DEBUGCTLMSR_FREEZE_IN_SMM);
+ }
+ /*
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 9d66830d594c..dfecf5ba5aa7 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10591,7 +10591,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               dm_request_for_irq_injection(vcpu) &&
+               kvm_cpu_accept_dm_intr(vcpu);
+       fastpath_t exit_fastpath;
+-      u64 run_flags;
++      u64 run_flags, debug_ctl;
+       bool req_immediate_exit = false;
+@@ -10838,7 +10838,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               set_debugreg(0, 7);
+       }
+-      vcpu->arch.host_debugctl = get_debugctlmsr();
++      /*
++       * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL
++       * can be modified in IRQ context, e.g. via SMP function calls.  Inform
++       * vendor code if any host-owned bits were changed, e.g. so that the
++       * value loaded into hardware while running the guest can be updated.
++       */
++      debug_ctl = get_debugctlmsr();
++      if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
++          !vcpu->arch.guest_state_protected)
++              run_flags |= KVM_RUN_LOAD_DEBUGCTL;
++      vcpu->arch.host_debugctl = debug_ctl;
+       guest_timing_enter_irqoff();
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch b/queue-6.1/kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch
new file mode 100644 (file)
index 0000000..d873837
--- /dev/null
@@ -0,0 +1,49 @@
+From 46e5f37d619ea0a3b02610d32be90ddab43d9393 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:54 -0700
+Subject: KVM: VMX: Re-enter guest in fastpath for "spurious" preemption timer
+ exits
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit e6b5d16bbd2d4c8259ad76aa33de80d561aba5f9 ]
+
+Re-enter the guest in the fast path if VMX preemption timer VM-Exit was
+"spurious", i.e. if KVM "soft disabled" the timer by writing -1u and by
+some miracle the timer expired before any other VM-Exit occurred.  This is
+just an intermediate step to cleaning up the preemption timer handling,
+optimizing these types of spurious VM-Exits is not interesting as they are
+extremely rare/infrequent.
+
+Link: https://lore.kernel.org/r/20240110012705.506918-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 0b495979a02b..96bbccd9477c 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -5933,8 +5933,15 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+-      if (!vmx->req_immediate_exit &&
+-          !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
++      /*
++       * In the *extremely* unlikely scenario that this is a spurious VM-Exit
++       * due to the timer expiring while it was "soft" disabled, just eat the
++       * exit and re-enter the guest.
++       */
++      if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
++              return EXIT_FASTPATH_REENTER_GUEST;
++
++      if (!vmx->req_immediate_exit) {
+               kvm_lapic_expired_hv_timer(vcpu);
+               return EXIT_FASTPATH_REENTER_GUEST;
+       }
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch b/queue-6.1/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch
new file mode 100644 (file)
index 0000000..d108a9e
--- /dev/null
@@ -0,0 +1,162 @@
+From 495f4d2993192a89076ae3ae03216019fc88fa55 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:12:04 -0700
+Subject: KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 7d0cce6cbe71af6e9c1831bff101a2b9c249c4a2 ]
+
+Introduce vmx_guest_debugctl_{read,write}() to handle all accesses to
+vmcs.GUEST_IA32_DEBUGCTL. This will allow stuffing FREEZE_IN_SMM into
+GUEST_IA32_DEBUGCTL based on the host setting without bleeding the state
+into the guest, and without needing to copy+paste the FREEZE_IN_SMM
+logic into every path that accesses GUEST_IA32_DEBUGCTL.
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+[sean: massage changelog, make inline, use in all prepare_vmcs02() cases]
+Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-8-seanjc@google.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/nested.c    | 10 +++++-----
+ arch/x86/kvm/vmx/pmu_intel.c |  8 ++++----
+ arch/x86/kvm/vmx/vmx.c       |  8 +++++---
+ arch/x86/kvm/vmx/vmx.h       | 10 ++++++++++
+ 4 files changed, 24 insertions(+), 12 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index da129e12cff9..a220770644e1 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2532,11 +2532,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+       if (vmx->nested.nested_run_pending &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+               kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+-              vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
+-                                                vmx_get_supported_debugctl(vcpu, false));
++              vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
++                                             vmx_get_supported_debugctl(vcpu, false));
+       } else {
+               kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+-              vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
++              vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
+       }
+       if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
+           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+@@ -3404,7 +3404,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+       if (!vmx->nested.nested_run_pending ||
+           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+-              vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
++              vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
+       if (kvm_mpx_supported() &&
+           (!vmx->nested.nested_run_pending ||
+            !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+@@ -4572,7 +4572,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
+       kvm_set_dr(vcpu, 7, 0x400);
+-      vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
++      vmx_guest_debugctl_write(vcpu, 0);
+       if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+                               vmcs12->vm_exit_msr_load_count))
+diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
+index 220cdbe1e286..76d3ed8abf6a 100644
+--- a/arch/x86/kvm/vmx/pmu_intel.c
++++ b/arch/x86/kvm/vmx/pmu_intel.c
+@@ -672,11 +672,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
+  */
+ static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
+ {
+-      u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
++      u64 data = vmx_guest_debugctl_read();
+       if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
+               data &= ~DEBUGCTLMSR_LBR;
+-              vmcs_write64(GUEST_IA32_DEBUGCTL, data);
++              vmx_guest_debugctl_write(vcpu, data);
+       }
+ }
+@@ -746,7 +746,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+       if (!lbr_desc->event) {
+               vmx_disable_lbr_msrs_passthrough(vcpu);
+-              if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
++              if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)
+                       goto warn;
+               if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
+                       goto warn;
+@@ -769,7 +769,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
+ {
+-      if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
++      if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR))
+               intel_pmu_release_guest_lbr_event(vcpu);
+ }
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 0b37e21d55b1..e470a294b22d 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2027,7 +2027,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+                       msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
+               break;
+       case MSR_IA32_DEBUGCTLMSR:
+-              msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
++              msr_info->data = vmx_guest_debugctl_read();
+               break;
+       default:
+       find_uret_msr:
+@@ -2161,7 +2161,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+                                               VM_EXIT_SAVE_DEBUG_CONTROLS)
+                       get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+-              vmcs_write64(GUEST_IA32_DEBUGCTL, data);
++              vmx_guest_debugctl_write(vcpu, data);
++
+               if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
+                   (data & DEBUGCTLMSR_LBR))
+                       intel_pmu_create_guest_lbr_event(vcpu);
+@@ -4751,7 +4752,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
+       vmcs_write32(GUEST_SYSENTER_CS, 0);
+       vmcs_writel(GUEST_SYSENTER_ESP, 0);
+       vmcs_writel(GUEST_SYSENTER_EIP, 0);
+-      vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
++
++      vmx_guest_debugctl_write(&vmx->vcpu, 0);
+       if (cpu_has_vmx_tpr_shadow()) {
+               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 99e3f46de2ec..b7ae263cde7b 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -445,6 +445,16 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+ u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
+ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
++static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
++{
++      vmcs_write64(GUEST_IA32_DEBUGCTL, val);
++}
++
++static inline u64 vmx_guest_debugctl_read(void)
++{
++      return vmcs_read64(GUEST_IA32_DEBUGCTL);
++}
++
+ /*
+  * Note, early Intel manuals have the write-low and read-high bitmap offsets
+  * the wrong way round.  The bitmaps control MSRs 0x00000000-0x00001fff and
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch b/queue-6.1/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch
new file mode 100644 (file)
index 0000000..bdc82eb
--- /dev/null
@@ -0,0 +1,138 @@
+From 36f7addde5e161c3ad08eccfdaaf6d318b6e6461 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:59 -0700
+Subject: KVM: x86: Convert vcpu_run()'s immediate exit param into a generic
+ bitmap
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 2478b1b220c49d25cb1c3f061ec4f9b351d9a131 ]
+
+Convert kvm_x86_ops.vcpu_run()'s "force_immediate_exit" boolean parameter
+into a generic bitmap so that similar "take action" information can be
+passed to vendor code without creating a pile of boolean parameters.
+
+This will allow dropping kvm_x86_ops.set_dr6() in favor of a new flag, and
+will also allow for adding similar functionality for re-loading debugctl
+in the active VMCS.
+
+Opportunistically massage the TDX WARN and comment to prepare for adding
+more run_flags, all of which are expected to be mutually exclusive with
+TDX, i.e. should be WARNed on.
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: drop TDX crud, account for lack of kvm_x86_call()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h |  6 +++++-
+ arch/x86/kvm/svm/svm.c          |  4 ++--
+ arch/x86/kvm/vmx/vmx.c          |  3 ++-
+ arch/x86/kvm/x86.c              | 10 ++++++++--
+ 4 files changed, 17 insertions(+), 6 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 86f3bd6601e7..1383f5e5238a 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1456,6 +1456,10 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+       return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
+ }
++enum kvm_x86_run_flags {
++      KVM_RUN_FORCE_IMMEDIATE_EXIT    = BIT(0),
++};
++
+ struct kvm_x86_ops {
+       const char *name;
+@@ -1529,7 +1533,7 @@ struct kvm_x86_ops {
+       int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
+       enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
+-                                                bool force_immediate_exit);
++                                                u64 run_flags);
+       int (*handle_exit)(struct kvm_vcpu *vcpu,
+               enum exit_fastpath_completion exit_fastpath);
+       int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 12de50db401f..dc8a1b72d8ec 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4008,9 +4008,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+       guest_state_exit_irqoff();
+ }
+-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+-                                        bool force_immediate_exit)
++static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ {
++      bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 179747d04edc..382f42200688 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7204,8 +7204,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+       guest_state_exit_irqoff();
+ }
+-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
++static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ {
++      bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long cr3, cr4;
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 400a6e9fb0be..83e5e823cbae 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10591,6 +10591,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               dm_request_for_irq_injection(vcpu) &&
+               kvm_cpu_accept_dm_intr(vcpu);
+       fastpath_t exit_fastpath;
++      u64 run_flags;
+       bool req_immediate_exit = false;
+@@ -10811,8 +10812,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               goto cancel_injection;
+       }
+-      if (req_immediate_exit)
++      run_flags = 0;
++      if (req_immediate_exit) {
++              run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
++      }
+       fpregs_assert_state_consistent();
+       if (test_thread_flag(TIF_NEED_FPU_LOAD))
+@@ -10848,7 +10852,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+                            (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
+-              exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
++              exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, run_flags);
+               if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+                       break;
+@@ -10860,6 +10864,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+                       break;
+               }
++              run_flags = 0;
++
+               /* Note, VM-Exits that go down the "slow" path are accounted below. */
+               ++vcpu->stat.exits;
+       }
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch b/queue-6.1/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch
new file mode 100644 (file)
index 0000000..6c0c878
--- /dev/null
@@ -0,0 +1,144 @@
+From 3a65689ab6b232b205f7e1d222883025eacb62d1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:12:00 -0700
+Subject: KVM: x86: Drop kvm_x86_ops.set_dr6() in favor of a new KVM_RUN flag
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 80c64c7afea1da6a93ebe88d3d29d8a60377ef80 ]
+
+Instruct vendor code to load the guest's DR6 into hardware via a new
+KVM_RUN flag, and remove kvm_x86_ops.set_dr6(), whose sole purpose was to
+load vcpu->arch.dr6 into hardware when DR6 can be read/written directly
+by the guest.
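+
+A rough sketch of the resulting flow, using the names added below:
+
+      /* common x86: request the reload via a run flag */
+      if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+              run_flags |= KVM_RUN_LOAD_GUEST_DR6;
+
+      /* vendor code: honor the request just before VM-Enter */
+      if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+              set_debugreg(vcpu->arch.dr6, 6);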
+
+Note, TDX already WARNs on any run_flag being set, i.e. will yell if KVM
+thinks DR6 needs to be reloaded.  TDX vCPUs force KVM_DEBUGREG_AUTO_SWITCH
+and never clear the flag, i.e. should never observe KVM_RUN_LOAD_GUEST_DR6.
+
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: account for lack of vmx/main.c]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm-x86-ops.h |  1 -
+ arch/x86/include/asm/kvm_host.h    |  2 +-
+ arch/x86/kvm/svm/svm.c             | 10 ++++++----
+ arch/x86/kvm/vmx/vmx.c             | 10 +++-------
+ arch/x86/kvm/x86.c                 |  2 +-
+ 5 files changed, 11 insertions(+), 14 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
+index 0e5ae3b0c867..c068565fe954 100644
+--- a/arch/x86/include/asm/kvm-x86-ops.h
++++ b/arch/x86/include/asm/kvm-x86-ops.h
+@@ -47,7 +47,6 @@ KVM_X86_OP(set_idt)
+ KVM_X86_OP(get_gdt)
+ KVM_X86_OP(set_gdt)
+ KVM_X86_OP(sync_dirty_debug_regs)
+-KVM_X86_OP(set_dr6)
+ KVM_X86_OP(set_dr7)
+ KVM_X86_OP(cache_reg)
+ KVM_X86_OP(get_rflags)
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 1383f5e5238a..c8fc4f2acf69 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1458,6 +1458,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+ enum kvm_x86_run_flags {
+       KVM_RUN_FORCE_IMMEDIATE_EXIT    = BIT(0),
++      KVM_RUN_LOAD_GUEST_DR6          = BIT(1),
+ };
+ struct kvm_x86_ops {
+@@ -1504,7 +1505,6 @@ struct kvm_x86_ops {
+       void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+       void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+       void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
+-      void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
+       void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
+       void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+       unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index dc8a1b72d8ec..5a6bd9d5cceb 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4052,10 +4052,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+       svm_hv_update_vp_id(svm->vmcb, vcpu);
+       /*
+-       * Run with all-zero DR6 unless needed, so that we can get the exact cause
+-       * of a #DB.
++       * Run with all-zero DR6 unless the guest can write DR6 freely, so that
++       * KVM can get the exact cause of a #DB.  Note, loading guest DR6 from
++       * KVM's snapshot is only necessary when DR accesses won't exit.
+        */
+-      if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
++      if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
++              svm_set_dr6(vcpu, vcpu->arch.dr6);
++      else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+               svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
+       clgi();
+@@ -4822,7 +4825,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
+       .set_idt = svm_set_idt,
+       .get_gdt = svm_get_gdt,
+       .set_gdt = svm_set_gdt,
+-      .set_dr6 = svm_set_dr6,
+       .set_dr7 = svm_set_dr7,
+       .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
+       .cache_reg = svm_cache_reg,
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 382f42200688..60d1ff3fca45 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -5530,12 +5530,6 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+       set_debugreg(DR6_RESERVED, 6);
+ }
+-static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+-{
+-      lockdep_assert_irqs_disabled();
+-      set_debugreg(vcpu->arch.dr6, 6);
+-}
+-
+ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+ {
+       vmcs_writel(GUEST_DR7, val);
+@@ -7251,6 +7245,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+               vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+       vcpu->arch.regs_dirty = 0;
++      if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
++              set_debugreg(vcpu->arch.dr6, 6);
++
+       /*
+        * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
+        * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+@@ -8208,7 +8205,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+       .set_idt = vmx_set_idt,
+       .get_gdt = vmx_get_gdt,
+       .set_gdt = vmx_set_gdt,
+-      .set_dr6 = vmx_set_dr6,
+       .set_dr7 = vmx_set_dr7,
+       .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
+       .cache_reg = vmx_cache_reg,
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 83e5e823cbae..9d66830d594c 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10833,7 +10833,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               set_debugreg(vcpu->arch.eff_db[3], 3);
+               /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+               if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+-                      static_call(kvm_x86_set_dr6)(vcpu, vcpu->arch.dr6);
++                      run_flags |= KVM_RUN_LOAD_GUEST_DR6;
+       } else if (unlikely(hw_breakpoint_active())) {
+               set_debugreg(0, 7);
+       }
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch b/queue-6.1/kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch
new file mode 100644 (file)
index 0000000..5c67685
--- /dev/null
@@ -0,0 +1,265 @@
+From b596c99630a856d3912ec549084a96dd2546752f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:58 -0700
+Subject: KVM: x86: Fully defer to vendor code to decide how to force immediate
+ exit
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 0ec3d6d1f169baa7fc512ae4b78d17e7c94b7763 ]
+
+Now that vmx->req_immediate_exit is used only in the scope of
+vmx_vcpu_run(), use force_immediate_exit to detect that KVM should usurp
+the VMX preemption timer to force a VM-Exit and let vendor code fully handle
+forcing a VM-Exit.
+
+Opportunistically drop __kvm_request_immediate_exit() and just have
+vendor code call smp_send_reschedule() directly.  SVM already does this
+when injecting an event while also trying to single-step an IRET, i.e.
+it's not exactly secret knowledge that KVM uses a reschedule IPI to force
+an exit.
+
+Link: https://lore.kernel.org/r/20240110012705.506918-7-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve absurd conflict due to funky kvm_x86_ops.sched_in prototype]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm-x86-ops.h |  1 -
+ arch/x86/include/asm/kvm_host.h    |  3 ---
+ arch/x86/kvm/svm/svm.c             |  7 ++++---
+ arch/x86/kvm/vmx/vmx.c             | 32 +++++++++++++-----------------
+ arch/x86/kvm/vmx/vmx.h             |  2 --
+ arch/x86/kvm/x86.c                 | 10 +---------
+ 6 files changed, 19 insertions(+), 36 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
+index 29bef25ac77c..0e5ae3b0c867 100644
+--- a/arch/x86/include/asm/kvm-x86-ops.h
++++ b/arch/x86/include/asm/kvm-x86-ops.h
+@@ -100,7 +100,6 @@ KVM_X86_OP(write_tsc_multiplier)
+ KVM_X86_OP(get_exit_info)
+ KVM_X86_OP(check_intercept)
+ KVM_X86_OP(handle_exit_irqoff)
+-KVM_X86_OP(request_immediate_exit)
+ KVM_X86_OP(sched_in)
+ KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
+ KVM_X86_OP_OPTIONAL(vcpu_blocking)
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 93f523762854..86f3bd6601e7 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1590,8 +1590,6 @@ struct kvm_x86_ops {
+                              struct x86_exception *exception);
+       void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
+-      void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
+-
+       void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
+       /*
+@@ -2059,7 +2057,6 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
+ void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
+                                    u32 size);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 337a304d211b..12de50db401f 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4033,9 +4033,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+                * is enough to force an immediate vmexit.
+                */
+               disable_nmi_singlestep(svm);
+-              smp_send_reschedule(vcpu->cpu);
++              force_immediate_exit = true;
+       }
++      if (force_immediate_exit)
++              smp_send_reschedule(vcpu->cpu);
++
+       pre_svm_run(vcpu);
+       sync_lapic_to_cr8(vcpu);
+@@ -4874,8 +4877,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
+       .check_intercept = svm_check_intercept,
+       .handle_exit_irqoff = svm_handle_exit_irqoff,
+-      .request_immediate_exit = __kvm_request_immediate_exit,
+-
+       .sched_in = svm_sched_in,
+       .nested_ops = &svm_nested_ops,
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 4db9d41d988c..179747d04edc 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -49,6 +49,8 @@
+ #include <asm/virtext.h>
+ #include <asm/vmx.h>
++#include <trace/events/ipi.h>
++
+ #include "capabilities.h"
+ #include "cpuid.h"
+ #include "evmcs.h"
+@@ -1223,8 +1225,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+       u16 fs_sel, gs_sel;
+       int i;
+-      vmx->req_immediate_exit = false;
+-
+       /*
+        * Note that guest MSRs to be saved/restored can also be changed
+        * when guest state is loaded. This happens when guest transitions
+@@ -5929,7 +5929,8 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
+       return 1;
+ }
+-static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
++static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
++                                                 bool force_immediate_exit)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+@@ -5945,7 +5946,7 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+        * If the timer expired because KVM used it to force an immediate exit,
+        * then mission accomplished.
+        */
+-      if (vmx->req_immediate_exit)
++      if (force_immediate_exit)
+               return EXIT_FASTPATH_EXIT_HANDLED;
+       /*
+@@ -7090,13 +7091,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
+                                       msrs[i].host, false);
+ }
+-static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
++static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 tscl;
+       u32 delta_tsc;
+-      if (vmx->req_immediate_exit) {
++      if (force_immediate_exit) {
+               vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
+               vmx->loaded_vmcs->hv_timer_soft_disabled = false;
+       } else if (vmx->hv_deadline_tsc != -1) {
+@@ -7149,7 +7150,8 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+       barrier_nospec();
+ }
+-static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
++static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
++                                           bool force_immediate_exit)
+ {
+       /*
+        * If L2 is active, some VMX preemption timer exits can be handled in
+@@ -7163,7 +7165,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+       case EXIT_REASON_MSR_WRITE:
+               return handle_fastpath_set_msr_irqoff(vcpu);
+       case EXIT_REASON_PREEMPTION_TIMER:
+-              return handle_fastpath_preemption_timer(vcpu);
++              return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
+       default:
+               return EXIT_FASTPATH_NONE;
+       }
+@@ -7284,7 +7286,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+               vmx_passthrough_lbr_msrs(vcpu);
+       if (enable_preemption_timer)
+-              vmx_update_hv_timer(vcpu);
++              vmx_update_hv_timer(vcpu, force_immediate_exit);
++      else if (force_immediate_exit)
++              smp_send_reschedule(vcpu->cpu);
+       kvm_wait_lapic_expire(vcpu);
+@@ -7358,7 +7362,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+       vmx_recover_nmi_blocking(vmx);
+       vmx_complete_interrupts(vmx);
+-      return vmx_exit_handlers_fastpath(vcpu);
++      return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
+ }
+ static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
+@@ -7865,11 +7869,6 @@ static __init void vmx_set_cpu_caps(void)
+               kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
+ }
+-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
+-{
+-      to_vmx(vcpu)->req_immediate_exit = true;
+-}
+-
+ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+                                 struct x86_instruction_info *info)
+ {
+@@ -8275,8 +8274,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+       .check_intercept = vmx_check_intercept,
+       .handle_exit_irqoff = vmx_handle_exit_irqoff,
+-      .request_immediate_exit = vmx_request_immediate_exit,
+-
+       .sched_in = vmx_sched_in,
+       .cpu_dirty_log_size = PML_ENTITY_NUM,
+@@ -8533,7 +8530,6 @@ static __init int hardware_setup(void)
+       if (!enable_preemption_timer) {
+               vmx_x86_ops.set_hv_timer = NULL;
+               vmx_x86_ops.cancel_hv_timer = NULL;
+-              vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
+       }
+       kvm_caps.supported_mce_cap |= MCG_LMCE_P;
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 357819872d80..ddbe73958d7f 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -343,8 +343,6 @@ struct vcpu_vmx {
+       unsigned int ple_window;
+       bool ple_window_dirty;
+-      bool req_immediate_exit;
+-
+       /* Support for PML */
+ #define PML_ENTITY_NUM                512
+       struct page *pml_pg;
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 08c3da88f402..400a6e9fb0be 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10578,12 +10578,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+       static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
+ }
+-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
+-{
+-      smp_send_reschedule(vcpu->cpu);
+-}
+-EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
+-
+ /*
+  * Called within kvm->srcu read side.
+  * Returns 1 to let vcpu_run() continue the guest execution loop without
+@@ -10817,10 +10811,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               goto cancel_injection;
+       }
+-      if (req_immediate_exit) {
++      if (req_immediate_exit)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
+-              static_call(kvm_x86_request_immediate_exit)(vcpu);
+-      }
+       fpregs_assert_state_consistent();
+       if (test_thread_flag(TIF_NEED_FPU_LOAD))
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch b/queue-6.1/kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch
new file mode 100644 (file)
index 0000000..2c2f2c1
--- /dev/null
@@ -0,0 +1,82 @@
+From 2ce55c36cca09ff95c3ba4cdb09407fc864500b0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:56 -0700
+Subject: KVM: x86: Move handling of is_guest_mode() into fastpath exit
+ handlers
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit bf1a49436ea37b98dd2f37c57608951d0e28eecc ]
+
+Let the fastpath code decide which exits can/can't be handled in the
+fastpath when L2 is active, e.g. when KVM generates a VMX preemption
+timer exit to forcefully regain control, there is no "work" to be done and
+so such exits can be handled in the fastpath regardless of whether L1 or
+L2 is active.
+
+Moving the is_guest_mode() check into the fastpath code also makes it
+easier to see that L2 isn't allowed to use the fastpath in most cases;
+without the move, it's not immediately obvious why
+handle_fastpath_preemption_timer() is called from both the fastpath and
+the normal path.
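+
+Illustrative shape of the change (matching the diff below):
+
+      static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+      {
+              if (is_guest_mode(vcpu))
+                      return EXIT_FASTPATH_NONE;
+              ...
+      }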
+
+Link: https://lore.kernel.org/r/20240110012705.506918-5-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve syntactic conflict in svm_exit_handlers_fastpath()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/svm/svm.c | 6 +++---
+ arch/x86/kvm/vmx/vmx.c | 6 +++---
+ 2 files changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index b4283c2358a6..337a304d211b 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -3964,6 +3964,9 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+ {
+       struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
++      if (is_guest_mode(vcpu))
++              return EXIT_FASTPATH_NONE;
++
+       /*
+        * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
+        * can't read guest memory (dereference memslots) to decode the WRMSR.
+@@ -4127,9 +4130,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+       svm_complete_interrupts(vcpu);
+-      if (is_guest_mode(vcpu))
+-              return EXIT_FASTPATH_NONE;
+-
+       return svm_exit_handlers_fastpath(vcpu);
+ }
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index c804ad001a79..18ceed9046a9 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7138,6 +7138,9 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+ {
++      if (is_guest_mode(vcpu))
++              return EXIT_FASTPATH_NONE;
++
+       switch (to_vmx(vcpu)->exit_reason.basic) {
+       case EXIT_REASON_MSR_WRITE:
+               return handle_fastpath_set_msr_irqoff(vcpu);
+@@ -7337,9 +7340,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+       vmx_recover_nmi_blocking(vmx);
+       vmx_complete_interrupts(vmx);
+-      if (is_guest_mode(vcpu))
+-              return EXIT_FASTPATH_NONE;
+-
+       return vmx_exit_handlers_fastpath(vcpu);
+ }
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch b/queue-6.1/kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch
new file mode 100644 (file)
index 0000000..cac06bc
--- /dev/null
@@ -0,0 +1,130 @@
+From b8df9da8aaf5d2d743800536dbd0bf0ec684f320 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:53 -0700
+Subject: KVM: x86: Plumb "force_immediate_exit" into kvm_entry() tracepoint
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 9c9025ea003a03f967affd690f39b4ef3452c0f5 ]
+
+Annotate the kvm_entry() tracepoint with "immediate exit" when KVM is
+forcing a VM-Exit immediately after VM-Enter, e.g. when KVM wants to
+inject an event but needs to first complete some other operation.
+Knowing that KVM is (or isn't) forcing an exit is useful information when
+debugging issues related to event injection.
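+
+With the annotation, a kvm_entry trace line would look roughly like the
+following (illustrative values):
+
+      kvm_entry: vcpu 0, rip 0xffffffff81000000[immediate exit]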
+
+Suggested-by: Maxim Levitsky <mlevitsk@redhat.com>
+Link: https://lore.kernel.org/r/20240110012705.506918-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 3 ++-
+ arch/x86/kvm/svm/svm.c          | 5 +++--
+ arch/x86/kvm/trace.h            | 9 ++++++---
+ arch/x86/kvm/vmx/vmx.c          | 4 ++--
+ arch/x86/kvm/x86.c              | 2 +-
+ 5 files changed, 14 insertions(+), 9 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 555c7bf35e28..93f523762854 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1528,7 +1528,8 @@ struct kvm_x86_ops {
+       void (*flush_tlb_guest)(struct kvm_vcpu *vcpu);
+       int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
+-      enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu);
++      enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
++                                                bool force_immediate_exit);
+       int (*handle_exit)(struct kvm_vcpu *vcpu,
+               enum exit_fastpath_completion exit_fastpath);
+       int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 2c0f9c7d1242..b4283c2358a6 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4005,12 +4005,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+       guest_state_exit_irqoff();
+ }
+-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
++static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
++                                        bool force_immediate_exit)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+       bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
+-      trace_kvm_entry(vcpu);
++      trace_kvm_entry(vcpu, force_immediate_exit);
+       svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+       svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
+index 6c1dcf44c4fa..ab407bc00d84 100644
+--- a/arch/x86/kvm/trace.h
++++ b/arch/x86/kvm/trace.h
+@@ -15,20 +15,23 @@
+  * Tracepoint for guest mode entry.
+  */
+ TRACE_EVENT(kvm_entry,
+-      TP_PROTO(struct kvm_vcpu *vcpu),
+-      TP_ARGS(vcpu),
++      TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit),
++      TP_ARGS(vcpu, force_immediate_exit),
+       TP_STRUCT__entry(
+               __field(        unsigned int,   vcpu_id         )
+               __field(        unsigned long,  rip             )
++              __field(        bool,           immediate_exit  )
+       ),
+       TP_fast_assign(
+               __entry->vcpu_id        = vcpu->vcpu_id;
+               __entry->rip            = kvm_rip_read(vcpu);
++              __entry->immediate_exit = force_immediate_exit;
+       ),
+-      TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip)
++      TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip,
++                __entry->immediate_exit ? "[immediate exit]" : "")
+ );
+ /*
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 390af16d9a67..0b495979a02b 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7171,7 +7171,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+       guest_state_exit_irqoff();
+ }
+-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
++static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long cr3, cr4;
+@@ -7198,7 +7198,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+               return EXIT_FASTPATH_NONE;
+       }
+-      trace_kvm_entry(vcpu);
++      trace_kvm_entry(vcpu, force_immediate_exit);
+       if (vmx->ple_window_dirty) {
+               vmx->ple_window_dirty = false;
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index d224180c56f5..08c3da88f402 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10856,7 +10856,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+                            (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
+-              exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
++              exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
+               if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+                       break;
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch b/queue-6.1/kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch
new file mode 100644 (file)
index 0000000..7ad4d28
--- /dev/null
@@ -0,0 +1,104 @@
+From 8aadc6631ffd7b08508de7b053eb6e237402d947 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:47 -0700
+Subject: KVM: x86: Plumb in the vCPU to kvm_x86_ops.hwapic_isr_update()
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 76bce9f10162cd4b36ac0b7889649b22baf70ebd ]
+
+Pass the target vCPU to the hwapic_isr_update() vendor hook so that VMX
+can defer the update until after nested VM-Exit if an EOI for L1's vAPIC
+occurs while L2 is active.
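+
+The hook signature simply grows a vCPU argument (sketch, matching the
+change below):
+
+      void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
+      ...
+      static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, vec);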
+
+Note, commit d39850f57d21 ("KVM: x86: Drop @vcpu parameter from
+kvm_x86_ops.hwapic_isr_update()") removed the parameter with the
+justification that doing so "allows for a decent amount of (future)
+cleanup in the APIC code", but it's not at all clear what cleanup was
+intended, or if it was ever realized.
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Reviewed-by: Chao Gao <chao.gao@intel.com>
+Tested-by: Chao Gao <chao.gao@intel.com>
+Link: https://lore.kernel.org/r/20241128000010.4051275-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: account for lack of kvm_x86_call(), drop vmx/x86_ops.h change]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 2 +-
+ arch/x86/kvm/lapic.c            | 8 ++++----
+ arch/x86/kvm/vmx/vmx.c          | 2 +-
+ 3 files changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 17b4e61a52b9..6db42ee82032 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1552,7 +1552,7 @@ struct kvm_x86_ops {
+       bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
+       void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
+       void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
+-      void (*hwapic_isr_update)(int isr);
++      void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
+       bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
+       void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+       void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
+index 42eec987ac3d..3d65d6a023c9 100644
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -587,7 +587,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
+        * just set SVI.
+        */
+       if (unlikely(apic->apicv_active))
+-              static_call_cond(kvm_x86_hwapic_isr_update)(vec);
++              static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, vec);
+       else {
+               ++apic->isr_count;
+               BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+@@ -632,7 +632,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
+        * and must be left alone.
+        */
+       if (unlikely(apic->apicv_active))
+-              static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
++              static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic));
+       else {
+               --apic->isr_count;
+               BUG_ON(apic->isr_count < 0);
+@@ -2554,7 +2554,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
+       if (apic->apicv_active) {
+               static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+               static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1);
+-              static_call_cond(kvm_x86_hwapic_isr_update)(-1);
++              static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1);
+       }
+       vcpu->arch.apic_arb_prio = 0;
+@@ -2847,7 +2847,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+       if (apic->apicv_active) {
+               static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+               static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
+-              static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
++              static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
+       }
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
+       if (ioapic_in_kernel(vcpu->kvm))
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 9a5cb896229f..721ba6ddb121 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6708,7 +6708,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+       put_page(page);
+ }
+-static void vmx_hwapic_isr_update(int max_isr)
++static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+ {
+       u16 status;
+       u8 old;
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-x86-pmu-gate-all-unimplemented-msr-prints-on-rep.patch b/queue-6.1/kvm-x86-pmu-gate-all-unimplemented-msr-prints-on-rep.patch
new file mode 100644 (file)
index 0000000..9d8f871
--- /dev/null
@@ -0,0 +1,222 @@
+From 7a29b546168ea7252a909633d7860beb1599191b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:52 -0700
+Subject: KVM: x86/pmu: Gate all "unimplemented MSR" prints on
+ report_ignored_msrs
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit e76ae52747a82a548742107b4100e90da41a624d ]
+
+Add helpers to print unimplemented MSR accesses and condition all such
+prints on report_ignored_msrs, i.e. honor userspace's request to not
+print unimplemented MSRs.  Even though vcpu_unimpl() is ratelimited,
+printing can still be problematic, e.g. if a print gets stalled when host
+userspace is writing MSRs during live migration, an effective stall can
+result in very noticeable disruption in the guest.
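+
+The helpers boil down to a report_ignored_msrs check in one place
+(sketch of the WRMSR side, as added below):
+
+      static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+      {
+              if (report_ignored_msrs)
+                      vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data);
+      }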
+
+E.g. the profile below was taken while calling KVM_SET_MSRS on the PMU
+counters while the PMU was disabled in KVM.
+
+  -   99.75%     0.00%  [.] __ioctl
+   - __ioctl
+      - 99.74% entry_SYSCALL_64_after_hwframe
+           do_syscall_64
+           sys_ioctl
+         - do_vfs_ioctl
+            - 92.48% kvm_vcpu_ioctl
+               - kvm_arch_vcpu_ioctl
+                  - 85.12% kvm_set_msr_ignored_check
+                       svm_set_msr
+                       kvm_set_msr_common
+                       printk
+                       vprintk_func
+                       vprintk_default
+                       vprintk_emit
+                       console_unlock
+                       call_console_drivers
+                       univ8250_console_write
+                       serial8250_console_write
+                       uart_console_write
+
+Reported-by: Aaron Lewis <aaronlewis@google.com>
+Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Link: https://lore.kernel.org/r/20230124234905.3774678-3-seanjc@google.com
+Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/hyperv.c  | 10 ++++------
+ arch/x86/kvm/svm/svm.c |  5 ++---
+ arch/x86/kvm/vmx/vmx.c |  4 +---
+ arch/x86/kvm/x86.c     | 18 +++++-------------
+ arch/x86/kvm/x86.h     | 12 ++++++++++++
+ 5 files changed, 24 insertions(+), 25 deletions(-)
+
+diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
+index 28555bbd52e8..cb0a531e13c5 100644
+--- a/arch/x86/kvm/hyperv.c
++++ b/arch/x86/kvm/hyperv.c
+@@ -1406,8 +1406,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
+       case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+               return syndbg_set_msr(vcpu, msr, data, host);
+       default:
+-              vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
+-                          msr, data);
++              kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+               return 1;
+       }
+       return 0;
+@@ -1528,8 +1527,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+                       return 1;
+               break;
+       default:
+-              vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
+-                          msr, data);
++              kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+               return 1;
+       }
+@@ -1581,7 +1579,7 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+       case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+               return syndbg_get_msr(vcpu, msr, pdata, host);
+       default:
+-              vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
++              kvm_pr_unimpl_rdmsr(vcpu, msr);
+               return 1;
+       }
+@@ -1646,7 +1644,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+               data = APIC_BUS_FREQUENCY;
+               break;
+       default:
+-              vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
++              kvm_pr_unimpl_rdmsr(vcpu, msr);
+               return 1;
+       }
+       *pdata = data;
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index b922f31d1415..2c0f9c7d1242 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -3035,8 +3035,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+               break;
+       case MSR_IA32_DEBUGCTLMSR:
+               if (!lbrv) {
+-                      vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
+-                                  __func__, data);
++                      kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
+                       break;
+               }
+@@ -3077,7 +3076,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+       case MSR_VM_CR:
+               return svm_set_vm_cr(vcpu, data);
+       case MSR_VM_IGNNE:
+-              vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
++              kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
+               break;
+       case MSR_AMD64_DE_CFG: {
+               struct kvm_msr_entry msr_entry;
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index c24da2cff208..390af16d9a67 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2140,9 +2140,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+               invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+               if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
+-                      if (report_ignored_msrs)
+-                              vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
+-                                          __func__, data);
++                      kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
+                       data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+                       invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+               }
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index b0ae61ba9b99..d224180c56f5 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -3573,7 +3573,6 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
+ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ {
+-      bool pr = false;
+       u32 msr = msr_info->index;
+       u64 data = msr_info->data;
+@@ -3625,15 +3624,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+               if (data == BIT_ULL(18)) {
+                       vcpu->arch.msr_hwcr = data;
+               } else if (data != 0) {
+-                      vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
+-                                  data);
++                      kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+                       return 1;
+               }
+               break;
+       case MSR_FAM10H_MMIO_CONF_BASE:
+               if (data != 0) {
+-                      vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
+-                                  "0x%llx\n", data);
++                      kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+                       return 1;
+               }
+               break;
+@@ -3813,16 +3810,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+       case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+       case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+-              pr = true;
+-              fallthrough;
+       case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+       case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+               if (kvm_pmu_is_valid_msr(vcpu, msr))
+                       return kvm_pmu_set_msr(vcpu, msr_info);
+-              if (pr || data != 0)
+-                      vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
+-                                  "0x%x data 0x%llx\n", msr, data);
++              if (data)
++                      kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+               break;
+       case MSR_K7_CLK_CTL:
+               /*
+@@ -3849,9 +3843,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+               /* Drop writes to this legacy MSR -- see rdmsr
+                * counterpart for further detail.
+                */
+-              if (report_ignored_msrs)
+-                      vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
+-                              msr, data);
++              kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+               break;
+       case MSR_AMD64_OSVW_ID_LENGTH:
+               if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
+index 9de72586f406..f3554bf05201 100644
+--- a/arch/x86/kvm/x86.h
++++ b/arch/x86/kvm/x86.h
+@@ -331,6 +331,18 @@ extern bool report_ignored_msrs;
+ extern bool eager_page_split;
++static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
++{
++      if (report_ignored_msrs)
++              vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data);
++}
++
++static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr)
++{
++      if (report_ignored_msrs)
++              vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr);
++}
++
+ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
+ {
+       return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-x86-re-split-x2apic-icr-into-icr-icr2-for-amd-x2.patch b/queue-6.1/kvm-x86-re-split-x2apic-icr-into-icr-icr2-for-amd-x2.patch
new file mode 100644 (file)
index 0000000..fa413c4
--- /dev/null
@@ -0,0 +1,162 @@
+From c53c4c4220e372f9a392cb4dd337b2ddd5b5596a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:46 -0700
+Subject: KVM: x86: Re-split x2APIC ICR into ICR+ICR2 for AMD (x2AVIC)
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 73b42dc69be8564d4951a14d00f827929fe5ef79 ]
+
+Re-introduce the "split" x2APIC ICR storage that KVM used prior to Intel's
+IPI virtualization support, but only for AMD.  While not stated anywhere
+in the APM, despite stating the ICR is a single 64-bit register, AMD CPUs
+store the 64-bit ICR as two separate 32-bit values in ICR and ICR2.  When
+IPI virtualization (IPIv on Intel, all AVIC flavors on AMD) is enabled,
+KVM needs to match CPU behavior as some ICR ICR writes will be handled by
+the CPU, not by KVM.
+
+Add a kvm_x86_ops knob to control the underlying format used by the CPU to
+store the x2APIC ICR, and tune it to AMD vs. Intel regardless of whether
+or not x2AVIC is enabled.  If KVM is handling all ICR writes, the storage
+format for x2APIC mode doesn't matter, and having the behavior follow AMD
+versus Intel will provide better test coverage and ease debugging.
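+
+For example, reading the ICR then keys off the new knob (sketch of the
+helper added below):
+
+      if (kvm_x86_ops.x2apic_icr_is_split)
+              return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
+                     (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
+      return kvm_lapic_get_reg64(apic, APIC_ICR);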
+
+Fixes: 4d1d7942e36a ("KVM: SVM: Introduce logic to (de)activate x2AVIC mode")
+Cc: stable@vger.kernel.org
+Cc: Maxim Levitsky <mlevitsk@redhat.com>
+Cc: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
+Link: https://lore.kernel.org/r/20240719235107.3023592-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve minor syntactic conflicts]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h |  2 ++
+ arch/x86/kvm/lapic.c            | 42 +++++++++++++++++++++++----------
+ arch/x86/kvm/svm/svm.c          |  2 ++
+ arch/x86/kvm/vmx/vmx.c          |  2 ++
+ 4 files changed, 36 insertions(+), 12 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index eb06c2f68314..17b4e61a52b9 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1547,6 +1547,8 @@ struct kvm_x86_ops {
+       void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+       void (*enable_irq_window)(struct kvm_vcpu *vcpu);
+       void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
++
++      const bool x2apic_icr_is_split;
+       bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
+       void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
+       void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
+diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
+index 7f57dce5c828..42eec987ac3d 100644
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -2315,11 +2315,25 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
+       data &= ~APIC_ICR_BUSY;
+       kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+-      kvm_lapic_set_reg64(apic, APIC_ICR, data);
++      if (kvm_x86_ops.x2apic_icr_is_split) {
++              kvm_lapic_set_reg(apic, APIC_ICR, data);
++              kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
++      } else {
++              kvm_lapic_set_reg64(apic, APIC_ICR, data);
++      }
+       trace_kvm_apic_write(APIC_ICR, data);
+       return 0;
+ }
++static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
++{
++      if (kvm_x86_ops.x2apic_icr_is_split)
++              return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
++                     (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
++
++      return kvm_lapic_get_reg64(apic, APIC_ICR);
++}
++
+ /* emulate APIC access in a trap manner */
+ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
+ {
+@@ -2337,7 +2351,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
+        * maybe-unecessary write, and both are in the noise anyways.
+        */
+       if (apic_x2apic_mode(apic) && offset == APIC_ICR)
+-              WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR)));
++              WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
+       else
+               kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
+ }
+@@ -2760,18 +2774,22 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
+               /*
+                * In x2APIC mode, the LDR is fixed and based on the id.  And
+-               * ICR is internally a single 64-bit register, but needs to be
+-               * split to ICR+ICR2 in userspace for backwards compatibility.
++               * if the ICR is _not_ split, ICR is internally a single 64-bit
++               * register, but needs to be split to ICR+ICR2 in userspace for
++               * backwards compatibility.
+                */
+-              if (set) {
++              if (set)
+                       *ldr = kvm_apic_calc_x2apic_ldr(*id);
+-                      icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
+-                            (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
+-                      __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+-              } else {
+-                      icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
+-                      __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
++              if (!kvm_x86_ops.x2apic_icr_is_split) {
++                      if (set) {
++                              icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
++                                    (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
++                              __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
++                      } else {
++                              icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
++                              __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
++                      }
+               }
+       }
+@@ -2971,7 +2989,7 @@ static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
+       u32 low;
+       if (reg == APIC_ICR) {
+-              *data = kvm_lapic_get_reg64(apic, APIC_ICR);
++              *data = kvm_x2apic_icr_read(apic);
+               return 0;
+       }
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index c95a84afc35f..b922f31d1415 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4851,6 +4851,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
+       .enable_nmi_window = svm_enable_nmi_window,
+       .enable_irq_window = svm_enable_irq_window,
+       .update_cr8_intercept = svm_update_cr8_intercept,
++
++      .x2apic_icr_is_split = true,
+       .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
+       .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
+       .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index fbe26b88f731..9a5cb896229f 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -8202,6 +8202,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+       .enable_nmi_window = vmx_enable_nmi_window,
+       .enable_irq_window = vmx_enable_irq_window,
+       .update_cr8_intercept = vmx_update_cr8_intercept,
++
++      .x2apic_icr_is_split = false,
+       .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
+       .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
+       .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch b/queue-6.1/kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch
new file mode 100644 (file)
index 0000000..f700208
--- /dev/null
@@ -0,0 +1,48 @@
+From ac35d395216d2db6535082fde4a62a3ee3849d40 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:51 -0700
+Subject: KVM: x86: Snapshot the host's DEBUGCTL after disabling IRQs
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 189ecdb3e112da703ac0699f4ec76aa78122f911 ]
+
+Snapshot the host's DEBUGCTL after disabling IRQs, as perf can toggle
+debugctl bits from IRQ context, e.g. when enabling/disabling events via
+smp_call_function_single().  Taking the snapshot (long) before IRQs are
+disabled could result in KVM effectively clobbering DEBUGCTL due to using
+a stale snapshot.
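+
+In other words, the snapshot moves into the IRQs-disabled section of
+vcpu_enter_guest(), just before entering the guest (sketch):
+
+      /* after IRQs are disabled, just before entering the guest */
+      vcpu->arch.host_debugctl = get_debugctlmsr();
+      guest_timing_enter_irqoff();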
+
+Cc: stable@vger.kernel.org
+Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@amd.com>
+Link: https://lore.kernel.org/r/20250227222411.3490595-6-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/x86.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index ba24bb50af57..b0ae61ba9b99 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -4742,7 +4742,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+       /* Save host pkru register if supported */
+       vcpu->arch.host_pkru = read_pkru();
+-      vcpu->arch.host_debugctl = get_debugctlmsr();
+       /* Apply any externally detected TSC adjustments (due to suspend) */
+       if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+@@ -10851,6 +10850,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               set_debugreg(0, 7);
+       }
++      vcpu->arch.host_debugctl = get_debugctlmsr();
++
+       guest_timing_enter_irqoff();
+       for (;;) {
+-- 
+2.50.1
+
diff --git a/queue-6.1/kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch b/queue-6.1/kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch
new file mode 100644 (file)
index 0000000..94ab9ea
--- /dev/null
@@ -0,0 +1,100 @@
+From 52e78074c894adecdf2fb1d987959707ce46beed Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:50 -0700
+Subject: KVM: x86: Snapshot the host's DEBUGCTL in common x86
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit fb71c795935652fa20eaf9517ca9547f5af99a76 ]
+
+Move KVM's snapshot of DEBUGCTL to kvm_vcpu_arch and take the snapshot in
+common x86, so that SVM can also use the snapshot.
+
+Opportunistically change the field to a u64.  While bits 63:32 are reserved
+on AMD, not mentioned at all in Intel's SDM, and managed as an "unsigned
+long" by the kernel, DEBUGCTL is an MSR and therefore a 64-bit value.
+
+Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
+Cc: stable@vger.kernel.org
+Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@amd.com>
+Link: https://lore.kernel.org/r/20250227222411.3490595-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve minor syntactic conflict in vmx_vcpu_load()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 1 +
+ arch/x86/kvm/vmx/vmx.c          | 8 ++------
+ arch/x86/kvm/vmx/vmx.h          | 2 --
+ arch/x86/kvm/x86.c              | 1 +
+ 4 files changed, 4 insertions(+), 8 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 6db42ee82032..555c7bf35e28 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -677,6 +677,7 @@ struct kvm_vcpu_arch {
+       u32 pkru;
+       u32 hflags;
+       u64 efer;
++      u64 host_debugctl;
+       u64 apic_base;
+       struct kvm_lapic *apic;    /* kernel irqchip context */
+       bool load_eoi_exitmap_pending;
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 7b87fbc69b21..c24da2cff208 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -1418,13 +1418,9 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
+  */
+ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+ {
+-      struct vcpu_vmx *vmx = to_vmx(vcpu);
+-
+       vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
+       vmx_vcpu_pi_load(vcpu, cpu);
+-
+-      vmx->host_debugctlmsr = get_debugctlmsr();
+ }
+ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+@@ -7275,8 +7271,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+       }
+       /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+-      if (vmx->host_debugctlmsr)
+-              update_debugctlmsr(vmx->host_debugctlmsr);
++      if (vcpu->arch.host_debugctl)
++              update_debugctlmsr(vcpu->arch.host_debugctl);
+ #ifndef CONFIG_X86_64
+       /*
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 8b4b149bd9c1..357819872d80 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -352,8 +352,6 @@ struct vcpu_vmx {
+       /* apic deadline value in host tsc */
+       u64 hv_deadline_tsc;
+-      unsigned long host_debugctlmsr;
+-
+       /*
+        * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+        * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index a6dc8f662fa4..ba24bb50af57 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -4742,6 +4742,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+       /* Save host pkru register if supported */
+       vcpu->arch.host_pkru = read_pkru();
++      vcpu->arch.host_debugctl = get_debugctlmsr();
+       /* Apply any externally detected TSC adjustments (due to suspend) */
+       if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+-- 
+2.50.1
+
diff --git a/queue-6.1/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-6.1/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
new file mode 100644 (file)
index 0000000..95b6bda
--- /dev/null
@@ -0,0 +1,129 @@
+From 70d909202444ad2c328a4944d265dc9ad7efe92a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+      if (res < 0) {
+                nf_conntrack_get(&ct->ct_general); // HERE
+                cb->args[1] = (unsigned long)ct;
+                ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, then the refcount of ct was already incremented.
+This second increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+prevents cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
+
+This can be reproduced by running the conntrack_resize.sh selftest in a loop.
+On a preemptible kernel it takes ~20 minutes on average before I see a
+runaway kworker spinning in nf_conntrack_cleanup_net_list.
+
+One fix would be to change this to:
+        if (res < 0) {
+               if (ct != last)
+                       nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
+A followup patch will do the same for ctnetlink_exp_dump_table;
+it looks to me as if it has the same problem, and like
+ctnetlink_dump_table it only needs a 'skip hint', not the actual
+object, so the same cookie strategy can be applied there as well.
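+
+A condensed sketch of the cookie approach (mirroring the hunks below,
+not a separate implementation): the dump stores a non-zero conntrack id
+instead of a pointer, and resumption only compares ids, so no reference
+is ever taken:
+
+      /* never returns 0, so a stored cookie is always "set" */
+      static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
+      {
+              unsigned long id = nf_ct_get_id(ct);
+
+              return id ? id : 1;
+      }
+
+      /* resume: skip entries until the remembered id is found again */
+      if (cb->args[1]) {
+              if (ctnetlink_get_id(ct) != last_id)
+                      continue;
+              cb->args[1] = 0;
+      }
+
+      /* partial dump: remember where to resume, no refcount taken */
+      cb->args[1] = ctnetlink_get_id(ct);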
+
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 2cf58a8b8e4d..d3e28574ceb9 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -859,8 +859,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+-      if (cb->args[1])
+-              nf_ct_put((struct nf_conn *)cb->args[1]);
+       kfree(cb->data);
+       return 0;
+ }
+@@ -1175,19 +1173,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+       return 0;
+ }
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++      unsigned long id = nf_ct_get_id(ct);
++
++      return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+       unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
+       struct net *net = sock_net(skb->sk);
+-      struct nf_conn *ct, *last;
++      unsigned long last_id = cb->args[1];
+       struct nf_conntrack_tuple_hash *h;
+       struct hlist_nulls_node *n;
+       struct nf_conn *nf_ct_evict[8];
++      struct nf_conn *ct;
+       int res, i;
+       spinlock_t *lockp;
+-      last = (struct nf_conn *)cb->args[1];
+       i = 0;
+       local_bh_disable();
+@@ -1224,7 +1229,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                               continue;
+                       if (cb->args[1]) {
+-                              if (ct != last)
++                              if (ctnetlink_get_id(ct) != last_id)
+                                       continue;
+                               cb->args[1] = 0;
+                       }
+@@ -1237,8 +1242,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                                           NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+                                           ct, true, flags);
+                       if (res < 0) {
+-                              nf_conntrack_get(&ct->ct_general);
+-                              cb->args[1] = (unsigned long)ct;
++                              cb->args[1] = ctnetlink_get_id(ct);
+                               spin_unlock(lockp);
+                               goto out;
+                       }
+@@ -1251,12 +1255,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+       }
+ out:
+       local_bh_enable();
+-      if (last) {
++      if (last_id) {
+               /* nf ct hash resize happened, now clear the leftover. */
+-              if ((struct nf_conn *)cb->args[1] == last)
++              if (cb->args[1] == last_id)
+                       cb->args[1] = 0;
+-
+-              nf_ct_put(last);
+       }
+       while (i) {
+-- 
+2.50.1
+
index 3df0a6ae83ab43177c042f68e89f10934f44f9bb..c1c590bc0335f8aedb09d7613df30631b01ca804 100644
@@ -24,3 +24,25 @@ eventpoll-fix-semi-unbounded-recursion.patch
 documentation-acpi-fix-parent-device-references.patch
 acpi-processor-perflib-fix-initial-_ppc-limit-application.patch
 acpi-processor-perflib-move-problematic-pr-performance-check.patch
+kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch
+kvm-x86-re-split-x2apic-icr-into-icr-icr2-for-amd-x2.patch
+kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch
+kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch
+kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch
+kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch
+kvm-x86-pmu-gate-all-unimplemented-msr-prints-on-rep.patch
+kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch
+kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch
+kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch
+kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch
+kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch
+kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch
+kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch
+kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch
+kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch
+kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch
+kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch
+kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch
+kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
diff --git a/queue-6.1/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-6.1/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
new file mode 100644 (file)
index 0000000..bdf07f4
--- /dev/null
@@ -0,0 +1,51 @@
+From 21b9dfb2ec919b8b4561d84dd45c0ee4799c62d0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, so that we can't rely on offloads because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
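+
+Condensed from the one-line change below, the evaluation becomes:
+
+      /* IPsec is in play if the dst still carries an xfrm bundle *or*
+       * a secpath is present; the latter survives the dst drop on the
+       * GSO segmentation path described above.
+       */
+      need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) ||
+                   skb_sec_path(skb);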
+
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index 1a51c4b44c00..593108049ab7 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -60,7 +60,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+       remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+       skb->remcsum_offload = remcsum;
+-      need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++      need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+       /* Try to offload checksum if possible */
+       offload_csum = !!(need_csum &&
+                         !need_ipsec &&
+-- 
+2.50.1
+
diff --git a/queue-6.12/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch b/queue-6.12/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
new file mode 100644 (file)
index 0000000..246baaa
--- /dev/null
@@ -0,0 +1,91 @@
+From a05287bd1654c451e1eb7b9e28de5ef9f1b9d901 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Aug 2025 17:03:11 +0200
+Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data
+
+From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ]
+
+Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid
+discarding useful information") caused the number of wakeup interrupts
+to increase on an idle system [1], which was not expected to happen
+after merely allowing shallower idle states to be selected by the
+governor in some cases.
+
+However, on the system in question, all of the idle states deeper than
+WFI are rejected by the driver due to a firmware issue [2].  This causes
+the governor to only consider the recent interval duration data
+corresponding to successful attempts to enter WFI, so the recent
+intervals table is filled with values lower than the scheduler tick
+period.  Consequently, the governor predicts an idle duration below the
+scheduler tick period length and avoids stopping the tick more often,
+which leads to the observed symptom.
+
+Address it by modifying the governor to update the recent intervals
+table also when entering the previously selected idle state fails, so
+it knows that the short idle intervals might have been the minority
+had the selected idle states actually been entered every time.
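+
+A minimal sketch of the resulting flow (names as in the hunk below):
+
+      static void menu_update_intervals(struct menu_device *data,
+                                        unsigned int interval_us)
+      {
+              data->intervals[data->interval_ptr++] = interval_us;
+              if (data->interval_ptr >= INTERVALS)
+                      data->interval_ptr = 0;
+      }
+
+      /* in menu_select(): a rejected state shows up as zero residency,
+       * so record "effectively unbounded" instead of recording nothing
+       */
+      if (!dev->last_residency_ns)
+              menu_update_intervals(data, UINT_MAX);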
+
+Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information")
+Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1]
+Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2]
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Tested-by: Christian Loehle <christian.loehle@arm.com>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Christian Loehle <christian.loehle@arm.com>
+Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
+index 97ffadc7e57a..01322a905414 100644
+--- a/drivers/cpuidle/governors/menu.c
++++ b/drivers/cpuidle/governors/menu.c
+@@ -153,6 +153,14 @@ static inline int performance_multiplier(unsigned int nr_iowaiters)
+ static DEFINE_PER_CPU(struct menu_device, menu_devices);
++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us)
++{
++      /* Update the repeating-pattern data. */
++      data->intervals[data->interval_ptr++] = interval_us;
++      if (data->interval_ptr >= INTERVALS)
++              data->interval_ptr = 0;
++}
++
+ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
+ /*
+@@ -277,6 +285,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+       if (data->needs_update) {
+               menu_update(drv, dev);
+               data->needs_update = 0;
++      } else if (!dev->last_residency_ns) {
++              /*
++               * This happens when the driver rejects the previously selected
++               * idle state and returns an error, so update the recent
++               * intervals table to prevent invalid information from being
++               * used going forward.
++               */
++              menu_update_intervals(data, UINT_MAX);
+       }
+       nr_iowaiters = nr_iowait_cpu(dev->cpu);
+@@ -546,10 +562,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+       data->correction_factor[data->bucket] = new_factor;
+-      /* update the repeating-pattern data */
+-      data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
+-      if (data->interval_ptr >= INTERVALS)
+-              data->interval_ptr = 0;
++      menu_update_intervals(data, ktime_to_us(measured_ns));
+ }
+ /**
+-- 
+2.50.1
+
diff --git a/queue-6.12/habanalabs-fix-uaf-in-export_dmabuf.patch b/queue-6.12/habanalabs-fix-uaf-in-export_dmabuf.patch
new file mode 100644 (file)
index 0000000..852df8f
--- /dev/null
@@ -0,0 +1,96 @@
+From b5a874d6221e42baa1685f2af96f79fd75b92995 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 12 Jul 2025 06:02:31 +0100
+Subject: habanalabs: fix UAF in export_dmabuf()
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+[ Upstream commit 33927f3d0ecdcff06326d6e4edb6166aed42811c ]
+
+As soon as we'd inserted a file reference into the descriptor table, another
+thread could close it.  That's fine for the case when all we are doing is
+returning that descriptor to userland (it's a race, but it's a userland
+race and there's nothing the kernel can do about it).  However, if we
+follow fd_install() with any kind of access to objects that would be
+destroyed on close (be it the struct file itself or anything destroyed
+by its ->release()), we have a UAF.
+
+dma_buf_fd() is a combination of reserving a descriptor and fd_install().
+habanalabs export_dmabuf() calls it and then proceeds to access the
+objects destroyed on close.  In particular, it grabs an extra reference to
+another struct file that will be dropped as part of ->release() for ours;
+that "will be" is actually "might have already been".
+
+Fix that by reserving the descriptor before anything else and doing
+fd_install() only when everything has been set up.  As a side benefit,
+we no longer have the failure exit with the file already created but the
+reference to the underlying file (as well as ->dmabuf_export_cnt, etc.)
+not yet grabbed; unlike dma_buf_fd(), fd_install() can't fail.
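+
+The safe ordering is the usual "reserve the fd, set everything up,
+publish last" pattern.  A generic sketch of that pattern (the code below
+uses the scoped CLASS(get_unused_fd) helper instead, and
+create_and_export_dmabuf() here is only a placeholder):
+
+      int fd = get_unused_fd_flags(flags);
+      if (fd < 0)
+              return fd;
+
+      file = create_and_export_dmabuf();        /* any failure ...      */
+      if (IS_ERR(file)) {
+              put_unused_fd(fd);                /* ... can still unwind */
+              return PTR_ERR(file);
+      }
+
+      fd_install(fd, file);     /* point of no return: fd is now live */
+      return fd;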
+
+Fixes: db1a8dd916aa ("habanalabs: add support for dma-buf exporter")
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/accel/habanalabs/common/memory.c | 23 +++++++----------------
+ 1 file changed, 7 insertions(+), 16 deletions(-)
+
+diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c
+index 3348ad12c237..11c55fd76db5 100644
+--- a/drivers/accel/habanalabs/common/memory.c
++++ b/drivers/accel/habanalabs/common/memory.c
+@@ -1829,9 +1829,6 @@ static void hl_release_dmabuf(struct dma_buf *dmabuf)
+       struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv;
+       struct hl_ctx *ctx;
+-      if (!hl_dmabuf)
+-              return;
+-
+       ctx = hl_dmabuf->ctx;
+       if (hl_dmabuf->memhash_hnode)
+@@ -1859,7 +1856,12 @@ static int export_dmabuf(struct hl_ctx *ctx,
+ {
+       DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+       struct hl_device *hdev = ctx->hdev;
+-      int rc, fd;
++      CLASS(get_unused_fd, fd)(flags);
++
++      if (fd < 0) {
++              dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
++              return fd;
++      }
+       exp_info.ops = &habanalabs_dmabuf_ops;
+       exp_info.size = total_size;
+@@ -1872,13 +1874,6 @@ static int export_dmabuf(struct hl_ctx *ctx,
+               return PTR_ERR(hl_dmabuf->dmabuf);
+       }
+-      fd = dma_buf_fd(hl_dmabuf->dmabuf, flags);
+-      if (fd < 0) {
+-              dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
+-              rc = fd;
+-              goto err_dma_buf_put;
+-      }
+-
+       hl_dmabuf->ctx = ctx;
+       hl_ctx_get(hl_dmabuf->ctx);
+       atomic_inc(&ctx->hdev->dmabuf_export_cnt);
+@@ -1890,13 +1885,9 @@ static int export_dmabuf(struct hl_ctx *ctx,
+       get_file(ctx->hpriv->file_priv->filp);
+       *dmabuf_fd = fd;
++      fd_install(take_fd(fd), hl_dmabuf->dmabuf->file);
+       return 0;
+-
+-err_dma_buf_put:
+-      hl_dmabuf->dmabuf->priv = NULL;
+-      dma_buf_put(hl_dmabuf->dmabuf);
+-      return rc;
+ }
+ static int validate_export_params_common(struct hl_device *hdev, u64 addr, u64 size, u64 offset)
+-- 
+2.50.1
+
diff --git a/queue-6.12/intel_idle-allow-loading-acpi-tables-for-any-family.patch b/queue-6.12/intel_idle-allow-loading-acpi-tables-for-any-family.patch
new file mode 100644 (file)
index 0000000..4cc26ea
--- /dev/null
@@ -0,0 +1,41 @@
+From b77cd82a6114ef64d2a6bf354fbd8a7e91c721fd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 15:37:14 -0400
+Subject: intel_idle: Allow loading ACPI tables for any family
+
+From: Len Brown <len.brown@intel.com>
+
+[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ]
+
+There is no reason to limit intel_idle's loading of ACPI tables to
+family 6.  Upcoming Intel processors are not in family 6.
+
+Below "Fixes" really means "applies cleanly until".
+That syntax commit didn't change the previous logic,
+but shows this patch applies back 5-years.
+
+Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros")
+Signed-off-by: Len Brown <len.brown@intel.com>
+Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/idle/intel_idle.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
+index 524ed143f875..4506e1cc4b65 100644
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -1608,7 +1608,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
+ };
+ static const struct x86_cpu_id intel_mwait_ids[] __initconst = {
+-      X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL),
++      X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL),
+       {}
+ };
+-- 
+2.50.1
+
diff --git a/queue-6.12/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch b/queue-6.12/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch
new file mode 100644 (file)
index 0000000..34ddf0c
--- /dev/null
@@ -0,0 +1,117 @@
+From c61650533f1bd7068592df158f48962bfcd8bd98 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:57:23 -0700
+Subject: KVM: nVMX: Check vmcs12->guest_ia32_debugctl on nested VM-Enter
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 095686e6fcb4150f0a55b1a25987fad3d8af58d6 ]
+
+Add a consistency check for L2's guest_ia32_debugctl, as KVM only supports
+a subset of hardware functionality, i.e. KVM can't rely on hardware to
+detect illegal/unsupported values.  Failure to check the vmcs12 value
+would allow the guest to load any hardware-supported value while running L2.
+
+Take care to exempt BTF and LBR from the validity check in order to match
+KVM's behavior for writes via WRMSR, but without clobbering vmcs12.  Even
+if VM_EXIT_SAVE_DEBUG_CONTROLS is set in vmcs12, L1 can reasonably expect
+that vmcs12->guest_ia32_debugctl will not be modified if writes to the MSR
+are being intercepted.
+
+Arguably, KVM _should_ update vmcs12 if VM_EXIT_SAVE_DEBUG_CONTROLS is set
+*and* writes to MSR_IA32_DEBUGCTLMSR are not being intercepted by L1, but
+that would incur non-trivial complexity and wouldn't change the fact that
+KVM's handling of DEBUGCTL is blatantly broken.  I.e. the extra complexity
+is not worth carrying.
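+
+Condensed from the hunks below, nested VM-Enter now both rejects invalid
+values and loads only KVM-supported bits into vmcs02:
+
+      /* consistency check at nested VM-Enter */
+      if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
+          CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))
+              return -EINVAL;
+
+      /* prepare_vmcs02(): mask the value down to what KVM supports */
+      vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
+                                        vmx_get_supported_debugctl(vcpu, false));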
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Co-developed-by: Sean Christopherson <seanjc@google.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-7-seanjc@google.com
+Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/nested.c | 12 ++++++++++--
+ arch/x86/kvm/vmx/vmx.c    |  5 ++---
+ arch/x86/kvm/vmx/vmx.h    |  3 +++
+ 3 files changed, 15 insertions(+), 5 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index 903e874041ac..1e0b9f92ff18 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2653,7 +2653,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+       if (vmx->nested.nested_run_pending &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+               kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+-              vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
++              vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
++                                                vmx_get_supported_debugctl(vcpu, false));
+       } else {
+               kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+               vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
+@@ -3135,7 +3136,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
+               return -EINVAL;
+       if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
+-          CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
++          (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
++           CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
+               return -EINVAL;
+       if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
+@@ -4576,6 +4578,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+               (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
+               (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
++      /*
++       * Note!  Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
++       * Writes to DEBUGCTL that aren't intercepted by L1 are immediately
++       * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
++       * vmcs02 doesn't strictly track vmcs12.
++       */
+       if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
+               vmcs12->guest_dr7 = vcpu->arch.dr7;
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index ff61093e9af7..50d45c18fce9 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2173,7 +2173,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
+       return (unsigned long)data;
+ }
+-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+ {
+       u64 debugctl = 0;
+@@ -2192,8 +2192,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+       return debugctl;
+ }
+-static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data,
+-                                bool host_initiated)
++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+ {
+       u64 invalid;
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index cf57fbf12104..ee330d14089d 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -435,6 +435,9 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
+ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
++
+ /*
+  * Note, early Intel manuals have the write-low and read-high bitmap offsets
+  * the wrong way round.  The bitmaps control MSRs 0x00000000-0x00001fff and
+-- 
+2.50.1
+
diff --git a/queue-6.12/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch b/queue-6.12/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch
new file mode 100644 (file)
index 0000000..9117d3e
--- /dev/null
@@ -0,0 +1,63 @@
+From 442fe2ed58d95e8ffd4c75c29b7f1d884bce1d02 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:57:21 -0700
+Subject: KVM: VMX: Allow guest to set DEBUGCTL.RTM_DEBUG if RTM is supported
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 17ec2f965344ee3fd6620bef7ef68792f4ac3af0 ]
+
+Let the guest set DEBUGCTL.RTM_DEBUG if RTM is supported according to the
+guest CPUID model, as debug support is supposed to be available if RTM is
+supported, and there are no known downsides to letting the guest debug RTM
+aborts.
+
+Note, there are no known bug reports related to RTM_DEBUG, the primary
+motivation is to reduce the probability of breaking existing guests when a
+future change adds a missing consistency check on vmcs12.GUEST_DEBUGCTL
+(KVM currently lets L2 run with whatever hardware supports; whoops).
+
+Note #2, KVM already emulates DR6.RTM, and doesn't restrict access to
+DR7.RTM.
+
+Fixes: 83c529151ab0 ("KVM: x86: expose Intel cpu new features (HLE, RTM) to guest")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-5-seanjc@google.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/msr-index.h | 1 +
+ arch/x86/kvm/vmx/vmx.c           | 4 ++++
+ 2 files changed, 5 insertions(+)
+
+diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
+index 7ebe76f69417..2b6e3127ef4e 100644
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -417,6 +417,7 @@
+ #define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI     (1UL << 12)
+ #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
+ #define DEBUGCTLMSR_FREEZE_IN_SMM     (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
++#define DEBUGCTLMSR_RTM_DEBUG         BIT(15)
+ #define MSR_PEBS_FRONTEND             0x000003f7
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index b9c7940feac6..529a10bba056 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2185,6 +2185,10 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+           (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
+               debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
++      if (boot_cpu_has(X86_FEATURE_RTM) &&
++          (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_RTM)))
++              debugctl |= DEBUGCTLMSR_RTM_DEBUG;
++
+       return debugctl;
+ }
+-- 
+2.50.1
+
diff --git a/queue-6.12/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch b/queue-6.12/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch
new file mode 100644 (file)
index 0000000..031bb76
--- /dev/null
@@ -0,0 +1,90 @@
+From 9169769cd413b64e64d5b12b3b21446c9d1340a5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:57:22 -0700
+Subject: KVM: VMX: Extract checking of guest's DEBUGCTL into helper
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 8a4351ac302cd8c19729ba2636acfd0467c22ae8 ]
+
+Move VMX's logic to check DEBUGCTL values into a standalone helper so that
+the code can be used by nested VM-Enter to apply the same logic to the
+value being loaded from vmcs12.
+
+KVM needs to explicitly check vmcs12->guest_ia32_debugctl on nested
+VM-Enter, as hardware may support features that KVM does not, i.e. relying
+on hardware to detect invalid guest state will result in false negatives.
+Unfortunately, that means applying KVM's funky suppression of BTF and LBR
+to vmcs12 so as not to break existing guests.
+
+No functional change intended.
+
+Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-6-seanjc@google.com
+Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 29 +++++++++++++++++------------
+ 1 file changed, 17 insertions(+), 12 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 529a10bba056..ff61093e9af7 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2192,6 +2192,19 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+       return debugctl;
+ }
++static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data,
++                                bool host_initiated)
++{
++      u64 invalid;
++
++      invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
++      if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
++              kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
++              invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
++      }
++      return !invalid;
++}
++
+ /*
+  * Writes msr value into the appropriate "register".
+  * Returns 0 on success, non-0 otherwise.
+@@ -2260,19 +2273,12 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+               }
+               vmcs_writel(GUEST_SYSENTER_ESP, data);
+               break;
+-      case MSR_IA32_DEBUGCTLMSR: {
+-              u64 invalid;
+-
+-              invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+-              if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
+-                      kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
+-                      data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+-                      invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+-              }
+-
+-              if (invalid)
++      case MSR_IA32_DEBUGCTLMSR:
++              if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
+                       return 1;
++              data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
++
+               if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
+                                               VM_EXIT_SAVE_DEBUG_CONTROLS)
+                       get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+@@ -2282,7 +2288,6 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+                   (data & DEBUGCTLMSR_LBR))
+                       intel_pmu_create_guest_lbr_event(vcpu);
+               return 0;
+-      }
+       case MSR_IA32_BNDCFGS:
+               if (!kvm_mpx_supported() ||
+                   (!msr_info->host_initiated &&
+-- 
+2.50.1
+
diff --git a/queue-6.12/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch b/queue-6.12/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch
new file mode 100644 (file)
index 0000000..ac1e857
--- /dev/null
@@ -0,0 +1,196 @@
+From 064fd232cf9cd7db42a4842d7bec28e315b2ac1b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:57:25 -0700
+Subject: KVM: VMX: Preserve host's DEBUGCTLMSR_FREEZE_IN_SMM while running the
+ guest
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 6b1dd26544d045f6a79e8c73572c0c0db3ef3c1a ]
+
+Set/clear DEBUGCTLMSR_FREEZE_IN_SMM in GUEST_IA32_DEBUGCTL based on the
+host's pre-VM-Enter value, i.e. preserve the host's FREEZE_IN_SMM setting
+while running the guest.  When running with the "default treatment of SMIs"
+in effect (the only mode KVM supports), SMIs do not generate a VM-Exit that
+is visible to host (non-SMM) software, and the CPU instead transitions directly
+from VMX non-root to SMM.  And critically, DEBUGCTL isn't context switched
+by hardware on SMI or RSM, i.e. SMM will run with whatever value was
+resident in hardware at the time of the SMI.
+
+Failure to preserve FREEZE_IN_SMM results in the PMU unexpectedly counting
+events while the CPU is executing in SMM, which can pollute profiling and
+potentially leak information into the guest.
+
+Check for changes in FREEZE_IN_SMM prior to every entry into KVM's inner
+run loop, as the bit can be toggled in IRQ context via IPI callback (SMP
+function call), by way of /sys/devices/cpu/freeze_on_smi.
+
+Add a field in kvm_x86_ops to communicate which DEBUGCTL bits need to be
+preserved, as FREEZE_IN_SMM is only supported and defined for Intel CPUs,
+i.e. explicitly checking FREEZE_IN_SMM in common x86 is at best weird, and
+at worst could lead to undesirable behavior in the future if AMD CPUs ever
+happened to pick up a collision with the bit.
+
+Exempt TDX vCPUs, i.e. protected guests, from the check, as the TDX Module
+owns and controls GUEST_IA32_DEBUGCTL.
+
+WARN in SVM if KVM_RUN_LOAD_DEBUGCTL is set, mostly to document that the
+lack of handling isn't a KVM bug (TDX already WARNs on any run_flag).
+
+Lastly, explicitly reload GUEST_IA32_DEBUGCTL on a VM-Fail that is missed
+by KVM but detected by hardware, i.e. in nested_vmx_restore_host_state().
+Doing so avoids the need to track host_debugctl on a per-VMCS basis, as
+GUEST_IA32_DEBUGCTL is unconditionally written by prepare_vmcs02() and
+load_vmcs12_host_state().  For the VM-Fail case, even though KVM won't
+have actually entered the guest, vcpu_enter_guest() will have run with
+vmcs02 active and thus could result in vmcs01 being run with a stale value.
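+
+The net effect on the VMX accessors (see the vmx.h hunk below) is that
+the host-owned bit is merged in on every guest DEBUGCTL write and
+stripped on every read, so it is preserved for the host without ever
+becoming visible to the guest:
+
+      /* write: merge in the host's current FREEZE_IN_SMM setting */
+      val |= vcpu->arch.host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM;
+      vmcs_write64(GUEST_IA32_DEBUGCTL, val);
+
+      /* read: hide the host-owned bit from the guest-visible value */
+      return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~DEBUGCTLMSR_FREEZE_IN_SMM;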
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Co-developed-by: Sean Christopherson <seanjc@google.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-9-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve syntactic conflict in vt_x86_ops definition]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h |  7 +++++++
+ arch/x86/kvm/vmx/main.c         |  2 ++
+ arch/x86/kvm/vmx/nested.c       |  3 +++
+ arch/x86/kvm/vmx/vmx.c          |  3 +++
+ arch/x86/kvm/vmx/vmx.h          | 15 ++++++++++++++-
+ arch/x86/kvm/x86.c              | 14 ++++++++++++--
+ 6 files changed, 41 insertions(+), 3 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 2ed05925d9d5..d27df86aa62c 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1630,6 +1630,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+ enum kvm_x86_run_flags {
+       KVM_RUN_FORCE_IMMEDIATE_EXIT    = BIT(0),
+       KVM_RUN_LOAD_GUEST_DR6          = BIT(1),
++      KVM_RUN_LOAD_DEBUGCTL           = BIT(2),
+ };
+ struct kvm_x86_ops {
+@@ -1659,6 +1660,12 @@ struct kvm_x86_ops {
+       void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+       void (*vcpu_put)(struct kvm_vcpu *vcpu);
++      /*
++       * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to
++       * match the host's value even while the guest is active.
++       */
++      const u64 HOST_OWNED_DEBUGCTL;
++
+       void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
+       int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+       int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
+index 7668e2fb8043..3f83e36a657b 100644
+--- a/arch/x86/kvm/vmx/main.c
++++ b/arch/x86/kvm/vmx/main.c
+@@ -42,6 +42,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
+       .vcpu_load = vmx_vcpu_load,
+       .vcpu_put = vmx_vcpu_put,
++      .HOST_OWNED_DEBUGCTL = DEBUGCTLMSR_FREEZE_IN_SMM,
++
+       .update_exception_bitmap = vmx_update_exception_bitmap,
+       .get_feature_msr = vmx_get_feature_msr,
+       .get_msr = vmx_get_msr,
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index 9a336f661fc6..60bd2791d933 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -4829,6 +4829,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+                       WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+       }
++      /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
++      vmx_reload_guest_debugctl(vcpu);
++
+       /*
+        * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+        * handle a variety of side effects to KVM's software model.
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 4bb25519e7ce..6c185a260c5b 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7407,6 +7407,9 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+       if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+               set_debugreg(vcpu->arch.dr6, 6);
++      if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
++              vmx_reload_guest_debugctl(vcpu);
++
+       /*
+        * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
+        * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 5b2c5cb5e32e..a7e2de50d27f 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -440,12 +440,25 @@ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+ static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
+ {
++      WARN_ON_ONCE(val & DEBUGCTLMSR_FREEZE_IN_SMM);
++
++      val |= vcpu->arch.host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM;
+       vmcs_write64(GUEST_IA32_DEBUGCTL, val);
+ }
+ static inline u64 vmx_guest_debugctl_read(void)
+ {
+-      return vmcs_read64(GUEST_IA32_DEBUGCTL);
++      return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~DEBUGCTLMSR_FREEZE_IN_SMM;
++}
++
++static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu)
++{
++      u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL);
++
++      if (!((val ^ vcpu->arch.host_debugctl) & DEBUGCTLMSR_FREEZE_IN_SMM))
++              return;
++
++      vmx_guest_debugctl_write(vcpu, val & ~DEBUGCTLMSR_FREEZE_IN_SMM);
+ }
+ /*
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 7beea8fb6ea6..dbd295ef3eba 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10711,7 +10711,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               dm_request_for_irq_injection(vcpu) &&
+               kvm_cpu_accept_dm_intr(vcpu);
+       fastpath_t exit_fastpath;
+-      u64 run_flags;
++      u64 run_flags, debug_ctl;
+       bool req_immediate_exit = false;
+@@ -10982,7 +10982,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               set_debugreg(DR7_FIXED_1, 7);
+       }
+-      vcpu->arch.host_debugctl = get_debugctlmsr();
++      /*
++       * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL
++       * can be modified in IRQ context, e.g. via SMP function calls.  Inform
++       * vendor code if any host-owned bits were changed, e.g. so that the
++       * value loaded into hardware while running the guest can be updated.
++       */
++      debug_ctl = get_debugctlmsr();
++      if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
++          !vcpu->arch.guest_state_protected)
++              run_flags |= KVM_RUN_LOAD_DEBUGCTL;
++      vcpu->arch.host_debugctl = debug_ctl;
+       guest_timing_enter_irqoff();
+-- 
+2.50.1
+
diff --git a/queue-6.12/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch b/queue-6.12/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch
new file mode 100644 (file)
index 0000000..6767b6d
--- /dev/null
@@ -0,0 +1,162 @@
+From 23fe0561dff1a54e2d0cadace8e98dc9775bd0b3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:57:24 -0700
+Subject: KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 7d0cce6cbe71af6e9c1831bff101a2b9c249c4a2 ]
+
+Introduce vmx_guest_debugctl_{read,write}() to handle all accesses to
+vmcs.GUEST_IA32_DEBUGCTL. This will allow stuffing FREEZE_IN_SMM into
+GUEST_IA32_DEBUGCTL based on the host setting without bleeding the state
+into the guest, and without needing to copy+paste the FREEZE_IN_SMM
+logic into every path that accesses GUEST_IA32_DEBUGCTL.
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+[sean: massage changelog, make inline, use in all prepare_vmcs02() cases]
+Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-8-seanjc@google.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/nested.c    | 10 +++++-----
+ arch/x86/kvm/vmx/pmu_intel.c |  8 ++++----
+ arch/x86/kvm/vmx/vmx.c       |  8 +++++---
+ arch/x86/kvm/vmx/vmx.h       | 10 ++++++++++
+ 4 files changed, 24 insertions(+), 12 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index 1e0b9f92ff18..9a336f661fc6 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2653,11 +2653,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+       if (vmx->nested.nested_run_pending &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+               kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+-              vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
+-                                                vmx_get_supported_debugctl(vcpu, false));
++              vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
++                                             vmx_get_supported_debugctl(vcpu, false));
+       } else {
+               kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+-              vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
++              vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
+       }
+       if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
+           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+@@ -3527,7 +3527,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+       if (!vmx->nested.nested_run_pending ||
+           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+-              vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
++              vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
+       if (kvm_mpx_supported() &&
+           (!vmx->nested.nested_run_pending ||
+            !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+@@ -4774,7 +4774,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
+       kvm_set_dr(vcpu, 7, 0x400);
+-      vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
++      vmx_guest_debugctl_write(vcpu, 0);
+       if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+                               vmcs12->vm_exit_msr_load_count))
+diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
+index 9c9d4a336166..a5edc623166a 100644
+--- a/arch/x86/kvm/vmx/pmu_intel.c
++++ b/arch/x86/kvm/vmx/pmu_intel.c
+@@ -605,11 +605,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
+  */
+ static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
+ {
+-      u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
++      u64 data = vmx_guest_debugctl_read();
+       if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
+               data &= ~DEBUGCTLMSR_LBR;
+-              vmcs_write64(GUEST_IA32_DEBUGCTL, data);
++              vmx_guest_debugctl_write(vcpu, data);
+       }
+ }
+@@ -679,7 +679,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+       if (!lbr_desc->event) {
+               vmx_disable_lbr_msrs_passthrough(vcpu);
+-              if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
++              if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)
+                       goto warn;
+               if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
+                       goto warn;
+@@ -701,7 +701,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
+ {
+-      if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
++      if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR))
+               intel_pmu_release_guest_lbr_event(vcpu);
+ }
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 50d45c18fce9..4bb25519e7ce 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2148,7 +2148,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+                       msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
+               break;
+       case MSR_IA32_DEBUGCTLMSR:
+-              msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
++              msr_info->data = vmx_guest_debugctl_read();
+               break;
+       default:
+       find_uret_msr:
+@@ -2282,7 +2282,8 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+                                               VM_EXIT_SAVE_DEBUG_CONTROLS)
+                       get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+-              vmcs_write64(GUEST_IA32_DEBUGCTL, data);
++              vmx_guest_debugctl_write(vcpu, data);
++
+               if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
+                   (data & DEBUGCTLMSR_LBR))
+                       intel_pmu_create_guest_lbr_event(vcpu);
+@@ -4831,7 +4832,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
+       vmcs_write32(GUEST_SYSENTER_CS, 0);
+       vmcs_writel(GUEST_SYSENTER_ESP, 0);
+       vmcs_writel(GUEST_SYSENTER_EIP, 0);
+-      vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
++
++      vmx_guest_debugctl_write(&vmx->vcpu, 0);
+       if (cpu_has_vmx_tpr_shadow()) {
+               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index ee330d14089d..5b2c5cb5e32e 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -438,6 +438,16 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+ u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
+ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
++static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
++{
++      vmcs_write64(GUEST_IA32_DEBUGCTL, val);
++}
++
++static inline u64 vmx_guest_debugctl_read(void)
++{
++      return vmcs_read64(GUEST_IA32_DEBUGCTL);
++}
++
+ /*
+  * Note, early Intel manuals have the write-low and read-high bitmap offsets
+  * the wrong way round.  The bitmaps control MSRs 0x00000000-0x00001fff and
+-- 
+2.50.1
+
diff --git a/queue-6.12/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch b/queue-6.12/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch
new file mode 100644 (file)
index 0000000..a83f1b7
--- /dev/null
@@ -0,0 +1,153 @@
+From d43a98921ac0ceecd8840b7a5d4dc24377a1c4d9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:57:19 -0700
+Subject: KVM: x86: Convert vcpu_run()'s immediate exit param into a generic
+ bitmap
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 2478b1b220c49d25cb1c3f061ec4f9b351d9a131 ]
+
+Convert kvm_x86_ops.vcpu_run()'s "force_immediate_exit" boolean parameter
+into an a generic bitmap so that similar "take action" information can be
+passed to vendor code without creating a pile of boolean parameters.
+
+This will allow dropping kvm_x86_ops.set_dr6() in favor of a new flag, and
+will also allow for adding similar functionality for re-loading debugctl
+in the active VMCS.
+
+Opportunistically massage the TDX WARN and comment to prepare for adding
+more run_flags, all of which are expected to be mutually exclusive with
+TDX, i.e. should be WARNed on.
+
+No functional change intended.
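+
+In short (condensed from the hunks below), the boolean turns into a flag
+in a small bitmap that the common code builds up before each entry:
+
+      enum kvm_x86_run_flags {
+              KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
+      };
+
+      run_flags = 0;
+      if (req_immediate_exit) {
+              run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
+              kvm_make_request(KVM_REQ_EVENT, vcpu);
+      }
+      ...
+      exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, run_flags);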
+
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: drop TDX changes]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h |  6 +++++-
+ arch/x86/kvm/svm/svm.c          |  4 ++--
+ arch/x86/kvm/vmx/vmx.c          |  3 ++-
+ arch/x86/kvm/vmx/x86_ops.h      |  2 +-
+ arch/x86/kvm/x86.c              | 11 ++++++++---
+ 5 files changed, 18 insertions(+), 8 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 0caa3293f6db..cccc8cbe72db 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1627,6 +1627,10 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+       return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
+ }
++enum kvm_x86_run_flags {
++      KVM_RUN_FORCE_IMMEDIATE_EXIT    = BIT(0),
++};
++
+ struct kvm_x86_ops {
+       const char *name;
+@@ -1706,7 +1710,7 @@ struct kvm_x86_ops {
+       int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
+       enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
+-                                                bool force_immediate_exit);
++                                                u64 run_flags);
+       int (*handle_exit)(struct kvm_vcpu *vcpu,
+               enum exit_fastpath_completion exit_fastpath);
+       int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 1f42a71b15c0..7d1b871cfc02 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4226,9 +4226,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+       guest_state_exit_irqoff();
+ }
+-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+-                                        bool force_immediate_exit)
++static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ {
++      bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 9a4ebf3dfbfc..2a977cdfcd0c 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7353,8 +7353,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+       guest_state_exit_irqoff();
+ }
+-fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
++fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ {
++      bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long cr3, cr4;
+diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
+index 4aba200f435d..5e4ce13ab305 100644
+--- a/arch/x86/kvm/vmx/x86_ops.h
++++ b/arch/x86/kvm/vmx/x86_ops.h
+@@ -21,7 +21,7 @@ void vmx_vm_destroy(struct kvm *kvm);
+ int vmx_vcpu_precreate(struct kvm *kvm);
+ int vmx_vcpu_create(struct kvm_vcpu *vcpu);
+ int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu);
+-fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
++fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags);
+ void vmx_vcpu_free(struct kvm_vcpu *vcpu);
+ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 213af0fda768..44ab46f2a2d2 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10711,6 +10711,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               dm_request_for_irq_injection(vcpu) &&
+               kvm_cpu_accept_dm_intr(vcpu);
+       fastpath_t exit_fastpath;
++      u64 run_flags;
+       bool req_immediate_exit = false;
+@@ -10955,8 +10956,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               goto cancel_injection;
+       }
+-      if (req_immediate_exit)
++      run_flags = 0;
++      if (req_immediate_exit) {
++              run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
++      }
+       fpregs_assert_state_consistent();
+       if (test_thread_flag(TIF_NEED_FPU_LOAD))
+@@ -10992,8 +10996,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+                            (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
+-              exit_fastpath = kvm_x86_call(vcpu_run)(vcpu,
+-                                                     req_immediate_exit);
++              exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, run_flags);
+               if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+                       break;
+@@ -11005,6 +11008,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+                       break;
+               }
++              run_flags = 0;
++
+               /* Note, VM-Exits that go down the "slow" path are accounted below. */
+               ++vcpu->stat.exits;
+       }
+-- 
+2.50.1
+
diff --git a/queue-6.12/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch b/queue-6.12/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch
new file mode 100644 (file)
index 0000000..3a76505
--- /dev/null
@@ -0,0 +1,149 @@
+From 60ac0019cd78125bddc4cc6b46b022c333b534cb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:57:20 -0700
+Subject: KVM: x86: Drop kvm_x86_ops.set_dr6() in favor of a new KVM_RUN flag
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 80c64c7afea1da6a93ebe88d3d29d8a60377ef80 ]
+
+Instruct vendor code to load the guest's DR6 into hardware via a new
+KVM_RUN flag, and remove kvm_x86_ops.set_dr6(), whose sole purpose was to
+load vcpu->arch.dr6 into hardware when DR6 can be read/written directly
+by the guest.
+
+Note, TDX already WARNs on any run_flag being set, i.e. will yell if KVM
+thinks DR6 needs to be reloaded.  TDX vCPUs force KVM_DEBUGREG_AUTO_SWITCH
+and never clear the flag, i.e. should never observe KVM_RUN_LOAD_GUEST_DR6.
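+
+Condensed from the SVM hunk below, the DR6 load is now driven purely by
+the new flag rather than by a dedicated kvm_x86_ops hook:
+
+      if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
+              svm_set_dr6(vcpu, vcpu->arch.dr6);
+      else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+              svm_set_dr6(vcpu, DR6_ACTIVE_LOW);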
+
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: drop TDX changes]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm-x86-ops.h |  1 -
+ arch/x86/include/asm/kvm_host.h    |  2 +-
+ arch/x86/kvm/svm/svm.c             | 10 ++++++----
+ arch/x86/kvm/vmx/main.c            |  1 -
+ arch/x86/kvm/vmx/vmx.c             |  9 +++------
+ arch/x86/kvm/x86.c                 |  2 +-
+ 6 files changed, 11 insertions(+), 14 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
+index cfb22f8c451a..861d080ed4c6 100644
+--- a/arch/x86/include/asm/kvm-x86-ops.h
++++ b/arch/x86/include/asm/kvm-x86-ops.h
+@@ -47,7 +47,6 @@ KVM_X86_OP(set_idt)
+ KVM_X86_OP(get_gdt)
+ KVM_X86_OP(set_gdt)
+ KVM_X86_OP(sync_dirty_debug_regs)
+-KVM_X86_OP(set_dr6)
+ KVM_X86_OP(set_dr7)
+ KVM_X86_OP(cache_reg)
+ KVM_X86_OP(get_rflags)
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index cccc8cbe72db..2ed05925d9d5 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1629,6 +1629,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+ enum kvm_x86_run_flags {
+       KVM_RUN_FORCE_IMMEDIATE_EXIT    = BIT(0),
++      KVM_RUN_LOAD_GUEST_DR6          = BIT(1),
+ };
+ struct kvm_x86_ops {
+@@ -1679,7 +1680,6 @@ struct kvm_x86_ops {
+       void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+       void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+       void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
+-      void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
+       void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
+       void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+       unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 7d1b871cfc02..800f781475c0 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4270,10 +4270,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+       svm_hv_update_vp_id(svm->vmcb, vcpu);
+       /*
+-       * Run with all-zero DR6 unless needed, so that we can get the exact cause
+-       * of a #DB.
++       * Run with all-zero DR6 unless the guest can write DR6 freely, so that
++       * KVM can get the exact cause of a #DB.  Note, loading guest DR6 from
++       * KVM's snapshot is only necessary when DR accesses won't exit.
+        */
+-      if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
++      if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
++              svm_set_dr6(vcpu, vcpu->arch.dr6);
++      else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+               svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
+       clgi();
+@@ -5084,7 +5087,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
+       .set_idt = svm_set_idt,
+       .get_gdt = svm_get_gdt,
+       .set_gdt = svm_set_gdt,
+-      .set_dr6 = svm_set_dr6,
+       .set_dr7 = svm_set_dr7,
+       .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
+       .cache_reg = svm_cache_reg,
+diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
+index 47476fcc179a..7668e2fb8043 100644
+--- a/arch/x86/kvm/vmx/main.c
++++ b/arch/x86/kvm/vmx/main.c
+@@ -60,7 +60,6 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
+       .set_idt = vmx_set_idt,
+       .get_gdt = vmx_get_gdt,
+       .set_gdt = vmx_set_gdt,
+-      .set_dr6 = vmx_set_dr6,
+       .set_dr7 = vmx_set_dr7,
+       .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
+       .cache_reg = vmx_cache_reg,
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 2a977cdfcd0c..b9c7940feac6 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -5630,12 +5630,6 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+       set_debugreg(DR6_RESERVED, 6);
+ }
+-void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+-{
+-      lockdep_assert_irqs_disabled();
+-      set_debugreg(vcpu->arch.dr6, 6);
+-}
+-
+ void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+ {
+       vmcs_writel(GUEST_DR7, val);
+@@ -7400,6 +7394,9 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+               vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+       vcpu->arch.regs_dirty = 0;
++      if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
++              set_debugreg(vcpu->arch.dr6, 6);
++
+       /*
+        * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
+        * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 44ab46f2a2d2..7beea8fb6ea6 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10977,7 +10977,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               set_debugreg(vcpu->arch.eff_db[3], 3);
+               /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+               if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+-                      kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6);
++                      run_flags |= KVM_RUN_LOAD_GUEST_DR6;
+       } else if (unlikely(hw_breakpoint_active())) {
+               set_debugreg(DR7_FIXED_1, 7);
+       }
+-- 
+2.50.1
+
diff --git a/queue-6.12/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch b/queue-6.12/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch
new file mode 100644 (file)
index 0000000..4bdc787
--- /dev/null
@@ -0,0 +1,78 @@
+From e14e8193de61d485369bd36f87d887c94c48751d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 24 Jul 2025 17:09:56 +0800
+Subject: mm/smaps: fix race between smaps_hugetlb_range and migration
+
+From: Jinjiang Tu <tujinjiang@huawei.com>
+
+[ Upstream commit 45d19b4b6c2d422771c29b83462d84afcbb33f01 ]
+
+smaps_hugetlb_range() handles the pte without holding the ptl, and may run
+concurrently with migration, leading to a BUG_ON in pfn_swap_entry_to_page().
+The race is as follows.
+
+smaps_hugetlb_range              migrate_pages
+  huge_ptep_get
+                                   remove_migration_ptes
+                                  folio_unlock
+  pfn_swap_entry_folio
+    BUG_ON
+
+To fix it, hold ptl lock in smaps_hugetlb_range().
+
+Link: https://lkml.kernel.org/r/20250724090958.455887-1-tujinjiang@huawei.com
+Link: https://lkml.kernel.org/r/20250724090958.455887-2-tujinjiang@huawei.com
+Fixes: 25ee01a2fca0 ("mm: hugetlb: proc: add hugetlb-related fields to /proc/PID/smaps")
+Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Andrei Vagin <avagin@gmail.com>
+Cc: Andrii Nakryiko <andrii@kernel.org>
+Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
+Cc: Brahmajit Das <brahmajit.xyz@gmail.com>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Dev Jain <dev.jain@arm.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Joern Engel <joern@logfs.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/proc/task_mmu.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
+index 72a58681f031..2257bf52fb2a 100644
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -1007,10 +1007,13 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+ {
+       struct mem_size_stats *mss = walk->private;
+       struct vm_area_struct *vma = walk->vma;
+-      pte_t ptent = huge_ptep_get(walk->mm, addr, pte);
+       struct folio *folio = NULL;
+       bool present = false;
++      spinlock_t *ptl;
++      pte_t ptent;
++      ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
++      ptent = huge_ptep_get(walk->mm, addr, pte);
+       if (pte_present(ptent)) {
+               folio = page_folio(pte_page(ptent));
+               present = true;
+@@ -1029,6 +1032,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+               else
+                       mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+       }
++      spin_unlock(ptl);
+       return 0;
+ }
+ #else
+-- 
+2.50.1
+
diff --git a/queue-6.12/net-kcm-fix-race-condition-in-kcm_unattach.patch b/queue-6.12/net-kcm-fix-race-condition-in-kcm_unattach.patch
new file mode 100644 (file)
index 0000000..c8a4938
--- /dev/null
@@ -0,0 +1,88 @@
+From 6ece36736d8033ce02a676412c51e99271b4ef6a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Aug 2025 21:18:03 +0200
+Subject: net: kcm: Fix race condition in kcm_unattach()
+
+From: Sven Stegemann <sven@stegemann.de>
+
+[ Upstream commit 52565a935213cd6a8662ddb8efe5b4219343a25d ]
+
+syzbot found a race condition when kcm_unattach(psock)
+and kcm_release(kcm) are executed at the same time.
+
+kcm_unattach() is missing a check of the flag
+kcm->tx_stopped before calling queue_work().
+
+If the kcm has a reserved psock, kcm_unattach() might get executed
+between cancel_work_sync() and unreserve_psock() in kcm_release(),
+requeuing kcm->tx_work right before kcm gets freed in kcm_done().
+
+Remove kcm->tx_stopped and replace it by the less
+error-prone disable_work_sync().
+
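+As a rough sketch of why disable_work_sync() is the less error-prone choice
+here (illustrative only, not the exact driver code): once a work item is
+disabled, later queueing attempts are rejected instead of racing with the
+teardown:
+
+      /* release path (sketch) */
+      disable_work_sync(&kcm->tx_work);   /* cancel and block re-queueing */
+
+      /* writespace callback (sketch) */
+      queue_work(kcm_wq, &kcm->tx_work);  /* no-op while the work is disabled */
+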
+Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module")
+Reported-by: syzbot+e62c9db591c30e174662@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e62c9db591c30e174662
+Reported-by: syzbot+d199b52665b6c3069b94@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=d199b52665b6c3069b94
+Reported-by: syzbot+be6b1fdfeae512726b4e@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=be6b1fdfeae512726b4e
+Signed-off-by: Sven Stegemann <sven@stegemann.de>
+Link: https://patch.msgid.link/20250812191810.27777-1-sven@stegemann.de
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/kcm.h |  1 -
+ net/kcm/kcmsock.c | 10 ++--------
+ 2 files changed, 2 insertions(+), 9 deletions(-)
+
+diff --git a/include/net/kcm.h b/include/net/kcm.h
+index 441e993be634..d9c35e71ecea 100644
+--- a/include/net/kcm.h
++++ b/include/net/kcm.h
+@@ -71,7 +71,6 @@ struct kcm_sock {
+       struct list_head wait_psock_list;
+       struct sk_buff *seq_skb;
+       struct mutex tx_mutex;
+-      u32 tx_stopped : 1;
+       /* Don't use bit fields here, these are set under different locks */
+       bool tx_wait;
+diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
+index d4118c796290..1d37b26ea2ef 100644
+--- a/net/kcm/kcmsock.c
++++ b/net/kcm/kcmsock.c
+@@ -429,7 +429,7 @@ static void psock_write_space(struct sock *sk)
+       /* Check if the socket is reserved so someone is waiting for sending. */
+       kcm = psock->tx_kcm;
+-      if (kcm && !unlikely(kcm->tx_stopped))
++      if (kcm)
+               queue_work(kcm_wq, &kcm->tx_work);
+       spin_unlock_bh(&mux->lock);
+@@ -1696,12 +1696,6 @@ static int kcm_release(struct socket *sock)
+        */
+       __skb_queue_purge(&sk->sk_write_queue);
+-      /* Set tx_stopped. This is checked when psock is bound to a kcm and we
+-       * get a writespace callback. This prevents further work being queued
+-       * from the callback (unbinding the psock occurs after canceling work.
+-       */
+-      kcm->tx_stopped = 1;
+-
+       release_sock(sk);
+       spin_lock_bh(&mux->lock);
+@@ -1717,7 +1711,7 @@ static int kcm_release(struct socket *sock)
+       /* Cancel work. After this point there should be no outside references
+        * to the kcm socket.
+        */
+-      cancel_work_sync(&kcm->tx_work);
++      disable_work_sync(&kcm->tx_work);
+       lock_sock(sk);
+       psock = kcm->tx_psock;
+-- 
+2.50.1
+
diff --git a/queue-6.12/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch b/queue-6.12/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch
new file mode 100644 (file)
index 0000000..697eaa7
--- /dev/null
@@ -0,0 +1,44 @@
+From 2f916039451174e3ab687b9a37e3c5231e2ed92a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Aug 2025 07:23:18 -0700
+Subject: net: ti: icss-iep: Fix incorrect type for return value in
+ extts_enable()
+
+From: Alok Tiwari <alok.a.tiwari@oracle.com>
+
+[ Upstream commit 5f1d1d14db7dabce9c815e7d7cd351f8d58b8585 ]
+
+The variable ret in icss_iep_extts_enable() was incorrectly declared
+as u32, while the function returns int and may return negative error
+codes. This will cause sign extension issues and incorrect error
+propagation. Update ret to be int to fix error handling.
+
+This change corrects the declaration to avoid potential type mismatch.
+
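+As an illustration of the problem (not code from the driver itself), an
+unsigned local defeats the usual negative-error checks:
+
+      u32 ret = -EINVAL;      /* stored as 0xffffffea */
+
+      if (ret < 0)            /* never true: ret is unsigned */
+              return ret;     /* the error path is silently skipped */
+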
+Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver")
+Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Link: https://patch.msgid.link/20250805142323.1949406-1-alok.a.tiwari@oracle.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/ti/icssg/icss_iep.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c
+index 50bfbc2779e4..d8c9fe1d98c4 100644
+--- a/drivers/net/ethernet/ti/icssg/icss_iep.c
++++ b/drivers/net/ethernet/ti/icssg/icss_iep.c
+@@ -621,7 +621,8 @@ static int icss_iep_pps_enable(struct icss_iep *iep, int on)
+ static int icss_iep_extts_enable(struct icss_iep *iep, u32 index, int on)
+ {
+-      u32 val, cap, ret = 0;
++      u32 val, cap;
++      int ret = 0;
+       mutex_lock(&iep->ptp_clk_mutex);
+-- 
+2.50.1
+
diff --git a/queue-6.12/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch b/queue-6.12/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch
new file mode 100644 (file)
index 0000000..3f1c6d7
--- /dev/null
@@ -0,0 +1,56 @@
+From ff2cbb791d9045e359020bf8dcdb70db907b394d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Aug 2025 23:08:12 +0530
+Subject: net: ti: icssg-prueth: Fix emac link speed handling
+
+From: MD Danish Anwar <danishanwar@ti.com>
+
+[ Upstream commit 06feac15406f4f66f4c0c6ea60b10d44775d4133 ]
+
+When link settings are changed, emac->speed is populated by
+emac_adjust_link(). The link speed and other settings are then written into
+the DRAM. However, if both ports are brought down after this and brought up
+again, or if the operating mode is changed and a firmware reload is needed,
+the DRAM is cleared by icssg_config(). As a result, the link settings are
+lost.
+
+Fix this by calling emac_adjust_link() after icssg_config(). This
+re-populates the settings in the DRAM after a new firmware load.
+
+Fixes: 9facce84f406 ("net: ti: icssg-prueth: Fix firmware load sequence.")
+Signed-off-by: MD Danish Anwar <danishanwar@ti.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Message-ID: <20250805173812.2183161-1-danishanwar@ti.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/ti/icssg/icssg_prueth.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
+index 0769e1ade30b..ddbc4624ae88 100644
+--- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c
++++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
+@@ -50,6 +50,8 @@
+ /* CTRLMMR_ICSSG_RGMII_CTRL register bits */
+ #define ICSSG_CTRL_RGMII_ID_MODE                BIT(24)
++static void emac_adjust_link(struct net_device *ndev);
++
+ static int emac_get_tx_ts(struct prueth_emac *emac,
+                         struct emac_tx_ts_response *rsp)
+ {
+@@ -266,6 +268,10 @@ static int prueth_emac_common_start(struct prueth *prueth)
+               ret = icssg_config(prueth, emac, slice);
+               if (ret)
+                       goto disable_class;
++
++              mutex_lock(&emac->ndev->phydev->lock);
++              emac_adjust_link(emac->ndev);
++              mutex_unlock(&emac->ndev->phydev->lock);
+       }
+       ret = prueth_emac_start(prueth);
+-- 
+2.50.1
+
diff --git a/queue-6.12/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-6.12/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
new file mode 100644 (file)
index 0000000..9c046fd
--- /dev/null
@@ -0,0 +1,129 @@
+From e029781097349b203ded1588deab6713cbf6a350 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+      if (res < 0) {
+                nf_conntrack_get(&ct->ct_general); // HERE
+                cb->args[1] = (unsigned long)ct;
+                ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, then the refcount of ct was already incremented.
+This 2nd increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+prevents cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
+
+This can be reproduced by running conntrack_resize.sh selftest in a loop.
+It takes ~20 minutes for me on a preemptible kernel on average before
+I see a runaway kworker spinning in nf_conntrack_cleanup_net_list.
+
+One fix would be to change this to:
+        if (res < 0) {
+               if (ct != last)
+                       nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
+A followup patch will do the same for ctnetlink_exp_dump_table;
+it looks to me as if it has the same problem and, like
+ctnetlink_dump_table, we only need a 'skip hint', not the actual
+object, so we can apply the same cookie strategy there as well.
+
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 6a1239433830..18a91c031554 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -860,8 +860,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+-      if (cb->args[1])
+-              nf_ct_put((struct nf_conn *)cb->args[1]);
+       kfree(cb->data);
+       return 0;
+ }
+@@ -1184,19 +1182,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+       return 0;
+ }
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++      unsigned long id = nf_ct_get_id(ct);
++
++      return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+       unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
+       struct net *net = sock_net(skb->sk);
+-      struct nf_conn *ct, *last;
++      unsigned long last_id = cb->args[1];
+       struct nf_conntrack_tuple_hash *h;
+       struct hlist_nulls_node *n;
+       struct nf_conn *nf_ct_evict[8];
++      struct nf_conn *ct;
+       int res, i;
+       spinlock_t *lockp;
+-      last = (struct nf_conn *)cb->args[1];
+       i = 0;
+       local_bh_disable();
+@@ -1233,7 +1238,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                               continue;
+                       if (cb->args[1]) {
+-                              if (ct != last)
++                              if (ctnetlink_get_id(ct) != last_id)
+                                       continue;
+                               cb->args[1] = 0;
+                       }
+@@ -1246,8 +1251,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                                           NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+                                           ct, true, flags);
+                       if (res < 0) {
+-                              nf_conntrack_get(&ct->ct_general);
+-                              cb->args[1] = (unsigned long)ct;
++                              cb->args[1] = ctnetlink_get_id(ct);
+                               spin_unlock(lockp);
+                               goto out;
+                       }
+@@ -1260,12 +1264,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+       }
+ out:
+       local_bh_enable();
+-      if (last) {
++      if (last_id) {
+               /* nf ct hash resize happened, now clear the leftover. */
+-              if ((struct nf_conn *)cb->args[1] == last)
++              if (cb->args[1] == last_id)
+                       cb->args[1] = 0;
+-
+-              nf_ct_put(last);
+       }
+       while (i) {
+-- 
+2.50.1
+
diff --git a/queue-6.12/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch b/queue-6.12/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch
new file mode 100644 (file)
index 0000000..56beb5a
--- /dev/null
@@ -0,0 +1,103 @@
+From 311ad70a27210004849b7d07dc87eb8eec7af3b0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 28 Jul 2025 15:26:49 +0900
+Subject: ptp: prevent possible ABBA deadlock in ptp_clock_freerun()
+
+From: Jeongjun Park <aha310510@gmail.com>
+
+[ Upstream commit 2efe41234dbd0a83fdb7cd38226c2f70039a2cd3 ]
+
+syzbot reported the following ABBA deadlock:
+
+       CPU0                           CPU1
+       ----                           ----
+  n_vclocks_store()
+    lock(&ptp->n_vclocks_mux) [1]
+        (physical clock)
+                                     pc_clock_adjtime()
+                                       lock(&clk->rwsem) [2]
+                                        (physical clock)
+                                       ...
+                                       ptp_clock_freerun()
+                                         ptp_vclock_in_use()
+                                           lock(&ptp->n_vclocks_mux) [3]
+                                              (physical clock)
+    ptp_clock_unregister()
+      posix_clock_unregister()
+        lock(&clk->rwsem) [4]
+          (virtual clock)
+
+Since ptp virtual clock is registered only under ptp physical clock, both
+ptp_clock and posix_clock must be physical clocks for ptp_vclock_in_use()
+to lock &ptp->n_vclocks_mux and check ptp->n_vclocks.
+
+However, when unregistering vclocks in n_vclocks_store(), the
+ptp->n_vclocks_mux being locked is a physical clock lock, but the clk->rwsem of
+ptp_clock_unregister() called through device_for_each_child_reverse()
+is a virtual clock lock.
+
+Therefore, clk->rwsem used in CPU0 and clk->rwsem used in CPU1 are
+different locks, but in lockdep, a false positive occurs because the
+possibility of deadlock is determined per lock class.
+
+To solve this, a lock subclass annotation must be added to the posix_clock
+rwsem of the vclock.
+
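+As a generic sketch of that annotation (struct and variable names are
+invented for illustration), giving the nested rwsem its own lockdep
+subclass keeps lockdep from conflating it with the parent's lock of the
+same class:
+
+      enum { LOCK_PHYSICAL = 0, LOCK_VIRTUAL };
+
+      /* done once when the virtual clock is created */
+      lockdep_set_subclass(&vclock->clock.rwsem, LOCK_VIRTUAL);
+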
+Reported-by: syzbot+7cfb66a237c4a5fb22ad@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=7cfb66a237c4a5fb22ad
+Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion")
+Signed-off-by: Jeongjun Park <aha310510@gmail.com>
+Acked-by: Richard Cochran <richardcochran@gmail.com>
+Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Link: https://patch.msgid.link/20250728062649.469882-1-aha310510@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/ptp/ptp_private.h | 5 +++++
+ drivers/ptp/ptp_vclock.c  | 7 +++++++
+ 2 files changed, 12 insertions(+)
+
+diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
+index a6aad743c282..b352df4cd3f9 100644
+--- a/drivers/ptp/ptp_private.h
++++ b/drivers/ptp/ptp_private.h
+@@ -24,6 +24,11 @@
+ #define PTP_DEFAULT_MAX_VCLOCKS 20
+ #define PTP_MAX_CHANNELS 2048
++enum {
++      PTP_LOCK_PHYSICAL = 0,
++      PTP_LOCK_VIRTUAL,
++};
++
+ struct timestamp_event_queue {
+       struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS];
+       int head;
+diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
+index 7febfdcbde8b..8ed4b8598924 100644
+--- a/drivers/ptp/ptp_vclock.c
++++ b/drivers/ptp/ptp_vclock.c
+@@ -154,6 +154,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
+       return PTP_VCLOCK_REFRESH_INTERVAL;
+ }
++static void ptp_vclock_set_subclass(struct ptp_clock *ptp)
++{
++      lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL);
++}
++
+ static const struct ptp_clock_info ptp_vclock_info = {
+       .owner          = THIS_MODULE,
+       .name           = "ptp virtual clock",
+@@ -213,6 +218,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock)
+               return NULL;
+       }
++      ptp_vclock_set_subclass(vclock->clock);
++
+       timecounter_init(&vclock->tc, &vclock->cc, 0);
+       ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL);
+-- 
+2.50.1
+
diff --git a/queue-6.12/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch b/queue-6.12/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
new file mode 100644 (file)
index 0000000..66b427f
--- /dev/null
@@ -0,0 +1,73 @@
+From 7a09b3640b9f599fabc4fa354e9ea99af238d33c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:40:11 -0400
+Subject: sctp: linearize cloned gso packets in sctp_rcv
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ]
+
+A cloned head skb still shares the frag skbs in its fraglist with the
+original head skb. It's not safe to access these frag skbs.
+
+syzbot reported two use-of-uninitialized-memory bugs caused by this:
+
+  BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+   sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+   sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998
+   sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88
+   sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331
+   sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122
+   __release_sock+0x1da/0x330 net/core/sock.c:3106
+   release_sock+0x6b/0x250 net/core/sock.c:3660
+   sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360
+   sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885
+   sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031
+   inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851
+   sock_sendmsg_nosec net/socket.c:718 [inline]
+
+and
+
+  BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+   sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+   sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88
+   sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331
+   sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148
+   __release_sock+0x1d3/0x330 net/core/sock.c:3213
+   release_sock+0x6b/0x270 net/core/sock.c:3767
+   sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367
+   sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886
+   sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032
+   inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851
+   sock_sendmsg_nosec net/socket.c:712 [inline]
+
+This patch fixes it by linearizing cloned gso packets in sctp_rcv().
+
+Fixes: 90017accff61 ("sctp: Add GSO support")
+Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com
+Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sctp/input.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sctp/input.c b/net/sctp/input.c
+index a8a254a5008e..032a10d82302 100644
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -117,7 +117,7 @@ int sctp_rcv(struct sk_buff *skb)
+        * it's better to just linearize it otherwise crc computing
+        * takes longer.
+        */
+-      if ((!is_gso && skb_linearize(skb)) ||
++      if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+           !pskb_may_pull(skb, sizeof(struct sctphdr)))
+               goto discard_it;
+-- 
+2.50.1
+
index 61295a552b47e64e70299a784794ff2dd5c49864..d5f954c3af93b78f969787dfab0dc2309f591ca7 100644 (file)
@@ -40,3 +40,22 @@ acpi-processor-perflib-fix-initial-_ppc-limit-application.patch
 acpi-processor-perflib-move-problematic-pr-performance-check.patch
 block-make-req_op_zone_finish-a-write-operation.patch
 mm-memory-tier-fix-abstract-distance-calculation-overflow.patch
+kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch
+kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch
+kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch
+kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch
+kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch
+kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch
+kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch
+habanalabs-fix-uaf-in-export_dmabuf.patch
+mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
+net-ti-icssg-prueth-fix-emac-link-speed-handling.patch
+net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch
+sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
+intel_idle-allow-loading-acpi-tables-for-any-family.patch
+cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
+ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch
+tls-handle-data-disappearing-from-under-the-tls-ulp.patch
+net-kcm-fix-race-condition-in-kcm_unattach.patch
diff --git a/queue-6.12/tls-handle-data-disappearing-from-under-the-tls-ulp.patch b/queue-6.12/tls-handle-data-disappearing-from-under-the-tls-ulp.patch
new file mode 100644 (file)
index 0000000..326d669
--- /dev/null
@@ -0,0 +1,106 @@
+From 6d4442b6803ab0cdf8929963a5e6113ae219f06e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 16:29:06 -0700
+Subject: tls: handle data disappearing from under the TLS ULP
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 6db015fc4b5d5f63a64a193f65d98da3a7fc811d ]
+
+TLS expects that it owns the receive queue of the TCP socket.
+This cannot be guaranteed in case the reader of the TCP socket
+entered before the TLS ULP was installed, or uses some non-standard
+read API (e.g. zerocopy ones). Replace the WARN_ON() and a buggy
+early exit (which leaves anchor pointing to a freed skb) with real
+error handling. Wipe the parsing state and tell the reader to retry.
+
+We already reload the anchor every time we (re)acquire the socket lock,
+so the only condition we need to avoid is an out of bounds read
+(not having enough bytes in the socket for previously parsed record len).
+
+If some data was read from under TLS but there's enough in the queue,
+we'll reload and decrypt what is most likely not a valid TLS record.
+This leads to some undefined behavior from the TLS perspective (corrupting
+a stream? missing an alert? missing an attack?), but no kernel crash
+should take place.
+
+Reported-by: William Liu <will@willsroot.io>
+Reported-by: Savino Dicanosa <savy@syst3mfailure.io>
+Link: https://lore.kernel.org/tFjq_kf7sWIG3A7CrCg_egb8CVsT_gsmHAK0_wxDPJXfIzxFAMxqmLwp3MlU5EHiet0AwwJldaaFdgyHpeIUCS-3m3llsmRzp9xIOBR4lAI=@syst3mfailure.io
+Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser")
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://patch.msgid.link/20250807232907.600366-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls.h      |  2 +-
+ net/tls/tls_strp.c | 11 ++++++++---
+ net/tls/tls_sw.c   |  3 ++-
+ 3 files changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/net/tls/tls.h b/net/tls/tls.h
+index e5e47452308a..e1eaf12b3742 100644
+--- a/net/tls/tls.h
++++ b/net/tls/tls.h
+@@ -195,7 +195,7 @@ void tls_strp_msg_done(struct tls_strparser *strp);
+ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb);
+ void tls_rx_msg_ready(struct tls_strparser *strp);
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
+ int tls_strp_msg_cow(struct tls_sw_context_rx *ctx);
+ struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx);
+ int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst);
+diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
+index 095cf31bae0b..d71643b494a1 100644
+--- a/net/tls/tls_strp.c
++++ b/net/tls/tls_strp.c
+@@ -475,7 +475,7 @@ static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len)
+       strp->stm.offset = offset;
+ }
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ {
+       struct strp_msg *rxm;
+       struct tls_msg *tlm;
+@@ -484,8 +484,11 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+       DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len);
+       if (!strp->copy_mode && force_refresh) {
+-              if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len))
+-                      return;
++              if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) {
++                      WRITE_ONCE(strp->msg_ready, 0);
++                      memset(&strp->stm, 0, sizeof(strp->stm));
++                      return false;
++              }
+               tls_strp_load_anchor_with_queue(strp, strp->stm.full_len);
+       }
+@@ -495,6 +498,8 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+       rxm->offset     = strp->stm.offset;
+       tlm = tls_msg(strp->anchor);
+       tlm->control    = strp->mark;
++
++      return true;
+ }
+ /* Called with lock held on lower socket */
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index 1d7caadd0cbc..6385329ef98d 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1380,7 +1380,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
+                       return sock_intr_errno(timeo);
+       }
+-      tls_strp_msg_load(&ctx->strp, released);
++      if (unlikely(!tls_strp_msg_load(&ctx->strp, released)))
++              return tls_rx_rec_wait(sk, psock, nonblock, false);
+       return 1;
+ }
+-- 
+2.50.1
+
diff --git a/queue-6.12/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-6.12/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
new file mode 100644 (file)
index 0000000..9f55170
--- /dev/null
@@ -0,0 +1,51 @@
+From f4d9b128db5250a96a548994f339a395c002e13f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, where we can't rely on offloads because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
+
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index 5de47dd5e909..12ba1a8db93a 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -61,7 +61,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+       remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+       skb->remcsum_offload = remcsum;
+-      need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++      need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+       /* Try to offload checksum if possible */
+       offload_csum = !!(need_csum &&
+                         !need_ipsec &&
+-- 
+2.50.1
+
diff --git a/queue-6.15/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch b/queue-6.15/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
new file mode 100644 (file)
index 0000000..bd22484
--- /dev/null
@@ -0,0 +1,91 @@
+From 5e058a0b161a48cd29cded0776081d5e31d66472 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Aug 2025 17:03:11 +0200
+Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data
+
+From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ]
+
+Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid
+discarding useful information") caused the number of wakeup interrupts
+to increase on an idle system [1], which was not expected to happen
+after merely allowing shallower idle states to be selected by the
+governor in some cases.
+
+However, on the system in question, all of the idle states deeper than
+WFI are rejected by the driver due to a firmware issue [2].  This causes
+the governor to only consider the recent interval duriation data
+corresponding to attempts to enter WFI that are successful and the
+recent invervals table is filled with values lower than the scheduler
+tick period.  Consequently, the governor predicts an idle duration
+below the scheduler tick period length and avoids stopping the tick
+more often which leads to the observed symptom.
+
+Address it by modifying the governor to update the recent intervals
+table also when entering the previously selected idle state fails, so
+it knows that the short idle intervals might have been the minority
+had the selected idle states been actually entered every time.
+
+Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information")
+Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1]
+Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2]
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Tested-by: Christian Loehle <christian.loehle@arm.com>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Christian Loehle <christian.loehle@arm.com>
+Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
+index 39aa0aea61c6..711517bd43a1 100644
+--- a/drivers/cpuidle/governors/menu.c
++++ b/drivers/cpuidle/governors/menu.c
+@@ -97,6 +97,14 @@ static inline int which_bucket(u64 duration_ns)
+ static DEFINE_PER_CPU(struct menu_device, menu_devices);
++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us)
++{
++      /* Update the repeating-pattern data. */
++      data->intervals[data->interval_ptr++] = interval_us;
++      if (data->interval_ptr >= INTERVALS)
++              data->interval_ptr = 0;
++}
++
+ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
+ /*
+@@ -222,6 +230,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+       if (data->needs_update) {
+               menu_update(drv, dev);
+               data->needs_update = 0;
++      } else if (!dev->last_residency_ns) {
++              /*
++               * This happens when the driver rejects the previously selected
++               * idle state and returns an error, so update the recent
++               * intervals table to prevent invalid information from being
++               * used going forward.
++               */
++              menu_update_intervals(data, UINT_MAX);
+       }
+       /* Find the shortest expected idle interval. */
+@@ -482,10 +498,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+       data->correction_factor[data->bucket] = new_factor;
+-      /* update the repeating-pattern data */
+-      data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
+-      if (data->interval_ptr >= INTERVALS)
+-              data->interval_ptr = 0;
++      menu_update_intervals(data, ktime_to_us(measured_ns));
+ }
+ /**
+-- 
+2.50.1
+
diff --git a/queue-6.15/erofs-fix-block-count-report-when-48-bit-layout-is-o.patch b/queue-6.15/erofs-fix-block-count-report-when-48-bit-layout-is-o.patch
new file mode 100644 (file)
index 0000000..84b618c
--- /dev/null
@@ -0,0 +1,37 @@
+From eb0b60837981894893c566d7ac0d81ad2b5d8126 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 16:20:19 +0800
+Subject: erofs: fix block count report when 48-bit layout is on
+
+From: Gao Xiang <hsiangkao@linux.alibaba.com>
+
+[ Upstream commit 0b96d9bed324a1c1b7d02bfb9596351ef178428d ]
+
+Fix incorrect shift order when combining the 48-bit block count.
+
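+For illustration (assuming, as in the hunk below, that blocks already holds
+the low 32 bits and blocks_hi the upper 16 bits of the 48-bit count):
+
+      /* wrong: pushes the low bits up and ORs the high bits into the bottom */
+      count = (blocks << 32) | blocks_hi;
+
+      /* right: keep the low bits in place, shift the high bits up */
+      count = blocks | ((u64)blocks_hi << 32);
+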
+Fixes: 2e1473d5195f ("erofs: implement 48-bit block addressing for unencoded inodes")
+Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+Link: https://lore.kernel.org/r/20250807082019.3093539-1-hsiangkao@linux.alibaba.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/erofs/super.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/fs/erofs/super.c b/fs/erofs/super.c
+index 6e57b9cc6ed2..cfe454dbf415 100644
+--- a/fs/erofs/super.c
++++ b/fs/erofs/super.c
+@@ -313,8 +313,8 @@ static int erofs_read_superblock(struct super_block *sb)
+       sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
+       if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) {
+               sbi->root_nid = le64_to_cpu(dsb->rootnid_8b);
+-              sbi->dif0.blocks = (sbi->dif0.blocks << 32) |
+-                              le16_to_cpu(dsb->rb.blocks_hi);
++              sbi->dif0.blocks = sbi->dif0.blocks |
++                              ((u64)le16_to_cpu(dsb->rb.blocks_hi) << 32);
+       } else {
+               sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b);
+       }
+-- 
+2.50.1
+
diff --git a/queue-6.15/habanalabs-fix-uaf-in-export_dmabuf.patch b/queue-6.15/habanalabs-fix-uaf-in-export_dmabuf.patch
new file mode 100644 (file)
index 0000000..0b4f8a9
--- /dev/null
@@ -0,0 +1,96 @@
+From 89ee3cca075191f343cb997a8c8f9baefda963f1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 12 Jul 2025 06:02:31 +0100
+Subject: habanalabs: fix UAF in export_dmabuf()
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+[ Upstream commit 33927f3d0ecdcff06326d6e4edb6166aed42811c ]
+
+As soon as we'd inserted a file reference into descriptor table, another
+thread could close it.  That's fine for the case when all we are doing is
+returning that descriptor to userland (it's a race, but it's a userland
+race and there's nothing the kernel can do about it).  However, if we
+follow fd_install() with any kind of access to objects that would be
+destroyed on close (be it the struct file itself or anything destroyed
+by its ->release()), we have a UAF.
+
+dma_buf_fd() is a combination of reserving a descriptor and fd_install().
+habanalabs export_dmabuf() calls it and then proceeds to access the
+objects destroyed on close.  In particular, it grabs an extra reference to
+another struct file that will be dropped as part of ->release() for ours;
+that "will be" is actually "might have already been".
+
+Fix that by reserving the descriptor before anything else and doing
+fd_install() only when everything has been set up.  As a side benefit, we no
+longer have the failure exit where the file is already created but the
+reference to the underlying file (as well as ->dmabuf_export_cnt, etc.) is not
+grabbed yet;
+unlike dma_buf_fd(), fd_install() can't fail.
+
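+As a generic sketch of the ordering the fix enforces (error handling and
+driver specifics trimmed; names are illustrative only):
+
+      int fd = get_unused_fd_flags(flags);    /* reserve the slot first */
+      if (fd < 0)
+              return fd;
+
+      /* ... create the dma-buf and take every reference it needs ... */
+
+      *dmabuf_fd = fd;
+      fd_install(fd, dmabuf->file);           /* publish last; cannot fail */
+      return 0;
+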
+Fixes: db1a8dd916aa ("habanalabs: add support for dma-buf exporter")
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/accel/habanalabs/common/memory.c | 23 +++++++----------------
+ 1 file changed, 7 insertions(+), 16 deletions(-)
+
+diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c
+index 601fdbe70179..61472a381904 100644
+--- a/drivers/accel/habanalabs/common/memory.c
++++ b/drivers/accel/habanalabs/common/memory.c
+@@ -1829,9 +1829,6 @@ static void hl_release_dmabuf(struct dma_buf *dmabuf)
+       struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv;
+       struct hl_ctx *ctx;
+-      if (!hl_dmabuf)
+-              return;
+-
+       ctx = hl_dmabuf->ctx;
+       if (hl_dmabuf->memhash_hnode)
+@@ -1859,7 +1856,12 @@ static int export_dmabuf(struct hl_ctx *ctx,
+ {
+       DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+       struct hl_device *hdev = ctx->hdev;
+-      int rc, fd;
++      CLASS(get_unused_fd, fd)(flags);
++
++      if (fd < 0) {
++              dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
++              return fd;
++      }
+       exp_info.ops = &habanalabs_dmabuf_ops;
+       exp_info.size = total_size;
+@@ -1872,13 +1874,6 @@ static int export_dmabuf(struct hl_ctx *ctx,
+               return PTR_ERR(hl_dmabuf->dmabuf);
+       }
+-      fd = dma_buf_fd(hl_dmabuf->dmabuf, flags);
+-      if (fd < 0) {
+-              dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
+-              rc = fd;
+-              goto err_dma_buf_put;
+-      }
+-
+       hl_dmabuf->ctx = ctx;
+       hl_ctx_get(hl_dmabuf->ctx);
+       atomic_inc(&ctx->hdev->dmabuf_export_cnt);
+@@ -1890,13 +1885,9 @@ static int export_dmabuf(struct hl_ctx *ctx,
+       get_file(ctx->hpriv->file_priv->filp);
+       *dmabuf_fd = fd;
++      fd_install(take_fd(fd), hl_dmabuf->dmabuf->file);
+       return 0;
+-
+-err_dma_buf_put:
+-      hl_dmabuf->dmabuf->priv = NULL;
+-      dma_buf_put(hl_dmabuf->dmabuf);
+-      return rc;
+ }
+ static int validate_export_params_common(struct hl_device *hdev, u64 addr, u64 size, u64 offset)
+-- 
+2.50.1
+
diff --git a/queue-6.15/hamradio-ignore-ops-locked-netdevs.patch b/queue-6.15/hamradio-ignore-ops-locked-netdevs.patch
new file mode 100644 (file)
index 0000000..90399e0
--- /dev/null
@@ -0,0 +1,62 @@
+From d23f33c1a9c34b07bd4781c90f234a9a1cbeaa8b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 14:37:26 -0700
+Subject: hamradio: ignore ops-locked netdevs
+
+From: Stanislav Fomichev <sdf@fomichev.me>
+
+[ Upstream commit c64237960819aee1766d03f446ae6de94b1e3f73 ]
+
+Syzkaller managed to trigger a lock dependency issue in xsk_notify via
+register_netdevice. As discussed in [0], using register_netdevice
+in the notifiers is problematic, so skip adding hamradio for ops-locked
+devices.
+
+       xsk_notifier+0x89/0x230 net/xdp/xsk.c:1664
+       notifier_call_chain+0x1b6/0x3e0 kernel/notifier.c:85
+       call_netdevice_notifiers_extack net/core/dev.c:2267 [inline]
+       call_netdevice_notifiers net/core/dev.c:2281 [inline]
+       unregister_netdevice_many_notify+0x14d7/0x1ff0 net/core/dev.c:12156
+       unregister_netdevice_many net/core/dev.c:12219 [inline]
+       unregister_netdevice_queue+0x33c/0x380 net/core/dev.c:12063
+       register_netdevice+0x1689/0x1ae0 net/core/dev.c:11241
+       bpq_new_device drivers/net/hamradio/bpqether.c:481 [inline]
+       bpq_device_event+0x491/0x600 drivers/net/hamradio/bpqether.c:523
+       notifier_call_chain+0x1b6/0x3e0 kernel/notifier.c:85
+       call_netdevice_notifiers_extack net/core/dev.c:2267 [inline]
+       call_netdevice_notifiers net/core/dev.c:2281 [inline]
+       __dev_notify_flags+0x18d/0x2e0 net/core/dev.c:-1
+       netif_change_flags+0xe8/0x1a0 net/core/dev.c:9608
+       dev_change_flags+0x130/0x260 net/core/dev_api.c:68
+       devinet_ioctl+0xbb4/0x1b50 net/ipv4/devinet.c:1200
+       inet_ioctl+0x3c0/0x4c0 net/ipv4/af_inet.c:1001
+
+0: https://lore.kernel.org/netdev/20250625140357.6203d0af@kernel.org/
+Fixes: 4c975fd70002 ("net: hold instance lock during NETDEV_REGISTER/UP")
+Suggested-by: Jakub Kicinski <kuba@kernel.org>
+Reported-by: syzbot+e6300f66a999a6612477@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e6300f66a999a6612477
+Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
+Link: https://patch.msgid.link/20250806213726.1383379-2-sdf@fomichev.me
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/hamradio/bpqether.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c
+index 0e0fe32d2da4..045c5177262e 100644
+--- a/drivers/net/hamradio/bpqether.c
++++ b/drivers/net/hamradio/bpqether.c
+@@ -138,7 +138,7 @@ static inline struct net_device *bpq_get_ax25_dev(struct net_device *dev)
+ static inline int dev_is_ethdev(struct net_device *dev)
+ {
+-      return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5);
++      return dev->type == ARPHRD_ETHER && !netdev_need_ops_lock(dev);
+ }
+ /* ------------------------------------------------------------------------ */
+-- 
+2.50.1
+
diff --git a/queue-6.15/intel_idle-allow-loading-acpi-tables-for-any-family.patch b/queue-6.15/intel_idle-allow-loading-acpi-tables-for-any-family.patch
new file mode 100644 (file)
index 0000000..2fb4262
--- /dev/null
@@ -0,0 +1,41 @@
+From ceb238ac9f661be4f5172183ee62b9858f74ad67 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 15:37:14 -0400
+Subject: intel_idle: Allow loading ACPI tables for any family
+
+From: Len Brown <len.brown@intel.com>
+
+[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ]
+
+There is no reason to limit intel_idle's loading of ACPI tables to
+family 6.  Upcoming Intel processors are not in family 6.
+
+Below "Fixes" really means "applies cleanly until".
+That syntax commit didn't change the previous logic,
+but shows this patch applies back 5-years.
+
+Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros")
+Signed-off-by: Len Brown <len.brown@intel.com>
+Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/idle/intel_idle.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
+index 976f5be54e36..039dc42dd509 100644
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -1665,7 +1665,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
+ };
+ static const struct x86_cpu_id intel_mwait_ids[] __initconst = {
+-      X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL),
++      X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL),
+       {}
+ };
+-- 
+2.50.1
+
diff --git a/queue-6.15/ipvs-fix-estimator-kthreads-preferred-affinity.patch b/queue-6.15/ipvs-fix-estimator-kthreads-preferred-affinity.patch
new file mode 100644 (file)
index 0000000..5af8ee7
--- /dev/null
@@ -0,0 +1,90 @@
+From f981586f1a7a73248f159d3d77b4516449f568ed Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 29 Jul 2025 14:26:11 +0200
+Subject: ipvs: Fix estimator kthreads preferred affinity
+
+From: Frederic Weisbecker <frederic@kernel.org>
+
+[ Upstream commit c0a23bbc98e93704a1f4fb5e7e7bb2d7c0fb6eb3 ]
+
+The estimator kthreads' affinity is defined by preferences that can be
+overwritten via sysctl, and it is applied through a plain call to the
+scheduler's affinity API.
+
+However, since the introduction of managed kthread preferred affinity,
+such a practice bypasses the kthread core code, which eventually
+overwrites the target with the default unbound affinity.
+
+Fix this by using the appropriate kthread API.
+
+Fixes: d1a89197589c ("kthread: Default affine kthread to its preferred NUMA node")
+Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
+Acked-by: Julian Anastasov <ja@ssi.bg>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/ip_vs.h            | 13 +++++++++++++
+ kernel/kthread.c               |  1 +
+ net/netfilter/ipvs/ip_vs_est.c |  3 ++-
+ 3 files changed, 16 insertions(+), 1 deletion(-)
+
+diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
+index ff406ef4fd4a..29a36709e7f3 100644
+--- a/include/net/ip_vs.h
++++ b/include/net/ip_vs.h
+@@ -1163,6 +1163,14 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
+               return housekeeping_cpumask(HK_TYPE_KTHREAD);
+ }
++static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs)
++{
++      if (ipvs->est_cpulist_valid)
++              return ipvs->sysctl_est_cpulist;
++      else
++              return NULL;
++}
++
+ static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
+ {
+       return ipvs->sysctl_est_nice;
+@@ -1270,6 +1278,11 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
+       return housekeeping_cpumask(HK_TYPE_KTHREAD);
+ }
++static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs)
++{
++      return NULL;
++}
++
+ static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
+ {
+       return IPVS_EST_NICE;
+diff --git a/kernel/kthread.c b/kernel/kthread.c
+index 77c44924cf54..800c8fc46b08 100644
+--- a/kernel/kthread.c
++++ b/kernel/kthread.c
+@@ -894,6 +894,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
+       return ret;
+ }
++EXPORT_SYMBOL_GPL(kthread_affine_preferred);
+ /*
+  * Re-affine kthreads according to their preferences
+diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
+index f821ad2e19b3..15049b826732 100644
+--- a/net/netfilter/ipvs/ip_vs_est.c
++++ b/net/netfilter/ipvs/ip_vs_est.c
+@@ -265,7 +265,8 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
+       }
+       set_user_nice(kd->task, sysctl_est_nice(ipvs));
+-      set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));
++      if (sysctl_est_preferred_cpulist(ipvs))
++              kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs));
+       pr_info("starting estimator thread %d...\n", kd->id);
+       wake_up_process(kd->task);
+-- 
+2.50.1
+
diff --git a/queue-6.15/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch b/queue-6.15/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch
new file mode 100644 (file)
index 0000000..c36a2fd
--- /dev/null
@@ -0,0 +1,78 @@
+From 9e464b22810b43dd7989a0886ce28831fb986189 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 24 Jul 2025 17:09:56 +0800
+Subject: mm/smaps: fix race between smaps_hugetlb_range and migration
+
+From: Jinjiang Tu <tujinjiang@huawei.com>
+
+[ Upstream commit 45d19b4b6c2d422771c29b83462d84afcbb33f01 ]
+
+smaps_hugetlb_range() handles the pte without holding the ptl, and may run
+concurrently with migration, leading to a BUG_ON in pfn_swap_entry_to_page().
+The race is as follows.
+
+smaps_hugetlb_range              migrate_pages
+  huge_ptep_get
+                                   remove_migration_ptes
+                                  folio_unlock
+  pfn_swap_entry_folio
+    BUG_ON
+
+To fix it, hold ptl lock in smaps_hugetlb_range().
+
+Link: https://lkml.kernel.org/r/20250724090958.455887-1-tujinjiang@huawei.com
+Link: https://lkml.kernel.org/r/20250724090958.455887-2-tujinjiang@huawei.com
+Fixes: 25ee01a2fca0 ("mm: hugetlb: proc: add hugetlb-related fields to /proc/PID/smaps")
+Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Andrei Vagin <avagin@gmail.com>
+Cc: Andrii Nakryiko <andrii@kernel.org>
+Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
+Cc: Brahmajit Das <brahmajit.xyz@gmail.com>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Dev Jain <dev.jain@arm.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Joern Engel <joern@logfs.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/proc/task_mmu.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
+index e57e323817e7..3b8eaa7722c8 100644
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -1020,10 +1020,13 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+ {
+       struct mem_size_stats *mss = walk->private;
+       struct vm_area_struct *vma = walk->vma;
+-      pte_t ptent = huge_ptep_get(walk->mm, addr, pte);
+       struct folio *folio = NULL;
+       bool present = false;
++      spinlock_t *ptl;
++      pte_t ptent;
++      ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
++      ptent = huge_ptep_get(walk->mm, addr, pte);
+       if (pte_present(ptent)) {
+               folio = page_folio(pte_page(ptent));
+               present = true;
+@@ -1042,6 +1045,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+               else
+                       mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+       }
++      spin_unlock(ptl);
+       return 0;
+ }
+ #else
+-- 
+2.50.1
+
diff --git a/queue-6.15/net-hibmcge-fix-rtnl-deadlock-issue.patch b/queue-6.15/net-hibmcge-fix-rtnl-deadlock-issue.patch
new file mode 100644 (file)
index 0000000..757639b
--- /dev/null
@@ -0,0 +1,122 @@
+From ed1da7003cf22b8dda5eafc827e4981942bcc092 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 18:27:56 +0800
+Subject: net: hibmcge: fix rtnl deadlock issue
+
+From: Jijie Shao <shaojijie@huawei.com>
+
+[ Upstream commit c875503a9b9082928d7d3fc60b5400d16fbfae4e ]
+
+Currently, the hibmcge netdev acquires the rtnl_lock in
+pci_error_handlers.reset_prepare() and releases it in
+pci_error_handlers.reset_done().
+
+However, in the PCI framework:
+pci_reset_bus - __pci_reset_slot - pci_slot_save_and_disable_locked -
+ pci_dev_save_and_disable - err_handler->reset_prepare(dev);
+
+In pci_slot_save_and_disable_locked():
+       list_for_each_entry(dev, &slot->bus->devices, bus_list) {
+               if (!dev->slot || dev->slot!= slot)
+                       continue;
+               pci_dev_save_and_disable(dev);
+               if (dev->subordinate)
+                       pci_bus_save_and_disable_locked(dev->subordinate);
+       }
+
+This will iterate through all devices under the current bus and execute
+err_handler->reset_prepare(), causing two devices of the hibmcge driver
+to sequentially request the rtnl_lock, leading to a deadlock.
+
+Since the driver now executes netif_device_detach()
+before the reset process, it will not run concurrently with
+other netdev APIs, so there is no need to hold the rtnl_lock now.
+
+Therefore, this patch removes the rtnl_lock during the reset process and
+adjusts the position of HBG_NIC_STATE_RESETTING to ensure
+that multiple resets are not executed concurrently.
+
+Fixes: 3f5a61f6d504f ("net: hibmcge: Add reset supported in this module")
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c | 14 +++++---------
+ 1 file changed, 5 insertions(+), 9 deletions(-)
+
+diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
+index ff3295b60a69..dee1e8681157 100644
+--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
+@@ -53,9 +53,11 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type)
+ {
+       int ret;
+-      ASSERT_RTNL();
++      if (test_and_set_bit(HBG_NIC_STATE_RESETTING, &priv->state))
++              return -EBUSY;
+       if (netif_running(priv->netdev)) {
++              clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+               dev_warn(&priv->pdev->dev,
+                        "failed to reset because port is up\n");
+               return -EBUSY;
+@@ -64,7 +66,6 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type)
+       netif_device_detach(priv->netdev);
+       priv->reset_type = type;
+-      set_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+       clear_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state);
+       ret = hbg_hw_event_notify(priv, HBG_HW_EVENT_RESET);
+       if (ret) {
+@@ -83,28 +84,25 @@ static int hbg_reset_done(struct hbg_priv *priv, enum hbg_reset_type type)
+           type != priv->reset_type)
+               return 0;
+-      ASSERT_RTNL();
+-
+-      clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+       ret = hbg_rebuild(priv);
+       if (ret) {
+               set_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state);
++              clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+               dev_err(&priv->pdev->dev, "failed to rebuild after reset\n");
+               return ret;
+       }
+       netif_device_attach(priv->netdev);
++      clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+       dev_info(&priv->pdev->dev, "reset done\n");
+       return ret;
+ }
+-/* must be protected by rtnl lock */
+ int hbg_reset(struct hbg_priv *priv)
+ {
+       int ret;
+-      ASSERT_RTNL();
+       ret = hbg_reset_prepare(priv, HBG_RESET_TYPE_FUNCTION);
+       if (ret)
+               return ret;
+@@ -169,7 +167,6 @@ static void hbg_pci_err_reset_prepare(struct pci_dev *pdev)
+       struct net_device *netdev = pci_get_drvdata(pdev);
+       struct hbg_priv *priv = netdev_priv(netdev);
+-      rtnl_lock();
+       hbg_reset_prepare(priv, HBG_RESET_TYPE_FLR);
+ }
+@@ -179,7 +176,6 @@ static void hbg_pci_err_reset_done(struct pci_dev *pdev)
+       struct hbg_priv *priv = netdev_priv(netdev);
+       hbg_reset_done(priv, HBG_RESET_TYPE_FLR);
+-      rtnl_unlock();
+ }
+ static const struct pci_error_handlers hbg_pci_err_handler = {
+-- 
+2.50.1
+
diff --git a/queue-6.15/net-hibmcge-fix-the-division-by-zero-issue.patch b/queue-6.15/net-hibmcge-fix-the-division-by-zero-issue.patch
new file mode 100644 (file)
index 0000000..f98955e
--- /dev/null
@@ -0,0 +1,46 @@
+From 2870ddbdb6f9caf8329f3c51d2f9946a176eda6c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 18:27:57 +0800
+Subject: net: hibmcge: fix the division by zero issue
+
+From: Jijie Shao <shaojijie@huawei.com>
+
+[ Upstream commit 7004b26f0b64331143eb0b312e77a357a11427ce ]
+
+When the network port is down, the queue is released, and ring->len is 0.
+In debugfs, hbg_get_queue_used_num() will be called,
+which may lead to a division by zero issue.
+
+This patch adds a check: if ring->len is 0,
+hbg_get_queue_used_num() directly returns 0.
+
+Fixes: 40735e7543f9 ("net: hibmcge: Implement .ndo_start_xmit function")
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h
+index 2883a5899ae2..8b6110599e10 100644
+--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h
++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h
+@@ -29,7 +29,12 @@ static inline bool hbg_fifo_is_full(struct hbg_priv *priv, enum hbg_dir dir)
+ static inline u32 hbg_get_queue_used_num(struct hbg_ring *ring)
+ {
+-      return (ring->ntu + ring->len - ring->ntc) % ring->len;
++      u32 len = READ_ONCE(ring->len);
++
++      if (!len)
++              return 0;
++
++      return (READ_ONCE(ring->ntu) + len - READ_ONCE(ring->ntc)) % len;
+ }
+ netdev_tx_t hbg_net_start_xmit(struct sk_buff *skb, struct net_device *netdev);
+-- 
+2.50.1
+
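The guarded occupancy calculation generalizes beyond this driver. Below is a
standalone, hypothetical userspace sketch (not hibmcge code) of the same
(ntu + len - ntc) % len arithmetic, showing why a released ring (len == 0)
must short-circuit to 0 before the modulo is taken.

#include <stdio.h>

struct ring {
	unsigned int ntu;	/* next to use (producer index) */
	unsigned int ntc;	/* next to clean (consumer index) */
	unsigned int len;	/* descriptor count; 0 once the queue is released */
};

static unsigned int ring_used(const struct ring *r)
{
	if (!r->len)		/* port down: ring released, avoid division by zero */
		return 0;

	return (r->ntu + r->len - r->ntc) % r->len;
}

int main(void)
{
	struct ring up = { .ntu = 5, .ntc = 2, .len = 8 };
	struct ring down = { 0 };

	printf("used while up:   %u\n", ring_used(&up));	/* prints 3 */
	printf("used while down: %u\n", ring_used(&down));	/* prints 0, no fault */
	return 0;
}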
diff --git a/queue-6.15/net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch b/queue-6.15/net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch
new file mode 100644 (file)
index 0000000..c84fb4c
--- /dev/null
@@ -0,0 +1,68 @@
+From 1e8262925f1a14e833eb34376195aa9aa71ca95e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 18:27:58 +0800
+Subject: net: hibmcge: fix the np_link_fail error reporting issue
+
+From: Jijie Shao <shaojijie@huawei.com>
+
+[ Upstream commit 62c50180ffda01468e640ac14925503796f255e2 ]
+
+Currently, after modifying device port mode, the np_link_ok state
+is immediately checked. At this point, the device may not yet ready,
+leading to the querying of an intermediate state.
+
+This patch will poll to check if np_link is ok after
+modifying device port mode, and only report np_link_fail upon timeout.
+
+Fixes: e0306637e85d ("net: hibmcge: Add support for mac link exception handling feature")
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c | 15 +++++++++++++--
+ 1 file changed, 13 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c
+index 9b65eef62b3f..2844124f306d 100644
+--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c
++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c
+@@ -12,6 +12,8 @@
+ #define HBG_HW_EVENT_WAIT_TIMEOUT_US  (2 * 1000 * 1000)
+ #define HBG_HW_EVENT_WAIT_INTERVAL_US (10 * 1000)
++#define HBG_MAC_LINK_WAIT_TIMEOUT_US  (500 * 1000)
++#define HBG_MAC_LINK_WAIT_INTERVAL_US (5 * 1000)
+ /* little endian or big endian.
+  * ctrl means packet description, data means skb packet data
+  */
+@@ -213,6 +215,9 @@ void hbg_hw_fill_buffer(struct hbg_priv *priv, u32 buffer_dma_addr)
+ void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex)
+ {
++      u32 link_status;
++      int ret;
++
+       hbg_hw_mac_enable(priv, HBG_STATUS_DISABLE);
+       hbg_reg_write_field(priv, HBG_REG_PORT_MODE_ADDR,
+@@ -224,8 +229,14 @@ void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex)
+       hbg_hw_mac_enable(priv, HBG_STATUS_ENABLE);
+-      if (!hbg_reg_read_field(priv, HBG_REG_AN_NEG_STATE_ADDR,
+-                              HBG_REG_AN_NEG_STATE_NP_LINK_OK_B))
++      /* wait MAC link up */
++      ret = readl_poll_timeout(priv->io_base + HBG_REG_AN_NEG_STATE_ADDR,
++                               link_status,
++                               FIELD_GET(HBG_REG_AN_NEG_STATE_NP_LINK_OK_B,
++                                         link_status),
++                               HBG_MAC_LINK_WAIT_INTERVAL_US,
++                               HBG_MAC_LINK_WAIT_TIMEOUT_US);
++      if (ret)
+               hbg_np_link_fail_task_schedule(priv);
+ }
+-- 
+2.50.1
+
diff --git a/queue-6.15/net-kcm-fix-race-condition-in-kcm_unattach.patch b/queue-6.15/net-kcm-fix-race-condition-in-kcm_unattach.patch
new file mode 100644 (file)
index 0000000..9de4428
--- /dev/null
@@ -0,0 +1,88 @@
+From 8bee886e735d51eab3303b93156daeda9571fef3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Aug 2025 21:18:03 +0200
+Subject: net: kcm: Fix race condition in kcm_unattach()
+
+From: Sven Stegemann <sven@stegemann.de>
+
+[ Upstream commit 52565a935213cd6a8662ddb8efe5b4219343a25d ]
+
+syzbot found a race condition when kcm_unattach(psock)
+and kcm_release(kcm) are executed at the same time.
+
+kcm_unattach() is missing a check of the flag
+kcm->tx_stopped before calling queue_work().
+
+If the kcm has a reserved psock, kcm_unattach() might get executed
+between cancel_work_sync() and unreserve_psock() in kcm_release(),
+requeuing kcm->tx_work right before kcm gets freed in kcm_done().
+
+Remove kcm->tx_stopped and replace it with the less
+error-prone disable_work_sync().
+
+Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module")
+Reported-by: syzbot+e62c9db591c30e174662@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e62c9db591c30e174662
+Reported-by: syzbot+d199b52665b6c3069b94@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=d199b52665b6c3069b94
+Reported-by: syzbot+be6b1fdfeae512726b4e@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=be6b1fdfeae512726b4e
+Signed-off-by: Sven Stegemann <sven@stegemann.de>
+Link: https://patch.msgid.link/20250812191810.27777-1-sven@stegemann.de
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/kcm.h |  1 -
+ net/kcm/kcmsock.c | 10 ++--------
+ 2 files changed, 2 insertions(+), 9 deletions(-)
+
+diff --git a/include/net/kcm.h b/include/net/kcm.h
+index 441e993be634..d9c35e71ecea 100644
+--- a/include/net/kcm.h
++++ b/include/net/kcm.h
+@@ -71,7 +71,6 @@ struct kcm_sock {
+       struct list_head wait_psock_list;
+       struct sk_buff *seq_skb;
+       struct mutex tx_mutex;
+-      u32 tx_stopped : 1;
+       /* Don't use bit fields here, these are set under different locks */
+       bool tx_wait;
+diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
+index 24aec295a51c..8c0577cd764f 100644
+--- a/net/kcm/kcmsock.c
++++ b/net/kcm/kcmsock.c
+@@ -429,7 +429,7 @@ static void psock_write_space(struct sock *sk)
+       /* Check if the socket is reserved so someone is waiting for sending. */
+       kcm = psock->tx_kcm;
+-      if (kcm && !unlikely(kcm->tx_stopped))
++      if (kcm)
+               queue_work(kcm_wq, &kcm->tx_work);
+       spin_unlock_bh(&mux->lock);
+@@ -1688,12 +1688,6 @@ static int kcm_release(struct socket *sock)
+        */
+       __skb_queue_purge(&sk->sk_write_queue);
+-      /* Set tx_stopped. This is checked when psock is bound to a kcm and we
+-       * get a writespace callback. This prevents further work being queued
+-       * from the callback (unbinding the psock occurs after canceling work.
+-       */
+-      kcm->tx_stopped = 1;
+-
+       release_sock(sk);
+       spin_lock_bh(&mux->lock);
+@@ -1709,7 +1703,7 @@ static int kcm_release(struct socket *sock)
+       /* Cancel work. After this point there should be no outside references
+        * to the kcm socket.
+        */
+-      cancel_work_sync(&kcm->tx_work);
++      disable_work_sync(&kcm->tx_work);
+       lock_sock(sk);
+       psock = kcm->tx_psock;
+-- 
+2.50.1
+
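For context, the property relied on here is that a work item disabled via
disable_work_sync() cannot be re-queued until it is enabled again. A minimal
kernel-style sketch of that pattern, with invented names (my_obj,
my_obj_teardown) and not taken from the kcm code:

#include <linux/workqueue.h>
#include <linux/slab.h>

struct my_obj {
	struct work_struct tx_work;
	/* ... */
};

static void my_obj_teardown(struct my_obj *obj)
{
	/* disable_work_sync() cancels pending work and, unlike
	 * cancel_work_sync(), leaves the item disabled afterwards:
	 * later queue_work() calls on &obj->tx_work are no-ops, so a
	 * racing callback can no longer re-queue work on an object
	 * that is about to be freed.
	 */
	disable_work_sync(&obj->tx_work);

	/* ... unlink obj from the lists that callbacks walk ... */

	kfree(obj);	/* safe: tx_work can no longer run or be queued */
}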
diff --git a/queue-6.15/net-lapbether-ignore-ops-locked-netdevs.patch b/queue-6.15/net-lapbether-ignore-ops-locked-netdevs.patch
new file mode 100644 (file)
index 0000000..1213206
--- /dev/null
@@ -0,0 +1,64 @@
+From cdf8a27cb2b49bb5d7d3fd82a048319d4cf78cba Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 14:37:25 -0700
+Subject: net: lapbether: ignore ops-locked netdevs
+
+From: Stanislav Fomichev <sdf@fomichev.me>
+
+[ Upstream commit 53898ebabe843bfa7baea9dae152797d5d0563c9 ]
+
+Syzkaller managed to trigger a lock dependency issue in xsk_notify via
+register_netdevice. As discussed in [0], using register_netdevice
+in the notifiers is problematic, so skip adding lapbeth for ops-locked
+devices.
+
+       xsk_notifier+0xa4/0x280 net/xdp/xsk.c:1645
+       notifier_call_chain+0xbc/0x410 kernel/notifier.c:85
+       call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:2230
+       call_netdevice_notifiers_extack net/core/dev.c:2268 [inline]
+       call_netdevice_notifiers net/core/dev.c:2282 [inline]
+       unregister_netdevice_many_notify+0xf9d/0x2700 net/core/dev.c:12077
+       unregister_netdevice_many net/core/dev.c:12140 [inline]
+       unregister_netdevice_queue+0x305/0x3f0 net/core/dev.c:11984
+       register_netdevice+0x18f1/0x2270 net/core/dev.c:11149
+       lapbeth_new_device drivers/net/wan/lapbether.c:420 [inline]
+       lapbeth_device_event+0x5b1/0xbe0 drivers/net/wan/lapbether.c:462
+       notifier_call_chain+0xbc/0x410 kernel/notifier.c:85
+       call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:2230
+       call_netdevice_notifiers_extack net/core/dev.c:2268 [inline]
+       call_netdevice_notifiers net/core/dev.c:2282 [inline]
+       __dev_notify_flags+0x12c/0x2e0 net/core/dev.c:9497
+       netif_change_flags+0x108/0x160 net/core/dev.c:9526
+       dev_change_flags+0xba/0x250 net/core/dev_api.c:68
+       devinet_ioctl+0x11d5/0x1f50 net/ipv4/devinet.c:1200
+       inet_ioctl+0x3a7/0x3f0 net/ipv4/af_inet.c:1001
+
+0: https://lore.kernel.org/netdev/20250625140357.6203d0af@kernel.org/
+Fixes: 4c975fd70002 ("net: hold instance lock during NETDEV_REGISTER/UP")
+Suggested-by: Jakub Kicinski <kuba@kernel.org>
+Reported-by: syzbot+e67ea9c235b13b4f0020@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e67ea9c235b13b4f0020
+Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
+Link: https://patch.msgid.link/20250806213726.1383379-1-sdf@fomichev.me
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/wan/lapbether.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c
+index 995a7207bdf8..f357a7ac70ac 100644
+--- a/drivers/net/wan/lapbether.c
++++ b/drivers/net/wan/lapbether.c
+@@ -81,7 +81,7 @@ static struct lapbethdev *lapbeth_get_x25_dev(struct net_device *dev)
+ static __inline__ int dev_is_ethdev(struct net_device *dev)
+ {
+-      return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5);
++      return dev->type == ARPHRD_ETHER && !netdev_need_ops_lock(dev);
+ }
+ /* ------------------------------------------------------------------------ */
+-- 
+2.50.1
+
diff --git a/queue-6.15/net-page_pool-allow-enabling-recycling-late-fix-fals.patch b/queue-6.15/net-page_pool-allow-enabling-recycling-late-fix-fals.patch
new file mode 100644 (file)
index 0000000..f6b8e63
--- /dev/null
@@ -0,0 +1,174 @@
+From 50c290becd59110bb55ee279fd703e866f2814a0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 17:36:54 -0700
+Subject: net: page_pool: allow enabling recycling late, fix false positive
+ warning
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 64fdaa94bfe0cca3a0f4b2dd922486c5f59fe678 ]
+
+Page pool can have pages "directly" (locklessly) recycled to it,
+if the NAPI that owns the page pool is scheduled to run on the same CPU.
+To make this safe we check that the NAPI is disabled while we destroy
+the page pool. In most cases NAPI and page pool lifetimes are tied
+together so this happens naturally.
+
+The queue API expects the following order of calls:
+ -> mem_alloc
+    alloc new pp
+ -> stop
+    napi_disable
+ -> start
+    napi_enable
+ -> mem_free
+    free old pp
+
+Here we allocate the page pool in ->mem_alloc and free in ->mem_free.
+But the NAPIs are only stopped between ->stop and ->start. We created
+page_pool_disable_direct_recycling() to safely shut down the recycling
+in ->stop. This way the page_pool_destroy() call in ->mem_free doesn't
+have to worry about recycling any more.
+
+Unfortunately, the page_pool_disable_direct_recycling() is not enough
+to deal with failures which necessitate freeing the _new_ page pool.
+If we hit a failure in ->mem_alloc or ->stop the new page pool has
+to be freed while the NAPI is active (assuming driver attaches the
+page pool to an existing NAPI instance and doesn't reallocate NAPIs).
+
+Freeing the new page pool is technically safe because it hasn't been
+used for any packets, yet, so there can be no recycling. But the check
+in napi_assert_will_not_race() has no way of knowing that. We could
+check if page pool is empty but that'd make the check much less likely
+to trigger during development.
+
+Add page_pool_enable_direct_recycling(), pairing with
+page_pool_disable_direct_recycling(). It will allow us to create the new
+page pools in "disabled" state and only enable recycling when we know
+the reconfig operation will not fail.
+
+Coincidentally it will also let us re-enable the recycling for the old
+pool, if the reconfig failed:
+
+ -> mem_alloc (new)
+ -> stop (old)
+    # disables direct recycling for old
+ -> start (new)
+    # fail!!
+ -> start (old)
+    # go back to old pp but direct recycling is lost :(
+ -> mem_free (new)
+
+The new helper is idempotent to make life easier for drivers,
+which can operate in HDS mode and support zero-copy Rx.
+The driver can call the helper twice whether there are two pools
+or it has multiple references to a single pool.
+
+Fixes: 40eca00ae605 ("bnxt_en: unlink page pool when stopping Rx queue")
+Tested-by: David Wei <dw@davidwei.uk>
+Link: https://patch.msgid.link/20250805003654.2944974-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c |  9 ++++++-
+ include/net/page_pool/types.h             |  2 ++
+ net/core/page_pool.c                      | 29 +++++++++++++++++++++++
+ 3 files changed, 39 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+index d66519ce57af..8021d97f3f22 100644
+--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+@@ -3779,7 +3779,6 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
+       if (BNXT_RX_PAGE_MODE(bp))
+               pp.pool_size += bp->rx_ring_size;
+       pp.nid = numa_node;
+-      pp.napi = &rxr->bnapi->napi;
+       pp.netdev = bp->dev;
+       pp.dev = &bp->pdev->dev;
+       pp.dma_dir = bp->rx_dir;
+@@ -3807,6 +3806,12 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
+       return PTR_ERR(pool);
+ }
++static void bnxt_enable_rx_page_pool(struct bnxt_rx_ring_info *rxr)
++{
++      page_pool_enable_direct_recycling(rxr->head_pool, &rxr->bnapi->napi);
++      page_pool_enable_direct_recycling(rxr->page_pool, &rxr->bnapi->napi);
++}
++
+ static int bnxt_alloc_rx_agg_bmap(struct bnxt *bp, struct bnxt_rx_ring_info *rxr)
+ {
+       u16 mem_size;
+@@ -3845,6 +3850,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
+               rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node);
+               if (rc)
+                       return rc;
++              bnxt_enable_rx_page_pool(rxr);
+               rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i, 0);
+               if (rc < 0)
+@@ -15998,6 +16004,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx)
+                       goto err_reset;
+       }
++      bnxt_enable_rx_page_pool(rxr);
+       napi_enable_locked(&bnapi->napi);
+       bnxt_db_nq_arm(bp, &cpr->cp_db, cpr->cp_raw_cons);
+diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
+index 431b593de709..1509a536cb85 100644
+--- a/include/net/page_pool/types.h
++++ b/include/net/page_pool/types.h
+@@ -265,6 +265,8 @@ struct page_pool *page_pool_create_percpu(const struct page_pool_params *params,
+ struct xdp_mem_info;
+ #ifdef CONFIG_PAGE_POOL
++void page_pool_enable_direct_recycling(struct page_pool *pool,
++                                     struct napi_struct *napi);
+ void page_pool_disable_direct_recycling(struct page_pool *pool);
+ void page_pool_destroy(struct page_pool *pool);
+ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
+diff --git a/net/core/page_pool.c b/net/core/page_pool.c
+index 3eabe78c93f4..ef870c21e854 100644
+--- a/net/core/page_pool.c
++++ b/net/core/page_pool.c
+@@ -1201,6 +1201,35 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
+       pool->xdp_mem_id = mem->id;
+ }
++/**
++ * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI
++ * @pool: page pool to modify
++ * @napi: NAPI instance to associate the page pool with
++ *
++ * Associate a page pool with a NAPI instance for lockless page recycling.
++ * This is useful when a new page pool has to be added to a NAPI instance
++ * without disabling that NAPI instance, to mark the point at which control
++ * path "hands over" the page pool to the NAPI instance. In most cases driver
++ * can simply set the @napi field in struct page_pool_params, and does not
++ * have to call this helper.
++ *
++ * The function is idempotent, but does not implement any refcounting.
++ * Single page_pool_disable_direct_recycling() will disable recycling,
++ * no matter how many times enable was called.
++ */
++void page_pool_enable_direct_recycling(struct page_pool *pool,
++                                     struct napi_struct *napi)
++{
++      if (READ_ONCE(pool->p.napi) == napi)
++              return;
++      WARN_ON(!napi || pool->p.napi);
++
++      mutex_lock(&page_pools_lock);
++      WRITE_ONCE(pool->p.napi, napi);
++      mutex_unlock(&page_pools_lock);
++}
++EXPORT_SYMBOL(page_pool_enable_direct_recycling);
++
+ void page_pool_disable_direct_recycling(struct page_pool *pool)
+ {
+       /* Disable direct recycling based on pool->cpuid.
+-- 
+2.50.1
+
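A hedged sketch of the intended driver-side usage under the queue API, with
invented names (my_rxq, my_queue_mem_alloc, my_queue_start; not bnxt code):
the new pool is created without a NAPI owner, and direct recycling is only
enabled once the restart can no longer fail.

#include <net/page_pool/types.h>
#include <linux/netdevice.h>
#include <linux/err.h>

struct my_rxq {
	struct napi_struct napi;
	struct device *dev;
	struct page_pool *new_pool;
	unsigned int ring_size;
};

static int my_queue_mem_alloc(struct my_rxq *rxq)
{
	struct page_pool_params pp = {
		.pool_size = rxq->ring_size,
		.dev       = rxq->dev,
		/* .napi left NULL: the pool starts with recycling disabled */
	};

	rxq->new_pool = page_pool_create(&pp);
	return PTR_ERR_OR_ZERO(rxq->new_pool);
}

static void my_queue_start(struct my_rxq *rxq)
{
	/* The old queue is stopped and its pool's direct recycling is
	 * already disabled; past this point the restart cannot fail, so
	 * it is safe to hand the new pool over to the NAPI instance.
	 */
	page_pool_enable_direct_recycling(rxq->new_pool, &rxq->napi);
	napi_enable(&rxq->napi);
}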
diff --git a/queue-6.15/net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch b/queue-6.15/net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch
new file mode 100644 (file)
index 0000000..ae5cb5d
--- /dev/null
@@ -0,0 +1,69 @@
+From fb06884923f5b6158bb457592100bd9b1bb0ecbd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 09:36:55 +0000
+Subject: net: stmmac: thead: Get and enable APB clock on initialization
+
+From: Yao Zi <ziyao@disroot.org>
+
+[ Upstream commit 4cc339ce482ba78589a2d5cbe1c84b735d263383 ]
+
+It's necessary to adjust the MAC TX clock when the link speed changes,
+but it's noted that such adjustment always fails on the TH1520 SoC, and
+reading back from the APB glue registers that control clock generation
+results in garbage, causing a broken link.
+
+With some testing, it's found that a clock must be ungated for access to APB
+glue registers. Without any consumer, the clock is automatically
+disabled during late kernel startup. Let's get and enable it if it's
+described in devicetree.
+
+For backward compatibility with older devicetrees, probing won't fail if
+the APB clock isn't found. In this case, we emit a warning since the
+link will break if the speed changes.
+
+Fixes: 33a1a01e3afa ("net: stmmac: Add glue layer for T-HEAD TH1520 SoC")
+Signed-off-by: Yao Zi <ziyao@disroot.org>
+Tested-by: Drew Fustini <fustini@kernel.org>
+Reviewed-by: Drew Fustini <fustini@kernel.org>
+Link: https://patch.msgid.link/20250808093655.48074-4-ziyao@disroot.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c | 14 ++++++++++++++
+ 1 file changed, 14 insertions(+)
+
+diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
+index c72ee759aae5..f2946bea0bc2 100644
+--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
++++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
+@@ -211,6 +211,7 @@ static int thead_dwmac_probe(struct platform_device *pdev)
+       struct stmmac_resources stmmac_res;
+       struct plat_stmmacenet_data *plat;
+       struct thead_dwmac *dwmac;
++      struct clk *apb_clk;
+       void __iomem *apb;
+       int ret;
+@@ -224,6 +225,19 @@ static int thead_dwmac_probe(struct platform_device *pdev)
+               return dev_err_probe(&pdev->dev, PTR_ERR(plat),
+                                    "dt configuration failed\n");
++      /*
++       * The APB clock is essential for accessing glue registers. However,
++       * old devicetrees don't describe it correctly. We continue to probe
++       * and emit a warning if it isn't present.
++       */
++      apb_clk = devm_clk_get_enabled(&pdev->dev, "apb");
++      if (PTR_ERR(apb_clk) == -ENOENT)
++              dev_warn(&pdev->dev,
++                       "cannot get apb clock, link may break after speed changes\n");
++      else if (IS_ERR(apb_clk))
++              return dev_err_probe(&pdev->dev, PTR_ERR(apb_clk),
++                                   "failed to get apb clock\n");
++
+       dwmac = devm_kzalloc(&pdev->dev, sizeof(*dwmac), GFP_KERNEL);
+       if (!dwmac)
+               return -ENOMEM;
+-- 
+2.50.1
+
diff --git a/queue-6.15/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch b/queue-6.15/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch
new file mode 100644 (file)
index 0000000..ab0957f
--- /dev/null
@@ -0,0 +1,44 @@
+From 35bc060346d46b09199ed15886a9e6f60c6691ab Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Aug 2025 07:23:18 -0700
+Subject: net: ti: icss-iep: Fix incorrect type for return value in
+ extts_enable()
+
+From: Alok Tiwari <alok.a.tiwari@oracle.com>
+
+[ Upstream commit 5f1d1d14db7dabce9c815e7d7cd351f8d58b8585 ]
+
+The variable ret in icss_iep_extts_enable() was incorrectly declared
+as u32, while the function returns int and may return negative error
+codes. This will cause sign extension issues and incorrect error
+propagation. Update ret to be int to fix error handling.
+
+This change corrects the declaration to avoid potential type mismatch.
+
+Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver")
+Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Link: https://patch.msgid.link/20250805142323.1949406-1-alok.a.tiwari@oracle.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/ti/icssg/icss_iep.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c
+index 50bfbc2779e4..d8c9fe1d98c4 100644
+--- a/drivers/net/ethernet/ti/icssg/icss_iep.c
++++ b/drivers/net/ethernet/ti/icssg/icss_iep.c
+@@ -621,7 +621,8 @@ static int icss_iep_pps_enable(struct icss_iep *iep, int on)
+ static int icss_iep_extts_enable(struct icss_iep *iep, u32 index, int on)
+ {
+-      u32 val, cap, ret = 0;
++      u32 val, cap;
++      int ret = 0;
+       mutex_lock(&iep->ptp_clk_mutex);
+-- 
+2.50.1
+
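The type mismatch is easy to reproduce outside the kernel. A minimal
standalone C illustration (not driver code) of why carrying a negative error
code in an unsigned variable breaks error checks:

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

static int do_op(void)
{
	return -EINVAL;			/* the helper fails */
}

int main(void)
{
	uint32_t ret_u32 = do_op();	/* -22 wraps to 4294967274 */
	int ret_int = do_op();

	if (ret_u32 < 0)		/* always false: unsigned values can't be negative */
		printf("u32: error detected\n");
	else
		printf("u32: error missed, value = %u\n", ret_u32);

	if (ret_int < 0)
		printf("int: error detected (%d)\n", ret_int);

	return 0;
}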
diff --git a/queue-6.15/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch b/queue-6.15/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch
new file mode 100644 (file)
index 0000000..74e7ee8
--- /dev/null
@@ -0,0 +1,56 @@
+From 9d6910e013cfc66f3ee4de8d062a7ea32c989c2c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Aug 2025 23:08:12 +0530
+Subject: net: ti: icssg-prueth: Fix emac link speed handling
+
+From: MD Danish Anwar <danishanwar@ti.com>
+
+[ Upstream commit 06feac15406f4f66f4c0c6ea60b10d44775d4133 ]
+
+When link settings are changed emac->speed is populated by
+emac_adjust_link(). The link speed and other settings are then written into
+the DRAM. However if both ports are brought down after this and brought up
+again or if the operating mode is changed and a firmware reload is needed,
+the DRAM is cleared by icssg_config(). As a result the link settings are
+lost.
+
+Fix this by calling emac_adjust_link() after icssg_config(). This
+re-populates the settings in the DRAM after a new firmware load.
+
+Fixes: 9facce84f406 ("net: ti: icssg-prueth: Fix firmware load sequence.")
+Signed-off-by: MD Danish Anwar <danishanwar@ti.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Message-ID: <20250805173812.2183161-1-danishanwar@ti.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/ti/icssg/icssg_prueth.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
+index 2f5c4335dec3..008d77727400 100644
+--- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c
++++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
+@@ -50,6 +50,8 @@
+ /* CTRLMMR_ICSSG_RGMII_CTRL register bits */
+ #define ICSSG_CTRL_RGMII_ID_MODE                BIT(24)
++static void emac_adjust_link(struct net_device *ndev);
++
+ static int emac_get_tx_ts(struct prueth_emac *emac,
+                         struct emac_tx_ts_response *rsp)
+ {
+@@ -266,6 +268,10 @@ static int prueth_emac_common_start(struct prueth *prueth)
+               ret = icssg_config(prueth, emac, slice);
+               if (ret)
+                       goto disable_class;
++
++              mutex_lock(&emac->ndev->phydev->lock);
++              emac_adjust_link(emac->ndev);
++              mutex_unlock(&emac->ndev->phydev->lock);
+       }
+       ret = prueth_emac_start(prueth);
+-- 
+2.50.1
+
diff --git a/queue-6.15/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-6.15/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
new file mode 100644 (file)
index 0000000..ffee4f2
--- /dev/null
@@ -0,0 +1,129 @@
+From 1061094cf0f7026f7919fac281a9d2e9cf45d5b0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+      if (res < 0) {
+                nf_conntrack_get(&ct->ct_general); // HERE
+                cb->args[1] = (unsigned long)ct;
+                ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, then the refcount of ct was already incremented.
+This 2nd increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+prevents cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
+
+This can be reproduced by running conntrack_resize.sh selftest in a loop.
+It takes ~20 minutes for me on a preemptible kernel on average before
+I see a runaway kworker spinning in nf_conntrack_cleanup_net_list.
+
+One fix would be to change this to:
+        if (res < 0) {
+               if (ct != last)
+                       nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
+A followup patch will do the same for ctnetlink_exp_dump_table;
+it looks to me as if it has the same problem and, like
+ctnetlink_dump_table, we only need a 'skip hint', not the actual
+object, so we can apply the same cookie strategy there as well.
+
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 2cc0fde23344..5fdcae45e0bc 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -884,8 +884,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+-      if (cb->args[1])
+-              nf_ct_put((struct nf_conn *)cb->args[1]);
+       kfree(cb->data);
+       return 0;
+ }
+@@ -1208,19 +1206,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+       return 0;
+ }
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++      unsigned long id = nf_ct_get_id(ct);
++
++      return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+       unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
+       struct net *net = sock_net(skb->sk);
+-      struct nf_conn *ct, *last;
++      unsigned long last_id = cb->args[1];
+       struct nf_conntrack_tuple_hash *h;
+       struct hlist_nulls_node *n;
+       struct nf_conn *nf_ct_evict[8];
++      struct nf_conn *ct;
+       int res, i;
+       spinlock_t *lockp;
+-      last = (struct nf_conn *)cb->args[1];
+       i = 0;
+       local_bh_disable();
+@@ -1257,7 +1262,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                               continue;
+                       if (cb->args[1]) {
+-                              if (ct != last)
++                              if (ctnetlink_get_id(ct) != last_id)
+                                       continue;
+                               cb->args[1] = 0;
+                       }
+@@ -1270,8 +1275,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                                           NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+                                           ct, true, flags);
+                       if (res < 0) {
+-                              nf_conntrack_get(&ct->ct_general);
+-                              cb->args[1] = (unsigned long)ct;
++                              cb->args[1] = ctnetlink_get_id(ct);
+                               spin_unlock(lockp);
+                               goto out;
+                       }
+@@ -1284,12 +1288,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+       }
+ out:
+       local_bh_enable();
+-      if (last) {
++      if (last_id) {
+               /* nf ct hash resize happened, now clear the leftover. */
+-              if ((struct nf_conn *)cb->args[1] == last)
++              if (cb->args[1] == last_id)
+                       cb->args[1] = 0;
+-
+-              nf_ct_put(last);
+       }
+       while (i) {
+-- 
+2.50.1
+
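The cookie-based resume pattern can be illustrated outside netlink. A
standalone, hypothetical C sketch (invented names, not ctnetlink code) of a
dump that stops when its per-call budget runs out and resumes from an integer
cookie instead of holding a reference to the last object; 0 is reserved for
"no resume point", mirroring ctnetlink_get_id():

#include <stdio.h>

#define N 5

static unsigned long ids[N] = { 0, 11, 22, 33, 44 };

static unsigned long get_cookie(unsigned long id)
{
	return id ? id : 1;	/* 0 means "no resume point" */
}

/* Emit up to 'budget' entries; return the cookie to resume from (0 = done). */
static unsigned long dump(unsigned long resume, int budget)
{
	int skipping = resume != 0;

	for (int i = 0; i < N; i++) {
		if (skipping) {
			if (get_cookie(ids[i]) != resume)
				continue;
			skipping = 0;	/* found the resume point, retry this entry */
		}
		if (!budget--)
			return get_cookie(ids[i]);	/* remember where to pick up */
		printf("dumped id %lu\n", ids[i]);
	}
	return 0;
}

int main(void)
{
	unsigned long cookie = 0;

	do {
		cookie = dump(cookie, 2);	/* small budget forces resumes */
	} while (cookie);

	return 0;
}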
diff --git a/queue-6.15/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch b/queue-6.15/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch
new file mode 100644 (file)
index 0000000..a89f303
--- /dev/null
@@ -0,0 +1,103 @@
+From cdcbf5a86ed33261d6360945f9d9033691f6bda8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 28 Jul 2025 15:26:49 +0900
+Subject: ptp: prevent possible ABBA deadlock in ptp_clock_freerun()
+
+From: Jeongjun Park <aha310510@gmail.com>
+
+[ Upstream commit 2efe41234dbd0a83fdb7cd38226c2f70039a2cd3 ]
+
+syzbot reported the following ABBA deadlock:
+
+       CPU0                           CPU1
+       ----                           ----
+  n_vclocks_store()
+    lock(&ptp->n_vclocks_mux) [1]
+        (physical clock)
+                                     pc_clock_adjtime()
+                                       lock(&clk->rwsem) [2]
+                                        (physical clock)
+                                       ...
+                                       ptp_clock_freerun()
+                                         ptp_vclock_in_use()
+                                           lock(&ptp->n_vclocks_mux) [3]
+                                              (physical clock)
+    ptp_clock_unregister()
+      posix_clock_unregister()
+        lock(&clk->rwsem) [4]
+          (virtual clock)
+
+Since ptp virtual clock is registered only under ptp physical clock, both
+ptp_clock and posix_clock must be physical clocks for ptp_vclock_in_use()
+to lock &ptp->n_vclocks_mux and check ptp->n_vclocks.
+
+However, when unregistering vclocks in n_vclocks_store(), the lock taken
+on ptp->n_vclocks_mux is a physical clock lock, but the clk->rwsem of
+ptp_clock_unregister() called through device_for_each_child_reverse()
+is a virtual clock lock.
+
+Therefore, clk->rwsem used in CPU0 and clk->rwsem used in CPU1 are
+different locks, but in lockdep, a false positive occurs because the
+possibility of deadlock is determined through lock-class.
+
+To solve this, a lock subclass annotation must be added to the posix_clock
+rwsem of the vclock.
+
+Reported-by: syzbot+7cfb66a237c4a5fb22ad@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=7cfb66a237c4a5fb22ad
+Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion")
+Signed-off-by: Jeongjun Park <aha310510@gmail.com>
+Acked-by: Richard Cochran <richardcochran@gmail.com>
+Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Link: https://patch.msgid.link/20250728062649.469882-1-aha310510@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/ptp/ptp_private.h | 5 +++++
+ drivers/ptp/ptp_vclock.c  | 7 +++++++
+ 2 files changed, 12 insertions(+)
+
+diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
+index a6aad743c282..b352df4cd3f9 100644
+--- a/drivers/ptp/ptp_private.h
++++ b/drivers/ptp/ptp_private.h
+@@ -24,6 +24,11 @@
+ #define PTP_DEFAULT_MAX_VCLOCKS 20
+ #define PTP_MAX_CHANNELS 2048
++enum {
++      PTP_LOCK_PHYSICAL = 0,
++      PTP_LOCK_VIRTUAL,
++};
++
+ struct timestamp_event_queue {
+       struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS];
+       int head;
+diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
+index 7febfdcbde8b..8ed4b8598924 100644
+--- a/drivers/ptp/ptp_vclock.c
++++ b/drivers/ptp/ptp_vclock.c
+@@ -154,6 +154,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
+       return PTP_VCLOCK_REFRESH_INTERVAL;
+ }
++static void ptp_vclock_set_subclass(struct ptp_clock *ptp)
++{
++      lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL);
++}
++
+ static const struct ptp_clock_info ptp_vclock_info = {
+       .owner          = THIS_MODULE,
+       .name           = "ptp virtual clock",
+@@ -213,6 +218,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock)
+               return NULL;
+       }
++      ptp_vclock_set_subclass(vclock->clock);
++
+       timecounter_init(&vclock->tc, &vclock->cc, 0);
+       ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL);
+-- 
+2.50.1
+
diff --git a/queue-6.15/riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch b/queue-6.15/riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch
new file mode 100644 (file)
index 0000000..d7a4126
--- /dev/null
@@ -0,0 +1,54 @@
+From 9340f5b60b7593637159f924e6d0f92ffc7effa9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 09:36:56 +0000
+Subject: riscv: dts: thead: Add APB clocks for TH1520 GMACs
+
+From: Yao Zi <ziyao@disroot.org>
+
+[ Upstream commit a7f75e2883c4bd57b12c3be61bb926929adad9c0 ]
+
+Describe perisys-apb4-hclk as the APB clock for TH1520 SoC, which is
+essential for accessing GMAC glue registers.
+
+Fixes: 7e756671a664 ("riscv: dts: thead: Add TH1520 ethernet nodes")
+Signed-off-by: Yao Zi <ziyao@disroot.org>
+Reviewed-by: Drew Fustini <fustini@kernel.org>
+Tested-by: Drew Fustini <fustini@kernel.org>
+Link: https://patch.msgid.link/20250808093655.48074-5-ziyao@disroot.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/riscv/boot/dts/thead/th1520.dtsi | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+diff --git a/arch/riscv/boot/dts/thead/th1520.dtsi b/arch/riscv/boot/dts/thead/th1520.dtsi
+index 527336417765..0aae4e6a5b33 100644
+--- a/arch/riscv/boot/dts/thead/th1520.dtsi
++++ b/arch/riscv/boot/dts/thead/th1520.dtsi
+@@ -286,8 +286,9 @@ gmac1: ethernet@ffe7060000 {
+                       reg-names = "dwmac", "apb";
+                       interrupts = <67 IRQ_TYPE_LEVEL_HIGH>;
+                       interrupt-names = "macirq";
+-                      clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC1>;
+-                      clock-names = "stmmaceth", "pclk";
++                      clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC1>,
++                               <&clk CLK_PERISYS_APB4_HCLK>;
++                      clock-names = "stmmaceth", "pclk", "apb";
+                       snps,pbl = <32>;
+                       snps,fixed-burst;
+                       snps,multicast-filter-bins = <64>;
+@@ -308,8 +309,9 @@ gmac0: ethernet@ffe7070000 {
+                       reg-names = "dwmac", "apb";
+                       interrupts = <66 IRQ_TYPE_LEVEL_HIGH>;
+                       interrupt-names = "macirq";
+-                      clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC0>;
+-                      clock-names = "stmmaceth", "pclk";
++                      clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC0>,
++                               <&clk CLK_PERISYS_APB4_HCLK>;
++                      clock-names = "stmmaceth", "pclk", "apb";
+                       snps,pbl = <32>;
+                       snps,fixed-burst;
+                       snps,multicast-filter-bins = <64>;
+-- 
+2.50.1
+
diff --git a/queue-6.15/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch b/queue-6.15/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
new file mode 100644 (file)
index 0000000..b0337e3
--- /dev/null
@@ -0,0 +1,73 @@
+From bccdcc26cdd1a2db18a98c8314555d471f0ea68b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:40:11 -0400
+Subject: sctp: linearize cloned gso packets in sctp_rcv
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ]
+
+A cloned head skb still shares these frag skbs in fraglist with the
+original head skb. It's not safe to access these frag skbs.
+
+syzbot reported two use-of-uninitialized-memory bugs caused by this:
+
+  BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+   sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+   sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998
+   sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88
+   sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331
+   sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122
+   __release_sock+0x1da/0x330 net/core/sock.c:3106
+   release_sock+0x6b/0x250 net/core/sock.c:3660
+   sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360
+   sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885
+   sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031
+   inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851
+   sock_sendmsg_nosec net/socket.c:718 [inline]
+
+and
+
+  BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+   sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+   sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88
+   sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331
+   sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148
+   __release_sock+0x1d3/0x330 net/core/sock.c:3213
+   release_sock+0x6b/0x270 net/core/sock.c:3767
+   sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367
+   sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886
+   sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032
+   inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851
+   sock_sendmsg_nosec net/socket.c:712 [inline]
+
+This patch fixes it by linearizing cloned gso packets in sctp_rcv().
+
+Fixes: 90017accff61 ("sctp: Add GSO support")
+Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com
+Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sctp/input.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sctp/input.c b/net/sctp/input.c
+index 0c0d2757f6f8..6fcdcaeed40e 100644
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -117,7 +117,7 @@ int sctp_rcv(struct sk_buff *skb)
+        * it's better to just linearize it otherwise crc computing
+        * takes longer.
+        */
+-      if ((!is_gso && skb_linearize(skb)) ||
++      if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+           !pskb_may_pull(skb, sizeof(struct sctphdr)))
+               goto discard_it;
+-- 
+2.50.1
+
index 2f0e67372d764cd3f3d3ed68fb3974f67235fae3..143d8e99f4489fc69884d8648c343e794377775a 100644 (file)
@@ -50,3 +50,26 @@ acpi-processor-perflib-move-problematic-pr-performance-check.patch
 block-make-req_op_zone_finish-a-write-operation.patch
 mm-memory-tier-fix-abstract-distance-calculation-overflow.patch
 mfd-cros_ec-separate-charge-control-probing-from-usb-pd.patch
+habanalabs-fix-uaf-in-export_dmabuf.patch
+mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch
+xfrm-restore-gso-for-sw-crypto.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
+net-hibmcge-fix-rtnl-deadlock-issue.patch
+net-hibmcge-fix-the-division-by-zero-issue.patch
+net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch
+net-ti-icssg-prueth-fix-emac-link-speed-handling.patch
+net-page_pool-allow-enabling-recycling-late-fix-fals.patch
+net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch
+sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
+net-lapbether-ignore-ops-locked-netdevs.patch
+hamradio-ignore-ops-locked-netdevs.patch
+erofs-fix-block-count-report-when-48-bit-layout-is-o.patch
+intel_idle-allow-loading-acpi-tables-for-any-family.patch
+cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
+net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch
+riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch
+ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch
+tls-handle-data-disappearing-from-under-the-tls-ulp.patch
+ipvs-fix-estimator-kthreads-preferred-affinity.patch
+net-kcm-fix-race-condition-in-kcm_unattach.patch
diff --git a/queue-6.15/tls-handle-data-disappearing-from-under-the-tls-ulp.patch b/queue-6.15/tls-handle-data-disappearing-from-under-the-tls-ulp.patch
new file mode 100644 (file)
index 0000000..0a9a3b4
--- /dev/null
@@ -0,0 +1,106 @@
+From 2f6ca8c2086da5826a2e90788e7990e6a81f6da8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 16:29:06 -0700
+Subject: tls: handle data disappearing from under the TLS ULP
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 6db015fc4b5d5f63a64a193f65d98da3a7fc811d ]
+
+TLS expects that it owns the receive queue of the TCP socket.
+This cannot be guaranteed in case the reader of the TCP socket
+entered before the TLS ULP was installed, or uses some non-standard
+read API (eg. zerocopy ones). Replace the WARN_ON() and a buggy
+early exit (which leaves anchor pointing to a freed skb) with real
+error handling. Wipe the parsing state and tell the reader to retry.
+
+We already reload the anchor every time we (re)acquire the socket lock,
+so the only condition we need to avoid is an out of bounds read
+(not having enough bytes in the socket for previously parsed record len).
+
+If some data was read from under TLS but there's enough in the queue,
+we'll reload and decrypt what is most likely not a valid TLS record,
+leading to some undefined behavior from the TLS perspective (corrupting
+a stream? missing an alert? missing an attack?), but no kernel crash
+should take place.
+
+Reported-by: William Liu <will@willsroot.io>
+Reported-by: Savino Dicanosa <savy@syst3mfailure.io>
+Link: https://lore.kernel.org/tFjq_kf7sWIG3A7CrCg_egb8CVsT_gsmHAK0_wxDPJXfIzxFAMxqmLwp3MlU5EHiet0AwwJldaaFdgyHpeIUCS-3m3llsmRzp9xIOBR4lAI=@syst3mfailure.io
+Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser")
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://patch.msgid.link/20250807232907.600366-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls.h      |  2 +-
+ net/tls/tls_strp.c | 11 ++++++++---
+ net/tls/tls_sw.c   |  3 ++-
+ 3 files changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/net/tls/tls.h b/net/tls/tls.h
+index 774859b63f0d..4e077068e6d9 100644
+--- a/net/tls/tls.h
++++ b/net/tls/tls.h
+@@ -196,7 +196,7 @@ void tls_strp_msg_done(struct tls_strparser *strp);
+ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb);
+ void tls_rx_msg_ready(struct tls_strparser *strp);
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
+ int tls_strp_msg_cow(struct tls_sw_context_rx *ctx);
+ struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx);
+ int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst);
+diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
+index 095cf31bae0b..d71643b494a1 100644
+--- a/net/tls/tls_strp.c
++++ b/net/tls/tls_strp.c
+@@ -475,7 +475,7 @@ static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len)
+       strp->stm.offset = offset;
+ }
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ {
+       struct strp_msg *rxm;
+       struct tls_msg *tlm;
+@@ -484,8 +484,11 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+       DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len);
+       if (!strp->copy_mode && force_refresh) {
+-              if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len))
+-                      return;
++              if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) {
++                      WRITE_ONCE(strp->msg_ready, 0);
++                      memset(&strp->stm, 0, sizeof(strp->stm));
++                      return false;
++              }
+               tls_strp_load_anchor_with_queue(strp, strp->stm.full_len);
+       }
+@@ -495,6 +498,8 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+       rxm->offset     = strp->stm.offset;
+       tlm = tls_msg(strp->anchor);
+       tlm->control    = strp->mark;
++
++      return true;
+ }
+ /* Called with lock held on lower socket */
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index 549d1ea01a72..51c98a007dda 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1384,7 +1384,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
+                       return sock_intr_errno(timeo);
+       }
+-      tls_strp_msg_load(&ctx->strp, released);
++      if (unlikely(!tls_strp_msg_load(&ctx->strp, released)))
++              return tls_rx_rec_wait(sk, psock, nonblock, false);
+       return 1;
+ }
+-- 
+2.50.1
+
diff --git a/queue-6.15/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-6.15/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
new file mode 100644 (file)
index 0000000..7dd6f0b
--- /dev/null
@@ -0,0 +1,51 @@
+From b41a25fd1983a12413c6fe2ac51da32c519ef08e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, so that we can't rely on offloads because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
+
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index a1aca6308677..4245522d4201 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -61,7 +61,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+       remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+       skb->remcsum_offload = remcsum;
+-      need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++      need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+       /* Try to offload checksum if possible */
+       offload_csum = !!(need_csum &&
+                         !need_ipsec &&
+-- 
+2.50.1
+
diff --git a/queue-6.15/xfrm-restore-gso-for-sw-crypto.patch b/queue-6.15/xfrm-restore-gso-for-sw-crypto.patch
new file mode 100644
index 0000000..2a34524
--- /dev/null
@@ -0,0 +1,58 @@
+From c4a6ec2c44c573d9dca08240a0c2e0c8ba20a461 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:25 +0200
+Subject: xfrm: restore GSO for SW crypto
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 234d1eff5d4987024be9d40ac07b918a5ae8db1a ]
+
+Commit 49431af6c4ef incorrectly assumes that the GSO path is only used
+by HW offload, but it's also useful for SW crypto.
+
+This patch re-enables GSO for SW crypto. It's not an exact revert, so
+that the later changes made to xfrm_dev_offload_ok are preserved, but
+it reverts all of the original commit's effects.
+
+Fixes: 49431af6c4ef ("xfrm: rely on XFRM offload")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
+Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/xfrm/xfrm_device.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
+index a2d3a5f3b485..a6c289858401 100644
+--- a/net/xfrm/xfrm_device.c
++++ b/net/xfrm/xfrm_device.c
+@@ -415,10 +415,12 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
+       struct net_device *dev = x->xso.dev;
+       bool check_tunnel_size;
+-      if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED)
++      if (!x->type_offload ||
++          (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED && x->encap))
+               return false;
+-      if ((dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) {
++      if ((!dev || dev == xfrm_dst_path(dst)->dev) &&
++          !xdst->child->xfrm) {
+               mtu = xfrm_state_mtu(x, xdst->child_mtu_cached);
+               if (skb->len <= mtu)
+                       goto ok;
+@@ -430,6 +432,9 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
+       return false;
+ ok:
++      if (!dev)
++              return true;
++
+       check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET &&
+                           x->props.mode == XFRM_MODE_TUNNEL;
+       switch (x->props.family) {
+-- 
+2.50.1
+
diff --git a/queue-6.16/bnxt-fill-data-page-pool-with-frags-if-page_size-bnx.patch b/queue-6.16/bnxt-fill-data-page-pool-with-frags-if-page_size-bnx.patch
new file mode 100644
index 0000000..a04ed34
--- /dev/null
@@ -0,0 +1,69 @@
+From dddfb590c42c3c450b2cac204c7b69ea3c79f1d1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Aug 2025 11:29:07 -0700
+Subject: bnxt: fill data page pool with frags if PAGE_SIZE > BNXT_RX_PAGE_SIZE
+
+From: David Wei <dw@davidwei.uk>
+
+[ Upstream commit 39f8fcda2088382a4aa70b258d6f7225aa386f11 ]
+
+The data page pool always fills the HW rx ring with pages. On arm64 with
+64K pages, this will waste _at least_ 32K of memory per entry in the rx
+ring.
+
+Fix by fragmenting the pages if PAGE_SIZE > BNXT_RX_PAGE_SIZE. This
+makes the data page pool the same as the header pool.
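+
+As an illustration only (sizes are made up, not taken from the driver),
+a small userspace sketch of the resulting frag layout on a large-page
+system:
+
+  #include <stdio.h>
+
+  #define PAGE_SIZE         (64 * 1024) /* e.g. arm64 64K pages */
+  #define BNXT_RX_PAGE_SIZE (4 * 1024)  /* illustrative HW buffer size */
+
+  int main(void)
+  {
+          /* one page now backs several ring entries, one per offset */
+          for (unsigned int off = 0; off < PAGE_SIZE; off += BNXT_RX_PAGE_SIZE)
+                  printf("ring entry at page offset %#x\n", off);
+          return 0;
+  }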
+
+Tested with iperf3 with a small (64 entries) rx ring to encourage buffer
+circulation.
+
+Fixes: cd1fafe7da1f ("eth: bnxt: add support rx side device memory TCP")
+Reviewed-by: Michael Chan <michael.chan@broadcom.com>
+Signed-off-by: David Wei <dw@davidwei.uk>
+Link: https://patch.msgid.link/20250812182907.1540755-1-dw@davidwei.uk
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c | 12 +++++++++---
+ 1 file changed, 9 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+index 8d950b43846e..e165490af6ac 100644
+--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+@@ -921,15 +921,21 @@ static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
+ static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping,
+                                        struct bnxt_rx_ring_info *rxr,
++                                       unsigned int *offset,
+                                        gfp_t gfp)
+ {
+       netmem_ref netmem;
+-      netmem = page_pool_alloc_netmems(rxr->page_pool, gfp);
++      if (PAGE_SIZE > BNXT_RX_PAGE_SIZE) {
++              netmem = page_pool_alloc_frag_netmem(rxr->page_pool, offset, BNXT_RX_PAGE_SIZE, gfp);
++      } else {
++              netmem = page_pool_alloc_netmems(rxr->page_pool, gfp);
++              *offset = 0;
++      }
+       if (!netmem)
+               return 0;
+-      *mapping = page_pool_get_dma_addr_netmem(netmem);
++      *mapping = page_pool_get_dma_addr_netmem(netmem) + *offset;
+       return netmem;
+ }
+@@ -1024,7 +1030,7 @@ static int bnxt_alloc_rx_netmem(struct bnxt *bp, struct bnxt_rx_ring_info *rxr,
+       dma_addr_t mapping;
+       netmem_ref netmem;
+-      netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, gfp);
++      netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, &offset, gfp);
+       if (!netmem)
+               return -ENOMEM;
+-- 
+2.50.1
+
diff --git a/queue-6.16/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch b/queue-6.16/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
new file mode 100644
index 0000000..f36bad1
--- /dev/null
@@ -0,0 +1,91 @@
+From 7cde332159d0e7a0dd2d95c8374ad52850b4db14 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Aug 2025 17:03:11 +0200
+Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data
+
+From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ]
+
+Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid
+discarding useful information") caused the number of wakeup interrupts
+to increase on an idle system [1], which was not expected to happen
+after merely allowing shallower idle states to be selected by the
+governor in some cases.
+
+However, on the system in question, all of the idle states deeper than
+WFI are rejected by the driver due to a firmware issue [2].  This causes
+the governor to only consider the recent interval duriation data
+corresponding to attempts to enter WFI that are successful and the
+recent invervals table is filled with values lower than the scheduler
+tick period.  Consequently, the governor predicts an idle duration
+below the scheduler tick period length and avoids stopping the tick
+more often which leads to the observed symptom.
+
+Address it by modifying the governor to update the recent intervals
+table also when entering the previously selected idle state fails, so
+it knows that the short idle intervals might have been the minority
+had the selected idle states been actually entered every time.
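+
+As a rough userspace sketch (INTERVALS and the UINT_MAX padding mirror
+the governor; the values are made up), the recent-intervals ring and
+the new failure marker work like this:
+
+  #include <limits.h>
+  #include <stdio.h>
+
+  #define INTERVALS 8
+
+  static unsigned int intervals[INTERVALS];
+  static int interval_ptr;
+
+  static void update_intervals(unsigned int interval_us)
+  {
+          intervals[interval_ptr++] = interval_us;
+          if (interval_ptr >= INTERVALS)
+                  interval_ptr = 0;
+  }
+
+  int main(void)
+  {
+          update_intervals(50);       /* short, measured WFI residency */
+          update_intervals(80);
+          update_intervals(UINT_MAX); /* selected state was rejected */
+          for (int i = 0; i < INTERVALS; i++)
+                  printf("%u ", intervals[i]);
+          printf("\n");
+          return 0;
+  }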
+
+Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information")
+Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1]
+Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2]
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Tested-by: Christian Loehle <christian.loehle@arm.com>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Christian Loehle <christian.loehle@arm.com>
+Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
+index 52d5d26fc7c6..81306612a5c6 100644
+--- a/drivers/cpuidle/governors/menu.c
++++ b/drivers/cpuidle/governors/menu.c
+@@ -97,6 +97,14 @@ static inline int which_bucket(u64 duration_ns)
+ static DEFINE_PER_CPU(struct menu_device, menu_devices);
++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us)
++{
++      /* Update the repeating-pattern data. */
++      data->intervals[data->interval_ptr++] = interval_us;
++      if (data->interval_ptr >= INTERVALS)
++              data->interval_ptr = 0;
++}
++
+ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
+ /*
+@@ -222,6 +230,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+       if (data->needs_update) {
+               menu_update(drv, dev);
+               data->needs_update = 0;
++      } else if (!dev->last_residency_ns) {
++              /*
++               * This happens when the driver rejects the previously selected
++               * idle state and returns an error, so update the recent
++               * intervals table to prevent invalid information from being
++               * used going forward.
++               */
++              menu_update_intervals(data, UINT_MAX);
+       }
+       /* Find the shortest expected idle interval. */
+@@ -482,10 +498,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+       data->correction_factor[data->bucket] = new_factor;
+-      /* update the repeating-pattern data */
+-      data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
+-      if (data->interval_ptr >= INTERVALS)
+-              data->interval_ptr = 0;
++      menu_update_intervals(data, ktime_to_us(measured_ns));
+ }
+ /**
+-- 
+2.50.1
+
diff --git a/queue-6.16/erofs-fix-block-count-report-when-48-bit-layout-is-o.patch b/queue-6.16/erofs-fix-block-count-report-when-48-bit-layout-is-o.patch
new file mode 100644
index 0000000..4e340df
--- /dev/null
@@ -0,0 +1,37 @@
+From 04b932e34aabb02993b5947cbdae75426353973a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 16:20:19 +0800
+Subject: erofs: fix block count report when 48-bit layout is on
+
+From: Gao Xiang <hsiangkao@linux.alibaba.com>
+
+[ Upstream commit 0b96d9bed324a1c1b7d02bfb9596351ef178428d ]
+
+Fix incorrect shift order when combining the 48-bit block count.
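+
+As an illustration only (not part of the fix, values are made up), a
+minimal userspace sketch shows why the old shift order corrupts the
+48-bit count:
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  int main(void)
+  {
+          uint64_t lo = 0x11223344; /* low 32 bits, already read */
+          uint16_t hi = 0x0055;     /* extra high bits of the 48-bit count */
+
+          uint64_t wrong = (lo << 32) | hi;           /* 0x1122334400000055 */
+          uint64_t right = lo | ((uint64_t)hi << 32); /* 0x0000005511223344 */
+
+          printf("wrong=%#llx right=%#llx\n",
+                 (unsigned long long)wrong, (unsigned long long)right);
+          return 0;
+  }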
+
+Fixes: 2e1473d5195f ("erofs: implement 48-bit block addressing for unencoded inodes")
+Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+Link: https://lore.kernel.org/r/20250807082019.3093539-1-hsiangkao@linux.alibaba.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/erofs/super.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/fs/erofs/super.c b/fs/erofs/super.c
+index e1e9f06e8342..799fef437aa8 100644
+--- a/fs/erofs/super.c
++++ b/fs/erofs/super.c
+@@ -313,8 +313,8 @@ static int erofs_read_superblock(struct super_block *sb)
+       sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
+       if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) {
+               sbi->root_nid = le64_to_cpu(dsb->rootnid_8b);
+-              sbi->dif0.blocks = (sbi->dif0.blocks << 32) |
+-                              le16_to_cpu(dsb->rb.blocks_hi);
++              sbi->dif0.blocks = sbi->dif0.blocks |
++                              ((u64)le16_to_cpu(dsb->rb.blocks_hi) << 32);
+       } else {
+               sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b);
+       }
+-- 
+2.50.1
+
diff --git a/queue-6.16/habanalabs-fix-uaf-in-export_dmabuf.patch b/queue-6.16/habanalabs-fix-uaf-in-export_dmabuf.patch
new file mode 100644
index 0000000..480d832
--- /dev/null
@@ -0,0 +1,96 @@
+From 94bb4ee76202eae8bd46a9e2051579b0bd2c8aa8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 12 Jul 2025 06:02:31 +0100
+Subject: habanalabs: fix UAF in export_dmabuf()
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+[ Upstream commit 33927f3d0ecdcff06326d6e4edb6166aed42811c ]
+
+As soon as we'd inserted a file reference into descriptor table, another
+thread could close it.  That's fine for the case when all we are doing is
+returning that descriptor to userland (it's a race, but it's a userland
+race and there's nothing the kernel can do about it).  However, if we
+follow fd_install() with any kind of access to objects that would be
+destroyed on close (be it the struct file itself or anything destroyed
+by its ->release()), we have a UAF.
+
+dma_buf_fd() is a combination of reserving a descriptor and fd_install().
+habanalabs export_dmabuf() calls it and then proceeds to access the
+objects destroyed on close.  In particular, it grabs an extra reference to
+another struct file that will be dropped as part of ->release() for ours;
+that "will be" is actually "might have already been".
+
+Fix that by reserving descriptor before anything else and do fd_install()
+only when everything had been set up.  As a side benefit, we no longer
+have the failure exit with file already created, but reference to
+underlying file (as well as ->dmabuf_export_cnt, etc.) not grabbed yet;
+unlike dma_buf_fd(), fd_install() can't fail.
+
+Fixes: db1a8dd916aa ("habanalabs: add support for dma-buf exporter")
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/accel/habanalabs/common/memory.c | 23 +++++++----------------
+ 1 file changed, 7 insertions(+), 16 deletions(-)
+
+diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c
+index 601fdbe70179..61472a381904 100644
+--- a/drivers/accel/habanalabs/common/memory.c
++++ b/drivers/accel/habanalabs/common/memory.c
+@@ -1829,9 +1829,6 @@ static void hl_release_dmabuf(struct dma_buf *dmabuf)
+       struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv;
+       struct hl_ctx *ctx;
+-      if (!hl_dmabuf)
+-              return;
+-
+       ctx = hl_dmabuf->ctx;
+       if (hl_dmabuf->memhash_hnode)
+@@ -1859,7 +1856,12 @@ static int export_dmabuf(struct hl_ctx *ctx,
+ {
+       DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+       struct hl_device *hdev = ctx->hdev;
+-      int rc, fd;
++      CLASS(get_unused_fd, fd)(flags);
++
++      if (fd < 0) {
++              dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
++              return fd;
++      }
+       exp_info.ops = &habanalabs_dmabuf_ops;
+       exp_info.size = total_size;
+@@ -1872,13 +1874,6 @@ static int export_dmabuf(struct hl_ctx *ctx,
+               return PTR_ERR(hl_dmabuf->dmabuf);
+       }
+-      fd = dma_buf_fd(hl_dmabuf->dmabuf, flags);
+-      if (fd < 0) {
+-              dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
+-              rc = fd;
+-              goto err_dma_buf_put;
+-      }
+-
+       hl_dmabuf->ctx = ctx;
+       hl_ctx_get(hl_dmabuf->ctx);
+       atomic_inc(&ctx->hdev->dmabuf_export_cnt);
+@@ -1890,13 +1885,9 @@ static int export_dmabuf(struct hl_ctx *ctx,
+       get_file(ctx->hpriv->file_priv->filp);
+       *dmabuf_fd = fd;
++      fd_install(take_fd(fd), hl_dmabuf->dmabuf->file);
+       return 0;
+-
+-err_dma_buf_put:
+-      hl_dmabuf->dmabuf->priv = NULL;
+-      dma_buf_put(hl_dmabuf->dmabuf);
+-      return rc;
+ }
+ static int validate_export_params_common(struct hl_device *hdev, u64 addr, u64 size, u64 offset)
+-- 
+2.50.1
+
diff --git a/queue-6.16/hamradio-ignore-ops-locked-netdevs.patch b/queue-6.16/hamradio-ignore-ops-locked-netdevs.patch
new file mode 100644
index 0000000..f0ac3d5
--- /dev/null
@@ -0,0 +1,62 @@
+From f83f4f27bb385d16cb1c9541d6b03c8088d15f0d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 14:37:26 -0700
+Subject: hamradio: ignore ops-locked netdevs
+
+From: Stanislav Fomichev <sdf@fomichev.me>
+
+[ Upstream commit c64237960819aee1766d03f446ae6de94b1e3f73 ]
+
+Syzkaller managed to trigger lock dependency in xsk_notify via
+register_netdevice. As discussed in [0], using register_netdevice
+in the notifiers is problematic so skip adding hamradio for ops-locked
+devices.
+
+       xsk_notifier+0x89/0x230 net/xdp/xsk.c:1664
+       notifier_call_chain+0x1b6/0x3e0 kernel/notifier.c:85
+       call_netdevice_notifiers_extack net/core/dev.c:2267 [inline]
+       call_netdevice_notifiers net/core/dev.c:2281 [inline]
+       unregister_netdevice_many_notify+0x14d7/0x1ff0 net/core/dev.c:12156
+       unregister_netdevice_many net/core/dev.c:12219 [inline]
+       unregister_netdevice_queue+0x33c/0x380 net/core/dev.c:12063
+       register_netdevice+0x1689/0x1ae0 net/core/dev.c:11241
+       bpq_new_device drivers/net/hamradio/bpqether.c:481 [inline]
+       bpq_device_event+0x491/0x600 drivers/net/hamradio/bpqether.c:523
+       notifier_call_chain+0x1b6/0x3e0 kernel/notifier.c:85
+       call_netdevice_notifiers_extack net/core/dev.c:2267 [inline]
+       call_netdevice_notifiers net/core/dev.c:2281 [inline]
+       __dev_notify_flags+0x18d/0x2e0 net/core/dev.c:-1
+       netif_change_flags+0xe8/0x1a0 net/core/dev.c:9608
+       dev_change_flags+0x130/0x260 net/core/dev_api.c:68
+       devinet_ioctl+0xbb4/0x1b50 net/ipv4/devinet.c:1200
+       inet_ioctl+0x3c0/0x4c0 net/ipv4/af_inet.c:1001
+
+0: https://lore.kernel.org/netdev/20250625140357.6203d0af@kernel.org/
+Fixes: 4c975fd70002 ("net: hold instance lock during NETDEV_REGISTER/UP")
+Suggested-by: Jakub Kicinski <kuba@kernel.org>
+Reported-by: syzbot+e6300f66a999a6612477@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e6300f66a999a6612477
+Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
+Link: https://patch.msgid.link/20250806213726.1383379-2-sdf@fomichev.me
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/hamradio/bpqether.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c
+index 0e0fe32d2da4..045c5177262e 100644
+--- a/drivers/net/hamradio/bpqether.c
++++ b/drivers/net/hamradio/bpqether.c
+@@ -138,7 +138,7 @@ static inline struct net_device *bpq_get_ax25_dev(struct net_device *dev)
+ static inline int dev_is_ethdev(struct net_device *dev)
+ {
+-      return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5);
++      return dev->type == ARPHRD_ETHER && !netdev_need_ops_lock(dev);
+ }
+ /* ------------------------------------------------------------------------ */
+-- 
+2.50.1
+
diff --git a/queue-6.16/intel_idle-allow-loading-acpi-tables-for-any-family.patch b/queue-6.16/intel_idle-allow-loading-acpi-tables-for-any-family.patch
new file mode 100644
index 0000000..aacb76e
--- /dev/null
@@ -0,0 +1,41 @@
+From 5bc45712394018e4bf1a06c92753e90d4d00252a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 15:37:14 -0400
+Subject: intel_idle: Allow loading ACPI tables for any family
+
+From: Len Brown <len.brown@intel.com>
+
+[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ]
+
+There is no reason to limit intel_idle's loading of ACPI tables to
+family 6.  Upcoming Intel processors are not in family 6.
+
+Below "Fixes" really means "applies cleanly until".
+That syntax commit didn't change the previous logic,
+but shows this patch applies back 5-years.
+
+Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros")
+Signed-off-by: Len Brown <len.brown@intel.com>
+Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/idle/intel_idle.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
+index 73747d20df85..91a7b7e7c0c8 100644
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -1679,7 +1679,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
+ };
+ static const struct x86_cpu_id intel_mwait_ids[] __initconst = {
+-      X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL),
++      X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL),
+       {}
+ };
+-- 
+2.50.1
+
diff --git a/queue-6.16/ipvs-fix-estimator-kthreads-preferred-affinity.patch b/queue-6.16/ipvs-fix-estimator-kthreads-preferred-affinity.patch
new file mode 100644
index 0000000..e2b91cf
--- /dev/null
@@ -0,0 +1,90 @@
+From f4ed6bb5279fbef65bd65697b4e54aa15facbcaa Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 29 Jul 2025 14:26:11 +0200
+Subject: ipvs: Fix estimator kthreads preferred affinity
+
+From: Frederic Weisbecker <frederic@kernel.org>
+
+[ Upstream commit c0a23bbc98e93704a1f4fb5e7e7bb2d7c0fb6eb3 ]
+
+The estimator kthreads' affinity is defined by preferences that sysctl
+can overwrite and is applied through a plain call to the scheduler's
+affinity API.
+
+However, since the introduction of managed kthread preferred affinity,
+such a practice bypasses the kthread core code, which eventually
+overwrites the target with the default unbound affinity.
+
+Fix this by using the appropriate kthread API.
+
+Fixes: d1a89197589c ("kthread: Default affine kthread to its preferred NUMA node")
+Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
+Acked-by: Julian Anastasov <ja@ssi.bg>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/ip_vs.h            | 13 +++++++++++++
+ kernel/kthread.c               |  1 +
+ net/netfilter/ipvs/ip_vs_est.c |  3 ++-
+ 3 files changed, 16 insertions(+), 1 deletion(-)
+
+diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
+index ff406ef4fd4a..29a36709e7f3 100644
+--- a/include/net/ip_vs.h
++++ b/include/net/ip_vs.h
+@@ -1163,6 +1163,14 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
+               return housekeeping_cpumask(HK_TYPE_KTHREAD);
+ }
++static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs)
++{
++      if (ipvs->est_cpulist_valid)
++              return ipvs->sysctl_est_cpulist;
++      else
++              return NULL;
++}
++
+ static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
+ {
+       return ipvs->sysctl_est_nice;
+@@ -1270,6 +1278,11 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
+       return housekeeping_cpumask(HK_TYPE_KTHREAD);
+ }
++static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs)
++{
++      return NULL;
++}
++
+ static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
+ {
+       return IPVS_EST_NICE;
+diff --git a/kernel/kthread.c b/kernel/kthread.c
+index 85fc068f0083..8d5e87b03d1e 100644
+--- a/kernel/kthread.c
++++ b/kernel/kthread.c
+@@ -894,6 +894,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
+       return ret;
+ }
++EXPORT_SYMBOL_GPL(kthread_affine_preferred);
+ /*
+  * Re-affine kthreads according to their preferences
+diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
+index f821ad2e19b3..15049b826732 100644
+--- a/net/netfilter/ipvs/ip_vs_est.c
++++ b/net/netfilter/ipvs/ip_vs_est.c
+@@ -265,7 +265,8 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
+       }
+       set_user_nice(kd->task, sysctl_est_nice(ipvs));
+-      set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));
++      if (sysctl_est_preferred_cpulist(ipvs))
++              kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs));
+       pr_info("starting estimator thread %d...\n", kd->id);
+       wake_up_process(kd->task);
+-- 
+2.50.1
+
diff --git a/queue-6.16/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch b/queue-6.16/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch
new file mode 100644
index 0000000..e95b025
--- /dev/null
@@ -0,0 +1,78 @@
+From fbc4eefd9eaf2473aa9cf85fa2deda55d8ec654d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 24 Jul 2025 17:09:56 +0800
+Subject: mm/smaps: fix race between smaps_hugetlb_range and migration
+
+From: Jinjiang Tu <tujinjiang@huawei.com>
+
+[ Upstream commit 45d19b4b6c2d422771c29b83462d84afcbb33f01 ]
+
+smaps_hugetlb_range() handles the pte without holding the ptl, and may run
+concurrently with migration, leading to BUG_ON in pfn_swap_entry_to_page().
+The race is as follows.
+
+smaps_hugetlb_range              migrate_pages
+  huge_ptep_get
+                                   remove_migration_ptes
+                                  folio_unlock
+  pfn_swap_entry_folio
+    BUG_ON
+
+To fix it, hold ptl lock in smaps_hugetlb_range().
+
+Link: https://lkml.kernel.org/r/20250724090958.455887-1-tujinjiang@huawei.com
+Link: https://lkml.kernel.org/r/20250724090958.455887-2-tujinjiang@huawei.com
+Fixes: 25ee01a2fca0 ("mm: hugetlb: proc: add hugetlb-related fields to /proc/PID/smaps")
+Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Andrei Vagin <avagin@gmail.com>
+Cc: Andrii Nakryiko <andrii@kernel.org>
+Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
+Cc: Brahmajit Das <brahmajit.xyz@gmail.com>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Dev Jain <dev.jain@arm.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Joern Engel <joern@logfs.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/proc/task_mmu.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
+index 751479eb128f..0102ab3aaec1 100644
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -1020,10 +1020,13 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+ {
+       struct mem_size_stats *mss = walk->private;
+       struct vm_area_struct *vma = walk->vma;
+-      pte_t ptent = huge_ptep_get(walk->mm, addr, pte);
+       struct folio *folio = NULL;
+       bool present = false;
++      spinlock_t *ptl;
++      pte_t ptent;
++      ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
++      ptent = huge_ptep_get(walk->mm, addr, pte);
+       if (pte_present(ptent)) {
+               folio = page_folio(pte_page(ptent));
+               present = true;
+@@ -1042,6 +1045,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+               else
+                       mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+       }
++      spin_unlock(ptl);
+       return 0;
+ }
+ #else
+-- 
+2.50.1
+
diff --git a/queue-6.16/net-hibmcge-fix-rtnl-deadlock-issue.patch b/queue-6.16/net-hibmcge-fix-rtnl-deadlock-issue.patch
new file mode 100644
index 0000000..f8558b7
--- /dev/null
@@ -0,0 +1,122 @@
+From f88d08d5c14c6994d3611fc3adc2f16564729220 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 18:27:56 +0800
+Subject: net: hibmcge: fix rtnl deadlock issue
+
+From: Jijie Shao <shaojijie@huawei.com>
+
+[ Upstream commit c875503a9b9082928d7d3fc60b5400d16fbfae4e ]
+
+Currently, the hibmcge netdev acquires the rtnl_lock in
+pci_error_handlers.reset_prepare() and releases it in
+pci_error_handlers.reset_done().
+
+However, in the PCI framework:
+pci_reset_bus - __pci_reset_slot - pci_slot_save_and_disable_locked -
+ pci_dev_save_and_disable - err_handler->reset_prepare(dev);
+
+In pci_slot_save_and_disable_locked():
+       list_for_each_entry(dev, &slot->bus->devices, bus_list) {
+               if (!dev->slot || dev->slot!= slot)
+                       continue;
+               pci_dev_save_and_disable(dev);
+               if (dev->subordinate)
+                       pci_bus_save_and_disable_locked(dev->subordinate);
+       }
+
+This will iterate through all devices under the current bus and execute
+err_handler->reset_prepare(), causing two devices of the hibmcge driver
+to sequentially request the rtnl_lock, leading to a deadlock.
+
+Since the driver now executes netif_device_detach()
+before the reset process, it will not run concurrently with
+other netdev APIs, so there is no need to hold the rtnl_lock now.
+
+Therefore, this patch removes the rtnl_lock during the reset process and
+adjusts the position of HBG_NIC_STATE_RESETTING to ensure
+that multiple resets are not executed concurrently.
+
+Fixes: 3f5a61f6d504f ("net: hibmcge: Add reset supported in this module")
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c | 14 +++++---------
+ 1 file changed, 5 insertions(+), 9 deletions(-)
+
+diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
+index ff3295b60a69..dee1e8681157 100644
+--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
+@@ -53,9 +53,11 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type)
+ {
+       int ret;
+-      ASSERT_RTNL();
++      if (test_and_set_bit(HBG_NIC_STATE_RESETTING, &priv->state))
++              return -EBUSY;
+       if (netif_running(priv->netdev)) {
++              clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+               dev_warn(&priv->pdev->dev,
+                        "failed to reset because port is up\n");
+               return -EBUSY;
+@@ -64,7 +66,6 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type)
+       netif_device_detach(priv->netdev);
+       priv->reset_type = type;
+-      set_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+       clear_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state);
+       ret = hbg_hw_event_notify(priv, HBG_HW_EVENT_RESET);
+       if (ret) {
+@@ -83,28 +84,25 @@ static int hbg_reset_done(struct hbg_priv *priv, enum hbg_reset_type type)
+           type != priv->reset_type)
+               return 0;
+-      ASSERT_RTNL();
+-
+-      clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+       ret = hbg_rebuild(priv);
+       if (ret) {
+               set_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state);
++              clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+               dev_err(&priv->pdev->dev, "failed to rebuild after reset\n");
+               return ret;
+       }
+       netif_device_attach(priv->netdev);
++      clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+       dev_info(&priv->pdev->dev, "reset done\n");
+       return ret;
+ }
+-/* must be protected by rtnl lock */
+ int hbg_reset(struct hbg_priv *priv)
+ {
+       int ret;
+-      ASSERT_RTNL();
+       ret = hbg_reset_prepare(priv, HBG_RESET_TYPE_FUNCTION);
+       if (ret)
+               return ret;
+@@ -169,7 +167,6 @@ static void hbg_pci_err_reset_prepare(struct pci_dev *pdev)
+       struct net_device *netdev = pci_get_drvdata(pdev);
+       struct hbg_priv *priv = netdev_priv(netdev);
+-      rtnl_lock();
+       hbg_reset_prepare(priv, HBG_RESET_TYPE_FLR);
+ }
+@@ -179,7 +176,6 @@ static void hbg_pci_err_reset_done(struct pci_dev *pdev)
+       struct hbg_priv *priv = netdev_priv(netdev);
+       hbg_reset_done(priv, HBG_RESET_TYPE_FLR);
+-      rtnl_unlock();
+ }
+ static const struct pci_error_handlers hbg_pci_err_handler = {
+-- 
+2.50.1
+
diff --git a/queue-6.16/net-hibmcge-fix-the-division-by-zero-issue.patch b/queue-6.16/net-hibmcge-fix-the-division-by-zero-issue.patch
new file mode 100644
index 0000000..67eff14
--- /dev/null
@@ -0,0 +1,46 @@
+From 2d5cc1e9320bffb1c936a0e982dce8ab8803a836 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 18:27:57 +0800
+Subject: net: hibmcge: fix the division by zero issue
+
+From: Jijie Shao <shaojijie@huawei.com>
+
+[ Upstream commit 7004b26f0b64331143eb0b312e77a357a11427ce ]
+
+When the network port is down, the queue is released, and ring->len is 0.
+In debugfs, hbg_get_queue_used_num() will be called,
+which may lead to a division by zero issue.
+
+This patch adds a check: if ring->len is 0,
+hbg_get_queue_used_num() directly returns 0.
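+
+A minimal userspace sketch of the index math and the new guard (the
+indices and ring length are illustrative, not from the hardware):
+
+  #include <stdio.h>
+
+  static unsigned int queue_used(unsigned int ntu, unsigned int ntc,
+                                 unsigned int len)
+  {
+          if (!len)       /* ring released while the port is down */
+                  return 0;
+          return (ntu + len - ntc) % len;
+  }
+
+  int main(void)
+  {
+          printf("%u\n", queue_used(5, 2, 8)); /* 3 entries in use */
+          printf("%u\n", queue_used(1, 6, 8)); /* 3 entries after wrap */
+          printf("%u\n", queue_used(0, 0, 0)); /* freed ring: 0, no div-by-0 */
+          return 0;
+  }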
+
+Fixes: 40735e7543f9 ("net: hibmcge: Implement .ndo_start_xmit function")
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h
+index 2883a5899ae2..8b6110599e10 100644
+--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h
++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h
+@@ -29,7 +29,12 @@ static inline bool hbg_fifo_is_full(struct hbg_priv *priv, enum hbg_dir dir)
+ static inline u32 hbg_get_queue_used_num(struct hbg_ring *ring)
+ {
+-      return (ring->ntu + ring->len - ring->ntc) % ring->len;
++      u32 len = READ_ONCE(ring->len);
++
++      if (!len)
++              return 0;
++
++      return (READ_ONCE(ring->ntu) + len - READ_ONCE(ring->ntc)) % len;
+ }
+ netdev_tx_t hbg_net_start_xmit(struct sk_buff *skb, struct net_device *netdev);
+-- 
+2.50.1
+
diff --git a/queue-6.16/net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch b/queue-6.16/net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch
new file mode 100644
index 0000000..53b232d
--- /dev/null
@@ -0,0 +1,68 @@
+From d8162f09505cdd9f80470c636561332cc9e2e7d0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 18:27:58 +0800
+Subject: net: hibmcge: fix the np_link_fail error reporting issue
+
+From: Jijie Shao <shaojijie@huawei.com>
+
+[ Upstream commit 62c50180ffda01468e640ac14925503796f255e2 ]
+
+Currently, after modifying device port mode, the np_link_ok state
+is immediately checked. At this point, the device may not be ready yet,
+leading to the querying of an intermediate state.
+
+This patch will poll to check if np_link is ok after
+modifying device port mode, and only report np_link_fail upon timeout.
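+
+Conceptually (a userspace sketch with a stand-in for the register read;
+the timings follow the patch, everything else is illustrative):
+
+  #include <stdbool.h>
+  #include <stdio.h>
+  #include <unistd.h>
+
+  #define WAIT_INTERVAL_US (5 * 1000)
+  #define WAIT_TIMEOUT_US  (500 * 1000)
+
+  static bool np_link_ok(void)
+  {
+          static int polls;
+          return ++polls > 20; /* pretend the link comes up after ~100ms */
+  }
+
+  int main(void)
+  {
+          unsigned int waited = 0;
+
+          while (!np_link_ok()) {
+                  if (waited >= WAIT_TIMEOUT_US) {
+                          puts("np_link_fail"); /* report only on timeout */
+                          return 1;
+                  }
+                  usleep(WAIT_INTERVAL_US);
+                  waited += WAIT_INTERVAL_US;
+          }
+          puts("link ok");
+          return 0;
+  }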
+
+Fixes: e0306637e85d ("net: hibmcge: Add support for mac link exception handling feature")
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c | 15 +++++++++++++--
+ 1 file changed, 13 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c
+index 9b65eef62b3f..2844124f306d 100644
+--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c
++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c
+@@ -12,6 +12,8 @@
+ #define HBG_HW_EVENT_WAIT_TIMEOUT_US  (2 * 1000 * 1000)
+ #define HBG_HW_EVENT_WAIT_INTERVAL_US (10 * 1000)
++#define HBG_MAC_LINK_WAIT_TIMEOUT_US  (500 * 1000)
++#define HBG_MAC_LINK_WAIT_INTERVAL_US (5 * 1000)
+ /* little endian or big endian.
+  * ctrl means packet description, data means skb packet data
+  */
+@@ -213,6 +215,9 @@ void hbg_hw_fill_buffer(struct hbg_priv *priv, u32 buffer_dma_addr)
+ void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex)
+ {
++      u32 link_status;
++      int ret;
++
+       hbg_hw_mac_enable(priv, HBG_STATUS_DISABLE);
+       hbg_reg_write_field(priv, HBG_REG_PORT_MODE_ADDR,
+@@ -224,8 +229,14 @@ void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex)
+       hbg_hw_mac_enable(priv, HBG_STATUS_ENABLE);
+-      if (!hbg_reg_read_field(priv, HBG_REG_AN_NEG_STATE_ADDR,
+-                              HBG_REG_AN_NEG_STATE_NP_LINK_OK_B))
++      /* wait MAC link up */
++      ret = readl_poll_timeout(priv->io_base + HBG_REG_AN_NEG_STATE_ADDR,
++                               link_status,
++                               FIELD_GET(HBG_REG_AN_NEG_STATE_NP_LINK_OK_B,
++                                         link_status),
++                               HBG_MAC_LINK_WAIT_INTERVAL_US,
++                               HBG_MAC_LINK_WAIT_TIMEOUT_US);
++      if (ret)
+               hbg_np_link_fail_task_schedule(priv);
+ }
+-- 
+2.50.1
+
diff --git a/queue-6.16/net-kcm-fix-race-condition-in-kcm_unattach.patch b/queue-6.16/net-kcm-fix-race-condition-in-kcm_unattach.patch
new file mode 100644
index 0000000..6246393
--- /dev/null
@@ -0,0 +1,88 @@
+From b6830a257156db66722ed8d61507ba2528f6c0a0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Aug 2025 21:18:03 +0200
+Subject: net: kcm: Fix race condition in kcm_unattach()
+
+From: Sven Stegemann <sven@stegemann.de>
+
+[ Upstream commit 52565a935213cd6a8662ddb8efe5b4219343a25d ]
+
+syzbot found a race condition when kcm_unattach(psock)
+and kcm_release(kcm) are executed at the same time.
+
+kcm_unattach() is missing a check of the flag
+kcm->tx_stopped before calling queue_work().
+
+If the kcm has a reserved psock, kcm_unattach() might get executed
+between cancel_work_sync() and unreserve_psock() in kcm_release(),
+requeuing kcm->tx_work right before kcm gets freed in kcm_done().
+
+Remove kcm->tx_stopped and replace it by the less
+error-prone disable_work_sync().
+
+Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module")
+Reported-by: syzbot+e62c9db591c30e174662@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e62c9db591c30e174662
+Reported-by: syzbot+d199b52665b6c3069b94@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=d199b52665b6c3069b94
+Reported-by: syzbot+be6b1fdfeae512726b4e@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=be6b1fdfeae512726b4e
+Signed-off-by: Sven Stegemann <sven@stegemann.de>
+Link: https://patch.msgid.link/20250812191810.27777-1-sven@stegemann.de
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/kcm.h |  1 -
+ net/kcm/kcmsock.c | 10 ++--------
+ 2 files changed, 2 insertions(+), 9 deletions(-)
+
+diff --git a/include/net/kcm.h b/include/net/kcm.h
+index 441e993be634..d9c35e71ecea 100644
+--- a/include/net/kcm.h
++++ b/include/net/kcm.h
+@@ -71,7 +71,6 @@ struct kcm_sock {
+       struct list_head wait_psock_list;
+       struct sk_buff *seq_skb;
+       struct mutex tx_mutex;
+-      u32 tx_stopped : 1;
+       /* Don't use bit fields here, these are set under different locks */
+       bool tx_wait;
+diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
+index c05047dad62d..d0a001ebabfe 100644
+--- a/net/kcm/kcmsock.c
++++ b/net/kcm/kcmsock.c
+@@ -430,7 +430,7 @@ static void psock_write_space(struct sock *sk)
+       /* Check if the socket is reserved so someone is waiting for sending. */
+       kcm = psock->tx_kcm;
+-      if (kcm && !unlikely(kcm->tx_stopped))
++      if (kcm)
+               queue_work(kcm_wq, &kcm->tx_work);
+       spin_unlock_bh(&mux->lock);
+@@ -1694,12 +1694,6 @@ static int kcm_release(struct socket *sock)
+        */
+       __skb_queue_purge(&sk->sk_write_queue);
+-      /* Set tx_stopped. This is checked when psock is bound to a kcm and we
+-       * get a writespace callback. This prevents further work being queued
+-       * from the callback (unbinding the psock occurs after canceling work.
+-       */
+-      kcm->tx_stopped = 1;
+-
+       release_sock(sk);
+       spin_lock_bh(&mux->lock);
+@@ -1715,7 +1709,7 @@ static int kcm_release(struct socket *sock)
+       /* Cancel work. After this point there should be no outside references
+        * to the kcm socket.
+        */
+-      cancel_work_sync(&kcm->tx_work);
++      disable_work_sync(&kcm->tx_work);
+       lock_sock(sk);
+       psock = kcm->tx_psock;
+-- 
+2.50.1
+
diff --git a/queue-6.16/net-lapbether-ignore-ops-locked-netdevs.patch b/queue-6.16/net-lapbether-ignore-ops-locked-netdevs.patch
new file mode 100644 (file)
index 0000000..0b2d2df
--- /dev/null
@@ -0,0 +1,64 @@
+From 3fdafaf1ab15d2a63c9237a845b9c1448d472fe2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 14:37:25 -0700
+Subject: net: lapbether: ignore ops-locked netdevs
+
+From: Stanislav Fomichev <sdf@fomichev.me>
+
+[ Upstream commit 53898ebabe843bfa7baea9dae152797d5d0563c9 ]
+
+Syzkaller managed to trigger lock dependency in xsk_notify via
+register_netdevice. As discussed in [0], using register_netdevice
+in the notifiers is problematic so skip adding lapbeth for ops-locked
+devices.
+
+       xsk_notifier+0xa4/0x280 net/xdp/xsk.c:1645
+       notifier_call_chain+0xbc/0x410 kernel/notifier.c:85
+       call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:2230
+       call_netdevice_notifiers_extack net/core/dev.c:2268 [inline]
+       call_netdevice_notifiers net/core/dev.c:2282 [inline]
+       unregister_netdevice_many_notify+0xf9d/0x2700 net/core/dev.c:12077
+       unregister_netdevice_many net/core/dev.c:12140 [inline]
+       unregister_netdevice_queue+0x305/0x3f0 net/core/dev.c:11984
+       register_netdevice+0x18f1/0x2270 net/core/dev.c:11149
+       lapbeth_new_device drivers/net/wan/lapbether.c:420 [inline]
+       lapbeth_device_event+0x5b1/0xbe0 drivers/net/wan/lapbether.c:462
+       notifier_call_chain+0xbc/0x410 kernel/notifier.c:85
+       call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:2230
+       call_netdevice_notifiers_extack net/core/dev.c:2268 [inline]
+       call_netdevice_notifiers net/core/dev.c:2282 [inline]
+       __dev_notify_flags+0x12c/0x2e0 net/core/dev.c:9497
+       netif_change_flags+0x108/0x160 net/core/dev.c:9526
+       dev_change_flags+0xba/0x250 net/core/dev_api.c:68
+       devinet_ioctl+0x11d5/0x1f50 net/ipv4/devinet.c:1200
+       inet_ioctl+0x3a7/0x3f0 net/ipv4/af_inet.c:1001
+
+0: https://lore.kernel.org/netdev/20250625140357.6203d0af@kernel.org/
+Fixes: 4c975fd70002 ("net: hold instance lock during NETDEV_REGISTER/UP")
+Suggested-by: Jakub Kicinski <kuba@kernel.org>
+Reported-by: syzbot+e67ea9c235b13b4f0020@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e67ea9c235b13b4f0020
+Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
+Link: https://patch.msgid.link/20250806213726.1383379-1-sdf@fomichev.me
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/wan/lapbether.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c
+index 995a7207bdf8..f357a7ac70ac 100644
+--- a/drivers/net/wan/lapbether.c
++++ b/drivers/net/wan/lapbether.c
+@@ -81,7 +81,7 @@ static struct lapbethdev *lapbeth_get_x25_dev(struct net_device *dev)
+ static __inline__ int dev_is_ethdev(struct net_device *dev)
+ {
+-      return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5);
++      return dev->type == ARPHRD_ETHER && !netdev_need_ops_lock(dev);
+ }
+ /* ------------------------------------------------------------------------ */
+-- 
+2.50.1
+
diff --git a/queue-6.16/net-mdiobus-release-reset_gpio-in-mdiobus_unregister.patch b/queue-6.16/net-mdiobus-release-reset_gpio-in-mdiobus_unregister.patch
new file mode 100644 (file)
index 0000000..b11abd1
--- /dev/null
@@ -0,0 +1,62 @@
+From 133eb2c7ce6134ab45bff3f7632a07aa5bf086a5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:54:49 +0200
+Subject: net: mdiobus: release reset_gpio in mdiobus_unregister_device()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Buday Csaba <buday.csaba@prolan.hu>
+
+[ Upstream commit 8ea25274ebaf2f6be8be374633b2ed8348ec0e70 ]
+
+reset_gpio is claimed in mdiobus_register_device(), but it is not
+released in mdiobus_unregister_device(). It is instead only
+released when the whole MDIO bus is unregistered.
+When a device uses the reset_gpio property, it becomes impossible
+to unregister it and register it again, because the GPIO remains
+claimed.
+This patch resolves that issue.
+
+Fixes: bafbdd527d56 ("phylib: Add device reset GPIO support") # see notes
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Cc: Csókás Bence <csokas.bence@prolan.hu>
+[ csokas.bence: Resolve rebase conflict and clarify msg ]
+Signed-off-by: Buday Csaba <buday.csaba@prolan.hu>
+Link: https://patch.msgid.link/20250807135449.254254-2-csokas.bence@prolan.hu
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/phy/mdio_bus.c          | 1 +
+ drivers/net/phy/mdio_bus_provider.c | 3 ---
+ 2 files changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
+index fda2e27c1810..cad6ed3aa10b 100644
+--- a/drivers/net/phy/mdio_bus.c
++++ b/drivers/net/phy/mdio_bus.c
+@@ -91,6 +91,7 @@ int mdiobus_unregister_device(struct mdio_device *mdiodev)
+       if (mdiodev->bus->mdio_map[mdiodev->addr] != mdiodev)
+               return -EINVAL;
++      gpiod_put(mdiodev->reset_gpio);
+       reset_control_put(mdiodev->reset_ctrl);
+       mdiodev->bus->mdio_map[mdiodev->addr] = NULL;
+diff --git a/drivers/net/phy/mdio_bus_provider.c b/drivers/net/phy/mdio_bus_provider.c
+index 65850e36284d..5401170f14e5 100644
+--- a/drivers/net/phy/mdio_bus_provider.c
++++ b/drivers/net/phy/mdio_bus_provider.c
+@@ -444,9 +444,6 @@ void mdiobus_unregister(struct mii_bus *bus)
+               if (!mdiodev)
+                       continue;
+-              if (mdiodev->reset_gpio)
+-                      gpiod_put(mdiodev->reset_gpio);
+-
+               mdiodev->device_remove(mdiodev);
+               mdiodev->device_free(mdiodev);
+       }
+-- 
+2.50.1
+
diff --git a/queue-6.16/net-page_pool-allow-enabling-recycling-late-fix-fals.patch b/queue-6.16/net-page_pool-allow-enabling-recycling-late-fix-fals.patch
new file mode 100644 (file)
index 0000000..29d91c0
--- /dev/null
@@ -0,0 +1,174 @@
+From a2759ceb0274ed9b8379560e6e516170a71b2101 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 17:36:54 -0700
+Subject: net: page_pool: allow enabling recycling late, fix false positive
+ warning
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 64fdaa94bfe0cca3a0f4b2dd922486c5f59fe678 ]
+
+Page pool can have pages "directly" (locklessly) recycled to it,
+if the NAPI that owns the page pool is scheduled to run on the same CPU.
+To make this safe we check that the NAPI is disabled while we destroy
+the page pool. In most cases NAPI and page pool lifetimes are tied
+together so this happens naturally.
+
+The queue API expects the following order of calls:
+ -> mem_alloc
+    alloc new pp
+ -> stop
+    napi_disable
+ -> start
+    napi_enable
+ -> mem_free
+    free old pp
+
+Here we allocate the page pool in ->mem_alloc and free in ->mem_free.
+But the NAPIs are only stopped between ->stop and ->start. We created
+page_pool_disable_direct_recycling() to safely shut down the recycling
+in ->stop. This way the page_pool_destroy() call in ->mem_free doesn't
+have to worry about recycling any more.
+
+Unfortunately, the page_pool_disable_direct_recycling() is not enough
+to deal with failures which necessitate freeing the _new_ page pool.
+If we hit a failure in ->mem_alloc or ->stop the new page pool has
+to be freed while the NAPI is active (assuming driver attaches the
+page pool to an existing NAPI instance and doesn't reallocate NAPIs).
+
+Freeing the new page pool is technically safe because it hasn't been
+used for any packets, yet, so there can be no recycling. But the check
+in napi_assert_will_not_race() has no way of knowing that. We could
+check if page pool is empty but that'd make the check much less likely
+to trigger during development.
+
+Add page_pool_enable_direct_recycling(), pairing with
+page_pool_disable_direct_recycling(). It will allow us to create the new
+page pools in "disabled" state and only enable recycling when we know
+the reconfig operation will not fail.
+
+Coincidentally it will also let us re-enable the recycling for the old
+pool, if the reconfig failed:
+
+ -> mem_alloc (new)
+ -> stop (old)
+    # disables direct recycling for old
+ -> start (new)
+    # fail!!
+ -> start (old)
+    # go back to old pp but direct recycling is lost :(
+ -> mem_free (new)
+
+The new helper is idempotent to make life easier for drivers,
+which can operate in HDS mode and support zero-copy Rx.
+The driver can call the helper twice whether there are two pools
+or it has multiple references to a single pool.
+
+Fixes: 40eca00ae605 ("bnxt_en: unlink page pool when stopping Rx queue")
+Tested-by: David Wei <dw@davidwei.uk>
+Link: https://patch.msgid.link/20250805003654.2944974-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c |  9 ++++++-
+ include/net/page_pool/types.h             |  2 ++
+ net/core/page_pool.c                      | 29 +++++++++++++++++++++++
+ 3 files changed, 39 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+index 243cb13cb01c..8d950b43846e 100644
+--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+@@ -3810,7 +3810,6 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
+       if (BNXT_RX_PAGE_MODE(bp))
+               pp.pool_size += bp->rx_ring_size;
+       pp.nid = numa_node;
+-      pp.napi = &rxr->bnapi->napi;
+       pp.netdev = bp->dev;
+       pp.dev = &bp->pdev->dev;
+       pp.dma_dir = bp->rx_dir;
+@@ -3842,6 +3841,12 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
+       return PTR_ERR(pool);
+ }
++static void bnxt_enable_rx_page_pool(struct bnxt_rx_ring_info *rxr)
++{
++      page_pool_enable_direct_recycling(rxr->head_pool, &rxr->bnapi->napi);
++      page_pool_enable_direct_recycling(rxr->page_pool, &rxr->bnapi->napi);
++}
++
+ static int bnxt_alloc_rx_agg_bmap(struct bnxt *bp, struct bnxt_rx_ring_info *rxr)
+ {
+       u16 mem_size;
+@@ -3880,6 +3885,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
+               rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node);
+               if (rc)
+                       return rc;
++              bnxt_enable_rx_page_pool(rxr);
+               rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i, 0);
+               if (rc < 0)
+@@ -16042,6 +16048,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx)
+                       goto err_reset;
+       }
++      bnxt_enable_rx_page_pool(rxr);
+       napi_enable_locked(&bnapi->napi);
+       bnxt_db_nq_arm(bp, &cpr->cp_db, cpr->cp_raw_cons);
+diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
+index 431b593de709..1509a536cb85 100644
+--- a/include/net/page_pool/types.h
++++ b/include/net/page_pool/types.h
+@@ -265,6 +265,8 @@ struct page_pool *page_pool_create_percpu(const struct page_pool_params *params,
+ struct xdp_mem_info;
+ #ifdef CONFIG_PAGE_POOL
++void page_pool_enable_direct_recycling(struct page_pool *pool,
++                                     struct napi_struct *napi);
+ void page_pool_disable_direct_recycling(struct page_pool *pool);
+ void page_pool_destroy(struct page_pool *pool);
+ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
+diff --git a/net/core/page_pool.c b/net/core/page_pool.c
+index ba7cf3e3c32f..368412baad26 100644
+--- a/net/core/page_pool.c
++++ b/net/core/page_pool.c
+@@ -1201,6 +1201,35 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
+       pool->xdp_mem_id = mem->id;
+ }
++/**
++ * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI
++ * @pool: page pool to modify
++ * @napi: NAPI instance to associate the page pool with
++ *
++ * Associate a page pool with a NAPI instance for lockless page recycling.
++ * This is useful when a new page pool has to be added to a NAPI instance
++ * without disabling that NAPI instance, to mark the point at which control
++ * path "hands over" the page pool to the NAPI instance. In most cases driver
++ * can simply set the @napi field in struct page_pool_params, and does not
++ * have to call this helper.
++ *
++ * The function is idempotent, but does not implement any refcounting.
++ * Single page_pool_disable_direct_recycling() will disable recycling,
++ * no matter how many times enable was called.
++ */
++void page_pool_enable_direct_recycling(struct page_pool *pool,
++                                     struct napi_struct *napi)
++{
++      if (READ_ONCE(pool->p.napi) == napi)
++              return;
++      WARN_ON(!napi || pool->p.napi);
++
++      mutex_lock(&page_pools_lock);
++      WRITE_ONCE(pool->p.napi, napi);
++      mutex_unlock(&page_pools_lock);
++}
++EXPORT_SYMBOL(page_pool_enable_direct_recycling);
++
+ void page_pool_disable_direct_recycling(struct page_pool *pool)
+ {
+       /* Disable direct recycling based on pool->cpuid.
+-- 
+2.50.1
+
diff --git a/queue-6.16/net-phy-nxp-c45-tja11xx-fix-the-phy-id-mismatch-issu.patch b/queue-6.16/net-phy-nxp-c45-tja11xx-fix-the-phy-id-mismatch-issu.patch
new file mode 100644 (file)
index 0000000..ec00381
--- /dev/null
@@ -0,0 +1,75 @@
+From 03536af7c40cc0517826bb09f63fd02513d39540 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 12:08:32 +0800
+Subject: net: phy: nxp-c45-tja11xx: fix the PHY ID mismatch issue when using
+ C45
+
+From: Clark Wang <xiaoning.wang@nxp.com>
+
+[ Upstream commit 8ee90742cf29427683294a6a80f1e2b7f4af1cff ]
+
+TJA1103/04/20/21 support both C22 and C45 accessing methods.
+
+The TJA11xx driver has implemented the match_phy_device() API.
+However, it does not handle the C45 ID. If C45 was used to access a
+TJA11xx, match_phy_device() would always return false because
+phydev->phy_id, which is only populated by C22 access, is empty,
+resulting in the generic phy driver being used for TJA11xx PHYs.
+
+Therefore, check phydev->c45_ids.device_ids[MDIO_MMD_PMAPMD] when
+using C45.
+
+Fixes: 1b76b2497aba ("net: phy: nxp-c45-tja11xx: simplify .match_phy_device OP")
+Signed-off-by: Clark Wang <xiaoning.wang@nxp.com>
+Link: https://patch.msgid.link/20250807040832.2455306-1-xiaoning.wang@nxp.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/phy/nxp-c45-tja11xx.c | 23 +++++++++++++----------
+ 1 file changed, 13 insertions(+), 10 deletions(-)
+
+diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c
+index 4c6d905f0a9f..87adb6508017 100644
+--- a/drivers/net/phy/nxp-c45-tja11xx.c
++++ b/drivers/net/phy/nxp-c45-tja11xx.c
+@@ -1965,24 +1965,27 @@ static int nxp_c45_macsec_ability(struct phy_device *phydev)
+       return macsec_ability;
+ }
++static bool tja11xx_phy_id_compare(struct phy_device *phydev,
++                                 const struct phy_driver *phydrv)
++{
++      u32 id = phydev->is_c45 ? phydev->c45_ids.device_ids[MDIO_MMD_PMAPMD] :
++                                phydev->phy_id;
++
++      return phy_id_compare(id, phydrv->phy_id, phydrv->phy_id_mask);
++}
++
+ static int tja11xx_no_macsec_match_phy_device(struct phy_device *phydev,
+                                             const struct phy_driver *phydrv)
+ {
+-      if (!phy_id_compare(phydev->phy_id, phydrv->phy_id,
+-                          phydrv->phy_id_mask))
+-              return 0;
+-
+-      return !nxp_c45_macsec_ability(phydev);
++      return tja11xx_phy_id_compare(phydev, phydrv) &&
++             !nxp_c45_macsec_ability(phydev);
+ }
+ static int tja11xx_macsec_match_phy_device(struct phy_device *phydev,
+                                          const struct phy_driver *phydrv)
+ {
+-      if (!phy_id_compare(phydev->phy_id, phydrv->phy_id,
+-                          phydrv->phy_id_mask))
+-              return 0;
+-
+-      return nxp_c45_macsec_ability(phydev);
++      return tja11xx_phy_id_compare(phydev, phydrv) &&
++             nxp_c45_macsec_ability(phydev);
+ }
+ static const struct nxp_c45_regmap tja1120_regmap = {
+-- 
+2.50.1
+
diff --git a/queue-6.16/net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch b/queue-6.16/net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch
new file mode 100644 (file)
index 0000000..3fbac86
--- /dev/null
@@ -0,0 +1,69 @@
+From c53fdb12017debb6f4da4eac9248b6aa4ecc5bb5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 09:36:55 +0000
+Subject: net: stmmac: thead: Get and enable APB clock on initialization
+
+From: Yao Zi <ziyao@disroot.org>
+
+[ Upstream commit 4cc339ce482ba78589a2d5cbe1c84b735d263383 ]
+
+It's necessary to adjust the MAC TX clock when the linkspeed changes,
+but it's noted such adjustment always fails on TH1520 SoC, and reading
+back from APB glue registers that control clock generation results in
+garbage, causing broken link.
+
+With some testing, it's found a clock must be ungated for access to APB
+glue registers. Without any consumer, the clock is automatically
+disabled during late kernel startup. Let's get and enable it if it's
+described in devicetree.
+
+For backward compatibility with older devicetrees, probing won't fail if
+the APB clock isn't found. In this case, we emit a warning since the
+link will break if the speed changes.
+
+Fixes: 33a1a01e3afa ("net: stmmac: Add glue layer for T-HEAD TH1520 SoC")
+Signed-off-by: Yao Zi <ziyao@disroot.org>
+Tested-by: Drew Fustini <fustini@kernel.org>
+Reviewed-by: Drew Fustini <fustini@kernel.org>
+Link: https://patch.msgid.link/20250808093655.48074-4-ziyao@disroot.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c | 14 ++++++++++++++
+ 1 file changed, 14 insertions(+)
+
+diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
+index c72ee759aae5..f2946bea0bc2 100644
+--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
++++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
+@@ -211,6 +211,7 @@ static int thead_dwmac_probe(struct platform_device *pdev)
+       struct stmmac_resources stmmac_res;
+       struct plat_stmmacenet_data *plat;
+       struct thead_dwmac *dwmac;
++      struct clk *apb_clk;
+       void __iomem *apb;
+       int ret;
+@@ -224,6 +225,19 @@ static int thead_dwmac_probe(struct platform_device *pdev)
+               return dev_err_probe(&pdev->dev, PTR_ERR(plat),
+                                    "dt configuration failed\n");
++      /*
++       * The APB clock is essential for accessing glue registers. However,
++       * old devicetrees don't describe it correctly. We continue to probe
++       * and emit a warning if it isn't present.
++       */
++      apb_clk = devm_clk_get_enabled(&pdev->dev, "apb");
++      if (PTR_ERR(apb_clk) == -ENOENT)
++              dev_warn(&pdev->dev,
++                       "cannot get apb clock, link may break after speed changes\n");
++      else if (IS_ERR(apb_clk))
++              return dev_err_probe(&pdev->dev, PTR_ERR(apb_clk),
++                                   "failed to get apb clock\n");
++
+       dwmac = devm_kzalloc(&pdev->dev, sizeof(*dwmac), GFP_KERNEL);
+       if (!dwmac)
+               return -ENOMEM;
+-- 
+2.50.1
+
diff --git a/queue-6.16/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch b/queue-6.16/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch
new file mode 100644 (file)
index 0000000..e264d95
--- /dev/null
@@ -0,0 +1,44 @@
+From 10472084e5247a182caea03db3b210864d598363 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Aug 2025 07:23:18 -0700
+Subject: net: ti: icss-iep: Fix incorrect type for return value in
+ extts_enable()
+
+From: Alok Tiwari <alok.a.tiwari@oracle.com>
+
+[ Upstream commit 5f1d1d14db7dabce9c815e7d7cd351f8d58b8585 ]
+
+The variable ret in icss_iep_extts_enable() was incorrectly declared
+as u32, while the function returns int and may return negative error
+codes. This will cause sign extension issues and incorrect error
+propagation. Update ret to be int to fix error handling.
+
+This change corrects the declaration to avoid potential type mismatch.
+
+Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver")
+Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Link: https://patch.msgid.link/20250805142323.1949406-1-alok.a.tiwari@oracle.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/ti/icssg/icss_iep.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c
+index 50bfbc2779e4..d8c9fe1d98c4 100644
+--- a/drivers/net/ethernet/ti/icssg/icss_iep.c
++++ b/drivers/net/ethernet/ti/icssg/icss_iep.c
+@@ -621,7 +621,8 @@ static int icss_iep_pps_enable(struct icss_iep *iep, int on)
+ static int icss_iep_extts_enable(struct icss_iep *iep, u32 index, int on)
+ {
+-      u32 val, cap, ret = 0;
++      u32 val, cap;
++      int ret = 0;
+       mutex_lock(&iep->ptp_clk_mutex);
+-- 
+2.50.1
+
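The u32-vs-int pitfall above is plain C behaviour rather than anything ICSSG-specific. Below is a minimal user-space sketch, assuming a hypothetical might_fail() helper rather than the driver code, of why an unsigned ret can never carry a negative error code:

#include <stdio.h>

/* Hypothetical stand-in for a call that fails with a negative errno code. */
static int might_fail(void)
{
	return -22; /* -EINVAL */
}

int main(void)
{
	unsigned int uret = might_fail(); /* mirrors the old "u32 ret" */
	int ret = might_fail();           /* mirrors the fixed "int ret" */

	if (uret < 0)                     /* always false for an unsigned type */
		printf("never reached\n");
	else
		printf("error lost: uret compares as %u\n", uret);

	if (ret < 0)
		printf("error propagated: ret = %d\n", ret);

	return 0;
}

Most compilers can be made to warn that the unsigned comparison is always false, which is how this class of bug is typically caught.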
diff --git a/queue-6.16/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch b/queue-6.16/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch
new file mode 100644 (file)
index 0000000..9ef9552
--- /dev/null
@@ -0,0 +1,56 @@
+From be96c5c83b7e3f06515946d993a3bbb28a66ad22 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Aug 2025 23:08:12 +0530
+Subject: net: ti: icssg-prueth: Fix emac link speed handling
+
+From: MD Danish Anwar <danishanwar@ti.com>
+
+[ Upstream commit 06feac15406f4f66f4c0c6ea60b10d44775d4133 ]
+
+When link settings are changed emac->speed is populated by
+emac_adjust_link(). The link speed and other settings are then written into
+the DRAM. However if both ports are brought down after this and brought up
+again or if the operating mode is changed and a firmware reload is needed,
+the DRAM is cleared by icssg_config(). As a result the link settings are
+lost.
+
+Fix this by calling emac_adjust_link() after icssg_config(). This
+re-populates the settings in the DRAM after a new firmware load.
+
+Fixes: 9facce84f406 ("net: ti: icssg-prueth: Fix firmware load sequence.")
+Signed-off-by: MD Danish Anwar <danishanwar@ti.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Message-ID: <20250805173812.2183161-1-danishanwar@ti.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/ti/icssg/icssg_prueth.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
+index 2f5c4335dec3..008d77727400 100644
+--- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c
++++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
+@@ -50,6 +50,8 @@
+ /* CTRLMMR_ICSSG_RGMII_CTRL register bits */
+ #define ICSSG_CTRL_RGMII_ID_MODE                BIT(24)
++static void emac_adjust_link(struct net_device *ndev);
++
+ static int emac_get_tx_ts(struct prueth_emac *emac,
+                         struct emac_tx_ts_response *rsp)
+ {
+@@ -266,6 +268,10 @@ static int prueth_emac_common_start(struct prueth *prueth)
+               ret = icssg_config(prueth, emac, slice);
+               if (ret)
+                       goto disable_class;
++
++              mutex_lock(&emac->ndev->phydev->lock);
++              emac_adjust_link(emac->ndev);
++              mutex_unlock(&emac->ndev->phydev->lock);
+       }
+       ret = prueth_emac_start(prueth);
+-- 
+2.50.1
+
diff --git a/queue-6.16/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-6.16/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
new file mode 100644 (file)
index 0000000..0b6a44b
--- /dev/null
@@ -0,0 +1,129 @@
+From 2a6a033f9923cd580fdd3ad1f09e4155d839f3ea Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+      if (res < 0) {
+                nf_conntrack_get(&ct->ct_general); // HERE
+                cb->args[1] = (unsigned long)ct;
+                ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, then the refcount of ct was already incremented.
+This 2nd increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+prevents cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
+
+This can be reproduced by running conntrack_resize.sh selftest in a loop.
+It takes ~20 minutes for me on a preemptible kernel on average before
+I see a runaway kworker spinning in nf_conntrack_cleanup_net_list.
+
+One fix would be to change this to:
+        if (res < 0) {
+               if (ct != last)
+                       nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
+A followup patch will do the same for ctnetlink_exp_dump_table;
+it looks to me as if it has the same problem and, like
+ctnetlink_dump_table, we only need a 'skip hint', not the actual
+object, so we can apply the same cookie strategy there as well.
+
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 2cc0fde23344..5fdcae45e0bc 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -884,8 +884,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+-      if (cb->args[1])
+-              nf_ct_put((struct nf_conn *)cb->args[1]);
+       kfree(cb->data);
+       return 0;
+ }
+@@ -1208,19 +1206,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+       return 0;
+ }
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++      unsigned long id = nf_ct_get_id(ct);
++
++      return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+       unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
+       struct net *net = sock_net(skb->sk);
+-      struct nf_conn *ct, *last;
++      unsigned long last_id = cb->args[1];
+       struct nf_conntrack_tuple_hash *h;
+       struct hlist_nulls_node *n;
+       struct nf_conn *nf_ct_evict[8];
++      struct nf_conn *ct;
+       int res, i;
+       spinlock_t *lockp;
+-      last = (struct nf_conn *)cb->args[1];
+       i = 0;
+       local_bh_disable();
+@@ -1257,7 +1262,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                               continue;
+                       if (cb->args[1]) {
+-                              if (ct != last)
++                              if (ctnetlink_get_id(ct) != last_id)
+                                       continue;
+                               cb->args[1] = 0;
+                       }
+@@ -1270,8 +1275,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                                           NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+                                           ct, true, flags);
+                       if (res < 0) {
+-                              nf_conntrack_get(&ct->ct_general);
+-                              cb->args[1] = (unsigned long)ct;
++                              cb->args[1] = ctnetlink_get_id(ct);
+                               spin_unlock(lockp);
+                               goto out;
+                       }
+@@ -1284,12 +1288,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+       }
+ out:
+       local_bh_enable();
+-      if (last) {
++      if (last_id) {
+               /* nf ct hash resize happened, now clear the leftover. */
+-              if ((struct nf_conn *)cb->args[1] == last)
++              if (cb->args[1] == last_id)
+                       cb->args[1] = 0;
+-
+-              nf_ct_put(last);
+       }
+       while (i) {
+-- 
+2.50.1
+
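For illustration, a deliberately simplified user-space sketch of the double-increment described above, with hypothetical get()/put() helpers standing in for nf_conntrack_get()/nf_ct_put(); it shows why taking a second reference for the stashed cursor leaks one when the object being stashed is the cursor itself:

#include <stdio.h>

struct obj {
	int refcount;
};

static void get(struct obj *o) { o->refcount++; }
static void put(struct obj *o) { o->refcount--; }

int main(void)
{
	struct obj ct = { .refcount = 1 };   /* owned by the conntrack table */
	struct obj *last = &ct;              /* cursor stashed by the previous round */

	get(last);                           /* reference held for the cursor */

	/* Old resume path: the dump fails again, so the cursor is stashed
	 * and a reference taken - even though ct is the cursor already. */
	struct obj *cursor = &ct;
	get(cursor);                         /* second get, never undone */

	put(last);                           /* the single put done on resume */
	put(&ct);                            /* table drops its reference */

	printf("refcount = %d, never reaches 0, so the object leaks\n",
	       ct.refcount);
	return 0;
}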
diff --git a/queue-6.16/netfilter-ctnetlink-remove-refcounting-in-expectatio.patch b/queue-6.16/netfilter-ctnetlink-remove-refcounting-in-expectatio.patch
new file mode 100644 (file)
index 0000000..fff54e3
--- /dev/null
@@ -0,0 +1,164 @@
+From b5625a8250316d9830487dc5d90c7f8bd77a3a8e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:09 +0200
+Subject: netfilter: ctnetlink: remove refcounting in expectation dumpers
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit 1492e3dcb2be3aa46d1963da96aa9593e4e4db5a ]
+
+Same pattern as previous patch: do not keep the expectation object
+alive via refcount, only store a cookie value and then use that
+as the skip hint for dump resumption.
+
+AFAICS this has the same issue as the one resolved in the conntrack
+dumper, when we do
+  if (!refcount_inc_not_zero(&exp->use))
+
+to increment the refcount, there is a chance that exp == last, which
+causes a double-increment of the refcount and subsequent memory leak.
+
+Fixes: cf6994c2b981 ("[NETFILTER]: nf_conntrack_netlink: sync expectation dumping with conntrack table dumping")
+Fixes: e844a928431f ("netfilter: ctnetlink: allow to dump expectation per master conntrack")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 41 ++++++++++++----------------
+ 1 file changed, 17 insertions(+), 24 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 5fdcae45e0bc..2273ead8102f 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -3171,23 +3171,27 @@ ctnetlink_expect_event(unsigned int events, const struct nf_exp_event *item)
+       return 0;
+ }
+ #endif
+-static int ctnetlink_exp_done(struct netlink_callback *cb)
++
++static unsigned long ctnetlink_exp_id(const struct nf_conntrack_expect *exp)
+ {
+-      if (cb->args[1])
+-              nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]);
+-      return 0;
++      unsigned long id = (unsigned long)exp;
++
++      id += nf_ct_get_id(exp->master);
++      id += exp->class;
++
++      return id ? id : 1;
+ }
+ static int
+ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+       struct net *net = sock_net(skb->sk);
+-      struct nf_conntrack_expect *exp, *last;
+       struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+       u_int8_t l3proto = nfmsg->nfgen_family;
++      unsigned long last_id = cb->args[1];
++      struct nf_conntrack_expect *exp;
+       rcu_read_lock();
+-      last = (struct nf_conntrack_expect *)cb->args[1];
+       for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
+ restart:
+               hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]],
+@@ -3199,7 +3203,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                               continue;
+                       if (cb->args[1]) {
+-                              if (exp != last)
++                              if (ctnetlink_exp_id(exp) != last_id)
+                                       continue;
+                               cb->args[1] = 0;
+                       }
+@@ -3208,9 +3212,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                                                   cb->nlh->nlmsg_seq,
+                                                   IPCTNL_MSG_EXP_NEW,
+                                                   exp) < 0) {
+-                              if (!refcount_inc_not_zero(&exp->use))
+-                                      continue;
+-                              cb->args[1] = (unsigned long)exp;
++                              cb->args[1] = ctnetlink_exp_id(exp);
+                               goto out;
+                       }
+               }
+@@ -3221,32 +3223,30 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+       }
+ out:
+       rcu_read_unlock();
+-      if (last)
+-              nf_ct_expect_put(last);
+-
+       return skb->len;
+ }
+ static int
+ ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+-      struct nf_conntrack_expect *exp, *last;
+       struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+       struct nf_conn *ct = cb->data;
+       struct nf_conn_help *help = nfct_help(ct);
+       u_int8_t l3proto = nfmsg->nfgen_family;
++      unsigned long last_id = cb->args[1];
++      struct nf_conntrack_expect *exp;
+       if (cb->args[0])
+               return 0;
+       rcu_read_lock();
+-      last = (struct nf_conntrack_expect *)cb->args[1];
++
+ restart:
+       hlist_for_each_entry_rcu(exp, &help->expectations, lnode) {
+               if (l3proto && exp->tuple.src.l3num != l3proto)
+                       continue;
+               if (cb->args[1]) {
+-                      if (exp != last)
++                      if (ctnetlink_exp_id(exp) != last_id)
+                               continue;
+                       cb->args[1] = 0;
+               }
+@@ -3254,9 +3254,7 @@ ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                                           cb->nlh->nlmsg_seq,
+                                           IPCTNL_MSG_EXP_NEW,
+                                           exp) < 0) {
+-                      if (!refcount_inc_not_zero(&exp->use))
+-                              continue;
+-                      cb->args[1] = (unsigned long)exp;
++                      cb->args[1] = ctnetlink_exp_id(exp);
+                       goto out;
+               }
+       }
+@@ -3267,9 +3265,6 @@ ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+       cb->args[0] = 1;
+ out:
+       rcu_read_unlock();
+-      if (last)
+-              nf_ct_expect_put(last);
+-
+       return skb->len;
+ }
+@@ -3288,7 +3283,6 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
+       struct nf_conntrack_zone zone;
+       struct netlink_dump_control c = {
+               .dump = ctnetlink_exp_ct_dump_table,
+-              .done = ctnetlink_exp_done,
+       };
+       err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
+@@ -3338,7 +3332,6 @@ static int ctnetlink_get_expect(struct sk_buff *skb,
+               else {
+                       struct netlink_dump_control c = {
+                               .dump = ctnetlink_exp_dump_table,
+-                              .done = ctnetlink_exp_done,
+                       };
+                       return netlink_dump_start(info->sk, skb, info->nlh, &c);
+               }
+-- 
+2.50.1
+
diff --git a/queue-6.16/netfilter-nf_tables-reject-duplicate-device-on-updat.patch b/queue-6.16/netfilter-nf_tables-reject-duplicate-device-on-updat.patch
new file mode 100644 (file)
index 0000000..41df7da
--- /dev/null
@@ -0,0 +1,98 @@
+From 41a492b44d333b69f99787abd80e483c77ed08f0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 13 Aug 2025 02:38:50 +0200
+Subject: netfilter: nf_tables: reject duplicate device on updates
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+[ Upstream commit cf5fb87fcdaaaafec55dcc0dc5a9e15ead343973 ]
+
+A chain/flowtable update with duplicated devices in the same batch is
+possible. Unfortunately, netdev event path only removes the first
+device that is found, leaving unregistered the hook of the duplicated
+device.
+
+Check if a duplicated device exists in the transaction batch, and bail
+out with EEXIST in such a case.
+
+WARNING is hit when unregistering the hook:
+
+ [49042.221275] WARNING: CPU: 4 PID: 8425 at net/netfilter/core.c:340 nf_hook_entry_head+0xaa/0x150
+ [49042.221375] CPU: 4 UID: 0 PID: 8425 Comm: nft Tainted: G S                  6.16.0+ #170 PREEMPT(full)
+ [...]
+ [49042.221382] RIP: 0010:nf_hook_entry_head+0xaa/0x150
+
+Fixes: 78d9f48f7f44 ("netfilter: nf_tables: add devices to existing flowtable")
+Fixes: b9703ed44ffb ("netfilter: nf_tables: support for adding new devices to an existing netdev chain")
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_tables_api.c | 30 ++++++++++++++++++++++++++++++
+ 1 file changed, 30 insertions(+)
+
+diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
+index 064f18792d98..46ca725d6538 100644
+--- a/net/netfilter/nf_tables_api.c
++++ b/net/netfilter/nf_tables_api.c
+@@ -2790,6 +2790,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
+       struct nft_chain *chain = ctx->chain;
+       struct nft_chain_hook hook = {};
+       struct nft_stats __percpu *stats = NULL;
++      struct nftables_pernet *nft_net;
+       struct nft_hook *h, *next;
+       struct nf_hook_ops *ops;
+       struct nft_trans *trans;
+@@ -2832,6 +2833,20 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
+                               if (nft_hook_list_find(&basechain->hook_list, h)) {
+                                       list_del(&h->list);
+                                       nft_netdev_hook_free(h);
++                                      continue;
++                              }
++
++                              nft_net = nft_pernet(ctx->net);
++                              list_for_each_entry(trans, &nft_net->commit_list, list) {
++                                      if (trans->msg_type != NFT_MSG_NEWCHAIN ||
++                                          trans->table != ctx->table ||
++                                          !nft_trans_chain_update(trans))
++                                              continue;
++
++                                      if (nft_hook_list_find(&nft_trans_chain_hooks(trans), h)) {
++                                              nft_chain_release_hook(&hook);
++                                              return -EEXIST;
++                                      }
+                               }
+                       }
+               } else {
+@@ -9033,6 +9048,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
+ {
+       const struct nlattr * const *nla = ctx->nla;
+       struct nft_flowtable_hook flowtable_hook;
++      struct nftables_pernet *nft_net;
+       struct nft_hook *hook, *next;
+       struct nf_hook_ops *ops;
+       struct nft_trans *trans;
+@@ -9049,6 +9065,20 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
+               if (nft_hook_list_find(&flowtable->hook_list, hook)) {
+                       list_del(&hook->list);
+                       nft_netdev_hook_free(hook);
++                      continue;
++              }
++
++              nft_net = nft_pernet(ctx->net);
++              list_for_each_entry(trans, &nft_net->commit_list, list) {
++                      if (trans->msg_type != NFT_MSG_NEWFLOWTABLE ||
++                          trans->table != ctx->table ||
++                          !nft_trans_flowtable_update(trans))
++                              continue;
++
++                      if (nft_hook_list_find(&nft_trans_flowtable_hooks(trans), hook)) {
++                              err = -EEXIST;
++                              goto err_flowtable_update_hook;
++                      }
+               }
+       }
+-- 
+2.50.1
+
diff --git a/queue-6.16/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch b/queue-6.16/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch
new file mode 100644 (file)
index 0000000..fe92f1f
--- /dev/null
@@ -0,0 +1,103 @@
+From 6618a73f0c48d3ab0731910efbb5972d5afde30f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 28 Jul 2025 15:26:49 +0900
+Subject: ptp: prevent possible ABBA deadlock in ptp_clock_freerun()
+
+From: Jeongjun Park <aha310510@gmail.com>
+
+[ Upstream commit 2efe41234dbd0a83fdb7cd38226c2f70039a2cd3 ]
+
+syzbot reported the following ABBA deadlock:
+
+       CPU0                           CPU1
+       ----                           ----
+  n_vclocks_store()
+    lock(&ptp->n_vclocks_mux) [1]
+        (physical clock)
+                                     pc_clock_adjtime()
+                                       lock(&clk->rwsem) [2]
+                                        (physical clock)
+                                       ...
+                                       ptp_clock_freerun()
+                                         ptp_vclock_in_use()
+                                           lock(&ptp->n_vclocks_mux) [3]
+                                              (physical clock)
+    ptp_clock_unregister()
+      posix_clock_unregister()
+        lock(&clk->rwsem) [4]
+          (virtual clock)
+
+Since ptp virtual clock is registered only under ptp physical clock, both
+ptp_clock and posix_clock must be physical clocks for ptp_vclock_in_use()
+to lock &ptp->n_vclocks_mux and check ptp->n_vclocks.
+
+However, when unregistering vclocks in n_vclocks_store(), the lock taken,
+ptp->n_vclocks_mux, is a physical clock lock, but the clk->rwsem of
+ptp_clock_unregister() called through device_for_each_child_reverse()
+is a virtual clock lock.
+
+Therefore, clk->rwsem used in CPU0 and clk->rwsem used in CPU1 are
+different locks, but in lockdep, a false positive occurs because the
+possibility of deadlock is determined through lock-class.
+
+To solve this, lock subclass annotation must be added to the posix_clock
+rwsem of the vclock.
+
+Reported-by: syzbot+7cfb66a237c4a5fb22ad@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=7cfb66a237c4a5fb22ad
+Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion")
+Signed-off-by: Jeongjun Park <aha310510@gmail.com>
+Acked-by: Richard Cochran <richardcochran@gmail.com>
+Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Link: https://patch.msgid.link/20250728062649.469882-1-aha310510@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/ptp/ptp_private.h | 5 +++++
+ drivers/ptp/ptp_vclock.c  | 7 +++++++
+ 2 files changed, 12 insertions(+)
+
+diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
+index a6aad743c282..b352df4cd3f9 100644
+--- a/drivers/ptp/ptp_private.h
++++ b/drivers/ptp/ptp_private.h
+@@ -24,6 +24,11 @@
+ #define PTP_DEFAULT_MAX_VCLOCKS 20
+ #define PTP_MAX_CHANNELS 2048
++enum {
++      PTP_LOCK_PHYSICAL = 0,
++      PTP_LOCK_VIRTUAL,
++};
++
+ struct timestamp_event_queue {
+       struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS];
+       int head;
+diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
+index 7febfdcbde8b..8ed4b8598924 100644
+--- a/drivers/ptp/ptp_vclock.c
++++ b/drivers/ptp/ptp_vclock.c
+@@ -154,6 +154,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
+       return PTP_VCLOCK_REFRESH_INTERVAL;
+ }
++static void ptp_vclock_set_subclass(struct ptp_clock *ptp)
++{
++      lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL);
++}
++
+ static const struct ptp_clock_info ptp_vclock_info = {
+       .owner          = THIS_MODULE,
+       .name           = "ptp virtual clock",
+@@ -213,6 +218,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock)
+               return NULL;
+       }
++      ptp_vclock_set_subclass(vclock->clock);
++
+       timecounter_init(&vclock->tc, &vclock->cc, 0);
+       ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL);
+-- 
+2.50.1
+
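For illustration, a user-space sketch of the ABBA ordering shown in the report above, using hypothetical pthread mutexes rather than the kernel objects. In the kernel case the two clk->rwsem instances are distinct objects that share one lockdep class, which is why the report is a false positive and a subclass annotation is enough to silence it:

#include <pthread.h>
#include <stdio.h>

/* Two distinct locks, mirroring the two locks in the report: the physical
 * clock's n_vclocks_mux and a posix clock's rwsem. */
static pthread_mutex_t n_vclocks_mux = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t clk_rwsem = PTHREAD_MUTEX_INITIALIZER;

/* CPU0 in the report: n_vclocks_store() -> ptp_clock_unregister(). */
static void cpu0_path(void)
{
	pthread_mutex_lock(&n_vclocks_mux);   /* A */
	pthread_mutex_lock(&clk_rwsem);       /* then B */
	pthread_mutex_unlock(&clk_rwsem);
	pthread_mutex_unlock(&n_vclocks_mux);
}

/* CPU1 in the report: pc_clock_adjtime() -> ptp_vclock_in_use(). */
static void cpu1_path(void)
{
	pthread_mutex_lock(&clk_rwsem);       /* B */
	pthread_mutex_lock(&n_vclocks_mux);   /* then A: ABBA against cpu0_path() */
	pthread_mutex_unlock(&n_vclocks_mux);
	pthread_mutex_unlock(&clk_rwsem);
}

int main(void)
{
	/* Run sequentially here; if two CPUs ran these paths concurrently on
	 * the same pair of locks, they could deadlock.  In the PTP case the
	 * two rwsems are different objects in one lockdep class, so lockdep
	 * flags the pattern even though no real deadlock is possible. */
	cpu0_path();
	cpu1_path();
	puts("both lock orderings exercised");
	return 0;
}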
diff --git a/queue-6.16/riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch b/queue-6.16/riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch
new file mode 100644 (file)
index 0000000..e5332e4
--- /dev/null
@@ -0,0 +1,54 @@
+From 85ec46b8224074065b1bc397b3fb3ccff51c5ca8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 09:36:56 +0000
+Subject: riscv: dts: thead: Add APB clocks for TH1520 GMACs
+
+From: Yao Zi <ziyao@disroot.org>
+
+[ Upstream commit a7f75e2883c4bd57b12c3be61bb926929adad9c0 ]
+
+Describe perisys-apb4-hclk as the APB clock for TH1520 SoC, which is
+essential for accessing GMAC glue registers.
+
+Fixes: 7e756671a664 ("riscv: dts: thead: Add TH1520 ethernet nodes")
+Signed-off-by: Yao Zi <ziyao@disroot.org>
+Reviewed-by: Drew Fustini <fustini@kernel.org>
+Tested-by: Drew Fustini <fustini@kernel.org>
+Link: https://patch.msgid.link/20250808093655.48074-5-ziyao@disroot.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/riscv/boot/dts/thead/th1520.dtsi | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+diff --git a/arch/riscv/boot/dts/thead/th1520.dtsi b/arch/riscv/boot/dts/thead/th1520.dtsi
+index 1db0054c4e09..93135e0f5a77 100644
+--- a/arch/riscv/boot/dts/thead/th1520.dtsi
++++ b/arch/riscv/boot/dts/thead/th1520.dtsi
+@@ -294,8 +294,9 @@ gmac1: ethernet@ffe7060000 {
+                       reg-names = "dwmac", "apb";
+                       interrupts = <67 IRQ_TYPE_LEVEL_HIGH>;
+                       interrupt-names = "macirq";
+-                      clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC1>;
+-                      clock-names = "stmmaceth", "pclk";
++                      clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC1>,
++                               <&clk CLK_PERISYS_APB4_HCLK>;
++                      clock-names = "stmmaceth", "pclk", "apb";
+                       snps,pbl = <32>;
+                       snps,fixed-burst;
+                       snps,multicast-filter-bins = <64>;
+@@ -316,8 +317,9 @@ gmac0: ethernet@ffe7070000 {
+                       reg-names = "dwmac", "apb";
+                       interrupts = <66 IRQ_TYPE_LEVEL_HIGH>;
+                       interrupt-names = "macirq";
+-                      clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC0>;
+-                      clock-names = "stmmaceth", "pclk";
++                      clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC0>,
++                               <&clk CLK_PERISYS_APB4_HCLK>;
++                      clock-names = "stmmaceth", "pclk", "apb";
+                       snps,pbl = <32>;
+                       snps,fixed-burst;
+                       snps,multicast-filter-bins = <64>;
+-- 
+2.50.1
+
diff --git a/queue-6.16/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch b/queue-6.16/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
new file mode 100644 (file)
index 0000000..97ef3ef
--- /dev/null
@@ -0,0 +1,73 @@
+From 3e9fc7914a86dfe7937c516e67ff01c5544bec23 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:40:11 -0400
+Subject: sctp: linearize cloned gso packets in sctp_rcv
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ]
+
+A cloned head skb still shares these frag skbs in fraglist with the
+original head skb. It's not safe to access these frag skbs.
+
+syzbot reported two use-of-uninitialized-memory bugs caused by this:
+
+  BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+   sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+   sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998
+   sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88
+   sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331
+   sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122
+   __release_sock+0x1da/0x330 net/core/sock.c:3106
+   release_sock+0x6b/0x250 net/core/sock.c:3660
+   sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360
+   sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885
+   sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031
+   inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851
+   sock_sendmsg_nosec net/socket.c:718 [inline]
+
+and
+
+  BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+   sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+   sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88
+   sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331
+   sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148
+   __release_sock+0x1d3/0x330 net/core/sock.c:3213
+   release_sock+0x6b/0x270 net/core/sock.c:3767
+   sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367
+   sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886
+   sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032
+   inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851
+   sock_sendmsg_nosec net/socket.c:712 [inline]
+
+This patch fixes it by linearizing cloned gso packets in sctp_rcv().
+
+Fixes: 90017accff61 ("sctp: Add GSO support")
+Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com
+Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sctp/input.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sctp/input.c b/net/sctp/input.c
+index 0c0d2757f6f8..6fcdcaeed40e 100644
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -117,7 +117,7 @@ int sctp_rcv(struct sk_buff *skb)
+        * it's better to just linearize it otherwise crc computing
+        * takes longer.
+        */
+-      if ((!is_gso && skb_linearize(skb)) ||
++      if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+           !pskb_may_pull(skb, sizeof(struct sctphdr)))
+               goto discard_it;
+-- 
+2.50.1
+
index 8a30aa84eb516d8fced4ca92cf6e9fd730f113ab..1c26cd75b2bece86c9ce12b43fe9365b4703f479 100644 (file)
@@ -54,3 +54,33 @@ acpi-processor-perflib-move-problematic-pr-performance-check.patch
 block-make-req_op_zone_finish-a-write-operation.patch
 mm-memory-tier-fix-abstract-distance-calculation-overflow.patch
 mfd-cros_ec-separate-charge-control-probing-from-usb-pd.patch
+habanalabs-fix-uaf-in-export_dmabuf.patch
+mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch
+xfrm-flush-all-states-in-xfrm_state_fini.patch
+xfrm-restore-gso-for-sw-crypto.patch
+xfrm-bring-back-device-check-in-validate_xmit_xfrm.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
+netfilter-ctnetlink-remove-refcounting-in-expectatio.patch
+net-hibmcge-fix-rtnl-deadlock-issue.patch
+net-hibmcge-fix-the-division-by-zero-issue.patch
+net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch
+net-ti-icssg-prueth-fix-emac-link-speed-handling.patch
+net-page_pool-allow-enabling-recycling-late-fix-fals.patch
+net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch
+sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
+net-lapbether-ignore-ops-locked-netdevs.patch
+hamradio-ignore-ops-locked-netdevs.patch
+erofs-fix-block-count-report-when-48-bit-layout-is-o.patch
+intel_idle-allow-loading-acpi-tables-for-any-family.patch
+cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
+net-phy-nxp-c45-tja11xx-fix-the-phy-id-mismatch-issu.patch
+net-mdiobus-release-reset_gpio-in-mdiobus_unregister.patch
+net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch
+riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch
+ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch
+tls-handle-data-disappearing-from-under-the-tls-ulp.patch
+ipvs-fix-estimator-kthreads-preferred-affinity.patch
+netfilter-nf_tables-reject-duplicate-device-on-updat.patch
+bnxt-fill-data-page-pool-with-frags-if-page_size-bnx.patch
+net-kcm-fix-race-condition-in-kcm_unattach.patch
diff --git a/queue-6.16/tls-handle-data-disappearing-from-under-the-tls-ulp.patch b/queue-6.16/tls-handle-data-disappearing-from-under-the-tls-ulp.patch
new file mode 100644 (file)
index 0000000..42dc41a
--- /dev/null
@@ -0,0 +1,106 @@
+From 370e01dc4b6dee8ec7f8338bd058e954edaed079 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 16:29:06 -0700
+Subject: tls: handle data disappearing from under the TLS ULP
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 6db015fc4b5d5f63a64a193f65d98da3a7fc811d ]
+
+TLS expects that it owns the receive queue of the TCP socket.
+This cannot be guaranteed in case the reader of the TCP socket
+entered before the TLS ULP was installed, or uses some non-standard
+read API (eg. zerocopy ones). Replace the WARN_ON() and a buggy
+early exit (which leaves anchor pointing to a freed skb) with real
+error handling. Wipe the parsing state and tell the reader to retry.
+
+We already reload the anchor every time we (re)acquire the socket lock,
+so the only condition we need to avoid is an out of bounds read
+(not having enough bytes in the socket for previously parsed record len).
+
+If some data was read from under TLS but there's enough in the queue
+we'll reload and decrypt what is most likely not a valid TLS record.
+This leads to some undefined behavior from the TLS perspective (corrupting
+a stream? missing an alert? missing an attack?) but no kernel crash
+should take place.
+
+Reported-by: William Liu <will@willsroot.io>
+Reported-by: Savino Dicanosa <savy@syst3mfailure.io>
+Link: https://lore.kernel.org/tFjq_kf7sWIG3A7CrCg_egb8CVsT_gsmHAK0_wxDPJXfIzxFAMxqmLwp3MlU5EHiet0AwwJldaaFdgyHpeIUCS-3m3llsmRzp9xIOBR4lAI=@syst3mfailure.io
+Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser")
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://patch.msgid.link/20250807232907.600366-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls.h      |  2 +-
+ net/tls/tls_strp.c | 11 ++++++++---
+ net/tls/tls_sw.c   |  3 ++-
+ 3 files changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/net/tls/tls.h b/net/tls/tls.h
+index 774859b63f0d..4e077068e6d9 100644
+--- a/net/tls/tls.h
++++ b/net/tls/tls.h
+@@ -196,7 +196,7 @@ void tls_strp_msg_done(struct tls_strparser *strp);
+ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb);
+ void tls_rx_msg_ready(struct tls_strparser *strp);
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
+ int tls_strp_msg_cow(struct tls_sw_context_rx *ctx);
+ struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx);
+ int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst);
+diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
+index 095cf31bae0b..d71643b494a1 100644
+--- a/net/tls/tls_strp.c
++++ b/net/tls/tls_strp.c
+@@ -475,7 +475,7 @@ static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len)
+       strp->stm.offset = offset;
+ }
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ {
+       struct strp_msg *rxm;
+       struct tls_msg *tlm;
+@@ -484,8 +484,11 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+       DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len);
+       if (!strp->copy_mode && force_refresh) {
+-              if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len))
+-                      return;
++              if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) {
++                      WRITE_ONCE(strp->msg_ready, 0);
++                      memset(&strp->stm, 0, sizeof(strp->stm));
++                      return false;
++              }
+               tls_strp_load_anchor_with_queue(strp, strp->stm.full_len);
+       }
+@@ -495,6 +498,8 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+       rxm->offset     = strp->stm.offset;
+       tlm = tls_msg(strp->anchor);
+       tlm->control    = strp->mark;
++
++      return true;
+ }
+ /* Called with lock held on lower socket */
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index 549d1ea01a72..51c98a007dda 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1384,7 +1384,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
+                       return sock_intr_errno(timeo);
+       }
+-      tls_strp_msg_load(&ctx->strp, released);
++      if (unlikely(!tls_strp_msg_load(&ctx->strp, released)))
++              return tls_rx_rec_wait(sk, psock, nonblock, false);
+       return 1;
+ }
+-- 
+2.50.1
+
diff --git a/queue-6.16/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-6.16/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
new file mode 100644 (file)
index 0000000..7ab062f
--- /dev/null
@@ -0,0 +1,51 @@
+From db3787d79bafca5f191f3fc9d0d8f261e0c90bba Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, so that we can't rely on offloads because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
+
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index e0a6bfa95118..eeac86bacdba 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -224,7 +224,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+       remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+       skb->remcsum_offload = remcsum;
+-      need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++      need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+       /* Try to offload checksum if possible */
+       offload_csum = !!(need_csum &&
+                         !need_ipsec &&
+-- 
+2.50.1
+
diff --git a/queue-6.16/xfrm-bring-back-device-check-in-validate_xmit_xfrm.patch b/queue-6.16/xfrm-bring-back-device-check-in-validate_xmit_xfrm.patch
new file mode 100644 (file)
index 0000000..8549767
--- /dev/null
@@ -0,0 +1,42 @@
+From 813b078a896269b17b6d904ba2cae892c2cc6d03 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:26 +0200
+Subject: xfrm: bring back device check in validate_xmit_xfrm
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 65f079a6c446a939eefe71e6d5957d5d6365fcf9 ]
+
+This is partial revert of commit d53dda291bbd993a29b84d358d282076e3d01506.
+
+This change causes traffic using GSO with SW crypto running through a
+NIC capable of HW offload to no longer get segmented during
+validate_xmit_xfrm, and is unrelated to the bonding use case mentioned
+in the commit.
+
+Fixes: d53dda291bbd ("xfrm: Remove unneeded device check from validate_xmit_xfrm")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/xfrm/xfrm_device.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
+index 1f88472aaac0..c7a1f080d2de 100644
+--- a/net/xfrm/xfrm_device.c
++++ b/net/xfrm/xfrm_device.c
+@@ -155,7 +155,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
+               return skb;
+       }
+-      if (skb_is_gso(skb) && unlikely(xmit_xfrm_check_overflow(skb))) {
++      if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) ||
++                              unlikely(xmit_xfrm_check_overflow(skb)))) {
+               struct sk_buff *segs;
+               /* Packet got rerouted, fixup features and segment it. */
+-- 
+2.50.1
+
diff --git a/queue-6.16/xfrm-flush-all-states-in-xfrm_state_fini.patch b/queue-6.16/xfrm-flush-all-states-in-xfrm_state_fini.patch
new file mode 100644 (file)
index 0000000..499845f
--- /dev/null
@@ -0,0 +1,61 @@
+From 72312704ac548ae1474e05e1017aeb91aceb8a58 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:05:43 +0200
+Subject: xfrm: flush all states in xfrm_state_fini
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 42e42562c9cfcdacf000f1b42284a4fad24f8546 ]
+
+While reverting commit f75a2804da39 ("xfrm: destroy xfrm_state
+synchronously on net exit path"), I incorrectly changed
+xfrm_state_flush's "proto" argument back to IPSEC_PROTO_ANY. This
+reverts some of the changes in commit dbb2483b2a46 ("xfrm: clean up
+xfrm protocol checks"), and leads to some states not being removed
+when we exit the netns.
+
+Pass 0 instead of IPSEC_PROTO_ANY from both xfrm_state_fini
+xfrm6_tunnel_net_exit, so that xfrm_state_flush deletes all states.
+
+Fixes: 2a198bbec691 ("Revert "xfrm: destroy xfrm_state synchronously on net exit path"")
+Reported-by: syzbot+6641a61fe0e2e89ae8c5@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=6641a61fe0e2e89ae8c5
+Tested-by: syzbot+6641a61fe0e2e89ae8c5@syzkaller.appspotmail.com
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv6/xfrm6_tunnel.c | 2 +-
+ net/xfrm/xfrm_state.c   | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
+index 5120a763da0d..0a0eeaed0591 100644
+--- a/net/ipv6/xfrm6_tunnel.c
++++ b/net/ipv6/xfrm6_tunnel.c
+@@ -334,7 +334,7 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net)
+       struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
+       unsigned int i;
+-      xfrm_state_flush(net, IPSEC_PROTO_ANY, false);
++      xfrm_state_flush(net, 0, false);
+       xfrm_flush_gc();
+       for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++)
+diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
+index 97ff756191ba..5f1da305eea8 100644
+--- a/net/xfrm/xfrm_state.c
++++ b/net/xfrm/xfrm_state.c
+@@ -3278,7 +3278,7 @@ void xfrm_state_fini(struct net *net)
+       unsigned int sz;
+       flush_work(&net->xfrm.state_hash_work);
+-      xfrm_state_flush(net, IPSEC_PROTO_ANY, false);
++      xfrm_state_flush(net, 0, false);
+       flush_work(&xfrm_state_gc_work);
+       WARN_ON(!list_empty(&net->xfrm.state_all));
+-- 
+2.50.1
+
diff --git a/queue-6.16/xfrm-restore-gso-for-sw-crypto.patch b/queue-6.16/xfrm-restore-gso-for-sw-crypto.patch
new file mode 100644 (file)
index 0000000..eff6391
--- /dev/null
@@ -0,0 +1,58 @@
+From da846e56cf21a09f2ac3e5528405cfcdb9f956cc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:25 +0200
+Subject: xfrm: restore GSO for SW crypto
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 234d1eff5d4987024be9d40ac07b918a5ae8db1a ]
+
+Commit 49431af6c4ef incorrectly assumes that the GSO path is only used
+by HW offload, but it's also useful for SW crypto.
+
+This patch re-enables GSO for SW crypto. It's not an exact revert to
+preserve the other changes made to xfrm_dev_offload_ok afterwards, but
+it reverts all of its effects.
+
+Fixes: 49431af6c4ef ("xfrm: rely on XFRM offload")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
+Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/xfrm/xfrm_device.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
+index d2819baea414..1f88472aaac0 100644
+--- a/net/xfrm/xfrm_device.c
++++ b/net/xfrm/xfrm_device.c
+@@ -415,10 +415,12 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
+       struct net_device *dev = x->xso.dev;
+       bool check_tunnel_size;
+-      if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED)
++      if (!x->type_offload ||
++          (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED && x->encap))
+               return false;
+-      if ((dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) {
++      if ((!dev || dev == xfrm_dst_path(dst)->dev) &&
++          !xdst->child->xfrm) {
+               mtu = xfrm_state_mtu(x, xdst->child_mtu_cached);
+               if (skb->len <= mtu)
+                       goto ok;
+@@ -430,6 +432,9 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
+       return false;
+ ok:
++      if (!dev)
++              return true;
++
+       check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET &&
+                           x->props.mode == XFRM_MODE_TUNNEL;
+       switch (x->props.family) {
+-- 
+2.50.1
+
diff --git a/queue-6.6/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch b/queue-6.6/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
new file mode 100644 (file)
index 0000000..589aa23
--- /dev/null
@@ -0,0 +1,91 @@
+From a8a27555c9f7f82114edc7e731fd752b9483bf61 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Aug 2025 17:03:11 +0200
+Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data
+
+From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ]
+
+Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid
+discarding useful information") caused the number of wakeup interrupts
+to increase on an idle system [1], which was not expected to happen
+after merely allowing shallower idle states to be selected by the
+governor in some cases.
+
+However, on the system in question, all of the idle states deeper than
+WFI are rejected by the driver due to a firmware issue [2].  This causes
+the governor to only consider the recent interval duration data
+corresponding to attempts to enter WFI that are successful, and the
+recent intervals table is filled with values lower than the scheduler
+tick period.  Consequently, the governor predicts an idle duration
+below the scheduler tick period length and avoids stopping the tick
+more often which leads to the observed symptom.
+
+Address it by modifying the governor to update the recent intervals
+table also when entering the previously selected idle state fails, so
+it knows that the short idle intervals might have been the minority
+had the selected idle states been actually entered every time.
+
+Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information")
+Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1]
+Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2]
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Tested-by: Christian Loehle <christian.loehle@arm.com>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Christian Loehle <christian.loehle@arm.com>
+Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
+index edd9a8fb9878..f3a071ac3b2a 100644
+--- a/drivers/cpuidle/governors/menu.c
++++ b/drivers/cpuidle/governors/menu.c
+@@ -160,6 +160,14 @@ static inline int performance_multiplier(unsigned int nr_iowaiters)
+ static DEFINE_PER_CPU(struct menu_device, menu_devices);
++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us)
++{
++      /* Update the repeating-pattern data. */
++      data->intervals[data->interval_ptr++] = interval_us;
++      if (data->interval_ptr >= INTERVALS)
++              data->interval_ptr = 0;
++}
++
+ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
+ /*
+@@ -284,6 +292,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+       if (data->needs_update) {
+               menu_update(drv, dev);
+               data->needs_update = 0;
++      } else if (!dev->last_residency_ns) {
++              /*
++               * This happens when the driver rejects the previously selected
++               * idle state and returns an error, so update the recent
++               * intervals table to prevent invalid information from being
++               * used going forward.
++               */
++              menu_update_intervals(data, UINT_MAX);
+       }
+       nr_iowaiters = nr_iowait_cpu(dev->cpu);
+@@ -553,10 +569,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+       data->correction_factor[data->bucket] = new_factor;
+-      /* update the repeating-pattern data */
+-      data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
+-      if (data->interval_ptr >= INTERVALS)
+-              data->interval_ptr = 0;
++      menu_update_intervals(data, ktime_to_us(measured_ns));
+ }
+ /**
+-- 
+2.50.1
+
diff --git a/queue-6.6/intel_idle-allow-loading-acpi-tables-for-any-family.patch b/queue-6.6/intel_idle-allow-loading-acpi-tables-for-any-family.patch
new file mode 100644 (file)
index 0000000..606f124
--- /dev/null
@@ -0,0 +1,41 @@
+From 49fe61784622f7ba647e5db6df74aa4619ee0dd9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 15:37:14 -0400
+Subject: intel_idle: Allow loading ACPI tables for any family
+
+From: Len Brown <len.brown@intel.com>
+
+[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ]
+
+There is no reason to limit intel_idle's loading of ACPI tables to
+family 6.  Upcoming Intel processors are not in family 6.
+
+Below "Fixes" really means "applies cleanly until".
+That syntax commit didn't change the previous logic,
+but shows this patch applies back 5 years.
+
+Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros")
+Signed-off-by: Len Brown <len.brown@intel.com>
+Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/idle/intel_idle.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
+index 44842f243f40..6908052dea77 100644
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -1432,7 +1432,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
+ };
+ static const struct x86_cpu_id intel_mwait_ids[] __initconst = {
+-      X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL),
++      X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL),
+       {}
+ };
+-- 
+2.50.1
+
diff --git a/queue-6.6/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch b/queue-6.6/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch
new file mode 100644 (file)
index 0000000..4b906e2
--- /dev/null
@@ -0,0 +1,117 @@
+From 430f336058e516557bd4d0e04e7597307b4b537c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:38 -0700
+Subject: KVM: nVMX: Check vmcs12->guest_ia32_debugctl on nested VM-Enter
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 095686e6fcb4150f0a55b1a25987fad3d8af58d6 ]
+
+Add a consistency check for L2's guest_ia32_debugctl, as KVM only supports
+a subset of hardware functionality, i.e. KVM can't rely on hardware to
+detect illegal/unsupported values.  Failure to check the vmcs12 value
+would allow the guest to load any hardware-supported value while running L2.
+
+Take care to exempt BTF and LBR from the validity check in order to match
+KVM's behavior for writes via WRMSR, but without clobbering vmcs12.  Even
+if VM_EXIT_SAVE_DEBUG_CONTROLS is set in vmcs12, L1 can reasonably expect
+that vmcs12->guest_ia32_debugctl will not be modified if writes to the MSR
+are being intercepted.
+
+Arguably, KVM _should_ update vmcs12 if VM_EXIT_SAVE_DEBUG_CONTROLS is set
+*and* writes to MSR_IA32_DEBUGCTLMSR are not being intercepted by L1, but
+that would incur non-trivial complexity and wouldn't change the fact that
+KVM's handling of DEBUGCTL is blatantly broken.  I.e. the extra complexity
+is not worth carrying.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Co-developed-by: Sean Christopherson <seanjc@google.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-7-seanjc@google.com
+Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/nested.c | 12 ++++++++++--
+ arch/x86/kvm/vmx/vmx.c    |  5 ++---
+ arch/x86/kvm/vmx/vmx.h    |  3 +++
+ 3 files changed, 15 insertions(+), 5 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index fdf7503491f9..10236ecdad95 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2564,7 +2564,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+       if (vmx->nested.nested_run_pending &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+               kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+-              vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
++              vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
++                                                vmx_get_supported_debugctl(vcpu, false));
+       } else {
+               kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+               vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
+@@ -3045,7 +3046,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
+               return -EINVAL;
+       if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
+-          CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
++          (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
++           CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
+               return -EINVAL;
+       if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
+@@ -4435,6 +4437,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+               (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
+               (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
++      /*
++       * Note!  Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
++       * Writes to DEBUGCTL that aren't intercepted by L1 are immediately
++       * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
++       * vmcs02 doesn't strictly track vmcs12.
++       */
+       if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
+               kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index aaa767ed170e..32f1a38a1010 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2149,7 +2149,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
+       return (unsigned long)data;
+ }
+-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+ {
+       u64 debugctl = 0;
+@@ -2168,8 +2168,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+       return debugctl;
+ }
+-static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data,
+-                                bool host_initiated)
++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+ {
+       u64 invalid;
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 50d32d830890..5816fdd2dfa8 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -429,6 +429,9 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
+ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
++
+ /*
+  * Note, early Intel manuals have the write-low and read-high bitmap offsets
+  * the wrong way round.  The bitmaps control MSRs 0x00000000-0x00001fff and
+-- 
+2.50.1
+
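As a hedged sketch of the validity check discussed above (the bit positions follow the architectural IA32_DEBUGCTL layout; the "supported" mask and the function itself are simplified stand-ins for the KVM helpers, not the real kernel code), the core of the logic is: reject any bit the vCPU model does not support, but quietly forgive BTF and LBR so existing guests keep working:

#include <stdbool.h>
#include <stdint.h>

#define DEBUGCTLMSR_LBR (1ULL << 0)
#define DEBUGCTLMSR_BTF (1ULL << 1)

/*
 * "supported" stands in for vmx_get_supported_debugctl(); in KVM it is
 * derived from guest CPUID and PMU state, here it is just a parameter.
 */
static bool debugctl_is_valid(uint64_t data, uint64_t supported)
{
        uint64_t invalid = data & ~supported;

        /* Mirror the WRMSR behavior: BTF/LBR are dropped, not rejected. */
        invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);

        return !invalid;
}

int main(void)
{
        /* LBR alone is forgiven; an unsupported non-exempt bit is rejected. */
        return (debugctl_is_valid(DEBUGCTLMSR_LBR, 0) &&
                !debugctl_is_valid(1ULL << 15, 0)) ? 0 : 1;
}

On nested VM-Enter the same shape of check is applied to vmcs12->guest_ia32_debugctl, so L2 cannot load bits that KVM's model of the CPU does not support.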
diff --git a/queue-6.6/kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch b/queue-6.6/kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch
new file mode 100644 (file)
index 0000000..06d5948
--- /dev/null
@@ -0,0 +1,156 @@
+From dbe73b19d1fdc8dd7627b1c3710fd6f9191a1429 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:24 -0700
+Subject: KVM: nVMX: Defer SVI update to vmcs01 on EOI when L2 is active w/o
+ VID
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Chao Gao <chao.gao@intel.com>
+
+[ Upstream commit 04bc93cf49d16d01753b95ddb5d4f230b809a991 ]
+
+If KVM emulates an EOI for L1's virtual APIC while L2 is active, defer
+updating GUEST_INTERRUPT_STATUS.SVI, i.e. the VMCS's cache of the highest
+in-service IRQ, until L1 is active, as vmcs01, not vmcs02, needs to track
+vISR.  The missed SVI update for vmcs01 can result in L1 interrupts being
+incorrectly blocked, e.g. if there is a pending interrupt with lower
+priority than the interrupt that was EOI'd.
+
+This bug only affects use cases where L1's vAPIC is effectively passed
+through to L2, e.g. in a pKVM scenario where L2 is L1's deprivileged host,
+as KVM will only emulate an EOI for L1's vAPIC if Virtual Interrupt
+Delivery (VID) is disabled in vmcs12, and L1 isn't intercepting L2 accesses
+to its (virtual) APIC page (or if x2APIC is enabled, the EOI MSR).
+
+WARN() if KVM updates L1's ISR while L2 is active with VID enabled, as an
+EOI from L2 is supposed to affect L2's vAPIC, but still defer the update,
+to try to keep L1 alive.  Specifically, KVM forwards all APICv-related
+VM-Exits to L1 via nested_vmx_l1_wants_exit():
+
+       case EXIT_REASON_APIC_ACCESS:
+       case EXIT_REASON_APIC_WRITE:
+       case EXIT_REASON_EOI_INDUCED:
+               /*
+                * The controls for "virtualize APIC accesses," "APIC-
+                * register virtualization," and "virtual-interrupt
+                * delivery" only come from vmcs12.
+                */
+               return true;
+
+Fixes: c7c9c56ca26f ("x86, apicv: add virtual interrupt delivery support")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/kvm/20230312180048.1778187-1-jason.cj.chen@intel.com
+Reported-by: Markku Ahvenjärvi <mankku@gmail.com>
+Closes: https://lore.kernel.org/all/20240920080012.74405-1-mankku@gmail.com
+Cc: Janne Karhunen <janne.karhunen@gmail.com>
+Signed-off-by: Chao Gao <chao.gao@intel.com>
+[sean: drop request, handle in VMX, write changelog]
+Tested-by: Chao Gao <chao.gao@intel.com>
+Link: https://lore.kernel.org/r/20241128000010.4051275-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve minor syntactic conflict in lapic.h, account for lack of
+       kvm_x86_call(), drop sanity check due to lack of wants_to_run]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/lapic.c      | 11 +++++++++++
+ arch/x86/kvm/lapic.h      |  1 +
+ arch/x86/kvm/vmx/nested.c |  5 +++++
+ arch/x86/kvm/vmx/vmx.c    | 16 ++++++++++++++++
+ arch/x86/kvm/vmx/vmx.h    |  1 +
+ 5 files changed, 34 insertions(+)
+
+diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
+index cbf85a1ffb74..ba1c2a7f74f7 100644
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -803,6 +803,17 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
+       }
+ }
++void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu)
++{
++      struct kvm_lapic *apic = vcpu->arch.apic;
++
++      if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active)
++              return;
++
++      static_call(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
++}
++EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr);
++
+ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
+ {
+       /* This may race with setting of irr in __apic_accept_irq() and
+diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
+index 0a0ea4b5dd8c..0dd069b8d6d1 100644
+--- a/arch/x86/kvm/lapic.h
++++ b/arch/x86/kvm/lapic.h
+@@ -124,6 +124,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+ enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu);
++void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu);
+ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index d3e346a574f1..fdf7503491f9 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -4900,6 +4900,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+               kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+       }
++      if (vmx->nested.update_vmcs01_hwapic_isr) {
++              vmx->nested.update_vmcs01_hwapic_isr = false;
++              kvm_apic_update_hwapic_isr(vcpu);
++      }
++
+       if ((vm_exit_reason != -1) &&
+           (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
+               vmx->nested.need_vmcs12_to_shadow_sync = true;
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index cde01eb1f5e3..4563e7a9a851 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6839,6 +6839,22 @@ static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+       u16 status;
+       u8 old;
++      /*
++       * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
++       * is only relevant if and only if Virtual Interrupt Delivery is
++       * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's
++       * vAPIC, not L1's vAPIC.  KVM must update vmcs01 on the next nested
++       * VM-Exit, otherwise L1 will run with a stale SVI.
++       */
++      if (is_guest_mode(vcpu)) {
++              /*
++               * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
++               * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
++               */
++              to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
++              return;
++      }
++
+       if (max_isr == -1)
+               max_isr = 0;
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 6be1627d888e..88c5b7ebf9d3 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -177,6 +177,7 @@ struct nested_vmx {
+       bool reload_vmcs01_apic_access_page;
+       bool update_vmcs01_cpu_dirty_logging;
+       bool update_vmcs01_apicv_status;
++      bool update_vmcs01_hwapic_isr;
+       /*
+        * Enlightened VMCS has been enabled. It does not mean that L1 has to
+-- 
+2.50.1
+
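The "defer until the right VMCS is loaded" pattern used above boils down to a flag that is set when the update cannot be applied and replayed later; the sketch below is an illustrative model (the flag name follows the patch, while the struct, the callback and main() are invented for the example), not the real KVM code:

#include <stdbool.h>
#include <stdio.h>

struct vcpu_model {
        bool l2_active;                 /* nested guest currently running */
        bool update_vmcs01_hwapic_isr;  /* deferred SVI refresh pending   */
};

static void apply_svi_to_current_vmcs(void)
{
        printf("SVI written to the currently loaded VMCS\n");
}

/* Shape of vmx_hwapic_isr_update() after the patch. */
static void hwapic_isr_update(struct vcpu_model *v)
{
        if (v->l2_active) {
                /* vmcs02 is loaded; writing SVI now would hit the wrong VMCS. */
                v->update_vmcs01_hwapic_isr = true;
                return;
        }
        apply_svi_to_current_vmcs();
}

/* Shape of the nested VM-Exit path, once vmcs01 is loaded again. */
static void nested_vmexit(struct vcpu_model *v)
{
        v->l2_active = false;
        if (v->update_vmcs01_hwapic_isr) {
                v->update_vmcs01_hwapic_isr = false;
                apply_svi_to_current_vmcs();
        }
}

int main(void)
{
        struct vcpu_model v = { .l2_active = true };

        hwapic_isr_update(&v);  /* deferred: L2 is active        */
        nested_vmexit(&v);      /* replayed: vmcs01 is loaded    */
        return 0;
}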
diff --git a/queue-6.6/kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch b/queue-6.6/kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch
new file mode 100644 (file)
index 0000000..440e714
--- /dev/null
@@ -0,0 +1,123 @@
+From 5b61b1298fba19f1de5269df84bb30ea8bbb71f4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:22 -0700
+Subject: KVM: SVM: Set RFLAGS.IF=1 in C code, to get VMRUN out of the STI
+ shadow
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit be45bc4eff33d9a7dae84a2150f242a91a617402 ]
+
+Enable/disable local IRQs, i.e. set/clear RFLAGS.IF, in the common
+svm_vcpu_enter_exit() just after/before guest_state_{enter,exit}_irqoff()
+so that VMRUN is not executed in an STI shadow.  AMD CPUs have a quirk
+(some would say "bug"), where the STI shadow bleeds into the guest's
+intr_state field if a #VMEXIT occurs during injection of an event, i.e. if
+the VMRUN doesn't complete before the subsequent #VMEXIT.
+
+The spurious "interrupts masked" state is relatively benign, as it only
+occurs during event injection and is transient.  Because KVM is already
+injecting an event, the guest can't be in HLT, and if KVM is querying IRQ
+blocking for injection, then KVM would need to force an immediate exit
+anyway, since injecting multiple events is impossible.
+
+However, because KVM copies int_state verbatim from vmcb02 to vmcb12, the
+spurious STI shadow is visible to L1 when running a nested VM, which can
+trip sanity checks, e.g. in VMware's VMM.
+
+Hoist the STI+CLI all the way to C code, as the aforementioned calls to
+guest_state_{enter,exit}_irqoff() already inform lockdep that IRQs are
+enabled/disabled, and taking a fault on VMRUN with RFLAGS.IF=1 is already
+possible.  I.e. if there's kernel code that is confused by running with
+RFLAGS.IF=1, then it's already a problem.  In practice, since GIF=0 also
+blocks NMIs, the only change in exposure to non-KVM code (relative to
+surrounding VMRUN with STI+CLI) is exception handling code, and except for
+the kvm_rebooting=1 case, all exceptions in the core VM-Enter/VM-Exit path
+are fatal.
+
+Use the "raw" variants to enable/disable IRQs to avoid tracing in the
+"no instrumentation" code; the guest state helpers also take care of
+tracing IRQ state.
+
+Opportunistically document why KVM needs to do STI in the first place.
+
+Reported-by: Doug Covelli <doug.covelli@broadcom.com>
+Closes: https://lore.kernel.org/all/CADH9ctBs1YPmE4aCfGPNBwA10cA8RuAk2gO7542DjMZgs4uzJQ@mail.gmail.com
+Fixes: f14eec0a3203 ("KVM: SVM: move more vmentry code to assembly")
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Link: https://lore.kernel.org/r/20250224165442.2338294-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve minor syntactic conflict in __svm_sev_es_vcpu_run()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/svm/svm.c     | 14 ++++++++++++++
+ arch/x86/kvm/svm/vmenter.S |  9 +--------
+ 2 files changed, 15 insertions(+), 8 deletions(-)
+
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 86c50747e158..abbb84ddfe02 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4170,6 +4170,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+       guest_state_enter_irqoff();
++      /*
++       * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
++       * VMRUN controls whether or not physical IRQs are masked (KVM always
++       * runs with V_INTR_MASKING_MASK).  Toggle RFLAGS.IF here to avoid the
++       * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
++       * into guest state if delivery of an event during VMRUN triggers a
++       * #VMEXIT, and the guest_state transitions already tell lockdep that
++       * IRQs are being enabled/disabled.  Note!  GIF=0 for the entirety of
++       * this path, so IRQs aren't actually unmasked while running host code.
++       */
++      raw_local_irq_enable();
++
+       amd_clear_divider();
+       if (sev_es_guest(vcpu->kvm))
+@@ -4177,6 +4189,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+       else
+               __svm_vcpu_run(svm, spec_ctrl_intercepted);
++      raw_local_irq_disable();
++
+       guest_state_exit_irqoff();
+ }
+diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
+index 56fe34d9397f..81ecb9e1101d 100644
+--- a/arch/x86/kvm/svm/vmenter.S
++++ b/arch/x86/kvm/svm/vmenter.S
+@@ -171,12 +171,8 @@ SYM_FUNC_START(__svm_vcpu_run)
+       VM_CLEAR_CPU_BUFFERS
+       /* Enter guest mode */
+-      sti
+-
+ 3:    vmrun %_ASM_AX
+ 4:
+-      cli
+-
+       /* Pop @svm to RAX while it's the only available register. */
+       pop %_ASM_AX
+@@ -341,11 +337,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
+       VM_CLEAR_CPU_BUFFERS
+       /* Enter guest mode */
+-      sti
+-
+ 1:    vmrun %_ASM_AX
+-
+-2:    cli
++2:
+       /* Pop @svm to RDI, guest registers have been saved already. */
+       pop %_ASM_DI
+-- 
+2.50.1
+
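To make the ordering above easier to see at a glance, here is a stripped-down model of the entry/exit sequence after the patch. Every function below is a stand-in stub defined locally for the example (the names match the kernel APIs they model); only the placement of the IRQ toggles around the VMRUN call is the point:

static void guest_state_enter_irqoff(void) { /* lockdep told IRQs are "on" */ }
static void guest_state_exit_irqoff(void)  { /* lockdep told IRQs are "off" */ }
static void raw_local_irq_enable(void)     { /* stub for the real STI */ }
static void raw_local_irq_disable(void)    { /* stub for the real CLI */ }
static void __svm_vcpu_run(void)           { /* assembly VMRUN path, no STI/CLI */ }

static void svm_vcpu_enter_exit_model(void)
{
        guest_state_enter_irqoff();

        /* RFLAGS.IF is set in C, so VMRUN never executes inside an STI shadow. */
        raw_local_irq_enable();
        __svm_vcpu_run();
        raw_local_irq_disable();

        guest_state_exit_irqoff();
}

int main(void)
{
        svm_vcpu_enter_exit_model();
        return 0;
}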
diff --git a/queue-6.6/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch b/queue-6.6/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch
new file mode 100644 (file)
index 0000000..cbe9694
--- /dev/null
@@ -0,0 +1,63 @@
+From f4b282d7e8425cc2b1c48f385fa3c049a29e137b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:36 -0700
+Subject: KVM: VMX: Allow guest to set DEBUGCTL.RTM_DEBUG if RTM is supported
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 17ec2f965344ee3fd6620bef7ef68792f4ac3af0 ]
+
+Let the guest set DEBUGCTL.RTM_DEBUG if RTM is supported according to the
+guest CPUID model, as debug support is supposed to be available if RTM is
+supported, and there are no known downsides to letting the guest debug RTM
+aborts.
+
+Note, there are no known bug reports related to RTM_DEBUG, the primary
+motivation is to reduce the probability of breaking existing guests when a
+future change adds a missing consistency check on vmcs12.GUEST_DEBUGCTL
+(KVM currently lets L2 run with whatever hardware supports; whoops).
+
+Note #2, KVM already emulates DR6.RTM, and doesn't restrict access to
+DR7.RTM.
+
+Fixes: 83c529151ab0 ("KVM: x86: expose Intel cpu new features (HLE, RTM) to guest")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-5-seanjc@google.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/msr-index.h | 1 +
+ arch/x86/kvm/vmx/vmx.c           | 4 ++++
+ 2 files changed, 5 insertions(+)
+
+diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
+index 033855457581..723e48b57bd0 100644
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -380,6 +380,7 @@
+ #define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI     (1UL << 12)
+ #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
+ #define DEBUGCTLMSR_FREEZE_IN_SMM     (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
++#define DEBUGCTLMSR_RTM_DEBUG         BIT(15)
+ #define MSR_PEBS_FRONTEND             0x000003f7
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 08ca218ee858..359c3b7f52a1 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2161,6 +2161,10 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+           (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
+               debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
++      if (boot_cpu_has(X86_FEATURE_RTM) &&
++          (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_RTM)))
++              debugctl |= DEBUGCTLMSR_RTM_DEBUG;
++
+       return debugctl;
+ }
+-- 
+2.50.1
+
diff --git a/queue-6.6/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch b/queue-6.6/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch
new file mode 100644 (file)
index 0000000..33b3b73
--- /dev/null
@@ -0,0 +1,90 @@
+From b170b1c7fa1f5907611a190f0e1c6fa6d1ae712e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:37 -0700
+Subject: KVM: VMX: Extract checking of guest's DEBUGCTL into helper
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 8a4351ac302cd8c19729ba2636acfd0467c22ae8 ]
+
+Move VMX's logic to check DEBUGCTL values into a standalone helper so that
+the code can be used by nested VM-Enter to apply the same logic to the
+value being loaded from vmcs12.
+
+KVM needs to explicitly check vmcs12->guest_ia32_debugctl on nested
+VM-Enter, as hardware may support features that KVM does not, i.e. relying
+on hardware to detect invalid guest state will result in false negatives.
+Unfortunately, that means applying KVM's funky suppression of BTF and LBR
+to vmcs12 so as not to break existing guests.
+
+No functional change intended.
+
+Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-6-seanjc@google.com
+Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 29 +++++++++++++++++------------
+ 1 file changed, 17 insertions(+), 12 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 359c3b7f52a1..aaa767ed170e 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2168,6 +2168,19 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+       return debugctl;
+ }
++static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data,
++                                bool host_initiated)
++{
++      u64 invalid;
++
++      invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
++      if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
++              kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
++              invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
++      }
++      return !invalid;
++}
++
+ /*
+  * Writes msr value into the appropriate "register".
+  * Returns 0 on success, non-0 otherwise.
+@@ -2236,19 +2249,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+               }
+               vmcs_writel(GUEST_SYSENTER_ESP, data);
+               break;
+-      case MSR_IA32_DEBUGCTLMSR: {
+-              u64 invalid;
+-
+-              invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+-              if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
+-                      kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
+-                      data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+-                      invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+-              }
+-
+-              if (invalid)
++      case MSR_IA32_DEBUGCTLMSR:
++              if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
+                       return 1;
++              data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
++
+               if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
+                                               VM_EXIT_SAVE_DEBUG_CONTROLS)
+                       get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+@@ -2258,7 +2264,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+                   (data & DEBUGCTLMSR_LBR))
+                       intel_pmu_create_guest_lbr_event(vcpu);
+               return 0;
+-      }
+       case MSR_IA32_BNDCFGS:
+               if (!kvm_mpx_supported() ||
+                   (!msr_info->host_initiated &&
+-- 
+2.50.1
+
diff --git a/queue-6.6/kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch b/queue-6.6/kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch
new file mode 100644 (file)
index 0000000..5f21482
--- /dev/null
@@ -0,0 +1,56 @@
+From 10389ae08622b1effe126e28b1a647b66752a860 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:30 -0700
+Subject: KVM: VMX: Handle forced exit due to preemption timer in fastpath
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 11776aa0cfa7d007ad1799b1553bdcbd830e5010 ]
+
+Handle VMX preemption timer VM-Exits due to KVM forcing an exit in the
+exit fastpath, i.e. avoid calling back into handle_preemption_timer() for
+the same exit.  There is no work to be done for forced exits, as the name
+suggests, the goal is purely to get control back in KVM.
+
+In addition to shaving a few cycles, this will allow cleanly separating
+handle_fastpath_preemption_timer() from handle_preemption_timer(), e.g.
+it's not immediately obvious why _apparently_ calling
+handle_fastpath_preemption_timer() twice on a "slow" exit is necessary:
+the "slow" call is necessary to handle exits from L2, which are excluded
+from the fastpath by vmx_vcpu_run().
+
+Link: https://lore.kernel.org/r/20240110012705.506918-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 32b792387271..631fdd4a575a 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6027,12 +6027,15 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+       if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
+               return EXIT_FASTPATH_REENTER_GUEST;
+-      if (!vmx->req_immediate_exit) {
+-              kvm_lapic_expired_hv_timer(vcpu);
+-              return EXIT_FASTPATH_REENTER_GUEST;
+-      }
++      /*
++       * If the timer expired because KVM used it to force an immediate exit,
++       * then mission accomplished.
++       */
++      if (vmx->req_immediate_exit)
++              return EXIT_FASTPATH_EXIT_HANDLED;
+-      return EXIT_FASTPATH_NONE;
++      kvm_lapic_expired_hv_timer(vcpu);
++      return EXIT_FASTPATH_REENTER_GUEST;
+ }
+ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+-- 
+2.50.1
+
diff --git a/queue-6.6/kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch b/queue-6.6/kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch
new file mode 100644 (file)
index 0000000..fbc9dd4
--- /dev/null
@@ -0,0 +1,74 @@
+From 346070b6afc211b7d9c548666678021841dbbc67 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:32 -0700
+Subject: KVM: VMX: Handle KVM-induced preemption timer exits in fastpath for
+ L2
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 7b3d1bbf8d68d76fb21210932a5e8ed8ea80dbcc ]
+
+Eat VMX preemption timer exits in the fastpath regardless of whether L1 or
+L2 is active.  The VM-Exit is 100% KVM-induced, i.e. there is nothing
+directly related to the exit that KVM needs to do on behalf of the guest,
+thus there is no reason to wait until the slow path to do nothing.
+
+Opportunistically add comments explaining why preemption timer exits for
+emulating the guest's APIC timer need to go down the slow path.
+
+Link: https://lore.kernel.org/r/20240110012705.506918-6-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 22 ++++++++++++++++++++--
+ 1 file changed, 20 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 4c991d514015..0ecc0e996386 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6034,13 +6034,26 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+       if (vmx->req_immediate_exit)
+               return EXIT_FASTPATH_EXIT_HANDLED;
++      /*
++       * If L2 is active, go down the slow path as emulating the guest timer
++       * expiration likely requires synthesizing a nested VM-Exit.
++       */
++      if (is_guest_mode(vcpu))
++              return EXIT_FASTPATH_NONE;
++
+       kvm_lapic_expired_hv_timer(vcpu);
+       return EXIT_FASTPATH_REENTER_GUEST;
+ }
+ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+ {
+-      handle_fastpath_preemption_timer(vcpu);
++      /*
++       * This non-fastpath handler is reached if and only if the preemption
++       * timer was being used to emulate a guest timer while L2 is active.
++       * All other scenarios are supposed to be handled in the fastpath.
++       */
++      WARN_ON_ONCE(!is_guest_mode(vcpu));
++      kvm_lapic_expired_hv_timer(vcpu);
+       return 1;
+ }
+@@ -7258,7 +7271,12 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+ {
+-      if (is_guest_mode(vcpu))
++      /*
++       * Even when L2 is active, some VMX preemption timer exits can be
++       * handled in the fastpath; all other exits must use the slow path.
++       */
++      if (is_guest_mode(vcpu) &&
++          to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
+               return EXIT_FASTPATH_NONE;
+       switch (to_vmx(vcpu)->exit_reason.basic) {
+-- 
+2.50.1
+
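Taken together, the preemption-timer fastpath changes in this queue leave the decision logic looking roughly like the sketch below. This is an illustrative model with a local enum and context struct, not KVM's fastpath_t or vcpu_vmx definitions:

#include <stdbool.h>

enum fastpath_model {
        FASTPATH_NONE,          /* fall back to the full exit handler */
        FASTPATH_EXIT_HANDLED,  /* exit fully handled, stay in the run loop */
        FASTPATH_REENTER_GUEST, /* nothing to do, go straight back into the guest */
};

struct timer_exit_ctx {
        bool hv_timer_soft_disabled; /* timer armed with -1u, expiry is spurious */
        bool req_immediate_exit;     /* KVM used the timer to force this exit */
        bool l2_active;              /* emulating L1's timer while L2 runs */
};

static enum fastpath_model
handle_fastpath_preemption_timer_model(const struct timer_exit_ctx *c)
{
        if (c->hv_timer_soft_disabled)
                return FASTPATH_REENTER_GUEST;

        /* Forced exit: mission accomplished, nothing to emulate. */
        if (c->req_immediate_exit)
                return FASTPATH_EXIT_HANDLED;

        /* Emulating L1's timer while L2 runs likely needs a nested VM-Exit. */
        if (c->l2_active)
                return FASTPATH_NONE;

        /* kvm_lapic_expired_hv_timer() would run here. */
        return FASTPATH_REENTER_GUEST;
}

int main(void)
{
        struct timer_exit_ctx forced = { .req_immediate_exit = true };

        return handle_fastpath_preemption_timer_model(&forced) ==
               FASTPATH_EXIT_HANDLED ? 0 : 1;
}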
diff --git a/queue-6.6/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch b/queue-6.6/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch
new file mode 100644 (file)
index 0000000..ffe7a41
--- /dev/null
@@ -0,0 +1,191 @@
+From 40b4fc9e84bf81654f1ef6040150a04d2e2fc2fc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:40 -0700
+Subject: KVM: VMX: Preserve host's DEBUGCTLMSR_FREEZE_IN_SMM while running the
+ guest
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 6b1dd26544d045f6a79e8c73572c0c0db3ef3c1a ]
+
+Set/clear DEBUGCTLMSR_FREEZE_IN_SMM in GUEST_IA32_DEBUGCTL based on the
+host's pre-VM-Enter value, i.e. preserve the host's FREEZE_IN_SMM setting
+while running the guest.  When running with the "default treatment of SMIs"
+in effect (the only mode KVM supports), SMIs do not generate a VM-Exit that
+is visible to host (non-SMM) software; instead, the CPU transitions directly
+from VMX non-root to SMM.  And critically, DEBUGCTL isn't context switched
+by hardware on SMI or RSM, i.e. SMM will run with whatever value was
+resident in hardware at the time of the SMI.
+
+Failure to preserve FREEZE_IN_SMM results in the PMU unexpectedly counting
+events while the CPU is executing in SMM, which can pollute profiling and
+potentially leak information into the guest.
+
+Check for changes in FREEZE_IN_SMM prior to every entry into KVM's inner
+run loop, as the bit can be toggled in IRQ context via IPI callback (SMP
+function call), by way of /sys/devices/cpu/freeze_on_smi.
+
+Add a field in kvm_x86_ops to communicate which DEBUGCTL bits need to be
+preserved, as FREEZE_IN_SMM is only supported and defined for Intel CPUs,
+i.e. explicitly checking FREEZE_IN_SMM in common x86 is at best weird, and
+at worst could lead to undesirable behavior in the future if AMD CPUs ever
+happened to pick up a collision with the bit.
+
+Exempt TDX vCPUs, i.e. protected guests, from the check, as the TDX Module
+owns and controls GUEST_IA32_DEBUGCTL.
+
+WARN in SVM if KVM_RUN_LOAD_DEBUGCTL is set, mostly to document that the
+lack of handling isn't a KVM bug (TDX already WARNs on any run_flag).
+
+Lastly, explicitly reload GUEST_IA32_DEBUGCTL on a VM-Fail that is missed
+by KVM but detected by hardware, i.e. in nested_vmx_restore_host_state().
+Doing so avoids the need to track host_debugctl on a per-VMCS basis, as
+GUEST_IA32_DEBUGCTL is unconditionally written by prepare_vmcs02() and
+load_vmcs12_host_state().  For the VM-Fail case, even though KVM won't
+have actually entered the guest, vcpu_enter_guest() will have run with
+vmcs02 active and thus could result in vmcs01 being run with a stale value.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Co-developed-by: Sean Christopherson <seanjc@google.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-9-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: move vmx/main.c change to vmx/vmx.c]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h |  7 +++++++
+ arch/x86/kvm/vmx/nested.c       |  3 +++
+ arch/x86/kvm/vmx/vmx.c          |  5 +++++
+ arch/x86/kvm/vmx/vmx.h          | 15 ++++++++++++++-
+ arch/x86/kvm/x86.c              | 14 ++++++++++++--
+ 5 files changed, 41 insertions(+), 3 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 7373b22c02a7..813887324d52 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1553,6 +1553,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+ enum kvm_x86_run_flags {
+       KVM_RUN_FORCE_IMMEDIATE_EXIT    = BIT(0),
+       KVM_RUN_LOAD_GUEST_DR6          = BIT(1),
++      KVM_RUN_LOAD_DEBUGCTL           = BIT(2),
+ };
+ struct kvm_x86_ops {
+@@ -1580,6 +1581,12 @@ struct kvm_x86_ops {
+       void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+       void (*vcpu_put)(struct kvm_vcpu *vcpu);
++      /*
++       * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to
++       * match the host's value even while the guest is active.
++       */
++      const u64 HOST_OWNED_DEBUGCTL;
++
+       void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
+       int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+       int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index 2ce39ffbcefb..d2fa192d7ce7 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -4688,6 +4688,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+                       WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+       }
++      /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
++      vmx_reload_guest_debugctl(vcpu);
++
+       /*
+        * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+        * handle a variety of side effects to KVM's software model.
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index d0973bd7853c..9b1f22bcb716 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7399,6 +7399,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+       if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+               set_debugreg(vcpu->arch.dr6, 6);
++      if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
++              vmx_reload_guest_debugctl(vcpu);
++
+       /*
+        * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
+        * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+@@ -8326,6 +8329,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+       .vcpu_load = vmx_vcpu_load,
+       .vcpu_put = vmx_vcpu_put,
++      .HOST_OWNED_DEBUGCTL = DEBUGCTLMSR_FREEZE_IN_SMM,
++
+       .update_exception_bitmap = vmx_update_exception_bitmap,
+       .get_msr_feature = vmx_get_msr_feature,
+       .get_msr = vmx_get_msr,
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 769e70fd142c..5d73d3e570d7 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -434,12 +434,25 @@ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+ static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
+ {
++      WARN_ON_ONCE(val & DEBUGCTLMSR_FREEZE_IN_SMM);
++
++      val |= vcpu->arch.host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM;
+       vmcs_write64(GUEST_IA32_DEBUGCTL, val);
+ }
+ static inline u64 vmx_guest_debugctl_read(void)
+ {
+-      return vmcs_read64(GUEST_IA32_DEBUGCTL);
++      return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~DEBUGCTLMSR_FREEZE_IN_SMM;
++}
++
++static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu)
++{
++      u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL);
++
++      if (!((val ^ vcpu->arch.host_debugctl) & DEBUGCTLMSR_FREEZE_IN_SMM))
++              return;
++
++      vmx_guest_debugctl_write(vcpu, val & ~DEBUGCTLMSR_FREEZE_IN_SMM);
+ }
+ /*
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index fbb2e70e3031..fc2cafc33b37 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10518,7 +10518,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               dm_request_for_irq_injection(vcpu) &&
+               kvm_cpu_accept_dm_intr(vcpu);
+       fastpath_t exit_fastpath;
+-      u64 run_flags;
++      u64 run_flags, debug_ctl;
+       bool req_immediate_exit = false;
+@@ -10777,7 +10777,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               set_debugreg(0, 7);
+       }
+-      vcpu->arch.host_debugctl = get_debugctlmsr();
++      /*
++       * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL
++       * can be modified in IRQ context, e.g. via SMP function calls.  Inform
++       * vendor code if any host-owned bits were changed, e.g. so that the
++       * value loaded into hardware while running the guest can be updated.
++       */
++      debug_ctl = get_debugctlmsr();
++      if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
++          !vcpu->arch.guest_state_protected)
++              run_flags |= KVM_RUN_LOAD_DEBUGCTL;
++      vcpu->arch.host_debugctl = debug_ctl;
+       guest_timing_enter_irqoff();
+-- 
+2.50.1
+
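The per-entry FREEZE_IN_SMM refresh described above reduces to a little bit arithmetic; the following sketch models it with plain integers (the bit positions and flag value follow the patch, while the function itself is an invented stand-in for the snippet added to vcpu_enter_guest()):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEBUGCTLMSR_FREEZE_IN_SMM (1ULL << 14)
#define KVM_RUN_LOAD_DEBUGCTL     (1ULL << 2)

/*
 * Compare the freshly read host DEBUGCTL against the previous snapshot and
 * report whether a host-owned bit changed, i.e. whether vendor code must
 * rewrite GUEST_IA32_DEBUGCTL before entering the guest.
 */
static uint64_t refresh_host_debugctl(uint64_t *snapshot, uint64_t current_msr,
                                      uint64_t host_owned, bool guest_state_protected)
{
        uint64_t run_flags = 0;

        if (((current_msr ^ *snapshot) & host_owned) && !guest_state_protected)
                run_flags |= KVM_RUN_LOAD_DEBUGCTL;

        *snapshot = current_msr;
        return run_flags;
}

int main(void)
{
        uint64_t snapshot = 0;

        /* An IPI toggled freeze_on_smi between entries: a reload is requested. */
        uint64_t flags = refresh_host_debugctl(&snapshot, DEBUGCTLMSR_FREEZE_IN_SMM,
                                               DEBUGCTLMSR_FREEZE_IN_SMM, false);
        printf("run_flags = %#llx\n", (unsigned long long)flags);
        return 0;
}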
diff --git a/queue-6.6/kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch b/queue-6.6/kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch
new file mode 100644 (file)
index 0000000..82cd3d2
--- /dev/null
@@ -0,0 +1,49 @@
+From 010b3aed9b879bc35a20a52e2435f99c018fd9bc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:29 -0700
+Subject: KVM: VMX: Re-enter guest in fastpath for "spurious" preemption timer
+ exits
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit e6b5d16bbd2d4c8259ad76aa33de80d561aba5f9 ]
+
+Re-enter the guest in the fast path if the VMX preemption timer VM-Exit was
+"spurious", i.e. if KVM "soft disabled" the timer by writing -1u and by
+some miracle the timer expired before any other VM-Exit occurred.  This is
+just an intermediate step toward cleaning up the preemption timer handling;
+optimizing these types of spurious VM-Exits is not interesting, as they are
+extremely rare/infrequent.
+
+Link: https://lore.kernel.org/r/20240110012705.506918-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index ee501871ddb0..32b792387271 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6019,8 +6019,15 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+-      if (!vmx->req_immediate_exit &&
+-          !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
++      /*
++       * In the *extremely* unlikely scenario that this is a spurious VM-Exit
++       * due to the timer expiring while it was "soft" disabled, just eat the
++       * exit and re-enter the guest.
++       */
++      if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
++              return EXIT_FASTPATH_REENTER_GUEST;
++
++      if (!vmx->req_immediate_exit) {
+               kvm_lapic_expired_hv_timer(vcpu);
+               return EXIT_FASTPATH_REENTER_GUEST;
+       }
+-- 
+2.50.1
+
diff --git a/queue-6.6/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch b/queue-6.6/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch
new file mode 100644 (file)
index 0000000..3dfeefd
--- /dev/null
@@ -0,0 +1,162 @@
+From 2277369c1b499bf85b3b553e281b264495bb2514 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:39 -0700
+Subject: KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 7d0cce6cbe71af6e9c1831bff101a2b9c249c4a2 ]
+
+Introduce vmx_guest_debugctl_{read,write}() to handle all accesses to
+vmcs.GUEST_IA32_DEBUGCTL. This will allow stuffing FREEZE_IN_SMM into
+GUEST_IA32_DEBUGCTL based on the host setting without bleeding the state
+into the guest, and without needing to copy+paste the FREEZE_IN_SMM
+logic into every patch that accesses GUEST_IA32_DEBUGCTL.
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+[sean: massage changelog, make inline, use in all prepare_vmcs02() cases]
+Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-8-seanjc@google.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/nested.c    | 10 +++++-----
+ arch/x86/kvm/vmx/pmu_intel.c |  8 ++++----
+ arch/x86/kvm/vmx/vmx.c       |  8 +++++---
+ arch/x86/kvm/vmx/vmx.h       | 10 ++++++++++
+ 4 files changed, 24 insertions(+), 12 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index 10236ecdad95..2ce39ffbcefb 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2564,11 +2564,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+       if (vmx->nested.nested_run_pending &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+               kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+-              vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
+-                                                vmx_get_supported_debugctl(vcpu, false));
++              vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
++                                             vmx_get_supported_debugctl(vcpu, false));
+       } else {
+               kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+-              vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
++              vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
+       }
+       if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
+           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+@@ -3433,7 +3433,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+       if (!vmx->nested.nested_run_pending ||
+           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+-              vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
++              vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
+       if (kvm_mpx_supported() &&
+           (!vmx->nested.nested_run_pending ||
+            !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+@@ -4633,7 +4633,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
+       kvm_set_dr(vcpu, 7, 0x400);
+-      vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
++      vmx_guest_debugctl_write(vcpu, 0);
+       if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+                               vmcs12->vm_exit_msr_load_count))
+diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
+index 48a2f77f62ef..50364e00e4e9 100644
+--- a/arch/x86/kvm/vmx/pmu_intel.c
++++ b/arch/x86/kvm/vmx/pmu_intel.c
+@@ -633,11 +633,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
+  */
+ static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
+ {
+-      u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
++      u64 data = vmx_guest_debugctl_read();
+       if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
+               data &= ~DEBUGCTLMSR_LBR;
+-              vmcs_write64(GUEST_IA32_DEBUGCTL, data);
++              vmx_guest_debugctl_write(vcpu, data);
+       }
+ }
+@@ -707,7 +707,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+       if (!lbr_desc->event) {
+               vmx_disable_lbr_msrs_passthrough(vcpu);
+-              if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
++              if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)
+                       goto warn;
+               if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
+                       goto warn;
+@@ -729,7 +729,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
+ {
+-      if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
++      if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR))
+               intel_pmu_release_guest_lbr_event(vcpu);
+ }
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 32f1a38a1010..d0973bd7853c 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2124,7 +2124,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+                       msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
+               break;
+       case MSR_IA32_DEBUGCTLMSR:
+-              msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
++              msr_info->data = vmx_guest_debugctl_read();
+               break;
+       default:
+       find_uret_msr:
+@@ -2258,7 +2258,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+                                               VM_EXIT_SAVE_DEBUG_CONTROLS)
+                       get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+-              vmcs_write64(GUEST_IA32_DEBUGCTL, data);
++              vmx_guest_debugctl_write(vcpu, data);
++
+               if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
+                   (data & DEBUGCTLMSR_LBR))
+                       intel_pmu_create_guest_lbr_event(vcpu);
+@@ -4826,7 +4827,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
+       vmcs_write32(GUEST_SYSENTER_CS, 0);
+       vmcs_writel(GUEST_SYSENTER_ESP, 0);
+       vmcs_writel(GUEST_SYSENTER_EIP, 0);
+-      vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
++
++      vmx_guest_debugctl_write(&vmx->vcpu, 0);
+       if (cpu_has_vmx_tpr_shadow()) {
+               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 5816fdd2dfa8..769e70fd142c 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -432,6 +432,16 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+ u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
+ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
++static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
++{
++      vmcs_write64(GUEST_IA32_DEBUGCTL, val);
++}
++
++static inline u64 vmx_guest_debugctl_read(void)
++{
++      return vmcs_read64(GUEST_IA32_DEBUGCTL);
++}
++
+ /*
+  * Note, early Intel manuals have the write-low and read-high bitmap offsets
+  * the wrong way round.  The bitmaps control MSRs 0x00000000-0x00001fff and
+-- 
+2.50.1
+
diff --git a/queue-6.6/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch b/queue-6.6/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch
new file mode 100644 (file)
index 0000000..b337a35
--- /dev/null
@@ -0,0 +1,138 @@
+From ab60a5a234aeb79d78d4830caee1d001313cd5e5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:34 -0700
+Subject: KVM: x86: Convert vcpu_run()'s immediate exit param into a generic
+ bitmap
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 2478b1b220c49d25cb1c3f061ec4f9b351d9a131 ]
+
+Convert kvm_x86_ops.vcpu_run()'s "force_immediate_exit" boolean parameter
+into an a generic bitmap so that similar "take action" information can be
+passed to vendor code without creating a pile of boolean parameters.
+
+This will allow dropping kvm_x86_ops.set_dr6() in favor of a new flag, and
+will also allow for adding similar functionality for re-loading debugctl
+in the active VMCS.
+
+Opportunistically massage the TDX WARN and comment to prepare for adding
+more run_flags, all of which are expected to be mutually exclusive with
+TDX, i.e. should be WARNed on.
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: drop TDX crud, account for lack of kvm_x86_call()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h |  6 +++++-
+ arch/x86/kvm/svm/svm.c          |  4 ++--
+ arch/x86/kvm/vmx/vmx.c          |  3 ++-
+ arch/x86/kvm/x86.c              | 10 ++++++++--
+ 4 files changed, 17 insertions(+), 6 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 8898ad8cb3de..aa6d04cd9ee6 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1550,6 +1550,10 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+       return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
+ }
++enum kvm_x86_run_flags {
++      KVM_RUN_FORCE_IMMEDIATE_EXIT    = BIT(0),
++};
++
+ struct kvm_x86_ops {
+       const char *name;
+@@ -1625,7 +1629,7 @@ struct kvm_x86_ops {
+       int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
+       enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
+-                                                bool force_immediate_exit);
++                                                u64 run_flags);
+       int (*handle_exit)(struct kvm_vcpu *vcpu,
+               enum exit_fastpath_completion exit_fastpath);
+       int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 4a53b38ea386..61e5e261cde2 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4197,9 +4197,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+       guest_state_exit_irqoff();
+ }
+-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+-                                        bool force_immediate_exit)
++static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ {
++      bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 704e5a552b4f..065aac2f4bce 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7345,8 +7345,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+       guest_state_exit_irqoff();
+ }
+-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
++static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ {
++      bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long cr3, cr4;
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index f3150d9a1918..ecc151397341 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10518,6 +10518,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               dm_request_for_irq_injection(vcpu) &&
+               kvm_cpu_accept_dm_intr(vcpu);
+       fastpath_t exit_fastpath;
++      u64 run_flags;
+       bool req_immediate_exit = false;
+@@ -10750,8 +10751,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               goto cancel_injection;
+       }
+-      if (req_immediate_exit)
++      run_flags = 0;
++      if (req_immediate_exit) {
++              run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
++      }
+       fpregs_assert_state_consistent();
+       if (test_thread_flag(TIF_NEED_FPU_LOAD))
+@@ -10787,7 +10791,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+                            (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
+-              exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
++              exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, run_flags);
+               if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+                       break;
+@@ -10799,6 +10803,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+                       break;
+               }
++              run_flags = 0;
++
+               /* Note, VM-Exits that go down the "slow" path are accounted below. */
+               ++vcpu->stat.exits;
+       }
+-- 
+2.50.1
+
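The bool-to-bitmap conversion above is a small but recurring API pattern; the standalone sketch below illustrates it. The flag values track the patch and its follow-up in this queue, while everything else, including the printf-based "vendor" hook, is invented for the example:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum run_flags_model {
        RUN_FORCE_IMMEDIATE_EXIT = 1u << 0,
        RUN_LOAD_GUEST_DR6       = 1u << 1, /* added by the next patch in the queue */
};

/* Stand-in for the vendor vcpu_run() hook consuming the bitmap. */
static void vcpu_run_model(uint64_t run_flags)
{
        if (run_flags & RUN_FORCE_IMMEDIATE_EXIT)
                printf("arm the preemption timer for an immediate exit\n");
        if (run_flags & RUN_LOAD_GUEST_DR6)
                printf("reload DR6 from the vCPU snapshot\n");
}

int main(void)
{
        uint64_t run_flags = 0;
        bool req_immediate_exit = true;

        if (req_immediate_exit)
                run_flags |= RUN_FORCE_IMMEDIATE_EXIT;

        vcpu_run_model(run_flags);

        /* Flags are recomputed on each iteration of the inner run loop. */
        run_flags = 0;
        return 0;
}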
diff --git a/queue-6.6/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch b/queue-6.6/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch
new file mode 100644 (file)
index 0000000..124fa5c
--- /dev/null
@@ -0,0 +1,144 @@
+From 22c51f0290ecf799d1bb5992d6add57aaa64597f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:35 -0700
+Subject: KVM: x86: Drop kvm_x86_ops.set_dr6() in favor of a new KVM_RUN flag
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 80c64c7afea1da6a93ebe88d3d29d8a60377ef80 ]
+
+Instruct vendor code to load the guest's DR6 into hardware via a new
+KVM_RUN flag, and remove kvm_x86_ops.set_dr6(), whose sole purpose was to
+load vcpu->arch.dr6 into hardware when DR6 can be read/written directly
+by the guest.
+
+Note, TDX already WARNs on any run_flag being set, i.e. will yell if KVM
+thinks DR6 needs to be reloaded.  TDX vCPUs force KVM_DEBUGREG_AUTO_SWITCH
+and never clear the flag, i.e. should never observe KVM_RUN_LOAD_GUEST_DR6.
+
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: account for lack of vmx/main.c]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm-x86-ops.h |  1 -
+ arch/x86/include/asm/kvm_host.h    |  2 +-
+ arch/x86/kvm/svm/svm.c             | 10 ++++++----
+ arch/x86/kvm/vmx/vmx.c             | 10 +++-------
+ arch/x86/kvm/x86.c                 |  2 +-
+ 5 files changed, 11 insertions(+), 14 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
+index 8fe6667d945f..a0a4fc684e63 100644
+--- a/arch/x86/include/asm/kvm-x86-ops.h
++++ b/arch/x86/include/asm/kvm-x86-ops.h
+@@ -48,7 +48,6 @@ KVM_X86_OP(set_idt)
+ KVM_X86_OP(get_gdt)
+ KVM_X86_OP(set_gdt)
+ KVM_X86_OP(sync_dirty_debug_regs)
+-KVM_X86_OP(set_dr6)
+ KVM_X86_OP(set_dr7)
+ KVM_X86_OP(cache_reg)
+ KVM_X86_OP(get_rflags)
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index aa6d04cd9ee6..7373b22c02a7 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1552,6 +1552,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+ enum kvm_x86_run_flags {
+       KVM_RUN_FORCE_IMMEDIATE_EXIT    = BIT(0),
++      KVM_RUN_LOAD_GUEST_DR6          = BIT(1),
+ };
+ struct kvm_x86_ops {
+@@ -1600,7 +1601,6 @@ struct kvm_x86_ops {
+       void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+       void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+       void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
+-      void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
+       void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
+       void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+       unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 61e5e261cde2..abff6d45ae33 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4241,10 +4241,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+       svm_hv_update_vp_id(svm->vmcb, vcpu);
+       /*
+-       * Run with all-zero DR6 unless needed, so that we can get the exact cause
+-       * of a #DB.
++       * Run with all-zero DR6 unless the guest can write DR6 freely, so that
++       * KVM can get the exact cause of a #DB.  Note, loading guest DR6 from
++       * KVM's snapshot is only necessary when DR accesses won't exit.
+        */
+-      if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
++      if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
++              svm_set_dr6(vcpu, vcpu->arch.dr6);
++      else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+               svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
+       clgi();
+@@ -5021,7 +5024,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
+       .set_idt = svm_set_idt,
+       .get_gdt = svm_get_gdt,
+       .set_gdt = svm_set_gdt,
+-      .set_dr6 = svm_set_dr6,
+       .set_dr7 = svm_set_dr7,
+       .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
+       .cache_reg = svm_cache_reg,
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 065aac2f4bce..08ca218ee858 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -5616,12 +5616,6 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+       set_debugreg(DR6_RESERVED, 6);
+ }
+-static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+-{
+-      lockdep_assert_irqs_disabled();
+-      set_debugreg(vcpu->arch.dr6, 6);
+-}
+-
+ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+ {
+       vmcs_writel(GUEST_DR7, val);
+@@ -7392,6 +7386,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+               vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+       vcpu->arch.regs_dirty = 0;
++      if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
++              set_debugreg(vcpu->arch.dr6, 6);
++
+       /*
+        * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
+        * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+@@ -8337,7 +8334,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+       .set_idt = vmx_set_idt,
+       .get_gdt = vmx_get_gdt,
+       .set_gdt = vmx_set_gdt,
+-      .set_dr6 = vmx_set_dr6,
+       .set_dr7 = vmx_set_dr7,
+       .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
+       .cache_reg = vmx_cache_reg,
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index ecc151397341..fbb2e70e3031 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10772,7 +10772,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               set_debugreg(vcpu->arch.eff_db[3], 3);
+               /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+               if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+-                      static_call(kvm_x86_set_dr6)(vcpu, vcpu->arch.dr6);
++                      run_flags |= KVM_RUN_LOAD_GUEST_DR6;
+       } else if (unlikely(hw_breakpoint_active())) {
+               set_debugreg(0, 7);
+       }
+-- 
+2.50.1
+
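
A minimal userspace C sketch of the run_flags idea used above, i.e. common code
accumulates per-run intent in a 64-bit bitmask and each vendor hook tests only
the bits it understands. The enum values and function names are illustrative,
not the kernel's:

#include <stdint.h>
#include <stdio.h>

#define BIT(n) (1ULL << (n))

/* Shaped like kvm_x86_run_flags; the values here are illustrative. */
enum run_flags {
	RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
	RUN_LOAD_GUEST_DR6       = BIT(1),
};

/* Stand-in for a vendor vcpu_run hook: acts only on the bits it knows. */
static void vendor_vcpu_run(uint64_t run_flags)
{
	if (run_flags & RUN_LOAD_GUEST_DR6)
		puts("load guest DR6 into hardware before entry");
	if (run_flags & RUN_FORCE_IMMEDIATE_EXIT)
		puts("arrange for an immediate VM-Exit");
}

int main(void)
{
	uint64_t run_flags = 0;

	/* Common code decides the intent, then makes a single call. */
	run_flags |= RUN_LOAD_GUEST_DR6;
	vendor_vcpu_run(run_flags);
	return 0;
}
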
diff --git a/queue-6.6/kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch b/queue-6.6/kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch
new file mode 100644 (file)
index 0000000..53bf15f
--- /dev/null
@@ -0,0 +1,265 @@
+From f141c80dbc877633ec0fb299da98a44a81d7c5aa Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:33 -0700
+Subject: KVM: x86: Fully defer to vendor code to decide how to force immediate
+ exit
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 0ec3d6d1f169baa7fc512ae4b78d17e7c94b7763 ]
+
+Now that vmx->req_immediate_exit is used only in the scope of
+vmx_vcpu_run(), use force_immediate_exit to detect that KVM should usurp
+the VMX preemption timer to force a VM-Exit and let vendor code fully handle
+forcing a VM-Exit.
+
+Opportunistically drop __kvm_request_immediate_exit() and just have
+vendor code call smp_send_reschedule() directly.  SVM already does this
+when injecting an event while also trying to single-step an IRET, i.e.
+it's not exactly secret knowledge that KVM uses a reschedule IPI to force
+an exit.
+
+Link: https://lore.kernel.org/r/20240110012705.506918-7-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve absurd conflict due to funky kvm_x86_ops.sched_in prototype]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm-x86-ops.h |  1 -
+ arch/x86/include/asm/kvm_host.h    |  3 ---
+ arch/x86/kvm/svm/svm.c             |  7 ++++---
+ arch/x86/kvm/vmx/vmx.c             | 32 +++++++++++++-----------------
+ arch/x86/kvm/vmx/vmx.h             |  2 --
+ arch/x86/kvm/x86.c                 | 10 +---------
+ 6 files changed, 19 insertions(+), 36 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
+index e59ded976166..8fe6667d945f 100644
+--- a/arch/x86/include/asm/kvm-x86-ops.h
++++ b/arch/x86/include/asm/kvm-x86-ops.h
+@@ -102,7 +102,6 @@ KVM_X86_OP(write_tsc_multiplier)
+ KVM_X86_OP(get_exit_info)
+ KVM_X86_OP(check_intercept)
+ KVM_X86_OP(handle_exit_irqoff)
+-KVM_X86_OP(request_immediate_exit)
+ KVM_X86_OP(sched_in)
+ KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
+ KVM_X86_OP_OPTIONAL(vcpu_blocking)
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 5703600a454e..8898ad8cb3de 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1695,8 +1695,6 @@ struct kvm_x86_ops {
+                              struct x86_exception *exception);
+       void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
+-      void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
+-
+       void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
+       /*
+@@ -2182,7 +2180,6 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
+ void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
+                                    u32 size);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index f42c6ef7dc20..4a53b38ea386 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4222,9 +4222,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+                * is enough to force an immediate vmexit.
+                */
+               disable_nmi_singlestep(svm);
+-              smp_send_reschedule(vcpu->cpu);
++              force_immediate_exit = true;
+       }
++      if (force_immediate_exit)
++              smp_send_reschedule(vcpu->cpu);
++
+       pre_svm_run(vcpu);
+       sync_lapic_to_cr8(vcpu);
+@@ -5075,8 +5078,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
+       .check_intercept = svm_check_intercept,
+       .handle_exit_irqoff = svm_handle_exit_irqoff,
+-      .request_immediate_exit = __kvm_request_immediate_exit,
+-
+       .sched_in = svm_sched_in,
+       .nested_ops = &svm_nested_ops,
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 0ecc0e996386..704e5a552b4f 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -49,6 +49,8 @@
+ #include <asm/spec-ctrl.h>
+ #include <asm/vmx.h>
++#include <trace/events/ipi.h>
++
+ #include "capabilities.h"
+ #include "cpuid.h"
+ #include "hyperv.h"
+@@ -1304,8 +1306,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+       u16 fs_sel, gs_sel;
+       int i;
+-      vmx->req_immediate_exit = false;
+-
+       /*
+        * Note that guest MSRs to be saved/restored can also be changed
+        * when guest state is loaded. This happens when guest transitions
+@@ -6015,7 +6015,8 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
+       return 1;
+ }
+-static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
++static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
++                                                 bool force_immediate_exit)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+@@ -6031,7 +6032,7 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+        * If the timer expired because KVM used it to force an immediate exit,
+        * then mission accomplished.
+        */
+-      if (vmx->req_immediate_exit)
++      if (force_immediate_exit)
+               return EXIT_FASTPATH_EXIT_HANDLED;
+       /*
+@@ -7210,13 +7211,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
+                                       msrs[i].host, false);
+ }
+-static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
++static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 tscl;
+       u32 delta_tsc;
+-      if (vmx->req_immediate_exit) {
++      if (force_immediate_exit) {
+               vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
+               vmx->loaded_vmcs->hv_timer_soft_disabled = false;
+       } else if (vmx->hv_deadline_tsc != -1) {
+@@ -7269,7 +7270,8 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+       barrier_nospec();
+ }
+-static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
++static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
++                                           bool force_immediate_exit)
+ {
+       /*
+        * If L2 is active, some VMX preemption timer exits can be handled in
+@@ -7283,7 +7285,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+       case EXIT_REASON_MSR_WRITE:
+               return handle_fastpath_set_msr_irqoff(vcpu);
+       case EXIT_REASON_PREEMPTION_TIMER:
+-              return handle_fastpath_preemption_timer(vcpu);
++              return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
+       default:
+               return EXIT_FASTPATH_NONE;
+       }
+@@ -7425,7 +7427,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+               vmx_passthrough_lbr_msrs(vcpu);
+       if (enable_preemption_timer)
+-              vmx_update_hv_timer(vcpu);
++              vmx_update_hv_timer(vcpu, force_immediate_exit);
++      else if (force_immediate_exit)
++              smp_send_reschedule(vcpu->cpu);
+       kvm_wait_lapic_expire(vcpu);
+@@ -7489,7 +7493,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+       vmx_recover_nmi_blocking(vmx);
+       vmx_complete_interrupts(vmx);
+-      return vmx_exit_handlers_fastpath(vcpu);
++      return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
+ }
+ static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
+@@ -7988,11 +7992,6 @@ static __init void vmx_set_cpu_caps(void)
+               kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
+ }
+-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
+-{
+-      to_vmx(vcpu)->req_immediate_exit = true;
+-}
+-
+ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+                                 struct x86_instruction_info *info)
+ {
+@@ -8404,8 +8403,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+       .check_intercept = vmx_check_intercept,
+       .handle_exit_irqoff = vmx_handle_exit_irqoff,
+-      .request_immediate_exit = vmx_request_immediate_exit,
+-
+       .sched_in = vmx_sched_in,
+       .cpu_dirty_log_size = PML_ENTITY_NUM,
+@@ -8663,7 +8660,6 @@ static __init int hardware_setup(void)
+       if (!enable_preemption_timer) {
+               vmx_x86_ops.set_hv_timer = NULL;
+               vmx_x86_ops.cancel_hv_timer = NULL;
+-              vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
+       }
+       kvm_caps.supported_mce_cap |= MCG_LMCE_P;
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index fb36bde2dd87..50d32d830890 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -331,8 +331,6 @@ struct vcpu_vmx {
+       unsigned int ple_window;
+       bool ple_window_dirty;
+-      bool req_immediate_exit;
+-
+       /* Support for PML */
+ #define PML_ENTITY_NUM                512
+       struct page *pml_pg;
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index d04066099567..f3150d9a1918 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10505,12 +10505,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+       static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
+ }
+-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
+-{
+-      smp_send_reschedule(vcpu->cpu);
+-}
+-EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
+-
+ /*
+  * Called within kvm->srcu read side.
+  * Returns 1 to let vcpu_run() continue the guest execution loop without
+@@ -10756,10 +10750,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               goto cancel_injection;
+       }
+-      if (req_immediate_exit) {
++      if (req_immediate_exit)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
+-              static_call(kvm_x86_request_immediate_exit)(vcpu);
+-      }
+       fpregs_assert_state_consistent();
+       if (test_thread_flag(TIF_NEED_FPU_LOAD))
+-- 
+2.50.1
+
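
A hedged sketch of the refactoring shape above: per-run intent moves from a
long-lived per-vCPU field (vmx->req_immediate_exit), set in one place and
consumed elsewhere, to a parameter threaded through the call chain, so there is
nothing to reset between runs. Names below are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

/* The caller that knows the intent hands it straight to the code that acts
 * on it. */
static void update_hv_timer(bool force_immediate_exit)
{
	if (force_immediate_exit)
		puts("program the preemption timer to fire immediately");
	else
		puts("program the preemption timer from the deadline");
}

static void vcpu_run(bool force_immediate_exit)
{
	update_hv_timer(force_immediate_exit);
}

int main(void)
{
	vcpu_run(true);		/* e.g. an event must be injected right after entry */
	vcpu_run(false);
	return 0;
}
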
diff --git a/queue-6.6/kvm-x86-hyper-v-skip-non-canonical-addresses-during-.patch b/queue-6.6/kvm-x86-hyper-v-skip-non-canonical-addresses-during-.patch
new file mode 100644 (file)
index 0000000..853ce19
--- /dev/null
@@ -0,0 +1,75 @@
+From 21d37a330aba310a5c2dc24ee8eea174acdfb829 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:21 -0700
+Subject: KVM: x86/hyper-v: Skip non-canonical addresses during PV TLB flush
+
+From: Manuel Andreas <manuel.andreas@tum.de>
+
+[ Upstream commit fa787ac07b3ceb56dd88a62d1866038498e96230 ]
+
+In KVM guests with Hyper-V hypercalls enabled, the hypercalls
+HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST and HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX
+allow a guest to request invalidation of portions of a virtual TLB.
+For this, the hypercall parameter includes a list of GVAs that are supposed
+to be invalidated.
+
+However, when non-canonical GVAs are passed, there is currently no
+filtering in place and they are eventually passed to checked invocations of
+INVVPID on Intel / INVLPGA on AMD.  While AMD's INVLPGA silently ignores
+non-canonical addresses (effectively a no-op), Intel's INVVPID explicitly
+signals VM-Fail and ultimately triggers the WARN_ONCE in invvpid_error():
+
+  invvpid failed: ext=0x0 vpid=1 gva=0xaaaaaaaaaaaaa000
+  WARNING: CPU: 6 PID: 326 at arch/x86/kvm/vmx/vmx.c:482
+  invvpid_error+0x91/0xa0 [kvm_intel]
+  Modules linked in: kvm_intel kvm 9pnet_virtio irqbypass fuse
+  CPU: 6 UID: 0 PID: 326 Comm: kvm-vm Not tainted 6.15.0 #14 PREEMPT(voluntary)
+  RIP: 0010:invvpid_error+0x91/0xa0 [kvm_intel]
+  Call Trace:
+    vmx_flush_tlb_gva+0x320/0x490 [kvm_intel]
+    kvm_hv_vcpu_flush_tlb+0x24f/0x4f0 [kvm]
+    kvm_arch_vcpu_ioctl_run+0x3013/0x5810 [kvm]
+
+Hyper-V documents that invalid GVAs (those that are beyond a partition's
+GVA space) are to be ignored.  While not completely clear whether this
+ruling also applies to non-canonical GVAs, it is likely fine to make that
+assumption, and manual testing on Azure confirms "real" Hyper-V interprets
+the specification in the same way.
+
+Skip non-canonical GVAs when processing the list of addresses to avoid
+tripping the INVVPID failure.  Alternatively, KVM could filter out "bad"
+GVAs before inserting into the FIFO, but practically speaking the only
+downside of pushing validation to the final processing is that doing so
+is suboptimal for the guest, and no well-behaved guest will request TLB
+flushes for non-canonical addresses.
+
+Fixes: 260970862c88 ("KVM: x86: hyper-v: Handle HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST{,EX} calls gently")
+Cc: stable@vger.kernel.org
+Signed-off-by: Manuel Andreas <manuel.andreas@tum.de>
+Suggested-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Link: https://lore.kernel.org/r/c090efb3-ef82-499f-a5e0-360fc8420fb7@tum.de
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: use plain is_noncanonical_address()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/hyperv.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
+index bd3fbd5be5da..223f4fa6a849 100644
+--- a/arch/x86/kvm/hyperv.c
++++ b/arch/x86/kvm/hyperv.c
+@@ -1929,6 +1929,9 @@ int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
+               if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY)
+                       goto out_flush_all;
++              if (is_noncanonical_address(entries[i], vcpu))
++                      continue;
++
+               /*
+                * Lower 12 bits of 'address' encode the number of additional
+                * pages to flush.
+-- 
+2.50.1
+
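
To make "non-canonical" concrete: with 48 implemented virtual-address bits,
bits 63:48 must be a sign-extension of bit 47. A simplified, self-contained C
sketch (the kernel's is_noncanonical_address() also accounts for LA57 and
per-vCPU state; an arithmetic right shift on signed values is assumed, which
holds on common compilers):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Canonical for 48 VA bits: bits 63:48 equal the sign-extension of bit 47. */
static bool is_canonical_48(uint64_t va)
{
	return (uint64_t)((int64_t)(va << 16) >> 16) == va;
}

int main(void)
{
	uint64_t gva = 0xaaaaaaaaaaaaa000ULL;	/* the GVA from the splat above */

	printf("0x%llx is %scanonical\n", (unsigned long long)gva,
	       is_canonical_48(gva) ? "" : "non-");
	return 0;
}
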
diff --git a/queue-6.6/kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch b/queue-6.6/kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch
new file mode 100644 (file)
index 0000000..3594804
--- /dev/null
@@ -0,0 +1,80 @@
+From 03a5de01f1a2bd29e4c24999991e84839d3b1fa1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:31 -0700
+Subject: KVM: x86: Move handling of is_guest_mode() into fastpath exit
+ handlers
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit bf1a49436ea37b98dd2f37c57608951d0e28eecc ]
+
+Let the fastpath code decide which exits can/can't be handled in the
+fastpath when L2 is active, e.g. when KVM generates a VMX preemption
+timer exit to forcefully regain control, there is no "work" to be done and
+so such exits can be handled in the fastpath regardless of whether L1 or
+L2 is active.
+
+Moving the is_guest_mode() check into the fastpath code also makes it
+easier to see that L2 isn't allowed to use the fastpath in most cases,
+e.g. it's not immediately obvious why handle_fastpath_preemption_timer()
+is called from the fastpath and the normal path.
+
+Link: https://lore.kernel.org/r/20240110012705.506918-5-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/svm/svm.c | 6 +++---
+ arch/x86/kvm/vmx/vmx.c | 6 +++---
+ 2 files changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 5a230be224d1..f42c6ef7dc20 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4157,6 +4157,9 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
+ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+ {
++      if (is_guest_mode(vcpu))
++              return EXIT_FASTPATH_NONE;
++
+       if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+           to_svm(vcpu)->vmcb->control.exit_info_1)
+               return handle_fastpath_set_msr_irqoff(vcpu);
+@@ -4315,9 +4318,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+       svm_complete_interrupts(vcpu);
+-      if (is_guest_mode(vcpu))
+-              return EXIT_FASTPATH_NONE;
+-
+       return svm_exit_handlers_fastpath(vcpu);
+ }
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 631fdd4a575a..4c991d514015 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7258,6 +7258,9 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+ {
++      if (is_guest_mode(vcpu))
++              return EXIT_FASTPATH_NONE;
++
+       switch (to_vmx(vcpu)->exit_reason.basic) {
+       case EXIT_REASON_MSR_WRITE:
+               return handle_fastpath_set_msr_irqoff(vcpu);
+@@ -7468,9 +7471,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+       vmx_recover_nmi_blocking(vmx);
+       vmx_complete_interrupts(vmx);
+-      if (is_guest_mode(vcpu))
+-              return EXIT_FASTPATH_NONE;
+-
+       return vmx_exit_handlers_fastpath(vcpu);
+ }
+-- 
+2.50.1
+
diff --git a/queue-6.6/kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch b/queue-6.6/kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch
new file mode 100644 (file)
index 0000000..f7f8e86
--- /dev/null
@@ -0,0 +1,130 @@
+From c1681ed16b27201a312047cb6a8038088fdc0608 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:28 -0700
+Subject: KVM: x86: Plumb "force_immediate_exit" into kvm_entry() tracepoint
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 9c9025ea003a03f967affd690f39b4ef3452c0f5 ]
+
+Annotate the kvm_entry() tracepoint with "immediate exit" when KVM is
+forcing a VM-Exit immediately after VM-Enter, e.g. when KVM wants to
+inject an event but needs to first complete some other operation.
+Knowing that KVM is (or isn't) forcing an exit is useful information when
+debugging issues related to event injection.
+
+Suggested-by: Maxim Levitsky <mlevitsk@redhat.com>
+Link: https://lore.kernel.org/r/20240110012705.506918-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 3 ++-
+ arch/x86/kvm/svm/svm.c          | 5 +++--
+ arch/x86/kvm/trace.h            | 9 ++++++---
+ arch/x86/kvm/vmx/vmx.c          | 4 ++--
+ arch/x86/kvm/x86.c              | 2 +-
+ 5 files changed, 14 insertions(+), 9 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index b5210505abfa..5703600a454e 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1624,7 +1624,8 @@ struct kvm_x86_ops {
+       void (*flush_tlb_guest)(struct kvm_vcpu *vcpu);
+       int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
+-      enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu);
++      enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
++                                                bool force_immediate_exit);
+       int (*handle_exit)(struct kvm_vcpu *vcpu,
+               enum exit_fastpath_completion exit_fastpath);
+       int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index abbb84ddfe02..5a230be224d1 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4194,12 +4194,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+       guest_state_exit_irqoff();
+ }
+-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
++static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
++                                        bool force_immediate_exit)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+       bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
+-      trace_kvm_entry(vcpu);
++      trace_kvm_entry(vcpu, force_immediate_exit);
+       svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+       svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
+index b82e6ed4f024..c6b4b1728006 100644
+--- a/arch/x86/kvm/trace.h
++++ b/arch/x86/kvm/trace.h
+@@ -15,20 +15,23 @@
+  * Tracepoint for guest mode entry.
+  */
+ TRACE_EVENT(kvm_entry,
+-      TP_PROTO(struct kvm_vcpu *vcpu),
+-      TP_ARGS(vcpu),
++      TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit),
++      TP_ARGS(vcpu, force_immediate_exit),
+       TP_STRUCT__entry(
+               __field(        unsigned int,   vcpu_id         )
+               __field(        unsigned long,  rip             )
++              __field(        bool,           immediate_exit  )
+       ),
+       TP_fast_assign(
+               __entry->vcpu_id        = vcpu->vcpu_id;
+               __entry->rip            = kvm_rip_read(vcpu);
++              __entry->immediate_exit = force_immediate_exit;
+       ),
+-      TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip)
++      TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip,
++                __entry->immediate_exit ? "[immediate exit]" : "")
+ );
+ /*
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 9ba4baf2a9e9..ee501871ddb0 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7312,7 +7312,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+       guest_state_exit_irqoff();
+ }
+-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
++static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long cr3, cr4;
+@@ -7339,7 +7339,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+               return EXIT_FASTPATH_NONE;
+       }
+-      trace_kvm_entry(vcpu);
++      trace_kvm_entry(vcpu, force_immediate_exit);
+       if (vmx->ple_window_dirty) {
+               vmx->ple_window_dirty = false;
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 9944b32b0b30..d04066099567 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10795,7 +10795,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+                            (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
+-              exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
++              exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
+               if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+                       break;
+-- 
+2.50.1
+
diff --git a/queue-6.6/kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch b/queue-6.6/kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch
new file mode 100644 (file)
index 0000000..42f3463
--- /dev/null
@@ -0,0 +1,104 @@
+From 8fd23c953af487158937416f6ea3a2e16c6c7503 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:23 -0700
+Subject: KVM: x86: Plumb in the vCPU to kvm_x86_ops.hwapic_isr_update()
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 76bce9f10162cd4b36ac0b7889649b22baf70ebd ]
+
+Pass the target vCPU to the hwapic_isr_update() vendor hook so that VMX
+can defer the update until after nested VM-Exit if an EOI for L1's vAPIC
+occurs while L2 is active.
+
+Note, commit d39850f57d21 ("KVM: x86: Drop @vcpu parameter from
+kvm_x86_ops.hwapic_isr_update()") removed the parameter with the
+justification that doing so "allows for a decent amount of (future)
+cleanup in the APIC code", but it's not at all clear what cleanup was
+intended, or if it was ever realized.
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Reviewed-by: Chao Gao <chao.gao@intel.com>
+Tested-by: Chao Gao <chao.gao@intel.com>
+Link: https://lore.kernel.org/r/20241128000010.4051275-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: account for lack of kvm_x86_call(), drop vmx/x86_ops.h change]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 2 +-
+ arch/x86/kvm/lapic.c            | 8 ++++----
+ arch/x86/kvm/vmx/vmx.c          | 2 +-
+ 3 files changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 5dfb8cc9616e..5fc89d255550 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1657,7 +1657,7 @@ struct kvm_x86_ops {
+       bool allow_apicv_in_x2apic_without_x2apic_virtualization;
+       void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
+       void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
+-      void (*hwapic_isr_update)(int isr);
++      void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
+       bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
+       void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+       void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
+index 66c7f2367bb3..cbf85a1ffb74 100644
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -750,7 +750,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
+        * just set SVI.
+        */
+       if (unlikely(apic->apicv_active))
+-              static_call_cond(kvm_x86_hwapic_isr_update)(vec);
++              static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, vec);
+       else {
+               ++apic->isr_count;
+               BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+@@ -795,7 +795,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
+        * and must be left alone.
+        */
+       if (unlikely(apic->apicv_active))
+-              static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
++              static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic));
+       else {
+               --apic->isr_count;
+               BUG_ON(apic->isr_count < 0);
+@@ -2772,7 +2772,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
+       if (apic->apicv_active) {
+               static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+               static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1);
+-              static_call_cond(kvm_x86_hwapic_isr_update)(-1);
++              static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1);
+       }
+       vcpu->arch.apic_arb_prio = 0;
+@@ -3072,7 +3072,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+       if (apic->apicv_active) {
+               static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+               static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
+-              static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
++              static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
+       }
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
+       if (ioapic_in_kernel(vcpu->kvm))
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index e53620e18925..cde01eb1f5e3 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6834,7 +6834,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+       kvm_release_pfn_clean(pfn);
+ }
+-static void vmx_hwapic_isr_update(int max_isr)
++static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+ {
+       u16 status;
+       u8 old;
+-- 
+2.50.1
+
diff --git a/queue-6.6/kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch b/queue-6.6/kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch
new file mode 100644 (file)
index 0000000..b933ff3
--- /dev/null
@@ -0,0 +1,48 @@
+From 5bfa7e5a50ee261faf4f40ec4bae020fe4f2e08b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:27 -0700
+Subject: KVM: x86: Snapshot the host's DEBUGCTL after disabling IRQs
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 189ecdb3e112da703ac0699f4ec76aa78122f911 ]
+
+Snapshot the host's DEBUGCTL after disabling IRQs, as perf can toggle
+debugctl bits from IRQ context, e.g. when enabling/disabling events via
+smp_call_function_single().  Taking the snapshot (long) before IRQs are
+disabled could result in KVM effectively clobbering DEBUGCTL due to using
+a stale snapshot.
+
+Cc: stable@vger.kernel.org
+Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@amd.com>
+Link: https://lore.kernel.org/r/20250227222411.3490595-6-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/x86.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 7aff0fe469c3..9944b32b0b30 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -4823,7 +4823,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+       /* Save host pkru register if supported */
+       vcpu->arch.host_pkru = read_pkru();
+-      vcpu->arch.host_debugctl = get_debugctlmsr();
+       /* Apply any externally detected TSC adjustments (due to suspend) */
+       if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+@@ -10782,6 +10781,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+               set_debugreg(0, 7);
+       }
++      vcpu->arch.host_debugctl = get_debugctlmsr();
++
+       guest_timing_enter_irqoff();
+       for (;;) {
+-- 
+2.50.1
+
diff --git a/queue-6.6/kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch b/queue-6.6/kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch
new file mode 100644 (file)
index 0000000..fdac54d
--- /dev/null
@@ -0,0 +1,100 @@
+From c711192f36c4c41ec0716b1b0a20448a9cc2194f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:26 -0700
+Subject: KVM: x86: Snapshot the host's DEBUGCTL in common x86
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit fb71c795935652fa20eaf9517ca9547f5af99a76 ]
+
+Move KVM's snapshot of DEBUGCTL to kvm_vcpu_arch and take the snapshot in
+common x86, so that SVM can also use the snapshot.
+
+Opportunistically change the field to a u64.  While bits 63:32 are reserved
+on AMD, not mentioned at all in Intel's SDM, and managed as an "unsigned
+long" by the kernel, DEBUGCTL is an MSR and therefore a 64-bit value.
+
+Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
+Cc: stable@vger.kernel.org
+Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@amd.com>
+Link: https://lore.kernel.org/r/20250227222411.3490595-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve minor syntactic conflict in vmx_vcpu_load()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 1 +
+ arch/x86/kvm/vmx/vmx.c          | 8 ++------
+ arch/x86/kvm/vmx/vmx.h          | 2 --
+ arch/x86/kvm/x86.c              | 1 +
+ 4 files changed, 4 insertions(+), 8 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 5fc89d255550..b5210505abfa 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -733,6 +733,7 @@ struct kvm_vcpu_arch {
+       u32 pkru;
+       u32 hflags;
+       u64 efer;
++      u64 host_debugctl;
+       u64 apic_base;
+       struct kvm_lapic *apic;    /* kernel irqchip context */
+       bool load_eoi_exitmap_pending;
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 4563e7a9a851..9ba4baf2a9e9 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -1499,13 +1499,9 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
+  */
+ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+ {
+-      struct vcpu_vmx *vmx = to_vmx(vcpu);
+-
+       vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
+       vmx_vcpu_pi_load(vcpu, cpu);
+-
+-      vmx->host_debugctlmsr = get_debugctlmsr();
+ }
+ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+@@ -7414,8 +7410,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+       }
+       /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+-      if (vmx->host_debugctlmsr)
+-              update_debugctlmsr(vmx->host_debugctlmsr);
++      if (vcpu->arch.host_debugctl)
++              update_debugctlmsr(vcpu->arch.host_debugctl);
+ #ifndef CONFIG_X86_64
+       /*
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 88c5b7ebf9d3..fb36bde2dd87 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -340,8 +340,6 @@ struct vcpu_vmx {
+       /* apic deadline value in host tsc */
+       u64 hv_deadline_tsc;
+-      unsigned long host_debugctlmsr;
+-
+       /*
+        * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+        * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 55185670e0e5..7aff0fe469c3 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -4823,6 +4823,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+       /* Save host pkru register if supported */
+       vcpu->arch.host_pkru = read_pkru();
++      vcpu->arch.host_debugctl = get_debugctlmsr();
+       /* Apply any externally detected TSC adjustments (due to suspend) */
+       if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+-- 
+2.50.1
+
diff --git a/queue-6.6/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch b/queue-6.6/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch
new file mode 100644 (file)
index 0000000..a63a437
--- /dev/null
@@ -0,0 +1,44 @@
+From 420eaab27f40ffc253c0d624df6b87a47a58e99c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Aug 2025 07:23:18 -0700
+Subject: net: ti: icss-iep: Fix incorrect type for return value in
+ extts_enable()
+
+From: Alok Tiwari <alok.a.tiwari@oracle.com>
+
+[ Upstream commit 5f1d1d14db7dabce9c815e7d7cd351f8d58b8585 ]
+
+The variable ret in icss_iep_extts_enable() was incorrectly declared
+as u32, while the function returns int and may return negative error
+codes. This will cause sign extension issues and incorrect error
+propagation. Update ret to be int to fix error handling.
+
+This change corrects the declaration to avoid a potential type mismatch.
+
+Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver")
+Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Link: https://patch.msgid.link/20250805142323.1949406-1-alok.a.tiwari@oracle.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/ti/icssg/icss_iep.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c
+index 8ed72c8b210f..e7306ed52922 100644
+--- a/drivers/net/ethernet/ti/icssg/icss_iep.c
++++ b/drivers/net/ethernet/ti/icssg/icss_iep.c
+@@ -638,7 +638,8 @@ static int icss_iep_pps_enable(struct icss_iep *iep, int on)
+ static int icss_iep_extts_enable(struct icss_iep *iep, u32 index, int on)
+ {
+-      u32 val, cap, ret = 0;
++      u32 val, cap;
++      int ret = 0;
+       mutex_lock(&iep->ptp_clk_mutex);
+-- 
+2.50.1
+
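
A hedged, self-contained C sketch of the bug class fixed above (function and
values invented for illustration, not the driver's code): with the local
declared unsigned, a negative error code is stored as a huge positive value,
any in-function "ret < 0" test is dead code, and the caller only sees a
negative value again through the implicit conversion at return:

#include <stdint.h>
#include <stdio.h>

#define EINVAL 22

static int enable_sketch(int on)
{
	uint32_t ret = 0;	/* the bug: should be plain int */

	if (!on)
		ret = -EINVAL;	/* stored as 4294967274 */

	if (ret < 0)		/* always false for an unsigned type */
		puts("error path never taken");

	return ret;		/* implementation-defined conversion; -22 on common ABIs */
}

int main(void)
{
	printf("enable_sketch(0) = %d\n", enable_sketch(0));
	return 0;
}
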
diff --git a/queue-6.6/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-6.6/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
new file mode 100644 (file)
index 0000000..32449be
--- /dev/null
@@ -0,0 +1,129 @@
+From a6dce037ccb6be527d9dd4d896d9612980da17e6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+      if (res < 0) {
+                nf_conntrack_get(&ct->ct_general); // HERE
+                cb->args[1] = (unsigned long)ct;
+                ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, then the refcount of ct was already incremented.
+This 2nd increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+keeps cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
+
+This can be reproduced by running conntrack_resize.sh selftest in a loop.
+It takes ~20 minutes for me on a preemptible kernel on average before
+I see a runaway kworker spinning in nf_conntrack_cleanup_net_list.
+
+One fix would be to change this to:
+        if (res < 0) {
+               if (ct != last)
+                       nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
+A followup patch will do the same for ctnetlink_exp_dump_table;
+it looks to me as if it has the same problem, and like
+ctnetlink_dump_table, we only need a 'skip hint', not the actual
+object, so we can apply the same cookie strategy there as well.
+
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 282e9644f6fd..928bd2013289 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -859,8 +859,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+-      if (cb->args[1])
+-              nf_ct_put((struct nf_conn *)cb->args[1]);
+       kfree(cb->data);
+       return 0;
+ }
+@@ -1175,19 +1173,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+       return 0;
+ }
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++      unsigned long id = nf_ct_get_id(ct);
++
++      return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+       unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
+       struct net *net = sock_net(skb->sk);
+-      struct nf_conn *ct, *last;
++      unsigned long last_id = cb->args[1];
+       struct nf_conntrack_tuple_hash *h;
+       struct hlist_nulls_node *n;
+       struct nf_conn *nf_ct_evict[8];
++      struct nf_conn *ct;
+       int res, i;
+       spinlock_t *lockp;
+-      last = (struct nf_conn *)cb->args[1];
+       i = 0;
+       local_bh_disable();
+@@ -1224,7 +1229,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                               continue;
+                       if (cb->args[1]) {
+-                              if (ct != last)
++                              if (ctnetlink_get_id(ct) != last_id)
+                                       continue;
+                               cb->args[1] = 0;
+                       }
+@@ -1237,8 +1242,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+                                           NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+                                           ct, true, flags);
+                       if (res < 0) {
+-                              nf_conntrack_get(&ct->ct_general);
+-                              cb->args[1] = (unsigned long)ct;
++                              cb->args[1] = ctnetlink_get_id(ct);
+                               spin_unlock(lockp);
+                               goto out;
+                       }
+@@ -1251,12 +1255,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+       }
+ out:
+       local_bh_enable();
+-      if (last) {
++      if (last_id) {
+               /* nf ct hash resize happened, now clear the leftover. */
+-              if ((struct nf_conn *)cb->args[1] == last)
++              if (cb->args[1] == last_id)
+                       cb->args[1] = 0;
+-
+-              nf_ct_put(last);
+       }
+       while (i) {
+-- 
+2.50.1
+
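
The cookie strategy above generalizes to any resumable dump: rather than
pinning the last-visited object (and juggling its reference count), store a
stable identifier and skip forward to it on the next call. A minimal userspace
C sketch of the pattern, with the list and ids invented for illustration; as in
the patch, the entry that did not fit is retried rather than skipped on resume:

#include <stdio.h>

struct entry {
	unsigned long id;	/* stable, non-zero identifier */
	struct entry *next;
};

/* Emit at most 'budget' entries; return the id to resume from, 0 when done. */
static unsigned long dump(struct entry *head, unsigned long resume_id, int budget)
{
	struct entry *e;
	int emitted = 0;

	for (e = head; e; e = e->next) {
		if (resume_id) {
			if (e->id != resume_id)
				continue;	/* skip until the hint matches */
			resume_id = 0;		/* found it, emit from here */
		}
		if (emitted == budget)
			return e->id;		/* store a cookie, not a pointer */
		printf("entry %lu\n", e->id);
		emitted++;
	}
	return 0;
}

int main(void)
{
	struct entry c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	unsigned long cookie = 0;

	do {
		cookie = dump(&a, cookie, 2);
	} while (cookie);
	return 0;
}
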
diff --git a/queue-6.6/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch b/queue-6.6/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch
new file mode 100644 (file)
index 0000000..3d61663
--- /dev/null
@@ -0,0 +1,103 @@
+From 5bca481007ef80b38edd17f64f35c01f02fae3f0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 28 Jul 2025 15:26:49 +0900
+Subject: ptp: prevent possible ABBA deadlock in ptp_clock_freerun()
+
+From: Jeongjun Park <aha310510@gmail.com>
+
+[ Upstream commit 2efe41234dbd0a83fdb7cd38226c2f70039a2cd3 ]
+
+syzbot reported the following ABBA deadlock:
+
+       CPU0                           CPU1
+       ----                           ----
+  n_vclocks_store()
+    lock(&ptp->n_vclocks_mux) [1]
+        (physical clock)
+                                     pc_clock_adjtime()
+                                       lock(&clk->rwsem) [2]
+                                        (physical clock)
+                                       ...
+                                       ptp_clock_freerun()
+                                         ptp_vclock_in_use()
+                                           lock(&ptp->n_vclocks_mux) [3]
+                                              (physical clock)
+    ptp_clock_unregister()
+      posix_clock_unregister()
+        lock(&clk->rwsem) [4]
+          (virtual clock)
+
+Since ptp virtual clock is registered only under ptp physical clock, both
+ptp_clock and posix_clock must be physical clocks for ptp_vclock_in_use()
+to lock &ptp->n_vclocks_mux and check ptp->n_vclocks.
+
+However, when unregistering vclocks in n_vclocks_store(), the lock taken on
+ptp->n_vclocks_mux is a physical clock lock, but the clk->rwsem of
+ptp_clock_unregister() called through device_for_each_child_reverse()
+is a virtual clock lock.
+
+Therefore, clk->rwsem used in CPU0 and clk->rwsem used in CPU1 are
+different locks, but in lockdep, a false positive occurs because the
+possibility of deadlock is determined through lock-class.
+
+To solve this, a lock subclass annotation must be added to the posix_clock
+rwsem of the vclock.
+
+Reported-by: syzbot+7cfb66a237c4a5fb22ad@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=7cfb66a237c4a5fb22ad
+Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion")
+Signed-off-by: Jeongjun Park <aha310510@gmail.com>
+Acked-by: Richard Cochran <richardcochran@gmail.com>
+Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Link: https://patch.msgid.link/20250728062649.469882-1-aha310510@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/ptp/ptp_private.h | 5 +++++
+ drivers/ptp/ptp_vclock.c  | 7 +++++++
+ 2 files changed, 12 insertions(+)
+
+diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
+index a54124269c2f..3fbd1d68a9bc 100644
+--- a/drivers/ptp/ptp_private.h
++++ b/drivers/ptp/ptp_private.h
+@@ -20,6 +20,11 @@
+ #define PTP_BUF_TIMESTAMPS 30
+ #define PTP_DEFAULT_MAX_VCLOCKS 20
++enum {
++      PTP_LOCK_PHYSICAL = 0,
++      PTP_LOCK_VIRTUAL,
++};
++
+ struct timestamp_event_queue {
+       struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS];
+       int head;
+diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
+index dcf752c9e045..7d08ff3b30fc 100644
+--- a/drivers/ptp/ptp_vclock.c
++++ b/drivers/ptp/ptp_vclock.c
+@@ -154,6 +154,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
+       return PTP_VCLOCK_REFRESH_INTERVAL;
+ }
++static void ptp_vclock_set_subclass(struct ptp_clock *ptp)
++{
++      lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL);
++}
++
+ static const struct ptp_clock_info ptp_vclock_info = {
+       .owner          = THIS_MODULE,
+       .name           = "ptp virtual clock",
+@@ -213,6 +218,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock)
+               return NULL;
+       }
++      ptp_vclock_set_subclass(vclock->clock);
++
+       timecounter_init(&vclock->tc, &vclock->cc, 0);
+       ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL);
+-- 
+2.50.1
+
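
For readers unfamiliar with the ABBA pattern lockdep is guarding against here:
two contexts take the same pair of locks in opposite order. In the report above
it is a false positive, since the two clk->rwsem instances are distinct locks
that merely share a lock class, which is why a subclass annotation is the right
fix. A minimal pthread sketch of the inversion itself (the lock names are
borrowed from the splat purely as labels; trylock is used so the program
reports the conflict instead of hanging):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t n_vclocks_mux = PTHREAD_MUTEX_INITIALIZER;	/* "A" */
static pthread_mutex_t clk_rwsem     = PTHREAD_MUTEX_INITIALIZER;	/* "B" */

static void *cpu0(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&n_vclocks_mux);		/* A, then B */
	usleep(1000);
	if (pthread_mutex_trylock(&clk_rwsem))
		puts("cpu0: would block on clk_rwsem while holding n_vclocks_mux");
	else
		pthread_mutex_unlock(&clk_rwsem);
	pthread_mutex_unlock(&n_vclocks_mux);
	return NULL;
}

static void *cpu1(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&clk_rwsem);			/* B, then A */
	usleep(1000);
	if (pthread_mutex_trylock(&n_vclocks_mux))
		puts("cpu1: would block on n_vclocks_mux while holding clk_rwsem");
	else
		pthread_mutex_unlock(&n_vclocks_mux);
	pthread_mutex_unlock(&clk_rwsem);
	return NULL;
}

int main(void)
{
	pthread_t t0, t1;

	pthread_create(&t0, NULL, cpu0, NULL);
	pthread_create(&t1, NULL, cpu1, NULL);
	pthread_join(t0, NULL);
	pthread_join(t1, NULL);
	return 0;
}
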
diff --git a/queue-6.6/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch b/queue-6.6/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
new file mode 100644 (file)
index 0000000..bae5dd4
--- /dev/null
@@ -0,0 +1,73 @@
+From 61292c12f981631257858fa1a7e22814646ed11e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:40:11 -0400
+Subject: sctp: linearize cloned gso packets in sctp_rcv
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ]
+
+A cloned head skb still shares these frag skbs in fraglist with the
+original head skb. It's not safe to access these frag skbs.
+
+syzbot reported two use-of-uninitialized-memory bugs caused by this:
+
+  BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+   sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+   sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998
+   sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88
+   sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331
+   sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122
+   __release_sock+0x1da/0x330 net/core/sock.c:3106
+   release_sock+0x6b/0x250 net/core/sock.c:3660
+   sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360
+   sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885
+   sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031
+   inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851
+   sock_sendmsg_nosec net/socket.c:718 [inline]
+
+and
+
+  BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+   sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+   sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88
+   sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331
+   sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148
+   __release_sock+0x1d3/0x330 net/core/sock.c:3213
+   release_sock+0x6b/0x270 net/core/sock.c:3767
+   sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367
+   sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886
+   sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032
+   inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851
+   sock_sendmsg_nosec net/socket.c:712 [inline]
+
+This patch fixes it by linearizing cloned gso packets in sctp_rcv().
+
+Fixes: 90017accff61 ("sctp: Add GSO support")
+Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com
+Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sctp/input.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sctp/input.c b/net/sctp/input.c
+index a8a254a5008e..032a10d82302 100644
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -117,7 +117,7 @@ int sctp_rcv(struct sk_buff *skb)
+        * it's better to just linearize it otherwise crc computing
+        * takes longer.
+        */
+-      if ((!is_gso && skb_linearize(skb)) ||
++      if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+           !pskb_may_pull(skb, sizeof(struct sctphdr)))
+               goto discard_it;
+-- 
+2.50.1
+
diff --git a/queue-6.6/series b/queue-6.6/series
index e3972226738802c1cd49575f7d19d0b5059e3797..f481a6f7f3a31d128d2a965b04fa3e5dfc4752ad 100644 (file)
@@ -30,3 +30,30 @@ eventpoll-fix-semi-unbounded-recursion.patch
 documentation-acpi-fix-parent-device-references.patch
 acpi-processor-perflib-fix-initial-_ppc-limit-application.patch
 acpi-processor-perflib-move-problematic-pr-performance-check.patch
+kvm-x86-hyper-v-skip-non-canonical-addresses-during-.patch
+kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch
+kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch
+kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch
+kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch
+kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch
+kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch
+kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch
+kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch
+kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch
+kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch
+kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch
+kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch
+kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch
+kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch
+kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch
+kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch
+kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch
+kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
+net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch
+sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
+intel_idle-allow-loading-acpi-tables-for-any-family.patch
+cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
+ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch
+tls-handle-data-disappearing-from-under-the-tls-ulp.patch
diff --git a/queue-6.6/tls-handle-data-disappearing-from-under-the-tls-ulp.patch b/queue-6.6/tls-handle-data-disappearing-from-under-the-tls-ulp.patch
new file mode 100644 (file)
index 0000000..fffc41b
--- /dev/null
@@ -0,0 +1,106 @@
+From 7b3746b0fb7bce25102c2ab1f5d3c2d406a17e0a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 16:29:06 -0700
+Subject: tls: handle data disappearing from under the TLS ULP
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 6db015fc4b5d5f63a64a193f65d98da3a7fc811d ]
+
+TLS expects that it owns the receive queue of the TCP socket.
+This cannot be guaranteed in case the reader of the TCP socket
+entered before the TLS ULP was installed, or uses some non-standard
+read API (e.g. zerocopy ones). Replace the WARN_ON() and a buggy
+early exit (which leaves anchor pointing to a freed skb) with real
+error handling. Wipe the parsing state and tell the reader to retry.
+
+We already reload the anchor every time we (re)acquire the socket lock,
+so the only condition we need to avoid is an out of bounds read
+(not having enough bytes in the socket for previously parsed record len).
+
+If some data was read from under TLS but there's enough in the queue
+we'll reload and decrypt what is most likely not a valid TLS record.
+Leading to some undefined behavior from TLS perspective (corrupting
+a stream? missing an alert? missing an attack?) but no kernel crash
+should take place.
+
+Reported-by: William Liu <will@willsroot.io>
+Reported-by: Savino Dicanosa <savy@syst3mfailure.io>
+Link: https://lore.kernel.org/tFjq_kf7sWIG3A7CrCg_egb8CVsT_gsmHAK0_wxDPJXfIzxFAMxqmLwp3MlU5EHiet0AwwJldaaFdgyHpeIUCS-3m3llsmRzp9xIOBR4lAI=@syst3mfailure.io
+Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser")
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://patch.msgid.link/20250807232907.600366-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls.h      |  2 +-
+ net/tls/tls_strp.c | 11 ++++++++---
+ net/tls/tls_sw.c   |  3 ++-
+ 3 files changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/net/tls/tls.h b/net/tls/tls.h
+index 02038d0381b7..5dc61c85c076 100644
+--- a/net/tls/tls.h
++++ b/net/tls/tls.h
+@@ -192,7 +192,7 @@ void tls_strp_msg_done(struct tls_strparser *strp);
+ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb);
+ void tls_rx_msg_ready(struct tls_strparser *strp);
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
+ int tls_strp_msg_cow(struct tls_sw_context_rx *ctx);
+ struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx);
+ int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst);
+diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
+index bea60b0160d1..6ce64a6e4495 100644
+--- a/net/tls/tls_strp.c
++++ b/net/tls/tls_strp.c
+@@ -474,7 +474,7 @@ static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len)
+       strp->stm.offset = offset;
+ }
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ {
+       struct strp_msg *rxm;
+       struct tls_msg *tlm;
+@@ -483,8 +483,11 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+       DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len);
+       if (!strp->copy_mode && force_refresh) {
+-              if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len))
+-                      return;
++              if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) {
++                      WRITE_ONCE(strp->msg_ready, 0);
++                      memset(&strp->stm, 0, sizeof(strp->stm));
++                      return false;
++              }
+               tls_strp_load_anchor_with_queue(strp, strp->stm.full_len);
+       }
+@@ -494,6 +497,8 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+       rxm->offset     = strp->stm.offset;
+       tlm = tls_msg(strp->anchor);
+       tlm->control    = strp->mark;
++
++      return true;
+ }
+ /* Called with lock held on lower socket */
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index 4905a81c4ac1..c9b53472e955 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1380,7 +1380,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
+                       return sock_intr_errno(timeo);
+       }
+-      tls_strp_msg_load(&ctx->strp, released);
++      if (unlikely(!tls_strp_msg_load(&ctx->strp, released)))
++              return tls_rx_rec_wait(sk, psock, nonblock, false);
+       return 1;
+ }
+-- 
+2.50.1
+
diff --git a/queue-6.6/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-6.6/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
new file mode 100644 (file)
index 0000000..e3533e5
--- /dev/null
@@ -0,0 +1,51 @@
+From d605402dabbd308ab13f66983c4babc3e4773210 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, where we can't rely on offloads because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
+
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index 3870b59f5400..9be9df2caf65 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -61,7 +61,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+       remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+       skb->remcsum_offload = remcsum;
+-      need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++      need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+       /* Try to offload checksum if possible */
+       offload_csum = !!(need_csum &&
+                         !need_ipsec &&
+-- 
+2.50.1
+