--- /dev/null
+From cfccde6ceaa234284620cfaca692e6585dba91b7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Aug 2025 17:03:11 +0200
+Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data
+
+From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ]
+
+Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid
+discarding useful information") caused the number of wakeup interrupts
+to increase on an idle system [1], which was not expected to happen
+after merely allowing shallower idle states to be selected by the
+governor in some cases.
+
+However, on the system in question, all of the idle states deeper than
+WFI are rejected by the driver due to a firmware issue [2]. This causes
+the governor to only consider the recent interval duration data
+corresponding to successful attempts to enter WFI, so the recent
+intervals table is filled with values lower than the scheduler tick
+period. Consequently, the governor predicts an idle duration below the
+scheduler tick period length and avoids stopping the tick more often,
+which leads to the observed symptom.
+
+Address it by modifying the governor to update the recent intervals
+table also when entering the previously selected idle state fails, so
+it knows that the short idle intervals might have been the minority
+had the selected idle states actually been entered every time.
+
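+For readers unfamiliar with the governor internals, here is a deliberately
+simplified, standalone sketch of the idea (it is not the kernel's
+get_typical_interval() logic, and the helper names are made up for
+illustration): once failed entries are recorded as UINT_MAX, a short-idle
+pattern can no longer be inferred from the table, so the prediction stops
+collapsing below the tick period.
+
+  /* Illustrative sketch only; the real governor discards a few outliers
+   * before giving up, this version gives up on any failed entry. */
+  #include <limits.h>
+  #include <stdio.h>
+
+  #define INTERVALS 8
+
+  static unsigned int intervals[INTERVALS];
+  static int interval_ptr;
+
+  static void update_intervals(unsigned int interval_us)
+  {
+          intervals[interval_ptr++] = interval_us;
+          if (interval_ptr >= INTERVALS)
+                  interval_ptr = 0;
+  }
+
+  static unsigned int predict_us(void)
+  {
+          unsigned long long sum = 0;
+          int i;
+
+          for (i = 0; i < INTERVALS; i++) {
+                  if (intervals[i] == UINT_MAX)
+                          return UINT_MAX; /* failed entry: no short pattern */
+                  sum += intervals[i];
+          }
+          return sum / INTERVALS;
+  }
+
+  int main(void)
+  {
+          int i;
+
+          /* Alternate successful short sleeps with failed entries. */
+          for (i = 0; i < INTERVALS; i++)
+                  update_intervals(i % 2 ? 500 : UINT_MAX);
+          printf("predicted idle: %u us\n", predict_us());
+          return 0;
+  }
+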
+Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information")
+Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1]
+Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2]
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Tested-by: Christian Loehle <christian.loehle@arm.com>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Christian Loehle <christian.loehle@arm.com>
+Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
+index a95cc8f024fd..d34463f96848 100644
+--- a/drivers/cpuidle/governors/menu.c
++++ b/drivers/cpuidle/governors/menu.c
+@@ -158,6 +158,14 @@ static inline int performance_multiplier(unsigned long nr_iowaiters)
+
+ static DEFINE_PER_CPU(struct menu_device, menu_devices);
+
++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us)
++{
++ /* Update the repeating-pattern data. */
++ data->intervals[data->interval_ptr++] = interval_us;
++ if (data->interval_ptr >= INTERVALS)
++ data->interval_ptr = 0;
++}
++
+ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
+
+ /*
+@@ -288,6 +296,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ if (data->needs_update) {
+ menu_update(drv, dev);
+ data->needs_update = 0;
++ } else if (!dev->last_residency_ns) {
++ /*
++ * This happens when the driver rejects the previously selected
++ * idle state and returns an error, so update the recent
++ * intervals table to prevent invalid information from being
++ * used going forward.
++ */
++ menu_update_intervals(data, UINT_MAX);
+ }
+
+ /* determine the expected residency time, round up */
+@@ -537,10 +553,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+
+ data->correction_factor[data->bucket] = new_factor;
+
+- /* update the repeating-pattern data */
+- data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
+- if (data->interval_ptr >= INTERVALS)
+- data->interval_ptr = 0;
++ menu_update_intervals(data, ktime_to_us(measured_ns));
+ }
+
+ /**
+--
+2.50.1
+
--- /dev/null
+From 55146a8e555eb6bcaf596bf8b7455a06175b4760 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 15:37:14 -0400
+Subject: intel_idle: Allow loading ACPI tables for any family
+
+From: Len Brown <len.brown@intel.com>
+
+[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ]
+
+There is no reason to limit intel_idle's loading of ACPI tables to
+family 6. Upcoming Intel processors are not in family 6.
+
+Below "Fixes" really means "applies cleanly until".
+That syntax commit didn't change the previous logic,
+but shows this patch applies back 5-years.
+
+Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros")
+Signed-off-by: Len Brown <len.brown@intel.com>
+Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/idle/intel_idle.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
+index 1cead368f961..f6a2211ca4ef 100644
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -1154,7 +1154,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
+ };
+
+ static const struct x86_cpu_id intel_mwait_ids[] __initconst = {
+- X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL),
++ X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL),
+ {}
+ };
+
+--
+2.50.1
+
--- /dev/null
+From f713f56980c58c0297138ace9f7b483378f6bd73 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+ if (res < 0) {
+ nf_conntrack_get(&ct->ct_general); // HERE
+ cb->args[1] = (unsigned long)ct;
+ ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, then the refcount of ct was already incremented.
+This 2nd increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+prevents cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
+
+This can be reproduced by running the conntrack_resize.sh selftest in a
+loop. On a preemptible kernel it takes ~20 minutes on average before I
+see a runaway kworker spinning in nf_conntrack_cleanup_net_list().
+
+One fix would be to change this to:
+ if (res < 0) {
+ if (ct != last)
+ nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
+A followup patch will do the same for ctnetlink_exp_dump_table; it
+looks to me as if it has the same problem, and like
+ctnetlink_dump_table it only needs a 'skip hint', not the actual
+object, so the same cookie strategy can be applied there as well.
+
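+As background, the sketch below is a standalone illustration of the 'skip
+hint' idea (hypothetical names, not the kernel code): resuming a chunked
+dump via an id cookie needs no reference count, because a stale cookie
+simply matches nothing and the dump restarts from scratch. The kernel
+helper additionally maps an id of 0 to 1, since a cookie of 0 means "no
+hint".
+
+  #include <stdio.h>
+
+  struct item { unsigned long id; };
+
+  static struct item table[] = { {1}, {2}, {3}, {4}, {5} };
+
+  /* Dump up to 'budget' items, resuming at *cookie if it is set. */
+  static int dump(unsigned long *cookie, int budget)
+  {
+          int i, done = 0;
+
+          for (i = 0; i < 5; i++) {
+                  if (*cookie) {
+                          if (table[i].id != *cookie)
+                                  continue;      /* skip entries before the hint */
+                          *cookie = 0;           /* found it, dump it now */
+                  }
+                  if (done == budget) {
+                          *cookie = table[i].id; /* not dumped yet, resume here */
+                          return done;
+                  }
+                  printf("item %lu\n", table[i].id);
+                  done++;
+          }
+          return done;
+  }
+
+  int main(void)
+  {
+          unsigned long cookie = 0;
+
+          do {
+                  dump(&cookie, 2);   /* dump in chunks of two */
+          } while (cookie);
+          return 0;
+  }
+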
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index b2b06033ef2c..f622fcad3f50 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -839,8 +839,6 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
+
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+- if (cb->args[1])
+- nf_ct_put((struct nf_conn *)cb->args[1]);
+ kfree(cb->data);
+ return 0;
+ }
+@@ -1112,19 +1110,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+ return 0;
+ }
+
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++ unsigned long id = nf_ct_get_id(ct);
++
++ return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+ unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
+ struct net *net = sock_net(skb->sk);
+- struct nf_conn *ct, *last;
++ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct nf_conn *nf_ct_evict[8];
++ struct nf_conn *ct;
+ int res, i;
+ spinlock_t *lockp;
+
+- last = (struct nf_conn *)cb->args[1];
+ i = 0;
+
+ local_bh_disable();
+@@ -1160,7 +1165,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ continue;
+
+ if (cb->args[1]) {
+- if (ct != last)
++ if (ctnetlink_get_id(ct) != last_id)
+ continue;
+ cb->args[1] = 0;
+ }
+@@ -1173,8 +1178,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct, true, flags);
+ if (res < 0) {
+- nf_conntrack_get(&ct->ct_general);
+- cb->args[1] = (unsigned long)ct;
++ cb->args[1] = ctnetlink_get_id(ct);
+ spin_unlock(lockp);
+ goto out;
+ }
+@@ -1187,12 +1191,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ }
+ out:
+ local_bh_enable();
+- if (last) {
++ if (last_id) {
+ /* nf ct hash resize happened, now clear the leftover. */
+- if ((struct nf_conn *)cb->args[1] == last)
++ if (cb->args[1] == last_id)
+ cb->args[1] = 0;
+-
+- nf_ct_put(last);
+ }
+
+ while (i) {
+--
+2.50.1
+
--- /dev/null
+From 3e634f70881c39e2c08fb3e91544b90694df00bc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:40:11 -0400
+Subject: sctp: linearize cloned gso packets in sctp_rcv
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ]
+
+A cloned head skb still shares the frag skbs in its fraglist with the
+original head skb. It's not safe to access those frag skbs.
+
+syzbot reported two use-of-uninitialized-memory bugs caused by this:
+
+ BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+ sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+ sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998
+ sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88
+ sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331
+ sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122
+ __release_sock+0x1da/0x330 net/core/sock.c:3106
+ release_sock+0x6b/0x250 net/core/sock.c:3660
+ sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360
+ sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885
+ sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031
+ inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851
+ sock_sendmsg_nosec net/socket.c:718 [inline]
+
+and
+
+ BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+ sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+ sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88
+ sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331
+ sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148
+ __release_sock+0x1d3/0x330 net/core/sock.c:3213
+ release_sock+0x6b/0x270 net/core/sock.c:3767
+ sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367
+ sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886
+ sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032
+ inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851
+ sock_sendmsg_nosec net/socket.c:712 [inline]
+
+This patch fixes it by linearizing cloned gso packets in sctp_rcv().
+
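+A hedged sketch of the decision the one-liner encodes (the helper name is
+hypothetical, not a kernel function): non-GSO packets are linearized as
+before, and now GSO packets whose head skb is a clone are linearized too,
+because a cloned head still shares the fraglist with the original.
+
+  #include <linux/skbuff.h>
+
+  static bool sctp_rx_needs_linearize(const struct sk_buff *skb, bool is_gso)
+  {
+          if (!is_gso)
+                  return true;            /* plain skb: linearize for CRC as before */
+          return skb_cloned(skb);         /* cloned GSO: fraglist is shared, copy it */
+  }
+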
+Fixes: 90017accff61 ("sctp: Add GSO support")
+Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com
+Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sctp/input.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sctp/input.c b/net/sctp/input.c
+index 8fe1a74f0618..079b1bfc7d31 100644
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -114,7 +114,7 @@ int sctp_rcv(struct sk_buff *skb)
+ * it's better to just linearize it otherwise crc computing
+ * takes longer.
+ */
+- if ((!is_gso && skb_linearize(skb)) ||
++ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+ !pskb_may_pull(skb, sizeof(struct sctphdr)))
+ goto discard_it;
+
+--
+2.50.1
+
documentation-acpi-fix-parent-device-references.patch
acpi-processor-perflib-fix-initial-_ppc-limit-application.patch
acpi-processor-perflib-move-problematic-pr-performance-check.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
+sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
+intel_idle-allow-loading-acpi-tables-for-any-family.patch
+cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
--- /dev/null
+From 703e70d1d8e2e3fa7a948735d5f6cd1cc8ce9e8d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, so that we can't rely on offloads because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
+
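+A hedged sketch of the resulting test (the helper name is hypothetical):
+either an xfrm dst, if it is still attached, or a secpath left behind by
+IPsec processing indicates that the UDP payload is subject to a
+transform, so the inner checksum has to be computed in software instead
+of being offloaded.
+
+  #include <net/dst.h>
+  #include <net/xfrm.h>
+
+  static bool tunnel_segment_needs_ipsec(const struct sk_buff *skb)
+  {
+          if (skb_dst(skb) && dst_xfrm(skb_dst(skb)))
+                  return true;                  /* dst still carries xfrm state */
+          return skb_sec_path(skb) != NULL;     /* dst dropped, secpath survives */
+  }
+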
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index 73beaa7e2d70..5d4413fe4195 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -58,7 +58,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+ remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+ skb->remcsum_offload = remcsum;
+
+- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+ /* Try to offload checksum if possible */
+ offload_csum = !!(need_csum &&
+ !need_ipsec &&
+--
+2.50.1
+
--- /dev/null
+From 05efdd270d75536fbf901a5eae7145a45a532748 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Aug 2025 17:03:11 +0200
+Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data
+
+From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ]
+
+Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid
+discarding useful information") caused the number of wakeup interrupts
+to increase on an idle system [1], which was not expected to happen
+after merely allowing shallower idle states to be selected by the
+governor in some cases.
+
+However, on the system in question, all of the idle states deeper than
+WFI are rejected by the driver due to a firmware issue [2]. This causes
+the governor to only consider the recent interval duration data
+corresponding to successful attempts to enter WFI, so the recent
+intervals table is filled with values lower than the scheduler tick
+period. Consequently, the governor predicts an idle duration below the
+scheduler tick period length and avoids stopping the tick more often,
+which leads to the observed symptom.
+
+Address it by modifying the governor to update the recent intervals
+table also when entering the previously selected idle state fails, so
+it knows that the short idle intervals might have been the minority
+had the selected idle states actually been entered every time.
+
+Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information")
+Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1]
+Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2]
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Tested-by: Christian Loehle <christian.loehle@arm.com>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Christian Loehle <christian.loehle@arm.com>
+Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
+index e1e2721beb75..246b4a1b664a 100644
+--- a/drivers/cpuidle/governors/menu.c
++++ b/drivers/cpuidle/governors/menu.c
+@@ -158,6 +158,14 @@ static inline int performance_multiplier(unsigned int nr_iowaiters)
+
+ static DEFINE_PER_CPU(struct menu_device, menu_devices);
+
++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us)
++{
++ /* Update the repeating-pattern data. */
++ data->intervals[data->interval_ptr++] = interval_us;
++ if (data->interval_ptr >= INTERVALS)
++ data->interval_ptr = 0;
++}
++
+ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
+
+ /*
+@@ -288,6 +296,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ if (data->needs_update) {
+ menu_update(drv, dev);
+ data->needs_update = 0;
++ } else if (!dev->last_residency_ns) {
++ /*
++ * This happens when the driver rejects the previously selected
++ * idle state and returns an error, so update the recent
++ * intervals table to prevent invalid information from being
++ * used going forward.
++ */
++ menu_update_intervals(data, UINT_MAX);
+ }
+
+ /* determine the expected residency time, round up */
+@@ -542,10 +558,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+
+ data->correction_factor[data->bucket] = new_factor;
+
+- /* update the repeating-pattern data */
+- data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
+- if (data->interval_ptr >= INTERVALS)
+- data->interval_ptr = 0;
++ menu_update_intervals(data, ktime_to_us(measured_ns));
+ }
+
+ /**
+--
+2.50.1
+
--- /dev/null
+From 1ee55ceeeb4fb9720509f1f18eb551a41c5568c1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 15:37:14 -0400
+Subject: intel_idle: Allow loading ACPI tables for any family
+
+From: Len Brown <len.brown@intel.com>
+
+[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ]
+
+There is no reason to limit intel_idle's loading of ACPI tables to
+family 6. Upcoming Intel processors are not in family 6.
+
+Below "Fixes" really means "applies cleanly until".
+That syntax commit didn't change the previous logic,
+but shows this patch applies back 5-years.
+
+Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros")
+Signed-off-by: Len Brown <len.brown@intel.com>
+Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/idle/intel_idle.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
+index 359272ce8e29..96002f35405e 100644
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -1194,7 +1194,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
+ };
+
+ static const struct x86_cpu_id intel_mwait_ids[] __initconst = {
+- X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL),
++ X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL),
+ {}
+ };
+
+--
+2.50.1
+
--- /dev/null
+From 93145a29f5f86a93148422799f8ec6667e0b6f50 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+ if (res < 0) {
+ nf_conntrack_get(&ct->ct_general); // HERE
+ cb->args[1] = (unsigned long)ct;
+ ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, then the refcount of ct was already incremented.
+This 2nd increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+prevents cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
+
+This can be reproduced by running the conntrack_resize.sh selftest in a
+loop. On a preemptible kernel it takes ~20 minutes on average before I
+see a runaway kworker spinning in nf_conntrack_cleanup_net_list().
+
+One fix would be to change this to:
+ if (res < 0) {
+ if (ct != last)
+ nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
+A followup patch will do the same for ctnetlink_exp_dump_table; it
+looks to me as if it has the same problem, and like
+ctnetlink_dump_table it only needs a 'skip hint', not the actual
+object, so the same cookie strategy can be applied there as well.
+
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 585103c16a8a..50f7531221c3 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -848,8 +848,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
+
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+- if (cb->args[1])
+- nf_ct_put((struct nf_conn *)cb->args[1]);
+ kfree(cb->data);
+ return 0;
+ }
+@@ -1164,19 +1162,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+ return 0;
+ }
+
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++ unsigned long id = nf_ct_get_id(ct);
++
++ return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+ unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
+ struct net *net = sock_net(skb->sk);
+- struct nf_conn *ct, *last;
++ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct nf_conn *nf_ct_evict[8];
++ struct nf_conn *ct;
+ int res, i;
+ spinlock_t *lockp;
+
+- last = (struct nf_conn *)cb->args[1];
+ i = 0;
+
+ local_bh_disable();
+@@ -1211,7 +1216,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ continue;
+
+ if (cb->args[1]) {
+- if (ct != last)
++ if (ctnetlink_get_id(ct) != last_id)
+ continue;
+ cb->args[1] = 0;
+ }
+@@ -1224,8 +1229,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct, true, flags);
+ if (res < 0) {
+- nf_conntrack_get(&ct->ct_general);
+- cb->args[1] = (unsigned long)ct;
++ cb->args[1] = ctnetlink_get_id(ct);
+ spin_unlock(lockp);
+ goto out;
+ }
+@@ -1238,12 +1242,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ }
+ out:
+ local_bh_enable();
+- if (last) {
++ if (last_id) {
+ /* nf ct hash resize happened, now clear the leftover. */
+- if ((struct nf_conn *)cb->args[1] == last)
++ if (cb->args[1] == last_id)
+ cb->args[1] = 0;
+-
+- nf_ct_put(last);
+ }
+
+ while (i) {
+--
+2.50.1
+
--- /dev/null
+From 167cea59f060ec2d6f527a724186f5e6a9a3f4d6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 28 Jul 2025 15:26:49 +0900
+Subject: ptp: prevent possible ABBA deadlock in ptp_clock_freerun()
+
+From: Jeongjun Park <aha310510@gmail.com>
+
+[ Upstream commit 2efe41234dbd0a83fdb7cd38226c2f70039a2cd3 ]
+
+syzbot reported the following ABBA deadlock:
+
+ CPU0 CPU1
+ ---- ----
+ n_vclocks_store()
+ lock(&ptp->n_vclocks_mux) [1]
+ (physical clock)
+ pc_clock_adjtime()
+ lock(&clk->rwsem) [2]
+ (physical clock)
+ ...
+ ptp_clock_freerun()
+ ptp_vclock_in_use()
+ lock(&ptp->n_vclocks_mux) [3]
+ (physical clock)
+ ptp_clock_unregister()
+ posix_clock_unregister()
+ lock(&clk->rwsem) [4]
+ (virtual clock)
+
+Since ptp virtual clock is registered only under ptp physical clock, both
+ptp_clock and posix_clock must be physical clocks for ptp_vclock_in_use()
+to lock &ptp->n_vclocks_mux and check ptp->n_vclocks.
+
+However, when unregistering vclocks in n_vclocks_store(), the
+ptp->n_vclocks_mux being locked is a physical clock lock, but the
+clk->rwsem taken by ptp_clock_unregister(), called through
+device_for_each_child_reverse(), is a virtual clock lock.
+
+Therefore, the clk->rwsem used on CPU0 and the clk->rwsem used on CPU1
+are different locks, but lockdep reports a false positive because it
+determines deadlock possibilities by lock class.
+
+To solve this, a lock subclass annotation must be added to the
+posix_clock rwsem of the vclock.
+
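+As a generic illustration of the lockdep subclass pattern used here
+(hypothetical names, not the ptp code): two locks of the same lock class
+that legitimately nest would otherwise look like a potential deadlock to
+lockdep, and giving the inner one a distinct subclass tells lockdep they
+are different instances.
+
+  #include <linux/mutex.h>
+  #include <linux/lockdep.h>
+
+  enum { LOCK_PARENT = 0, LOCK_CHILD };
+
+  struct node {
+          struct mutex lock;
+          struct node *child;
+  };
+
+  static void node_init(struct node *n, int subclass)
+  {
+          mutex_init(&n->lock);
+          lockdep_set_subclass(&n->lock, subclass);
+  }
+
+  static void touch_both(struct node *parent)
+  {
+          mutex_lock(&parent->lock);              /* subclass LOCK_PARENT */
+          mutex_lock(&parent->child->lock);       /* subclass LOCK_CHILD */
+          mutex_unlock(&parent->child->lock);
+          mutex_unlock(&parent->lock);
+  }
+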
+Reported-by: syzbot+7cfb66a237c4a5fb22ad@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=7cfb66a237c4a5fb22ad
+Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion")
+Signed-off-by: Jeongjun Park <aha310510@gmail.com>
+Acked-by: Richard Cochran <richardcochran@gmail.com>
+Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Link: https://patch.msgid.link/20250728062649.469882-1-aha310510@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/ptp/ptp_private.h | 5 +++++
+ drivers/ptp/ptp_vclock.c | 7 +++++++
+ 2 files changed, 12 insertions(+)
+
+diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
+index b8d3df8a393a..bf823b8c3c8f 100644
+--- a/drivers/ptp/ptp_private.h
++++ b/drivers/ptp/ptp_private.h
+@@ -20,6 +20,11 @@
+ #define PTP_BUF_TIMESTAMPS 30
+ #define PTP_DEFAULT_MAX_VCLOCKS 20
+
++enum {
++ PTP_LOCK_PHYSICAL = 0,
++ PTP_LOCK_VIRTUAL,
++};
++
+ struct timestamp_event_queue {
+ struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS];
+ int head;
+diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
+index ab1d233173e1..6a14c39c4508 100644
+--- a/drivers/ptp/ptp_vclock.c
++++ b/drivers/ptp/ptp_vclock.c
+@@ -81,6 +81,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
+ return PTP_VCLOCK_REFRESH_INTERVAL;
+ }
+
++static void ptp_vclock_set_subclass(struct ptp_clock *ptp)
++{
++ lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL);
++}
++
+ static const struct ptp_clock_info ptp_vclock_info = {
+ .owner = THIS_MODULE,
+ .name = "ptp virtual clock",
+@@ -137,6 +142,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock)
+ return NULL;
+ }
+
++ ptp_vclock_set_subclass(vclock->clock);
++
+ timecounter_init(&vclock->tc, &vclock->cc, 0);
+ ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL);
+
+--
+2.50.1
+
--- /dev/null
+From ecc53eb08e436c50345fff6ff4f2d84eddc7ffc3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:40:11 -0400
+Subject: sctp: linearize cloned gso packets in sctp_rcv
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ]
+
+A cloned head skb still shares the frag skbs in its fraglist with the
+original head skb. It's not safe to access those frag skbs.
+
+syzbot reported two use-of-uninitialized-memory bugs caused by this:
+
+ BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+ sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+ sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998
+ sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88
+ sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331
+ sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122
+ __release_sock+0x1da/0x330 net/core/sock.c:3106
+ release_sock+0x6b/0x250 net/core/sock.c:3660
+ sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360
+ sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885
+ sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031
+ inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851
+ sock_sendmsg_nosec net/socket.c:718 [inline]
+
+and
+
+ BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+ sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+ sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88
+ sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331
+ sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148
+ __release_sock+0x1d3/0x330 net/core/sock.c:3213
+ release_sock+0x6b/0x270 net/core/sock.c:3767
+ sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367
+ sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886
+ sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032
+ inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851
+ sock_sendmsg_nosec net/socket.c:712 [inline]
+
+This patch fixes it by linearizing cloned gso packets in sctp_rcv().
+
+Fixes: 90017accff61 ("sctp: Add GSO support")
+Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com
+Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sctp/input.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sctp/input.c b/net/sctp/input.c
+index 4ee9374dcfb9..182898cb754a 100644
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -114,7 +114,7 @@ int sctp_rcv(struct sk_buff *skb)
+ * it's better to just linearize it otherwise crc computing
+ * takes longer.
+ */
+- if ((!is_gso && skb_linearize(skb)) ||
++ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+ !pskb_may_pull(skb, sizeof(struct sctphdr)))
+ goto discard_it;
+
+--
+2.50.1
+
documentation-acpi-fix-parent-device-references.patch
acpi-processor-perflib-fix-initial-_ppc-limit-application.patch
acpi-processor-perflib-move-problematic-pr-performance-check.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
+sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
+intel_idle-allow-loading-acpi-tables-for-any-family.patch
+cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
+ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch
--- /dev/null
+From 92e0e5246675bee6f45ac39d6c8c1ff8e588dd53 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, so that we can't rely on offloads because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
+
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index 612da8ec1081..8f47d07c49fb 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -59,7 +59,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+ remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+ skb->remcsum_offload = remcsum;
+
+- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+ /* Try to offload checksum if possible */
+ offload_csum = !!(need_csum &&
+ !need_ipsec &&
+--
+2.50.1
+
--- /dev/null
+From 4ad31ff02bc58329fdb26bd716b8c3ab15ba0533 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+ if (res < 0) {
+ nf_conntrack_get(&ct->ct_general); // HERE
+ cb->args[1] = (unsigned long)ct;
+ ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, then the refcount of ct was already incremented.
+This 2nd increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+prevents cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
+
+This can be reproduced by running the conntrack_resize.sh selftest in a
+loop. On a preemptible kernel it takes ~20 minutes on average before I
+see a runaway kworker spinning in nf_conntrack_cleanup_net_list().
+
+One fix would be to change this to:
+ if (res < 0) {
+ if (ct != last)
+ nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
+A followup patch will do the same for ctnetlink_exp_dump_table; it
+looks to me as if it has the same problem, and like
+ctnetlink_dump_table it only needs a 'skip hint', not the actual
+object, so the same cookie strategy can be applied there as well.
+
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index eeb000e41ad7..5d6f9b375c0f 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -808,8 +808,6 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
+
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+- if (cb->args[1])
+- nf_ct_put((struct nf_conn *)cb->args[1]);
+ kfree(cb->data);
+ return 0;
+ }
+@@ -890,18 +888,25 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+ return 0;
+ }
+
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++ unsigned long id = nf_ct_get_id(ct);
++
++ return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+ struct net *net = sock_net(skb->sk);
+- struct nf_conn *ct, *last;
++ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct nf_conn *nf_ct_evict[8];
++ struct nf_conn *ct;
+ int res, i;
+ spinlock_t *lockp;
+
+- last = (struct nf_conn *)cb->args[1];
+ i = 0;
+
+ local_bh_disable();
+@@ -936,7 +941,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ continue;
+
+ if (cb->args[1]) {
+- if (ct != last)
++ if (ctnetlink_get_id(ct) != last_id)
+ continue;
+ cb->args[1] = 0;
+ }
+@@ -951,8 +956,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ ct);
+ rcu_read_unlock();
+ if (res < 0) {
+- nf_conntrack_get(&ct->ct_general);
+- cb->args[1] = (unsigned long)ct;
++ cb->args[1] = ctnetlink_get_id(ct);
+ spin_unlock(lockp);
+ goto out;
+ }
+@@ -965,12 +969,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ }
+ out:
+ local_bh_enable();
+- if (last) {
++ if (last_id) {
+ /* nf ct hash resize happened, now clear the leftover. */
+- if ((struct nf_conn *)cb->args[1] == last)
++ if (cb->args[1] == last_id)
+ cb->args[1] = 0;
+-
+- nf_ct_put(last);
+ }
+
+ while (i) {
+--
+2.50.1
+
--- /dev/null
+From d22304d70bb1c9842e7607c98275fa7c2316a3a9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:40:11 -0400
+Subject: sctp: linearize cloned gso packets in sctp_rcv
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ]
+
+A cloned head skb still shares the frag skbs in its fraglist with the
+original head skb. It's not safe to access those frag skbs.
+
+syzbot reported two use-of-uninitialized-memory bugs caused by this:
+
+ BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+ sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+ sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998
+ sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88
+ sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331
+ sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122
+ __release_sock+0x1da/0x330 net/core/sock.c:3106
+ release_sock+0x6b/0x250 net/core/sock.c:3660
+ sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360
+ sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885
+ sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031
+ inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851
+ sock_sendmsg_nosec net/socket.c:718 [inline]
+
+and
+
+ BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+ sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+ sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88
+ sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331
+ sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148
+ __release_sock+0x1d3/0x330 net/core/sock.c:3213
+ release_sock+0x6b/0x270 net/core/sock.c:3767
+ sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367
+ sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886
+ sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032
+ inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851
+ sock_sendmsg_nosec net/socket.c:712 [inline]
+
+This patch fixes it by linearizing cloned gso packets in sctp_rcv().
+
+Fixes: 90017accff61 ("sctp: Add GSO support")
+Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com
+Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sctp/input.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sctp/input.c b/net/sctp/input.c
+index b1d3e342ac83..9013257cf3df 100644
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -114,7 +114,7 @@ int sctp_rcv(struct sk_buff *skb)
+ * it's better to just linearize it otherwise crc computing
+ * takes longer.
+ */
+- if ((!is_gso && skb_linearize(skb)) ||
++ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+ !pskb_may_pull(skb, sizeof(struct sctphdr)))
+ goto discard_it;
+
+--
+2.50.1
+
documentation-acpi-fix-parent-device-references.patch
acpi-processor-perflib-fix-initial-_ppc-limit-application.patch
acpi-processor-perflib-move-problematic-pr-performance-check.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
+sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
--- /dev/null
+From 35a73e471818ebb5d92b5a51ddfbc30c777fb59c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, so that we can't rely on offloads because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
+
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index 6505a6fd245a..7e025f3517b8 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -58,7 +58,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+ remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+ skb->remcsum_offload = remcsum;
+
+- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+ /* Try to offload checksum if possible */
+ offload_csum = !!(need_csum &&
+ !need_ipsec &&
+--
+2.50.1
+
--- /dev/null
+From 9e1075bdd03cf356ae89ba3b703080b5c4fa2278 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:12:03 -0700
+Subject: KVM: nVMX: Check vmcs12->guest_ia32_debugctl on nested VM-Enter
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 095686e6fcb4150f0a55b1a25987fad3d8af58d6 ]
+
+Add a consistency check for L2's guest_ia32_debugctl, as KVM only supports
+a subset of hardware functionality, i.e. KVM can't rely on hardware to
+detect illegal/unsupported values. Failure to check the vmcs12 value
+would allow the guest to load any hardware-supported value while running L2.
+
+Take care to exempt BTF and LBR from the validity check in order to match
+KVM's behavior for writes via WRMSR, but without clobbering vmcs12. Even
+if VM_EXIT_SAVE_DEBUG_CONTROLS is set in vmcs12, L1 can reasonably expect
+that vmcs12->guest_ia32_debugctl will not be modified if writes to the MSR
+are being intercepted.
+
+Arguably, KVM _should_ update vmcs12 if VM_EXIT_SAVE_DEBUG_CONTROLS is set
+*and* writes to MSR_IA32_DEBUGCTLMSR are not being intercepted by L1, but
+that would incur non-trivial complexity and wouldn't change the fact that
+KVM's handling of DEBUGCTL is blatantly broken. I.e. the extra complexity
+is not worth carrying.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Co-developed-by: Sean Christopherson <seanjc@google.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-7-seanjc@google.com
+Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/nested.c | 12 ++++++++++--
+ arch/x86/kvm/vmx/vmx.c | 5 ++---
+ arch/x86/kvm/vmx/vmx.h | 3 +++
+ 3 files changed, 15 insertions(+), 5 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index d55f7edc0860..da129e12cff9 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2532,7 +2532,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+ kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
++ vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
++ vmx_get_supported_debugctl(vcpu, false));
+ } else {
+ kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
+@@ -3022,7 +3023,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
+ return -EINVAL;
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
+- CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
++ (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
++ CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
+ return -EINVAL;
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
+@@ -4374,6 +4376,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+ (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
+ (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
+
++ /*
++ * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
++ * Writes to DEBUGCTL that aren't intercepted by L1 are immediately
++ * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
++ * vmcs02 doesn't strictly track vmcs12.
++ */
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
+ kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 6517b9d929bf..0b37e21d55b1 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2052,7 +2052,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
+ return (unsigned long)data;
+ }
+
+-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+ {
+ u64 debugctl = 0;
+
+@@ -2071,8 +2071,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+ return debugctl;
+ }
+
+-static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data,
+- bool host_initiated)
++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+ {
+ u64 invalid;
+
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index ddbe73958d7f..99e3f46de2ec 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -442,6 +442,9 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
+
+ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+
++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
++
+ /*
+ * Note, early Intel manuals have the write-low and read-high bitmap offsets
+ * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and
+--
+2.50.1
+
--- /dev/null
+From 2fbc005722e5d1985ef69a071a4a889ff1cb6120 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:48 -0700
+Subject: KVM: nVMX: Defer SVI update to vmcs01 on EOI when L2 is active w/o
+ VID
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Chao Gao <chao.gao@intel.com>
+
+[ Upstream commit 04bc93cf49d16d01753b95ddb5d4f230b809a991 ]
+
+If KVM emulates an EOI for L1's virtual APIC while L2 is active, defer
+updating GUEST_INTERRUPT_STATUS.SVI, i.e. the VMCS's cache of the highest
+in-service IRQ, until L1 is active, as vmcs01, not vmcs02, needs to track
+vISR. The missed SVI update for vmcs01 can result in L1 interrupts being
+incorrectly blocked, e.g. if there is a pending interrupt with lower
+priority than the interrupt that was EOI'd.
+
+This bug only affects use cases where L1's vAPIC is effectively passed
+through to L2, e.g. in a pKVM scenario where L2 is L1's deprivileged host,
+as KVM will only emulate an EOI for L1's vAPIC if Virtual Interrupt
+Delivery (VID) is disabled in vmcs12, and L1 isn't intercepting L2 accesses
+to its (virtual) APIC page (or if x2APIC is enabled, the EOI MSR).
+
+WARN() if KVM updates L1's ISR while L2 is active with VID enabled, as an
+EOI from L2 is supposed to affect L2's vAPIC, but still defer the update,
+to try to keep L1 alive. Specifically, KVM forwards all APICv-related
+VM-Exits to L1 via nested_vmx_l1_wants_exit():
+
+ case EXIT_REASON_APIC_ACCESS:
+ case EXIT_REASON_APIC_WRITE:
+ case EXIT_REASON_EOI_INDUCED:
+ /*
+ * The controls for "virtualize APIC accesses," "APIC-
+ * register virtualization," and "virtual-interrupt
+ * delivery" only come from vmcs12.
+ */
+ return true;
+
+Fixes: c7c9c56ca26f ("x86, apicv: add virtual interrupt delivery support")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/kvm/20230312180048.1778187-1-jason.cj.chen@intel.com
+Reported-by: Markku Ahvenjärvi <mankku@gmail.com>
+Closes: https://lore.kernel.org/all/20240920080012.74405-1-mankku@gmail.com
+Cc: Janne Karhunen <janne.karhunen@gmail.com>
+Signed-off-by: Chao Gao <chao.gao@intel.com>
+[sean: drop request, handle in VMX, write changelog]
+Tested-by: Chao Gao <chao.gao@intel.com>
+Link: https://lore.kernel.org/r/20241128000010.4051275-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve minor syntactic conflict in lapic.h, account for lack of
+ kvm_x86_call(), drop sanity check due to lack of wants_to_run]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/lapic.c | 11 +++++++++++
+ arch/x86/kvm/lapic.h | 1 +
+ arch/x86/kvm/vmx/nested.c | 5 +++++
+ arch/x86/kvm/vmx/vmx.c | 16 ++++++++++++++++
+ arch/x86/kvm/vmx/vmx.h | 1 +
+ 5 files changed, 34 insertions(+)
+
+diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
+index 3d65d6a023c9..9aae76b74417 100644
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -640,6 +640,17 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
+ }
+ }
+
++void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu)
++{
++ struct kvm_lapic *apic = vcpu->arch.apic;
++
++ if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active)
++ return;
++
++ static_call(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
++}
++EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr);
++
+ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
+ {
+ /* This may race with setting of irr in __apic_accept_irq() and
+diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
+index a5ac4a5a5179..e5d2dc58fcf8 100644
+--- a/arch/x86/kvm/lapic.h
++++ b/arch/x86/kvm/lapic.h
+@@ -122,6 +122,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+ enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu);
++void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu);
+ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+
+ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index 8052f8b7d8e1..d55f7edc0860 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -4839,6 +4839,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+
++ if (vmx->nested.update_vmcs01_hwapic_isr) {
++ vmx->nested.update_vmcs01_hwapic_isr = false;
++ kvm_apic_update_hwapic_isr(vcpu);
++ }
++
+ if ((vm_exit_reason != -1) &&
+ (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 721ba6ddb121..7b87fbc69b21 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6713,6 +6713,22 @@ static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+ u16 status;
+ u8 old;
+
++ /*
++ * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
++ * is only relevant for if and only if Virtual Interrupt Delivery is
++ * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's
++ * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested
++ * VM-Exit, otherwise L1 with run with a stale SVI.
++ */
++ if (is_guest_mode(vcpu)) {
++ /*
++ * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
++ * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
++ */
++ to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
++ return;
++ }
++
+ if (max_isr == -1)
+ max_isr = 0;
+
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 9e0bb98b116d..8b4b149bd9c1 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -189,6 +189,7 @@ struct nested_vmx {
+ bool reload_vmcs01_apic_access_page;
+ bool update_vmcs01_cpu_dirty_logging;
+ bool update_vmcs01_apicv_status;
++ bool update_vmcs01_hwapic_isr;
+
+ /*
+ * Enlightened VMCS has been enabled. It does not mean that L1 has to
+--
+2.50.1
+
--- /dev/null
+From 7a3ebf358c60cdf6f7ef1c175053ec17e59945c3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:45 -0700
+Subject: KVM: SVM: Set RFLAGS.IF=1 in C code, to get VMRUN out of the STI
+ shadow
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit be45bc4eff33d9a7dae84a2150f242a91a617402 ]
+
+Enable/disable local IRQs, i.e. set/clear RFLAGS.IF, in the common
+svm_vcpu_enter_exit() just after/before guest_state_{enter,exit}_irqoff()
+so that VMRUN is not executed in an STI shadow. AMD CPUs have a quirk
+(some would say "bug"), where the STI shadow bleeds into the guest's
+intr_state field if a #VMEXIT occurs during injection of an event, i.e. if
+the VMRUN doesn't complete before the subsequent #VMEXIT.
+
+The spurious "interrupts masked" state is relatively benign, as it only
+occurs during event injection and is transient. Because KVM is already
+injecting an event, the guest can't be in HLT, and if KVM is querying IRQ
+blocking for injection, then KVM would need to force an immediate exit
+anyways since injecting multiple events is impossible.
+
+However, because KVM copies int_state verbatim from vmcb02 to vmcb12, the
+spurious STI shadow is visible to L1 when running a nested VM, which can
+trip sanity checks, e.g. in VMware's VMM.
+
+Hoist the STI+CLI all the way to C code, as the aforementioned calls to
+guest_state_{enter,exit}_irqoff() already inform lockdep that IRQs are
+enabled/disabled, and taking a fault on VMRUN with RFLAGS.IF=1 is already
+possible. I.e. if there's kernel code that is confused by running with
+RFLAGS.IF=1, then it's already a problem. In practice, since GIF=0 also
+blocks NMIs, the only change in exposure to non-KVM code (relative to
+surrounding VMRUN with STI+CLI) is exception handling code, and except for
+the kvm_rebooting=1 case, all exceptions in the core VM-Enter/VM-Exit path
+are fatal.
+
+Use the "raw" variants to enable/disable IRQs to avoid tracing in the
+"no instrumentation" code; the guest state helpers also take care of
+tracing IRQ state.
+
+Opportunistically document why KVM needs to do STI in the first place.
+
+Reported-by: Doug Covelli <doug.covelli@broadcom.com>
+Closes: https://lore.kernel.org/all/CADH9ctBs1YPmE4aCfGPNBwA10cA8RuAk2gO7542DjMZgs4uzJQ@mail.gmail.com
+Fixes: f14eec0a3203 ("KVM: SVM: move more vmentry code to assembly")
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Link: https://lore.kernel.org/r/20250224165442.2338294-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve minor syntactic conflict in __svm_sev_es_vcpu_run()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/svm/svm.c | 14 ++++++++++++++
+ arch/x86/kvm/svm/vmenter.S | 9 +--------
+ 2 files changed, 15 insertions(+), 8 deletions(-)
+
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index b6bbd0dc4e65..c95a84afc35f 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -3982,6 +3982,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+
+ guest_state_enter_irqoff();
+
++ /*
++ * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
++ * VMRUN controls whether or not physical IRQs are masked (KVM always
++ * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the
++ * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
++ * into guest state if delivery of an event during VMRUN triggers a
++ * #VMEXIT, and the guest_state transitions already tell lockdep that
++ * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of
++ * this path, so IRQs aren't actually unmasked while running host code.
++ */
++ raw_local_irq_enable();
++
+ amd_clear_divider();
+
+ if (sev_es_guest(vcpu->kvm))
+@@ -3989,6 +4001,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+ else
+ __svm_vcpu_run(svm, spec_ctrl_intercepted);
+
++ raw_local_irq_disable();
++
+ guest_state_exit_irqoff();
+ }
+
+diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
+index 42824f9b06a2..48b72625cc45 100644
+--- a/arch/x86/kvm/svm/vmenter.S
++++ b/arch/x86/kvm/svm/vmenter.S
+@@ -170,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run)
+ VM_CLEAR_CPU_BUFFERS
+
+ /* Enter guest mode */
+- sti
+-
+ 3: vmrun %_ASM_AX
+ 4:
+- cli
+-
+ /* Pop @svm to RAX while it's the only available register. */
+ pop %_ASM_AX
+
+@@ -343,11 +339,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
+ VM_CLEAR_CPU_BUFFERS
+
+ /* Enter guest mode */
+- sti
+-
+ 1: vmrun %_ASM_AX
+-
+-2: cli
++2:
+
+ /* Pop @svm to RDI, guest registers have been saved already. */
+ pop %_ASM_DI
+--
+2.50.1
+
--- /dev/null
+From a0343421f8ed3cfa76b9719e3d3f1d575d5dd176 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:12:01 -0700
+Subject: KVM: VMX: Allow guest to set DEBUGCTL.RTM_DEBUG if RTM is supported
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 17ec2f965344ee3fd6620bef7ef68792f4ac3af0 ]
+
+Let the guest set DEBUGCTL.RTM_DEBUG if RTM is supported according to the
+guest CPUID model, as debug support is supposed to be available if RTM is
+supported, and there are no known downsides to letting the guest debug RTM
+aborts.
+
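+For illustration, a guest-side write that this change allows to succeed,
+assuming the guest's CPUID reports RTM and the guest kernel defines the same
+bit 15 macro added below:
+
+        wrmsrl(MSR_IA32_DEBUGCTLMSR, DEBUGCTLMSR_RTM_DEBUG);    /* no longer rejected */
+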
+Note, there are no known bug reports related to RTM_DEBUG, the primary
+motivation is to reduce the probability of breaking existing guests when a
+future change adds a missing consistency check on vmcs12.GUEST_DEBUGCTL
+(KVM currently lets L2 run with whatever hardware supports; whoops).
+
+Note #2, KVM already emulates DR6.RTM, and doesn't restrict access to
+DR7.RTM.
+
+Fixes: 83c529151ab0 ("KVM: x86: expose Intel cpu new features (HLE, RTM) to guest")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-5-seanjc@google.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/msr-index.h | 1 +
+ arch/x86/kvm/vmx/vmx.c | 4 ++++
+ 2 files changed, 5 insertions(+)
+
+diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
+index 727947ed5e5e..afd65c815043 100644
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -379,6 +379,7 @@
+ #define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12)
+ #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
+ #define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
++#define DEBUGCTLMSR_RTM_DEBUG BIT(15)
+
+ #define MSR_PEBS_FRONTEND 0x000003f7
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 60d1ff3fca45..9445def2b3d2 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2064,6 +2064,10 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+ (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
+ debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+
++ if (boot_cpu_has(X86_FEATURE_RTM) &&
++ (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_RTM)))
++ debugctl |= DEBUGCTLMSR_RTM_DEBUG;
++
+ return debugctl;
+ }
+
+--
+2.50.1
+
--- /dev/null
+From ada33297c8f7efa38a5100d5dde191508fc0254b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:12:02 -0700
+Subject: KVM: VMX: Extract checking of guest's DEBUGCTL into helper
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 8a4351ac302cd8c19729ba2636acfd0467c22ae8 ]
+
+Move VMX's logic to check DEBUGCTL values into a standalone helper so that
+the code can be used by nested VM-Enter to apply the same logic to the
+value being loaded from vmcs12.
+
+KVM needs to explicitly check vmcs12->guest_ia32_debugctl on nested
+VM-Enter, as hardware may support features that KVM does not, i.e. relying
+on hardware to detect invalid guest state will result in false negatives.
+Unfortunately, that means applying KVM's funky suppression of BTF and LBR
+to vmcs12 so as not to break existing guests.
+
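+With the helper, the WRMSR path reduces to the following (sketch; see the
+diff below for the exact code):
+
+        case MSR_IA32_DEBUGCTLMSR:
+                if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
+                        return 1;       /* reject the write */
+
+                data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+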
+No functional change intended.
+
+Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-6-seanjc@google.com
+Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 29 +++++++++++++++++------------
+ 1 file changed, 17 insertions(+), 12 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 9445def2b3d2..6517b9d929bf 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2071,6 +2071,19 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+ return debugctl;
+ }
+
++static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data,
++ bool host_initiated)
++{
++ u64 invalid;
++
++ invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
++ if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
++ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
++ invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
++ }
++ return !invalid;
++}
++
+ /*
+ * Writes msr value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+@@ -2139,19 +2152,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ }
+ vmcs_writel(GUEST_SYSENTER_ESP, data);
+ break;
+- case MSR_IA32_DEBUGCTLMSR: {
+- u64 invalid;
+-
+- invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+- if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
+- kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
+- data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+- invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+- }
+-
+- if (invalid)
++ case MSR_IA32_DEBUGCTLMSR:
++ if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
+ return 1;
+
++ data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
++
+ if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
+ VM_EXIT_SAVE_DEBUG_CONTROLS)
+ get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+@@ -2161,7 +2167,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ (data & DEBUGCTLMSR_LBR))
+ intel_pmu_create_guest_lbr_event(vcpu);
+ return 0;
+- }
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated &&
+--
+2.50.1
+
--- /dev/null
+From 318a7d25fb1c4671eb3c0e5ead8980801321fd3a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:55 -0700
+Subject: KVM: VMX: Handle forced exit due to preemption timer in fastpath
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 11776aa0cfa7d007ad1799b1553bdcbd830e5010 ]
+
+Handle VMX preemption timer VM-Exits due to KVM forcing an exit in the
+exit fastpath, i.e. avoid calling back into handle_preemption_timer() for
+the same exit. There is no work to be done for forced exits, as the name
+suggests the goal is purely to get control back in KVM.
+
+In addition to shaving a few cycles, this will allow cleanly separating
+handle_fastpath_preemption_timer() from handle_preemption_timer(), e.g.
+it's not immediately obvious why _apparently_ calling
+handle_fastpath_preemption_timer() twice on a "slow" exit is necessary:
+the "slow" call is necessary to handle exits from L2, which are excluded
+from the fastpath by vmx_vcpu_run().
+
+Link: https://lore.kernel.org/r/20240110012705.506918-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 96bbccd9477c..c804ad001a79 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -5941,12 +5941,15 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
+ return EXIT_FASTPATH_REENTER_GUEST;
+
+- if (!vmx->req_immediate_exit) {
+- kvm_lapic_expired_hv_timer(vcpu);
+- return EXIT_FASTPATH_REENTER_GUEST;
+- }
++ /*
++ * If the timer expired because KVM used it to force an immediate exit,
++ * then mission accomplished.
++ */
++ if (vmx->req_immediate_exit)
++ return EXIT_FASTPATH_EXIT_HANDLED;
+
+- return EXIT_FASTPATH_NONE;
++ kvm_lapic_expired_hv_timer(vcpu);
++ return EXIT_FASTPATH_REENTER_GUEST;
+ }
+
+ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+--
+2.50.1
+
--- /dev/null
+From d21ac42171b150d6870e91a395f72845982311ff Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:57 -0700
+Subject: KVM: VMX: Handle KVM-induced preemption timer exits in fastpath for
+ L2
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 7b3d1bbf8d68d76fb21210932a5e8ed8ea80dbcc ]
+
+Eat VMX preemption timer exits in the fastpath regardless of whether L1 or
+L2 is active. The VM-Exit is 100% KVM-induced, i.e. there is nothing
+directly related to the exit that KVM needs to do on behalf of the guest,
+thus there is no reason to wait until the slow path to do nothing.
+
+Opportunistically add comments explaining why preemption timer exits for
+emulating the guest's APIC timer need to go down the slow path.
+
+Link: https://lore.kernel.org/r/20240110012705.506918-6-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 22 ++++++++++++++++++++--
+ 1 file changed, 20 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 18ceed9046a9..4db9d41d988c 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -5948,13 +5948,26 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+ if (vmx->req_immediate_exit)
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
++ /*
++ * If L2 is active, go down the slow path as emulating the guest timer
++ * expiration likely requires synthesizing a nested VM-Exit.
++ */
++ if (is_guest_mode(vcpu))
++ return EXIT_FASTPATH_NONE;
++
+ kvm_lapic_expired_hv_timer(vcpu);
+ return EXIT_FASTPATH_REENTER_GUEST;
+ }
+
+ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+ {
+- handle_fastpath_preemption_timer(vcpu);
++ /*
++ * This non-fastpath handler is reached if and only if the preemption
++ * timer was being used to emulate a guest timer while L2 is active.
++ * All other scenarios are supposed to be handled in the fastpath.
++ */
++ WARN_ON_ONCE(!is_guest_mode(vcpu));
++ kvm_lapic_expired_hv_timer(vcpu);
+ return 1;
+ }
+
+@@ -7138,7 +7151,12 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+
+ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+ {
+- if (is_guest_mode(vcpu))
++ /*
++ * If L2 is active, some VMX preemption timer exits can still be handled
++ * in the fastpath; all other exits must use the slow path.
++ */
++ if (is_guest_mode(vcpu) &&
++ to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
+ return EXIT_FASTPATH_NONE;
+
+ switch (to_vmx(vcpu)->exit_reason.basic) {
+--
+2.50.1
+
--- /dev/null
+From 94d2d32566130542daf6fc1a32f0c8b615def9bd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:12:05 -0700
+Subject: KVM: VMX: Preserve host's DEBUGCTLMSR_FREEZE_IN_SMM while running the
+ guest
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 6b1dd26544d045f6a79e8c73572c0c0db3ef3c1a ]
+
+Set/clear DEBUGCTLMSR_FREEZE_IN_SMM in GUEST_IA32_DEBUGCTL based on the
+host's pre-VM-Enter value, i.e. preserve the host's FREEZE_IN_SMM setting
+while running the guest. When running with the "default treatment of SMIs"
+in effect (the only mode KVM supports), SMIs do not generate a VM-Exit that
+is visible to host (non-SMM) software, and the CPU instead transitions directly
+from VMX non-root to SMM. And critically, DEBUGCTL isn't context switched
+by hardware on SMI or RSM, i.e. SMM will run with whatever value was
+resident in hardware at the time of the SMI.
+
+Failure to preserve FREEZE_IN_SMM results in the PMU unexpectedly counting
+events while the CPU is executing in SMM, which can pollute profiling and
+potentially leak information into the guest.
+
+Check for changes in FREEZE_IN_SMM prior to every entry into KVM's inner
+run loop, as the bit can be toggled in IRQ context via IPI callback (SMP
+function call), by way of /sys/devices/cpu/freeze_on_smi.
+
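+The per-entry check boils down to the following (a sketch of the x86.c hunk
+below, omitting the protected-guest exemption):
+
+        debug_ctl = get_debugctlmsr();
+        if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL)
+                run_flags |= KVM_RUN_LOAD_DEBUGCTL;     /* vendor code reloads GUEST_IA32_DEBUGCTL */
+        vcpu->arch.host_debugctl = debug_ctl;
+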
+Add a field in kvm_x86_ops to communicate which DEBUGCTL bits need to be
+preserved, as FREEZE_IN_SMM is only supported and defined for Intel CPUs,
+i.e. explicitly checking FREEZE_IN_SMM in common x86 is at best weird, and
+at worst could lead to undesirable behavior in the future if AMD CPUs ever
+happened to pick up a collision with the bit.
+
+Exempt TDX vCPUs, i.e. protected guests, from the check, as the TDX Module
+owns and controls GUEST_IA32_DEBUGCTL.
+
+WARN in SVM if KVM_RUN_LOAD_DEBUGCTL is set, mostly to document that the
+lack of handling isn't a KVM bug (TDX already WARNs on any run_flag).
+
+Lastly, explicitly reload GUEST_IA32_DEBUGCTL on a VM-Fail that is missed
+by KVM but detected by hardware, i.e. in nested_vmx_restore_host_state().
+Doing so avoids the need to track host_debugctl on a per-VMCS basis, as
+GUEST_IA32_DEBUGCTL is unconditionally written by prepare_vmcs02() and
+load_vmcs12_host_state(). For the VM-Fail case, even though KVM won't
+have actually entered the guest, vcpu_enter_guest() will have run with
+vmcs02 active and thus could result in vmcs01 being run with a stale value.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Co-developed-by: Sean Christopherson <seanjc@google.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-9-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: move vmx/main.c change to vmx/vmx.c]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 7 +++++++
+ arch/x86/kvm/vmx/nested.c | 3 +++
+ arch/x86/kvm/vmx/vmx.c | 5 +++++
+ arch/x86/kvm/vmx/vmx.h | 15 ++++++++++++++-
+ arch/x86/kvm/x86.c | 14 ++++++++++++--
+ 5 files changed, 41 insertions(+), 3 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index c8fc4f2acf69..d0229323ca63 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1459,6 +1459,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+ enum kvm_x86_run_flags {
+ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
+ KVM_RUN_LOAD_GUEST_DR6 = BIT(1),
++ KVM_RUN_LOAD_DEBUGCTL = BIT(2),
+ };
+
+ struct kvm_x86_ops {
+@@ -1484,6 +1485,12 @@ struct kvm_x86_ops {
+ void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+ void (*vcpu_put)(struct kvm_vcpu *vcpu);
+
++ /*
++ * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to
++ * match the host's value even while the guest is active.
++ */
++ const u64 HOST_OWNED_DEBUGCTL;
++
+ void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
+ int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+ int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index a220770644e1..2c3cf4351c4c 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -4627,6 +4627,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+ WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+ }
+
++ /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
++ vmx_reload_guest_debugctl(vcpu);
++
+ /*
+ * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+ * handle a variety of side effects to KVM's software model.
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index e470a294b22d..3fef4e14abc6 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7258,6 +7258,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+ set_debugreg(vcpu->arch.dr6, 6);
+
++ if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
++ vmx_reload_guest_debugctl(vcpu);
++
+ /*
+ * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
+ * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+@@ -8197,6 +8200,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+ .vcpu_load = vmx_vcpu_load,
+ .vcpu_put = vmx_vcpu_put,
+
++ .HOST_OWNED_DEBUGCTL = DEBUGCTLMSR_FREEZE_IN_SMM,
++
+ .update_exception_bitmap = vmx_update_exception_bitmap,
+ .get_msr_feature = vmx_get_msr_feature,
+ .get_msr = vmx_get_msr,
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index b7ae263cde7b..dc6f06326648 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -447,12 +447,25 @@ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+
+ static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
+ {
++ WARN_ON_ONCE(val & DEBUGCTLMSR_FREEZE_IN_SMM);
++
++ val |= vcpu->arch.host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM;
+ vmcs_write64(GUEST_IA32_DEBUGCTL, val);
+ }
+
+ static inline u64 vmx_guest_debugctl_read(void)
+ {
+- return vmcs_read64(GUEST_IA32_DEBUGCTL);
++ return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~DEBUGCTLMSR_FREEZE_IN_SMM;
++}
++
++static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu)
++{
++ u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL);
++
++ if (!((val ^ vcpu->arch.host_debugctl) & DEBUGCTLMSR_FREEZE_IN_SMM))
++ return;
++
++ vmx_guest_debugctl_write(vcpu, val & ~DEBUGCTLMSR_FREEZE_IN_SMM);
+ }
+
+ /*
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 9d66830d594c..dfecf5ba5aa7 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10591,7 +10591,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ dm_request_for_irq_injection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+ fastpath_t exit_fastpath;
+- u64 run_flags;
++ u64 run_flags, debug_ctl;
+
+ bool req_immediate_exit = false;
+
+@@ -10838,7 +10838,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ set_debugreg(0, 7);
+ }
+
+- vcpu->arch.host_debugctl = get_debugctlmsr();
++ /*
++ * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL
++ * can be modified in IRQ context, e.g. via SMP function calls. Inform
++ * vendor code if any host-owned bits were changed, e.g. so that the
++ * value loaded into hardware while running the guest can be updated.
++ */
++ debug_ctl = get_debugctlmsr();
++ if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
++ !vcpu->arch.guest_state_protected)
++ run_flags |= KVM_RUN_LOAD_DEBUGCTL;
++ vcpu->arch.host_debugctl = debug_ctl;
+
+ guest_timing_enter_irqoff();
+
+--
+2.50.1
+
--- /dev/null
+From 46e5f37d619ea0a3b02610d32be90ddab43d9393 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:54 -0700
+Subject: KVM: VMX: Re-enter guest in fastpath for "spurious" preemption timer
+ exits
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit e6b5d16bbd2d4c8259ad76aa33de80d561aba5f9 ]
+
+Re-enter the guest in the fast path if the VMX preemption timer VM-Exit was
+"spurious", i.e. if KVM "soft disabled" the timer by writing -1u and by
+some miracle the timer expired before any other VM-Exit occurred. This is
+just an intermediate step to cleaning up the preemption timer handling,
+optimizing these types of spurious VM-Exits is not interesting as they are
+extremely rare/infrequent.
+
+Link: https://lore.kernel.org/r/20240110012705.506918-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 0b495979a02b..96bbccd9477c 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -5933,8 +5933,15 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+- if (!vmx->req_immediate_exit &&
+- !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
++ /*
++ * In the *extremely* unlikely scenario that this is a spurious VM-Exit
++ * due to the timer expiring while it was "soft" disabled, just eat the
++ * exit and re-enter the guest.
++ */
++ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
++ return EXIT_FASTPATH_REENTER_GUEST;
++
++ if (!vmx->req_immediate_exit) {
+ kvm_lapic_expired_hv_timer(vcpu);
+ return EXIT_FASTPATH_REENTER_GUEST;
+ }
+--
+2.50.1
+
--- /dev/null
+From 495f4d2993192a89076ae3ae03216019fc88fa55 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:12:04 -0700
+Subject: KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 7d0cce6cbe71af6e9c1831bff101a2b9c249c4a2 ]
+
+Introduce vmx_guest_debugctl_{read,write}() to handle all accesses to
+vmcs.GUEST_IA32_DEBUGCTL. This will allow stuffing FREEZE_IN_SMM into
+GUEST_IA32_DEBUGCTL based on the host setting without bleeding the state
+into the guest, and without needing to copy+paste the FREEZE_IN_SMM
+logic into every patch that accesses GUEST_IA32_DEBUGCTL.
+
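+At a call site the conversion is mechanical, e.g. (sketch):
+
+        /* before */
+        vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+        data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+        /* after */
+        vmx_guest_debugctl_write(vcpu, data);
+        data = vmx_guest_debugctl_read();
+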
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+[sean: massage changelog, make inline, use in all prepare_vmcs02() cases]
+Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-8-seanjc@google.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/nested.c | 10 +++++-----
+ arch/x86/kvm/vmx/pmu_intel.c | 8 ++++----
+ arch/x86/kvm/vmx/vmx.c | 8 +++++---
+ arch/x86/kvm/vmx/vmx.h | 10 ++++++++++
+ 4 files changed, 24 insertions(+), 12 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index da129e12cff9..a220770644e1 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2532,11 +2532,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+ kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
+- vmx_get_supported_debugctl(vcpu, false));
++ vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
++ vmx_get_supported_debugctl(vcpu, false));
+ } else {
+ kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+- vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
++ vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
+ }
+ if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+@@ -3404,7 +3404,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+
+ if (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+- vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
++ vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
+ if (kvm_mpx_supported() &&
+ (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+@@ -4572,7 +4572,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
+
+ kvm_set_dr(vcpu, 7, 0x400);
+- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
++ vmx_guest_debugctl_write(vcpu, 0);
+
+ if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+ vmcs12->vm_exit_msr_load_count))
+diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
+index 220cdbe1e286..76d3ed8abf6a 100644
+--- a/arch/x86/kvm/vmx/pmu_intel.c
++++ b/arch/x86/kvm/vmx/pmu_intel.c
+@@ -672,11 +672,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
+ */
+ static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
+ {
+- u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
++ u64 data = vmx_guest_debugctl_read();
+
+ if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
+ data &= ~DEBUGCTLMSR_LBR;
+- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
++ vmx_guest_debugctl_write(vcpu, data);
+ }
+ }
+
+@@ -746,7 +746,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+
+ if (!lbr_desc->event) {
+ vmx_disable_lbr_msrs_passthrough(vcpu);
+- if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
++ if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)
+ goto warn;
+ if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
+ goto warn;
+@@ -769,7 +769,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+
+ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
+ {
+- if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
++ if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR))
+ intel_pmu_release_guest_lbr_event(vcpu);
+ }
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 0b37e21d55b1..e470a294b22d 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2027,7 +2027,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+- msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
++ msr_info->data = vmx_guest_debugctl_read();
+ break;
+ default:
+ find_uret_msr:
+@@ -2161,7 +2161,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ VM_EXIT_SAVE_DEBUG_CONTROLS)
+ get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+
+- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
++ vmx_guest_debugctl_write(vcpu, data);
++
+ if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
+ (data & DEBUGCTLMSR_LBR))
+ intel_pmu_create_guest_lbr_event(vcpu);
+@@ -4751,7 +4752,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
++
++ vmx_guest_debugctl_write(&vmx->vcpu, 0);
+
+ if (cpu_has_vmx_tpr_shadow()) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 99e3f46de2ec..b7ae263cde7b 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -445,6 +445,16 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+ u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
+ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
+
++static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
++{
++ vmcs_write64(GUEST_IA32_DEBUGCTL, val);
++}
++
++static inline u64 vmx_guest_debugctl_read(void)
++{
++ return vmcs_read64(GUEST_IA32_DEBUGCTL);
++}
++
+ /*
+ * Note, early Intel manuals have the write-low and read-high bitmap offsets
+ * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and
+--
+2.50.1
+
--- /dev/null
+From 36f7addde5e161c3ad08eccfdaaf6d318b6e6461 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:59 -0700
+Subject: KVM: x86: Convert vcpu_run()'s immediate exit param into a generic
+ bitmap
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 2478b1b220c49d25cb1c3f061ec4f9b351d9a131 ]
+
+Convert kvm_x86_ops.vcpu_run()'s "force_immediate_exit" boolean parameter
+into a generic bitmap so that similar "take action" information can be
+passed to vendor code without creating a pile of boolean parameters.
+
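+For illustration, the caller and callee sides after the conversion look like
+this (sketch; the real hunks are below):
+
+        /* x86.c: build the bitmap */
+        u64 run_flags = 0;
+        if (req_immediate_exit)
+                run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
+        exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, run_flags);
+
+        /* vendor code: extract the bit it cares about */
+        bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
+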
+This will allow dropping kvm_x86_ops.set_dr6() in favor of a new flag, and
+will also allow for adding similar functionality for re-loading debugctl
+in the active VMCS.
+
+Opportunistically massage the TDX WARN and comment to prepare for adding
+more run_flags, all of which are expected to be mutually exclusive with
+TDX, i.e. should be WARNed on.
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: drop TDX crud, account for lack of kvm_x86_call()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 6 +++++-
+ arch/x86/kvm/svm/svm.c | 4 ++--
+ arch/x86/kvm/vmx/vmx.c | 3 ++-
+ arch/x86/kvm/x86.c | 10 ++++++++--
+ 4 files changed, 17 insertions(+), 6 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 86f3bd6601e7..1383f5e5238a 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1456,6 +1456,10 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+ return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
+ }
+
++enum kvm_x86_run_flags {
++ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
++};
++
+ struct kvm_x86_ops {
+ const char *name;
+
+@@ -1529,7 +1533,7 @@ struct kvm_x86_ops {
+
+ int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
+ enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
+- bool force_immediate_exit);
++ u64 run_flags);
+ int (*handle_exit)(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion exit_fastpath);
+ int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 12de50db401f..dc8a1b72d8ec 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4008,9 +4008,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+ guest_state_exit_irqoff();
+ }
+
+-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+- bool force_immediate_exit)
++static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ {
++ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 179747d04edc..382f42200688 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7204,8 +7204,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ guest_state_exit_irqoff();
+ }
+
+-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
++static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ {
++ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long cr3, cr4;
+
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 400a6e9fb0be..83e5e823cbae 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10591,6 +10591,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ dm_request_for_irq_injection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+ fastpath_t exit_fastpath;
++ u64 run_flags;
+
+ bool req_immediate_exit = false;
+
+@@ -10811,8 +10812,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ goto cancel_injection;
+ }
+
+- if (req_immediate_exit)
++ run_flags = 0;
++ if (req_immediate_exit) {
++ run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
++ }
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+@@ -10848,7 +10852,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+ (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
+
+- exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
++ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, run_flags);
+ if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+ break;
+
+@@ -10860,6 +10864,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ break;
+ }
+
++ run_flags = 0;
++
+ /* Note, VM-Exits that go down the "slow" path are accounted below. */
+ ++vcpu->stat.exits;
+ }
+--
+2.50.1
+
--- /dev/null
+From 3a65689ab6b232b205f7e1d222883025eacb62d1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:12:00 -0700
+Subject: KVM: x86: Drop kvm_x86_ops.set_dr6() in favor of a new KVM_RUN flag
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 80c64c7afea1da6a93ebe88d3d29d8a60377ef80 ]
+
+Instruct vendor code to load the guest's DR6 into hardware via a new
+KVM_RUN flag, and remove kvm_x86_ops.set_dr6(), whose sole purpose was to
+load vcpu->arch.dr6 into hardware when DR6 can be read/written directly
+by the guest.
+
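+The flag replaces the hook with a plain check in each vendor's run path, e.g.
+(sketch of the common and VMX sides):
+
+        /* x86.c */
+        if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+                run_flags |= KVM_RUN_LOAD_GUEST_DR6;
+
+        /* vmx.c */
+        if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+                set_debugreg(vcpu->arch.dr6, 6);
+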
+Note, TDX already WARNs on any run_flag being set, i.e. will yell if KVM
+thinks DR6 needs to be reloaded. TDX vCPUs force KVM_DEBUGREG_AUTO_SWITCH
+and never clear the flag, i.e. should never observe KVM_RUN_LOAD_GUEST_DR6.
+
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: account for lack of vmx/main.c]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm-x86-ops.h | 1 -
+ arch/x86/include/asm/kvm_host.h | 2 +-
+ arch/x86/kvm/svm/svm.c | 10 ++++++----
+ arch/x86/kvm/vmx/vmx.c | 10 +++-------
+ arch/x86/kvm/x86.c | 2 +-
+ 5 files changed, 11 insertions(+), 14 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
+index 0e5ae3b0c867..c068565fe954 100644
+--- a/arch/x86/include/asm/kvm-x86-ops.h
++++ b/arch/x86/include/asm/kvm-x86-ops.h
+@@ -47,7 +47,6 @@ KVM_X86_OP(set_idt)
+ KVM_X86_OP(get_gdt)
+ KVM_X86_OP(set_gdt)
+ KVM_X86_OP(sync_dirty_debug_regs)
+-KVM_X86_OP(set_dr6)
+ KVM_X86_OP(set_dr7)
+ KVM_X86_OP(cache_reg)
+ KVM_X86_OP(get_rflags)
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 1383f5e5238a..c8fc4f2acf69 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1458,6 +1458,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+
+ enum kvm_x86_run_flags {
+ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
++ KVM_RUN_LOAD_GUEST_DR6 = BIT(1),
+ };
+
+ struct kvm_x86_ops {
+@@ -1504,7 +1505,6 @@ struct kvm_x86_ops {
+ void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
+- void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
+ void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
+ void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+ unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index dc8a1b72d8ec..5a6bd9d5cceb 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4052,10 +4052,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ svm_hv_update_vp_id(svm->vmcb, vcpu);
+
+ /*
+- * Run with all-zero DR6 unless needed, so that we can get the exact cause
+- * of a #DB.
++ * Run with all-zero DR6 unless the guest can write DR6 freely, so that
++ * KVM can get the exact cause of a #DB. Note, loading guest DR6 from
++ * KVM's snapshot is only necessary when DR accesses won't exit.
+ */
+- if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
++ if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
++ svm_set_dr6(vcpu, vcpu->arch.dr6);
++ else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
+
+ clgi();
+@@ -4822,7 +4825,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
+ .set_idt = svm_set_idt,
+ .get_gdt = svm_get_gdt,
+ .set_gdt = svm_set_gdt,
+- .set_dr6 = svm_set_dr6,
+ .set_dr7 = svm_set_dr7,
+ .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
+ .cache_reg = svm_cache_reg,
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 382f42200688..60d1ff3fca45 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -5530,12 +5530,6 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+ set_debugreg(DR6_RESERVED, 6);
+ }
+
+-static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+-{
+- lockdep_assert_irqs_disabled();
+- set_debugreg(vcpu->arch.dr6, 6);
+-}
+-
+ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+ {
+ vmcs_writel(GUEST_DR7, val);
+@@ -7251,6 +7245,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ vcpu->arch.regs_dirty = 0;
+
++ if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
++ set_debugreg(vcpu->arch.dr6, 6);
++
+ /*
+ * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
+ * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+@@ -8208,7 +8205,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+ .set_idt = vmx_set_idt,
+ .get_gdt = vmx_get_gdt,
+ .set_gdt = vmx_set_gdt,
+- .set_dr6 = vmx_set_dr6,
+ .set_dr7 = vmx_set_dr7,
+ .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
+ .cache_reg = vmx_cache_reg,
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 83e5e823cbae..9d66830d594c 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10833,7 +10833,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ set_debugreg(vcpu->arch.eff_db[3], 3);
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+- static_call(kvm_x86_set_dr6)(vcpu, vcpu->arch.dr6);
++ run_flags |= KVM_RUN_LOAD_GUEST_DR6;
+ } else if (unlikely(hw_breakpoint_active())) {
+ set_debugreg(0, 7);
+ }
+--
+2.50.1
+
--- /dev/null
+From b596c99630a856d3912ec549084a96dd2546752f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:58 -0700
+Subject: KVM: x86: Fully defer to vendor code to decide how to force immediate
+ exit
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 0ec3d6d1f169baa7fc512ae4b78d17e7c94b7763 ]
+
+Now that vmx->req_immediate_exit is used only in the scope of
+vmx_vcpu_run(), use force_immediate_exit to detect that KVM should usurp
+the VMX preemption timer to force a VM-Exit and let vendor code fully handle
+forcing a VM-Exit.
+
+Opportunistically drop __kvm_request_immediate_exit() and just have
+vendor code call smp_send_reschedule() directly. SVM already does this
+when injecting an event while also trying to single-step an IRET, i.e.
+it's not exactly secret knowledge that KVM uses a reschedule IPI to force
+an exit.
+
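+Concretely, vendor code now does the equivalent of (sketch):
+
+        if (force_immediate_exit)
+                smp_send_reschedule(vcpu->cpu); /* SVM, or VMX without the preemption timer */
+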
+Link: https://lore.kernel.org/r/20240110012705.506918-7-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve absurd conflict due to funky kvm_x86_ops.sched_in prototype]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm-x86-ops.h | 1 -
+ arch/x86/include/asm/kvm_host.h | 3 ---
+ arch/x86/kvm/svm/svm.c | 7 ++++---
+ arch/x86/kvm/vmx/vmx.c | 32 +++++++++++++-----------------
+ arch/x86/kvm/vmx/vmx.h | 2 --
+ arch/x86/kvm/x86.c | 10 +---------
+ 6 files changed, 19 insertions(+), 36 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
+index 29bef25ac77c..0e5ae3b0c867 100644
+--- a/arch/x86/include/asm/kvm-x86-ops.h
++++ b/arch/x86/include/asm/kvm-x86-ops.h
+@@ -100,7 +100,6 @@ KVM_X86_OP(write_tsc_multiplier)
+ KVM_X86_OP(get_exit_info)
+ KVM_X86_OP(check_intercept)
+ KVM_X86_OP(handle_exit_irqoff)
+-KVM_X86_OP(request_immediate_exit)
+ KVM_X86_OP(sched_in)
+ KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
+ KVM_X86_OP_OPTIONAL(vcpu_blocking)
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 93f523762854..86f3bd6601e7 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1590,8 +1590,6 @@ struct kvm_x86_ops {
+ struct x86_exception *exception);
+ void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
+
+- void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
+-
+ void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
+
+ /*
+@@ -2059,7 +2057,6 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+
+ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
+
+ void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
+ u32 size);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 337a304d211b..12de50db401f 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4033,9 +4033,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+ * is enough to force an immediate vmexit.
+ */
+ disable_nmi_singlestep(svm);
+- smp_send_reschedule(vcpu->cpu);
++ force_immediate_exit = true;
+ }
+
++ if (force_immediate_exit)
++ smp_send_reschedule(vcpu->cpu);
++
+ pre_svm_run(vcpu);
+
+ sync_lapic_to_cr8(vcpu);
+@@ -4874,8 +4877,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
+ .check_intercept = svm_check_intercept,
+ .handle_exit_irqoff = svm_handle_exit_irqoff,
+
+- .request_immediate_exit = __kvm_request_immediate_exit,
+-
+ .sched_in = svm_sched_in,
+
+ .nested_ops = &svm_nested_ops,
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 4db9d41d988c..179747d04edc 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -49,6 +49,8 @@
+ #include <asm/virtext.h>
+ #include <asm/vmx.h>
+
++#include <trace/events/ipi.h>
++
+ #include "capabilities.h"
+ #include "cpuid.h"
+ #include "evmcs.h"
+@@ -1223,8 +1225,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+ u16 fs_sel, gs_sel;
+ int i;
+
+- vmx->req_immediate_exit = false;
+-
+ /*
+ * Note that guest MSRs to be saved/restored can also be changed
+ * when guest state is loaded. This happens when guest transitions
+@@ -5929,7 +5929,8 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
+ return 1;
+ }
+
+-static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
++static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
++ bool force_immediate_exit)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+@@ -5945,7 +5946,7 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+ * If the timer expired because KVM used it to force an immediate exit,
+ * then mission accomplished.
+ */
+- if (vmx->req_immediate_exit)
++ if (force_immediate_exit)
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ /*
+@@ -7090,13 +7091,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
+ msrs[i].host, false);
+ }
+
+-static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
++static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 tscl;
+ u32 delta_tsc;
+
+- if (vmx->req_immediate_exit) {
++ if (force_immediate_exit) {
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
+ vmx->loaded_vmcs->hv_timer_soft_disabled = false;
+ } else if (vmx->hv_deadline_tsc != -1) {
+@@ -7149,7 +7150,8 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+ barrier_nospec();
+ }
+
+-static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
++static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
++ bool force_immediate_exit)
+ {
+ /*
+ * If L2 is active, some VMX preemption timer exits can be handled in
+@@ -7163,7 +7165,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+ case EXIT_REASON_MSR_WRITE:
+ return handle_fastpath_set_msr_irqoff(vcpu);
+ case EXIT_REASON_PREEMPTION_TIMER:
+- return handle_fastpath_preemption_timer(vcpu);
++ return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
+ default:
+ return EXIT_FASTPATH_NONE;
+ }
+@@ -7284,7 +7286,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+ vmx_passthrough_lbr_msrs(vcpu);
+
+ if (enable_preemption_timer)
+- vmx_update_hv_timer(vcpu);
++ vmx_update_hv_timer(vcpu, force_immediate_exit);
++ else if (force_immediate_exit)
++ smp_send_reschedule(vcpu->cpu);
+
+ kvm_wait_lapic_expire(vcpu);
+
+@@ -7358,7 +7362,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+ vmx_recover_nmi_blocking(vmx);
+ vmx_complete_interrupts(vmx);
+
+- return vmx_exit_handlers_fastpath(vcpu);
++ return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
+ }
+
+ static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
+@@ -7865,11 +7869,6 @@ static __init void vmx_set_cpu_caps(void)
+ kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
+ }
+
+-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
+-{
+- to_vmx(vcpu)->req_immediate_exit = true;
+-}
+-
+ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info)
+ {
+@@ -8275,8 +8274,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+ .check_intercept = vmx_check_intercept,
+ .handle_exit_irqoff = vmx_handle_exit_irqoff,
+
+- .request_immediate_exit = vmx_request_immediate_exit,
+-
+ .sched_in = vmx_sched_in,
+
+ .cpu_dirty_log_size = PML_ENTITY_NUM,
+@@ -8533,7 +8530,6 @@ static __init int hardware_setup(void)
+ if (!enable_preemption_timer) {
+ vmx_x86_ops.set_hv_timer = NULL;
+ vmx_x86_ops.cancel_hv_timer = NULL;
+- vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
+ }
+
+ kvm_caps.supported_mce_cap |= MCG_LMCE_P;
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 357819872d80..ddbe73958d7f 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -343,8 +343,6 @@ struct vcpu_vmx {
+ unsigned int ple_window;
+ bool ple_window_dirty;
+
+- bool req_immediate_exit;
+-
+ /* Support for PML */
+ #define PML_ENTITY_NUM 512
+ struct page *pml_pg;
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 08c3da88f402..400a6e9fb0be 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10578,12 +10578,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+ static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
+ }
+
+-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
+-{
+- smp_send_reschedule(vcpu->cpu);
+-}
+-EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
+-
+ /*
+ * Called within kvm->srcu read side.
+ * Returns 1 to let vcpu_run() continue the guest execution loop without
+@@ -10817,10 +10811,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ goto cancel_injection;
+ }
+
+- if (req_immediate_exit) {
++ if (req_immediate_exit)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+- static_call(kvm_x86_request_immediate_exit)(vcpu);
+- }
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+--
+2.50.1
+
--- /dev/null
+From 2ce55c36cca09ff95c3ba4cdb09407fc864500b0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:56 -0700
+Subject: KVM: x86: Move handling of is_guest_mode() into fastpath exit
+ handlers
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit bf1a49436ea37b98dd2f37c57608951d0e28eecc ]
+
+Let the fastpath code decide which exits can/can't be handled in the
+fastpath when L2 is active, e.g. when KVM generates a VMX preemption
+timer exit to forcefully regain control, there is no "work" to be done and
+so such exits can be handled in the fastpath regardless of whether L1 or
+L2 is active.
+
+Moving the is_guest_mode() check into the fastpath code also makes it
+easier to see that L2 isn't allowed to use the fastpath in most cases,
+e.g. it's not immediately obvious why handle_fastpath_preemption_timer()
+is called from the fastpath and the normal path.
+
+Link: https://lore.kernel.org/r/20240110012705.506918-5-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve syntactic conflict in svm_exit_handlers_fastpath()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/svm/svm.c | 6 +++---
+ arch/x86/kvm/vmx/vmx.c | 6 +++---
+ 2 files changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index b4283c2358a6..337a304d211b 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -3964,6 +3964,9 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+ {
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
++ if (is_guest_mode(vcpu))
++ return EXIT_FASTPATH_NONE;
++
+ /*
+ * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
+ * can't read guest memory (dereference memslots) to decode the WRMSR.
+@@ -4127,9 +4130,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+
+ svm_complete_interrupts(vcpu);
+
+- if (is_guest_mode(vcpu))
+- return EXIT_FASTPATH_NONE;
+-
+ return svm_exit_handlers_fastpath(vcpu);
+ }
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index c804ad001a79..18ceed9046a9 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7138,6 +7138,9 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+
+ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+ {
++ if (is_guest_mode(vcpu))
++ return EXIT_FASTPATH_NONE;
++
+ switch (to_vmx(vcpu)->exit_reason.basic) {
+ case EXIT_REASON_MSR_WRITE:
+ return handle_fastpath_set_msr_irqoff(vcpu);
+@@ -7337,9 +7340,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+ vmx_recover_nmi_blocking(vmx);
+ vmx_complete_interrupts(vmx);
+
+- if (is_guest_mode(vcpu))
+- return EXIT_FASTPATH_NONE;
+-
+ return vmx_exit_handlers_fastpath(vcpu);
+ }
+
+--
+2.50.1
+
--- /dev/null
+From b8df9da8aaf5d2d743800536dbd0bf0ec684f320 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:53 -0700
+Subject: KVM: x86: Plumb "force_immediate_exit" into kvm_entry() tracepoint
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 9c9025ea003a03f967affd690f39b4ef3452c0f5 ]
+
+Annotate the kvm_entry() tracepoint with "immediate exit" when KVM is
+forcing a VM-Exit immediately after VM-Enter, e.g. when KVM wants to
+inject an event but needs to first complete some other operation.
+Knowing that KVM is (or isn't) forcing an exit is useful information when
+debugging issues related to event injection.
+
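+With the annotation, a forced entry shows up in the trace as, e.g. (vCPU id
+and RIP are made-up values):
+
+        kvm_entry: vcpu 0, rip 0xffffffff81234567[immediate exit]
+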
+Suggested-by: Maxim Levitsky <mlevitsk@redhat.com>
+Link: https://lore.kernel.org/r/20240110012705.506918-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 3 ++-
+ arch/x86/kvm/svm/svm.c | 5 +++--
+ arch/x86/kvm/trace.h | 9 ++++++---
+ arch/x86/kvm/vmx/vmx.c | 4 ++--
+ arch/x86/kvm/x86.c | 2 +-
+ 5 files changed, 14 insertions(+), 9 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 555c7bf35e28..93f523762854 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1528,7 +1528,8 @@ struct kvm_x86_ops {
+ void (*flush_tlb_guest)(struct kvm_vcpu *vcpu);
+
+ int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
+- enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu);
++ enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
++ bool force_immediate_exit);
+ int (*handle_exit)(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion exit_fastpath);
+ int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 2c0f9c7d1242..b4283c2358a6 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4005,12 +4005,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+ guest_state_exit_irqoff();
+ }
+
+-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
++static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
++ bool force_immediate_exit)
+ {
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
+
+- trace_kvm_entry(vcpu);
++ trace_kvm_entry(vcpu, force_immediate_exit);
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
+index 6c1dcf44c4fa..ab407bc00d84 100644
+--- a/arch/x86/kvm/trace.h
++++ b/arch/x86/kvm/trace.h
+@@ -15,20 +15,23 @@
+ * Tracepoint for guest mode entry.
+ */
+ TRACE_EVENT(kvm_entry,
+- TP_PROTO(struct kvm_vcpu *vcpu),
+- TP_ARGS(vcpu),
++ TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit),
++ TP_ARGS(vcpu, force_immediate_exit),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned long, rip )
++ __field( bool, immediate_exit )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->rip = kvm_rip_read(vcpu);
++ __entry->immediate_exit = force_immediate_exit;
+ ),
+
+- TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip)
++ TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip,
++ __entry->immediate_exit ? "[immediate exit]" : "")
+ );
+
+ /*
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 390af16d9a67..0b495979a02b 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7171,7 +7171,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ guest_state_exit_irqoff();
+ }
+
+-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
++static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long cr3, cr4;
+@@ -7198,7 +7198,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+ return EXIT_FASTPATH_NONE;
+ }
+
+- trace_kvm_entry(vcpu);
++ trace_kvm_entry(vcpu, force_immediate_exit);
+
+ if (vmx->ple_window_dirty) {
+ vmx->ple_window_dirty = false;
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index d224180c56f5..08c3da88f402 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10856,7 +10856,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+ (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
+
+- exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
++ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
+ if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+ break;
+
+--
+2.50.1
+
--- /dev/null
+From 8aadc6631ffd7b08508de7b053eb6e237402d947 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:47 -0700
+Subject: KVM: x86: Plumb in the vCPU to kvm_x86_ops.hwapic_isr_update()
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 76bce9f10162cd4b36ac0b7889649b22baf70ebd ]
+
+Pass the target vCPU to the hwapic_isr_update() vendor hook so that VMX
+can defer the update until after nested VM-Exit if an EOI for L1's vAPIC
+occurs while L2 is active.
+
+Note, commit d39850f57d21 ("KVM: x86: Drop @vcpu parameter from
+kvm_x86_ops.hwapic_isr_update()") removed the parameter with the
+justification that doing so "allows for a decent amount of (future)
+cleanup in the APIC code", but it's not at all clear what cleanup was
+intended, or if it was ever realized.
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Reviewed-by: Chao Gao <chao.gao@intel.com>
+Tested-by: Chao Gao <chao.gao@intel.com>
+Link: https://lore.kernel.org/r/20241128000010.4051275-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: account for lack of kvm_x86_call(), drop vmx/x86_ops.h change]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 2 +-
+ arch/x86/kvm/lapic.c | 8 ++++----
+ arch/x86/kvm/vmx/vmx.c | 2 +-
+ 3 files changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 17b4e61a52b9..6db42ee82032 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1552,7 +1552,7 @@ struct kvm_x86_ops {
+ bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
+ void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
+ void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
+- void (*hwapic_isr_update)(int isr);
++ void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
+ bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
+ void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+ void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
+index 42eec987ac3d..3d65d6a023c9 100644
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -587,7 +587,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
+ * just set SVI.
+ */
+ if (unlikely(apic->apicv_active))
+- static_call_cond(kvm_x86_hwapic_isr_update)(vec);
++ static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, vec);
+ else {
+ ++apic->isr_count;
+ BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+@@ -632,7 +632,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
+ * and must be left alone.
+ */
+ if (unlikely(apic->apicv_active))
+- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
++ static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic));
+ else {
+ --apic->isr_count;
+ BUG_ON(apic->isr_count < 0);
+@@ -2554,7 +2554,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
+ if (apic->apicv_active) {
+ static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1);
+- static_call_cond(kvm_x86_hwapic_isr_update)(-1);
++ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1);
+ }
+
+ vcpu->arch.apic_arb_prio = 0;
+@@ -2847,7 +2847,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+ if (apic->apicv_active) {
+ static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
+- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
++ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
+ }
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ if (ioapic_in_kernel(vcpu->kvm))
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 9a5cb896229f..721ba6ddb121 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6708,7 +6708,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+ put_page(page);
+ }
+
+-static void vmx_hwapic_isr_update(int max_isr)
++static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+ {
+ u16 status;
+ u8 old;
+--
+2.50.1
+
--- /dev/null
+From 7a29b546168ea7252a909633d7860beb1599191b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:52 -0700
+Subject: KVM: x86/pmu: Gate all "unimplemented MSR" prints on
+ report_ignored_msrs
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit e76ae52747a82a548742107b4100e90da41a624d ]
+
+Add helpers to print unimplemented MSR accesses and condition all such
+prints on report_ignored_msrs, i.e. honor userspace's request to not
+print unimplemented MSRs. Even though vcpu_unimpl() is ratelimited,
+printing can still be problematic, e.g. if a print gets stalled when host
+userspace is writing MSRs during live migration, an effective stall can
+result in very noticeable disruption in the guest.
+
+E.g. the profile below was taken while calling KVM_SET_MSRS on the PMU
+counters while the PMU was disabled in KVM.
+
+ - 99.75% 0.00% [.] __ioctl
+ - __ioctl
+ - 99.74% entry_SYSCALL_64_after_hwframe
+ do_syscall_64
+ sys_ioctl
+ - do_vfs_ioctl
+ - 92.48% kvm_vcpu_ioctl
+ - kvm_arch_vcpu_ioctl
+ - 85.12% kvm_set_msr_ignored_check
+ svm_set_msr
+ kvm_set_msr_common
+ printk
+ vprintk_func
+ vprintk_default
+ vprintk_emit
+ console_unlock
+ call_console_drivers
+ univ8250_console_write
+ serial8250_console_write
+ uart_console_write
+
+Reported-by: Aaron Lewis <aaronlewis@google.com>
+Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Link: https://lore.kernel.org/r/20230124234905.3774678-3-seanjc@google.com
+Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/hyperv.c | 10 ++++------
+ arch/x86/kvm/svm/svm.c | 5 ++---
+ arch/x86/kvm/vmx/vmx.c | 4 +---
+ arch/x86/kvm/x86.c | 18 +++++-------------
+ arch/x86/kvm/x86.h | 12 ++++++++++++
+ 5 files changed, 24 insertions(+), 25 deletions(-)
+
+diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
+index 28555bbd52e8..cb0a531e13c5 100644
+--- a/arch/x86/kvm/hyperv.c
++++ b/arch/x86/kvm/hyperv.c
+@@ -1406,8 +1406,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ return syndbg_set_msr(vcpu, msr, data, host);
+ default:
+- vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
+- msr, data);
++ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ return 1;
+ }
+ return 0;
+@@ -1528,8 +1527,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+ return 1;
+ break;
+ default:
+- vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
+- msr, data);
++ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ return 1;
+ }
+
+@@ -1581,7 +1579,7 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ return syndbg_get_msr(vcpu, msr, pdata, host);
+ default:
+- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
++ kvm_pr_unimpl_rdmsr(vcpu, msr);
+ return 1;
+ }
+
+@@ -1646,7 +1644,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+ data = APIC_BUS_FREQUENCY;
+ break;
+ default:
+- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
++ kvm_pr_unimpl_rdmsr(vcpu, msr);
+ return 1;
+ }
+ *pdata = data;
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index b922f31d1415..2c0f9c7d1242 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -3035,8 +3035,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!lbrv) {
+- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
+- __func__, data);
++ kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
+ break;
+ }
+
+@@ -3077,7 +3076,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+ case MSR_VM_CR:
+ return svm_set_vm_cr(vcpu, data);
+ case MSR_VM_IGNNE:
+- vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
++ kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
+ break;
+ case MSR_AMD64_DE_CFG: {
+ struct kvm_msr_entry msr_entry;
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index c24da2cff208..390af16d9a67 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2140,9 +2140,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+
+ invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+ if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
+- if (report_ignored_msrs)
+- vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
+- __func__, data);
++ kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
+ data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+ invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+ }
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index b0ae61ba9b99..d224180c56f5 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -3573,7 +3573,6 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
+
+ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ {
+- bool pr = false;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+@@ -3625,15 +3624,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ if (data == BIT_ULL(18)) {
+ vcpu->arch.msr_hwcr = data;
+ } else if (data != 0) {
+- vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
+- data);
++ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ return 1;
+ }
+ break;
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ if (data != 0) {
+- vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
+- "0x%llx\n", data);
++ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ return 1;
+ }
+ break;
+@@ -3813,16 +3810,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+- pr = true;
+- fallthrough;
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
+ return kvm_pmu_set_msr(vcpu, msr_info);
+
+- if (pr || data != 0)
+- vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
+- "0x%x data 0x%llx\n", msr, data);
++ if (data)
++ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ break;
+ case MSR_K7_CLK_CTL:
+ /*
+@@ -3849,9 +3843,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ /* Drop writes to this legacy MSR -- see rdmsr
+ * counterpart for further detail.
+ */
+- if (report_ignored_msrs)
+- vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
+- msr, data);
++ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ break;
+ case MSR_AMD64_OSVW_ID_LENGTH:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
+index 9de72586f406..f3554bf05201 100644
+--- a/arch/x86/kvm/x86.h
++++ b/arch/x86/kvm/x86.h
+@@ -331,6 +331,18 @@ extern bool report_ignored_msrs;
+
+ extern bool eager_page_split;
+
++static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
++{
++ if (report_ignored_msrs)
++ vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data);
++}
++
++static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr)
++{
++ if (report_ignored_msrs)
++ vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr);
++}
++
+ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
+ {
+ return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
+--
+2.50.1
+
--- /dev/null
+From c53c4c4220e372f9a392cb4dd337b2ddd5b5596a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:46 -0700
+Subject: KVM: x86: Re-split x2APIC ICR into ICR+ICR2 for AMD (x2AVIC)
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 73b42dc69be8564d4951a14d00f827929fe5ef79 ]
+
+Re-introduce the "split" x2APIC ICR storage that KVM used prior to Intel's
+IPI virtualization support, but only for AMD. While not stated anywhere
+in the APM, despite stating the ICR is a single 64-bit register, AMD CPUs
+store the 64-bit ICR as two separate 32-bit values in ICR and ICR2. When
+IPI virtualization (IPIv on Intel, all AVIC flavors on AMD) is enabled,
+KVM needs to match CPU behavior as some ICR writes will be handled by
+the CPU, not by KVM.
+
+Add a kvm_x86_ops knob to control the underlying format used by the CPU to
+store the x2APIC ICR, and tune it to AMD vs. Intel regardless of whether
+or not x2AVIC is enabled. If KVM is handling all ICR writes, the storage
+format for x2APIC mode doesn't matter, and having the behavior follow AMD
+versus Intel will provide better test coverage and ease debugging.
+
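+For illustration only (not the exact KVM helpers added below), the two
+storage formats differ in whether the 64-bit ICR value lives in a single
+register or is split across ICR/ICR2:
+
+	/* Sketch; u32/u64 as in <linux/types.h>. */
+	static inline u64 icr_from_split(u32 icr_lo, u32 icr2_hi)
+	{
+		return (u64)icr_lo | ((u64)icr2_hi << 32);
+	}
+
+	static inline void icr_to_split(u64 icr, u32 *icr_lo, u32 *icr2_hi)
+	{
+		*icr_lo = (u32)icr;
+		*icr2_hi = (u32)(icr >> 32);
+	}
+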
+Fixes: 4d1d7942e36a ("KVM: SVM: Introduce logic to (de)activate x2AVIC mode")
+Cc: stable@vger.kernel.org
+Cc: Maxim Levitsky <mlevitsk@redhat.com>
+Cc: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
+Link: https://lore.kernel.org/r/20240719235107.3023592-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve minor syntactic conflicts]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 2 ++
+ arch/x86/kvm/lapic.c | 42 +++++++++++++++++++++++----------
+ arch/x86/kvm/svm/svm.c | 2 ++
+ arch/x86/kvm/vmx/vmx.c | 2 ++
+ 4 files changed, 36 insertions(+), 12 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index eb06c2f68314..17b4e61a52b9 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1547,6 +1547,8 @@ struct kvm_x86_ops {
+ void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+ void (*enable_irq_window)(struct kvm_vcpu *vcpu);
+ void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
++
++ const bool x2apic_icr_is_split;
+ bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
+ void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
+ void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
+diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
+index 7f57dce5c828..42eec987ac3d 100644
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -2315,11 +2315,25 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
+ data &= ~APIC_ICR_BUSY;
+
+ kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+- kvm_lapic_set_reg64(apic, APIC_ICR, data);
++ if (kvm_x86_ops.x2apic_icr_is_split) {
++ kvm_lapic_set_reg(apic, APIC_ICR, data);
++ kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
++ } else {
++ kvm_lapic_set_reg64(apic, APIC_ICR, data);
++ }
+ trace_kvm_apic_write(APIC_ICR, data);
+ return 0;
+ }
+
++static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
++{
++ if (kvm_x86_ops.x2apic_icr_is_split)
++ return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
++ (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
++
++ return kvm_lapic_get_reg64(apic, APIC_ICR);
++}
++
+ /* emulate APIC access in a trap manner */
+ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
+ {
+@@ -2337,7 +2351,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
+ * maybe-unecessary write, and both are in the noise anyways.
+ */
+ if (apic_x2apic_mode(apic) && offset == APIC_ICR)
+- WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR)));
++ WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
+ else
+ kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
+ }
+@@ -2760,18 +2774,22 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
+
+ /*
+ * In x2APIC mode, the LDR is fixed and based on the id. And
+- * ICR is internally a single 64-bit register, but needs to be
+- * split to ICR+ICR2 in userspace for backwards compatibility.
++ * if the ICR is _not_ split, ICR is internally a single 64-bit
++ * register, but needs to be split to ICR+ICR2 in userspace for
++ * backwards compatibility.
+ */
+- if (set) {
++ if (set)
+ *ldr = kvm_apic_calc_x2apic_ldr(*id);
+
+- icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
+- (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
+- __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+- } else {
+- icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
+- __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
++ if (!kvm_x86_ops.x2apic_icr_is_split) {
++ if (set) {
++ icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
++ (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
++ __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
++ } else {
++ icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
++ __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
++ }
+ }
+ }
+
+@@ -2971,7 +2989,7 @@ static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
+ u32 low;
+
+ if (reg == APIC_ICR) {
+- *data = kvm_lapic_get_reg64(apic, APIC_ICR);
++ *data = kvm_x2apic_icr_read(apic);
+ return 0;
+ }
+
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index c95a84afc35f..b922f31d1415 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4851,6 +4851,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
+ .enable_nmi_window = svm_enable_nmi_window,
+ .enable_irq_window = svm_enable_irq_window,
+ .update_cr8_intercept = svm_update_cr8_intercept,
++
++ .x2apic_icr_is_split = true,
+ .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
+ .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
+ .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index fbe26b88f731..9a5cb896229f 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -8202,6 +8202,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+ .enable_nmi_window = vmx_enable_nmi_window,
+ .enable_irq_window = vmx_enable_irq_window,
+ .update_cr8_intercept = vmx_update_cr8_intercept,
++
++ .x2apic_icr_is_split = false,
+ .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
+ .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
+ .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
+--
+2.50.1
+
--- /dev/null
+From ac35d395216d2db6535082fde4a62a3ee3849d40 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:51 -0700
+Subject: KVM: x86: Snapshot the host's DEBUGCTL after disabling IRQs
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 189ecdb3e112da703ac0699f4ec76aa78122f911 ]
+
+Snapshot the host's DEBUGCTL after disabling IRQs, as perf can toggle
+debugctl bits from IRQ context, e.g. when enabling/disabling events via
+smp_call_function_single(). Taking the snapshot (long) before IRQs are
+disabled could result in KVM effectively clobbering DEBUGCTL due to using
+a stale snapshot.
+
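+A minimal sketch of the intended ordering (snapshot_host_debugctl() is a
+hypothetical helper name used only for illustration; it assumes KVM's
+struct kvm_vcpu and x86's get_debugctlmsr()):
+
+	static void snapshot_host_debugctl(struct kvm_vcpu *vcpu)
+	{
+		/* IRQs must already be off so a perf IPI cannot race with us. */
+		lockdep_assert_irqs_disabled();
+		vcpu->arch.host_debugctl = get_debugctlmsr();
+	}
+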
+Cc: stable@vger.kernel.org
+Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@amd.com>
+Link: https://lore.kernel.org/r/20250227222411.3490595-6-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/x86.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index ba24bb50af57..b0ae61ba9b99 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -4742,7 +4742,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+
+ /* Save host pkru register if supported */
+ vcpu->arch.host_pkru = read_pkru();
+- vcpu->arch.host_debugctl = get_debugctlmsr();
+
+ /* Apply any externally detected TSC adjustments (due to suspend) */
+ if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+@@ -10851,6 +10850,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ set_debugreg(0, 7);
+ }
+
++ vcpu->arch.host_debugctl = get_debugctlmsr();
++
+ guest_timing_enter_irqoff();
+
+ for (;;) {
+--
+2.50.1
+
--- /dev/null
+From 52e78074c894adecdf2fb1d987959707ce46beed Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:11:50 -0700
+Subject: KVM: x86: Snapshot the host's DEBUGCTL in common x86
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit fb71c795935652fa20eaf9517ca9547f5af99a76 ]
+
+Move KVM's snapshot of DEBUGCTL to kvm_vcpu_arch and take the snapshot in
+common x86, so that SVM can also use the snapshot.
+
+Opportunistically change the field to a u64. While bits 63:32 are reserved
+on AMD, not mentioned at all in Intel's SDM, and managed as an "unsigned
+long" by the kernel, DEBUGCTL is an MSR and therefore a 64-bit value.
+
+Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
+Cc: stable@vger.kernel.org
+Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@amd.com>
+Link: https://lore.kernel.org/r/20250227222411.3490595-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve minor syntactic conflict in vmx_vcpu_load()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 1 +
+ arch/x86/kvm/vmx/vmx.c | 8 ++------
+ arch/x86/kvm/vmx/vmx.h | 2 --
+ arch/x86/kvm/x86.c | 1 +
+ 4 files changed, 4 insertions(+), 8 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 6db42ee82032..555c7bf35e28 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -677,6 +677,7 @@ struct kvm_vcpu_arch {
+ u32 pkru;
+ u32 hflags;
+ u64 efer;
++ u64 host_debugctl;
+ u64 apic_base;
+ struct kvm_lapic *apic; /* kernel irqchip context */
+ bool load_eoi_exitmap_pending;
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 7b87fbc69b21..c24da2cff208 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -1418,13 +1418,9 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
+ */
+ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+ {
+- struct vcpu_vmx *vmx = to_vmx(vcpu);
+-
+ vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
+
+ vmx_vcpu_pi_load(vcpu, cpu);
+-
+- vmx->host_debugctlmsr = get_debugctlmsr();
+ }
+
+ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+@@ -7275,8 +7271,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+ }
+
+ /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+- if (vmx->host_debugctlmsr)
+- update_debugctlmsr(vmx->host_debugctlmsr);
++ if (vcpu->arch.host_debugctl)
++ update_debugctlmsr(vcpu->arch.host_debugctl);
+
+ #ifndef CONFIG_X86_64
+ /*
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 8b4b149bd9c1..357819872d80 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -352,8 +352,6 @@ struct vcpu_vmx {
+ /* apic deadline value in host tsc */
+ u64 hv_deadline_tsc;
+
+- unsigned long host_debugctlmsr;
+-
+ /*
+ * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+ * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index a6dc8f662fa4..ba24bb50af57 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -4742,6 +4742,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+
+ /* Save host pkru register if supported */
+ vcpu->arch.host_pkru = read_pkru();
++ vcpu->arch.host_debugctl = get_debugctlmsr();
+
+ /* Apply any externally detected TSC adjustments (due to suspend) */
+ if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+--
+2.50.1
+
--- /dev/null
+From 70d909202444ad2c328a4944d265dc9ad7efe92a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+ if (res < 0) {
+ nf_conntrack_get(&ct->ct_general); // HERE
+ cb->args[1] = (unsigned long)ct;
+ ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, then the refcount of ct was already incremented.
+This 2nd increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+prevents cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
+
+This can be reproduced by running conntrack_resize.sh selftest in a loop.
+It takes ~20 minutes for me on a preemptible kernel on average before
+I see a runaway kworker spinning in nf_conntrack_cleanup_net_list.
+
+One fix would be to change this to:
+ if (res < 0) {
+ if (ct != last)
+ nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
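+For reference, a minimal sketch of such a cookie (mirroring the
+ctnetlink_get_id() helper added below; the non-zero fallback keeps 0
+free to mean "no skip hint" in cb->args[1]):
+
+	static unsigned long ct_dump_cookie(const struct nf_conn *ct)
+	{
+		unsigned long id = nf_ct_get_id(ct);
+
+		return id ? id : 1;
+	}
+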
+A followup patch will do the same for ctnetlink_exp_dump_table; it
+appears to have the same problem, and like ctnetlink_dump_table it
+only needs a 'skip hint', not the actual object, so the same cookie
+strategy can be applied there as well.
+
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 2cf58a8b8e4d..d3e28574ceb9 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -859,8 +859,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
+
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+- if (cb->args[1])
+- nf_ct_put((struct nf_conn *)cb->args[1]);
+ kfree(cb->data);
+ return 0;
+ }
+@@ -1175,19 +1173,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+ return 0;
+ }
+
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++ unsigned long id = nf_ct_get_id(ct);
++
++ return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+ unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
+ struct net *net = sock_net(skb->sk);
+- struct nf_conn *ct, *last;
++ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct nf_conn *nf_ct_evict[8];
++ struct nf_conn *ct;
+ int res, i;
+ spinlock_t *lockp;
+
+- last = (struct nf_conn *)cb->args[1];
+ i = 0;
+
+ local_bh_disable();
+@@ -1224,7 +1229,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ continue;
+
+ if (cb->args[1]) {
+- if (ct != last)
++ if (ctnetlink_get_id(ct) != last_id)
+ continue;
+ cb->args[1] = 0;
+ }
+@@ -1237,8 +1242,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct, true, flags);
+ if (res < 0) {
+- nf_conntrack_get(&ct->ct_general);
+- cb->args[1] = (unsigned long)ct;
++ cb->args[1] = ctnetlink_get_id(ct);
+ spin_unlock(lockp);
+ goto out;
+ }
+@@ -1251,12 +1255,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ }
+ out:
+ local_bh_enable();
+- if (last) {
++ if (last_id) {
+ /* nf ct hash resize happened, now clear the leftover. */
+- if ((struct nf_conn *)cb->args[1] == last)
++ if (cb->args[1] == last_id)
+ cb->args[1] = 0;
+-
+- nf_ct_put(last);
+ }
+
+ while (i) {
+--
+2.50.1
+
documentation-acpi-fix-parent-device-references.patch
acpi-processor-perflib-fix-initial-_ppc-limit-application.patch
acpi-processor-perflib-move-problematic-pr-performance-check.patch
+kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch
+kvm-x86-re-split-x2apic-icr-into-icr-icr2-for-amd-x2.patch
+kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch
+kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch
+kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch
+kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch
+kvm-x86-pmu-gate-all-unimplemented-msr-prints-on-rep.patch
+kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch
+kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch
+kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch
+kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch
+kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch
+kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch
+kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch
+kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch
+kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch
+kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch
+kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch
+kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch
+kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
--- /dev/null
+From 21b9dfb2ec919b8b4561d84dd45c0ee4799c62d0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, so that we can't rely on offloads because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
+
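+Conceptually (a sketch of the resulting check, not a new helper added to
+the kernel; skb_needs_ipsec_csum() is a name used only for illustration):
+
+	static bool skb_needs_ipsec_csum(const struct sk_buff *skb)
+	{
+		/* The dst (and its xfrm state) may already have been dropped
+		 * on the xmit path, but the secpath attached at decryption
+		 * time is still there, so check both. */
+		return (skb_dst(skb) && dst_xfrm(skb_dst(skb))) ||
+		       skb_sec_path(skb);
+	}
+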
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index 1a51c4b44c00..593108049ab7 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -60,7 +60,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+ remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+ skb->remcsum_offload = remcsum;
+
+- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+ /* Try to offload checksum if possible */
+ offload_csum = !!(need_csum &&
+ !need_ipsec &&
+--
+2.50.1
+
--- /dev/null
+From a05287bd1654c451e1eb7b9e28de5ef9f1b9d901 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Aug 2025 17:03:11 +0200
+Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data
+
+From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ]
+
+Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid
+discarding useful information") caused the number of wakeup interrupts
+to increase on an idle system [1], which was not expected to happen
+after merely allowing shallower idle states to be selected by the
+governor in some cases.
+
+However, on the system in question, all of the idle states deeper than
+WFI are rejected by the driver due to a firmware issue [2]. This causes
+the governor to only consider the recent interval duration data
+corresponding to attempts to enter WFI that are successful and the
+recent intervals table is filled with values lower than the scheduler
+tick period. Consequently, the governor predicts an idle duration
+below the scheduler tick period length and avoids stopping the tick
+more often which leads to the observed symptom.
+
+Address it by modifying the governor to update the recent intervals
+table also when entering the previously selected idle state fails, so
+it knows that the short idle intervals might have been the minority
+had the selected idle states been actually entered every time.
+
+Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information")
+Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1]
+Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2]
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Tested-by: Christian Loehle <christian.loehle@arm.com>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Christian Loehle <christian.loehle@arm.com>
+Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
+index 97ffadc7e57a..01322a905414 100644
+--- a/drivers/cpuidle/governors/menu.c
++++ b/drivers/cpuidle/governors/menu.c
+@@ -153,6 +153,14 @@ static inline int performance_multiplier(unsigned int nr_iowaiters)
+
+ static DEFINE_PER_CPU(struct menu_device, menu_devices);
+
++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us)
++{
++ /* Update the repeating-pattern data. */
++ data->intervals[data->interval_ptr++] = interval_us;
++ if (data->interval_ptr >= INTERVALS)
++ data->interval_ptr = 0;
++}
++
+ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
+
+ /*
+@@ -277,6 +285,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ if (data->needs_update) {
+ menu_update(drv, dev);
+ data->needs_update = 0;
++ } else if (!dev->last_residency_ns) {
++ /*
++ * This happens when the driver rejects the previously selected
++ * idle state and returns an error, so update the recent
++ * intervals table to prevent invalid information from being
++ * used going forward.
++ */
++ menu_update_intervals(data, UINT_MAX);
+ }
+
+ nr_iowaiters = nr_iowait_cpu(dev->cpu);
+@@ -546,10 +562,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+
+ data->correction_factor[data->bucket] = new_factor;
+
+- /* update the repeating-pattern data */
+- data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
+- if (data->interval_ptr >= INTERVALS)
+- data->interval_ptr = 0;
++ menu_update_intervals(data, ktime_to_us(measured_ns));
+ }
+
+ /**
+--
+2.50.1
+
--- /dev/null
+From b5a874d6221e42baa1685f2af96f79fd75b92995 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 12 Jul 2025 06:02:31 +0100
+Subject: habanalabs: fix UAF in export_dmabuf()
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+[ Upstream commit 33927f3d0ecdcff06326d6e4edb6166aed42811c ]
+
+As soon as we'd inserted a file reference into descriptor table, another
+thread could close it. That's fine for the case when all we are doing is
+returning that descriptor to userland (it's a race, but it's a userland
+race and there's nothing the kernel can do about it). However, if we
+follow fd_install() with any kind of access to objects that would be
+destroyed on close (be it the struct file itself or anything destroyed
+by its ->release()), we have a UAF.
+
+dma_buf_fd() is a combination of reserving a descriptor and fd_install().
+habanalabs export_dmabuf() calls it and then proceeds to access the
+objects destroyed on close. In particular, it grabs an extra reference to
+another struct file that will be dropped as part of ->release() for ours;
+that "will be" is actually "might have already been".
+
+Fix that by reserving descriptor before anything else and do fd_install()
+only when everything had been set up. As a side benefit, we no longer
+have the failure exit with the file already created but the reference to
+the underlying file (as well as ->dmabuf_export_cnt, etc.) not grabbed yet;
+unlike dma_buf_fd(), fd_install() can't fail.
+
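+A minimal sketch of the safe ordering for a dma-buf exporter (hypothetical
+function, error handling trimmed; not the habanalabs code itself):
+
+	static int export_buf_fd(struct dma_buf *dmabuf, int flags, int *out_fd)
+	{
+		int fd = get_unused_fd_flags(flags);	/* only reserves a slot */
+
+		if (fd < 0)
+			return fd;
+
+		/* ...grab every reference that ->release() will later drop... */
+
+		*out_fd = fd;
+		fd_install(fd, dmabuf->file);	/* publish last; cannot fail */
+		return 0;
+	}
+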
+Fixes: db1a8dd916aa ("habanalabs: add support for dma-buf exporter")
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/accel/habanalabs/common/memory.c | 23 +++++++----------------
+ 1 file changed, 7 insertions(+), 16 deletions(-)
+
+diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c
+index 3348ad12c237..11c55fd76db5 100644
+--- a/drivers/accel/habanalabs/common/memory.c
++++ b/drivers/accel/habanalabs/common/memory.c
+@@ -1829,9 +1829,6 @@ static void hl_release_dmabuf(struct dma_buf *dmabuf)
+ struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv;
+ struct hl_ctx *ctx;
+
+- if (!hl_dmabuf)
+- return;
+-
+ ctx = hl_dmabuf->ctx;
+
+ if (hl_dmabuf->memhash_hnode)
+@@ -1859,7 +1856,12 @@ static int export_dmabuf(struct hl_ctx *ctx,
+ {
+ DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+ struct hl_device *hdev = ctx->hdev;
+- int rc, fd;
++ CLASS(get_unused_fd, fd)(flags);
++
++ if (fd < 0) {
++ dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
++ return fd;
++ }
+
+ exp_info.ops = &habanalabs_dmabuf_ops;
+ exp_info.size = total_size;
+@@ -1872,13 +1874,6 @@ static int export_dmabuf(struct hl_ctx *ctx,
+ return PTR_ERR(hl_dmabuf->dmabuf);
+ }
+
+- fd = dma_buf_fd(hl_dmabuf->dmabuf, flags);
+- if (fd < 0) {
+- dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
+- rc = fd;
+- goto err_dma_buf_put;
+- }
+-
+ hl_dmabuf->ctx = ctx;
+ hl_ctx_get(hl_dmabuf->ctx);
+ atomic_inc(&ctx->hdev->dmabuf_export_cnt);
+@@ -1890,13 +1885,9 @@ static int export_dmabuf(struct hl_ctx *ctx,
+ get_file(ctx->hpriv->file_priv->filp);
+
+ *dmabuf_fd = fd;
++ fd_install(take_fd(fd), hl_dmabuf->dmabuf->file);
+
+ return 0;
+-
+-err_dma_buf_put:
+- hl_dmabuf->dmabuf->priv = NULL;
+- dma_buf_put(hl_dmabuf->dmabuf);
+- return rc;
+ }
+
+ static int validate_export_params_common(struct hl_device *hdev, u64 addr, u64 size, u64 offset)
+--
+2.50.1
+
--- /dev/null
+From b77cd82a6114ef64d2a6bf354fbd8a7e91c721fd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 15:37:14 -0400
+Subject: intel_idle: Allow loading ACPI tables for any family
+
+From: Len Brown <len.brown@intel.com>
+
+[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ]
+
+There is no reason to limit intel_idle's loading of ACPI tables to
+family 6. Upcoming Intel processors are not in family 6.
+
+Below "Fixes" really means "applies cleanly until".
+That syntax commit didn't change the previous logic,
+but shows this patch applies back 5-years.
+
+Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros")
+Signed-off-by: Len Brown <len.brown@intel.com>
+Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/idle/intel_idle.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
+index 524ed143f875..4506e1cc4b65 100644
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -1608,7 +1608,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
+ };
+
+ static const struct x86_cpu_id intel_mwait_ids[] __initconst = {
+- X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL),
++ X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL),
+ {}
+ };
+
+--
+2.50.1
+
--- /dev/null
+From c61650533f1bd7068592df158f48962bfcd8bd98 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:57:23 -0700
+Subject: KVM: nVMX: Check vmcs12->guest_ia32_debugctl on nested VM-Enter
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 095686e6fcb4150f0a55b1a25987fad3d8af58d6 ]
+
+Add a consistency check for L2's guest_ia32_debugctl, as KVM only supports
+a subset of hardware functionality, i.e. KVM can't rely on hardware to
+detect illegal/unsupported values. Failure to check the vmcs12 value
+would allow the guest to load any hardware-supported value while running L2.
+
+Take care to exempt BTF and LBR from the validity check in order to match
+KVM's behavior for writes via WRMSR, but without clobbering vmcs12. Even
+if VM_EXIT_SAVE_DEBUG_CONTROLS is set in vmcs12, L1 can reasonably expect
+that vmcs12->guest_ia32_debugctl will not be modified if writes to the MSR
+are being intercepted.
+
+Arguably, KVM _should_ update vmcs12 if VM_EXIT_SAVE_DEBUG_CONTROLS is set
+*and* writes to MSR_IA32_DEBUGCTLMSR are not being intercepted by L1, but
+that would incur non-trivial complexity and wouldn't change the fact that
+KVM's handling of DEBUGCTL is blatantly broken. I.e. the extra complexity
+is not worth carrying.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Co-developed-by: Sean Christopherson <seanjc@google.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-7-seanjc@google.com
+Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/nested.c | 12 ++++++++++--
+ arch/x86/kvm/vmx/vmx.c | 5 ++---
+ arch/x86/kvm/vmx/vmx.h | 3 +++
+ 3 files changed, 15 insertions(+), 5 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index 903e874041ac..1e0b9f92ff18 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2653,7 +2653,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+ kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
++ vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
++ vmx_get_supported_debugctl(vcpu, false));
+ } else {
+ kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
+@@ -3135,7 +3136,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
+ return -EINVAL;
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
+- CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
++ (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
++ CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
+ return -EINVAL;
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
+@@ -4576,6 +4578,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+ (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
+ (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
+
++ /*
++ * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
++ * Writes to DEBUGCTL that aren't intercepted by L1 are immediately
++ * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
++ * vmcs02 doesn't strictly track vmcs12.
++ */
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
+ vmcs12->guest_dr7 = vcpu->arch.dr7;
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index ff61093e9af7..50d45c18fce9 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2173,7 +2173,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
+ return (unsigned long)data;
+ }
+
+-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+ {
+ u64 debugctl = 0;
+
+@@ -2192,8 +2192,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+ return debugctl;
+ }
+
+-static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data,
+- bool host_initiated)
++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+ {
+ u64 invalid;
+
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index cf57fbf12104..ee330d14089d 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -435,6 +435,9 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
+
+ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+
++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
++
+ /*
+ * Note, early Intel manuals have the write-low and read-high bitmap offsets
+ * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and
+--
+2.50.1
+
--- /dev/null
+From 442fe2ed58d95e8ffd4c75c29b7f1d884bce1d02 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:57:21 -0700
+Subject: KVM: VMX: Allow guest to set DEBUGCTL.RTM_DEBUG if RTM is supported
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 17ec2f965344ee3fd6620bef7ef68792f4ac3af0 ]
+
+Let the guest set DEBUGCTL.RTM_DEBUG if RTM is supported according to the
+guest CPUID model, as debug support is supposed to be available if RTM is
+supported, and there are no known downsides to letting the guest debug RTM
+aborts.
+
+Note, there are no known bug reports related to RTM_DEBUG, the primary
+motivation is to reduce the probability of breaking existing guests when a
+future change adds a missing consistency check on vmcs12.GUEST_DEBUGCTL
+(KVM currently lets L2 run with whatever hardware supports; whoops).
+
+Note #2, KVM already emulates DR6.RTM, and doesn't restrict access to
+DR7.RTM.
+
+Fixes: 83c529151ab0 ("KVM: x86: expose Intel cpu new features (HLE, RTM) to guest")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-5-seanjc@google.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/msr-index.h | 1 +
+ arch/x86/kvm/vmx/vmx.c | 4 ++++
+ 2 files changed, 5 insertions(+)
+
+diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
+index 7ebe76f69417..2b6e3127ef4e 100644
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -417,6 +417,7 @@
+ #define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12)
+ #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
+ #define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
++#define DEBUGCTLMSR_RTM_DEBUG BIT(15)
+
+ #define MSR_PEBS_FRONTEND 0x000003f7
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index b9c7940feac6..529a10bba056 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2185,6 +2185,10 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+ (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
+ debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+
++ if (boot_cpu_has(X86_FEATURE_RTM) &&
++ (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_RTM)))
++ debugctl |= DEBUGCTLMSR_RTM_DEBUG;
++
+ return debugctl;
+ }
+
+--
+2.50.1
+
--- /dev/null
+From 9169769cd413b64e64d5b12b3b21446c9d1340a5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:57:22 -0700
+Subject: KVM: VMX: Extract checking of guest's DEBUGCTL into helper
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 8a4351ac302cd8c19729ba2636acfd0467c22ae8 ]
+
+Move VMX's logic to check DEBUGCTL values into a standalone helper so that
+the code can be used by nested VM-Enter to apply the same logic to the
+value being loaded from vmcs12.
+
+KVM needs to explicitly check vmcs12->guest_ia32_debugctl on nested
+VM-Enter, as hardware may support features that KVM does not, i.e. relying
+on hardware to detect invalid guest state will result in false negatives.
+Unfortunately, that means applying KVM's funky suppression of BTF and LBR
+to vmcs12 so as not to break existing guests.
+
+No functional change intended.
+
+Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-6-seanjc@google.com
+Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 29 +++++++++++++++++------------
+ 1 file changed, 17 insertions(+), 12 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 529a10bba056..ff61093e9af7 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2192,6 +2192,19 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+ return debugctl;
+ }
+
++static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data,
++ bool host_initiated)
++{
++ u64 invalid;
++
++ invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
++ if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
++ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
++ invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
++ }
++ return !invalid;
++}
++
+ /*
+ * Writes msr value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+@@ -2260,19 +2273,12 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ }
+ vmcs_writel(GUEST_SYSENTER_ESP, data);
+ break;
+- case MSR_IA32_DEBUGCTLMSR: {
+- u64 invalid;
+-
+- invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+- if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
+- kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
+- data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+- invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+- }
+-
+- if (invalid)
++ case MSR_IA32_DEBUGCTLMSR:
++ if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
+ return 1;
+
++ data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
++
+ if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
+ VM_EXIT_SAVE_DEBUG_CONTROLS)
+ get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+@@ -2282,7 +2288,6 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ (data & DEBUGCTLMSR_LBR))
+ intel_pmu_create_guest_lbr_event(vcpu);
+ return 0;
+- }
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated &&
+--
+2.50.1
+
--- /dev/null
+From 064fd232cf9cd7db42a4842d7bec28e315b2ac1b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:57:25 -0700
+Subject: KVM: VMX: Preserve host's DEBUGCTLMSR_FREEZE_IN_SMM while running the
+ guest
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 6b1dd26544d045f6a79e8c73572c0c0db3ef3c1a ]
+
+Set/clear DEBUGCTLMSR_FREEZE_IN_SMM in GUEST_IA32_DEBUGCTL based on the
+host's pre-VM-Enter value, i.e. preserve the host's FREEZE_IN_SMM setting
+while running the guest. When running with the "default treatment of SMIs"
+in effect (the only mode KVM supports), SMIs do not generate a VM-Exit that
+is visible to host (non-SMM) software, and instead transitions directly
+from VMX non-root to SMM. And critically, DEBUGCTL isn't context switched
+by hardware on SMI or RSM, i.e. SMM will run with whatever value was
+resident in hardware at the time of the SMI.
+
+Failure to preserve FREEZE_IN_SMM results in the PMU unexpectedly counting
+events while the CPU is executing in SMM, which can pollute profiling and
+potentially leak information into the guest.
+
+Check for changes in FREEZE_IN_SMM prior to every entry into KVM's inner
+run loop, as the bit can be toggled in IRQ context via IPI callback (SMP
+function call), by way of /sys/devices/cpu/freeze_on_smi.
+
+Add a field in kvm_x86_ops to communicate which DEBUGCTL bits need to be
+preserved, as FREEZE_IN_SMM is only supported and defined for Intel CPUs,
+i.e. explicitly checking FREEZE_IN_SMM in common x86 is at best weird, and
+at worst could lead to undesirable behavior in the future if AMD CPUs ever
+happened to pick up a collision with the bit.
+
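+A sketch of the idea (merge_debugctl_for_guest() is a hypothetical name;
+the series below implements this via vmx_guest_debugctl_write()):
+
+	/* Bits in host_owned_mask always follow the host's current value,
+	 * everything else follows the guest's value. */
+	static u64 merge_debugctl_for_guest(u64 guest_val, u64 host_val,
+					    u64 host_owned_mask)
+	{
+		return (guest_val & ~host_owned_mask) |
+		       (host_val & host_owned_mask);
+	}
+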
+Exempt TDX vCPUs, i.e. protected guests, from the check, as the TDX Module
+owns and controls GUEST_IA32_DEBUGCTL.
+
+WARN in SVM if KVM_RUN_LOAD_DEBUGCTL is set, mostly to document that the
+lack of handling isn't a KVM bug (TDX already WARNs on any run_flag).
+
+Lastly, explicitly reload GUEST_IA32_DEBUGCTL on a VM-Fail that is missed
+by KVM but detected by hardware, i.e. in nested_vmx_restore_host_state().
+Doing so avoids the need to track host_debugctl on a per-VMCS basis, as
+GUEST_IA32_DEBUGCTL is unconditionally written by prepare_vmcs02() and
+load_vmcs12_host_state(). For the VM-Fail case, even though KVM won't
+have actually entered the guest, vcpu_enter_guest() will have run with
+vmcs02 active and thus could result in vmcs01 being run with a stale value.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Co-developed-by: Sean Christopherson <seanjc@google.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-9-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve syntactic conflict in vt_x86_ops definition]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 7 +++++++
+ arch/x86/kvm/vmx/main.c | 2 ++
+ arch/x86/kvm/vmx/nested.c | 3 +++
+ arch/x86/kvm/vmx/vmx.c | 3 +++
+ arch/x86/kvm/vmx/vmx.h | 15 ++++++++++++++-
+ arch/x86/kvm/x86.c | 14 ++++++++++++--
+ 6 files changed, 41 insertions(+), 3 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 2ed05925d9d5..d27df86aa62c 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1630,6 +1630,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+ enum kvm_x86_run_flags {
+ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
+ KVM_RUN_LOAD_GUEST_DR6 = BIT(1),
++ KVM_RUN_LOAD_DEBUGCTL = BIT(2),
+ };
+
+ struct kvm_x86_ops {
+@@ -1659,6 +1660,12 @@ struct kvm_x86_ops {
+ void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+ void (*vcpu_put)(struct kvm_vcpu *vcpu);
+
++ /*
++ * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to
++ * match the host's value even while the guest is active.
++ */
++ const u64 HOST_OWNED_DEBUGCTL;
++
+ void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
+ int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+ int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
+index 7668e2fb8043..3f83e36a657b 100644
+--- a/arch/x86/kvm/vmx/main.c
++++ b/arch/x86/kvm/vmx/main.c
+@@ -42,6 +42,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
+ .vcpu_load = vmx_vcpu_load,
+ .vcpu_put = vmx_vcpu_put,
+
++ .HOST_OWNED_DEBUGCTL = DEBUGCTLMSR_FREEZE_IN_SMM,
++
+ .update_exception_bitmap = vmx_update_exception_bitmap,
+ .get_feature_msr = vmx_get_feature_msr,
+ .get_msr = vmx_get_msr,
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index 9a336f661fc6..60bd2791d933 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -4829,6 +4829,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+ WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+ }
+
++ /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
++ vmx_reload_guest_debugctl(vcpu);
++
+ /*
+ * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+ * handle a variety of side effects to KVM's software model.
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 4bb25519e7ce..6c185a260c5b 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7407,6 +7407,9 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+ set_debugreg(vcpu->arch.dr6, 6);
+
++ if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
++ vmx_reload_guest_debugctl(vcpu);
++
+ /*
+ * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
+ * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 5b2c5cb5e32e..a7e2de50d27f 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -440,12 +440,25 @@ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+
+ static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
+ {
++ WARN_ON_ONCE(val & DEBUGCTLMSR_FREEZE_IN_SMM);
++
++ val |= vcpu->arch.host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM;
+ vmcs_write64(GUEST_IA32_DEBUGCTL, val);
+ }
+
+ static inline u64 vmx_guest_debugctl_read(void)
+ {
+- return vmcs_read64(GUEST_IA32_DEBUGCTL);
++ return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~DEBUGCTLMSR_FREEZE_IN_SMM;
++}
++
++static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu)
++{
++ u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL);
++
++ if (!((val ^ vcpu->arch.host_debugctl) & DEBUGCTLMSR_FREEZE_IN_SMM))
++ return;
++
++ vmx_guest_debugctl_write(vcpu, val & ~DEBUGCTLMSR_FREEZE_IN_SMM);
+ }
+
+ /*
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 7beea8fb6ea6..dbd295ef3eba 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10711,7 +10711,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ dm_request_for_irq_injection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+ fastpath_t exit_fastpath;
+- u64 run_flags;
++ u64 run_flags, debug_ctl;
+
+ bool req_immediate_exit = false;
+
+@@ -10982,7 +10982,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ set_debugreg(DR7_FIXED_1, 7);
+ }
+
+- vcpu->arch.host_debugctl = get_debugctlmsr();
++ /*
++ * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL
++ * can be modified in IRQ context, e.g. via SMP function calls. Inform
++ * vendor code if any host-owned bits were changed, e.g. so that the
++ * value loaded into hardware while running the guest can be updated.
++ */
++ debug_ctl = get_debugctlmsr();
++ if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
++ !vcpu->arch.guest_state_protected)
++ run_flags |= KVM_RUN_LOAD_DEBUGCTL;
++ vcpu->arch.host_debugctl = debug_ctl;
+
+ guest_timing_enter_irqoff();
+
+--
+2.50.1
+
--- /dev/null
+From 23fe0561dff1a54e2d0cadace8e98dc9775bd0b3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:57:24 -0700
+Subject: KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 7d0cce6cbe71af6e9c1831bff101a2b9c249c4a2 ]
+
+Introduce vmx_guest_debugctl_{read,write}() to handle all accesses to
+vmcs.GUEST_IA32_DEBUGCTL. This will allow stuffing FREEZE_IN_SMM into
+GUEST_IA32_DEBUGCTL based on the host setting without bleeding the state
+into the guest, and without needing to copy+paste the FREEZE_IN_SMM
+logic into every patch that accesses GUEST_IA32_DEBUGCTL.
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+[sean: massage changelog, make inline, use in all prepare_vmcs02() cases]
+Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-8-seanjc@google.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/nested.c | 10 +++++-----
+ arch/x86/kvm/vmx/pmu_intel.c | 8 ++++----
+ arch/x86/kvm/vmx/vmx.c | 8 +++++---
+ arch/x86/kvm/vmx/vmx.h | 10 ++++++++++
+ 4 files changed, 24 insertions(+), 12 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index 1e0b9f92ff18..9a336f661fc6 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2653,11 +2653,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+ kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
+- vmx_get_supported_debugctl(vcpu, false));
++ vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
++ vmx_get_supported_debugctl(vcpu, false));
+ } else {
+ kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+- vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
++ vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
+ }
+ if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+@@ -3527,7 +3527,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+
+ if (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+- vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
++ vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
+ if (kvm_mpx_supported() &&
+ (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+@@ -4774,7 +4774,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
+
+ kvm_set_dr(vcpu, 7, 0x400);
+- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
++ vmx_guest_debugctl_write(vcpu, 0);
+
+ if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+ vmcs12->vm_exit_msr_load_count))
+diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
+index 9c9d4a336166..a5edc623166a 100644
+--- a/arch/x86/kvm/vmx/pmu_intel.c
++++ b/arch/x86/kvm/vmx/pmu_intel.c
+@@ -605,11 +605,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
+ */
+ static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
+ {
+- u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
++ u64 data = vmx_guest_debugctl_read();
+
+ if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
+ data &= ~DEBUGCTLMSR_LBR;
+- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
++ vmx_guest_debugctl_write(vcpu, data);
+ }
+ }
+
+@@ -679,7 +679,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+
+ if (!lbr_desc->event) {
+ vmx_disable_lbr_msrs_passthrough(vcpu);
+- if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
++ if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)
+ goto warn;
+ if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
+ goto warn;
+@@ -701,7 +701,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+
+ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
+ {
+- if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
++ if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR))
+ intel_pmu_release_guest_lbr_event(vcpu);
+ }
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 50d45c18fce9..4bb25519e7ce 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2148,7 +2148,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+- msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
++ msr_info->data = vmx_guest_debugctl_read();
+ break;
+ default:
+ find_uret_msr:
+@@ -2282,7 +2282,8 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ VM_EXIT_SAVE_DEBUG_CONTROLS)
+ get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+
+- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
++ vmx_guest_debugctl_write(vcpu, data);
++
+ if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
+ (data & DEBUGCTLMSR_LBR))
+ intel_pmu_create_guest_lbr_event(vcpu);
+@@ -4831,7 +4832,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
++
++ vmx_guest_debugctl_write(&vmx->vcpu, 0);
+
+ if (cpu_has_vmx_tpr_shadow()) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index ee330d14089d..5b2c5cb5e32e 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -438,6 +438,16 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+ u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
+ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
+
++static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
++{
++ vmcs_write64(GUEST_IA32_DEBUGCTL, val);
++}
++
++static inline u64 vmx_guest_debugctl_read(void)
++{
++ return vmcs_read64(GUEST_IA32_DEBUGCTL);
++}
++
+ /*
+ * Note, early Intel manuals have the write-low and read-high bitmap offsets
+ * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and
+--
+2.50.1
+
--- /dev/null
+From d43a98921ac0ceecd8840b7a5d4dc24377a1c4d9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:57:19 -0700
+Subject: KVM: x86: Convert vcpu_run()'s immediate exit param into a generic
+ bitmap
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 2478b1b220c49d25cb1c3f061ec4f9b351d9a131 ]
+
+Convert kvm_x86_ops.vcpu_run()'s "force_immediate_exit" boolean parameter
+into a generic bitmap so that similar "take action" information can be
+passed to vendor code without creating a pile of boolean parameters.
+
+This will allow dropping kvm_x86_ops.set_dr6() in favor of a new flag, and
+will also allow for adding similar functionality for re-loading debugctl
+in the active VMCS.
+
+Opportunistically massage the TDX WARN and comment to prepare for adding
+more run_flags, all of which are expected to be mutually exclusive with
+TDX, i.e. should be WARNed on.
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: drop TDX changes]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 6 +++++-
+ arch/x86/kvm/svm/svm.c | 4 ++--
+ arch/x86/kvm/vmx/vmx.c | 3 ++-
+ arch/x86/kvm/vmx/x86_ops.h | 2 +-
+ arch/x86/kvm/x86.c | 11 ++++++++---
+ 5 files changed, 18 insertions(+), 8 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 0caa3293f6db..cccc8cbe72db 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1627,6 +1627,10 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+ return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
+ }
+
++enum kvm_x86_run_flags {
++ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
++};
++
+ struct kvm_x86_ops {
+ const char *name;
+
+@@ -1706,7 +1710,7 @@ struct kvm_x86_ops {
+
+ int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
+ enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
+- bool force_immediate_exit);
++ u64 run_flags);
+ int (*handle_exit)(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion exit_fastpath);
+ int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 1f42a71b15c0..7d1b871cfc02 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4226,9 +4226,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+ guest_state_exit_irqoff();
+ }
+
+-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+- bool force_immediate_exit)
++static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ {
++ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 9a4ebf3dfbfc..2a977cdfcd0c 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7353,8 +7353,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ guest_state_exit_irqoff();
+ }
+
+-fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
++fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ {
++ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long cr3, cr4;
+
+diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
+index 4aba200f435d..5e4ce13ab305 100644
+--- a/arch/x86/kvm/vmx/x86_ops.h
++++ b/arch/x86/kvm/vmx/x86_ops.h
+@@ -21,7 +21,7 @@ void vmx_vm_destroy(struct kvm *kvm);
+ int vmx_vcpu_precreate(struct kvm *kvm);
+ int vmx_vcpu_create(struct kvm_vcpu *vcpu);
+ int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu);
+-fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
++fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags);
+ void vmx_vcpu_free(struct kvm_vcpu *vcpu);
+ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 213af0fda768..44ab46f2a2d2 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10711,6 +10711,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ dm_request_for_irq_injection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+ fastpath_t exit_fastpath;
++ u64 run_flags;
+
+ bool req_immediate_exit = false;
+
+@@ -10955,8 +10956,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ goto cancel_injection;
+ }
+
+- if (req_immediate_exit)
++ run_flags = 0;
++ if (req_immediate_exit) {
++ run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
++ }
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+@@ -10992,8 +10996,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+ (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
+
+- exit_fastpath = kvm_x86_call(vcpu_run)(vcpu,
+- req_immediate_exit);
++ exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, run_flags);
+ if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+ break;
+
+@@ -11005,6 +11008,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ break;
+ }
+
++ run_flags = 0;
++
+ /* Note, VM-Exits that go down the "slow" path are accounted below. */
+ ++vcpu->stat.exits;
+ }
+--
+2.50.1
+
--- /dev/null
+From 60ac0019cd78125bddc4cc6b46b022c333b534cb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:57:20 -0700
+Subject: KVM: x86: Drop kvm_x86_ops.set_dr6() in favor of a new KVM_RUN flag
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 80c64c7afea1da6a93ebe88d3d29d8a60377ef80 ]
+
+Instruct vendor code to load the guest's DR6 into hardware via a new
+KVM_RUN flag, and remove kvm_x86_ops.set_dr6(), whose sole purpose was to
+load vcpu->arch.dr6 into hardware when DR6 can be read/written directly
+by the guest.
+
+Note, TDX already WARNs on any run_flag being set, i.e. will yell if KVM
+thinks DR6 needs to be reloaded. TDX vCPUs force KVM_DEBUGREG_AUTO_SWITCH
+and never clear the flag, i.e. should never observe KVM_RUN_LOAD_GUEST_DR6.
+
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: drop TDX changes]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm-x86-ops.h | 1 -
+ arch/x86/include/asm/kvm_host.h | 2 +-
+ arch/x86/kvm/svm/svm.c | 10 ++++++----
+ arch/x86/kvm/vmx/main.c | 1 -
+ arch/x86/kvm/vmx/vmx.c | 9 +++------
+ arch/x86/kvm/x86.c | 2 +-
+ 6 files changed, 11 insertions(+), 14 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
+index cfb22f8c451a..861d080ed4c6 100644
+--- a/arch/x86/include/asm/kvm-x86-ops.h
++++ b/arch/x86/include/asm/kvm-x86-ops.h
+@@ -47,7 +47,6 @@ KVM_X86_OP(set_idt)
+ KVM_X86_OP(get_gdt)
+ KVM_X86_OP(set_gdt)
+ KVM_X86_OP(sync_dirty_debug_regs)
+-KVM_X86_OP(set_dr6)
+ KVM_X86_OP(set_dr7)
+ KVM_X86_OP(cache_reg)
+ KVM_X86_OP(get_rflags)
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index cccc8cbe72db..2ed05925d9d5 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1629,6 +1629,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+
+ enum kvm_x86_run_flags {
+ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
++ KVM_RUN_LOAD_GUEST_DR6 = BIT(1),
+ };
+
+ struct kvm_x86_ops {
+@@ -1679,7 +1680,6 @@ struct kvm_x86_ops {
+ void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
+- void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
+ void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
+ void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+ unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 7d1b871cfc02..800f781475c0 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4270,10 +4270,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ svm_hv_update_vp_id(svm->vmcb, vcpu);
+
+ /*
+- * Run with all-zero DR6 unless needed, so that we can get the exact cause
+- * of a #DB.
++ * Run with all-zero DR6 unless the guest can write DR6 freely, so that
++ * KVM can get the exact cause of a #DB. Note, loading guest DR6 from
++ * KVM's snapshot is only necessary when DR accesses won't exit.
+ */
+- if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
++ if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
++ svm_set_dr6(vcpu, vcpu->arch.dr6);
++ else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
+
+ clgi();
+@@ -5084,7 +5087,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
+ .set_idt = svm_set_idt,
+ .get_gdt = svm_get_gdt,
+ .set_gdt = svm_set_gdt,
+- .set_dr6 = svm_set_dr6,
+ .set_dr7 = svm_set_dr7,
+ .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
+ .cache_reg = svm_cache_reg,
+diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
+index 47476fcc179a..7668e2fb8043 100644
+--- a/arch/x86/kvm/vmx/main.c
++++ b/arch/x86/kvm/vmx/main.c
+@@ -60,7 +60,6 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
+ .set_idt = vmx_set_idt,
+ .get_gdt = vmx_get_gdt,
+ .set_gdt = vmx_set_gdt,
+- .set_dr6 = vmx_set_dr6,
+ .set_dr7 = vmx_set_dr7,
+ .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
+ .cache_reg = vmx_cache_reg,
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 2a977cdfcd0c..b9c7940feac6 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -5630,12 +5630,6 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+ set_debugreg(DR6_RESERVED, 6);
+ }
+
+-void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+-{
+- lockdep_assert_irqs_disabled();
+- set_debugreg(vcpu->arch.dr6, 6);
+-}
+-
+ void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+ {
+ vmcs_writel(GUEST_DR7, val);
+@@ -7400,6 +7394,9 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ vcpu->arch.regs_dirty = 0;
+
++ if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
++ set_debugreg(vcpu->arch.dr6, 6);
++
+ /*
+ * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
+ * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 44ab46f2a2d2..7beea8fb6ea6 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10977,7 +10977,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ set_debugreg(vcpu->arch.eff_db[3], 3);
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+- kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6);
++ run_flags |= KVM_RUN_LOAD_GUEST_DR6;
+ } else if (unlikely(hw_breakpoint_active())) {
+ set_debugreg(DR7_FIXED_1, 7);
+ }
+--
+2.50.1
+
--- /dev/null
+From e14e8193de61d485369bd36f87d887c94c48751d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 24 Jul 2025 17:09:56 +0800
+Subject: mm/smaps: fix race between smaps_hugetlb_range and migration
+
+From: Jinjiang Tu <tujinjiang@huawei.com>
+
+[ Upstream commit 45d19b4b6c2d422771c29b83462d84afcbb33f01 ]
+
+smaps_hugetlb_range() handles the pte without holding ptl, and may run
+concurrently with migration, leading to a BUG_ON in pfn_swap_entry_to_page().
+The race is as follows.
+
+smaps_hugetlb_range migrate_pages
+ huge_ptep_get
+ remove_migration_ptes
+ folio_unlock
+ pfn_swap_entry_folio
+ BUG_ON
+
+To fix it, hold ptl lock in smaps_hugetlb_range().
+
+Link: https://lkml.kernel.org/r/20250724090958.455887-1-tujinjiang@huawei.com
+Link: https://lkml.kernel.org/r/20250724090958.455887-2-tujinjiang@huawei.com
+Fixes: 25ee01a2fca0 ("mm: hugetlb: proc: add hugetlb-related fields to /proc/PID/smaps")
+Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Andrei Vagin <avagin@gmail.com>
+Cc: Andrii Nakryiko <andrii@kernel.org>
+Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
+Cc: Brahmajit Das <brahmajit.xyz@gmail.com>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Dev Jain <dev.jain@arm.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Joern Engel <joern@logfs.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/proc/task_mmu.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
+index 72a58681f031..2257bf52fb2a 100644
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -1007,10 +1007,13 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+ {
+ struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+- pte_t ptent = huge_ptep_get(walk->mm, addr, pte);
+ struct folio *folio = NULL;
+ bool present = false;
++ spinlock_t *ptl;
++ pte_t ptent;
+
++ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
++ ptent = huge_ptep_get(walk->mm, addr, pte);
+ if (pte_present(ptent)) {
+ folio = page_folio(pte_page(ptent));
+ present = true;
+@@ -1029,6 +1032,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+ else
+ mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+ }
++ spin_unlock(ptl);
+ return 0;
+ }
+ #else
+--
+2.50.1
+
--- /dev/null
+From 6ece36736d8033ce02a676412c51e99271b4ef6a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Aug 2025 21:18:03 +0200
+Subject: net: kcm: Fix race condition in kcm_unattach()
+
+From: Sven Stegemann <sven@stegemann.de>
+
+[ Upstream commit 52565a935213cd6a8662ddb8efe5b4219343a25d ]
+
+syzbot found a race condition when kcm_unattach(psock)
+and kcm_release(kcm) are executed at the same time.
+
+kcm_unattach() is missing a check of the flag
+kcm->tx_stopped before calling queue_work().
+
+If the kcm has a reserved psock, kcm_unattach() might get executed
+between cancel_work_sync() and unreserve_psock() in kcm_release(),
+requeuing kcm->tx_work right before kcm gets freed in kcm_done().
+
+Remove kcm->tx_stopped and replace it with the less
+error-prone disable_work_sync().
+
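+To illustrate the difference (a rough sketch, not the exact kcm code;
+disable_work_sync() is the newer workqueue API that the hunk below
+relies on):
+
+    cancel_work_sync(&kcm->tx_work);   /* waits for a running instance, but
+                                        * a racing queue_work() can still
+                                        * re-arm the work afterwards */
+    disable_work_sync(&kcm->tx_work);  /* also marks the work disabled, so
+                                        * any later queue_work() call is a
+                                        * no-op and cannot touch a freed kcm */
+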
+Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module")
+Reported-by: syzbot+e62c9db591c30e174662@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e62c9db591c30e174662
+Reported-by: syzbot+d199b52665b6c3069b94@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=d199b52665b6c3069b94
+Reported-by: syzbot+be6b1fdfeae512726b4e@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=be6b1fdfeae512726b4e
+Signed-off-by: Sven Stegemann <sven@stegemann.de>
+Link: https://patch.msgid.link/20250812191810.27777-1-sven@stegemann.de
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/kcm.h | 1 -
+ net/kcm/kcmsock.c | 10 ++--------
+ 2 files changed, 2 insertions(+), 9 deletions(-)
+
+diff --git a/include/net/kcm.h b/include/net/kcm.h
+index 441e993be634..d9c35e71ecea 100644
+--- a/include/net/kcm.h
++++ b/include/net/kcm.h
+@@ -71,7 +71,6 @@ struct kcm_sock {
+ struct list_head wait_psock_list;
+ struct sk_buff *seq_skb;
+ struct mutex tx_mutex;
+- u32 tx_stopped : 1;
+
+ /* Don't use bit fields here, these are set under different locks */
+ bool tx_wait;
+diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
+index d4118c796290..1d37b26ea2ef 100644
+--- a/net/kcm/kcmsock.c
++++ b/net/kcm/kcmsock.c
+@@ -429,7 +429,7 @@ static void psock_write_space(struct sock *sk)
+
+ /* Check if the socket is reserved so someone is waiting for sending. */
+ kcm = psock->tx_kcm;
+- if (kcm && !unlikely(kcm->tx_stopped))
++ if (kcm)
+ queue_work(kcm_wq, &kcm->tx_work);
+
+ spin_unlock_bh(&mux->lock);
+@@ -1696,12 +1696,6 @@ static int kcm_release(struct socket *sock)
+ */
+ __skb_queue_purge(&sk->sk_write_queue);
+
+- /* Set tx_stopped. This is checked when psock is bound to a kcm and we
+- * get a writespace callback. This prevents further work being queued
+- * from the callback (unbinding the psock occurs after canceling work.
+- */
+- kcm->tx_stopped = 1;
+-
+ release_sock(sk);
+
+ spin_lock_bh(&mux->lock);
+@@ -1717,7 +1711,7 @@ static int kcm_release(struct socket *sock)
+ /* Cancel work. After this point there should be no outside references
+ * to the kcm socket.
+ */
+- cancel_work_sync(&kcm->tx_work);
++ disable_work_sync(&kcm->tx_work);
+
+ lock_sock(sk);
+ psock = kcm->tx_psock;
+--
+2.50.1
+
--- /dev/null
+From 2f916039451174e3ab687b9a37e3c5231e2ed92a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Aug 2025 07:23:18 -0700
+Subject: net: ti: icss-iep: Fix incorrect type for return value in
+ extts_enable()
+
+From: Alok Tiwari <alok.a.tiwari@oracle.com>
+
+[ Upstream commit 5f1d1d14db7dabce9c815e7d7cd351f8d58b8585 ]
+
+The variable ret in icss_iep_extts_enable() was incorrectly declared
+as u32, while the function returns int and may return negative error
+codes. This will cause sign extension issues and incorrect error
+propagation. Update ret to be int to fix error handling.
+
+This change corrects the declaration to avoid potential type mismatch.
+
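+As a generic illustration of the failure mode (hw_wait_capture() is a
+hypothetical helper, not part of the driver): a negative errno stored in
+an unsigned variable can never satisfy a signed less-than-zero check, so
+the error path is silently skipped.
+
+    u32 ret;
+
+    ret = hw_wait_capture(iep);     /* hypothetical, may return -ETIMEDOUT */
+    if (ret < 0)                    /* never true: ret is unsigned, so the
+                                     * error path is skipped */
+        goto unlock;
+
+Declaring "int ret;" instead makes the comparison behave as intended.
+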
+Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver")
+Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Link: https://patch.msgid.link/20250805142323.1949406-1-alok.a.tiwari@oracle.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/ti/icssg/icss_iep.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c
+index 50bfbc2779e4..d8c9fe1d98c4 100644
+--- a/drivers/net/ethernet/ti/icssg/icss_iep.c
++++ b/drivers/net/ethernet/ti/icssg/icss_iep.c
+@@ -621,7 +621,8 @@ static int icss_iep_pps_enable(struct icss_iep *iep, int on)
+
+ static int icss_iep_extts_enable(struct icss_iep *iep, u32 index, int on)
+ {
+- u32 val, cap, ret = 0;
++ u32 val, cap;
++ int ret = 0;
+
+ mutex_lock(&iep->ptp_clk_mutex);
+
+--
+2.50.1
+
--- /dev/null
+From ff2cbb791d9045e359020bf8dcdb70db907b394d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Aug 2025 23:08:12 +0530
+Subject: net: ti: icssg-prueth: Fix emac link speed handling
+
+From: MD Danish Anwar <danishanwar@ti.com>
+
+[ Upstream commit 06feac15406f4f66f4c0c6ea60b10d44775d4133 ]
+
+When link settings are changed emac->speed is populated by
+emac_adjust_link(). The link speed and other settings are then written into
+the DRAM. However if both ports are brought down after this and brought up
+again or if the operating mode is changed and a firmware reload is needed,
+the DRAM is cleared by icssg_config(). As a result the link settings are
+lost.
+
+Fix this by calling emac_adjust_link() after icssg_config(). This
+repopulates the settings in the DRAM after a new firmware load.
+
+Fixes: 9facce84f406 ("net: ti: icssg-prueth: Fix firmware load sequence.")
+Signed-off-by: MD Danish Anwar <danishanwar@ti.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Message-ID: <20250805173812.2183161-1-danishanwar@ti.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/ti/icssg/icssg_prueth.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
+index 0769e1ade30b..ddbc4624ae88 100644
+--- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c
++++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
+@@ -50,6 +50,8 @@
+ /* CTRLMMR_ICSSG_RGMII_CTRL register bits */
+ #define ICSSG_CTRL_RGMII_ID_MODE BIT(24)
+
++static void emac_adjust_link(struct net_device *ndev);
++
+ static int emac_get_tx_ts(struct prueth_emac *emac,
+ struct emac_tx_ts_response *rsp)
+ {
+@@ -266,6 +268,10 @@ static int prueth_emac_common_start(struct prueth *prueth)
+ ret = icssg_config(prueth, emac, slice);
+ if (ret)
+ goto disable_class;
++
++ mutex_lock(&emac->ndev->phydev->lock);
++ emac_adjust_link(emac->ndev);
++ mutex_unlock(&emac->ndev->phydev->lock);
+ }
+
+ ret = prueth_emac_start(prueth);
+--
+2.50.1
+
--- /dev/null
+From e029781097349b203ded1588deab6713cbf6a350 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+ if (res < 0) {
+ nf_conntrack_get(&ct->ct_general); // HERE
+ cb->args[1] = (unsigned long)ct;
+ ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, then the refcount of ct was already incremented.
+This 2nd increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+prevents cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
+
+This can be reproduced by running conntrack_resize.sh selftest in a loop.
+It takes ~20 minutes for me on a preemptible kernel on average before
+I see a runaway kworker spinning in nf_conntrack_cleanup_net_list.
+
+One fix would be to change this to:
+ if (res < 0) {
+ if (ct != last)
+ nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
+A followup patch will do the same for ctnetlink_exp_dump_table;
+it looks to me as if it has the same problem and, like
+ctnetlink_dump_table, we only need a 'skip hint', not the actual
+object, so we can apply the same cookie strategy there as well.
+
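+Sketch of the cookie-based resume used below (mirrors the actual hunks):
+the conntrack id goes into cb->args[1] instead of a refcounted pointer,
+and an id of 0 is mapped to 1 because cb->args[1] == 0 already means
+"no resume point".
+
+    static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
+    {
+        unsigned long id = nf_ct_get_id(ct);
+
+        return id ? id : 1;    /* 0 is reserved for "start from scratch" */
+    }
+
+    cb->args[1] = ctnetlink_get_id(ct);    /* no nf_conntrack_get() needed */
+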
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 6a1239433830..18a91c031554 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -860,8 +860,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
+
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+- if (cb->args[1])
+- nf_ct_put((struct nf_conn *)cb->args[1]);
+ kfree(cb->data);
+ return 0;
+ }
+@@ -1184,19 +1182,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+ return 0;
+ }
+
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++ unsigned long id = nf_ct_get_id(ct);
++
++ return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+ unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
+ struct net *net = sock_net(skb->sk);
+- struct nf_conn *ct, *last;
++ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct nf_conn *nf_ct_evict[8];
++ struct nf_conn *ct;
+ int res, i;
+ spinlock_t *lockp;
+
+- last = (struct nf_conn *)cb->args[1];
+ i = 0;
+
+ local_bh_disable();
+@@ -1233,7 +1238,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ continue;
+
+ if (cb->args[1]) {
+- if (ct != last)
++ if (ctnetlink_get_id(ct) != last_id)
+ continue;
+ cb->args[1] = 0;
+ }
+@@ -1246,8 +1251,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct, true, flags);
+ if (res < 0) {
+- nf_conntrack_get(&ct->ct_general);
+- cb->args[1] = (unsigned long)ct;
++ cb->args[1] = ctnetlink_get_id(ct);
+ spin_unlock(lockp);
+ goto out;
+ }
+@@ -1260,12 +1264,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ }
+ out:
+ local_bh_enable();
+- if (last) {
++ if (last_id) {
+ /* nf ct hash resize happened, now clear the leftover. */
+- if ((struct nf_conn *)cb->args[1] == last)
++ if (cb->args[1] == last_id)
+ cb->args[1] = 0;
+-
+- nf_ct_put(last);
+ }
+
+ while (i) {
+--
+2.50.1
+
--- /dev/null
+From 311ad70a27210004849b7d07dc87eb8eec7af3b0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 28 Jul 2025 15:26:49 +0900
+Subject: ptp: prevent possible ABBA deadlock in ptp_clock_freerun()
+
+From: Jeongjun Park <aha310510@gmail.com>
+
+[ Upstream commit 2efe41234dbd0a83fdb7cd38226c2f70039a2cd3 ]
+
+syzbot reported the following ABBA deadlock:
+
+ CPU0 CPU1
+ ---- ----
+ n_vclocks_store()
+ lock(&ptp->n_vclocks_mux) [1]
+ (physical clock)
+ pc_clock_adjtime()
+ lock(&clk->rwsem) [2]
+ (physical clock)
+ ...
+ ptp_clock_freerun()
+ ptp_vclock_in_use()
+ lock(&ptp->n_vclocks_mux) [3]
+ (physical clock)
+ ptp_clock_unregister()
+ posix_clock_unregister()
+ lock(&clk->rwsem) [4]
+ (virtual clock)
+
+Since ptp virtual clock is registered only under ptp physical clock, both
+ptp_clock and posix_clock must be physical clocks for ptp_vclock_in_use()
+to lock &ptp->n_vclocks_mux and check ptp->n_vclocks.
+
+However, when unregistering vclocks in n_vclocks_store(), the
+ptp->n_vclocks_mux that gets locked is a physical clock lock, while the
+clk->rwsem taken by ptp_clock_unregister(), called through
+device_for_each_child_reverse(), is a virtual clock lock.
+
+Therefore, clk->rwsem used in CPU0 and clk->rwsem used in CPU1 are
+different locks, but in lockdep, a false positive occurs because the
+possibility of deadlock is determined through lock-class.
+
+To solve this, lock subclass annotation must be added to the posix_clock
+rwsem of the vclock.
+
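+A rough sketch of the annotation (simplified; names follow the hunks
+below): giving the vclock's rwsem its own lockdep subclass splits it from
+the physical clock's lock class, so the nesting above is no longer
+reported as an ABBA cycle. Subclasses only affect lockdep bookkeeping,
+not the runtime locking behaviour.
+
+    enum { PTP_LOCK_PHYSICAL = 0, PTP_LOCK_VIRTUAL };
+
+    /* right after the vclock's ptp_clock has been registered */
+    lockdep_set_subclass(&vclock->clock->clock.rwsem, PTP_LOCK_VIRTUAL);
+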
+Reported-by: syzbot+7cfb66a237c4a5fb22ad@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=7cfb66a237c4a5fb22ad
+Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion")
+Signed-off-by: Jeongjun Park <aha310510@gmail.com>
+Acked-by: Richard Cochran <richardcochran@gmail.com>
+Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Link: https://patch.msgid.link/20250728062649.469882-1-aha310510@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/ptp/ptp_private.h | 5 +++++
+ drivers/ptp/ptp_vclock.c | 7 +++++++
+ 2 files changed, 12 insertions(+)
+
+diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
+index a6aad743c282..b352df4cd3f9 100644
+--- a/drivers/ptp/ptp_private.h
++++ b/drivers/ptp/ptp_private.h
+@@ -24,6 +24,11 @@
+ #define PTP_DEFAULT_MAX_VCLOCKS 20
+ #define PTP_MAX_CHANNELS 2048
+
++enum {
++ PTP_LOCK_PHYSICAL = 0,
++ PTP_LOCK_VIRTUAL,
++};
++
+ struct timestamp_event_queue {
+ struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS];
+ int head;
+diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
+index 7febfdcbde8b..8ed4b8598924 100644
+--- a/drivers/ptp/ptp_vclock.c
++++ b/drivers/ptp/ptp_vclock.c
+@@ -154,6 +154,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
+ return PTP_VCLOCK_REFRESH_INTERVAL;
+ }
+
++static void ptp_vclock_set_subclass(struct ptp_clock *ptp)
++{
++ lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL);
++}
++
+ static const struct ptp_clock_info ptp_vclock_info = {
+ .owner = THIS_MODULE,
+ .name = "ptp virtual clock",
+@@ -213,6 +218,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock)
+ return NULL;
+ }
+
++ ptp_vclock_set_subclass(vclock->clock);
++
+ timecounter_init(&vclock->tc, &vclock->cc, 0);
+ ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL);
+
+--
+2.50.1
+
--- /dev/null
+From 7a09b3640b9f599fabc4fa354e9ea99af238d33c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:40:11 -0400
+Subject: sctp: linearize cloned gso packets in sctp_rcv
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ]
+
+A cloned head skb still shares these frag skbs in fraglist with the
+original head skb. It's not safe to access these frag skbs.
+
+syzbot reported two use-of-uninitialized-memory bugs caused by this:
+
+ BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+ sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+ sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998
+ sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88
+ sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331
+ sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122
+ __release_sock+0x1da/0x330 net/core/sock.c:3106
+ release_sock+0x6b/0x250 net/core/sock.c:3660
+ sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360
+ sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885
+ sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031
+ inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851
+ sock_sendmsg_nosec net/socket.c:718 [inline]
+
+and
+
+ BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+ sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+ sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88
+ sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331
+ sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148
+ __release_sock+0x1d3/0x330 net/core/sock.c:3213
+ release_sock+0x6b/0x270 net/core/sock.c:3767
+ sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367
+ sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886
+ sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032
+ inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851
+ sock_sendmsg_nosec net/socket.c:712 [inline]
+
+This patch fixes it by linearizing cloned gso packets in sctp_rcv().
+
+Fixes: 90017accff61 ("sctp: Add GSO support")
+Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com
+Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sctp/input.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sctp/input.c b/net/sctp/input.c
+index a8a254a5008e..032a10d82302 100644
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -117,7 +117,7 @@ int sctp_rcv(struct sk_buff *skb)
+ * it's better to just linearize it otherwise crc computing
+ * takes longer.
+ */
+- if ((!is_gso && skb_linearize(skb)) ||
++ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+ !pskb_may_pull(skb, sizeof(struct sctphdr)))
+ goto discard_it;
+
+--
+2.50.1
+
acpi-processor-perflib-move-problematic-pr-performance-check.patch
block-make-req_op_zone_finish-a-write-operation.patch
mm-memory-tier-fix-abstract-distance-calculation-overflow.patch
+kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch
+kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch
+kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch
+kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch
+kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch
+kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch
+kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch
+habanalabs-fix-uaf-in-export_dmabuf.patch
+mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
+net-ti-icssg-prueth-fix-emac-link-speed-handling.patch
+net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch
+sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
+intel_idle-allow-loading-acpi-tables-for-any-family.patch
+cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
+ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch
+tls-handle-data-disappearing-from-under-the-tls-ulp.patch
+net-kcm-fix-race-condition-in-kcm_unattach.patch
--- /dev/null
+From 6d4442b6803ab0cdf8929963a5e6113ae219f06e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 16:29:06 -0700
+Subject: tls: handle data disappearing from under the TLS ULP
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 6db015fc4b5d5f63a64a193f65d98da3a7fc811d ]
+
+TLS expects that it owns the receive queue of the TCP socket.
+This cannot be guaranteed in case the reader of the TCP socket
+entered before the TLS ULP was installed, or uses some non-standard
+read API (eg. zerocopy ones). Replace the WARN_ON() and a buggy
+early exit (which leaves anchor pointing to a freed skb) with real
+error handling. Wipe the parsing state and tell the reader to retry.
+
+We already reload the anchor every time we (re)acquire the socket lock,
+so the only condition we need to avoid is an out of bounds read
+(not having enough bytes in the socket for previously parsed record len).
+
+If some data was read from under TLS but there's enough in the queue
+we'll reload and decrypt what is most likely not a valid TLS record,
+leading to some undefined behavior from the TLS perspective (corrupting
+a stream? missing an alert? missing an attack?) but no kernel crash
+should take place.
+
+Reported-by: William Liu <will@willsroot.io>
+Reported-by: Savino Dicanosa <savy@syst3mfailure.io>
+Link: https://lore.kernel.org/tFjq_kf7sWIG3A7CrCg_egb8CVsT_gsmHAK0_wxDPJXfIzxFAMxqmLwp3MlU5EHiet0AwwJldaaFdgyHpeIUCS-3m3llsmRzp9xIOBR4lAI=@syst3mfailure.io
+Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser")
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://patch.msgid.link/20250807232907.600366-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls.h | 2 +-
+ net/tls/tls_strp.c | 11 ++++++++---
+ net/tls/tls_sw.c | 3 ++-
+ 3 files changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/net/tls/tls.h b/net/tls/tls.h
+index e5e47452308a..e1eaf12b3742 100644
+--- a/net/tls/tls.h
++++ b/net/tls/tls.h
+@@ -195,7 +195,7 @@ void tls_strp_msg_done(struct tls_strparser *strp);
+ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb);
+ void tls_rx_msg_ready(struct tls_strparser *strp);
+
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
+ int tls_strp_msg_cow(struct tls_sw_context_rx *ctx);
+ struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx);
+ int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst);
+diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
+index 095cf31bae0b..d71643b494a1 100644
+--- a/net/tls/tls_strp.c
++++ b/net/tls/tls_strp.c
+@@ -475,7 +475,7 @@ static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len)
+ strp->stm.offset = offset;
+ }
+
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ {
+ struct strp_msg *rxm;
+ struct tls_msg *tlm;
+@@ -484,8 +484,11 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len);
+
+ if (!strp->copy_mode && force_refresh) {
+- if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len))
+- return;
++ if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) {
++ WRITE_ONCE(strp->msg_ready, 0);
++ memset(&strp->stm, 0, sizeof(strp->stm));
++ return false;
++ }
+
+ tls_strp_load_anchor_with_queue(strp, strp->stm.full_len);
+ }
+@@ -495,6 +498,8 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ rxm->offset = strp->stm.offset;
+ tlm = tls_msg(strp->anchor);
+ tlm->control = strp->mark;
++
++ return true;
+ }
+
+ /* Called with lock held on lower socket */
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index 1d7caadd0cbc..6385329ef98d 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1380,7 +1380,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
+ return sock_intr_errno(timeo);
+ }
+
+- tls_strp_msg_load(&ctx->strp, released);
++ if (unlikely(!tls_strp_msg_load(&ctx->strp, released)))
++ return tls_rx_rec_wait(sk, psock, nonblock, false);
+
+ return 1;
+ }
+--
+2.50.1
+
--- /dev/null
+From f4d9b128db5250a96a548994f339a395c002e13f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, since we can't rely on offloads because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
+
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index 5de47dd5e909..12ba1a8db93a 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -61,7 +61,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+ remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+ skb->remcsum_offload = remcsum;
+
+- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+ /* Try to offload checksum if possible */
+ offload_csum = !!(need_csum &&
+ !need_ipsec &&
+--
+2.50.1
+
--- /dev/null
+From 5e058a0b161a48cd29cded0776081d5e31d66472 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Aug 2025 17:03:11 +0200
+Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data
+
+From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ]
+
+Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid
+discarding useful information") caused the number of wakeup interrupts
+to increase on an idle system [1], which was not expected to happen
+after merely allowing shallower idle states to be selected by the
+governor in some cases.
+
+However, on the system in question, all of the idle states deeper than
+WFI are rejected by the driver due to a firmware issue [2]. This causes
+the governor to only consider the recent interval duration data
+corresponding to attempts to enter WFI that are successful and the
+recent intervals table is filled with values lower than the scheduler
+tick period. Consequently, the governor predicts an idle duration
+below the scheduler tick period length and avoids stopping the tick
+more often which leads to the observed symptom.
+
+Address it by modifying the governor to update the recent intervals
+table also when entering the previously selected idle state fails, so
+it knows that the short idle intervals might have been the minority
+had the selected idle states been actually entered every time.
+
+Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information")
+Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1]
+Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2]
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Tested-by: Christian Loehle <christian.loehle@arm.com>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Christian Loehle <christian.loehle@arm.com>
+Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
+index 39aa0aea61c6..711517bd43a1 100644
+--- a/drivers/cpuidle/governors/menu.c
++++ b/drivers/cpuidle/governors/menu.c
+@@ -97,6 +97,14 @@ static inline int which_bucket(u64 duration_ns)
+
+ static DEFINE_PER_CPU(struct menu_device, menu_devices);
+
++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us)
++{
++ /* Update the repeating-pattern data. */
++ data->intervals[data->interval_ptr++] = interval_us;
++ if (data->interval_ptr >= INTERVALS)
++ data->interval_ptr = 0;
++}
++
+ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
+
+ /*
+@@ -222,6 +230,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ if (data->needs_update) {
+ menu_update(drv, dev);
+ data->needs_update = 0;
++ } else if (!dev->last_residency_ns) {
++ /*
++ * This happens when the driver rejects the previously selected
++ * idle state and returns an error, so update the recent
++ * intervals table to prevent invalid information from being
++ * used going forward.
++ */
++ menu_update_intervals(data, UINT_MAX);
+ }
+
+ /* Find the shortest expected idle interval. */
+@@ -482,10 +498,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+
+ data->correction_factor[data->bucket] = new_factor;
+
+- /* update the repeating-pattern data */
+- data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
+- if (data->interval_ptr >= INTERVALS)
+- data->interval_ptr = 0;
++ menu_update_intervals(data, ktime_to_us(measured_ns));
+ }
+
+ /**
+--
+2.50.1
+
--- /dev/null
+From eb0b60837981894893c566d7ac0d81ad2b5d8126 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 16:20:19 +0800
+Subject: erofs: fix block count report when 48-bit layout is on
+
+From: Gao Xiang <hsiangkao@linux.alibaba.com>
+
+[ Upstream commit 0b96d9bed324a1c1b7d02bfb9596351ef178428d ]
+
+Fix incorrect shift order when combining the 48-bit block count.
+
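+Worked example with made-up values (dif0.blocks low 32 bits = 0x1000,
+blocks_hi = 0x2):
+
+    blocks = (blocks << 32) | blocks_hi;        /* old: 0x0000100000000002 */
+    blocks = blocks | ((u64)blocks_hi << 32);   /* new: 0x0000000200001000 */
+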
+Fixes: 2e1473d5195f ("erofs: implement 48-bit block addressing for unencoded inodes")
+Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+Link: https://lore.kernel.org/r/20250807082019.3093539-1-hsiangkao@linux.alibaba.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/erofs/super.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/fs/erofs/super.c b/fs/erofs/super.c
+index 6e57b9cc6ed2..cfe454dbf415 100644
+--- a/fs/erofs/super.c
++++ b/fs/erofs/super.c
+@@ -313,8 +313,8 @@ static int erofs_read_superblock(struct super_block *sb)
+ sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
+ if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) {
+ sbi->root_nid = le64_to_cpu(dsb->rootnid_8b);
+- sbi->dif0.blocks = (sbi->dif0.blocks << 32) |
+- le16_to_cpu(dsb->rb.blocks_hi);
++ sbi->dif0.blocks = sbi->dif0.blocks |
++ ((u64)le16_to_cpu(dsb->rb.blocks_hi) << 32);
+ } else {
+ sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b);
+ }
+--
+2.50.1
+
--- /dev/null
+From 89ee3cca075191f343cb997a8c8f9baefda963f1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 12 Jul 2025 06:02:31 +0100
+Subject: habanalabs: fix UAF in export_dmabuf()
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+[ Upstream commit 33927f3d0ecdcff06326d6e4edb6166aed42811c ]
+
+As soon as we'd inserted a file reference into the descriptor table, another
+thread could close it. That's fine for the case when all we are doing is
+returning that descriptor to userland (it's a race, but it's a userland
+race and there's nothing the kernel can do about it). However, if we
+follow fd_install() with any kind of access to objects that would be
+destroyed on close (be it the struct file itself or anything destroyed
+by its ->release()), we have a UAF.
+
+dma_buf_fd() is a combination of reserving a descriptor and fd_install().
+habanalabs export_dmabuf() calls it and then proceeds to access the
+objects destroyed on close. In particular, it grabs an extra reference to
+another struct file that will be dropped as part of ->release() for ours;
+that "will be" is actually "might have already been".
+
+Fix that by reserving the descriptor before anything else and doing
+fd_install() only when everything has been set up. As a side benefit,
+we no longer
+have the failure exit with file already created, but reference to
+underlying file (as well as ->dmabuf_export_cnt, etc.) not grabbed yet;
+unlike dma_buf_fd(), fd_install() can't fail.
+
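+The general shape of the fix, as a simplified sketch (setup_export() is a
+placeholder for the dmabuf/reference bookkeeping; the actual patch uses
+the scoped CLASS(get_unused_fd) helper instead of the open-coded calls):
+
+    int fd = get_unused_fd_flags(flags);    /* reserve the fd first */
+
+    if (fd < 0)
+        return fd;
+
+    err = setup_export(ctx, &file);         /* build dmabuf, grab refs, ... */
+    if (err) {
+        put_unused_fd(fd);                  /* nothing visible to userspace */
+        return err;
+    }
+
+    fd_install(fd, file);                   /* publish last; cannot fail, and
+                                             * the fd may be closed by another
+                                             * thread right after this */
+    return fd;
+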
+Fixes: db1a8dd916aa ("habanalabs: add support for dma-buf exporter")
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/accel/habanalabs/common/memory.c | 23 +++++++----------------
+ 1 file changed, 7 insertions(+), 16 deletions(-)
+
+diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c
+index 601fdbe70179..61472a381904 100644
+--- a/drivers/accel/habanalabs/common/memory.c
++++ b/drivers/accel/habanalabs/common/memory.c
+@@ -1829,9 +1829,6 @@ static void hl_release_dmabuf(struct dma_buf *dmabuf)
+ struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv;
+ struct hl_ctx *ctx;
+
+- if (!hl_dmabuf)
+- return;
+-
+ ctx = hl_dmabuf->ctx;
+
+ if (hl_dmabuf->memhash_hnode)
+@@ -1859,7 +1856,12 @@ static int export_dmabuf(struct hl_ctx *ctx,
+ {
+ DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+ struct hl_device *hdev = ctx->hdev;
+- int rc, fd;
++ CLASS(get_unused_fd, fd)(flags);
++
++ if (fd < 0) {
++ dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
++ return fd;
++ }
+
+ exp_info.ops = &habanalabs_dmabuf_ops;
+ exp_info.size = total_size;
+@@ -1872,13 +1874,6 @@ static int export_dmabuf(struct hl_ctx *ctx,
+ return PTR_ERR(hl_dmabuf->dmabuf);
+ }
+
+- fd = dma_buf_fd(hl_dmabuf->dmabuf, flags);
+- if (fd < 0) {
+- dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
+- rc = fd;
+- goto err_dma_buf_put;
+- }
+-
+ hl_dmabuf->ctx = ctx;
+ hl_ctx_get(hl_dmabuf->ctx);
+ atomic_inc(&ctx->hdev->dmabuf_export_cnt);
+@@ -1890,13 +1885,9 @@ static int export_dmabuf(struct hl_ctx *ctx,
+ get_file(ctx->hpriv->file_priv->filp);
+
+ *dmabuf_fd = fd;
++ fd_install(take_fd(fd), hl_dmabuf->dmabuf->file);
+
+ return 0;
+-
+-err_dma_buf_put:
+- hl_dmabuf->dmabuf->priv = NULL;
+- dma_buf_put(hl_dmabuf->dmabuf);
+- return rc;
+ }
+
+ static int validate_export_params_common(struct hl_device *hdev, u64 addr, u64 size, u64 offset)
+--
+2.50.1
+
--- /dev/null
+From d23f33c1a9c34b07bd4781c90f234a9a1cbeaa8b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 14:37:26 -0700
+Subject: hamradio: ignore ops-locked netdevs
+
+From: Stanislav Fomichev <sdf@fomichev.me>
+
+[ Upstream commit c64237960819aee1766d03f446ae6de94b1e3f73 ]
+
+Syzkaller managed to trigger a lock dependency in xsk_notify via
+register_netdevice. As discussed in [0], using register_netdevice
+in the notifiers is problematic, so skip creating hamradio devices on
+top of ops-locked netdevs.
+
+ xsk_notifier+0x89/0x230 net/xdp/xsk.c:1664
+ notifier_call_chain+0x1b6/0x3e0 kernel/notifier.c:85
+ call_netdevice_notifiers_extack net/core/dev.c:2267 [inline]
+ call_netdevice_notifiers net/core/dev.c:2281 [inline]
+ unregister_netdevice_many_notify+0x14d7/0x1ff0 net/core/dev.c:12156
+ unregister_netdevice_many net/core/dev.c:12219 [inline]
+ unregister_netdevice_queue+0x33c/0x380 net/core/dev.c:12063
+ register_netdevice+0x1689/0x1ae0 net/core/dev.c:11241
+ bpq_new_device drivers/net/hamradio/bpqether.c:481 [inline]
+ bpq_device_event+0x491/0x600 drivers/net/hamradio/bpqether.c:523
+ notifier_call_chain+0x1b6/0x3e0 kernel/notifier.c:85
+ call_netdevice_notifiers_extack net/core/dev.c:2267 [inline]
+ call_netdevice_notifiers net/core/dev.c:2281 [inline]
+ __dev_notify_flags+0x18d/0x2e0 net/core/dev.c:-1
+ netif_change_flags+0xe8/0x1a0 net/core/dev.c:9608
+ dev_change_flags+0x130/0x260 net/core/dev_api.c:68
+ devinet_ioctl+0xbb4/0x1b50 net/ipv4/devinet.c:1200
+ inet_ioctl+0x3c0/0x4c0 net/ipv4/af_inet.c:1001
+
+0: https://lore.kernel.org/netdev/20250625140357.6203d0af@kernel.org/
+Fixes: 4c975fd70002 ("net: hold instance lock during NETDEV_REGISTER/UP")
+Suggested-by: Jakub Kicinski <kuba@kernel.org>
+Reported-by: syzbot+e6300f66a999a6612477@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e6300f66a999a6612477
+Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
+Link: https://patch.msgid.link/20250806213726.1383379-2-sdf@fomichev.me
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/hamradio/bpqether.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c
+index 0e0fe32d2da4..045c5177262e 100644
+--- a/drivers/net/hamradio/bpqether.c
++++ b/drivers/net/hamradio/bpqether.c
+@@ -138,7 +138,7 @@ static inline struct net_device *bpq_get_ax25_dev(struct net_device *dev)
+
+ static inline int dev_is_ethdev(struct net_device *dev)
+ {
+- return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5);
++ return dev->type == ARPHRD_ETHER && !netdev_need_ops_lock(dev);
+ }
+
+ /* ------------------------------------------------------------------------ */
+--
+2.50.1
+
--- /dev/null
+From ceb238ac9f661be4f5172183ee62b9858f74ad67 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 15:37:14 -0400
+Subject: intel_idle: Allow loading ACPI tables for any family
+
+From: Len Brown <len.brown@intel.com>
+
+[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ]
+
+There is no reason to limit intel_idle's loading of ACPI tables to
+family 6. Upcoming Intel processors are not in family 6.
+
+Below "Fixes" really means "applies cleanly until".
+That syntax commit didn't change the previous logic,
+but shows this patch applies back 5-years.
+
+Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros")
+Signed-off-by: Len Brown <len.brown@intel.com>
+Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/idle/intel_idle.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
+index 976f5be54e36..039dc42dd509 100644
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -1665,7 +1665,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
+ };
+
+ static const struct x86_cpu_id intel_mwait_ids[] __initconst = {
+- X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL),
++ X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL),
+ {}
+ };
+
+--
+2.50.1
+
--- /dev/null
+From f981586f1a7a73248f159d3d77b4516449f568ed Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 29 Jul 2025 14:26:11 +0200
+Subject: ipvs: Fix estimator kthreads preferred affinity
+
+From: Frederic Weisbecker <frederic@kernel.org>
+
+[ Upstream commit c0a23bbc98e93704a1f4fb5e7e7bb2d7c0fb6eb3 ]
+
+The estimator kthreads' affinity is defined by sysctl-overridden
+preferences and applied through a plain call to the scheduler's affinity
+API.
+
+However, since the introduction of managed kthread preferred affinity,
+such a practice bypasses the kthread core code, which eventually
+overwrites the target with the default unbound affinity.
+
+Fix this by using the appropriate kthread API.
+
+Fixes: d1a89197589c ("kthread: Default affine kthread to its preferred NUMA node")
+Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
+Acked-by: Julian Anastasov <ja@ssi.bg>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/ip_vs.h | 13 +++++++++++++
+ kernel/kthread.c | 1 +
+ net/netfilter/ipvs/ip_vs_est.c | 3 ++-
+ 3 files changed, 16 insertions(+), 1 deletion(-)
+
+diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
+index ff406ef4fd4a..29a36709e7f3 100644
+--- a/include/net/ip_vs.h
++++ b/include/net/ip_vs.h
+@@ -1163,6 +1163,14 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
+ return housekeeping_cpumask(HK_TYPE_KTHREAD);
+ }
+
++static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs)
++{
++ if (ipvs->est_cpulist_valid)
++ return ipvs->sysctl_est_cpulist;
++ else
++ return NULL;
++}
++
+ static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
+ {
+ return ipvs->sysctl_est_nice;
+@@ -1270,6 +1278,11 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
+ return housekeeping_cpumask(HK_TYPE_KTHREAD);
+ }
+
++static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs)
++{
++ return NULL;
++}
++
+ static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
+ {
+ return IPVS_EST_NICE;
+diff --git a/kernel/kthread.c b/kernel/kthread.c
+index 77c44924cf54..800c8fc46b08 100644
+--- a/kernel/kthread.c
++++ b/kernel/kthread.c
+@@ -894,6 +894,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
+
+ return ret;
+ }
++EXPORT_SYMBOL_GPL(kthread_affine_preferred);
+
+ /*
+ * Re-affine kthreads according to their preferences
+diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
+index f821ad2e19b3..15049b826732 100644
+--- a/net/netfilter/ipvs/ip_vs_est.c
++++ b/net/netfilter/ipvs/ip_vs_est.c
+@@ -265,7 +265,8 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
+ }
+
+ set_user_nice(kd->task, sysctl_est_nice(ipvs));
+- set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));
++ if (sysctl_est_preferred_cpulist(ipvs))
++ kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs));
+
+ pr_info("starting estimator thread %d...\n", kd->id);
+ wake_up_process(kd->task);
+--
+2.50.1
+
--- /dev/null
+From 9e464b22810b43dd7989a0886ce28831fb986189 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 24 Jul 2025 17:09:56 +0800
+Subject: mm/smaps: fix race between smaps_hugetlb_range and migration
+
+From: Jinjiang Tu <tujinjiang@huawei.com>
+
+[ Upstream commit 45d19b4b6c2d422771c29b83462d84afcbb33f01 ]
+
+smaps_hugetlb_range() handles the pte without holding the ptl, and may
+run concurrently with migration, leading to a BUG_ON in
+pfn_swap_entry_to_page(). The race is as follows.
+
+smaps_hugetlb_range              migrate_pages
+  huge_ptep_get
+                                   remove_migration_ptes
+                                   folio_unlock
+  pfn_swap_entry_folio
+  BUG_ON
+
+To fix it, hold the ptl lock in smaps_hugetlb_range().
+
+Link: https://lkml.kernel.org/r/20250724090958.455887-1-tujinjiang@huawei.com
+Link: https://lkml.kernel.org/r/20250724090958.455887-2-tujinjiang@huawei.com
+Fixes: 25ee01a2fca0 ("mm: hugetlb: proc: add hugetlb-related fields to /proc/PID/smaps")
+Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Andrei Vagin <avagin@gmail.com>
+Cc: Andrii Nakryiko <andrii@kernel.org>
+Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
+Cc: Brahmajit Das <brahmajit.xyz@gmail.com>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Dev Jain <dev.jain@arm.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Joern Engel <joern@logfs.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/proc/task_mmu.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
+index e57e323817e7..3b8eaa7722c8 100644
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -1020,10 +1020,13 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+ {
+ struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+- pte_t ptent = huge_ptep_get(walk->mm, addr, pte);
+ struct folio *folio = NULL;
+ bool present = false;
++ spinlock_t *ptl;
++ pte_t ptent;
+
++ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
++ ptent = huge_ptep_get(walk->mm, addr, pte);
+ if (pte_present(ptent)) {
+ folio = page_folio(pte_page(ptent));
+ present = true;
+@@ -1042,6 +1045,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+ else
+ mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+ }
++ spin_unlock(ptl);
+ return 0;
+ }
+ #else
+--
+2.50.1
+
--- /dev/null
+From ed1da7003cf22b8dda5eafc827e4981942bcc092 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 18:27:56 +0800
+Subject: net: hibmcge: fix rtnl deadlock issue
+
+From: Jijie Shao <shaojijie@huawei.com>
+
+[ Upstream commit c875503a9b9082928d7d3fc60b5400d16fbfae4e ]
+
+Currently, the hibmcge netdev acquires the rtnl_lock in
+pci_error_handlers.reset_prepare() and releases it in
+pci_error_handlers.reset_done().
+
+However, in the PCI framework:
+pci_reset_bus - __pci_reset_slot - pci_slot_save_and_disable_locked -
+ pci_dev_save_and_disable - err_handler->reset_prepare(dev);
+
+In pci_slot_save_and_disable_locked():
+  list_for_each_entry(dev, &slot->bus->devices, bus_list) {
+          if (!dev->slot || dev->slot != slot)
+                  continue;
+          pci_dev_save_and_disable(dev);
+          if (dev->subordinate)
+                  pci_bus_save_and_disable_locked(dev->subordinate);
+  }
+
+This will iterate through all devices under the current bus and execute
+err_handler->reset_prepare(), causing two devices of the hibmcge driver
+to sequentially request the rtnl_lock, leading to a deadlock.
+
+Since the driver now executes netif_device_detach()
+before the reset process, it will not run concurrently with
+other netdev APIs, so there is no need to hold the rtnl_lock now.
+
+Therefore, this patch removes the rtnl_lock during the reset process and
+adjusts the position of HBG_NIC_STATE_RESETTING to ensure
+that multiple resets are not executed concurrently.
+
+Fixes: 3f5a61f6d504f ("net: hibmcge: Add reset supported in this module")
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c | 14 +++++---------
+ 1 file changed, 5 insertions(+), 9 deletions(-)
+
+diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
+index ff3295b60a69..dee1e8681157 100644
+--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
+@@ -53,9 +53,11 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type)
+ {
+ int ret;
+
+- ASSERT_RTNL();
++ if (test_and_set_bit(HBG_NIC_STATE_RESETTING, &priv->state))
++ return -EBUSY;
+
+ if (netif_running(priv->netdev)) {
++ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+ dev_warn(&priv->pdev->dev,
+ "failed to reset because port is up\n");
+ return -EBUSY;
+@@ -64,7 +66,6 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type)
+ netif_device_detach(priv->netdev);
+
+ priv->reset_type = type;
+- set_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+ clear_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state);
+ ret = hbg_hw_event_notify(priv, HBG_HW_EVENT_RESET);
+ if (ret) {
+@@ -83,28 +84,25 @@ static int hbg_reset_done(struct hbg_priv *priv, enum hbg_reset_type type)
+ type != priv->reset_type)
+ return 0;
+
+- ASSERT_RTNL();
+-
+- clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+ ret = hbg_rebuild(priv);
+ if (ret) {
+ set_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state);
++ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+ dev_err(&priv->pdev->dev, "failed to rebuild after reset\n");
+ return ret;
+ }
+
+ netif_device_attach(priv->netdev);
++ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+
+ dev_info(&priv->pdev->dev, "reset done\n");
+ return ret;
+ }
+
+-/* must be protected by rtnl lock */
+ int hbg_reset(struct hbg_priv *priv)
+ {
+ int ret;
+
+- ASSERT_RTNL();
+ ret = hbg_reset_prepare(priv, HBG_RESET_TYPE_FUNCTION);
+ if (ret)
+ return ret;
+@@ -169,7 +167,6 @@ static void hbg_pci_err_reset_prepare(struct pci_dev *pdev)
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct hbg_priv *priv = netdev_priv(netdev);
+
+- rtnl_lock();
+ hbg_reset_prepare(priv, HBG_RESET_TYPE_FLR);
+ }
+
+@@ -179,7 +176,6 @@ static void hbg_pci_err_reset_done(struct pci_dev *pdev)
+ struct hbg_priv *priv = netdev_priv(netdev);
+
+ hbg_reset_done(priv, HBG_RESET_TYPE_FLR);
+- rtnl_unlock();
+ }
+
+ static const struct pci_error_handlers hbg_pci_err_handler = {
+--
+2.50.1
+
--- /dev/null
+From 2870ddbdb6f9caf8329f3c51d2f9946a176eda6c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 18:27:57 +0800
+Subject: net: hibmcge: fix the division by zero issue
+
+From: Jijie Shao <shaojijie@huawei.com>
+
+[ Upstream commit 7004b26f0b64331143eb0b312e77a357a11427ce ]
+
+When the network port is down, the queue is released, and ring->len is 0.
+In debugfs, hbg_get_queue_used_num() will be called,
+which may lead to a division by zero issue.
+
+This patch adds a check: if ring->len is 0,
+hbg_get_queue_used_num() returns 0 directly.
+
+Fixes: 40735e7543f9 ("net: hibmcge: Implement .ndo_start_xmit function")
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h
+index 2883a5899ae2..8b6110599e10 100644
+--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h
++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h
+@@ -29,7 +29,12 @@ static inline bool hbg_fifo_is_full(struct hbg_priv *priv, enum hbg_dir dir)
+
+ static inline u32 hbg_get_queue_used_num(struct hbg_ring *ring)
+ {
+- return (ring->ntu + ring->len - ring->ntc) % ring->len;
++ u32 len = READ_ONCE(ring->len);
++
++ if (!len)
++ return 0;
++
++ return (READ_ONCE(ring->ntu) + len - READ_ONCE(ring->ntc)) % len;
+ }
+
+ netdev_tx_t hbg_net_start_xmit(struct sk_buff *skb, struct net_device *netdev);
+--
+2.50.1
+
--- /dev/null
+From 1e8262925f1a14e833eb34376195aa9aa71ca95e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 18:27:58 +0800
+Subject: net: hibmcge: fix the np_link_fail error reporting issue
+
+From: Jijie Shao <shaojijie@huawei.com>
+
+[ Upstream commit 62c50180ffda01468e640ac14925503796f255e2 ]
+
+Currently, after modifying the device port mode, the np_link_ok state
+is checked immediately. At this point, the device may not be ready yet,
+so an intermediate state may be queried.
+
+This patch polls to check whether np_link is ok after modifying the
+device port mode, and only reports np_link_fail upon timeout.
+
+Fixes: e0306637e85d ("net: hibmcge: Add support for mac link exception handling feature")
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c | 15 +++++++++++++--
+ 1 file changed, 13 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c
+index 9b65eef62b3f..2844124f306d 100644
+--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c
++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c
+@@ -12,6 +12,8 @@
+
+ #define HBG_HW_EVENT_WAIT_TIMEOUT_US (2 * 1000 * 1000)
+ #define HBG_HW_EVENT_WAIT_INTERVAL_US (10 * 1000)
++#define HBG_MAC_LINK_WAIT_TIMEOUT_US (500 * 1000)
++#define HBG_MAC_LINK_WAIT_INTERVAL_US (5 * 1000)
+ /* little endian or big endian.
+ * ctrl means packet description, data means skb packet data
+ */
+@@ -213,6 +215,9 @@ void hbg_hw_fill_buffer(struct hbg_priv *priv, u32 buffer_dma_addr)
+
+ void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex)
+ {
++ u32 link_status;
++ int ret;
++
+ hbg_hw_mac_enable(priv, HBG_STATUS_DISABLE);
+
+ hbg_reg_write_field(priv, HBG_REG_PORT_MODE_ADDR,
+@@ -224,8 +229,14 @@ void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex)
+
+ hbg_hw_mac_enable(priv, HBG_STATUS_ENABLE);
+
+- if (!hbg_reg_read_field(priv, HBG_REG_AN_NEG_STATE_ADDR,
+- HBG_REG_AN_NEG_STATE_NP_LINK_OK_B))
++ /* wait MAC link up */
++ ret = readl_poll_timeout(priv->io_base + HBG_REG_AN_NEG_STATE_ADDR,
++ link_status,
++ FIELD_GET(HBG_REG_AN_NEG_STATE_NP_LINK_OK_B,
++ link_status),
++ HBG_MAC_LINK_WAIT_INTERVAL_US,
++ HBG_MAC_LINK_WAIT_TIMEOUT_US);
++ if (ret)
+ hbg_np_link_fail_task_schedule(priv);
+ }
+
+--
+2.50.1
+
--- /dev/null
+From 8bee886e735d51eab3303b93156daeda9571fef3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Aug 2025 21:18:03 +0200
+Subject: net: kcm: Fix race condition in kcm_unattach()
+
+From: Sven Stegemann <sven@stegemann.de>
+
+[ Upstream commit 52565a935213cd6a8662ddb8efe5b4219343a25d ]
+
+syzbot found a race condition when kcm_unattach(psock)
+and kcm_release(kcm) are executed at the same time.
+
+kcm_unattach() is missing a check of the flag
+kcm->tx_stopped before calling queue_work().
+
+If the kcm has a reserved psock, kcm_unattach() might get executed
+between cancel_work_sync() and unreserve_psock() in kcm_release(),
+requeuing kcm->tx_work right before kcm gets freed in kcm_done().
+
+Remove kcm->tx_stopped and replace it with the less
+error-prone disable_work_sync().
+
+Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module")
+Reported-by: syzbot+e62c9db591c30e174662@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e62c9db591c30e174662
+Reported-by: syzbot+d199b52665b6c3069b94@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=d199b52665b6c3069b94
+Reported-by: syzbot+be6b1fdfeae512726b4e@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=be6b1fdfeae512726b4e
+Signed-off-by: Sven Stegemann <sven@stegemann.de>
+Link: https://patch.msgid.link/20250812191810.27777-1-sven@stegemann.de
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/kcm.h | 1 -
+ net/kcm/kcmsock.c | 10 ++--------
+ 2 files changed, 2 insertions(+), 9 deletions(-)
+
+diff --git a/include/net/kcm.h b/include/net/kcm.h
+index 441e993be634..d9c35e71ecea 100644
+--- a/include/net/kcm.h
++++ b/include/net/kcm.h
+@@ -71,7 +71,6 @@ struct kcm_sock {
+ struct list_head wait_psock_list;
+ struct sk_buff *seq_skb;
+ struct mutex tx_mutex;
+- u32 tx_stopped : 1;
+
+ /* Don't use bit fields here, these are set under different locks */
+ bool tx_wait;
+diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
+index 24aec295a51c..8c0577cd764f 100644
+--- a/net/kcm/kcmsock.c
++++ b/net/kcm/kcmsock.c
+@@ -429,7 +429,7 @@ static void psock_write_space(struct sock *sk)
+
+ /* Check if the socket is reserved so someone is waiting for sending. */
+ kcm = psock->tx_kcm;
+- if (kcm && !unlikely(kcm->tx_stopped))
++ if (kcm)
+ queue_work(kcm_wq, &kcm->tx_work);
+
+ spin_unlock_bh(&mux->lock);
+@@ -1688,12 +1688,6 @@ static int kcm_release(struct socket *sock)
+ */
+ __skb_queue_purge(&sk->sk_write_queue);
+
+- /* Set tx_stopped. This is checked when psock is bound to a kcm and we
+- * get a writespace callback. This prevents further work being queued
+- * from the callback (unbinding the psock occurs after canceling work.
+- */
+- kcm->tx_stopped = 1;
+-
+ release_sock(sk);
+
+ spin_lock_bh(&mux->lock);
+@@ -1709,7 +1703,7 @@ static int kcm_release(struct socket *sock)
+ /* Cancel work. After this point there should be no outside references
+ * to the kcm socket.
+ */
+- cancel_work_sync(&kcm->tx_work);
++ disable_work_sync(&kcm->tx_work);
+
+ lock_sock(sk);
+ psock = kcm->tx_psock;
+--
+2.50.1
+
--- /dev/null
+From cdf8a27cb2b49bb5d7d3fd82a048319d4cf78cba Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 14:37:25 -0700
+Subject: net: lapbether: ignore ops-locked netdevs
+
+From: Stanislav Fomichev <sdf@fomichev.me>
+
+[ Upstream commit 53898ebabe843bfa7baea9dae152797d5d0563c9 ]
+
+Syzkaller managed to trigger a lock dependency in xsk_notify via
+register_netdevice. As discussed in [0], using register_netdevice
+in the notifiers is problematic, so skip creating lapbeth devices on
+top of ops-locked netdevs.
+
+ xsk_notifier+0xa4/0x280 net/xdp/xsk.c:1645
+ notifier_call_chain+0xbc/0x410 kernel/notifier.c:85
+ call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:2230
+ call_netdevice_notifiers_extack net/core/dev.c:2268 [inline]
+ call_netdevice_notifiers net/core/dev.c:2282 [inline]
+ unregister_netdevice_many_notify+0xf9d/0x2700 net/core/dev.c:12077
+ unregister_netdevice_many net/core/dev.c:12140 [inline]
+ unregister_netdevice_queue+0x305/0x3f0 net/core/dev.c:11984
+ register_netdevice+0x18f1/0x2270 net/core/dev.c:11149
+ lapbeth_new_device drivers/net/wan/lapbether.c:420 [inline]
+ lapbeth_device_event+0x5b1/0xbe0 drivers/net/wan/lapbether.c:462
+ notifier_call_chain+0xbc/0x410 kernel/notifier.c:85
+ call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:2230
+ call_netdevice_notifiers_extack net/core/dev.c:2268 [inline]
+ call_netdevice_notifiers net/core/dev.c:2282 [inline]
+ __dev_notify_flags+0x12c/0x2e0 net/core/dev.c:9497
+ netif_change_flags+0x108/0x160 net/core/dev.c:9526
+ dev_change_flags+0xba/0x250 net/core/dev_api.c:68
+ devinet_ioctl+0x11d5/0x1f50 net/ipv4/devinet.c:1200
+ inet_ioctl+0x3a7/0x3f0 net/ipv4/af_inet.c:1001
+
+0: https://lore.kernel.org/netdev/20250625140357.6203d0af@kernel.org/
+Fixes: 4c975fd70002 ("net: hold instance lock during NETDEV_REGISTER/UP")
+Suggested-by: Jakub Kicinski <kuba@kernel.org>
+Reported-by: syzbot+e67ea9c235b13b4f0020@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e67ea9c235b13b4f0020
+Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
+Link: https://patch.msgid.link/20250806213726.1383379-1-sdf@fomichev.me
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/wan/lapbether.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c
+index 995a7207bdf8..f357a7ac70ac 100644
+--- a/drivers/net/wan/lapbether.c
++++ b/drivers/net/wan/lapbether.c
+@@ -81,7 +81,7 @@ static struct lapbethdev *lapbeth_get_x25_dev(struct net_device *dev)
+
+ static __inline__ int dev_is_ethdev(struct net_device *dev)
+ {
+- return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5);
++ return dev->type == ARPHRD_ETHER && !netdev_need_ops_lock(dev);
+ }
+
+ /* ------------------------------------------------------------------------ */
+--
+2.50.1
+
--- /dev/null
+From 50c290becd59110bb55ee279fd703e866f2814a0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 17:36:54 -0700
+Subject: net: page_pool: allow enabling recycling late, fix false positive
+ warning
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 64fdaa94bfe0cca3a0f4b2dd922486c5f59fe678 ]
+
+Page pool can have pages "directly" (locklessly) recycled to it,
+if the NAPI that owns the page pool is scheduled to run on the same CPU.
+To make this safe we check that the NAPI is disabled while we destroy
+the page pool. In most cases NAPI and page pool lifetimes are tied
+together so this happens naturally.
+
+The queue API expects the following order of calls:
+ -> mem_alloc
+ alloc new pp
+ -> stop
+ napi_disable
+ -> start
+ napi_enable
+ -> mem_free
+ free old pp
+
+Here we allocate the page pool in ->mem_alloc and free in ->mem_free.
+But the NAPIs are only stopped between ->stop and ->start. We created
+page_pool_disable_direct_recycling() to safely shut down the recycling
+in ->stop. This way the page_pool_destroy() call in ->mem_free doesn't
+have to worry about recycling any more.
+
+Unfortunately, the page_pool_disable_direct_recycling() is not enough
+to deal with failures which necessitate freeing the _new_ page pool.
+If we hit a failure in ->mem_alloc or ->stop the new page pool has
+to be freed while the NAPI is active (assuming driver attaches the
+page pool to an existing NAPI instance and doesn't reallocate NAPIs).
+
+Freeing the new page pool is technically safe because it hasn't been
+used for any packets, yet, so there can be no recycling. But the check
+in napi_assert_will_not_race() has no way of knowing that. We could
+check if page pool is empty but that'd make the check much less likely
+to trigger during development.
+
+Add page_pool_enable_direct_recycling(), pairing with
+page_pool_disable_direct_recycling(). It will allow us to create the new
+page pools in "disabled" state and only enable recycling when we know
+the reconfig operation will not fail.
+
+Coincidentally it will also let us re-enable the recycling for the old
+pool, if the reconfig failed:
+
+ -> mem_alloc (new)
+ -> stop (old)
+ # disables direct recycling for old
+ -> start (new)
+ # fail!!
+ -> start (old)
+ # go back to old pp but direct recycling is lost :(
+ -> mem_free (new)
+
+The new helper is idempotent, to make life easier for drivers which
+can operate in HDS mode and support zero-copy Rx.
+The driver can call the helper twice whether there are two pools
+or it has multiple references to a single pool.
+
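+A rough usage sketch for a driver following the queue API (hypothetical
+callback and field names; assumes pp.napi is left unset when the new
+pool is created):
+
+    /* ->queue_mem_alloc(): create the new pool in "disabled" state */
+    new_pool = page_pool_create(&pp);           /* pp.napi == NULL */
+
+    /* ->queue_start(): nothing can fail past this point, so hand the
+     * pool over to the NAPI instance and only then enable NAPI.
+     */
+    page_pool_enable_direct_recycling(new_pool, &rxq->napi);
+    napi_enable(&rxq->napi);
+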
+Fixes: 40eca00ae605 ("bnxt_en: unlink page pool when stopping Rx queue")
+Tested-by: David Wei <dw@davidwei.uk>
+Link: https://patch.msgid.link/20250805003654.2944974-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 ++++++-
+ include/net/page_pool/types.h | 2 ++
+ net/core/page_pool.c | 29 +++++++++++++++++++++++
+ 3 files changed, 39 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+index d66519ce57af..8021d97f3f22 100644
+--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+@@ -3779,7 +3779,6 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
+ if (BNXT_RX_PAGE_MODE(bp))
+ pp.pool_size += bp->rx_ring_size;
+ pp.nid = numa_node;
+- pp.napi = &rxr->bnapi->napi;
+ pp.netdev = bp->dev;
+ pp.dev = &bp->pdev->dev;
+ pp.dma_dir = bp->rx_dir;
+@@ -3807,6 +3806,12 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
+ return PTR_ERR(pool);
+ }
+
++static void bnxt_enable_rx_page_pool(struct bnxt_rx_ring_info *rxr)
++{
++ page_pool_enable_direct_recycling(rxr->head_pool, &rxr->bnapi->napi);
++ page_pool_enable_direct_recycling(rxr->page_pool, &rxr->bnapi->napi);
++}
++
+ static int bnxt_alloc_rx_agg_bmap(struct bnxt *bp, struct bnxt_rx_ring_info *rxr)
+ {
+ u16 mem_size;
+@@ -3845,6 +3850,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
+ rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node);
+ if (rc)
+ return rc;
++ bnxt_enable_rx_page_pool(rxr);
+
+ rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i, 0);
+ if (rc < 0)
+@@ -15998,6 +16004,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx)
+ goto err_reset;
+ }
+
++ bnxt_enable_rx_page_pool(rxr);
+ napi_enable_locked(&bnapi->napi);
+ bnxt_db_nq_arm(bp, &cpr->cp_db, cpr->cp_raw_cons);
+
+diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
+index 431b593de709..1509a536cb85 100644
+--- a/include/net/page_pool/types.h
++++ b/include/net/page_pool/types.h
+@@ -265,6 +265,8 @@ struct page_pool *page_pool_create_percpu(const struct page_pool_params *params,
+ struct xdp_mem_info;
+
+ #ifdef CONFIG_PAGE_POOL
++void page_pool_enable_direct_recycling(struct page_pool *pool,
++ struct napi_struct *napi);
+ void page_pool_disable_direct_recycling(struct page_pool *pool);
+ void page_pool_destroy(struct page_pool *pool);
+ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
+diff --git a/net/core/page_pool.c b/net/core/page_pool.c
+index 3eabe78c93f4..ef870c21e854 100644
+--- a/net/core/page_pool.c
++++ b/net/core/page_pool.c
+@@ -1201,6 +1201,35 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
+ pool->xdp_mem_id = mem->id;
+ }
+
++/**
++ * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI
++ * @pool: page pool to modify
++ * @napi: NAPI instance to associate the page pool with
++ *
++ * Associate a page pool with a NAPI instance for lockless page recycling.
++ * This is useful when a new page pool has to be added to a NAPI instance
++ * without disabling that NAPI instance, to mark the point at which control
++ * path "hands over" the page pool to the NAPI instance. In most cases driver
++ * can simply set the @napi field in struct page_pool_params, and does not
++ * have to call this helper.
++ *
++ * The function is idempotent, but does not implement any refcounting.
++ * Single page_pool_disable_direct_recycling() will disable recycling,
++ * no matter how many times enable was called.
++ */
++void page_pool_enable_direct_recycling(struct page_pool *pool,
++ struct napi_struct *napi)
++{
++ if (READ_ONCE(pool->p.napi) == napi)
++ return;
++ WARN_ON(!napi || pool->p.napi);
++
++ mutex_lock(&page_pools_lock);
++ WRITE_ONCE(pool->p.napi, napi);
++ mutex_unlock(&page_pools_lock);
++}
++EXPORT_SYMBOL(page_pool_enable_direct_recycling);
++
+ void page_pool_disable_direct_recycling(struct page_pool *pool)
+ {
+ /* Disable direct recycling based on pool->cpuid.
+--
+2.50.1
+
--- /dev/null
+From fb06884923f5b6158bb457592100bd9b1bb0ecbd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 09:36:55 +0000
+Subject: net: stmmac: thead: Get and enable APB clock on initialization
+
+From: Yao Zi <ziyao@disroot.org>
+
+[ Upstream commit 4cc339ce482ba78589a2d5cbe1c84b735d263383 ]
+
+It's necessary to adjust the MAC TX clock when the link speed changes,
+but such an adjustment always fails on the TH1520 SoC, and reading back
+from the APB glue registers that control clock generation returns
+garbage, breaking the link.
+
+Testing showed that a clock must be ungated to access the APB glue
+registers. Without any consumer, that clock is automatically disabled
+during late kernel startup. Let's get and enable it if it's described
+in the devicetree.
+
+For backward compatibility with older devicetrees, probing won't fail if
+the APB clock isn't found. In this case, we emit a warning since the
+link will break if the speed changes.
+
+Fixes: 33a1a01e3afa ("net: stmmac: Add glue layer for T-HEAD TH1520 SoC")
+Signed-off-by: Yao Zi <ziyao@disroot.org>
+Tested-by: Drew Fustini <fustini@kernel.org>
+Reviewed-by: Drew Fustini <fustini@kernel.org>
+Link: https://patch.msgid.link/20250808093655.48074-4-ziyao@disroot.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c | 14 ++++++++++++++
+ 1 file changed, 14 insertions(+)
+
+diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
+index c72ee759aae5..f2946bea0bc2 100644
+--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
++++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
+@@ -211,6 +211,7 @@ static int thead_dwmac_probe(struct platform_device *pdev)
+ struct stmmac_resources stmmac_res;
+ struct plat_stmmacenet_data *plat;
+ struct thead_dwmac *dwmac;
++ struct clk *apb_clk;
+ void __iomem *apb;
+ int ret;
+
+@@ -224,6 +225,19 @@ static int thead_dwmac_probe(struct platform_device *pdev)
+ return dev_err_probe(&pdev->dev, PTR_ERR(plat),
+ "dt configuration failed\n");
+
++ /*
++ * The APB clock is essential for accessing glue registers. However,
++ * old devicetrees don't describe it correctly. We continue to probe
++ * and emit a warning if it isn't present.
++ */
++ apb_clk = devm_clk_get_enabled(&pdev->dev, "apb");
++ if (PTR_ERR(apb_clk) == -ENOENT)
++ dev_warn(&pdev->dev,
++ "cannot get apb clock, link may break after speed changes\n");
++ else if (IS_ERR(apb_clk))
++ return dev_err_probe(&pdev->dev, PTR_ERR(apb_clk),
++ "failed to get apb clock\n");
++
+ dwmac = devm_kzalloc(&pdev->dev, sizeof(*dwmac), GFP_KERNEL);
+ if (!dwmac)
+ return -ENOMEM;
+--
+2.50.1
+
--- /dev/null
+From 35bc060346d46b09199ed15886a9e6f60c6691ab Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Aug 2025 07:23:18 -0700
+Subject: net: ti: icss-iep: Fix incorrect type for return value in
+ extts_enable()
+
+From: Alok Tiwari <alok.a.tiwari@oracle.com>
+
+[ Upstream commit 5f1d1d14db7dabce9c815e7d7cd351f8d58b8585 ]
+
+The variable ret in icss_iep_extts_enable() was incorrectly declared
+as u32, while the function returns int and may return negative error
+codes. This causes sign-extension issues and incorrect error
+propagation. Update ret to be int to fix the error handling.
+
+This change corrects the declaration to avoid a potential type mismatch.
+
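+For illustration only (not the driver code), this is the kind of check
+an unsigned ret silently breaks:
+
+    u32 ret = -EINVAL;          /* stored as 0xffffffea */
+
+    if (ret < 0)                /* never true: the comparison is unsigned */
+            return ret;
+
+With ret declared as int, negative errno values survive such comparisons
+and propagate to the caller as intended.
+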
+Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver")
+Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Link: https://patch.msgid.link/20250805142323.1949406-1-alok.a.tiwari@oracle.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/ti/icssg/icss_iep.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c
+index 50bfbc2779e4..d8c9fe1d98c4 100644
+--- a/drivers/net/ethernet/ti/icssg/icss_iep.c
++++ b/drivers/net/ethernet/ti/icssg/icss_iep.c
+@@ -621,7 +621,8 @@ static int icss_iep_pps_enable(struct icss_iep *iep, int on)
+
+ static int icss_iep_extts_enable(struct icss_iep *iep, u32 index, int on)
+ {
+- u32 val, cap, ret = 0;
++ u32 val, cap;
++ int ret = 0;
+
+ mutex_lock(&iep->ptp_clk_mutex);
+
+--
+2.50.1
+
--- /dev/null
+From 9d6910e013cfc66f3ee4de8d062a7ea32c989c2c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Aug 2025 23:08:12 +0530
+Subject: net: ti: icssg-prueth: Fix emac link speed handling
+
+From: MD Danish Anwar <danishanwar@ti.com>
+
+[ Upstream commit 06feac15406f4f66f4c0c6ea60b10d44775d4133 ]
+
+When link settings are changed, emac->speed is populated by
+emac_adjust_link(). The link speed and other settings are then written
+into the DRAM. However, if both ports are brought down after this and
+brought up again, or if the operating mode is changed and a firmware
+reload is needed, the DRAM is cleared by icssg_config(). As a result,
+the link settings are lost.
+
+Fix this by calling emac_adjust_link() after icssg_config(). This
+re-populates the settings in the DRAM after a new firmware load.
+
+Fixes: 9facce84f406 ("net: ti: icssg-prueth: Fix firmware load sequence.")
+Signed-off-by: MD Danish Anwar <danishanwar@ti.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Message-ID: <20250805173812.2183161-1-danishanwar@ti.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/ti/icssg/icssg_prueth.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
+index 2f5c4335dec3..008d77727400 100644
+--- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c
++++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
+@@ -50,6 +50,8 @@
+ /* CTRLMMR_ICSSG_RGMII_CTRL register bits */
+ #define ICSSG_CTRL_RGMII_ID_MODE BIT(24)
+
++static void emac_adjust_link(struct net_device *ndev);
++
+ static int emac_get_tx_ts(struct prueth_emac *emac,
+ struct emac_tx_ts_response *rsp)
+ {
+@@ -266,6 +268,10 @@ static int prueth_emac_common_start(struct prueth *prueth)
+ ret = icssg_config(prueth, emac, slice);
+ if (ret)
+ goto disable_class;
++
++ mutex_lock(&emac->ndev->phydev->lock);
++ emac_adjust_link(emac->ndev);
++ mutex_unlock(&emac->ndev->phydev->lock);
+ }
+
+ ret = prueth_emac_start(prueth);
+--
+2.50.1
+
--- /dev/null
+From 1061094cf0f7026f7919fac281a9d2e9cf45d5b0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+  if (res < 0) {
+          nf_conntrack_get(&ct->ct_general); // HERE
+          cb->args[1] = (unsigned long)ct;
+  ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, then the refcount of ct was already incremented.
+This 2nd increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+prevents cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
+
+This can be reproduced by running conntrack_resize.sh selftest in a loop.
+It takes ~20 minutes for me on a preemptible kernel on average before
+I see a runaway kworker spinning in nf_conntrack_cleanup_net_list.
+
+One fix would be to change this to:
+  if (res < 0) {
+          if (ct != last)
+                  nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
+A followup patch will do the same for ctnetlink_exp_dump_table;
+it looks to me as if it has the same problem. Like
+ctnetlink_dump_table, we only need a 'skip hint', not the actual
+object, so we can apply the same cookie strategy there as well.
+
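+Condensed sketch of the cookie approach (taken from the change below):
+
+    /* remember where to resume, without holding a reference */
+    if (res < 0) {
+            cb->args[1] = ctnetlink_get_id(ct); /* nf_ct_get_id(), 0 mapped to 1 */
+            ...
+    }
+
+    /* on the next invocation, skip entries until the cookie matches */
+    if (cb->args[1]) {
+            if (ctnetlink_get_id(ct) != last_id)
+                    continue;
+            cb->args[1] = 0;
+    }
+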
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 2cc0fde23344..5fdcae45e0bc 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -884,8 +884,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
+
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+- if (cb->args[1])
+- nf_ct_put((struct nf_conn *)cb->args[1]);
+ kfree(cb->data);
+ return 0;
+ }
+@@ -1208,19 +1206,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+ return 0;
+ }
+
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++ unsigned long id = nf_ct_get_id(ct);
++
++ return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+ unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
+ struct net *net = sock_net(skb->sk);
+- struct nf_conn *ct, *last;
++ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct nf_conn *nf_ct_evict[8];
++ struct nf_conn *ct;
+ int res, i;
+ spinlock_t *lockp;
+
+- last = (struct nf_conn *)cb->args[1];
+ i = 0;
+
+ local_bh_disable();
+@@ -1257,7 +1262,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ continue;
+
+ if (cb->args[1]) {
+- if (ct != last)
++ if (ctnetlink_get_id(ct) != last_id)
+ continue;
+ cb->args[1] = 0;
+ }
+@@ -1270,8 +1275,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct, true, flags);
+ if (res < 0) {
+- nf_conntrack_get(&ct->ct_general);
+- cb->args[1] = (unsigned long)ct;
++ cb->args[1] = ctnetlink_get_id(ct);
+ spin_unlock(lockp);
+ goto out;
+ }
+@@ -1284,12 +1288,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ }
+ out:
+ local_bh_enable();
+- if (last) {
++ if (last_id) {
+ /* nf ct hash resize happened, now clear the leftover. */
+- if ((struct nf_conn *)cb->args[1] == last)
++ if (cb->args[1] == last_id)
+ cb->args[1] = 0;
+-
+- nf_ct_put(last);
+ }
+
+ while (i) {
+--
+2.50.1
+
--- /dev/null
+From cdcbf5a86ed33261d6360945f9d9033691f6bda8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 28 Jul 2025 15:26:49 +0900
+Subject: ptp: prevent possible ABBA deadlock in ptp_clock_freerun()
+
+From: Jeongjun Park <aha310510@gmail.com>
+
+[ Upstream commit 2efe41234dbd0a83fdb7cd38226c2f70039a2cd3 ]
+
+syzbot reported the following ABBA deadlock:
+
+ CPU0                                  CPU1
+ ----                                  ----
+ n_vclocks_store()
+   lock(&ptp->n_vclocks_mux) [1]
+   (physical clock)
+                                       pc_clock_adjtime()
+                                         lock(&clk->rwsem) [2]
+                                         (physical clock)
+                                         ...
+                                         ptp_clock_freerun()
+                                           ptp_vclock_in_use()
+                                             lock(&ptp->n_vclocks_mux) [3]
+                                             (physical clock)
+ ptp_clock_unregister()
+   posix_clock_unregister()
+     lock(&clk->rwsem) [4]
+     (virtual clock)
+
+Since a ptp virtual clock is registered only under a ptp physical clock,
+both ptp_clock and posix_clock must be physical clocks for
+ptp_vclock_in_use() to lock &ptp->n_vclocks_mux and check ptp->n_vclocks.
+
+However, when unregistering vclocks in n_vclocks_store(), the
+ptp->n_vclocks_mux being locked is a physical clock lock, but the
+clk->rwsem taken by ptp_clock_unregister(), called through
+device_for_each_child_reverse(), is a virtual clock lock.
+
+Therefore, the clk->rwsem used on CPU0 and the clk->rwsem used on CPU1
+are different locks, but lockdep reports a false positive because the
+possibility of deadlock is determined per lock class.
+
+To solve this, a lock subclass annotation must be added to the
+posix_clock rwsem of the vclock.
+
+Reported-by: syzbot+7cfb66a237c4a5fb22ad@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=7cfb66a237c4a5fb22ad
+Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion")
+Signed-off-by: Jeongjun Park <aha310510@gmail.com>
+Acked-by: Richard Cochran <richardcochran@gmail.com>
+Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Link: https://patch.msgid.link/20250728062649.469882-1-aha310510@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/ptp/ptp_private.h | 5 +++++
+ drivers/ptp/ptp_vclock.c | 7 +++++++
+ 2 files changed, 12 insertions(+)
+
+diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
+index a6aad743c282..b352df4cd3f9 100644
+--- a/drivers/ptp/ptp_private.h
++++ b/drivers/ptp/ptp_private.h
+@@ -24,6 +24,11 @@
+ #define PTP_DEFAULT_MAX_VCLOCKS 20
+ #define PTP_MAX_CHANNELS 2048
+
++enum {
++ PTP_LOCK_PHYSICAL = 0,
++ PTP_LOCK_VIRTUAL,
++};
++
+ struct timestamp_event_queue {
+ struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS];
+ int head;
+diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
+index 7febfdcbde8b..8ed4b8598924 100644
+--- a/drivers/ptp/ptp_vclock.c
++++ b/drivers/ptp/ptp_vclock.c
+@@ -154,6 +154,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
+ return PTP_VCLOCK_REFRESH_INTERVAL;
+ }
+
++static void ptp_vclock_set_subclass(struct ptp_clock *ptp)
++{
++ lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL);
++}
++
+ static const struct ptp_clock_info ptp_vclock_info = {
+ .owner = THIS_MODULE,
+ .name = "ptp virtual clock",
+@@ -213,6 +218,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock)
+ return NULL;
+ }
+
++ ptp_vclock_set_subclass(vclock->clock);
++
+ timecounter_init(&vclock->tc, &vclock->cc, 0);
+ ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL);
+
+--
+2.50.1
+
--- /dev/null
+From 9340f5b60b7593637159f924e6d0f92ffc7effa9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 09:36:56 +0000
+Subject: riscv: dts: thead: Add APB clocks for TH1520 GMACs
+
+From: Yao Zi <ziyao@disroot.org>
+
+[ Upstream commit a7f75e2883c4bd57b12c3be61bb926929adad9c0 ]
+
+Describe perisys-apb4-hclk as the APB clock for TH1520 SoC, which is
+essential for accessing GMAC glue registers.
+
+Fixes: 7e756671a664 ("riscv: dts: thead: Add TH1520 ethernet nodes")
+Signed-off-by: Yao Zi <ziyao@disroot.org>
+Reviewed-by: Drew Fustini <fustini@kernel.org>
+Tested-by: Drew Fustini <fustini@kernel.org>
+Link: https://patch.msgid.link/20250808093655.48074-5-ziyao@disroot.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/riscv/boot/dts/thead/th1520.dtsi | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+diff --git a/arch/riscv/boot/dts/thead/th1520.dtsi b/arch/riscv/boot/dts/thead/th1520.dtsi
+index 527336417765..0aae4e6a5b33 100644
+--- a/arch/riscv/boot/dts/thead/th1520.dtsi
++++ b/arch/riscv/boot/dts/thead/th1520.dtsi
+@@ -286,8 +286,9 @@ gmac1: ethernet@ffe7060000 {
+ reg-names = "dwmac", "apb";
+ interrupts = <67 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "macirq";
+- clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC1>;
+- clock-names = "stmmaceth", "pclk";
++ clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC1>,
++ <&clk CLK_PERISYS_APB4_HCLK>;
++ clock-names = "stmmaceth", "pclk", "apb";
+ snps,pbl = <32>;
+ snps,fixed-burst;
+ snps,multicast-filter-bins = <64>;
+@@ -308,8 +309,9 @@ gmac0: ethernet@ffe7070000 {
+ reg-names = "dwmac", "apb";
+ interrupts = <66 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "macirq";
+- clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC0>;
+- clock-names = "stmmaceth", "pclk";
++ clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC0>,
++ <&clk CLK_PERISYS_APB4_HCLK>;
++ clock-names = "stmmaceth", "pclk", "apb";
+ snps,pbl = <32>;
+ snps,fixed-burst;
+ snps,multicast-filter-bins = <64>;
+--
+2.50.1
+
--- /dev/null
+From bccdcc26cdd1a2db18a98c8314555d471f0ea68b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:40:11 -0400
+Subject: sctp: linearize cloned gso packets in sctp_rcv
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ]
+
+A cloned head skb still shares the frag skbs in its fraglist with the
+original head skb, so it's not safe to access those frag skbs.
+
+syzbot reported two use-of-uninitialized-memory bugs caused by this:
+
+ BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+ sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+ sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998
+ sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88
+ sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331
+ sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122
+ __release_sock+0x1da/0x330 net/core/sock.c:3106
+ release_sock+0x6b/0x250 net/core/sock.c:3660
+ sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360
+ sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885
+ sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031
+ inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851
+ sock_sendmsg_nosec net/socket.c:718 [inline]
+
+and
+
+ BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+ sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+ sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88
+ sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331
+ sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148
+ __release_sock+0x1d3/0x330 net/core/sock.c:3213
+ release_sock+0x6b/0x270 net/core/sock.c:3767
+ sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367
+ sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886
+ sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032
+ inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851
+ sock_sendmsg_nosec net/socket.c:712 [inline]
+
+This patch fixes it by linearizing cloned gso packets in sctp_rcv().
+
+Fixes: 90017accff61 ("sctp: Add GSO support")
+Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com
+Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sctp/input.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sctp/input.c b/net/sctp/input.c
+index 0c0d2757f6f8..6fcdcaeed40e 100644
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -117,7 +117,7 @@ int sctp_rcv(struct sk_buff *skb)
+ * it's better to just linearize it otherwise crc computing
+ * takes longer.
+ */
+- if ((!is_gso && skb_linearize(skb)) ||
++ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+ !pskb_may_pull(skb, sizeof(struct sctphdr)))
+ goto discard_it;
+
+--
+2.50.1
+
block-make-req_op_zone_finish-a-write-operation.patch
mm-memory-tier-fix-abstract-distance-calculation-overflow.patch
mfd-cros_ec-separate-charge-control-probing-from-usb-pd.patch
+habanalabs-fix-uaf-in-export_dmabuf.patch
+mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch
+xfrm-restore-gso-for-sw-crypto.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
+net-hibmcge-fix-rtnl-deadlock-issue.patch
+net-hibmcge-fix-the-division-by-zero-issue.patch
+net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch
+net-ti-icssg-prueth-fix-emac-link-speed-handling.patch
+net-page_pool-allow-enabling-recycling-late-fix-fals.patch
+net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch
+sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
+net-lapbether-ignore-ops-locked-netdevs.patch
+hamradio-ignore-ops-locked-netdevs.patch
+erofs-fix-block-count-report-when-48-bit-layout-is-o.patch
+intel_idle-allow-loading-acpi-tables-for-any-family.patch
+cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
+net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch
+riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch
+ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch
+tls-handle-data-disappearing-from-under-the-tls-ulp.patch
+ipvs-fix-estimator-kthreads-preferred-affinity.patch
+net-kcm-fix-race-condition-in-kcm_unattach.patch
--- /dev/null
+From 2f6ca8c2086da5826a2e90788e7990e6a81f6da8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 16:29:06 -0700
+Subject: tls: handle data disappearing from under the TLS ULP
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 6db015fc4b5d5f63a64a193f65d98da3a7fc811d ]
+
+TLS expects that it owns the receive queue of the TCP socket.
+This cannot be guaranteed in case the reader of the TCP socket
+entered before the TLS ULP was installed, or uses some non-standard
+read API (e.g. zerocopy ones). Replace the WARN_ON() and a buggy
+early exit (which leaves the anchor pointing to a freed skb) with real
+error handling. Wipe the parsing state and tell the reader to retry.
+
+We already reload the anchor every time we (re)acquire the socket lock,
+so the only condition we need to avoid is an out-of-bounds read
+(not having enough bytes in the socket for the previously parsed record
+length).
+
+If some data was read from under TLS but there's enough in the queue,
+we'll reload and decrypt what is most likely not a valid TLS record,
+leading to some undefined behavior from the TLS perspective (corrupting
+a stream? missing an alert? missing an attack?), but no kernel crash
+should take place.
+
+Reported-by: William Liu <will@willsroot.io>
+Reported-by: Savino Dicanosa <savy@syst3mfailure.io>
+Link: https://lore.kernel.org/tFjq_kf7sWIG3A7CrCg_egb8CVsT_gsmHAK0_wxDPJXfIzxFAMxqmLwp3MlU5EHiet0AwwJldaaFdgyHpeIUCS-3m3llsmRzp9xIOBR4lAI=@syst3mfailure.io
+Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser")
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://patch.msgid.link/20250807232907.600366-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls.h | 2 +-
+ net/tls/tls_strp.c | 11 ++++++++---
+ net/tls/tls_sw.c | 3 ++-
+ 3 files changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/net/tls/tls.h b/net/tls/tls.h
+index 774859b63f0d..4e077068e6d9 100644
+--- a/net/tls/tls.h
++++ b/net/tls/tls.h
+@@ -196,7 +196,7 @@ void tls_strp_msg_done(struct tls_strparser *strp);
+ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb);
+ void tls_rx_msg_ready(struct tls_strparser *strp);
+
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
+ int tls_strp_msg_cow(struct tls_sw_context_rx *ctx);
+ struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx);
+ int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst);
+diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
+index 095cf31bae0b..d71643b494a1 100644
+--- a/net/tls/tls_strp.c
++++ b/net/tls/tls_strp.c
+@@ -475,7 +475,7 @@ static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len)
+ strp->stm.offset = offset;
+ }
+
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ {
+ struct strp_msg *rxm;
+ struct tls_msg *tlm;
+@@ -484,8 +484,11 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len);
+
+ if (!strp->copy_mode && force_refresh) {
+- if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len))
+- return;
++ if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) {
++ WRITE_ONCE(strp->msg_ready, 0);
++ memset(&strp->stm, 0, sizeof(strp->stm));
++ return false;
++ }
+
+ tls_strp_load_anchor_with_queue(strp, strp->stm.full_len);
+ }
+@@ -495,6 +498,8 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ rxm->offset = strp->stm.offset;
+ tlm = tls_msg(strp->anchor);
+ tlm->control = strp->mark;
++
++ return true;
+ }
+
+ /* Called with lock held on lower socket */
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index 549d1ea01a72..51c98a007dda 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1384,7 +1384,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
+ return sock_intr_errno(timeo);
+ }
+
+- tls_strp_msg_load(&ctx->strp, released);
++ if (unlikely(!tls_strp_msg_load(&ctx->strp, released)))
++ return tls_rx_rec_wait(sk, psock, nonblock, false);
+
+ return 1;
+ }
+--
+2.50.1
+
--- /dev/null
+From b41a25fd1983a12413c6fe2ac51da32c519ef08e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, where we can't rely on offloads because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
+
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index a1aca6308677..4245522d4201 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -61,7 +61,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+ remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+ skb->remcsum_offload = remcsum;
+
+- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+ /* Try to offload checksum if possible */
+ offload_csum = !!(need_csum &&
+ !need_ipsec &&
+--
+2.50.1
+
--- /dev/null
+From c4a6ec2c44c573d9dca08240a0c2e0c8ba20a461 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:25 +0200
+Subject: xfrm: restore GSO for SW crypto
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 234d1eff5d4987024be9d40ac07b918a5ae8db1a ]
+
+Commit 49431af6c4ef incorrectly assumes that the GSO path is only used
+by HW offload, but it's also useful for SW crypto.
+
+This patch re-enables GSO for SW crypto. It's not an exact revert, in order
+to preserve the other changes made to xfrm_dev_offload_ok afterwards, but
+it reverts all of its effects.
+
+Fixes: 49431af6c4ef ("xfrm: rely on XFRM offload")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
+Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/xfrm/xfrm_device.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
+index a2d3a5f3b485..a6c289858401 100644
+--- a/net/xfrm/xfrm_device.c
++++ b/net/xfrm/xfrm_device.c
+@@ -415,10 +415,12 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
+ struct net_device *dev = x->xso.dev;
+ bool check_tunnel_size;
+
+- if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED)
++ if (!x->type_offload ||
++ (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED && x->encap))
+ return false;
+
+- if ((dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) {
++ if ((!dev || dev == xfrm_dst_path(dst)->dev) &&
++ !xdst->child->xfrm) {
+ mtu = xfrm_state_mtu(x, xdst->child_mtu_cached);
+ if (skb->len <= mtu)
+ goto ok;
+@@ -430,6 +432,9 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
+ return false;
+
+ ok:
++ if (!dev)
++ return true;
++
+ check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET &&
+ x->props.mode == XFRM_MODE_TUNNEL;
+ switch (x->props.family) {
+--
+2.50.1
+
--- /dev/null
+From dddfb590c42c3c450b2cac204c7b69ea3c79f1d1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Aug 2025 11:29:07 -0700
+Subject: bnxt: fill data page pool with frags if PAGE_SIZE > BNXT_RX_PAGE_SIZE
+
+From: David Wei <dw@davidwei.uk>
+
+[ Upstream commit 39f8fcda2088382a4aa70b258d6f7225aa386f11 ]
+
+The data page pool always fills the HW rx ring with pages. On arm64 with
+64K pages, this will waste _at least_ 32K of memory per entry in the rx
+ring.
+
+Fix by fragmenting the pages if PAGE_SIZE > BNXT_RX_PAGE_SIZE. This
+makes the data page pool the same as the header pool.
+
+Tested with iperf3 with a small (64 entries) rx ring to encourage buffer
+circulation.
+
+Fixes: cd1fafe7da1f ("eth: bnxt: add support rx side device memory TCP")
+Reviewed-by: Michael Chan <michael.chan@broadcom.com>
+Signed-off-by: David Wei <dw@davidwei.uk>
+Link: https://patch.msgid.link/20250812182907.1540755-1-dw@davidwei.uk
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c | 12 +++++++++---
+ 1 file changed, 9 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+index 8d950b43846e..e165490af6ac 100644
+--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+@@ -921,15 +921,21 @@ static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
+
+ static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping,
+ struct bnxt_rx_ring_info *rxr,
++ unsigned int *offset,
+ gfp_t gfp)
+ {
+ netmem_ref netmem;
+
+- netmem = page_pool_alloc_netmems(rxr->page_pool, gfp);
++ if (PAGE_SIZE > BNXT_RX_PAGE_SIZE) {
++ netmem = page_pool_alloc_frag_netmem(rxr->page_pool, offset, BNXT_RX_PAGE_SIZE, gfp);
++ } else {
++ netmem = page_pool_alloc_netmems(rxr->page_pool, gfp);
++ *offset = 0;
++ }
+ if (!netmem)
+ return 0;
+
+- *mapping = page_pool_get_dma_addr_netmem(netmem);
++ *mapping = page_pool_get_dma_addr_netmem(netmem) + *offset;
+ return netmem;
+ }
+
+@@ -1024,7 +1030,7 @@ static int bnxt_alloc_rx_netmem(struct bnxt *bp, struct bnxt_rx_ring_info *rxr,
+ dma_addr_t mapping;
+ netmem_ref netmem;
+
+- netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, gfp);
++ netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, &offset, gfp);
+ if (!netmem)
+ return -ENOMEM;
+
+--
+2.50.1
+
--- /dev/null
+From 7cde332159d0e7a0dd2d95c8374ad52850b4db14 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Aug 2025 17:03:11 +0200
+Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data
+
+From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ]
+
+Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid
+discarding useful information") caused the number of wakeup interrupts
+to increase on an idle system [1], which was not expected to happen
+after merely allowing shallower idle states to be selected by the
+governor in some cases.
+
+However, on the system in question, all of the idle states deeper than
+WFI are rejected by the driver due to a firmware issue [2]. This causes
+the governor to only consider the recent interval duration data
+corresponding to attempts to enter WFI that are successful, and the
+recent intervals table is filled with values lower than the scheduler
+tick period. Consequently, the governor predicts an idle duration
+below the scheduler tick period length and avoids stopping the tick
+more often, which leads to the observed symptom.
+
+Address it by modifying the governor to update the recent intervals
+table also when entering the previously selected idle state fails, so
+it knows that the short idle intervals might have been the minority
+had the selected idle states been actually entered every time.
+
+Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information")
+Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1]
+Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2]
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Tested-by: Christian Loehle <christian.loehle@arm.com>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Christian Loehle <christian.loehle@arm.com>
+Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
+index 52d5d26fc7c6..81306612a5c6 100644
+--- a/drivers/cpuidle/governors/menu.c
++++ b/drivers/cpuidle/governors/menu.c
+@@ -97,6 +97,14 @@ static inline int which_bucket(u64 duration_ns)
+
+ static DEFINE_PER_CPU(struct menu_device, menu_devices);
+
++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us)
++{
++ /* Update the repeating-pattern data. */
++ data->intervals[data->interval_ptr++] = interval_us;
++ if (data->interval_ptr >= INTERVALS)
++ data->interval_ptr = 0;
++}
++
+ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
+
+ /*
+@@ -222,6 +230,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ if (data->needs_update) {
+ menu_update(drv, dev);
+ data->needs_update = 0;
++ } else if (!dev->last_residency_ns) {
++ /*
++ * This happens when the driver rejects the previously selected
++ * idle state and returns an error, so update the recent
++ * intervals table to prevent invalid information from being
++ * used going forward.
++ */
++ menu_update_intervals(data, UINT_MAX);
+ }
+
+ /* Find the shortest expected idle interval. */
+@@ -482,10 +498,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+
+ data->correction_factor[data->bucket] = new_factor;
+
+- /* update the repeating-pattern data */
+- data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
+- if (data->interval_ptr >= INTERVALS)
+- data->interval_ptr = 0;
++ menu_update_intervals(data, ktime_to_us(measured_ns));
+ }
+
+ /**
+--
+2.50.1
+
--- /dev/null
+From 04b932e34aabb02993b5947cbdae75426353973a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 16:20:19 +0800
+Subject: erofs: fix block count report when 48-bit layout is on
+
+From: Gao Xiang <hsiangkao@linux.alibaba.com>
+
+[ Upstream commit 0b96d9bed324a1c1b7d02bfb9596351ef178428d ]
+
+Fix incorrect shift order when combining the 48-bit block count.
+
+Fixes: 2e1473d5195f ("erofs: implement 48-bit block addressing for unencoded inodes")
+Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+Link: https://lore.kernel.org/r/20250807082019.3093539-1-hsiangkao@linux.alibaba.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/erofs/super.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/fs/erofs/super.c b/fs/erofs/super.c
+index e1e9f06e8342..799fef437aa8 100644
+--- a/fs/erofs/super.c
++++ b/fs/erofs/super.c
+@@ -313,8 +313,8 @@ static int erofs_read_superblock(struct super_block *sb)
+ sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
+ if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) {
+ sbi->root_nid = le64_to_cpu(dsb->rootnid_8b);
+- sbi->dif0.blocks = (sbi->dif0.blocks << 32) |
+- le16_to_cpu(dsb->rb.blocks_hi);
++ sbi->dif0.blocks = sbi->dif0.blocks |
++ ((u64)le16_to_cpu(dsb->rb.blocks_hi) << 32);
+ } else {
+ sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b);
+ }
+--
+2.50.1
+
--- /dev/null
+From 94bb4ee76202eae8bd46a9e2051579b0bd2c8aa8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 12 Jul 2025 06:02:31 +0100
+Subject: habanalabs: fix UAF in export_dmabuf()
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+[ Upstream commit 33927f3d0ecdcff06326d6e4edb6166aed42811c ]
+
+As soon as we'd inserted a file reference into the descriptor table, another
+thread could close it. That's fine for the case when all we are doing is
+returning that descriptor to userland (it's a race, but it's a userland
+race and there's nothing the kernel can do about it). However, if we
+follow fd_install() with any kind of access to objects that would be
+destroyed on close (be it the struct file itself or anything destroyed
+by its ->release()), we have a UAF.
+
+dma_buf_fd() is a combination of reserving a descriptor and fd_install().
+habanalabs export_dmabuf() calls it and then proceeds to access the
+objects destroyed on close. In particular, it grabs an extra reference to
+another struct file that will be dropped as part of ->release() for ours;
+that "will be" is actually "might have already been".
+
+Fix that by reserving the descriptor before anything else and doing fd_install()
+only when everything has been set up. As a side benefit, we no longer
+have the failure exit with file already created, but reference to
+underlying file (as well as ->dmabuf_export_cnt, etc.) not grabbed yet;
+unlike dma_buf_fd(), fd_install() can't fail.
+
+Fixes: db1a8dd916aa ("habanalabs: add support for dma-buf exporter")
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/accel/habanalabs/common/memory.c | 23 +++++++----------------
+ 1 file changed, 7 insertions(+), 16 deletions(-)
+
+diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c
+index 601fdbe70179..61472a381904 100644
+--- a/drivers/accel/habanalabs/common/memory.c
++++ b/drivers/accel/habanalabs/common/memory.c
+@@ -1829,9 +1829,6 @@ static void hl_release_dmabuf(struct dma_buf *dmabuf)
+ struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv;
+ struct hl_ctx *ctx;
+
+- if (!hl_dmabuf)
+- return;
+-
+ ctx = hl_dmabuf->ctx;
+
+ if (hl_dmabuf->memhash_hnode)
+@@ -1859,7 +1856,12 @@ static int export_dmabuf(struct hl_ctx *ctx,
+ {
+ DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+ struct hl_device *hdev = ctx->hdev;
+- int rc, fd;
++ CLASS(get_unused_fd, fd)(flags);
++
++ if (fd < 0) {
++ dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
++ return fd;
++ }
+
+ exp_info.ops = &habanalabs_dmabuf_ops;
+ exp_info.size = total_size;
+@@ -1872,13 +1874,6 @@ static int export_dmabuf(struct hl_ctx *ctx,
+ return PTR_ERR(hl_dmabuf->dmabuf);
+ }
+
+- fd = dma_buf_fd(hl_dmabuf->dmabuf, flags);
+- if (fd < 0) {
+- dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
+- rc = fd;
+- goto err_dma_buf_put;
+- }
+-
+ hl_dmabuf->ctx = ctx;
+ hl_ctx_get(hl_dmabuf->ctx);
+ atomic_inc(&ctx->hdev->dmabuf_export_cnt);
+@@ -1890,13 +1885,9 @@ static int export_dmabuf(struct hl_ctx *ctx,
+ get_file(ctx->hpriv->file_priv->filp);
+
+ *dmabuf_fd = fd;
++ fd_install(take_fd(fd), hl_dmabuf->dmabuf->file);
+
+ return 0;
+-
+-err_dma_buf_put:
+- hl_dmabuf->dmabuf->priv = NULL;
+- dma_buf_put(hl_dmabuf->dmabuf);
+- return rc;
+ }
+
+ static int validate_export_params_common(struct hl_device *hdev, u64 addr, u64 size, u64 offset)
+--
+2.50.1
+
--- /dev/null
+From f83f4f27bb385d16cb1c9541d6b03c8088d15f0d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 14:37:26 -0700
+Subject: hamradio: ignore ops-locked netdevs
+
+From: Stanislav Fomichev <sdf@fomichev.me>
+
+[ Upstream commit c64237960819aee1766d03f446ae6de94b1e3f73 ]
+
+Syzkaller managed to trigger a lock dependency in xsk_notify via
+register_netdevice. As discussed in [0], using register_netdevice
+in the notifiers is problematic, so skip adding hamradio for ops-locked
+devices.
+
+ xsk_notifier+0x89/0x230 net/xdp/xsk.c:1664
+ notifier_call_chain+0x1b6/0x3e0 kernel/notifier.c:85
+ call_netdevice_notifiers_extack net/core/dev.c:2267 [inline]
+ call_netdevice_notifiers net/core/dev.c:2281 [inline]
+ unregister_netdevice_many_notify+0x14d7/0x1ff0 net/core/dev.c:12156
+ unregister_netdevice_many net/core/dev.c:12219 [inline]
+ unregister_netdevice_queue+0x33c/0x380 net/core/dev.c:12063
+ register_netdevice+0x1689/0x1ae0 net/core/dev.c:11241
+ bpq_new_device drivers/net/hamradio/bpqether.c:481 [inline]
+ bpq_device_event+0x491/0x600 drivers/net/hamradio/bpqether.c:523
+ notifier_call_chain+0x1b6/0x3e0 kernel/notifier.c:85
+ call_netdevice_notifiers_extack net/core/dev.c:2267 [inline]
+ call_netdevice_notifiers net/core/dev.c:2281 [inline]
+ __dev_notify_flags+0x18d/0x2e0 net/core/dev.c:-1
+ netif_change_flags+0xe8/0x1a0 net/core/dev.c:9608
+ dev_change_flags+0x130/0x260 net/core/dev_api.c:68
+ devinet_ioctl+0xbb4/0x1b50 net/ipv4/devinet.c:1200
+ inet_ioctl+0x3c0/0x4c0 net/ipv4/af_inet.c:1001
+
+0: https://lore.kernel.org/netdev/20250625140357.6203d0af@kernel.org/
+Fixes: 4c975fd70002 ("net: hold instance lock during NETDEV_REGISTER/UP")
+Suggested-by: Jakub Kicinski <kuba@kernel.org>
+Reported-by: syzbot+e6300f66a999a6612477@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e6300f66a999a6612477
+Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
+Link: https://patch.msgid.link/20250806213726.1383379-2-sdf@fomichev.me
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/hamradio/bpqether.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c
+index 0e0fe32d2da4..045c5177262e 100644
+--- a/drivers/net/hamradio/bpqether.c
++++ b/drivers/net/hamradio/bpqether.c
+@@ -138,7 +138,7 @@ static inline struct net_device *bpq_get_ax25_dev(struct net_device *dev)
+
+ static inline int dev_is_ethdev(struct net_device *dev)
+ {
+- return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5);
++ return dev->type == ARPHRD_ETHER && !netdev_need_ops_lock(dev);
+ }
+
+ /* ------------------------------------------------------------------------ */
+--
+2.50.1
+
--- /dev/null
+From 5bc45712394018e4bf1a06c92753e90d4d00252a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 15:37:14 -0400
+Subject: intel_idle: Allow loading ACPI tables for any family
+
+From: Len Brown <len.brown@intel.com>
+
+[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ]
+
+There is no reason to limit intel_idle's loading of ACPI tables to
+family 6. Upcoming Intel processors are not in family 6.
+
+Below "Fixes" really means "applies cleanly until".
+That syntax commit didn't change the previous logic,
+but shows this patch applies back 5 years.
+
+Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros")
+Signed-off-by: Len Brown <len.brown@intel.com>
+Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/idle/intel_idle.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
+index 73747d20df85..91a7b7e7c0c8 100644
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -1679,7 +1679,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
+ };
+
+ static const struct x86_cpu_id intel_mwait_ids[] __initconst = {
+- X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL),
++ X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL),
+ {}
+ };
+
+--
+2.50.1
+
--- /dev/null
+From f4ed6bb5279fbef65bd65697b4e54aa15facbcaa Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 29 Jul 2025 14:26:11 +0200
+Subject: ipvs: Fix estimator kthreads preferred affinity
+
+From: Frederic Weisbecker <frederic@kernel.org>
+
+[ Upstream commit c0a23bbc98e93704a1f4fb5e7e7bb2d7c0fb6eb3 ]
+
+The estimator kthreads' affinity is defined by sysctl-overridden
+preferences and applied through a plain call to the scheduler's affinity
+API.
+
+However, since the introduction of managed kthread preferred affinity,
+such a practice shortcuts the kthread core code, which eventually
+overwrites the target with the default unbound affinity.
+
+Fix this by using the appropriate kthread API.
+
+Fixes: d1a89197589c ("kthread: Default affine kthread to its preferred NUMA node")
+Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
+Acked-by: Julian Anastasov <ja@ssi.bg>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/ip_vs.h | 13 +++++++++++++
+ kernel/kthread.c | 1 +
+ net/netfilter/ipvs/ip_vs_est.c | 3 ++-
+ 3 files changed, 16 insertions(+), 1 deletion(-)
+
+diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
+index ff406ef4fd4a..29a36709e7f3 100644
+--- a/include/net/ip_vs.h
++++ b/include/net/ip_vs.h
+@@ -1163,6 +1163,14 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
+ return housekeeping_cpumask(HK_TYPE_KTHREAD);
+ }
+
++static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs)
++{
++ if (ipvs->est_cpulist_valid)
++ return ipvs->sysctl_est_cpulist;
++ else
++ return NULL;
++}
++
+ static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
+ {
+ return ipvs->sysctl_est_nice;
+@@ -1270,6 +1278,11 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
+ return housekeeping_cpumask(HK_TYPE_KTHREAD);
+ }
+
++static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs)
++{
++ return NULL;
++}
++
+ static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
+ {
+ return IPVS_EST_NICE;
+diff --git a/kernel/kthread.c b/kernel/kthread.c
+index 85fc068f0083..8d5e87b03d1e 100644
+--- a/kernel/kthread.c
++++ b/kernel/kthread.c
+@@ -894,6 +894,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
+
+ return ret;
+ }
++EXPORT_SYMBOL_GPL(kthread_affine_preferred);
+
+ /*
+ * Re-affine kthreads according to their preferences
+diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
+index f821ad2e19b3..15049b826732 100644
+--- a/net/netfilter/ipvs/ip_vs_est.c
++++ b/net/netfilter/ipvs/ip_vs_est.c
+@@ -265,7 +265,8 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
+ }
+
+ set_user_nice(kd->task, sysctl_est_nice(ipvs));
+- set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));
++ if (sysctl_est_preferred_cpulist(ipvs))
++ kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs));
+
+ pr_info("starting estimator thread %d...\n", kd->id);
+ wake_up_process(kd->task);
+--
+2.50.1
+
--- /dev/null
+From fbc4eefd9eaf2473aa9cf85fa2deda55d8ec654d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 24 Jul 2025 17:09:56 +0800
+Subject: mm/smaps: fix race between smaps_hugetlb_range and migration
+
+From: Jinjiang Tu <tujinjiang@huawei.com>
+
+[ Upstream commit 45d19b4b6c2d422771c29b83462d84afcbb33f01 ]
+
+smaps_hugetlb_range() handles the pte without holding the ptl, and may run
+concurrently with migration, leading to a BUG_ON in pfn_swap_entry_to_page().
+The race is as follows.
+
+smaps_hugetlb_range migrate_pages
+ huge_ptep_get
+ remove_migration_ptes
+ folio_unlock
+ pfn_swap_entry_folio
+ BUG_ON
+
+To fix it, hold the ptl lock in smaps_hugetlb_range().
+
+Link: https://lkml.kernel.org/r/20250724090958.455887-1-tujinjiang@huawei.com
+Link: https://lkml.kernel.org/r/20250724090958.455887-2-tujinjiang@huawei.com
+Fixes: 25ee01a2fca0 ("mm: hugetlb: proc: add hugetlb-related fields to /proc/PID/smaps")
+Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Andrei Vagin <avagin@gmail.com>
+Cc: Andrii Nakryiko <andrii@kernel.org>
+Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
+Cc: Brahmajit Das <brahmajit.xyz@gmail.com>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Dev Jain <dev.jain@arm.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Joern Engel <joern@logfs.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/proc/task_mmu.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
+index 751479eb128f..0102ab3aaec1 100644
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -1020,10 +1020,13 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+ {
+ struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+- pte_t ptent = huge_ptep_get(walk->mm, addr, pte);
+ struct folio *folio = NULL;
+ bool present = false;
++ spinlock_t *ptl;
++ pte_t ptent;
+
++ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
++ ptent = huge_ptep_get(walk->mm, addr, pte);
+ if (pte_present(ptent)) {
+ folio = page_folio(pte_page(ptent));
+ present = true;
+@@ -1042,6 +1045,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+ else
+ mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+ }
++ spin_unlock(ptl);
+ return 0;
+ }
+ #else
+--
+2.50.1
+
--- /dev/null
+From f88d08d5c14c6994d3611fc3adc2f16564729220 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 18:27:56 +0800
+Subject: net: hibmcge: fix rtnl deadlock issue
+
+From: Jijie Shao <shaojijie@huawei.com>
+
+[ Upstream commit c875503a9b9082928d7d3fc60b5400d16fbfae4e ]
+
+Currently, the hibmcge netdev acquires the rtnl_lock in
+pci_error_handlers.reset_prepare() and releases it in
+pci_error_handlers.reset_done().
+
+However, in the PCI framework:
+pci_reset_bus - __pci_reset_slot - pci_slot_save_and_disable_locked -
+ pci_dev_save_and_disable - err_handler->reset_prepare(dev);
+
+In pci_slot_save_and_disable_locked():
+ list_for_each_entry(dev, &slot->bus->devices, bus_list) {
+ if (!dev->slot || dev->slot!= slot)
+ continue;
+ pci_dev_save_and_disable(dev);
+ if (dev->subordinate)
+ pci_bus_save_and_disable_locked(dev->subordinate);
+ }
+
+This will iterate through all devices under the current bus and execute
+err_handler->reset_prepare(), causing two devices of the hibmcge driver
+to sequentially request the rtnl_lock, leading to a deadlock.
+
+Since the driver now executes netif_device_detach()
+before the reset process, it will not run concurrently with
+other netdev APIs, so there is no need to hold the rtnl_lock now.
+
+Therefore, this patch removes the rtnl_lock during the reset process and
+adjusts the position of HBG_NIC_STATE_RESETTING to ensure
+that multiple resets are not executed concurrently.
+
+Fixes: 3f5a61f6d504f ("net: hibmcge: Add reset supported in this module")
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c | 14 +++++---------
+ 1 file changed, 5 insertions(+), 9 deletions(-)
+
+diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
+index ff3295b60a69..dee1e8681157 100644
+--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
+@@ -53,9 +53,11 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type)
+ {
+ int ret;
+
+- ASSERT_RTNL();
++ if (test_and_set_bit(HBG_NIC_STATE_RESETTING, &priv->state))
++ return -EBUSY;
+
+ if (netif_running(priv->netdev)) {
++ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+ dev_warn(&priv->pdev->dev,
+ "failed to reset because port is up\n");
+ return -EBUSY;
+@@ -64,7 +66,6 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type)
+ netif_device_detach(priv->netdev);
+
+ priv->reset_type = type;
+- set_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+ clear_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state);
+ ret = hbg_hw_event_notify(priv, HBG_HW_EVENT_RESET);
+ if (ret) {
+@@ -83,28 +84,25 @@ static int hbg_reset_done(struct hbg_priv *priv, enum hbg_reset_type type)
+ type != priv->reset_type)
+ return 0;
+
+- ASSERT_RTNL();
+-
+- clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+ ret = hbg_rebuild(priv);
+ if (ret) {
+ set_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state);
++ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+ dev_err(&priv->pdev->dev, "failed to rebuild after reset\n");
+ return ret;
+ }
+
+ netif_device_attach(priv->netdev);
++ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
+
+ dev_info(&priv->pdev->dev, "reset done\n");
+ return ret;
+ }
+
+-/* must be protected by rtnl lock */
+ int hbg_reset(struct hbg_priv *priv)
+ {
+ int ret;
+
+- ASSERT_RTNL();
+ ret = hbg_reset_prepare(priv, HBG_RESET_TYPE_FUNCTION);
+ if (ret)
+ return ret;
+@@ -169,7 +167,6 @@ static void hbg_pci_err_reset_prepare(struct pci_dev *pdev)
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct hbg_priv *priv = netdev_priv(netdev);
+
+- rtnl_lock();
+ hbg_reset_prepare(priv, HBG_RESET_TYPE_FLR);
+ }
+
+@@ -179,7 +176,6 @@ static void hbg_pci_err_reset_done(struct pci_dev *pdev)
+ struct hbg_priv *priv = netdev_priv(netdev);
+
+ hbg_reset_done(priv, HBG_RESET_TYPE_FLR);
+- rtnl_unlock();
+ }
+
+ static const struct pci_error_handlers hbg_pci_err_handler = {
+--
+2.50.1
+
--- /dev/null
+From 2d5cc1e9320bffb1c936a0e982dce8ab8803a836 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 18:27:57 +0800
+Subject: net: hibmcge: fix the division by zero issue
+
+From: Jijie Shao <shaojijie@huawei.com>
+
+[ Upstream commit 7004b26f0b64331143eb0b312e77a357a11427ce ]
+
+When the network port is down, the queue is released, and ring->len is 0.
+In debugfs, hbg_get_queue_used_num() will be called,
+which may lead to a division by zero issue.
+
+This patch adds a check: if ring->len is 0,
+hbg_get_queue_used_num() directly returns 0.
+
+Fixes: 40735e7543f9 ("net: hibmcge: Implement .ndo_start_xmit function")
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h
+index 2883a5899ae2..8b6110599e10 100644
+--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h
++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h
+@@ -29,7 +29,12 @@ static inline bool hbg_fifo_is_full(struct hbg_priv *priv, enum hbg_dir dir)
+
+ static inline u32 hbg_get_queue_used_num(struct hbg_ring *ring)
+ {
+- return (ring->ntu + ring->len - ring->ntc) % ring->len;
++ u32 len = READ_ONCE(ring->len);
++
++ if (!len)
++ return 0;
++
++ return (READ_ONCE(ring->ntu) + len - READ_ONCE(ring->ntc)) % len;
+ }
+
+ netdev_tx_t hbg_net_start_xmit(struct sk_buff *skb, struct net_device *netdev);
+--
+2.50.1
+
--- /dev/null
+From d8162f09505cdd9f80470c636561332cc9e2e7d0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 18:27:58 +0800
+Subject: net: hibmcge: fix the np_link_fail error reporting issue
+
+From: Jijie Shao <shaojijie@huawei.com>
+
+[ Upstream commit 62c50180ffda01468e640ac14925503796f255e2 ]
+
+Currently, after modifying device port mode, the np_link_ok state
+is immediately checked. At this point, the device may not be ready yet,
+leading to the querying of an intermediate state.
+
+This patch polls to check whether np_link is ok after
+modifying the device port mode, and only reports np_link_fail upon timeout.
+
+Fixes: e0306637e85d ("net: hibmcge: Add support for mac link exception handling feature")
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c | 15 +++++++++++++--
+ 1 file changed, 13 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c
+index 9b65eef62b3f..2844124f306d 100644
+--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c
++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c
+@@ -12,6 +12,8 @@
+
+ #define HBG_HW_EVENT_WAIT_TIMEOUT_US (2 * 1000 * 1000)
+ #define HBG_HW_EVENT_WAIT_INTERVAL_US (10 * 1000)
++#define HBG_MAC_LINK_WAIT_TIMEOUT_US (500 * 1000)
++#define HBG_MAC_LINK_WAIT_INTERVAL_US (5 * 1000)
+ /* little endian or big endian.
+ * ctrl means packet description, data means skb packet data
+ */
+@@ -213,6 +215,9 @@ void hbg_hw_fill_buffer(struct hbg_priv *priv, u32 buffer_dma_addr)
+
+ void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex)
+ {
++ u32 link_status;
++ int ret;
++
+ hbg_hw_mac_enable(priv, HBG_STATUS_DISABLE);
+
+ hbg_reg_write_field(priv, HBG_REG_PORT_MODE_ADDR,
+@@ -224,8 +229,14 @@ void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex)
+
+ hbg_hw_mac_enable(priv, HBG_STATUS_ENABLE);
+
+- if (!hbg_reg_read_field(priv, HBG_REG_AN_NEG_STATE_ADDR,
+- HBG_REG_AN_NEG_STATE_NP_LINK_OK_B))
++ /* wait MAC link up */
++ ret = readl_poll_timeout(priv->io_base + HBG_REG_AN_NEG_STATE_ADDR,
++ link_status,
++ FIELD_GET(HBG_REG_AN_NEG_STATE_NP_LINK_OK_B,
++ link_status),
++ HBG_MAC_LINK_WAIT_INTERVAL_US,
++ HBG_MAC_LINK_WAIT_TIMEOUT_US);
++ if (ret)
+ hbg_np_link_fail_task_schedule(priv);
+ }
+
+--
+2.50.1
+
--- /dev/null
+From b6830a257156db66722ed8d61507ba2528f6c0a0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Aug 2025 21:18:03 +0200
+Subject: net: kcm: Fix race condition in kcm_unattach()
+
+From: Sven Stegemann <sven@stegemann.de>
+
+[ Upstream commit 52565a935213cd6a8662ddb8efe5b4219343a25d ]
+
+syzbot found a race condition when kcm_unattach(psock)
+and kcm_release(kcm) are executed at the same time.
+
+kcm_unattach() is missing a check of the flag
+kcm->tx_stopped before calling queue_work().
+
+If the kcm has a reserved psock, kcm_unattach() might get executed
+between cancel_work_sync() and unreserve_psock() in kcm_release(),
+requeuing kcm->tx_work right before kcm gets freed in kcm_done().
+
+Remove kcm->tx_stopped and replace it with the less
+error-prone disable_work_sync().
+
+Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module")
+Reported-by: syzbot+e62c9db591c30e174662@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e62c9db591c30e174662
+Reported-by: syzbot+d199b52665b6c3069b94@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=d199b52665b6c3069b94
+Reported-by: syzbot+be6b1fdfeae512726b4e@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=be6b1fdfeae512726b4e
+Signed-off-by: Sven Stegemann <sven@stegemann.de>
+Link: https://patch.msgid.link/20250812191810.27777-1-sven@stegemann.de
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/kcm.h | 1 -
+ net/kcm/kcmsock.c | 10 ++--------
+ 2 files changed, 2 insertions(+), 9 deletions(-)
+
+diff --git a/include/net/kcm.h b/include/net/kcm.h
+index 441e993be634..d9c35e71ecea 100644
+--- a/include/net/kcm.h
++++ b/include/net/kcm.h
+@@ -71,7 +71,6 @@ struct kcm_sock {
+ struct list_head wait_psock_list;
+ struct sk_buff *seq_skb;
+ struct mutex tx_mutex;
+- u32 tx_stopped : 1;
+
+ /* Don't use bit fields here, these are set under different locks */
+ bool tx_wait;
+diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
+index c05047dad62d..d0a001ebabfe 100644
+--- a/net/kcm/kcmsock.c
++++ b/net/kcm/kcmsock.c
+@@ -430,7 +430,7 @@ static void psock_write_space(struct sock *sk)
+
+ /* Check if the socket is reserved so someone is waiting for sending. */
+ kcm = psock->tx_kcm;
+- if (kcm && !unlikely(kcm->tx_stopped))
++ if (kcm)
+ queue_work(kcm_wq, &kcm->tx_work);
+
+ spin_unlock_bh(&mux->lock);
+@@ -1694,12 +1694,6 @@ static int kcm_release(struct socket *sock)
+ */
+ __skb_queue_purge(&sk->sk_write_queue);
+
+- /* Set tx_stopped. This is checked when psock is bound to a kcm and we
+- * get a writespace callback. This prevents further work being queued
+- * from the callback (unbinding the psock occurs after canceling work.
+- */
+- kcm->tx_stopped = 1;
+-
+ release_sock(sk);
+
+ spin_lock_bh(&mux->lock);
+@@ -1715,7 +1709,7 @@ static int kcm_release(struct socket *sock)
+ /* Cancel work. After this point there should be no outside references
+ * to the kcm socket.
+ */
+- cancel_work_sync(&kcm->tx_work);
++ disable_work_sync(&kcm->tx_work);
+
+ lock_sock(sk);
+ psock = kcm->tx_psock;
+--
+2.50.1
+
--- /dev/null
+From 3fdafaf1ab15d2a63c9237a845b9c1448d472fe2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 14:37:25 -0700
+Subject: net: lapbether: ignore ops-locked netdevs
+
+From: Stanislav Fomichev <sdf@fomichev.me>
+
+[ Upstream commit 53898ebabe843bfa7baea9dae152797d5d0563c9 ]
+
+Syzkaller managed to trigger a lock dependency in xsk_notify via
+register_netdevice. As discussed in [0], using register_netdevice
+in the notifiers is problematic, so skip adding lapbeth for ops-locked
+devices.
+
+ xsk_notifier+0xa4/0x280 net/xdp/xsk.c:1645
+ notifier_call_chain+0xbc/0x410 kernel/notifier.c:85
+ call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:2230
+ call_netdevice_notifiers_extack net/core/dev.c:2268 [inline]
+ call_netdevice_notifiers net/core/dev.c:2282 [inline]
+ unregister_netdevice_many_notify+0xf9d/0x2700 net/core/dev.c:12077
+ unregister_netdevice_many net/core/dev.c:12140 [inline]
+ unregister_netdevice_queue+0x305/0x3f0 net/core/dev.c:11984
+ register_netdevice+0x18f1/0x2270 net/core/dev.c:11149
+ lapbeth_new_device drivers/net/wan/lapbether.c:420 [inline]
+ lapbeth_device_event+0x5b1/0xbe0 drivers/net/wan/lapbether.c:462
+ notifier_call_chain+0xbc/0x410 kernel/notifier.c:85
+ call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:2230
+ call_netdevice_notifiers_extack net/core/dev.c:2268 [inline]
+ call_netdevice_notifiers net/core/dev.c:2282 [inline]
+ __dev_notify_flags+0x12c/0x2e0 net/core/dev.c:9497
+ netif_change_flags+0x108/0x160 net/core/dev.c:9526
+ dev_change_flags+0xba/0x250 net/core/dev_api.c:68
+ devinet_ioctl+0x11d5/0x1f50 net/ipv4/devinet.c:1200
+ inet_ioctl+0x3a7/0x3f0 net/ipv4/af_inet.c:1001
+
+0: https://lore.kernel.org/netdev/20250625140357.6203d0af@kernel.org/
+Fixes: 4c975fd70002 ("net: hold instance lock during NETDEV_REGISTER/UP")
+Suggested-by: Jakub Kicinski <kuba@kernel.org>
+Reported-by: syzbot+e67ea9c235b13b4f0020@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e67ea9c235b13b4f0020
+Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
+Link: https://patch.msgid.link/20250806213726.1383379-1-sdf@fomichev.me
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/wan/lapbether.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c
+index 995a7207bdf8..f357a7ac70ac 100644
+--- a/drivers/net/wan/lapbether.c
++++ b/drivers/net/wan/lapbether.c
+@@ -81,7 +81,7 @@ static struct lapbethdev *lapbeth_get_x25_dev(struct net_device *dev)
+
+ static __inline__ int dev_is_ethdev(struct net_device *dev)
+ {
+- return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5);
++ return dev->type == ARPHRD_ETHER && !netdev_need_ops_lock(dev);
+ }
+
+ /* ------------------------------------------------------------------------ */
+--
+2.50.1
+
--- /dev/null
+From 133eb2c7ce6134ab45bff3f7632a07aa5bf086a5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:54:49 +0200
+Subject: net: mdiobus: release reset_gpio in mdiobus_unregister_device()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Buday Csaba <buday.csaba@prolan.hu>
+
+[ Upstream commit 8ea25274ebaf2f6be8be374633b2ed8348ec0e70 ]
+
+reset_gpio is claimed in mdiobus_register_device(), but it is not
+released in mdiobus_unregister_device(). It is instead only
+released when the whole MDIO bus is unregistered.
+When a device uses the reset_gpio property, it becomes impossible
+to unregister it and register it again, because the GPIO remains
+claimed.
+This patch resolves that issue.
+
+Fixes: bafbdd527d56 ("phylib: Add device reset GPIO support") # see notes
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Cc: Csókás Bence <csokas.bence@prolan.hu>
+[ csokas.bence: Resolve rebase conflict and clarify msg ]
+Signed-off-by: Buday Csaba <buday.csaba@prolan.hu>
+Link: https://patch.msgid.link/20250807135449.254254-2-csokas.bence@prolan.hu
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/phy/mdio_bus.c | 1 +
+ drivers/net/phy/mdio_bus_provider.c | 3 ---
+ 2 files changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
+index fda2e27c1810..cad6ed3aa10b 100644
+--- a/drivers/net/phy/mdio_bus.c
++++ b/drivers/net/phy/mdio_bus.c
+@@ -91,6 +91,7 @@ int mdiobus_unregister_device(struct mdio_device *mdiodev)
+ if (mdiodev->bus->mdio_map[mdiodev->addr] != mdiodev)
+ return -EINVAL;
+
++ gpiod_put(mdiodev->reset_gpio);
+ reset_control_put(mdiodev->reset_ctrl);
+
+ mdiodev->bus->mdio_map[mdiodev->addr] = NULL;
+diff --git a/drivers/net/phy/mdio_bus_provider.c b/drivers/net/phy/mdio_bus_provider.c
+index 65850e36284d..5401170f14e5 100644
+--- a/drivers/net/phy/mdio_bus_provider.c
++++ b/drivers/net/phy/mdio_bus_provider.c
+@@ -444,9 +444,6 @@ void mdiobus_unregister(struct mii_bus *bus)
+ if (!mdiodev)
+ continue;
+
+- if (mdiodev->reset_gpio)
+- gpiod_put(mdiodev->reset_gpio);
+-
+ mdiodev->device_remove(mdiodev);
+ mdiodev->device_free(mdiodev);
+ }
+--
+2.50.1
+
--- /dev/null
+From a2759ceb0274ed9b8379560e6e516170a71b2101 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 17:36:54 -0700
+Subject: net: page_pool: allow enabling recycling late, fix false positive
+ warning
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 64fdaa94bfe0cca3a0f4b2dd922486c5f59fe678 ]
+
+Page pool can have pages "directly" (locklessly) recycled to it,
+if the NAPI that owns the page pool is scheduled to run on the same CPU.
+To make this safe we check that the NAPI is disabled while we destroy
+the page pool. In most cases NAPI and page pool lifetimes are tied
+together so this happens naturally.
+
+The queue API expects the following order of calls:
+ -> mem_alloc
+ alloc new pp
+ -> stop
+ napi_disable
+ -> start
+ napi_enable
+ -> mem_free
+ free old pp
+
+Here we allocate the page pool in ->mem_alloc and free in ->mem_free.
+But the NAPIs are only stopped between ->stop and ->start. We created
+page_pool_disable_direct_recycling() to safely shut down the recycling
+in ->stop. This way the page_pool_destroy() call in ->mem_free doesn't
+have to worry about recycling any more.
+
+Unfortunately, page_pool_disable_direct_recycling() is not enough
+to deal with failures which necessitate freeing the _new_ page pool.
+If we hit a failure in ->mem_alloc or ->stop the new page pool has
+to be freed while the NAPI is active (assuming driver attaches the
+page pool to an existing NAPI instance and doesn't reallocate NAPIs).
+
+Freeing the new page pool is technically safe because it hasn't been
+used for any packets, yet, so there can be no recycling. But the check
+in napi_assert_will_not_race() has no way of knowing that. We could
+check if page pool is empty but that'd make the check much less likely
+to trigger during development.
+
+Add page_pool_enable_direct_recycling(), pairing with
+page_pool_disable_direct_recycling(). It will allow us to create the new
+page pools in "disabled" state and only enable recycling when we know
+the reconfig operation will not fail.
+
+Coincidentally it will also let us re-enable the recycling for the old
+pool, if the reconfig failed:
+
+ -> mem_alloc (new)
+ -> stop (old)
+ # disables direct recycling for old
+ -> start (new)
+ # fail!!
+ -> start (old)
+ # go back to old pp but direct recycling is lost :(
+ -> mem_free (new)
+
+The new helper is idempotent to make the life easier for drivers,
+which can operate in HDS mode and support zero-copy Rx.
+The driver can call the helper twice whether there are two pools
+or it has multiple references to a single pool.
+
+Fixes: 40eca00ae605 ("bnxt_en: unlink page pool when stopping Rx queue")
+Tested-by: David Wei <dw@davidwei.uk>
+Link: https://patch.msgid.link/20250805003654.2944974-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 ++++++-
+ include/net/page_pool/types.h | 2 ++
+ net/core/page_pool.c | 29 +++++++++++++++++++++++
+ 3 files changed, 39 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+index 243cb13cb01c..8d950b43846e 100644
+--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+@@ -3810,7 +3810,6 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
+ if (BNXT_RX_PAGE_MODE(bp))
+ pp.pool_size += bp->rx_ring_size;
+ pp.nid = numa_node;
+- pp.napi = &rxr->bnapi->napi;
+ pp.netdev = bp->dev;
+ pp.dev = &bp->pdev->dev;
+ pp.dma_dir = bp->rx_dir;
+@@ -3842,6 +3841,12 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
+ return PTR_ERR(pool);
+ }
+
++static void bnxt_enable_rx_page_pool(struct bnxt_rx_ring_info *rxr)
++{
++ page_pool_enable_direct_recycling(rxr->head_pool, &rxr->bnapi->napi);
++ page_pool_enable_direct_recycling(rxr->page_pool, &rxr->bnapi->napi);
++}
++
+ static int bnxt_alloc_rx_agg_bmap(struct bnxt *bp, struct bnxt_rx_ring_info *rxr)
+ {
+ u16 mem_size;
+@@ -3880,6 +3885,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
+ rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node);
+ if (rc)
+ return rc;
++ bnxt_enable_rx_page_pool(rxr);
+
+ rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i, 0);
+ if (rc < 0)
+@@ -16042,6 +16048,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx)
+ goto err_reset;
+ }
+
++ bnxt_enable_rx_page_pool(rxr);
+ napi_enable_locked(&bnapi->napi);
+ bnxt_db_nq_arm(bp, &cpr->cp_db, cpr->cp_raw_cons);
+
+diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
+index 431b593de709..1509a536cb85 100644
+--- a/include/net/page_pool/types.h
++++ b/include/net/page_pool/types.h
+@@ -265,6 +265,8 @@ struct page_pool *page_pool_create_percpu(const struct page_pool_params *params,
+ struct xdp_mem_info;
+
+ #ifdef CONFIG_PAGE_POOL
++void page_pool_enable_direct_recycling(struct page_pool *pool,
++ struct napi_struct *napi);
+ void page_pool_disable_direct_recycling(struct page_pool *pool);
+ void page_pool_destroy(struct page_pool *pool);
+ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
+diff --git a/net/core/page_pool.c b/net/core/page_pool.c
+index ba7cf3e3c32f..368412baad26 100644
+--- a/net/core/page_pool.c
++++ b/net/core/page_pool.c
+@@ -1201,6 +1201,35 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
+ pool->xdp_mem_id = mem->id;
+ }
+
++/**
++ * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI
++ * @pool: page pool to modify
++ * @napi: NAPI instance to associate the page pool with
++ *
++ * Associate a page pool with a NAPI instance for lockless page recycling.
++ * This is useful when a new page pool has to be added to a NAPI instance
++ * without disabling that NAPI instance, to mark the point at which control
++ * path "hands over" the page pool to the NAPI instance. In most cases driver
++ * can simply set the @napi field in struct page_pool_params, and does not
++ * have to call this helper.
++ *
++ * The function is idempotent, but does not implement any refcounting.
++ * Single page_pool_disable_direct_recycling() will disable recycling,
++ * no matter how many times enable was called.
++ */
++void page_pool_enable_direct_recycling(struct page_pool *pool,
++ struct napi_struct *napi)
++{
++ if (READ_ONCE(pool->p.napi) == napi)
++ return;
++ WARN_ON(!napi || pool->p.napi);
++
++ mutex_lock(&page_pools_lock);
++ WRITE_ONCE(pool->p.napi, napi);
++ mutex_unlock(&page_pools_lock);
++}
++EXPORT_SYMBOL(page_pool_enable_direct_recycling);
++
+ void page_pool_disable_direct_recycling(struct page_pool *pool)
+ {
+ /* Disable direct recycling based on pool->cpuid.
+--
+2.50.1
+
--- /dev/null
+From 03536af7c40cc0517826bb09f63fd02513d39540 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 12:08:32 +0800
+Subject: net: phy: nxp-c45-tja11xx: fix the PHY ID mismatch issue when using
+ C45
+
+From: Clark Wang <xiaoning.wang@nxp.com>
+
+[ Upstream commit 8ee90742cf29427683294a6a80f1e2b7f4af1cff ]
+
+TJA1103/04/20/21 support both C22 and C45 accessing methods.
+
+The TJA11xx driver has implemented the match_phy_device() API.
+However, it does not handle the C45 ID. If C45 was used to access
+TJA11xx, match_phy_device() would always return false due to
+phydev->phy_id only used by C22 being empty, resulting in the
+generic phy driver being used for TJA11xx PHYs.
+
+Therefore, check phydev->c45_ids.device_ids[MDIO_MMD_PMAPMD] when
+using C45.
+
+Fixes: 1b76b2497aba ("net: phy: nxp-c45-tja11xx: simplify .match_phy_device OP")
+Signed-off-by: Clark Wang <xiaoning.wang@nxp.com>
+Link: https://patch.msgid.link/20250807040832.2455306-1-xiaoning.wang@nxp.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/phy/nxp-c45-tja11xx.c | 23 +++++++++++++----------
+ 1 file changed, 13 insertions(+), 10 deletions(-)
+
+diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c
+index 4c6d905f0a9f..87adb6508017 100644
+--- a/drivers/net/phy/nxp-c45-tja11xx.c
++++ b/drivers/net/phy/nxp-c45-tja11xx.c
+@@ -1965,24 +1965,27 @@ static int nxp_c45_macsec_ability(struct phy_device *phydev)
+ return macsec_ability;
+ }
+
++static bool tja11xx_phy_id_compare(struct phy_device *phydev,
++ const struct phy_driver *phydrv)
++{
++ u32 id = phydev->is_c45 ? phydev->c45_ids.device_ids[MDIO_MMD_PMAPMD] :
++ phydev->phy_id;
++
++ return phy_id_compare(id, phydrv->phy_id, phydrv->phy_id_mask);
++}
++
+ static int tja11xx_no_macsec_match_phy_device(struct phy_device *phydev,
+ const struct phy_driver *phydrv)
+ {
+- if (!phy_id_compare(phydev->phy_id, phydrv->phy_id,
+- phydrv->phy_id_mask))
+- return 0;
+-
+- return !nxp_c45_macsec_ability(phydev);
++ return tja11xx_phy_id_compare(phydev, phydrv) &&
++ !nxp_c45_macsec_ability(phydev);
+ }
+
+ static int tja11xx_macsec_match_phy_device(struct phy_device *phydev,
+ const struct phy_driver *phydrv)
+ {
+- if (!phy_id_compare(phydev->phy_id, phydrv->phy_id,
+- phydrv->phy_id_mask))
+- return 0;
+-
+- return nxp_c45_macsec_ability(phydev);
++ return tja11xx_phy_id_compare(phydev, phydrv) &&
++ nxp_c45_macsec_ability(phydev);
+ }
+
+ static const struct nxp_c45_regmap tja1120_regmap = {
+--
+2.50.1
+
--- /dev/null
+From c53fdb12017debb6f4da4eac9248b6aa4ecc5bb5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 09:36:55 +0000
+Subject: net: stmmac: thead: Get and enable APB clock on initialization
+
+From: Yao Zi <ziyao@disroot.org>
+
+[ Upstream commit 4cc339ce482ba78589a2d5cbe1c84b735d263383 ]
+
+It's necessary to adjust the MAC TX clock when the link speed changes,
+but it's observed that such adjustment always fails on the TH1520 SoC, and
+reading back from the APB glue registers that control clock generation
+results in garbage, causing a broken link.
+
+With some testing, it's found that a clock must be ungated to access the APB
+glue registers. Without any consumer, the clock is automatically
+disabled during late kernel startup. Let's get and enable it if it's
+described in devicetree.
+
+For backward compatibility with older devicetrees, probing won't fail if
+the APB clock isn't found. In this case, we emit a warning since the
+link will break if the speed changes.
+
+Fixes: 33a1a01e3afa ("net: stmmac: Add glue layer for T-HEAD TH1520 SoC")
+Signed-off-by: Yao Zi <ziyao@disroot.org>
+Tested-by: Drew Fustini <fustini@kernel.org>
+Reviewed-by: Drew Fustini <fustini@kernel.org>
+Link: https://patch.msgid.link/20250808093655.48074-4-ziyao@disroot.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c | 14 ++++++++++++++
+ 1 file changed, 14 insertions(+)
+
+diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
+index c72ee759aae5..f2946bea0bc2 100644
+--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
++++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
+@@ -211,6 +211,7 @@ static int thead_dwmac_probe(struct platform_device *pdev)
+ struct stmmac_resources stmmac_res;
+ struct plat_stmmacenet_data *plat;
+ struct thead_dwmac *dwmac;
++ struct clk *apb_clk;
+ void __iomem *apb;
+ int ret;
+
+@@ -224,6 +225,19 @@ static int thead_dwmac_probe(struct platform_device *pdev)
+ return dev_err_probe(&pdev->dev, PTR_ERR(plat),
+ "dt configuration failed\n");
+
++ /*
++ * The APB clock is essential for accessing glue registers. However,
++ * old devicetrees don't describe it correctly. We continue to probe
++ * and emit a warning if it isn't present.
++ */
++ apb_clk = devm_clk_get_enabled(&pdev->dev, "apb");
++ if (PTR_ERR(apb_clk) == -ENOENT)
++ dev_warn(&pdev->dev,
++ "cannot get apb clock, link may break after speed changes\n");
++ else if (IS_ERR(apb_clk))
++ return dev_err_probe(&pdev->dev, PTR_ERR(apb_clk),
++ "failed to get apb clock\n");
++
+ dwmac = devm_kzalloc(&pdev->dev, sizeof(*dwmac), GFP_KERNEL);
+ if (!dwmac)
+ return -ENOMEM;
+--
+2.50.1
+
--- /dev/null
+From 10472084e5247a182caea03db3b210864d598363 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Aug 2025 07:23:18 -0700
+Subject: net: ti: icss-iep: Fix incorrect type for return value in
+ extts_enable()
+
+From: Alok Tiwari <alok.a.tiwari@oracle.com>
+
+[ Upstream commit 5f1d1d14db7dabce9c815e7d7cd351f8d58b8585 ]
+
+The variable ret in icss_iep_extts_enable() was incorrectly declared
+as u32, while the function returns int and may return negative error
+codes. This will cause sign extension issues and incorrect error
+propagation. Update ret to be int to fix error handling.
+
+This change corrects the declaration to avoid potential type mismatch.
+
+Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver")
+Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Link: https://patch.msgid.link/20250805142323.1949406-1-alok.a.tiwari@oracle.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/ti/icssg/icss_iep.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c
+index 50bfbc2779e4..d8c9fe1d98c4 100644
+--- a/drivers/net/ethernet/ti/icssg/icss_iep.c
++++ b/drivers/net/ethernet/ti/icssg/icss_iep.c
+@@ -621,7 +621,8 @@ static int icss_iep_pps_enable(struct icss_iep *iep, int on)
+
+ static int icss_iep_extts_enable(struct icss_iep *iep, u32 index, int on)
+ {
+- u32 val, cap, ret = 0;
++ u32 val, cap;
++ int ret = 0;
+
+ mutex_lock(&iep->ptp_clk_mutex);
+
+--
+2.50.1
+
--- /dev/null
+From be96c5c83b7e3f06515946d993a3bbb28a66ad22 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Aug 2025 23:08:12 +0530
+Subject: net: ti: icssg-prueth: Fix emac link speed handling
+
+From: MD Danish Anwar <danishanwar@ti.com>
+
+[ Upstream commit 06feac15406f4f66f4c0c6ea60b10d44775d4133 ]
+
+When link settings are changed, emac->speed is populated by
+emac_adjust_link(). The link speed and other settings are then written into
+the DRAM. However, if both ports are brought down after this and brought up
+again, or if the operating mode is changed and a firmware reload is needed,
+the DRAM is cleared by icssg_config(). As a result, the link settings are
+lost.
+
+Fix this by calling emac_adjust_link() after icssg_config(). This
+re-populates the settings in the DRAM after a new firmware load.
+
+Fixes: 9facce84f406 ("net: ti: icssg-prueth: Fix firmware load sequence.")
+Signed-off-by: MD Danish Anwar <danishanwar@ti.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Message-ID: <20250805173812.2183161-1-danishanwar@ti.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/ti/icssg/icssg_prueth.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
+index 2f5c4335dec3..008d77727400 100644
+--- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c
++++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
+@@ -50,6 +50,8 @@
+ /* CTRLMMR_ICSSG_RGMII_CTRL register bits */
+ #define ICSSG_CTRL_RGMII_ID_MODE BIT(24)
+
++static void emac_adjust_link(struct net_device *ndev);
++
+ static int emac_get_tx_ts(struct prueth_emac *emac,
+ struct emac_tx_ts_response *rsp)
+ {
+@@ -266,6 +268,10 @@ static int prueth_emac_common_start(struct prueth *prueth)
+ ret = icssg_config(prueth, emac, slice);
+ if (ret)
+ goto disable_class;
++
++ mutex_lock(&emac->ndev->phydev->lock);
++ emac_adjust_link(emac->ndev);
++ mutex_unlock(&emac->ndev->phydev->lock);
+ }
+
+ ret = prueth_emac_start(prueth);
+--
+2.50.1
+
--- /dev/null
+From 2a6a033f9923cd580fdd3ad1f09e4155d839f3ea Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+ if (res < 0) {
+ nf_conntrack_get(&ct->ct_general); // HERE
+ cb->args[1] = (unsigned long)ct;
+ ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, then the refcount of ct was already incremented.
+This 2nd increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+prevents cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
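+
+A sketch of the two dump rounds (call sites simplified):
+
+    nf_conntrack_get(&ct->ct_general);  /* round N: ct stored as 'last'   */
+    ...
+    nf_conntrack_get(&ct->ct_general);  /* round N+1: ct == last, 2nd get */
+    ...
+    nf_ct_put(last);                    /* drops only one of the two refs */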
+
+This can be reproduced by running conntrack_resize.sh selftest in a loop.
+It takes ~20 minutes for me on a preemptible kernel on average before
+I see a runaway kworker spinning in nf_conntrack_cleanup_net_list.
+
+One fix would be to change this to:
+ if (res < 0) {
+ if (ct != last)
+ nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
+A followup patch will do the same for ctnetlink_exp_dump_table; it
+looks to me as if this has the same problem. Like
+ctnetlink_dump_table, we only need a 'skip hint', not the actual
+object, so we can apply the same cookie strategy there as well.
+
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 2cc0fde23344..5fdcae45e0bc 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -884,8 +884,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
+
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+- if (cb->args[1])
+- nf_ct_put((struct nf_conn *)cb->args[1]);
+ kfree(cb->data);
+ return 0;
+ }
+@@ -1208,19 +1206,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+ return 0;
+ }
+
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++ unsigned long id = nf_ct_get_id(ct);
++
++ return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+ unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
+ struct net *net = sock_net(skb->sk);
+- struct nf_conn *ct, *last;
++ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct nf_conn *nf_ct_evict[8];
++ struct nf_conn *ct;
+ int res, i;
+ spinlock_t *lockp;
+
+- last = (struct nf_conn *)cb->args[1];
+ i = 0;
+
+ local_bh_disable();
+@@ -1257,7 +1262,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ continue;
+
+ if (cb->args[1]) {
+- if (ct != last)
++ if (ctnetlink_get_id(ct) != last_id)
+ continue;
+ cb->args[1] = 0;
+ }
+@@ -1270,8 +1275,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct, true, flags);
+ if (res < 0) {
+- nf_conntrack_get(&ct->ct_general);
+- cb->args[1] = (unsigned long)ct;
++ cb->args[1] = ctnetlink_get_id(ct);
+ spin_unlock(lockp);
+ goto out;
+ }
+@@ -1284,12 +1288,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ }
+ out:
+ local_bh_enable();
+- if (last) {
++ if (last_id) {
+ /* nf ct hash resize happened, now clear the leftover. */
+- if ((struct nf_conn *)cb->args[1] == last)
++ if (cb->args[1] == last_id)
+ cb->args[1] = 0;
+-
+- nf_ct_put(last);
+ }
+
+ while (i) {
+--
+2.50.1
+
--- /dev/null
+From b5625a8250316d9830487dc5d90c7f8bd77a3a8e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:09 +0200
+Subject: netfilter: ctnetlink: remove refcounting in expectation dumpers
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit 1492e3dcb2be3aa46d1963da96aa9593e4e4db5a ]
+
+Same pattern as previous patch: do not keep the expectation object
+alive via refcount, only store a cookie value and then use that
+as the skip hint for dump resumption.
+
+AFAICS this has the same issue as the one resolved in the conntrack
+dumper: when we do
+ if (!refcount_inc_not_zero(&exp->use))
+
+to increment the refcount, there is a chance that exp == last, which
+causes a double-increment of the refcount and subsequent memory leak.
+
+Fixes: cf6994c2b981 ("[NETFILTER]: nf_conntrack_netlink: sync expectation dumping with conntrack table dumping")
+Fixes: e844a928431f ("netfilter: ctnetlink: allow to dump expectation per master conntrack")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 41 ++++++++++++----------------
+ 1 file changed, 17 insertions(+), 24 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 5fdcae45e0bc..2273ead8102f 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -3171,23 +3171,27 @@ ctnetlink_expect_event(unsigned int events, const struct nf_exp_event *item)
+ return 0;
+ }
+ #endif
+-static int ctnetlink_exp_done(struct netlink_callback *cb)
++
++static unsigned long ctnetlink_exp_id(const struct nf_conntrack_expect *exp)
+ {
+- if (cb->args[1])
+- nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]);
+- return 0;
++ unsigned long id = (unsigned long)exp;
++
++ id += nf_ct_get_id(exp->master);
++ id += exp->class;
++
++ return id ? id : 1;
+ }
+
+ static int
+ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+ struct net *net = sock_net(skb->sk);
+- struct nf_conntrack_expect *exp, *last;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ u_int8_t l3proto = nfmsg->nfgen_family;
++ unsigned long last_id = cb->args[1];
++ struct nf_conntrack_expect *exp;
+
+ rcu_read_lock();
+- last = (struct nf_conntrack_expect *)cb->args[1];
+ for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
+ restart:
+ hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]],
+@@ -3199,7 +3203,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ continue;
+
+ if (cb->args[1]) {
+- if (exp != last)
++ if (ctnetlink_exp_id(exp) != last_id)
+ continue;
+ cb->args[1] = 0;
+ }
+@@ -3208,9 +3212,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_EXP_NEW,
+ exp) < 0) {
+- if (!refcount_inc_not_zero(&exp->use))
+- continue;
+- cb->args[1] = (unsigned long)exp;
++ cb->args[1] = ctnetlink_exp_id(exp);
+ goto out;
+ }
+ }
+@@ -3221,32 +3223,30 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ }
+ out:
+ rcu_read_unlock();
+- if (last)
+- nf_ct_expect_put(last);
+-
+ return skb->len;
+ }
+
+ static int
+ ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+- struct nf_conntrack_expect *exp, *last;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ struct nf_conn *ct = cb->data;
+ struct nf_conn_help *help = nfct_help(ct);
+ u_int8_t l3proto = nfmsg->nfgen_family;
++ unsigned long last_id = cb->args[1];
++ struct nf_conntrack_expect *exp;
+
+ if (cb->args[0])
+ return 0;
+
+ rcu_read_lock();
+- last = (struct nf_conntrack_expect *)cb->args[1];
++
+ restart:
+ hlist_for_each_entry_rcu(exp, &help->expectations, lnode) {
+ if (l3proto && exp->tuple.src.l3num != l3proto)
+ continue;
+ if (cb->args[1]) {
+- if (exp != last)
++ if (ctnetlink_exp_id(exp) != last_id)
+ continue;
+ cb->args[1] = 0;
+ }
+@@ -3254,9 +3254,7 @@ ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_EXP_NEW,
+ exp) < 0) {
+- if (!refcount_inc_not_zero(&exp->use))
+- continue;
+- cb->args[1] = (unsigned long)exp;
++ cb->args[1] = ctnetlink_exp_id(exp);
+ goto out;
+ }
+ }
+@@ -3267,9 +3265,6 @@ ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ cb->args[0] = 1;
+ out:
+ rcu_read_unlock();
+- if (last)
+- nf_ct_expect_put(last);
+-
+ return skb->len;
+ }
+
+@@ -3288,7 +3283,6 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
+ struct nf_conntrack_zone zone;
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_exp_ct_dump_table,
+- .done = ctnetlink_exp_done,
+ };
+
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
+@@ -3338,7 +3332,6 @@ static int ctnetlink_get_expect(struct sk_buff *skb,
+ else {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_exp_dump_table,
+- .done = ctnetlink_exp_done,
+ };
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
+ }
+--
+2.50.1
+
--- /dev/null
+From 41a492b44d333b69f99787abd80e483c77ed08f0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 13 Aug 2025 02:38:50 +0200
+Subject: netfilter: nf_tables: reject duplicate device on updates
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+[ Upstream commit cf5fb87fcdaaaafec55dcc0dc5a9e15ead343973 ]
+
+A chain/flowtable update with duplicated devices in the same batch is
+possible. Unfortunately, the netdev event path only removes the first
+device that is found, leaving the hook of the duplicated device
+unregistered.
+
+Check if a duplicated device exists in the transaction batch, bail out
+with EEXIST in such case.
+
+WARNING is hit when unregistering the hook:
+
+ [49042.221275] WARNING: CPU: 4 PID: 8425 at net/netfilter/core.c:340 nf_hook_entry_head+0xaa/0x150
+ [49042.221375] CPU: 4 UID: 0 PID: 8425 Comm: nft Tainted: G S 6.16.0+ #170 PREEMPT(full)
+ [...]
+ [49042.221382] RIP: 0010:nf_hook_entry_head+0xaa/0x150
+
+Fixes: 78d9f48f7f44 ("netfilter: nf_tables: add devices to existing flowtable")
+Fixes: b9703ed44ffb ("netfilter: nf_tables: support for adding new devices to an existing netdev chain")
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_tables_api.c | 30 ++++++++++++++++++++++++++++++
+ 1 file changed, 30 insertions(+)
+
+diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
+index 064f18792d98..46ca725d6538 100644
+--- a/net/netfilter/nf_tables_api.c
++++ b/net/netfilter/nf_tables_api.c
+@@ -2790,6 +2790,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
+ struct nft_chain *chain = ctx->chain;
+ struct nft_chain_hook hook = {};
+ struct nft_stats __percpu *stats = NULL;
++ struct nftables_pernet *nft_net;
+ struct nft_hook *h, *next;
+ struct nf_hook_ops *ops;
+ struct nft_trans *trans;
+@@ -2832,6 +2833,20 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
+ if (nft_hook_list_find(&basechain->hook_list, h)) {
+ list_del(&h->list);
+ nft_netdev_hook_free(h);
++ continue;
++ }
++
++ nft_net = nft_pernet(ctx->net);
++ list_for_each_entry(trans, &nft_net->commit_list, list) {
++ if (trans->msg_type != NFT_MSG_NEWCHAIN ||
++ trans->table != ctx->table ||
++ !nft_trans_chain_update(trans))
++ continue;
++
++ if (nft_hook_list_find(&nft_trans_chain_hooks(trans), h)) {
++ nft_chain_release_hook(&hook);
++ return -EEXIST;
++ }
+ }
+ }
+ } else {
+@@ -9033,6 +9048,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
+ {
+ const struct nlattr * const *nla = ctx->nla;
+ struct nft_flowtable_hook flowtable_hook;
++ struct nftables_pernet *nft_net;
+ struct nft_hook *hook, *next;
+ struct nf_hook_ops *ops;
+ struct nft_trans *trans;
+@@ -9049,6 +9065,20 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
+ if (nft_hook_list_find(&flowtable->hook_list, hook)) {
+ list_del(&hook->list);
+ nft_netdev_hook_free(hook);
++ continue;
++ }
++
++ nft_net = nft_pernet(ctx->net);
++ list_for_each_entry(trans, &nft_net->commit_list, list) {
++ if (trans->msg_type != NFT_MSG_NEWFLOWTABLE ||
++ trans->table != ctx->table ||
++ !nft_trans_flowtable_update(trans))
++ continue;
++
++ if (nft_hook_list_find(&nft_trans_flowtable_hooks(trans), hook)) {
++ err = -EEXIST;
++ goto err_flowtable_update_hook;
++ }
+ }
+ }
+
+--
+2.50.1
+
--- /dev/null
+From 6618a73f0c48d3ab0731910efbb5972d5afde30f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 28 Jul 2025 15:26:49 +0900
+Subject: ptp: prevent possible ABBA deadlock in ptp_clock_freerun()
+
+From: Jeongjun Park <aha310510@gmail.com>
+
+[ Upstream commit 2efe41234dbd0a83fdb7cd38226c2f70039a2cd3 ]
+
+syzbot reported the following ABBA deadlock:
+
+ CPU0 CPU1
+ ---- ----
+ n_vclocks_store()
+ lock(&ptp->n_vclocks_mux) [1]
+ (physical clock)
+ pc_clock_adjtime()
+ lock(&clk->rwsem) [2]
+ (physical clock)
+ ...
+ ptp_clock_freerun()
+ ptp_vclock_in_use()
+ lock(&ptp->n_vclocks_mux) [3]
+ (physical clock)
+ ptp_clock_unregister()
+ posix_clock_unregister()
+ lock(&clk->rwsem) [4]
+ (virtual clock)
+
+Since ptp virtual clock is registered only under ptp physical clock, both
+ptp_clock and posix_clock must be physical clocks for ptp_vclock_in_use()
+to lock &ptp->n_vclocks_mux and check ptp->n_vclocks.
+
+However, when unregistering vclocks in n_vclocks_store(), the
+ptp->n_vclocks_mux lock that is taken is a physical clock lock, but the
+clk->rwsem of ptp_clock_unregister(), called through
+device_for_each_child_reverse(), is a virtual clock lock.
+
+Therefore, clk->rwsem used in CPU0 and clk->rwsem used in CPU1 are
+different locks, but lockdep reports a false positive because the
+possibility of deadlock is determined per lock class.
+
+To solve this, lock subclass annotation must be added to the posix_clock
+rwsem of the vclock.
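+
+As an illustrative sketch (PTP_LOCK_VIRTUAL is the subclass introduced
+by this patch, the physical clock keeps the default subclass): both
+rwsems are initialized by the same posix_clock code and hence share one
+lockdep class, so annotating the vclock's rwsem lets lockdep tell the
+two apart:
+
+    /* ptp is the ptp_clock backing the virtual clock */
+    lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL);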
+
+Reported-by: syzbot+7cfb66a237c4a5fb22ad@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=7cfb66a237c4a5fb22ad
+Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion")
+Signed-off-by: Jeongjun Park <aha310510@gmail.com>
+Acked-by: Richard Cochran <richardcochran@gmail.com>
+Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Link: https://patch.msgid.link/20250728062649.469882-1-aha310510@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/ptp/ptp_private.h | 5 +++++
+ drivers/ptp/ptp_vclock.c | 7 +++++++
+ 2 files changed, 12 insertions(+)
+
+diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
+index a6aad743c282..b352df4cd3f9 100644
+--- a/drivers/ptp/ptp_private.h
++++ b/drivers/ptp/ptp_private.h
+@@ -24,6 +24,11 @@
+ #define PTP_DEFAULT_MAX_VCLOCKS 20
+ #define PTP_MAX_CHANNELS 2048
+
++enum {
++ PTP_LOCK_PHYSICAL = 0,
++ PTP_LOCK_VIRTUAL,
++};
++
+ struct timestamp_event_queue {
+ struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS];
+ int head;
+diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
+index 7febfdcbde8b..8ed4b8598924 100644
+--- a/drivers/ptp/ptp_vclock.c
++++ b/drivers/ptp/ptp_vclock.c
+@@ -154,6 +154,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
+ return PTP_VCLOCK_REFRESH_INTERVAL;
+ }
+
++static void ptp_vclock_set_subclass(struct ptp_clock *ptp)
++{
++ lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL);
++}
++
+ static const struct ptp_clock_info ptp_vclock_info = {
+ .owner = THIS_MODULE,
+ .name = "ptp virtual clock",
+@@ -213,6 +218,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock)
+ return NULL;
+ }
+
++ ptp_vclock_set_subclass(vclock->clock);
++
+ timecounter_init(&vclock->tc, &vclock->cc, 0);
+ ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL);
+
+--
+2.50.1
+
--- /dev/null
+From 85ec46b8224074065b1bc397b3fb3ccff51c5ca8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 09:36:56 +0000
+Subject: riscv: dts: thead: Add APB clocks for TH1520 GMACs
+
+From: Yao Zi <ziyao@disroot.org>
+
+[ Upstream commit a7f75e2883c4bd57b12c3be61bb926929adad9c0 ]
+
+Describe perisys-apb4-hclk as the APB clock for TH1520 SoC, which is
+essential for accessing GMAC glue registers.
+
+Fixes: 7e756671a664 ("riscv: dts: thead: Add TH1520 ethernet nodes")
+Signed-off-by: Yao Zi <ziyao@disroot.org>
+Reviewed-by: Drew Fustini <fustini@kernel.org>
+Tested-by: Drew Fustini <fustini@kernel.org>
+Link: https://patch.msgid.link/20250808093655.48074-5-ziyao@disroot.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/riscv/boot/dts/thead/th1520.dtsi | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+diff --git a/arch/riscv/boot/dts/thead/th1520.dtsi b/arch/riscv/boot/dts/thead/th1520.dtsi
+index 1db0054c4e09..93135e0f5a77 100644
+--- a/arch/riscv/boot/dts/thead/th1520.dtsi
++++ b/arch/riscv/boot/dts/thead/th1520.dtsi
+@@ -294,8 +294,9 @@ gmac1: ethernet@ffe7060000 {
+ reg-names = "dwmac", "apb";
+ interrupts = <67 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "macirq";
+- clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC1>;
+- clock-names = "stmmaceth", "pclk";
++ clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC1>,
++ <&clk CLK_PERISYS_APB4_HCLK>;
++ clock-names = "stmmaceth", "pclk", "apb";
+ snps,pbl = <32>;
+ snps,fixed-burst;
+ snps,multicast-filter-bins = <64>;
+@@ -316,8 +317,9 @@ gmac0: ethernet@ffe7070000 {
+ reg-names = "dwmac", "apb";
+ interrupts = <66 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "macirq";
+- clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC0>;
+- clock-names = "stmmaceth", "pclk";
++ clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC0>,
++ <&clk CLK_PERISYS_APB4_HCLK>;
++ clock-names = "stmmaceth", "pclk", "apb";
+ snps,pbl = <32>;
+ snps,fixed-burst;
+ snps,multicast-filter-bins = <64>;
+--
+2.50.1
+
--- /dev/null
+From 3e9fc7914a86dfe7937c516e67ff01c5544bec23 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:40:11 -0400
+Subject: sctp: linearize cloned gso packets in sctp_rcv
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ]
+
+A cloned head skb still shares the frag skbs in its fraglist with the
+original head skb. It's not safe to access these frag skbs.
+
+syzbot reported two use-of-uninitialized-memory bugs caused by this:
+
+ BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+ sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+ sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998
+ sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88
+ sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331
+ sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122
+ __release_sock+0x1da/0x330 net/core/sock.c:3106
+ release_sock+0x6b/0x250 net/core/sock.c:3660
+ sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360
+ sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885
+ sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031
+ inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851
+ sock_sendmsg_nosec net/socket.c:718 [inline]
+
+and
+
+ BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+ sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+ sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88
+ sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331
+ sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148
+ __release_sock+0x1d3/0x330 net/core/sock.c:3213
+ release_sock+0x6b/0x270 net/core/sock.c:3767
+ sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367
+ sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886
+ sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032
+ inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851
+ sock_sendmsg_nosec net/socket.c:712 [inline]
+
+This patch fixes it by linearizing cloned gso packets in sctp_rcv().
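+
+The intent of the new condition, as a sketch (it mirrors the hunk
+below):
+
+    /* non-GSO: linearize as before;
+     * GSO but cloned: linearize, the fraglist skbs are shared;
+     * GSO and not cloned: leave the skb as-is.
+     */
+    if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+        !pskb_may_pull(skb, sizeof(struct sctphdr)))
+            goto discard_it;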
+
+Fixes: 90017accff61 ("sctp: Add GSO support")
+Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com
+Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sctp/input.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sctp/input.c b/net/sctp/input.c
+index 0c0d2757f6f8..6fcdcaeed40e 100644
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -117,7 +117,7 @@ int sctp_rcv(struct sk_buff *skb)
+ * it's better to just linearize it otherwise crc computing
+ * takes longer.
+ */
+- if ((!is_gso && skb_linearize(skb)) ||
++ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+ !pskb_may_pull(skb, sizeof(struct sctphdr)))
+ goto discard_it;
+
+--
+2.50.1
+
block-make-req_op_zone_finish-a-write-operation.patch
mm-memory-tier-fix-abstract-distance-calculation-overflow.patch
mfd-cros_ec-separate-charge-control-probing-from-usb-pd.patch
+habanalabs-fix-uaf-in-export_dmabuf.patch
+mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch
+xfrm-flush-all-states-in-xfrm_state_fini.patch
+xfrm-restore-gso-for-sw-crypto.patch
+xfrm-bring-back-device-check-in-validate_xmit_xfrm.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
+netfilter-ctnetlink-remove-refcounting-in-expectatio.patch
+net-hibmcge-fix-rtnl-deadlock-issue.patch
+net-hibmcge-fix-the-division-by-zero-issue.patch
+net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch
+net-ti-icssg-prueth-fix-emac-link-speed-handling.patch
+net-page_pool-allow-enabling-recycling-late-fix-fals.patch
+net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch
+sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
+net-lapbether-ignore-ops-locked-netdevs.patch
+hamradio-ignore-ops-locked-netdevs.patch
+erofs-fix-block-count-report-when-48-bit-layout-is-o.patch
+intel_idle-allow-loading-acpi-tables-for-any-family.patch
+cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
+net-phy-nxp-c45-tja11xx-fix-the-phy-id-mismatch-issu.patch
+net-mdiobus-release-reset_gpio-in-mdiobus_unregister.patch
+net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch
+riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch
+ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch
+tls-handle-data-disappearing-from-under-the-tls-ulp.patch
+ipvs-fix-estimator-kthreads-preferred-affinity.patch
+netfilter-nf_tables-reject-duplicate-device-on-updat.patch
+bnxt-fill-data-page-pool-with-frags-if-page_size-bnx.patch
+net-kcm-fix-race-condition-in-kcm_unattach.patch
--- /dev/null
+From 370e01dc4b6dee8ec7f8338bd058e954edaed079 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 16:29:06 -0700
+Subject: tls: handle data disappearing from under the TLS ULP
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 6db015fc4b5d5f63a64a193f65d98da3a7fc811d ]
+
+TLS expects that it owns the receive queue of the TCP socket.
+This cannot be guaranteed in case the reader of the TCP socket
+entered before the TLS ULP was installed, or uses some non-standard
+read API (e.g. zerocopy ones). Replace the WARN_ON() and a buggy
+early exit (which leaves anchor pointing to a freed skb) with real
+error handling. Wipe the parsing state and tell the reader to retry.
+
+We already reload the anchor every time we (re)acquire the socket lock,
+so the only condition we need to avoid is an out of bounds read
+(not having enough bytes in the socket for previously parsed record len).
+
+If some data was read from under TLS but there's enough in the queue
+we'll reload and decrypt what is most likely not a valid TLS record.
+This leads to some undefined behavior from the TLS perspective
+(corrupting a stream? missing an alert? missing an attack?) but no
+kernel crash should take place.
+
+Reported-by: William Liu <will@willsroot.io>
+Reported-by: Savino Dicanosa <savy@syst3mfailure.io>
+Link: https://lore.kernel.org/tFjq_kf7sWIG3A7CrCg_egb8CVsT_gsmHAK0_wxDPJXfIzxFAMxqmLwp3MlU5EHiet0AwwJldaaFdgyHpeIUCS-3m3llsmRzp9xIOBR4lAI=@syst3mfailure.io
+Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser")
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://patch.msgid.link/20250807232907.600366-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls.h | 2 +-
+ net/tls/tls_strp.c | 11 ++++++++---
+ net/tls/tls_sw.c | 3 ++-
+ 3 files changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/net/tls/tls.h b/net/tls/tls.h
+index 774859b63f0d..4e077068e6d9 100644
+--- a/net/tls/tls.h
++++ b/net/tls/tls.h
+@@ -196,7 +196,7 @@ void tls_strp_msg_done(struct tls_strparser *strp);
+ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb);
+ void tls_rx_msg_ready(struct tls_strparser *strp);
+
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
+ int tls_strp_msg_cow(struct tls_sw_context_rx *ctx);
+ struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx);
+ int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst);
+diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
+index 095cf31bae0b..d71643b494a1 100644
+--- a/net/tls/tls_strp.c
++++ b/net/tls/tls_strp.c
+@@ -475,7 +475,7 @@ static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len)
+ strp->stm.offset = offset;
+ }
+
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ {
+ struct strp_msg *rxm;
+ struct tls_msg *tlm;
+@@ -484,8 +484,11 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len);
+
+ if (!strp->copy_mode && force_refresh) {
+- if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len))
+- return;
++ if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) {
++ WRITE_ONCE(strp->msg_ready, 0);
++ memset(&strp->stm, 0, sizeof(strp->stm));
++ return false;
++ }
+
+ tls_strp_load_anchor_with_queue(strp, strp->stm.full_len);
+ }
+@@ -495,6 +498,8 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ rxm->offset = strp->stm.offset;
+ tlm = tls_msg(strp->anchor);
+ tlm->control = strp->mark;
++
++ return true;
+ }
+
+ /* Called with lock held on lower socket */
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index 549d1ea01a72..51c98a007dda 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1384,7 +1384,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
+ return sock_intr_errno(timeo);
+ }
+
+- tls_strp_msg_load(&ctx->strp, released);
++ if (unlikely(!tls_strp_msg_load(&ctx->strp, released)))
++ return tls_rx_rec_wait(sk, psock, nonblock, false);
+
+ return 1;
+ }
+--
+2.50.1
+
--- /dev/null
+From db3787d79bafca5f191f3fc9d0d8f261e0c90bba Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, where we can't rely on offloads because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
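+
+A sketch of the resulting check (it mirrors the hunk below):
+
+    /* dst_xfrm() only works while the xfrm dst is still attached;
+     * the secpath survives validate_xmit_skb(), so treat its presence
+     * as "IPsec is involved" too.
+     */
+    need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);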
+
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index e0a6bfa95118..eeac86bacdba 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -224,7 +224,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+ remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+ skb->remcsum_offload = remcsum;
+
+- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+ /* Try to offload checksum if possible */
+ offload_csum = !!(need_csum &&
+ !need_ipsec &&
+--
+2.50.1
+
--- /dev/null
+From 813b078a896269b17b6d904ba2cae892c2cc6d03 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:26 +0200
+Subject: xfrm: bring back device check in validate_xmit_xfrm
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 65f079a6c446a939eefe71e6d5957d5d6365fcf9 ]
+
+This is a partial revert of commit d53dda291bbd993a29b84d358d282076e3d01506.
+
+This change causes traffic using GSO with SW crypto running through a
+NIC capable of HW offload to no longer get segmented during
+validate_xmit_xfrm, and is unrelated to the bonding use case mentioned
+in the commit.
+
+Fixes: d53dda291bbd ("xfrm: Remove unneeded device check from validate_xmit_xfrm")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/xfrm/xfrm_device.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
+index 1f88472aaac0..c7a1f080d2de 100644
+--- a/net/xfrm/xfrm_device.c
++++ b/net/xfrm/xfrm_device.c
+@@ -155,7 +155,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
+ return skb;
+ }
+
+- if (skb_is_gso(skb) && unlikely(xmit_xfrm_check_overflow(skb))) {
++ if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) ||
++ unlikely(xmit_xfrm_check_overflow(skb)))) {
+ struct sk_buff *segs;
+
+ /* Packet got rerouted, fixup features and segment it. */
+--
+2.50.1
+
--- /dev/null
+From 72312704ac548ae1474e05e1017aeb91aceb8a58 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:05:43 +0200
+Subject: xfrm: flush all states in xfrm_state_fini
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 42e42562c9cfcdacf000f1b42284a4fad24f8546 ]
+
+While reverting commit f75a2804da39 ("xfrm: destroy xfrm_state
+synchronously on net exit path"), I incorrectly changed
+xfrm_state_flush's "proto" argument back to IPSEC_PROTO_ANY. This
+reverts some of the changes in commit dbb2483b2a46 ("xfrm: clean up
+xfrm protocol checks"), and leads to some states not being removed
+when we exit the netns.
+
+Pass 0 instead of IPSEC_PROTO_ANY from both xfrm_state_fini and
+xfrm6_tunnel_net_exit, so that xfrm_state_flush deletes all states.
+
+Fixes: 2a198bbec691 ("Revert "xfrm: destroy xfrm_state synchronously on net exit path"")
+Reported-by: syzbot+6641a61fe0e2e89ae8c5@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=6641a61fe0e2e89ae8c5
+Tested-by: syzbot+6641a61fe0e2e89ae8c5@syzkaller.appspotmail.com
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv6/xfrm6_tunnel.c | 2 +-
+ net/xfrm/xfrm_state.c | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
+index 5120a763da0d..0a0eeaed0591 100644
+--- a/net/ipv6/xfrm6_tunnel.c
++++ b/net/ipv6/xfrm6_tunnel.c
+@@ -334,7 +334,7 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net)
+ struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
+ unsigned int i;
+
+- xfrm_state_flush(net, IPSEC_PROTO_ANY, false);
++ xfrm_state_flush(net, 0, false);
+ xfrm_flush_gc();
+
+ for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++)
+diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
+index 97ff756191ba..5f1da305eea8 100644
+--- a/net/xfrm/xfrm_state.c
++++ b/net/xfrm/xfrm_state.c
+@@ -3278,7 +3278,7 @@ void xfrm_state_fini(struct net *net)
+ unsigned int sz;
+
+ flush_work(&net->xfrm.state_hash_work);
+- xfrm_state_flush(net, IPSEC_PROTO_ANY, false);
++ xfrm_state_flush(net, 0, false);
+ flush_work(&xfrm_state_gc_work);
+
+ WARN_ON(!list_empty(&net->xfrm.state_all));
+--
+2.50.1
+
--- /dev/null
+From da846e56cf21a09f2ac3e5528405cfcdb9f956cc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:25 +0200
+Subject: xfrm: restore GSO for SW crypto
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 234d1eff5d4987024be9d40ac07b918a5ae8db1a ]
+
+Commit 49431af6c4ef incorrectly assumes that the GSO path is only used
+by HW offload, but it's also useful for SW crypto.
+
+This patch re-enables GSO for SW crypto. It's not an exact revert, in
+order to preserve the other changes made to xfrm_dev_offload_ok
+afterwards, but it reverts all of its effects.
+
+Fixes: 49431af6c4ef ("xfrm: rely on XFRM offload")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
+Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/xfrm/xfrm_device.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
+index d2819baea414..1f88472aaac0 100644
+--- a/net/xfrm/xfrm_device.c
++++ b/net/xfrm/xfrm_device.c
+@@ -415,10 +415,12 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
+ struct net_device *dev = x->xso.dev;
+ bool check_tunnel_size;
+
+- if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED)
++ if (!x->type_offload ||
++ (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED && x->encap))
+ return false;
+
+- if ((dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) {
++ if ((!dev || dev == xfrm_dst_path(dst)->dev) &&
++ !xdst->child->xfrm) {
+ mtu = xfrm_state_mtu(x, xdst->child_mtu_cached);
+ if (skb->len <= mtu)
+ goto ok;
+@@ -430,6 +432,9 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
+ return false;
+
+ ok:
++ if (!dev)
++ return true;
++
+ check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET &&
+ x->props.mode == XFRM_MODE_TUNNEL;
+ switch (x->props.family) {
+--
+2.50.1
+
--- /dev/null
+From a8a27555c9f7f82114edc7e731fd752b9483bf61 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Aug 2025 17:03:11 +0200
+Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data
+
+From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ]
+
+Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid
+discarding useful information") caused the number of wakeup interrupts
+to increase on an idle system [1], which was not expected to happen
+after merely allowing shallower idle states to be selected by the
+governor in some cases.
+
+However, on the system in question, all of the idle states deeper than
+WFI are rejected by the driver due to a firmware issue [2]. This causes
+the governor to only consider the recent interval duration data
+corresponding to attempts to enter WFI that are successful, and the
+recent intervals table is filled with values lower than the scheduler
+tick period. Consequently, the governor predicts an idle duration
+below the scheduler tick period length and avoids stopping the tick
+more often, which leads to the observed symptom.
+
+Address it by modifying the governor to update the recent intervals
+table also when entering the previously selected idle state fails, so
+it knows that the short idle intervals might have been the minority
+had the selected idle states been actually entered every time.
+
+Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information")
+Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1]
+Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2]
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Tested-by: Christian Loehle <christian.loehle@arm.com>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Christian Loehle <christian.loehle@arm.com>
+Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
+index edd9a8fb9878..f3a071ac3b2a 100644
+--- a/drivers/cpuidle/governors/menu.c
++++ b/drivers/cpuidle/governors/menu.c
+@@ -160,6 +160,14 @@ static inline int performance_multiplier(unsigned int nr_iowaiters)
+
+ static DEFINE_PER_CPU(struct menu_device, menu_devices);
+
++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us)
++{
++ /* Update the repeating-pattern data. */
++ data->intervals[data->interval_ptr++] = interval_us;
++ if (data->interval_ptr >= INTERVALS)
++ data->interval_ptr = 0;
++}
++
+ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
+
+ /*
+@@ -284,6 +292,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ if (data->needs_update) {
+ menu_update(drv, dev);
+ data->needs_update = 0;
++ } else if (!dev->last_residency_ns) {
++ /*
++ * This happens when the driver rejects the previously selected
++ * idle state and returns an error, so update the recent
++ * intervals table to prevent invalid information from being
++ * used going forward.
++ */
++ menu_update_intervals(data, UINT_MAX);
+ }
+
+ nr_iowaiters = nr_iowait_cpu(dev->cpu);
+@@ -553,10 +569,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+
+ data->correction_factor[data->bucket] = new_factor;
+
+- /* update the repeating-pattern data */
+- data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
+- if (data->interval_ptr >= INTERVALS)
+- data->interval_ptr = 0;
++ menu_update_intervals(data, ktime_to_us(measured_ns));
+ }
+
+ /**
+--
+2.50.1
+
--- /dev/null
+From 49fe61784622f7ba647e5db6df74aa4619ee0dd9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Aug 2025 15:37:14 -0400
+Subject: intel_idle: Allow loading ACPI tables for any family
+
+From: Len Brown <len.brown@intel.com>
+
+[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ]
+
+There is no reason to limit intel_idle's loading of ACPI tables to
+family 6. Upcoming Intel processors are not in family 6.
+
+Below "Fixes" really means "applies cleanly until".
+That syntax commit didn't change the previous logic,
+but shows this patch applies back 5 years.
+
+Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros")
+Signed-off-by: Len Brown <len.brown@intel.com>
+Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/idle/intel_idle.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
+index 44842f243f40..6908052dea77 100644
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -1432,7 +1432,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
+ };
+
+ static const struct x86_cpu_id intel_mwait_ids[] __initconst = {
+- X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL),
++ X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL),
+ {}
+ };
+
+--
+2.50.1
+
--- /dev/null
+From 430f336058e516557bd4d0e04e7597307b4b537c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:38 -0700
+Subject: KVM: nVMX: Check vmcs12->guest_ia32_debugctl on nested VM-Enter
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 095686e6fcb4150f0a55b1a25987fad3d8af58d6 ]
+
+Add a consistency check for L2's guest_ia32_debugctl, as KVM only supports
+a subset of hardware functionality, i.e. KVM can't rely on hardware to
+detect illegal/unsupported values. Failure to check the vmcs12 value
+would allow the guest to load any hardware-supported value while running L2.
+
+Take care to exempt BTF and LBR from the validity check in order to match
+KVM's behavior for writes via WRMSR, but without clobbering vmcs12. Even
+if VM_EXIT_SAVE_DEBUG_CONTROLS is set in vmcs12, L1 can reasonably expect
+that vmcs12->guest_ia32_debugctl will not be modified if writes to the MSR
+are being intercepted.
+
+Arguably, KVM _should_ update vmcs12 if VM_EXIT_SAVE_DEBUG_CONTROLS is set
+*and* writes to MSR_IA32_DEBUGCTLMSR are not being intercepted by L1, but
+that would incur non-trivial complexity and wouldn't change the fact that
+KVM's handling of DEBUGCTL is blatantly broken. I.e. the extra complexity
+is not worth carrying.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Co-developed-by: Sean Christopherson <seanjc@google.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-7-seanjc@google.com
+Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/nested.c | 12 ++++++++++--
+ arch/x86/kvm/vmx/vmx.c | 5 ++---
+ arch/x86/kvm/vmx/vmx.h | 3 +++
+ 3 files changed, 15 insertions(+), 5 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index fdf7503491f9..10236ecdad95 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2564,7 +2564,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+ kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
++ vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
++ vmx_get_supported_debugctl(vcpu, false));
+ } else {
+ kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
+@@ -3045,7 +3046,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
+ return -EINVAL;
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
+- CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
++ (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
++ CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
+ return -EINVAL;
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
+@@ -4435,6 +4437,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+ (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
+ (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
+
++ /*
++ * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
++ * Writes to DEBUGCTL that aren't intercepted by L1 are immediately
++ * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
++ * vmcs02 doesn't strictly track vmcs12.
++ */
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
+ kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index aaa767ed170e..32f1a38a1010 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2149,7 +2149,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
+ return (unsigned long)data;
+ }
+
+-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+ {
+ u64 debugctl = 0;
+
+@@ -2168,8 +2168,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+ return debugctl;
+ }
+
+-static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data,
+- bool host_initiated)
++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+ {
+ u64 invalid;
+
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 50d32d830890..5816fdd2dfa8 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -429,6 +429,9 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
+
+ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+
++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
++
+ /*
+ * Note, early Intel manuals have the write-low and read-high bitmap offsets
+ * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and
+--
+2.50.1
+
--- /dev/null
+From dbe73b19d1fdc8dd7627b1c3710fd6f9191a1429 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:24 -0700
+Subject: KVM: nVMX: Defer SVI update to vmcs01 on EOI when L2 is active w/o
+ VID
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Chao Gao <chao.gao@intel.com>
+
+[ Upstream commit 04bc93cf49d16d01753b95ddb5d4f230b809a991 ]
+
+If KVM emulates an EOI for L1's virtual APIC while L2 is active, defer
+updating GUEST_INTERUPT_STATUS.SVI, i.e. the VMCS's cache of the highest
+in-service IRQ, until L1 is active, as vmcs01, not vmcs02, needs to track
+vISR. The missed SVI update for vmcs01 can result in L1 interrupts being
+incorrectly blocked, e.g. if there is a pending interrupt with lower
+priority than the interrupt that was EOI'd.
+
+This bug only affects use cases where L1's vAPIC is effectively passed
+through to L2, e.g. in a pKVM scenario where L2 is L1's deprivileged host,
+as KVM will only emulate an EOI for L1's vAPIC if Virtual Interrupt
+Delivery (VID) is disabled in vmc12, and L1 isn't intercepting L2 accesses
+to its (virtual) APIC page (or if x2APIC is enabled, the EOI MSR).
+
+WARN() if KVM updates L1's ISR while L2 is active with VID enabled, as an
+EOI from L2 is supposed to affect L2's vAPIC, but still defer the update,
+to try to keep L1 alive. Specifically, KVM forwards all APICv-related
+VM-Exits to L1 via nested_vmx_l1_wants_exit():
+
+ case EXIT_REASON_APIC_ACCESS:
+ case EXIT_REASON_APIC_WRITE:
+ case EXIT_REASON_EOI_INDUCED:
+ /*
+ * The controls for "virtualize APIC accesses," "APIC-
+ * register virtualization," and "virtual-interrupt
+ * delivery" only come from vmcs12.
+ */
+ return true;
+
+Fixes: c7c9c56ca26f ("x86, apicv: add virtual interrupt delivery support")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/kvm/20230312180048.1778187-1-jason.cj.chen@intel.com
+Reported-by: Markku Ahvenjärvi <mankku@gmail.com>
+Closes: https://lore.kernel.org/all/20240920080012.74405-1-mankku@gmail.com
+Cc: Janne Karhunen <janne.karhunen@gmail.com>
+Signed-off-by: Chao Gao <chao.gao@intel.com>
+[sean: drop request, handle in VMX, write changelog]
+Tested-by: Chao Gao <chao.gao@intel.com>
+Link: https://lore.kernel.org/r/20241128000010.4051275-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve minor syntactic conflict in lapic.h, account for lack of
+ kvm_x86_call(), drop sanity check due to lack of wants_to_run]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/lapic.c | 11 +++++++++++
+ arch/x86/kvm/lapic.h | 1 +
+ arch/x86/kvm/vmx/nested.c | 5 +++++
+ arch/x86/kvm/vmx/vmx.c | 16 ++++++++++++++++
+ arch/x86/kvm/vmx/vmx.h | 1 +
+ 5 files changed, 34 insertions(+)
+
+diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
+index cbf85a1ffb74..ba1c2a7f74f7 100644
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -803,6 +803,17 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
+ }
+ }
+
++void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu)
++{
++ struct kvm_lapic *apic = vcpu->arch.apic;
++
++ if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active)
++ return;
++
++ static_call(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
++}
++EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr);
++
+ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
+ {
+ /* This may race with setting of irr in __apic_accept_irq() and
+diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
+index 0a0ea4b5dd8c..0dd069b8d6d1 100644
+--- a/arch/x86/kvm/lapic.h
++++ b/arch/x86/kvm/lapic.h
+@@ -124,6 +124,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+ enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu);
++void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu);
+ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+
+ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index d3e346a574f1..fdf7503491f9 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -4900,6 +4900,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+
++ if (vmx->nested.update_vmcs01_hwapic_isr) {
++ vmx->nested.update_vmcs01_hwapic_isr = false;
++ kvm_apic_update_hwapic_isr(vcpu);
++ }
++
+ if ((vm_exit_reason != -1) &&
+ (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index cde01eb1f5e3..4563e7a9a851 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6839,6 +6839,22 @@ static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+ u16 status;
+ u8 old;
+
++ /*
++ * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
++ * is only relevant if and only if Virtual Interrupt Delivery is
++ * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's
++ * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested
++ * VM-Exit, otherwise L1 will run with a stale SVI.
++ */
++ if (is_guest_mode(vcpu)) {
++ /*
++ * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
++ * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
++ */
++ to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
++ return;
++ }
++
+ if (max_isr == -1)
+ max_isr = 0;
+
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 6be1627d888e..88c5b7ebf9d3 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -177,6 +177,7 @@ struct nested_vmx {
+ bool reload_vmcs01_apic_access_page;
+ bool update_vmcs01_cpu_dirty_logging;
+ bool update_vmcs01_apicv_status;
++ bool update_vmcs01_hwapic_isr;
+
+ /*
+ * Enlightened VMCS has been enabled. It does not mean that L1 has to
+--
+2.50.1
+
--- /dev/null
+From 5b61b1298fba19f1de5269df84bb30ea8bbb71f4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:22 -0700
+Subject: KVM: SVM: Set RFLAGS.IF=1 in C code, to get VMRUN out of the STI
+ shadow
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit be45bc4eff33d9a7dae84a2150f242a91a617402 ]
+
+Enable/disable local IRQs, i.e. set/clear RFLAGS.IF, in the common
+svm_vcpu_enter_exit() just after/before guest_state_{enter,exit}_irqoff()
+so that VMRUN is not executed in an STI shadow. AMD CPUs have a quirk
+(some would say "bug"), where the STI shadow bleeds into the guest's
+intr_state field if a #VMEXIT occurs during injection of an event, i.e. if
+the VMRUN doesn't complete before the subsequent #VMEXIT.
+
+The spurious "interrupts masked" state is relatively benign, as it only
+occurs during event injection and is transient. Because KVM is already
+injecting an event, the guest can't be in HLT, and if KVM is querying IRQ
+blocking for injection, then KVM would need to force an immediate exit
+anyways since injecting multiple events is impossible.
+
+However, because KVM copies int_state verbatim from vmcb02 to vmcb12, the
+spurious STI shadow is visible to L1 when running a nested VM, which can
+trip sanity checks, e.g. in VMware's VMM.
+
+Hoist the STI+CLI all the way to C code, as the aforementioned calls to
+guest_state_{enter,exit}_irqoff() already inform lockdep that IRQs are
+enabled/disabled, and taking a fault on VMRUN with RFLAGS.IF=1 is already
+possible. I.e. if there's kernel code that is confused by running with
+RFLAGS.IF=1, then it's already a problem. In practice, since GIF=0 also
+blocks NMIs, the only change in exposure to non-KVM code (relative to
+surrounding VMRUN with STI+CLI) is exception handling code, and except for
+the kvm_rebooting=1 case, all exception in the core VM-Enter/VM-Exit path
+are fatal.
+
+Use the "raw" variants to enable/disable IRQs to avoid tracing in the
+"no instrumentation" code; the guest state helpers also take care of
+tracing IRQ state.
+
+Opportunistically document why KVM needs to do STI in the first place.
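+
+Schematically (simplified, see the hunks below), the ordering changes
+from doing everything in assembly:
+
+    sti                  /* VMRUN executes inside the STI shadow  */
+    vmrun %rax           /* #VMEXIT during injection leaks shadow */
+    cli
+
+to toggling RFLAGS.IF in C around the assembly helper:
+
+    raw_local_irq_enable();   /* STI retires well before VMRUN */
+    __svm_vcpu_run(svm, spec_ctrl_intercepted);
+    raw_local_irq_disable();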
+
+Reported-by: Doug Covelli <doug.covelli@broadcom.com>
+Closes: https://lore.kernel.org/all/CADH9ctBs1YPmE4aCfGPNBwA10cA8RuAk2gO7542DjMZgs4uzJQ@mail.gmail.com
+Fixes: f14eec0a3203 ("KVM: SVM: move more vmentry code to assembly")
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Link: https://lore.kernel.org/r/20250224165442.2338294-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve minor syntactic conflict in __svm_sev_es_vcpu_run()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/svm/svm.c | 14 ++++++++++++++
+ arch/x86/kvm/svm/vmenter.S | 9 +--------
+ 2 files changed, 15 insertions(+), 8 deletions(-)
+
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 86c50747e158..abbb84ddfe02 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4170,6 +4170,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+
+ guest_state_enter_irqoff();
+
++ /*
++ * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
++ * VMRUN controls whether or not physical IRQs are masked (KVM always
++ * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the
++ * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
++ * into guest state if delivery of an event during VMRUN triggers a
++ * #VMEXIT, and the guest_state transitions already tell lockdep that
++ * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of
++ * this path, so IRQs aren't actually unmasked while running host code.
++ */
++ raw_local_irq_enable();
++
+ amd_clear_divider();
+
+ if (sev_es_guest(vcpu->kvm))
+@@ -4177,6 +4189,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+ else
+ __svm_vcpu_run(svm, spec_ctrl_intercepted);
+
++ raw_local_irq_disable();
++
+ guest_state_exit_irqoff();
+ }
+
+diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
+index 56fe34d9397f..81ecb9e1101d 100644
+--- a/arch/x86/kvm/svm/vmenter.S
++++ b/arch/x86/kvm/svm/vmenter.S
+@@ -171,12 +171,8 @@ SYM_FUNC_START(__svm_vcpu_run)
+ VM_CLEAR_CPU_BUFFERS
+
+ /* Enter guest mode */
+- sti
+-
+ 3: vmrun %_ASM_AX
+ 4:
+- cli
+-
+ /* Pop @svm to RAX while it's the only available register. */
+ pop %_ASM_AX
+
+@@ -341,11 +337,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
+ VM_CLEAR_CPU_BUFFERS
+
+ /* Enter guest mode */
+- sti
+-
+ 1: vmrun %_ASM_AX
+-
+-2: cli
++2:
+
+ /* Pop @svm to RDI, guest registers have been saved already. */
+ pop %_ASM_DI
+--
+2.50.1
+
--- /dev/null
+From f4b282d7e8425cc2b1c48f385fa3c049a29e137b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:36 -0700
+Subject: KVM: VMX: Allow guest to set DEBUGCTL.RTM_DEBUG if RTM is supported
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 17ec2f965344ee3fd6620bef7ef68792f4ac3af0 ]
+
+Let the guest set DEBUGCTL.RTM_DEBUG if RTM is supported according to the
+guest CPUID model, as debug support is supposed to be available if RTM is
+supported, and there are no known downsides to letting the guest debug RTM
+aborts.
+
+Note, there are no known bug reports related to RTM_DEBUG, the primary
+motivation is to reduce the probability of breaking existing guests when a
+future change adds a missing consistency check on vmcs12.GUEST_DEBUGCTL
+(KVM currently lets L2 run with whatever hardware supports; whoops).
+
+Note #2, KVM already emulates DR6.RTM, and doesn't restrict access to
+DR7.RTM.
+
+Fixes: 83c529151ab0 ("KVM: x86: expose Intel cpu new features (HLE, RTM) to guest")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-5-seanjc@google.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/msr-index.h | 1 +
+ arch/x86/kvm/vmx/vmx.c | 4 ++++
+ 2 files changed, 5 insertions(+)
+
+diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
+index 033855457581..723e48b57bd0 100644
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -380,6 +380,7 @@
+ #define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12)
+ #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
+ #define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
++#define DEBUGCTLMSR_RTM_DEBUG BIT(15)
+
+ #define MSR_PEBS_FRONTEND 0x000003f7
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 08ca218ee858..359c3b7f52a1 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2161,6 +2161,10 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+ (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
+ debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+
++ if (boot_cpu_has(X86_FEATURE_RTM) &&
++ (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_RTM)))
++ debugctl |= DEBUGCTLMSR_RTM_DEBUG;
++
+ return debugctl;
+ }
+
+--
+2.50.1
+
--- /dev/null
+From b170b1c7fa1f5907611a190f0e1c6fa6d1ae712e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:37 -0700
+Subject: KVM: VMX: Extract checking of guest's DEBUGCTL into helper
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 8a4351ac302cd8c19729ba2636acfd0467c22ae8 ]
+
+Move VMX's logic to check DEBUGCTL values into a standalone helper so that
+the code can be used by nested VM-Enter to apply the same logic to the
+value being loaded from vmcs12.
+
+KVM needs to explicitly check vmcs12->guest_ia32_debugctl on nested
+VM-Enter, as hardware may support features that KVM does not, i.e. relying
+on hardware to detect invalid guest state will result in false negatives.
+Unfortunately, that means applying KVM's funky suppression of BTF and LBR
+to vmcs12 so as not to break existing guests.
+
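+As a hedged sketch only (the actual consistency check lands in a later
+patch, and this snippet is not part of this change), the new helper could
+be applied to the vmcs12 value along these lines:
+
+  /* Illustrative use on nested VM-Enter; error handling is simplified. */
+  if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
+      !vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))
+          return -EINVAL;
+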
+No functional change intended.
+
+Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-6-seanjc@google.com
+Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 29 +++++++++++++++++------------
+ 1 file changed, 17 insertions(+), 12 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 359c3b7f52a1..aaa767ed170e 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2168,6 +2168,19 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
+ return debugctl;
+ }
+
++static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data,
++ bool host_initiated)
++{
++ u64 invalid;
++
++ invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
++ if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
++ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
++ invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
++ }
++ return !invalid;
++}
++
+ /*
+ * Writes msr value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+@@ -2236,19 +2249,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ }
+ vmcs_writel(GUEST_SYSENTER_ESP, data);
+ break;
+- case MSR_IA32_DEBUGCTLMSR: {
+- u64 invalid;
+-
+- invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+- if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
+- kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
+- data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+- invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+- }
+-
+- if (invalid)
++ case MSR_IA32_DEBUGCTLMSR:
++ if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
+ return 1;
+
++ data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
++
+ if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
+ VM_EXIT_SAVE_DEBUG_CONTROLS)
+ get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+@@ -2258,7 +2264,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ (data & DEBUGCTLMSR_LBR))
+ intel_pmu_create_guest_lbr_event(vcpu);
+ return 0;
+- }
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated &&
+--
+2.50.1
+
--- /dev/null
+From 10389ae08622b1effe126e28b1a647b66752a860 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:30 -0700
+Subject: KVM: VMX: Handle forced exit due to preemption timer in fastpath
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 11776aa0cfa7d007ad1799b1553bdcbd830e5010 ]
+
+Handle VMX preemption timer VM-Exits due to KVM forcing an exit in the
+exit fastpath, i.e. avoid calling back into handle_preemption_timer() for
+the same exit. There is no work to be done for forced exits, as the name
+suggests, the goal is purely to get control back in KVM.
+
+In addition to shaving a few cycles, this will allow cleanly separating
+handle_fastpath_preemption_timer() from handle_preemption_timer(), e.g.
+it's not immediately obvious why _apparently_ calling
+handle_fastpath_preemption_timer() twice on a "slow" exit is necessary:
+the "slow" call is necessary to handle exits from L2, which are excluded
+from the fastpath by vmx_vcpu_run().
+
+Link: https://lore.kernel.org/r/20240110012705.506918-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 32b792387271..631fdd4a575a 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6027,12 +6027,15 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
+ return EXIT_FASTPATH_REENTER_GUEST;
+
+- if (!vmx->req_immediate_exit) {
+- kvm_lapic_expired_hv_timer(vcpu);
+- return EXIT_FASTPATH_REENTER_GUEST;
+- }
++ /*
++ * If the timer expired because KVM used it to force an immediate exit,
++ * then mission accomplished.
++ */
++ if (vmx->req_immediate_exit)
++ return EXIT_FASTPATH_EXIT_HANDLED;
+
+- return EXIT_FASTPATH_NONE;
++ kvm_lapic_expired_hv_timer(vcpu);
++ return EXIT_FASTPATH_REENTER_GUEST;
+ }
+
+ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+--
+2.50.1
+
--- /dev/null
+From 346070b6afc211b7d9c548666678021841dbbc67 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:32 -0700
+Subject: KVM: VMX: Handle KVM-induced preemption timer exits in fastpath for
+ L2
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 7b3d1bbf8d68d76fb21210932a5e8ed8ea80dbcc ]
+
+Eat VMX preemption timer exits in the fastpath regardless of whether L1 or
+L2 is active. The VM-Exit is 100% KVM-induced, i.e. there is nothing
+directly related to the exit that KVM needs to do on behalf of the guest,
+thus there is no reason to wait until the slow path to do nothing.
+
+Opportunistically add comments explaining why preemption timer exits for
+emulating the guest's APIC timer need to go down the slow path.
+
+Link: https://lore.kernel.org/r/20240110012705.506918-6-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 22 ++++++++++++++++++++--
+ 1 file changed, 20 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 4c991d514015..0ecc0e996386 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6034,13 +6034,26 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+ if (vmx->req_immediate_exit)
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
++ /*
++ * If L2 is active, go down the slow path as emulating the guest timer
++ * expiration likely requires synthesizing a nested VM-Exit.
++ */
++ if (is_guest_mode(vcpu))
++ return EXIT_FASTPATH_NONE;
++
+ kvm_lapic_expired_hv_timer(vcpu);
+ return EXIT_FASTPATH_REENTER_GUEST;
+ }
+
+ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+ {
+- handle_fastpath_preemption_timer(vcpu);
++ /*
++ * This non-fastpath handler is reached if and only if the preemption
++ * timer was being used to emulate a guest timer while L2 is active.
++ * All other scenarios are supposed to be handled in the fastpath.
++ */
++ WARN_ON_ONCE(!is_guest_mode(vcpu));
++ kvm_lapic_expired_hv_timer(vcpu);
+ return 1;
+ }
+
+@@ -7258,7 +7271,12 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+
+ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+ {
+- if (is_guest_mode(vcpu))
++ /*
++ * If L2 is active, some VMX preemption timer exits can be handled in
++ * the fastpath even, all other exits must use the slow path.
++ */
++ if (is_guest_mode(vcpu) &&
++ to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
+ return EXIT_FASTPATH_NONE;
+
+ switch (to_vmx(vcpu)->exit_reason.basic) {
+--
+2.50.1
+
--- /dev/null
+From 40b4fc9e84bf81654f1ef6040150a04d2e2fc2fc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:40 -0700
+Subject: KVM: VMX: Preserve host's DEBUGCTLMSR_FREEZE_IN_SMM while running the
+ guest
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 6b1dd26544d045f6a79e8c73572c0c0db3ef3c1a ]
+
+Set/clear DEBUGCTLMSR_FREEZE_IN_SMM in GUEST_IA32_DEBUGCTL based on the
+host's pre-VM-Enter value, i.e. preserve the host's FREEZE_IN_SMM setting
+while running the guest. When running with the "default treatment of SMIs"
+in effect (the only mode KVM supports), SMIs do not generate a VM-Exit that
+is visible to host (non-SMM) software; instead, the CPU transitions directly
+from VMX non-root to SMM. And critically, DEBUGCTL isn't context switched
+by hardware on SMI or RSM, i.e. SMM will run with whatever value was
+resident in hardware at the time of the SMI.
+
+Failure to preserve FREEZE_IN_SMM results in the PMU unexpectedly counting
+events while the CPU is executing in SMM, which can pollute profiling and
+potentially leak information into the guest.
+
+Check for changes in FREEZE_IN_SMM prior to every entry into KVM's inner
+run loop, as the bit can be toggled in IRQ context via IPI callback (SMP
+function call), by way of /sys/devices/cpu/freeze_on_smi.
+
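+For context, a rough sketch of how such a host-owned DEBUGCTL bit can be
+flipped from IRQ context on every CPU (loosely modeled on the perf core's
+freeze_on_smi handling; the helper below is illustrative, not an actual
+kernel function):
+
+  static void flip_freeze_in_smm(void *enable)
+  {
+          u64 debugctl;
+
+          /* Invoked on each CPU from the IPI handler, i.e. in IRQ context. */
+          rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+          if (*(bool *)enable)
+                  debugctl |= DEBUGCTLMSR_FREEZE_IN_SMM;
+          else
+                  debugctl &= ~DEBUGCTLMSR_FREEZE_IN_SMM;
+          wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+  }
+
+  /* Writing /sys/devices/cpu/freeze_on_smi ends up doing roughly this. */
+  on_each_cpu(flip_freeze_in_smm, &enable, 1);
+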
+Add a field in kvm_x86_ops to communicate which DEBUGCTL bits need to be
+preserved, as FREEZE_IN_SMM is only supported and defined for Intel CPUs,
+i.e. explicitly checking FREEZE_IN_SMM in common x86 is at best weird, and
+at worst could lead to undesirable behavior in the future if AMD CPUs ever
+happened to pick up a collision with the bit.
+
+Exempt TDX vCPUs, i.e. protected guests, from the check, as the TDX Module
+owns and controls GUEST_IA32_DEBUGCTL.
+
+WARN in SVM if KVM_RUN_LOAD_DEBUGCTL is set, mostly to document that the
+lack of handling isn't a KVM bug (TDX already WARNs on any run_flag).
+
+Lastly, explicitly reload GUEST_IA32_DEBUGCTL on a VM-Fail that is missed
+by KVM but detected by hardware, i.e. in nested_vmx_restore_host_state().
+Doing so avoids the need to track host_debugctl on a per-VMCS basis, as
+GUEST_IA32_DEBUGCTL is unconditionally written by prepare_vmcs02() and
+load_vmcs12_host_state(). For the VM-Fail case, even though KVM won't
+have actually entered the guest, vcpu_enter_guest() will have run with
+vmcs02 active and thus could result in vmcs01 being run with a stale value.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Co-developed-by: Sean Christopherson <seanjc@google.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-9-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: move vmx/main.c change to vmx/vmx.c]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 7 +++++++
+ arch/x86/kvm/vmx/nested.c | 3 +++
+ arch/x86/kvm/vmx/vmx.c | 5 +++++
+ arch/x86/kvm/vmx/vmx.h | 15 ++++++++++++++-
+ arch/x86/kvm/x86.c | 14 ++++++++++++--
+ 5 files changed, 41 insertions(+), 3 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 7373b22c02a7..813887324d52 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1553,6 +1553,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+ enum kvm_x86_run_flags {
+ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
+ KVM_RUN_LOAD_GUEST_DR6 = BIT(1),
++ KVM_RUN_LOAD_DEBUGCTL = BIT(2),
+ };
+
+ struct kvm_x86_ops {
+@@ -1580,6 +1581,12 @@ struct kvm_x86_ops {
+ void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+ void (*vcpu_put)(struct kvm_vcpu *vcpu);
+
++ /*
++ * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to
++ * match the host's value even while the guest is active.
++ */
++ const u64 HOST_OWNED_DEBUGCTL;
++
+ void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
+ int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+ int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index 2ce39ffbcefb..d2fa192d7ce7 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -4688,6 +4688,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+ WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+ }
+
++ /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
++ vmx_reload_guest_debugctl(vcpu);
++
+ /*
+ * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+ * handle a variety of side effects to KVM's software model.
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index d0973bd7853c..9b1f22bcb716 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7399,6 +7399,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+ set_debugreg(vcpu->arch.dr6, 6);
+
++ if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
++ vmx_reload_guest_debugctl(vcpu);
++
+ /*
+ * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
+ * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+@@ -8326,6 +8329,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+ .vcpu_load = vmx_vcpu_load,
+ .vcpu_put = vmx_vcpu_put,
+
++ .HOST_OWNED_DEBUGCTL = DEBUGCTLMSR_FREEZE_IN_SMM,
++
+ .update_exception_bitmap = vmx_update_exception_bitmap,
+ .get_msr_feature = vmx_get_msr_feature,
+ .get_msr = vmx_get_msr,
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 769e70fd142c..5d73d3e570d7 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -434,12 +434,25 @@ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+
+ static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
+ {
++ WARN_ON_ONCE(val & DEBUGCTLMSR_FREEZE_IN_SMM);
++
++ val |= vcpu->arch.host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM;
+ vmcs_write64(GUEST_IA32_DEBUGCTL, val);
+ }
+
+ static inline u64 vmx_guest_debugctl_read(void)
+ {
+- return vmcs_read64(GUEST_IA32_DEBUGCTL);
++ return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~DEBUGCTLMSR_FREEZE_IN_SMM;
++}
++
++static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu)
++{
++ u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL);
++
++ if (!((val ^ vcpu->arch.host_debugctl) & DEBUGCTLMSR_FREEZE_IN_SMM))
++ return;
++
++ vmx_guest_debugctl_write(vcpu, val & ~DEBUGCTLMSR_FREEZE_IN_SMM);
+ }
+
+ /*
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index fbb2e70e3031..fc2cafc33b37 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10518,7 +10518,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ dm_request_for_irq_injection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+ fastpath_t exit_fastpath;
+- u64 run_flags;
++ u64 run_flags, debug_ctl;
+
+ bool req_immediate_exit = false;
+
+@@ -10777,7 +10777,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ set_debugreg(0, 7);
+ }
+
+- vcpu->arch.host_debugctl = get_debugctlmsr();
++ /*
++ * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL
++ * can be modified in IRQ context, e.g. via SMP function calls. Inform
++ * vendor code if any host-owned bits were changed, e.g. so that the
++ * value loaded into hardware while running the guest can be updated.
++ */
++ debug_ctl = get_debugctlmsr();
++ if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
++ !vcpu->arch.guest_state_protected)
++ run_flags |= KVM_RUN_LOAD_DEBUGCTL;
++ vcpu->arch.host_debugctl = debug_ctl;
+
+ guest_timing_enter_irqoff();
+
+--
+2.50.1
+
--- /dev/null
+From 010b3aed9b879bc35a20a52e2435f99c018fd9bc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:29 -0700
+Subject: KVM: VMX: Re-enter guest in fastpath for "spurious" preemption timer
+ exits
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit e6b5d16bbd2d4c8259ad76aa33de80d561aba5f9 ]
+
+Re-enter the guest in the fast path if a VMX preemption timer VM-Exit was
+"spurious", i.e. if KVM "soft disabled" the timer by writing -1u and by
+some miracle the timer expired before any other VM-Exit occurred. This is
+just an intermediate step to cleaning up the preemption timer handling,
+optimizing these types of spurious VM-Exits is not interesting as they are
+extremely rare/infrequent.
+
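+For reference, "soft disabling" refers to parking the timer at its maximum
+value rather than toggling the VM-execution control; a sketch of the
+existing vmx_update_hv_timer() behavior assumed here (that function is not
+part of this diff):
+
+  /* If no deadline is armed, park the timer instead of toggling the control. */
+  if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
+          vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
+          vmx->loaded_vmcs->hv_timer_soft_disabled = true;
+  }
+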
+Link: https://lore.kernel.org/r/20240110012705.506918-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index ee501871ddb0..32b792387271 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6019,8 +6019,15 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+- if (!vmx->req_immediate_exit &&
+- !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
++ /*
++ * In the *extremely* unlikely scenario that this is a spurious VM-Exit
++ * due to the timer expiring while it was "soft" disabled, just eat the
++ * exit and re-enter the guest.
++ */
++ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
++ return EXIT_FASTPATH_REENTER_GUEST;
++
++ if (!vmx->req_immediate_exit) {
+ kvm_lapic_expired_hv_timer(vcpu);
+ return EXIT_FASTPATH_REENTER_GUEST;
+ }
+--
+2.50.1
+
--- /dev/null
+From 2277369c1b499bf85b3b553e281b264495bb2514 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:39 -0700
+Subject: KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 7d0cce6cbe71af6e9c1831bff101a2b9c249c4a2 ]
+
+Introduce vmx_guest_debugctl_{read,write}() to handle all accesses to
+vmcs.GUEST_IA32_DEBUGCTL. This will allow stuffing FREEZE_IN_SMM into
+GUEST_IA32_DEBUGCTL based on the host setting without bleeding the state
+into the guest, and without needing to copy+paste the FREEZE_IN_SMM
+logic into every patch that accesses GUEST_IA32_DEBUGCTL.
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+[sean: massage changelog, make inline, use in all prepare_vmcs02() cases]
+Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Link: https://lore.kernel.org/r/20250610232010.162191-8-seanjc@google.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/nested.c | 10 +++++-----
+ arch/x86/kvm/vmx/pmu_intel.c | 8 ++++----
+ arch/x86/kvm/vmx/vmx.c | 8 +++++---
+ arch/x86/kvm/vmx/vmx.h | 10 ++++++++++
+ 4 files changed, 24 insertions(+), 12 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index 10236ecdad95..2ce39ffbcefb 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2564,11 +2564,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+ kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
+- vmx_get_supported_debugctl(vcpu, false));
++ vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
++ vmx_get_supported_debugctl(vcpu, false));
+ } else {
+ kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+- vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
++ vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
+ }
+ if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+@@ -3433,7 +3433,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+
+ if (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+- vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
++ vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
+ if (kvm_mpx_supported() &&
+ (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+@@ -4633,7 +4633,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
+
+ kvm_set_dr(vcpu, 7, 0x400);
+- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
++ vmx_guest_debugctl_write(vcpu, 0);
+
+ if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+ vmcs12->vm_exit_msr_load_count))
+diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
+index 48a2f77f62ef..50364e00e4e9 100644
+--- a/arch/x86/kvm/vmx/pmu_intel.c
++++ b/arch/x86/kvm/vmx/pmu_intel.c
+@@ -633,11 +633,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
+ */
+ static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
+ {
+- u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
++ u64 data = vmx_guest_debugctl_read();
+
+ if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
+ data &= ~DEBUGCTLMSR_LBR;
+- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
++ vmx_guest_debugctl_write(vcpu, data);
+ }
+ }
+
+@@ -707,7 +707,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+
+ if (!lbr_desc->event) {
+ vmx_disable_lbr_msrs_passthrough(vcpu);
+- if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
++ if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)
+ goto warn;
+ if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
+ goto warn;
+@@ -729,7 +729,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+
+ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
+ {
+- if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
++ if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR))
+ intel_pmu_release_guest_lbr_event(vcpu);
+ }
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 32f1a38a1010..d0973bd7853c 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2124,7 +2124,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+- msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
++ msr_info->data = vmx_guest_debugctl_read();
+ break;
+ default:
+ find_uret_msr:
+@@ -2258,7 +2258,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ VM_EXIT_SAVE_DEBUG_CONTROLS)
+ get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+
+- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
++ vmx_guest_debugctl_write(vcpu, data);
++
+ if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
+ (data & DEBUGCTLMSR_LBR))
+ intel_pmu_create_guest_lbr_event(vcpu);
+@@ -4826,7 +4827,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
++
++ vmx_guest_debugctl_write(&vmx->vcpu, 0);
+
+ if (cpu_has_vmx_tpr_shadow()) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 5816fdd2dfa8..769e70fd142c 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -432,6 +432,16 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+ u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
+ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
+
++static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
++{
++ vmcs_write64(GUEST_IA32_DEBUGCTL, val);
++}
++
++static inline u64 vmx_guest_debugctl_read(void)
++{
++ return vmcs_read64(GUEST_IA32_DEBUGCTL);
++}
++
+ /*
+ * Note, early Intel manuals have the write-low and read-high bitmap offsets
+ * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and
+--
+2.50.1
+
--- /dev/null
+From ab60a5a234aeb79d78d4830caee1d001313cd5e5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:34 -0700
+Subject: KVM: x86: Convert vcpu_run()'s immediate exit param into a generic
+ bitmap
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 2478b1b220c49d25cb1c3f061ec4f9b351d9a131 ]
+
+Convert kvm_x86_ops.vcpu_run()'s "force_immediate_exit" boolean parameter
+into an a generic bitmap so that similar "take action" information can be
+passed to vendor code without creating a pile of boolean parameters.
+
+This will allow dropping kvm_x86_ops.set_dr6() in favor of a new flag, and
+will also allow for adding similar functionality for re-loading debugctl
+in the active VMCS.
+
+Opportunistically massage the TDX WARN and comment to prepare for adding
+more run_flags, all of which are expected to be mutually exclusive with
+TDX, i.e. should be WARNed on.
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: drop TDX crud, account for lack of kvm_x86_call()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 6 +++++-
+ arch/x86/kvm/svm/svm.c | 4 ++--
+ arch/x86/kvm/vmx/vmx.c | 3 ++-
+ arch/x86/kvm/x86.c | 10 ++++++++--
+ 4 files changed, 17 insertions(+), 6 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 8898ad8cb3de..aa6d04cd9ee6 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1550,6 +1550,10 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+ return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
+ }
+
++enum kvm_x86_run_flags {
++ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
++};
++
+ struct kvm_x86_ops {
+ const char *name;
+
+@@ -1625,7 +1629,7 @@ struct kvm_x86_ops {
+
+ int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
+ enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
+- bool force_immediate_exit);
++ u64 run_flags);
+ int (*handle_exit)(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion exit_fastpath);
+ int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 4a53b38ea386..61e5e261cde2 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4197,9 +4197,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+ guest_state_exit_irqoff();
+ }
+
+-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+- bool force_immediate_exit)
++static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ {
++ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 704e5a552b4f..065aac2f4bce 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7345,8 +7345,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ guest_state_exit_irqoff();
+ }
+
+-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
++static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ {
++ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long cr3, cr4;
+
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index f3150d9a1918..ecc151397341 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10518,6 +10518,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ dm_request_for_irq_injection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+ fastpath_t exit_fastpath;
++ u64 run_flags;
+
+ bool req_immediate_exit = false;
+
+@@ -10750,8 +10751,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ goto cancel_injection;
+ }
+
+- if (req_immediate_exit)
++ run_flags = 0;
++ if (req_immediate_exit) {
++ run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
++ }
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+@@ -10787,7 +10791,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+ (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
+
+- exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
++ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, run_flags);
+ if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+ break;
+
+@@ -10799,6 +10803,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ break;
+ }
+
++ run_flags = 0;
++
+ /* Note, VM-Exits that go down the "slow" path are accounted below. */
+ ++vcpu->stat.exits;
+ }
+--
+2.50.1
+
--- /dev/null
+From 22c51f0290ecf799d1bb5992d6add57aaa64597f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:35 -0700
+Subject: KVM: x86: Drop kvm_x86_ops.set_dr6() in favor of a new KVM_RUN flag
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 80c64c7afea1da6a93ebe88d3d29d8a60377ef80 ]
+
+Instruct vendor code to load the guest's DR6 into hardware via a new
+KVM_RUN flag, and remove kvm_x86_ops.set_dr6(), whose sole purpose was to
+load vcpu->arch.dr6 into hardware when DR6 can be read/written directly
+by the guest.
+
+Note, TDX already WARNs on any run_flag being set, i.e. will yell if KVM
+thinks DR6 needs to be reloaded. TDX vCPUs force KVM_DEBUGREG_AUTO_SWITCH
+and never clear the flag, i.e. should never observe KVM_RUN_LOAD_GUEST_DR6.
+
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250610232010.162191-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: account for lack of vmx/main.c]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm-x86-ops.h | 1 -
+ arch/x86/include/asm/kvm_host.h | 2 +-
+ arch/x86/kvm/svm/svm.c | 10 ++++++----
+ arch/x86/kvm/vmx/vmx.c | 10 +++-------
+ arch/x86/kvm/x86.c | 2 +-
+ 5 files changed, 11 insertions(+), 14 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
+index 8fe6667d945f..a0a4fc684e63 100644
+--- a/arch/x86/include/asm/kvm-x86-ops.h
++++ b/arch/x86/include/asm/kvm-x86-ops.h
+@@ -48,7 +48,6 @@ KVM_X86_OP(set_idt)
+ KVM_X86_OP(get_gdt)
+ KVM_X86_OP(set_gdt)
+ KVM_X86_OP(sync_dirty_debug_regs)
+-KVM_X86_OP(set_dr6)
+ KVM_X86_OP(set_dr7)
+ KVM_X86_OP(cache_reg)
+ KVM_X86_OP(get_rflags)
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index aa6d04cd9ee6..7373b22c02a7 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1552,6 +1552,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+
+ enum kvm_x86_run_flags {
+ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
++ KVM_RUN_LOAD_GUEST_DR6 = BIT(1),
+ };
+
+ struct kvm_x86_ops {
+@@ -1600,7 +1601,6 @@ struct kvm_x86_ops {
+ void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
+- void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
+ void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
+ void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+ unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 61e5e261cde2..abff6d45ae33 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4241,10 +4241,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ svm_hv_update_vp_id(svm->vmcb, vcpu);
+
+ /*
+- * Run with all-zero DR6 unless needed, so that we can get the exact cause
+- * of a #DB.
++ * Run with all-zero DR6 unless the guest can write DR6 freely, so that
++ * KVM can get the exact cause of a #DB. Note, loading guest DR6 from
++ * KVM's snapshot is only necessary when DR accesses won't exit.
+ */
+- if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
++ if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
++ svm_set_dr6(vcpu, vcpu->arch.dr6);
++ else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
+
+ clgi();
+@@ -5021,7 +5024,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
+ .set_idt = svm_set_idt,
+ .get_gdt = svm_get_gdt,
+ .set_gdt = svm_set_gdt,
+- .set_dr6 = svm_set_dr6,
+ .set_dr7 = svm_set_dr7,
+ .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
+ .cache_reg = svm_cache_reg,
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 065aac2f4bce..08ca218ee858 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -5616,12 +5616,6 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+ set_debugreg(DR6_RESERVED, 6);
+ }
+
+-static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+-{
+- lockdep_assert_irqs_disabled();
+- set_debugreg(vcpu->arch.dr6, 6);
+-}
+-
+ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+ {
+ vmcs_writel(GUEST_DR7, val);
+@@ -7392,6 +7386,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+ vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ vcpu->arch.regs_dirty = 0;
+
++ if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
++ set_debugreg(vcpu->arch.dr6, 6);
++
+ /*
+ * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
+ * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+@@ -8337,7 +8334,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+ .set_idt = vmx_set_idt,
+ .get_gdt = vmx_get_gdt,
+ .set_gdt = vmx_set_gdt,
+- .set_dr6 = vmx_set_dr6,
+ .set_dr7 = vmx_set_dr7,
+ .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
+ .cache_reg = vmx_cache_reg,
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index ecc151397341..fbb2e70e3031 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10772,7 +10772,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ set_debugreg(vcpu->arch.eff_db[3], 3);
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+- static_call(kvm_x86_set_dr6)(vcpu, vcpu->arch.dr6);
++ run_flags |= KVM_RUN_LOAD_GUEST_DR6;
+ } else if (unlikely(hw_breakpoint_active())) {
+ set_debugreg(0, 7);
+ }
+--
+2.50.1
+
--- /dev/null
+From f141c80dbc877633ec0fb299da98a44a81d7c5aa Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:33 -0700
+Subject: KVM: x86: Fully defer to vendor code to decide how to force immediate
+ exit
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 0ec3d6d1f169baa7fc512ae4b78d17e7c94b7763 ]
+
+Now that vmx->req_immediate_exit is used only in the scope of
+vmx_vcpu_run(), use force_immediate_exit to detect that KVM should usurp
+the VMX preemption timer to force a VM-Exit and let vendor code fully handle
+forcing a VM-Exit.
+
+Opportunistically drop __kvm_request_immediate_exit() and just have
+vendor code call smp_send_reschedule() directly. SVM already does this
+when injecting an event while also trying to single-step an IRET, i.e.
+it's not exactly secret knowledge that KVM uses a reschedule IPI to force
+an exit.
+
+Link: https://lore.kernel.org/r/20240110012705.506918-7-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve absurd conflict due to funky kvm_x86_ops.sched_in prototype]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm-x86-ops.h | 1 -
+ arch/x86/include/asm/kvm_host.h | 3 ---
+ arch/x86/kvm/svm/svm.c | 7 ++++---
+ arch/x86/kvm/vmx/vmx.c | 32 +++++++++++++-----------------
+ arch/x86/kvm/vmx/vmx.h | 2 --
+ arch/x86/kvm/x86.c | 10 +---------
+ 6 files changed, 19 insertions(+), 36 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
+index e59ded976166..8fe6667d945f 100644
+--- a/arch/x86/include/asm/kvm-x86-ops.h
++++ b/arch/x86/include/asm/kvm-x86-ops.h
+@@ -102,7 +102,6 @@ KVM_X86_OP(write_tsc_multiplier)
+ KVM_X86_OP(get_exit_info)
+ KVM_X86_OP(check_intercept)
+ KVM_X86_OP(handle_exit_irqoff)
+-KVM_X86_OP(request_immediate_exit)
+ KVM_X86_OP(sched_in)
+ KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
+ KVM_X86_OP_OPTIONAL(vcpu_blocking)
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 5703600a454e..8898ad8cb3de 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1695,8 +1695,6 @@ struct kvm_x86_ops {
+ struct x86_exception *exception);
+ void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
+
+- void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
+-
+ void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
+
+ /*
+@@ -2182,7 +2180,6 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+
+ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
+
+ void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
+ u32 size);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index f42c6ef7dc20..4a53b38ea386 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4222,9 +4222,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+ * is enough to force an immediate vmexit.
+ */
+ disable_nmi_singlestep(svm);
+- smp_send_reschedule(vcpu->cpu);
++ force_immediate_exit = true;
+ }
+
++ if (force_immediate_exit)
++ smp_send_reschedule(vcpu->cpu);
++
+ pre_svm_run(vcpu);
+
+ sync_lapic_to_cr8(vcpu);
+@@ -5075,8 +5078,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
+ .check_intercept = svm_check_intercept,
+ .handle_exit_irqoff = svm_handle_exit_irqoff,
+
+- .request_immediate_exit = __kvm_request_immediate_exit,
+-
+ .sched_in = svm_sched_in,
+
+ .nested_ops = &svm_nested_ops,
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 0ecc0e996386..704e5a552b4f 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -49,6 +49,8 @@
+ #include <asm/spec-ctrl.h>
+ #include <asm/vmx.h>
+
++#include <trace/events/ipi.h>
++
+ #include "capabilities.h"
+ #include "cpuid.h"
+ #include "hyperv.h"
+@@ -1304,8 +1306,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+ u16 fs_sel, gs_sel;
+ int i;
+
+- vmx->req_immediate_exit = false;
+-
+ /*
+ * Note that guest MSRs to be saved/restored can also be changed
+ * when guest state is loaded. This happens when guest transitions
+@@ -6015,7 +6015,8 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
+ return 1;
+ }
+
+-static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
++static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
++ bool force_immediate_exit)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+@@ -6031,7 +6032,7 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+ * If the timer expired because KVM used it to force an immediate exit,
+ * then mission accomplished.
+ */
+- if (vmx->req_immediate_exit)
++ if (force_immediate_exit)
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ /*
+@@ -7210,13 +7211,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
+ msrs[i].host, false);
+ }
+
+-static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
++static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 tscl;
+ u32 delta_tsc;
+
+- if (vmx->req_immediate_exit) {
++ if (force_immediate_exit) {
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
+ vmx->loaded_vmcs->hv_timer_soft_disabled = false;
+ } else if (vmx->hv_deadline_tsc != -1) {
+@@ -7269,7 +7270,8 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+ barrier_nospec();
+ }
+
+-static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
++static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
++ bool force_immediate_exit)
+ {
+ /*
+ * If L2 is active, some VMX preemption timer exits can be handled in
+@@ -7283,7 +7285,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+ case EXIT_REASON_MSR_WRITE:
+ return handle_fastpath_set_msr_irqoff(vcpu);
+ case EXIT_REASON_PREEMPTION_TIMER:
+- return handle_fastpath_preemption_timer(vcpu);
++ return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
+ default:
+ return EXIT_FASTPATH_NONE;
+ }
+@@ -7425,7 +7427,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+ vmx_passthrough_lbr_msrs(vcpu);
+
+ if (enable_preemption_timer)
+- vmx_update_hv_timer(vcpu);
++ vmx_update_hv_timer(vcpu, force_immediate_exit);
++ else if (force_immediate_exit)
++ smp_send_reschedule(vcpu->cpu);
+
+ kvm_wait_lapic_expire(vcpu);
+
+@@ -7489,7 +7493,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+ vmx_recover_nmi_blocking(vmx);
+ vmx_complete_interrupts(vmx);
+
+- return vmx_exit_handlers_fastpath(vcpu);
++ return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
+ }
+
+ static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
+@@ -7988,11 +7992,6 @@ static __init void vmx_set_cpu_caps(void)
+ kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
+ }
+
+-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
+-{
+- to_vmx(vcpu)->req_immediate_exit = true;
+-}
+-
+ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info)
+ {
+@@ -8404,8 +8403,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+ .check_intercept = vmx_check_intercept,
+ .handle_exit_irqoff = vmx_handle_exit_irqoff,
+
+- .request_immediate_exit = vmx_request_immediate_exit,
+-
+ .sched_in = vmx_sched_in,
+
+ .cpu_dirty_log_size = PML_ENTITY_NUM,
+@@ -8663,7 +8660,6 @@ static __init int hardware_setup(void)
+ if (!enable_preemption_timer) {
+ vmx_x86_ops.set_hv_timer = NULL;
+ vmx_x86_ops.cancel_hv_timer = NULL;
+- vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
+ }
+
+ kvm_caps.supported_mce_cap |= MCG_LMCE_P;
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index fb36bde2dd87..50d32d830890 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -331,8 +331,6 @@ struct vcpu_vmx {
+ unsigned int ple_window;
+ bool ple_window_dirty;
+
+- bool req_immediate_exit;
+-
+ /* Support for PML */
+ #define PML_ENTITY_NUM 512
+ struct page *pml_pg;
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index d04066099567..f3150d9a1918 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10505,12 +10505,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+ static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
+ }
+
+-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
+-{
+- smp_send_reschedule(vcpu->cpu);
+-}
+-EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
+-
+ /*
+ * Called within kvm->srcu read side.
+ * Returns 1 to let vcpu_run() continue the guest execution loop without
+@@ -10756,10 +10750,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ goto cancel_injection;
+ }
+
+- if (req_immediate_exit) {
++ if (req_immediate_exit)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+- static_call(kvm_x86_request_immediate_exit)(vcpu);
+- }
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+--
+2.50.1
+
--- /dev/null
+From 21d37a330aba310a5c2dc24ee8eea174acdfb829 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:21 -0700
+Subject: KVM: x86/hyper-v: Skip non-canonical addresses during PV TLB flush
+
+From: Manuel Andreas <manuel.andreas@tum.de>
+
+[ Upstream commit fa787ac07b3ceb56dd88a62d1866038498e96230 ]
+
+In KVM guests with Hyper-V hypercalls enabled, the hypercalls
+HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST and HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX
+allow a guest to request invalidation of portions of a virtual TLB.
+For this, the hypercall parameter includes a list of GVAs that are supposed
+to be invalidated.
+
+However, when non-canonical GVAs are passed, there is currently no
+filtering in place and they are eventually passed to checked invocations of
+INVVPID on Intel / INVLPGA on AMD. While AMD's INVLPGA silently ignores
+non-canonical addresses (effectively a no-op), Intel's INVVPID explicitly
+signals VM-Fail and ultimately triggers the WARN_ONCE in invvpid_error():
+
+ invvpid failed: ext=0x0 vpid=1 gva=0xaaaaaaaaaaaaa000
+ WARNING: CPU: 6 PID: 326 at arch/x86/kvm/vmx/vmx.c:482
+ invvpid_error+0x91/0xa0 [kvm_intel]
+ Modules linked in: kvm_intel kvm 9pnet_virtio irqbypass fuse
+ CPU: 6 UID: 0 PID: 326 Comm: kvm-vm Not tainted 6.15.0 #14 PREEMPT(voluntary)
+ RIP: 0010:invvpid_error+0x91/0xa0 [kvm_intel]
+ Call Trace:
+ vmx_flush_tlb_gva+0x320/0x490 [kvm_intel]
+ kvm_hv_vcpu_flush_tlb+0x24f/0x4f0 [kvm]
+ kvm_arch_vcpu_ioctl_run+0x3013/0x5810 [kvm]
+
+Hyper-V documents that invalid GVAs (those that are beyond a partition's
+GVA space) are to be ignored. While not completely clear whether this
+ruling also applies to non-canonical GVAs, it is likely fine to make that
+assumption, and manual testing on Azure confirms "real" Hyper-V interprets
+the specification in the same way.
+
+Skip non-canonical GVAs when processing the list of addresses to avoid
+tripping the INVVPID failure. Alternatively, KVM could filter out "bad"
+GVAs before inserting into the FIFO, but practically speaking the only
+downside of pushing validation to the final processing is that doing so
+is suboptimal for the guest, and no well-behaved guest will request TLB
+flushes for non-canonical addresses.
+
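+For reference, a minimal sketch of the canonicality rule assumed above
+(illustrative only; the fix itself relies on the existing
+is_noncanonical_address() helper):
+
+  /*
+   * A GVA is canonical when bits [63:N-1] are a sign extension of bit N-1,
+   * where N is the number of implemented virtual-address bits (48, or 57
+   * with LA57).
+   */
+  static bool gva_is_canonical(u64 gva, unsigned int virt_addr_bits)
+  {
+          s64 sext = ((s64)gva << (64 - virt_addr_bits)) >> (64 - virt_addr_bits);
+
+          return sext == (s64)gva;
+  }
+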
+Fixes: 260970862c88 ("KVM: x86: hyper-v: Handle HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST{,EX} calls gently")
+Cc: stable@vger.kernel.org
+Signed-off-by: Manuel Andreas <manuel.andreas@tum.de>
+Suggested-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Link: https://lore.kernel.org/r/c090efb3-ef82-499f-a5e0-360fc8420fb7@tum.de
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: use plain is_noncanonical_address()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/hyperv.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
+index bd3fbd5be5da..223f4fa6a849 100644
+--- a/arch/x86/kvm/hyperv.c
++++ b/arch/x86/kvm/hyperv.c
+@@ -1929,6 +1929,9 @@ int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
+ if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY)
+ goto out_flush_all;
+
++ if (is_noncanonical_address(entries[i], vcpu))
++ continue;
++
+ /*
+ * Lower 12 bits of 'address' encode the number of additional
+ * pages to flush.
+--
+2.50.1
+
--- /dev/null
+From 03a5de01f1a2bd29e4c24999991e84839d3b1fa1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:31 -0700
+Subject: KVM: x86: Move handling of is_guest_mode() into fastpath exit
+ handlers
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit bf1a49436ea37b98dd2f37c57608951d0e28eecc ]
+
+Let the fastpath code decide which exits can/can't be handled in the
+fastpath when L2 is active, e.g. when KVM generates a VMX preemption
+timer exit to forcefully regain control, there is no "work" to be done and
+so such exits can be handled in the fastpath regardless of whether L1 or
+L2 is active.
+
+Moving the is_guest_mode() check into the fastpath code also makes it
+easier to see that L2 isn't allowed to use the fastpath in most cases,
+e.g. it's not immediately obvious why handle_fastpath_preemption_timer()
+is called from the fastpath and the normal path.
+
+Link: https://lore.kernel.org/r/20240110012705.506918-5-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/svm/svm.c | 6 +++---
+ arch/x86/kvm/vmx/vmx.c | 6 +++---
+ 2 files changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 5a230be224d1..f42c6ef7dc20 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4157,6 +4157,9 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
+
+ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+ {
++ if (is_guest_mode(vcpu))
++ return EXIT_FASTPATH_NONE;
++
+ if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+ to_svm(vcpu)->vmcb->control.exit_info_1)
+ return handle_fastpath_set_msr_irqoff(vcpu);
+@@ -4315,9 +4318,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+
+ svm_complete_interrupts(vcpu);
+
+- if (is_guest_mode(vcpu))
+- return EXIT_FASTPATH_NONE;
+-
+ return svm_exit_handlers_fastpath(vcpu);
+ }
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 631fdd4a575a..4c991d514015 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7258,6 +7258,9 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+
+ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+ {
++ if (is_guest_mode(vcpu))
++ return EXIT_FASTPATH_NONE;
++
+ switch (to_vmx(vcpu)->exit_reason.basic) {
+ case EXIT_REASON_MSR_WRITE:
+ return handle_fastpath_set_msr_irqoff(vcpu);
+@@ -7468,9 +7471,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+ vmx_recover_nmi_blocking(vmx);
+ vmx_complete_interrupts(vmx);
+
+- if (is_guest_mode(vcpu))
+- return EXIT_FASTPATH_NONE;
+-
+ return vmx_exit_handlers_fastpath(vcpu);
+ }
+
+--
+2.50.1
+
--- /dev/null
+From c1681ed16b27201a312047cb6a8038088fdc0608 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:28 -0700
+Subject: KVM: x86: Plumb "force_immediate_exit" into kvm_entry() tracepoint
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 9c9025ea003a03f967affd690f39b4ef3452c0f5 ]
+
+Annotate the kvm_entry() tracepoint with "immediate exit" when KVM is
+forcing a VM-Exit immediately after VM-Enter, e.g. when KVM wants to
+inject an event but needs to first complete some other operation.
+Knowing that KVM is (or isn't) forcing an exit is useful information when
+debugging issues related to event injection.
+
+Suggested-by: Maxim Levitsky <mlevitsk@redhat.com>
+Link: https://lore.kernel.org/r/20240110012705.506918-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 3 ++-
+ arch/x86/kvm/svm/svm.c | 5 +++--
+ arch/x86/kvm/trace.h | 9 ++++++---
+ arch/x86/kvm/vmx/vmx.c | 4 ++--
+ arch/x86/kvm/x86.c | 2 +-
+ 5 files changed, 14 insertions(+), 9 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index b5210505abfa..5703600a454e 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1624,7 +1624,8 @@ struct kvm_x86_ops {
+ void (*flush_tlb_guest)(struct kvm_vcpu *vcpu);
+
+ int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
+- enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu);
++ enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
++ bool force_immediate_exit);
+ int (*handle_exit)(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion exit_fastpath);
+ int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index abbb84ddfe02..5a230be224d1 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4194,12 +4194,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
+ guest_state_exit_irqoff();
+ }
+
+-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
++static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
++ bool force_immediate_exit)
+ {
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
+
+- trace_kvm_entry(vcpu);
++ trace_kvm_entry(vcpu, force_immediate_exit);
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
+index b82e6ed4f024..c6b4b1728006 100644
+--- a/arch/x86/kvm/trace.h
++++ b/arch/x86/kvm/trace.h
+@@ -15,20 +15,23 @@
+ * Tracepoint for guest mode entry.
+ */
+ TRACE_EVENT(kvm_entry,
+- TP_PROTO(struct kvm_vcpu *vcpu),
+- TP_ARGS(vcpu),
++ TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit),
++ TP_ARGS(vcpu, force_immediate_exit),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned long, rip )
++ __field( bool, immediate_exit )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->rip = kvm_rip_read(vcpu);
++ __entry->immediate_exit = force_immediate_exit;
+ ),
+
+- TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip)
++ TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip,
++ __entry->immediate_exit ? "[immediate exit]" : "")
+ );
+
+ /*
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 9ba4baf2a9e9..ee501871ddb0 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7312,7 +7312,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ guest_state_exit_irqoff();
+ }
+
+-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
++static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long cr3, cr4;
+@@ -7339,7 +7339,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+ return EXIT_FASTPATH_NONE;
+ }
+
+- trace_kvm_entry(vcpu);
++ trace_kvm_entry(vcpu, force_immediate_exit);
+
+ if (vmx->ple_window_dirty) {
+ vmx->ple_window_dirty = false;
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 9944b32b0b30..d04066099567 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10795,7 +10795,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+ (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
+
+- exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
++ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
+ if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+ break;
+
+--
+2.50.1
+
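The visible effect of the new field on trace output can be mimicked with a short userspace sketch; plain printf() stands in for TP_printk() and the vcpu/rip values below are made up:

  #include <stdbool.h>
  #include <stdio.h>

  /* Mirrors the updated format string: the trailing "%s" is either empty
   * or "[immediate exit]" depending on the new bool field. */
  static void print_kvm_entry(unsigned int vcpu_id, unsigned long rip,
                              bool immediate_exit)
  {
          printf("vcpu %u, rip 0x%lx%s\n", vcpu_id, rip,
                 immediate_exit ? "[immediate exit]" : "");
  }

  int main(void)
  {
          print_kvm_entry(0, 0xffffa000UL, false);
          print_kvm_entry(0, 0xffffa000UL, true);
          return 0;
  }
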
--- /dev/null
+From 8fd23c953af487158937416f6ea3a2e16c6c7503 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:23 -0700
+Subject: KVM: x86: Plumb in the vCPU to kvm_x86_ops.hwapic_isr_update()
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 76bce9f10162cd4b36ac0b7889649b22baf70ebd ]
+
+Pass the target vCPU to the hwapic_isr_update() vendor hook so that VMX
+can defer the update until after nested VM-Exit if an EOI for L1's vAPIC
+occurs while L2 is active.
+
+Note, commit d39850f57d21 ("KVM: x86: Drop @vcpu parameter from
+kvm_x86_ops.hwapic_isr_update()") removed the parameter with the
+justification that doing so "allows for a decent amount of (future)
+cleanup in the APIC code", but it's not at all clear what cleanup was
+intended, or if it was ever realized.
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Reviewed-by: Chao Gao <chao.gao@intel.com>
+Tested-by: Chao Gao <chao.gao@intel.com>
+Link: https://lore.kernel.org/r/20241128000010.4051275-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: account for lack of kvm_x86_call(), drop vmx/x86_ops.h change]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 2 +-
+ arch/x86/kvm/lapic.c | 8 ++++----
+ arch/x86/kvm/vmx/vmx.c | 2 +-
+ 3 files changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 5dfb8cc9616e..5fc89d255550 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1657,7 +1657,7 @@ struct kvm_x86_ops {
+ bool allow_apicv_in_x2apic_without_x2apic_virtualization;
+ void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
+ void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
+- void (*hwapic_isr_update)(int isr);
++ void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
+ bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
+ void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+ void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
+index 66c7f2367bb3..cbf85a1ffb74 100644
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -750,7 +750,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
+ * just set SVI.
+ */
+ if (unlikely(apic->apicv_active))
+- static_call_cond(kvm_x86_hwapic_isr_update)(vec);
++ static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, vec);
+ else {
+ ++apic->isr_count;
+ BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+@@ -795,7 +795,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
+ * and must be left alone.
+ */
+ if (unlikely(apic->apicv_active))
+- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
++ static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic));
+ else {
+ --apic->isr_count;
+ BUG_ON(apic->isr_count < 0);
+@@ -2772,7 +2772,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
+ if (apic->apicv_active) {
+ static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1);
+- static_call_cond(kvm_x86_hwapic_isr_update)(-1);
++ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1);
+ }
+
+ vcpu->arch.apic_arb_prio = 0;
+@@ -3072,7 +3072,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+ if (apic->apicv_active) {
+ static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
+- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
++ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
+ }
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ if (ioapic_in_kernel(vcpu->kvm))
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index e53620e18925..cde01eb1f5e3 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6834,7 +6834,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+ kvm_release_pfn_clean(pfn);
+ }
+
+-static void vmx_hwapic_isr_update(int max_isr)
++static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+ {
+ u16 status;
+ u8 old;
+--
+2.50.1
+
--- /dev/null
+From 5bfa7e5a50ee261faf4f40ec4bae020fe4f2e08b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:27 -0700
+Subject: KVM: x86: Snapshot the host's DEBUGCTL after disabling IRQs
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 189ecdb3e112da703ac0699f4ec76aa78122f911 ]
+
+Snapshot the host's DEBUGCTL after disabling IRQs, as perf can toggle
+debugctl bits from IRQ context, e.g. when enabling/disabling events via
+smp_call_function_single(). Taking the snapshot (long) before IRQs are
+disabled could result in KVM effectively clobbering DEBUGCTL due to using
+a stale snapshot.
+
+Cc: stable@vger.kernel.org
+Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@amd.com>
+Link: https://lore.kernel.org/r/20250227222411.3490595-6-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/x86.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 7aff0fe469c3..9944b32b0b30 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -4823,7 +4823,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+
+ /* Save host pkru register if supported */
+ vcpu->arch.host_pkru = read_pkru();
+- vcpu->arch.host_debugctl = get_debugctlmsr();
+
+ /* Apply any externally detected TSC adjustments (due to suspend) */
+ if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+@@ -10782,6 +10781,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ set_debugreg(0, 7);
+ }
+
++ vcpu->arch.host_debugctl = get_debugctlmsr();
++
+ guest_timing_enter_irqoff();
+
+ for (;;) {
+--
+2.50.1
+
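The ordering the fix enforces (take the snapshot only once interrupts can no longer change the value) can be illustrated with a userspace analogue, where blocking SIGALRM stands in for disabling IRQs; none of the names below are kernel APIs and the values are invented:

  #include <signal.h>
  #include <stdio.h>

  static volatile sig_atomic_t debugctl = 0x1;   /* stands in for the host MSR */

  static void irq_handler(int sig)
  {
          (void)sig;
          debugctl = 0x2;     /* "perf" toggling bits from interrupt context */
  }

  int main(void)
  {
          sigset_t block, old;
          unsigned long snapshot;

          signal(SIGALRM, irq_handler);

          sigemptyset(&block);
          sigaddset(&block, SIGALRM);
          sigprocmask(SIG_BLOCK, &block, &old);   /* analogue of disabling IRQs */

          snapshot = debugctl;                    /* snapshot taken with "IRQs" off */
          printf("snapshot = 0x%lx\n", snapshot);

          sigprocmask(SIG_SETMASK, &old, NULL);   /* analogue of re-enabling IRQs */
          return 0;
  }
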
--- /dev/null
+From c711192f36c4c41ec0716b1b0a20448a9cc2194f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Aug 2025 17:25:26 -0700
+Subject: KVM: x86: Snapshot the host's DEBUGCTL in common x86
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit fb71c795935652fa20eaf9517ca9547f5af99a76 ]
+
+Move KVM's snapshot of DEBUGCTL to kvm_vcpu_arch and take the snapshot in
+common x86, so that SVM can also use the snapshot.
+
+Opportunistically change the field to a u64. While bits 63:32 are reserved
+on AMD, not mentioned at all in Intel's SDM, and managed as an "unsigned
+long" by the kernel, DEBUGCTL is an MSR and therefore a 64-bit value.
+
+Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
+Cc: stable@vger.kernel.org
+Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@amd.com>
+Link: https://lore.kernel.org/r/20250227222411.3490595-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+[sean: resolve minor syntactic conflict in vmx_vcpu_load()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 1 +
+ arch/x86/kvm/vmx/vmx.c | 8 ++------
+ arch/x86/kvm/vmx/vmx.h | 2 --
+ arch/x86/kvm/x86.c | 1 +
+ 4 files changed, 4 insertions(+), 8 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 5fc89d255550..b5210505abfa 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -733,6 +733,7 @@ struct kvm_vcpu_arch {
+ u32 pkru;
+ u32 hflags;
+ u64 efer;
++ u64 host_debugctl;
+ u64 apic_base;
+ struct kvm_lapic *apic; /* kernel irqchip context */
+ bool load_eoi_exitmap_pending;
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 4563e7a9a851..9ba4baf2a9e9 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -1499,13 +1499,9 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
+ */
+ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+ {
+- struct vcpu_vmx *vmx = to_vmx(vcpu);
+-
+ vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
+
+ vmx_vcpu_pi_load(vcpu, cpu);
+-
+- vmx->host_debugctlmsr = get_debugctlmsr();
+ }
+
+ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+@@ -7414,8 +7410,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+ }
+
+ /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+- if (vmx->host_debugctlmsr)
+- update_debugctlmsr(vmx->host_debugctlmsr);
++ if (vcpu->arch.host_debugctl)
++ update_debugctlmsr(vcpu->arch.host_debugctl);
+
+ #ifndef CONFIG_X86_64
+ /*
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 88c5b7ebf9d3..fb36bde2dd87 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -340,8 +340,6 @@ struct vcpu_vmx {
+ /* apic deadline value in host tsc */
+ u64 hv_deadline_tsc;
+
+- unsigned long host_debugctlmsr;
+-
+ /*
+ * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+ * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 55185670e0e5..7aff0fe469c3 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -4823,6 +4823,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+
+ /* Save host pkru register if supported */
+ vcpu->arch.host_pkru = read_pkru();
++ vcpu->arch.host_debugctl = get_debugctlmsr();
+
+ /* Apply any externally detected TSC adjustments (due to suspend) */
+ if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+--
+2.50.1
+
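Why the snapshot field is a u64 rather than an unsigned long can be shown with a standalone sketch: on a 32-bit kernel unsigned long is 32 bits, so if bits 63:32 of DEBUGCTL were ever used, a narrow field would silently drop them. Below, uint32_t stands in for the 32-bit unsigned long and the MSR value is hypothetical:

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint64_t msr_val = 0x0000000100004000ULL;  /* hypothetical DEBUGCTL value */
          uint32_t narrow  = (uint32_t)msr_val;      /* 32-bit "unsigned long" field */
          uint64_t wide    = msr_val;                /* u64 field keeps every bit */

          printf("original: 0x%016" PRIx64 "\n", msr_val);
          printf("narrow  : 0x%08"  PRIx32 " (bits 63:32 lost)\n", narrow);
          printf("wide    : 0x%016" PRIx64 "\n", wide);
          return 0;
  }
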
--- /dev/null
+From 420eaab27f40ffc253c0d624df6b87a47a58e99c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Aug 2025 07:23:18 -0700
+Subject: net: ti: icss-iep: Fix incorrect type for return value in
+ extts_enable()
+
+From: Alok Tiwari <alok.a.tiwari@oracle.com>
+
+[ Upstream commit 5f1d1d14db7dabce9c815e7d7cd351f8d58b8585 ]
+
+The variable ret in icss_iep_extts_enable() was incorrectly declared
+as u32, while the function returns int and may return negative error
+codes. Storing a negative errno in an unsigned variable causes signedness
+issues and incorrect error propagation. Update ret to be int to fix the
+error handling.
+
+This change corrects the declaration to avoid a potential type mismatch.
+
+Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver")
+Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Link: https://patch.msgid.link/20250805142323.1949406-1-alok.a.tiwari@oracle.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/ti/icssg/icss_iep.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c
+index 8ed72c8b210f..e7306ed52922 100644
+--- a/drivers/net/ethernet/ti/icssg/icss_iep.c
++++ b/drivers/net/ethernet/ti/icssg/icss_iep.c
+@@ -638,7 +638,8 @@ static int icss_iep_pps_enable(struct icss_iep *iep, int on)
+
+ static int icss_iep_extts_enable(struct icss_iep *iep, u32 index, int on)
+ {
+- u32 val, cap, ret = 0;
++ u32 val, cap;
++ int ret = 0;
+
+ mutex_lock(&iep->ptp_clk_mutex);
+
+--
+2.50.1
+
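The signedness hazard the patch removes is easy to reproduce in a few lines of standalone C (userspace sketch, with EINVAL hardcoded for illustration):

  #include <stdint.h>
  #include <stdio.h>

  #define EINVAL 22

  /* Kernel-style helper reporting failure as a negative errno. */
  static int do_enable(void)
  {
          return -EINVAL;
  }

  int main(void)
  {
          uint32_t ret_u32 = do_enable();   /* the old "u32 ret" declaration */
          int ret_int = do_enable();        /* the fixed "int ret" declaration */

          printf("as u32: %u\n", ret_u32);  /* 4294967274: no longer negative */
          printf("as int: %d\n", ret_int);  /* -22: errno preserved */
          return 0;
  }
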
--- /dev/null
+From a6dce037ccb6be527d9dd4d896d9612980da17e6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Aug 2025 17:25:08 +0200
+Subject: netfilter: ctnetlink: fix refcount leak on table dump
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ]
+
+There is a reference count leak in ctnetlink_dump_table():
+ if (res < 0) {
+ nf_conntrack_get(&ct->ct_general); // HERE
+ cb->args[1] = (unsigned long)ct;
+ ...
+
+While it's very unlikely, it's possible that ct == last.
+If this happens, the refcount of ct has already been incremented,
+and this second increment is never undone.
+
+This prevents the conntrack object from being released, which in turn
+prevents cnet->count from dropping back to 0.
+
+This will then block the netns dismantle (or conntrack rmmod) as
+nf_conntrack_cleanup_net_list() will wait forever.
+
+This can be reproduced by running conntrack_resize.sh selftest in a loop.
+It takes ~20 minutes for me on a preemptible kernel on average before
+I see a runaway kworker spinning in nf_conntrack_cleanup_net_list.
+
+One fix would be to change this to:
+ if (res < 0) {
+ if (ct != last)
+ nf_conntrack_get(&ct->ct_general);
+
+But this reference counting isn't needed in the first place.
+We can just store a cookie value instead.
+
+A follow-up patch will do the same for ctnetlink_exp_dump_table:
+it appears to have the same problem, and like ctnetlink_dump_table
+it only needs a 'skip hint', not the actual object, so the same
+cookie strategy can be applied there as well.
+
+Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 282e9644f6fd..928bd2013289 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -859,8 +859,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
+
+ static int ctnetlink_done(struct netlink_callback *cb)
+ {
+- if (cb->args[1])
+- nf_ct_put((struct nf_conn *)cb->args[1]);
+ kfree(cb->data);
+ return 0;
+ }
+@@ -1175,19 +1173,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+ return 0;
+ }
+
++static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
++{
++ unsigned long id = nf_ct_get_id(ct);
++
++ return id ? id : 1;
++}
++
+ static int
+ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ {
+ unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
+ struct net *net = sock_net(skb->sk);
+- struct nf_conn *ct, *last;
++ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct nf_conn *nf_ct_evict[8];
++ struct nf_conn *ct;
+ int res, i;
+ spinlock_t *lockp;
+
+- last = (struct nf_conn *)cb->args[1];
+ i = 0;
+
+ local_bh_disable();
+@@ -1224,7 +1229,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ continue;
+
+ if (cb->args[1]) {
+- if (ct != last)
++ if (ctnetlink_get_id(ct) != last_id)
+ continue;
+ cb->args[1] = 0;
+ }
+@@ -1237,8 +1242,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct, true, flags);
+ if (res < 0) {
+- nf_conntrack_get(&ct->ct_general);
+- cb->args[1] = (unsigned long)ct;
++ cb->args[1] = ctnetlink_get_id(ct);
+ spin_unlock(lockp);
+ goto out;
+ }
+@@ -1251,12 +1255,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ }
+ out:
+ local_bh_enable();
+- if (last) {
++ if (last_id) {
+ /* nf ct hash resize happened, now clear the leftover. */
+- if ((struct nf_conn *)cb->args[1] == last)
++ if (cb->args[1] == last_id)
+ cb->args[1] = 0;
+-
+- nf_ct_put(last);
+ }
+
+ while (i) {
+--
+2.50.1
+
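The 'skip hint' idea is generic and can be sketched in userspace: the dump stores an id cookie instead of pinning the last object with a reference, and the next pass skips forward to that id. The table, the two-entry budget and the names below are invented for illustration; only the 0-maps-to-1 trick mirrors ctnetlink_get_id().

  #include <stdio.h>

  struct conn { unsigned long id; };

  static unsigned long get_id(const struct conn *ct)
  {
          return ct->id ? ct->id : 1;     /* never hand out 0 as a cookie */
  }

  static void dump(const struct conn *table, int n, unsigned long *cookie)
  {
          int budget = 2;                 /* pretend the skb fills after 2 entries */

          for (int i = 0; i < n; i++) {
                  if (*cookie) {
                          if (get_id(&table[i]) != *cookie)
                                  continue;       /* skip up to the resume point */
                          *cookie = 0;
                  }
                  if (!budget--) {
                          *cookie = get_id(&table[i]);  /* store cookie, no refcount */
                          return;
                  }
                  printf("dumped conn id=%lu\n", table[i].id);
          }
  }

  int main(void)
  {
          struct conn table[] = { {10}, {20}, {30}, {40} };
          unsigned long cookie = 0;

          dump(table, 4, &cookie);        /* first pass fills the "skb" */
          printf("resume cookie = %lu\n", cookie);
          dump(table, 4, &cookie);        /* second pass resumes at the cookie */
          return 0;
  }
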
--- /dev/null
+From 5bca481007ef80b38edd17f64f35c01f02fae3f0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 28 Jul 2025 15:26:49 +0900
+Subject: ptp: prevent possible ABBA deadlock in ptp_clock_freerun()
+
+From: Jeongjun Park <aha310510@gmail.com>
+
+[ Upstream commit 2efe41234dbd0a83fdb7cd38226c2f70039a2cd3 ]
+
+syzbot reported the following ABBA deadlock:
+
+ CPU0 CPU1
+ ---- ----
+ n_vclocks_store()
+ lock(&ptp->n_vclocks_mux) [1]
+ (physical clock)
+ pc_clock_adjtime()
+ lock(&clk->rwsem) [2]
+ (physical clock)
+ ...
+ ptp_clock_freerun()
+ ptp_vclock_in_use()
+ lock(&ptp->n_vclocks_mux) [3]
+ (physical clock)
+ ptp_clock_unregister()
+ posix_clock_unregister()
+ lock(&clk->rwsem) [4]
+ (virtual clock)
+
+Since a ptp virtual clock is registered only under a ptp physical clock,
+both the ptp_clock and the posix_clock must be physical clocks for
+ptp_vclock_in_use() to lock &ptp->n_vclocks_mux and check ptp->n_vclocks.
+
+However, when unregistering vclocks in n_vclocks_store(), the
+ptp->n_vclocks_mux being locked is a physical clock lock, but the
+clk->rwsem taken by ptp_clock_unregister(), called through
+device_for_each_child_reverse(), is a virtual clock lock.
+
+Therefore, the clk->rwsem used on CPU0 and the clk->rwsem used on CPU1
+are different locks, but lockdep reports a false positive because it
+determines deadlock potential by lock class.
+
+To solve this, a lock subclass annotation must be added to the
+posix_clock rwsem of the vclock.
+
+Reported-by: syzbot+7cfb66a237c4a5fb22ad@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=7cfb66a237c4a5fb22ad
+Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion")
+Signed-off-by: Jeongjun Park <aha310510@gmail.com>
+Acked-by: Richard Cochran <richardcochran@gmail.com>
+Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Link: https://patch.msgid.link/20250728062649.469882-1-aha310510@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/ptp/ptp_private.h | 5 +++++
+ drivers/ptp/ptp_vclock.c | 7 +++++++
+ 2 files changed, 12 insertions(+)
+
+diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
+index a54124269c2f..3fbd1d68a9bc 100644
+--- a/drivers/ptp/ptp_private.h
++++ b/drivers/ptp/ptp_private.h
+@@ -20,6 +20,11 @@
+ #define PTP_BUF_TIMESTAMPS 30
+ #define PTP_DEFAULT_MAX_VCLOCKS 20
+
++enum {
++ PTP_LOCK_PHYSICAL = 0,
++ PTP_LOCK_VIRTUAL,
++};
++
+ struct timestamp_event_queue {
+ struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS];
+ int head;
+diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
+index dcf752c9e045..7d08ff3b30fc 100644
+--- a/drivers/ptp/ptp_vclock.c
++++ b/drivers/ptp/ptp_vclock.c
+@@ -154,6 +154,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
+ return PTP_VCLOCK_REFRESH_INTERVAL;
+ }
+
++static void ptp_vclock_set_subclass(struct ptp_clock *ptp)
++{
++ lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL);
++}
++
+ static const struct ptp_clock_info ptp_vclock_info = {
+ .owner = THIS_MODULE,
+ .name = "ptp virtual clock",
+@@ -213,6 +218,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock)
+ return NULL;
+ }
+
++ ptp_vclock_set_subclass(vclock->clock);
++
+ timecounter_init(&vclock->tc, &vclock->cc, 0);
+ ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL);
+
+--
+2.50.1
+
--- /dev/null
+From 61292c12f981631257858fa1a7e22814646ed11e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 15:40:11 -0400
+Subject: sctp: linearize cloned gso packets in sctp_rcv
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ]
+
+A cloned head skb still shares the frag skbs in its fraglist with the
+original head skb, so it is not safe to access those frag skbs.
+
+syzbot reported two use-of-uninitialized-memory bugs caused by this:
+
+ BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+ sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211
+ sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998
+ sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88
+ sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331
+ sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122
+ __release_sock+0x1da/0x330 net/core/sock.c:3106
+ release_sock+0x6b/0x250 net/core/sock.c:3660
+ sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360
+ sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885
+ sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031
+ inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851
+ sock_sendmsg_nosec net/socket.c:718 [inline]
+
+and
+
+ BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+ sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987
+ sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88
+ sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331
+ sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148
+ __release_sock+0x1d3/0x330 net/core/sock.c:3213
+ release_sock+0x6b/0x270 net/core/sock.c:3767
+ sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367
+ sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886
+ sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032
+ inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851
+ sock_sendmsg_nosec net/socket.c:712 [inline]
+
+This patch fixes it by linearizing cloned gso packets in sctp_rcv().
+
+Fixes: 90017accff61 ("sctp: Add GSO support")
+Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com
+Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sctp/input.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sctp/input.c b/net/sctp/input.c
+index a8a254a5008e..032a10d82302 100644
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -117,7 +117,7 @@ int sctp_rcv(struct sk_buff *skb)
+ * it's better to just linearize it otherwise crc computing
+ * takes longer.
+ */
+- if ((!is_gso && skb_linearize(skb)) ||
++ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
+ !pskb_may_pull(skb, sizeof(struct sctphdr)))
+ goto discard_it;
+
+--
+2.50.1
+
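For clarity, the updated condition simply widens the set of packets that get linearized; a tiny truth table makes the two-variable logic explicit (userspace sketch of the predicate only, not of skb handling):

  #include <stdbool.h>
  #include <stdio.h>

  /* Mirrors (!is_gso || skb_cloned(skb)) from the patch. */
  static bool must_linearize(bool is_gso, bool cloned)
  {
          return !is_gso || cloned;
  }

  int main(void)
  {
          for (int gso = 0; gso <= 1; gso++)
                  for (int cloned = 0; cloned <= 1; cloned++)
                          printf("is_gso=%d cloned=%d -> linearize=%d\n",
                                 gso, cloned, must_linearize(gso, cloned));
          return 0;
  }
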
documentation-acpi-fix-parent-device-references.patch
acpi-processor-perflib-fix-initial-_ppc-limit-application.patch
acpi-processor-perflib-move-problematic-pr-performance-check.patch
+kvm-x86-hyper-v-skip-non-canonical-addresses-during-.patch
+kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch
+kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch
+kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch
+kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch
+kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch
+kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch
+kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch
+kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch
+kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch
+kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch
+kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch
+kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch
+kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch
+kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch
+kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch
+kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch
+kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch
+kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch
+udp-also-consider-secpath-when-evaluating-ipsec-use-.patch
+netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch
+net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch
+sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch
+intel_idle-allow-loading-acpi-tables-for-any-family.patch
+cpuidle-governors-menu-avoid-using-invalid-recent-in.patch
+ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch
+tls-handle-data-disappearing-from-under-the-tls-ulp.patch
--- /dev/null
+From 7b3746b0fb7bce25102c2ab1f5d3c2d406a17e0a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Aug 2025 16:29:06 -0700
+Subject: tls: handle data disappearing from under the TLS ULP
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 6db015fc4b5d5f63a64a193f65d98da3a7fc811d ]
+
+TLS expects that it owns the receive queue of the TCP socket.
+This cannot be guaranteed in case the reader of the TCP socket
+entered before the TLS ULP was installed, or uses some non-standard
+read API (eg. zerocopy ones). Replace the WARN_ON() and a buggy
+early exit (which leaves anchor pointing to a freed skb) with real
+error handling. Wipe the parsing state and tell the reader to retry.
+
+We already reload the anchor every time we (re)acquire the socket lock,
+so the only condition we need to avoid is an out of bounds read
+(not having enough bytes in the socket for previously parsed record len).
+
+If some data was read from under TLS but there's enough in the queue
+we'll reload and decrypt what is most likely not a valid TLS record.
+Leading to some undefined behavior from TLS perspective (corrupting
+a stream? missing an alert? missing an attack?) but no kernel crash
+should take place.
+
+Reported-by: William Liu <will@willsroot.io>
+Reported-by: Savino Dicanosa <savy@syst3mfailure.io>
+Link: https://lore.kernel.org/tFjq_kf7sWIG3A7CrCg_egb8CVsT_gsmHAK0_wxDPJXfIzxFAMxqmLwp3MlU5EHiet0AwwJldaaFdgyHpeIUCS-3m3llsmRzp9xIOBR4lAI=@syst3mfailure.io
+Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser")
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://patch.msgid.link/20250807232907.600366-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls.h | 2 +-
+ net/tls/tls_strp.c | 11 ++++++++---
+ net/tls/tls_sw.c | 3 ++-
+ 3 files changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/net/tls/tls.h b/net/tls/tls.h
+index 02038d0381b7..5dc61c85c076 100644
+--- a/net/tls/tls.h
++++ b/net/tls/tls.h
+@@ -192,7 +192,7 @@ void tls_strp_msg_done(struct tls_strparser *strp);
+ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb);
+ void tls_rx_msg_ready(struct tls_strparser *strp);
+
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
+ int tls_strp_msg_cow(struct tls_sw_context_rx *ctx);
+ struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx);
+ int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst);
+diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
+index bea60b0160d1..6ce64a6e4495 100644
+--- a/net/tls/tls_strp.c
++++ b/net/tls/tls_strp.c
+@@ -474,7 +474,7 @@ static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len)
+ strp->stm.offset = offset;
+ }
+
+-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ {
+ struct strp_msg *rxm;
+ struct tls_msg *tlm;
+@@ -483,8 +483,11 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len);
+
+ if (!strp->copy_mode && force_refresh) {
+- if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len))
+- return;
++ if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) {
++ WRITE_ONCE(strp->msg_ready, 0);
++ memset(&strp->stm, 0, sizeof(strp->stm));
++ return false;
++ }
+
+ tls_strp_load_anchor_with_queue(strp, strp->stm.full_len);
+ }
+@@ -494,6 +497,8 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+ rxm->offset = strp->stm.offset;
+ tlm = tls_msg(strp->anchor);
+ tlm->control = strp->mark;
++
++ return true;
+ }
+
+ /* Called with lock held on lower socket */
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index 4905a81c4ac1..c9b53472e955 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1380,7 +1380,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
+ return sock_intr_errno(timeo);
+ }
+
+- tls_strp_msg_load(&ctx->strp, released);
++ if (unlikely(!tls_strp_msg_load(&ctx->strp, released)))
++ return tls_rx_rec_wait(sk, psock, nonblock, false);
+
+ return 1;
+ }
+--
+2.50.1
+
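The control-flow change (report failure and retry instead of warning) can be sketched in isolation. Everything below is an illustrative stand-in, not the real tls_strparser API: a parser whose load step wipes its state and returns false when the queue holds less than the parsed record length, and a caller that simply retries.

  #include <stdbool.h>
  #include <stdio.h>

  struct parser {
          int full_len;   /* bytes the previously parsed record claims to need */
          int queued;     /* bytes actually available in the receive queue */
  };

  static bool msg_load(struct parser *p)
  {
          if (p->queued < p->full_len) {
                  p->full_len = 0;        /* wipe parsing state instead of WARNing */
                  return false;           /* tell the caller to retry */
          }
          return true;
  }

  static int rec_wait(struct parser *p)
  {
          p->queued += 3;                 /* pretend more data arrived while waiting */
          p->full_len = 5;                /* pretend a record header was parsed */

          if (!msg_load(p))
                  return rec_wait(p);     /* retry, as tls_rx_rec_wait() now does */
          return 1;
  }

  int main(void)
  {
          struct parser p = { 0, 0 };
          int ret = rec_wait(&p);

          printf("rec_wait() returned %d, queued=%d\n", ret, p.queued);
          return 0;
  }
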
--- /dev/null
+From d605402dabbd308ab13f66983c4babc3e4773210 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Aug 2025 11:26:27 +0200
+Subject: udp: also consider secpath when evaluating ipsec use for checksumming
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ]
+
+Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in
+IPsec case") tried to fix checksumming in UFO when the packets are
+going through IPsec, where we can't rely on offloads because the UDP
+header and payload will be encrypted.
+
+But when doing a TCP test over VXLAN going through IPsec transport
+mode with GSO enabled (esp4_offload module loaded), I'm seeing broken
+UDP checksums on the encap after successful decryption.
+
+The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via
+__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this
+point we've already dropped the dst (unless the device sets
+IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and
+we proceed with checksum offload.
+
+Make need_ipsec also check the secpath, which is not dropped on this
+callpath.
+
+Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp_offload.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index 3870b59f5400..9be9df2caf65 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -61,7 +61,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+ remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+ skb->remcsum_offload = remcsum;
+
+- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+ /* Try to offload checksum if possible */
+ offload_csum = !!(need_csum &&
+ !need_ipsec &&
+--
+2.50.1
+