From: Sasha Levin Date: Sat, 16 Aug 2025 20:59:40 +0000 (-0400) Subject: Fixes for all trees X-Git-Tag: v6.12.43~55^2 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c0ca0b01482c13e82a54148651c0bdb2d1478796;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for all trees Signed-off-by: Sasha Levin --- diff --git a/queue-5.10/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch b/queue-5.10/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch new file mode 100644 index 0000000000..74b04a4dc7 --- /dev/null +++ b/queue-5.10/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch @@ -0,0 +1,91 @@ +From cfccde6ceaa234284620cfaca692e6585dba91b7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 11 Aug 2025 17:03:11 +0200 +Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data + +From: Rafael J. Wysocki + +[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ] + +Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid +discarding useful information") caused the number of wakeup interrupts +to increase on an idle system [1], which was not expected to happen +after merely allowing shallower idle states to be selected by the +governor in some cases. + +However, on the system in question, all of the idle states deeper than +WFI are rejected by the driver due to a firmware issue [2]. This causes +the governor to only consider the recent interval duriation data +corresponding to attempts to enter WFI that are successful and the +recent invervals table is filled with values lower than the scheduler +tick period. Consequently, the governor predicts an idle duration +below the scheduler tick period length and avoids stopping the tick +more often which leads to the observed symptom. + +Address it by modifying the governor to update the recent intervals +table also when entering the previously selected idle state fails, so +it knows that the short idle intervals might have been the minority +had the selected idle states been actually entered every time. + +Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information") +Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1] +Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2] +Signed-off-by: Rafael J. Wysocki +Tested-by: Christian Loehle +Tested-by: Marc Zyngier +Reviewed-by: Christian Loehle +Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki +Signed-off-by: Sasha Levin +--- + drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++---- + 1 file changed, 17 insertions(+), 4 deletions(-) + +diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c +index a95cc8f024fd..d34463f96848 100644 +--- a/drivers/cpuidle/governors/menu.c ++++ b/drivers/cpuidle/governors/menu.c +@@ -158,6 +158,14 @@ static inline int performance_multiplier(unsigned long nr_iowaiters) + + static DEFINE_PER_CPU(struct menu_device, menu_devices); + ++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us) ++{ ++ /* Update the repeating-pattern data. 
*/ ++ data->intervals[data->interval_ptr++] = interval_us; ++ if (data->interval_ptr >= INTERVALS) ++ data->interval_ptr = 0; ++} ++ + static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); + + /* +@@ -288,6 +296,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, + if (data->needs_update) { + menu_update(drv, dev); + data->needs_update = 0; ++ } else if (!dev->last_residency_ns) { ++ /* ++ * This happens when the driver rejects the previously selected ++ * idle state and returns an error, so update the recent ++ * intervals table to prevent invalid information from being ++ * used going forward. ++ */ ++ menu_update_intervals(data, UINT_MAX); + } + + /* determine the expected residency time, round up */ +@@ -537,10 +553,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) + + data->correction_factor[data->bucket] = new_factor; + +- /* update the repeating-pattern data */ +- data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns); +- if (data->interval_ptr >= INTERVALS) +- data->interval_ptr = 0; ++ menu_update_intervals(data, ktime_to_us(measured_ns)); + } + + /** +-- +2.50.1 + diff --git a/queue-5.10/intel_idle-allow-loading-acpi-tables-for-any-family.patch b/queue-5.10/intel_idle-allow-loading-acpi-tables-for-any-family.patch new file mode 100644 index 0000000000..2883299014 --- /dev/null +++ b/queue-5.10/intel_idle-allow-loading-acpi-tables-for-any-family.patch @@ -0,0 +1,41 @@ +From 55146a8e555eb6bcaf596bf8b7455a06175b4760 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 8 Aug 2025 15:37:14 -0400 +Subject: intel_idle: Allow loading ACPI tables for any family + +From: Len Brown + +[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ] + +There is no reason to limit intel_idle's loading of ACPI tables to +family 6. Upcoming Intel processors are not in family 6. + +Below "Fixes" really means "applies cleanly until". +That syntax commit didn't change the previous logic, +but shows this patch applies back 5-years. + +Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros") +Signed-off-by: Len Brown +Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com +Signed-off-by: Rafael J. 
Wysocki +Signed-off-by: Sasha Levin +--- + drivers/idle/intel_idle.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 1cead368f961..f6a2211ca4ef 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -1154,7 +1154,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { + }; + + static const struct x86_cpu_id intel_mwait_ids[] __initconst = { +- X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL), ++ X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL), + {} + }; + +-- +2.50.1 + diff --git a/queue-5.10/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-5.10/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch new file mode 100644 index 0000000000..bc69d6cc0c --- /dev/null +++ b/queue-5.10/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch @@ -0,0 +1,129 @@ +From f713f56980c58c0297138ace9f7b483378f6bd73 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 1 Aug 2025 17:25:08 +0200 +Subject: netfilter: ctnetlink: fix refcount leak on table dump + +From: Florian Westphal + +[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ] + +There is a reference count leak in ctnetlink_dump_table(): + if (res < 0) { + nf_conntrack_get(&ct->ct_general); // HERE + cb->args[1] = (unsigned long)ct; + ... + +While its very unlikely, its possible that ct == last. +If this happens, then the refcount of ct was already incremented. +This 2nd increment is never undone. + +This prevents the conntrack object from being released, which in turn +keeps prevents cnet->count from dropping back to 0. + +This will then block the netns dismantle (or conntrack rmmod) as +nf_conntrack_cleanup_net_list() will wait forever. + +This can be reproduced by running conntrack_resize.sh selftest in a loop. +It takes ~20 minutes for me on a preemptible kernel on average before +I see a runaway kworker spinning in nf_conntrack_cleanup_net_list. + +One fix would to change this to: + if (res < 0) { + if (ct != last) + nf_conntrack_get(&ct->ct_general); + +But this reference counting isn't needed in the first place. +We can just store a cookie value instead. + +A followup patch will do the same for ctnetlink_exp_dump_table, +it looks to me as if this has the same problem and like +ctnetlink_dump_table, we only need a 'skip hint', not the actual +object so we can apply the same cookie strategy there as well. + +Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c +index b2b06033ef2c..f622fcad3f50 100644 +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -839,8 +839,6 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) + + static int ctnetlink_done(struct netlink_callback *cb) + { +- if (cb->args[1]) +- nf_ct_put((struct nf_conn *)cb->args[1]); + kfree(cb->data); + return 0; + } +@@ -1112,19 +1110,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) + return 0; + } + ++static unsigned long ctnetlink_get_id(const struct nf_conn *ct) ++{ ++ unsigned long id = nf_ct_get_id(ct); ++ ++ return id ? 
id : 1; ++} ++ + static int + ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + { + unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0; + struct net *net = sock_net(skb->sk); +- struct nf_conn *ct, *last; ++ unsigned long last_id = cb->args[1]; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nf_conn *nf_ct_evict[8]; ++ struct nf_conn *ct; + int res, i; + spinlock_t *lockp; + +- last = (struct nf_conn *)cb->args[1]; + i = 0; + + local_bh_disable(); +@@ -1160,7 +1165,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + continue; + + if (cb->args[1]) { +- if (ct != last) ++ if (ctnetlink_get_id(ct) != last_id) + continue; + cb->args[1] = 0; + } +@@ -1173,8 +1178,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct, true, flags); + if (res < 0) { +- nf_conntrack_get(&ct->ct_general); +- cb->args[1] = (unsigned long)ct; ++ cb->args[1] = ctnetlink_get_id(ct); + spin_unlock(lockp); + goto out; + } +@@ -1187,12 +1191,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + } + out: + local_bh_enable(); +- if (last) { ++ if (last_id) { + /* nf ct hash resize happened, now clear the leftover. */ +- if ((struct nf_conn *)cb->args[1] == last) ++ if (cb->args[1] == last_id) + cb->args[1] = 0; +- +- nf_ct_put(last); + } + + while (i) { +-- +2.50.1 + diff --git a/queue-5.10/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch b/queue-5.10/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch new file mode 100644 index 0000000000..5c69f7ae7d --- /dev/null +++ b/queue-5.10/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch @@ -0,0 +1,73 @@ +From 3e634f70881c39e2c08fb3e91544b90694df00bc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Aug 2025 15:40:11 -0400 +Subject: sctp: linearize cloned gso packets in sctp_rcv + +From: Xin Long + +[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ] + +A cloned head skb still shares these frag skbs in fraglist with the +original head skb. It's not safe to access these frag skbs. 
+ +syzbot reported two use-of-uninitialized-memory bugs caused by this: + + BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 + sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 + sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998 + sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88 + sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331 + sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122 + __release_sock+0x1da/0x330 net/core/sock.c:3106 + release_sock+0x6b/0x250 net/core/sock.c:3660 + sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360 + sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885 + sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031 + inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851 + sock_sendmsg_nosec net/socket.c:718 [inline] + +and + + BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 + sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 + sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88 + sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331 + sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148 + __release_sock+0x1d3/0x330 net/core/sock.c:3213 + release_sock+0x6b/0x270 net/core/sock.c:3767 + sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367 + sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886 + sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032 + inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851 + sock_sendmsg_nosec net/socket.c:712 [inline] + +This patch fixes it by linearizing cloned gso packets in sctp_rcv(). + +Fixes: 90017accff61 ("sctp: Add GSO support") +Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com +Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com +Signed-off-by: Xin Long +Reviewed-by: Marcelo Ricardo Leitner +Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/sctp/input.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/sctp/input.c b/net/sctp/input.c +index 8fe1a74f0618..079b1bfc7d31 100644 +--- a/net/sctp/input.c ++++ b/net/sctp/input.c +@@ -114,7 +114,7 @@ int sctp_rcv(struct sk_buff *skb) + * it's better to just linearize it otherwise crc computing + * takes longer. 
+ */ +- if ((!is_gso && skb_linearize(skb)) || ++ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) || + !pskb_may_pull(skb, sizeof(struct sctphdr))) + goto discard_it; + +-- +2.50.1 + diff --git a/queue-5.10/series b/queue-5.10/series index f16f2f202e..c167be5b1b 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -203,3 +203,8 @@ fs-prevent-file-descriptor-table-allocations-exceeding-int_max.patch documentation-acpi-fix-parent-device-references.patch acpi-processor-perflib-fix-initial-_ppc-limit-application.patch acpi-processor-perflib-move-problematic-pr-performance-check.patch +udp-also-consider-secpath-when-evaluating-ipsec-use-.patch +netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch +sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch +intel_idle-allow-loading-acpi-tables-for-any-family.patch +cpuidle-governors-menu-avoid-using-invalid-recent-in.patch diff --git a/queue-5.10/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-5.10/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch new file mode 100644 index 0000000000..35b776b5f5 --- /dev/null +++ b/queue-5.10/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch @@ -0,0 +1,51 @@ +From 703e70d1d8e2e3fa7a948735d5f6cd1cc8ce9e8d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Aug 2025 11:26:27 +0200 +Subject: udp: also consider secpath when evaluating ipsec use for checksumming + +From: Sabrina Dubroca + +[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ] + +Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in +IPsec case") tried to fix checksumming in UFO when the packets are +going through IPsec, so that we can't rely on offloads because the UDP +header and payload will be encrypted. + +But when doing a TCP test over VXLAN going through IPsec transport +mode with GSO enabled (esp4_offload module loaded), I'm seeing broken +UDP checksums on the encap after successful decryption. + +The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via +__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this +point we've already dropped the dst (unless the device sets +IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and +we proceed with checksum offload. + +Make need_ipsec also check the secpath, which is not dropped on this +callpath. 
+ +Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case") +Signed-off-by: Sabrina Dubroca +Signed-off-by: Steffen Klassert +Signed-off-by: Sasha Levin +--- + net/ipv4/udp_offload.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c +index 73beaa7e2d70..5d4413fe4195 100644 +--- a/net/ipv4/udp_offload.c ++++ b/net/ipv4/udp_offload.c +@@ -58,7 +58,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, + remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); + skb->remcsum_offload = remcsum; + +- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); ++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb); + /* Try to offload checksum if possible */ + offload_csum = !!(need_csum && + !need_ipsec && +-- +2.50.1 + diff --git a/queue-5.15/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch b/queue-5.15/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch new file mode 100644 index 0000000000..5a7ef1190e --- /dev/null +++ b/queue-5.15/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch @@ -0,0 +1,91 @@ +From 05efdd270d75536fbf901a5eae7145a45a532748 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 11 Aug 2025 17:03:11 +0200 +Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data + +From: Rafael J. Wysocki + +[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ] + +Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid +discarding useful information") caused the number of wakeup interrupts +to increase on an idle system [1], which was not expected to happen +after merely allowing shallower idle states to be selected by the +governor in some cases. + +However, on the system in question, all of the idle states deeper than +WFI are rejected by the driver due to a firmware issue [2]. This causes +the governor to only consider the recent interval duriation data +corresponding to attempts to enter WFI that are successful and the +recent invervals table is filled with values lower than the scheduler +tick period. Consequently, the governor predicts an idle duration +below the scheduler tick period length and avoids stopping the tick +more often which leads to the observed symptom. + +Address it by modifying the governor to update the recent intervals +table also when entering the previously selected idle state fails, so +it knows that the short idle intervals might have been the minority +had the selected idle states been actually entered every time. + +Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information") +Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1] +Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2] +Signed-off-by: Rafael J. 
Wysocki +Tested-by: Christian Loehle +Tested-by: Marc Zyngier +Reviewed-by: Christian Loehle +Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki +Signed-off-by: Sasha Levin +--- + drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++---- + 1 file changed, 17 insertions(+), 4 deletions(-) + +diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c +index e1e2721beb75..246b4a1b664a 100644 +--- a/drivers/cpuidle/governors/menu.c ++++ b/drivers/cpuidle/governors/menu.c +@@ -158,6 +158,14 @@ static inline int performance_multiplier(unsigned int nr_iowaiters) + + static DEFINE_PER_CPU(struct menu_device, menu_devices); + ++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us) ++{ ++ /* Update the repeating-pattern data. */ ++ data->intervals[data->interval_ptr++] = interval_us; ++ if (data->interval_ptr >= INTERVALS) ++ data->interval_ptr = 0; ++} ++ + static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); + + /* +@@ -288,6 +296,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, + if (data->needs_update) { + menu_update(drv, dev); + data->needs_update = 0; ++ } else if (!dev->last_residency_ns) { ++ /* ++ * This happens when the driver rejects the previously selected ++ * idle state and returns an error, so update the recent ++ * intervals table to prevent invalid information from being ++ * used going forward. ++ */ ++ menu_update_intervals(data, UINT_MAX); + } + + /* determine the expected residency time, round up */ +@@ -542,10 +558,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) + + data->correction_factor[data->bucket] = new_factor; + +- /* update the repeating-pattern data */ +- data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns); +- if (data->interval_ptr >= INTERVALS) +- data->interval_ptr = 0; ++ menu_update_intervals(data, ktime_to_us(measured_ns)); + } + + /** +-- +2.50.1 + diff --git a/queue-5.15/intel_idle-allow-loading-acpi-tables-for-any-family.patch b/queue-5.15/intel_idle-allow-loading-acpi-tables-for-any-family.patch new file mode 100644 index 0000000000..05a42045f8 --- /dev/null +++ b/queue-5.15/intel_idle-allow-loading-acpi-tables-for-any-family.patch @@ -0,0 +1,41 @@ +From 1ee55ceeeb4fb9720509f1f18eb551a41c5568c1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 8 Aug 2025 15:37:14 -0400 +Subject: intel_idle: Allow loading ACPI tables for any family + +From: Len Brown + +[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ] + +There is no reason to limit intel_idle's loading of ACPI tables to +family 6. Upcoming Intel processors are not in family 6. + +Below "Fixes" really means "applies cleanly until". +That syntax commit didn't change the previous logic, +but shows this patch applies back 5-years. + +Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros") +Signed-off-by: Len Brown +Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com +Signed-off-by: Rafael J. 
Wysocki +Signed-off-by: Sasha Levin +--- + drivers/idle/intel_idle.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 359272ce8e29..96002f35405e 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -1194,7 +1194,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { + }; + + static const struct x86_cpu_id intel_mwait_ids[] __initconst = { +- X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL), ++ X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL), + {} + }; + +-- +2.50.1 + diff --git a/queue-5.15/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-5.15/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch new file mode 100644 index 0000000000..892daa8e30 --- /dev/null +++ b/queue-5.15/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch @@ -0,0 +1,129 @@ +From 93145a29f5f86a93148422799f8ec6667e0b6f50 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 1 Aug 2025 17:25:08 +0200 +Subject: netfilter: ctnetlink: fix refcount leak on table dump + +From: Florian Westphal + +[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ] + +There is a reference count leak in ctnetlink_dump_table(): + if (res < 0) { + nf_conntrack_get(&ct->ct_general); // HERE + cb->args[1] = (unsigned long)ct; + ... + +While its very unlikely, its possible that ct == last. +If this happens, then the refcount of ct was already incremented. +This 2nd increment is never undone. + +This prevents the conntrack object from being released, which in turn +keeps prevents cnet->count from dropping back to 0. + +This will then block the netns dismantle (or conntrack rmmod) as +nf_conntrack_cleanup_net_list() will wait forever. + +This can be reproduced by running conntrack_resize.sh selftest in a loop. +It takes ~20 minutes for me on a preemptible kernel on average before +I see a runaway kworker spinning in nf_conntrack_cleanup_net_list. + +One fix would to change this to: + if (res < 0) { + if (ct != last) + nf_conntrack_get(&ct->ct_general); + +But this reference counting isn't needed in the first place. +We can just store a cookie value instead. + +A followup patch will do the same for ctnetlink_exp_dump_table, +it looks to me as if this has the same problem and like +ctnetlink_dump_table, we only need a 'skip hint', not the actual +object so we can apply the same cookie strategy there as well. + +Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c +index 585103c16a8a..50f7531221c3 100644 +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -848,8 +848,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) + + static int ctnetlink_done(struct netlink_callback *cb) + { +- if (cb->args[1]) +- nf_ct_put((struct nf_conn *)cb->args[1]); + kfree(cb->data); + return 0; + } +@@ -1164,19 +1162,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) + return 0; + } + ++static unsigned long ctnetlink_get_id(const struct nf_conn *ct) ++{ ++ unsigned long id = nf_ct_get_id(ct); ++ ++ return id ? 
id : 1; ++} ++ + static int + ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + { + unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0; + struct net *net = sock_net(skb->sk); +- struct nf_conn *ct, *last; ++ unsigned long last_id = cb->args[1]; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nf_conn *nf_ct_evict[8]; ++ struct nf_conn *ct; + int res, i; + spinlock_t *lockp; + +- last = (struct nf_conn *)cb->args[1]; + i = 0; + + local_bh_disable(); +@@ -1211,7 +1216,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + continue; + + if (cb->args[1]) { +- if (ct != last) ++ if (ctnetlink_get_id(ct) != last_id) + continue; + cb->args[1] = 0; + } +@@ -1224,8 +1229,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct, true, flags); + if (res < 0) { +- nf_conntrack_get(&ct->ct_general); +- cb->args[1] = (unsigned long)ct; ++ cb->args[1] = ctnetlink_get_id(ct); + spin_unlock(lockp); + goto out; + } +@@ -1238,12 +1242,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + } + out: + local_bh_enable(); +- if (last) { ++ if (last_id) { + /* nf ct hash resize happened, now clear the leftover. */ +- if ((struct nf_conn *)cb->args[1] == last) ++ if (cb->args[1] == last_id) + cb->args[1] = 0; +- +- nf_ct_put(last); + } + + while (i) { +-- +2.50.1 + diff --git a/queue-5.15/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch b/queue-5.15/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch new file mode 100644 index 0000000000..8483738a86 --- /dev/null +++ b/queue-5.15/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch @@ -0,0 +1,103 @@ +From 167cea59f060ec2d6f527a724186f5e6a9a3f4d6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 28 Jul 2025 15:26:49 +0900 +Subject: ptp: prevent possible ABBA deadlock in ptp_clock_freerun() + +From: Jeongjun Park + +[ Upstream commit 2efe41234dbd0a83fdb7cd38226c2f70039a2cd3 ] + +syzbot reported the following ABBA deadlock: + + CPU0 CPU1 + ---- ---- + n_vclocks_store() + lock(&ptp->n_vclocks_mux) [1] + (physical clock) + pc_clock_adjtime() + lock(&clk->rwsem) [2] + (physical clock) + ... + ptp_clock_freerun() + ptp_vclock_in_use() + lock(&ptp->n_vclocks_mux) [3] + (physical clock) + ptp_clock_unregister() + posix_clock_unregister() + lock(&clk->rwsem) [4] + (virtual clock) + +Since ptp virtual clock is registered only under ptp physical clock, both +ptp_clock and posix_clock must be physical clocks for ptp_vclock_in_use() +to lock &ptp->n_vclocks_mux and check ptp->n_vclocks. + +However, when unregistering vclocks in n_vclocks_store(), the locking +ptp->n_vclocks_mux is a physical clock lock, but clk->rwsem of +ptp_clock_unregister() called through device_for_each_child_reverse() +is a virtual clock lock. + +Therefore, clk->rwsem used in CPU0 and clk->rwsem used in CPU1 are +different locks, but in lockdep, a false positive occurs because the +possibility of deadlock is determined through lock-class. + +To solve this, lock subclass annotation must be added to the posix_clock +rwsem of the vclock. 
+ +Reported-by: syzbot+7cfb66a237c4a5fb22ad@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=7cfb66a237c4a5fb22ad +Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion") +Signed-off-by: Jeongjun Park +Acked-by: Richard Cochran +Reviewed-by: Vladimir Oltean +Link: https://patch.msgid.link/20250728062649.469882-1-aha310510@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/ptp/ptp_private.h | 5 +++++ + drivers/ptp/ptp_vclock.c | 7 +++++++ + 2 files changed, 12 insertions(+) + +diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h +index b8d3df8a393a..bf823b8c3c8f 100644 +--- a/drivers/ptp/ptp_private.h ++++ b/drivers/ptp/ptp_private.h +@@ -20,6 +20,11 @@ + #define PTP_BUF_TIMESTAMPS 30 + #define PTP_DEFAULT_MAX_VCLOCKS 20 + ++enum { ++ PTP_LOCK_PHYSICAL = 0, ++ PTP_LOCK_VIRTUAL, ++}; ++ + struct timestamp_event_queue { + struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS]; + int head; +diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c +index ab1d233173e1..6a14c39c4508 100644 +--- a/drivers/ptp/ptp_vclock.c ++++ b/drivers/ptp/ptp_vclock.c +@@ -81,6 +81,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp) + return PTP_VCLOCK_REFRESH_INTERVAL; + } + ++static void ptp_vclock_set_subclass(struct ptp_clock *ptp) ++{ ++ lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL); ++} ++ + static const struct ptp_clock_info ptp_vclock_info = { + .owner = THIS_MODULE, + .name = "ptp virtual clock", +@@ -137,6 +142,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock) + return NULL; + } + ++ ptp_vclock_set_subclass(vclock->clock); ++ + timecounter_init(&vclock->tc, &vclock->cc, 0); + ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL); + +-- +2.50.1 + diff --git a/queue-5.15/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch b/queue-5.15/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch new file mode 100644 index 0000000000..d7d882ef87 --- /dev/null +++ b/queue-5.15/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch @@ -0,0 +1,73 @@ +From ecc53eb08e436c50345fff6ff4f2d84eddc7ffc3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Aug 2025 15:40:11 -0400 +Subject: sctp: linearize cloned gso packets in sctp_rcv + +From: Xin Long + +[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ] + +A cloned head skb still shares these frag skbs in fraglist with the +original head skb. It's not safe to access these frag skbs. 
+ +syzbot reported two use-of-uninitialized-memory bugs caused by this: + + BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 + sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 + sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998 + sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88 + sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331 + sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122 + __release_sock+0x1da/0x330 net/core/sock.c:3106 + release_sock+0x6b/0x250 net/core/sock.c:3660 + sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360 + sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885 + sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031 + inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851 + sock_sendmsg_nosec net/socket.c:718 [inline] + +and + + BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 + sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 + sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88 + sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331 + sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148 + __release_sock+0x1d3/0x330 net/core/sock.c:3213 + release_sock+0x6b/0x270 net/core/sock.c:3767 + sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367 + sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886 + sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032 + inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851 + sock_sendmsg_nosec net/socket.c:712 [inline] + +This patch fixes it by linearizing cloned gso packets in sctp_rcv(). + +Fixes: 90017accff61 ("sctp: Add GSO support") +Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com +Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com +Signed-off-by: Xin Long +Reviewed-by: Marcelo Ricardo Leitner +Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/sctp/input.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/sctp/input.c b/net/sctp/input.c +index 4ee9374dcfb9..182898cb754a 100644 +--- a/net/sctp/input.c ++++ b/net/sctp/input.c +@@ -114,7 +114,7 @@ int sctp_rcv(struct sk_buff *skb) + * it's better to just linearize it otherwise crc computing + * takes longer. 
+ */ +- if ((!is_gso && skb_linearize(skb)) || ++ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) || + !pskb_may_pull(skb, sizeof(struct sctphdr))) + goto discard_it; + +-- +2.50.1 + diff --git a/queue-5.15/series b/queue-5.15/series index 059ff24647..83a81a15be 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -271,3 +271,9 @@ eventpoll-fix-semi-unbounded-recursion.patch documentation-acpi-fix-parent-device-references.patch acpi-processor-perflib-fix-initial-_ppc-limit-application.patch acpi-processor-perflib-move-problematic-pr-performance-check.patch +udp-also-consider-secpath-when-evaluating-ipsec-use-.patch +netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch +sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch +intel_idle-allow-loading-acpi-tables-for-any-family.patch +cpuidle-governors-menu-avoid-using-invalid-recent-in.patch +ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch diff --git a/queue-5.15/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-5.15/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch new file mode 100644 index 0000000000..403334953c --- /dev/null +++ b/queue-5.15/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch @@ -0,0 +1,51 @@ +From 92e0e5246675bee6f45ac39d6c8c1ff8e588dd53 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Aug 2025 11:26:27 +0200 +Subject: udp: also consider secpath when evaluating ipsec use for checksumming + +From: Sabrina Dubroca + +[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ] + +Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in +IPsec case") tried to fix checksumming in UFO when the packets are +going through IPsec, so that we can't rely on offloads because the UDP +header and payload will be encrypted. + +But when doing a TCP test over VXLAN going through IPsec transport +mode with GSO enabled (esp4_offload module loaded), I'm seeing broken +UDP checksums on the encap after successful decryption. + +The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via +__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this +point we've already dropped the dst (unless the device sets +IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and +we proceed with checksum offload. + +Make need_ipsec also check the secpath, which is not dropped on this +callpath. 
+ +Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case") +Signed-off-by: Sabrina Dubroca +Signed-off-by: Steffen Klassert +Signed-off-by: Sasha Levin +--- + net/ipv4/udp_offload.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c +index 612da8ec1081..8f47d07c49fb 100644 +--- a/net/ipv4/udp_offload.c ++++ b/net/ipv4/udp_offload.c +@@ -59,7 +59,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, + remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); + skb->remcsum_offload = remcsum; + +- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); ++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb); + /* Try to offload checksum if possible */ + offload_csum = !!(need_csum && + !need_ipsec && +-- +2.50.1 + diff --git a/queue-5.4/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-5.4/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch new file mode 100644 index 0000000000..7529f30f54 --- /dev/null +++ b/queue-5.4/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch @@ -0,0 +1,128 @@ +From 4ad31ff02bc58329fdb26bd716b8c3ab15ba0533 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 1 Aug 2025 17:25:08 +0200 +Subject: netfilter: ctnetlink: fix refcount leak on table dump + +From: Florian Westphal + +[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ] + +There is a reference count leak in ctnetlink_dump_table(): + if (res < 0) { + nf_conntrack_get(&ct->ct_general); // HERE + cb->args[1] = (unsigned long)ct; + ... + +While its very unlikely, its possible that ct == last. +If this happens, then the refcount of ct was already incremented. +This 2nd increment is never undone. + +This prevents the conntrack object from being released, which in turn +keeps prevents cnet->count from dropping back to 0. + +This will then block the netns dismantle (or conntrack rmmod) as +nf_conntrack_cleanup_net_list() will wait forever. + +This can be reproduced by running conntrack_resize.sh selftest in a loop. +It takes ~20 minutes for me on a preemptible kernel on average before +I see a runaway kworker spinning in nf_conntrack_cleanup_net_list. + +One fix would to change this to: + if (res < 0) { + if (ct != last) + nf_conntrack_get(&ct->ct_general); + +But this reference counting isn't needed in the first place. +We can just store a cookie value instead. + +A followup patch will do the same for ctnetlink_exp_dump_table, +it looks to me as if this has the same problem and like +ctnetlink_dump_table, we only need a 'skip hint', not the actual +object so we can apply the same cookie strategy there as well. 
+ +Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c +index eeb000e41ad7..5d6f9b375c0f 100644 +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -808,8 +808,6 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) + + static int ctnetlink_done(struct netlink_callback *cb) + { +- if (cb->args[1]) +- nf_ct_put((struct nf_conn *)cb->args[1]); + kfree(cb->data); + return 0; + } +@@ -890,18 +888,25 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) + return 0; + } + ++static unsigned long ctnetlink_get_id(const struct nf_conn *ct) ++{ ++ unsigned long id = nf_ct_get_id(ct); ++ ++ return id ? id : 1; ++} ++ + static int + ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + { + struct net *net = sock_net(skb->sk); +- struct nf_conn *ct, *last; ++ unsigned long last_id = cb->args[1]; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nf_conn *nf_ct_evict[8]; ++ struct nf_conn *ct; + int res, i; + spinlock_t *lockp; + +- last = (struct nf_conn *)cb->args[1]; + i = 0; + + local_bh_disable(); +@@ -936,7 +941,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + continue; + + if (cb->args[1]) { +- if (ct != last) ++ if (ctnetlink_get_id(ct) != last_id) + continue; + cb->args[1] = 0; + } +@@ -951,8 +956,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + ct); + rcu_read_unlock(); + if (res < 0) { +- nf_conntrack_get(&ct->ct_general); +- cb->args[1] = (unsigned long)ct; ++ cb->args[1] = ctnetlink_get_id(ct); + spin_unlock(lockp); + goto out; + } +@@ -965,12 +969,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + } + out: + local_bh_enable(); +- if (last) { ++ if (last_id) { + /* nf ct hash resize happened, now clear the leftover. */ +- if ((struct nf_conn *)cb->args[1] == last) ++ if (cb->args[1] == last_id) + cb->args[1] = 0; +- +- nf_ct_put(last); + } + + while (i) { +-- +2.50.1 + diff --git a/queue-5.4/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch b/queue-5.4/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch new file mode 100644 index 0000000000..758f201494 --- /dev/null +++ b/queue-5.4/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch @@ -0,0 +1,73 @@ +From d22304d70bb1c9842e7607c98275fa7c2316a3a9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Aug 2025 15:40:11 -0400 +Subject: sctp: linearize cloned gso packets in sctp_rcv + +From: Xin Long + +[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ] + +A cloned head skb still shares these frag skbs in fraglist with the +original head skb. It's not safe to access these frag skbs. 
+ +syzbot reported two use-of-uninitialized-memory bugs caused by this: + + BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 + sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 + sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998 + sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88 + sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331 + sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122 + __release_sock+0x1da/0x330 net/core/sock.c:3106 + release_sock+0x6b/0x250 net/core/sock.c:3660 + sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360 + sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885 + sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031 + inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851 + sock_sendmsg_nosec net/socket.c:718 [inline] + +and + + BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 + sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 + sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88 + sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331 + sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148 + __release_sock+0x1d3/0x330 net/core/sock.c:3213 + release_sock+0x6b/0x270 net/core/sock.c:3767 + sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367 + sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886 + sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032 + inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851 + sock_sendmsg_nosec net/socket.c:712 [inline] + +This patch fixes it by linearizing cloned gso packets in sctp_rcv(). + +Fixes: 90017accff61 ("sctp: Add GSO support") +Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com +Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com +Signed-off-by: Xin Long +Reviewed-by: Marcelo Ricardo Leitner +Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/sctp/input.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/sctp/input.c b/net/sctp/input.c +index b1d3e342ac83..9013257cf3df 100644 +--- a/net/sctp/input.c ++++ b/net/sctp/input.c +@@ -114,7 +114,7 @@ int sctp_rcv(struct sk_buff *skb) + * it's better to just linearize it otherwise crc computing + * takes longer. 
+ */ +- if ((!is_gso && skb_linearize(skb)) || ++ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) || + !pskb_may_pull(skb, sizeof(struct sctphdr))) + goto discard_it; + +-- +2.50.1 + diff --git a/queue-5.4/series b/queue-5.4/series index 9124675c64..7973eb6c30 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -165,3 +165,6 @@ fs-prevent-file-descriptor-table-allocations-exceeding-int_max.patch documentation-acpi-fix-parent-device-references.patch acpi-processor-perflib-fix-initial-_ppc-limit-application.patch acpi-processor-perflib-move-problematic-pr-performance-check.patch +udp-also-consider-secpath-when-evaluating-ipsec-use-.patch +netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch +sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch diff --git a/queue-5.4/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-5.4/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch new file mode 100644 index 0000000000..b59c3309cb --- /dev/null +++ b/queue-5.4/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch @@ -0,0 +1,51 @@ +From 35a73e471818ebb5d92b5a51ddfbc30c777fb59c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Aug 2025 11:26:27 +0200 +Subject: udp: also consider secpath when evaluating ipsec use for checksumming + +From: Sabrina Dubroca + +[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ] + +Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in +IPsec case") tried to fix checksumming in UFO when the packets are +going through IPsec, so that we can't rely on offloads because the UDP +header and payload will be encrypted. + +But when doing a TCP test over VXLAN going through IPsec transport +mode with GSO enabled (esp4_offload module loaded), I'm seeing broken +UDP checksums on the encap after successful decryption. + +The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via +__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this +point we've already dropped the dst (unless the device sets +IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and +we proceed with checksum offload. + +Make need_ipsec also check the secpath, which is not dropped on this +callpath. 
+ +Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case") +Signed-off-by: Sabrina Dubroca +Signed-off-by: Steffen Klassert +Signed-off-by: Sasha Levin +--- + net/ipv4/udp_offload.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c +index 6505a6fd245a..7e025f3517b8 100644 +--- a/net/ipv4/udp_offload.c ++++ b/net/ipv4/udp_offload.c +@@ -58,7 +58,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, + remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); + skb->remcsum_offload = remcsum; + +- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); ++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb); + /* Try to offload checksum if possible */ + offload_csum = !!(need_csum && + !need_ipsec && +-- +2.50.1 + diff --git a/queue-6.1/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch b/queue-6.1/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch new file mode 100644 index 0000000000..3e26d9fc0b --- /dev/null +++ b/queue-6.1/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch @@ -0,0 +1,117 @@ +From 9e1075bdd03cf356ae89ba3b703080b5c4fa2278 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:12:03 -0700 +Subject: KVM: nVMX: Check vmcs12->guest_ia32_debugctl on nested VM-Enter + +From: Maxim Levitsky + +[ Upstream commit 095686e6fcb4150f0a55b1a25987fad3d8af58d6 ] + +Add a consistency check for L2's guest_ia32_debugctl, as KVM only supports +a subset of hardware functionality, i.e. KVM can't rely on hardware to +detect illegal/unsupported values. Failure to check the vmcs12 value +would allow the guest to load any harware-supported value while running L2. + +Take care to exempt BTF and LBR from the validity check in order to match +KVM's behavior for writes via WRMSR, but without clobbering vmcs12. Even +if VM_EXIT_SAVE_DEBUG_CONTROLS is set in vmcs12, L1 can reasonably expect +that vmcs12->guest_ia32_debugctl will not be modified if writes to the MSR +are being intercepted. + +Arguably, KVM _should_ update vmcs12 if VM_EXIT_SAVE_DEBUG_CONTROLS is set +*and* writes to MSR_IA32_DEBUGCTLMSR are not being intercepted by L1, but +that would incur non-trivial complexity and wouldn't change the fact that +KVM's handling of DEBUGCTL is blatantly broken. I.e. the extra complexity +is not worth carrying. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Maxim Levitsky +Co-developed-by: Sean Christopherson +Link: https://lore.kernel.org/r/20250610232010.162191-7-seanjc@google.com +Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs") +Signed-off-by: Sasha Levin +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/nested.c | 12 ++++++++++-- + arch/x86/kvm/vmx/vmx.c | 5 ++--- + arch/x86/kvm/vmx/vmx.h | 3 +++ + 3 files changed, 15 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c +index d55f7edc0860..da129e12cff9 100644 +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -2532,7 +2532,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); +- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); ++ vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl & ++ vmx_get_supported_debugctl(vcpu, false)); + } else { + kvm_set_dr(vcpu, 7, vcpu->arch.dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); +@@ -3022,7 +3023,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, + return -EINVAL; + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && +- CC(!kvm_dr7_valid(vmcs12->guest_dr7))) ++ (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || ++ CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) + return -EINVAL; + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && +@@ -4374,6 +4376,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) + (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | + (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); + ++ /* ++ * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. ++ * Writes to DEBUGCTL that aren't intercepted by L1 are immediately ++ * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into ++ * vmcs02 doesn't strictly track vmcs12. 
++ */ + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) + kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 6517b9d929bf..0b37e21d55b1 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -2052,7 +2052,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, + return (unsigned long)data; + } + +-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) ++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) + { + u64 debugctl = 0; + +@@ -2071,8 +2071,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated + return debugctl; + } + +-static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, +- bool host_initiated) ++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated) + { + u64 invalid; + +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index ddbe73958d7f..99e3f46de2ec 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -442,6 +442,9 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, + + void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); + ++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated); ++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated); ++ + /* + * Note, early Intel manuals have the write-low and read-high bitmap offsets + * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and +-- +2.50.1 + diff --git a/queue-6.1/kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch b/queue-6.1/kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch new file mode 100644 index 0000000000..f57b6eeea8 --- /dev/null +++ b/queue-6.1/kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch @@ -0,0 +1,156 @@ +From 2fbc005722e5d1985ef69a071a4a889ff1cb6120 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:11:48 -0700 +Subject: KVM: nVMX: Defer SVI update to vmcs01 on EOI when L2 is active w/o + VID +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Chao Gao + +[ Upstream commit 04bc93cf49d16d01753b95ddb5d4f230b809a991 ] + +If KVM emulates an EOI for L1's virtual APIC while L2 is active, defer +updating GUEST_INTERUPT_STATUS.SVI, i.e. the VMCS's cache of the highest +in-service IRQ, until L1 is active, as vmcs01, not vmcs02, needs to track +vISR. The missed SVI update for vmcs01 can result in L1 interrupts being +incorrectly blocked, e.g. if there is a pending interrupt with lower +priority than the interrupt that was EOI'd. + +This bug only affects use cases where L1's vAPIC is effectively passed +through to L2, e.g. in a pKVM scenario where L2 is L1's depriveleged host, +as KVM will only emulate an EOI for L1's vAPIC if Virtual Interrupt +Delivery (VID) is disabled in vmc12, and L1 isn't intercepting L2 accesses +to its (virtual) APIC page (or if x2APIC is enabled, the EOI MSR). + +WARN() if KVM updates L1's ISR while L2 is active with VID enabled, as an +EOI from L2 is supposed to affect L2's vAPIC, but still defer the update, +to try to keep L1 alive. 
Specifically, KVM forwards all APICv-related +VM-Exits to L1 via nested_vmx_l1_wants_exit(): + + case EXIT_REASON_APIC_ACCESS: + case EXIT_REASON_APIC_WRITE: + case EXIT_REASON_EOI_INDUCED: + /* + * The controls for "virtualize APIC accesses," "APIC- + * register virtualization," and "virtual-interrupt + * delivery" only come from vmcs12. + */ + return true; + +Fixes: c7c9c56ca26f ("x86, apicv: add virtual interrupt delivery support") +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/kvm/20230312180048.1778187-1-jason.cj.chen@intel.com +Reported-by: Markku Ahvenjärvi +Closes: https://lore.kernel.org/all/20240920080012.74405-1-mankku@gmail.com +Cc: Janne Karhunen +Signed-off-by: Chao Gao +[sean: drop request, handle in VMX, write changelog] +Tested-by: Chao Gao +Link: https://lore.kernel.org/r/20241128000010.4051275-3-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: resolve minor syntactic conflict in lapic.h, account for lack of + kvm_x86_call(), drop sanity check due to lack of wants_to_run] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/lapic.c | 11 +++++++++++ + arch/x86/kvm/lapic.h | 1 + + arch/x86/kvm/vmx/nested.c | 5 +++++ + arch/x86/kvm/vmx/vmx.c | 16 ++++++++++++++++ + arch/x86/kvm/vmx/vmx.h | 1 + + 5 files changed, 34 insertions(+) + +diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c +index 3d65d6a023c9..9aae76b74417 100644 +--- a/arch/x86/kvm/lapic.c ++++ b/arch/x86/kvm/lapic.c +@@ -640,6 +640,17 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) + } + } + ++void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu) ++{ ++ struct kvm_lapic *apic = vcpu->arch.apic; ++ ++ if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active) ++ return; ++ ++ static_call(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); ++} ++EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr); ++ + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) + { + /* This may race with setting of irr in __apic_accept_irq() and +diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h +index a5ac4a5a5179..e5d2dc58fcf8 100644 +--- a/arch/x86/kvm/lapic.h ++++ b/arch/x86/kvm/lapic.h +@@ -122,6 +122,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); + int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); + int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); + enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu); ++void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu); + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); + + u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); +diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c +index 8052f8b7d8e1..d55f7edc0860 100644 +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -4839,6 +4839,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + } + ++ if (vmx->nested.update_vmcs01_hwapic_isr) { ++ vmx->nested.update_vmcs01_hwapic_isr = false; ++ kvm_apic_update_hwapic_isr(vcpu); ++ } ++ + if ((vm_exit_reason != -1) && + (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) + vmx->nested.need_vmcs12_to_shadow_sync = true; +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 721ba6ddb121..7b87fbc69b21 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -6713,6 +6713,22 @@ static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) + u16 status; + u8 
old; + ++ /* ++ * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI ++ * is only relevant for if and only if Virtual Interrupt Delivery is ++ * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's ++ * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested ++ * VM-Exit, otherwise L1 with run with a stale SVI. ++ */ ++ if (is_guest_mode(vcpu)) { ++ /* ++ * KVM is supposed to forward intercepted L2 EOIs to L1 if VID ++ * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC. ++ */ ++ to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true; ++ return; ++ } ++ + if (max_isr == -1) + max_isr = 0; + +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index 9e0bb98b116d..8b4b149bd9c1 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -189,6 +189,7 @@ struct nested_vmx { + bool reload_vmcs01_apic_access_page; + bool update_vmcs01_cpu_dirty_logging; + bool update_vmcs01_apicv_status; ++ bool update_vmcs01_hwapic_isr; + + /* + * Enlightened VMCS has been enabled. It does not mean that L1 has to +-- +2.50.1 + diff --git a/queue-6.1/kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch b/queue-6.1/kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch new file mode 100644 index 0000000000..aff25308bc --- /dev/null +++ b/queue-6.1/kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch @@ -0,0 +1,123 @@ +From 7a3ebf358c60cdf6f7ef1c175053ec17e59945c3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:11:45 -0700 +Subject: KVM: SVM: Set RFLAGS.IF=1 in C code, to get VMRUN out of the STI + shadow + +From: Sean Christopherson + +[ Upstream commit be45bc4eff33d9a7dae84a2150f242a91a617402 ] + +Enable/disable local IRQs, i.e. set/clear RFLAGS.IF, in the common +svm_vcpu_enter_exit() just after/before guest_state_{enter,exit}_irqoff() +so that VMRUN is not executed in an STI shadow. AMD CPUs have a quirk +(some would say "bug"), where the STI shadow bleeds into the guest's +intr_state field if a #VMEXIT occurs during injection of an event, i.e. if +the VMRUN doesn't complete before the subsequent #VMEXIT. + +The spurious "interrupts masked" state is relatively benign, as it only +occurs during event injection and is transient. Because KVM is already +injecting an event, the guest can't be in HLT, and if KVM is querying IRQ +blocking for injection, then KVM would need to force an immediate exit +anyways since injecting multiple events is impossible. + +However, because KVM copies int_state verbatim from vmcb02 to vmcb12, the +spurious STI shadow is visible to L1 when running a nested VM, which can +trip sanity checks, e.g. in VMware's VMM. + +Hoist the STI+CLI all the way to C code, as the aforementioned calls to +guest_state_{enter,exit}_irqoff() already inform lockdep that IRQs are +enabled/disabled, and taking a fault on VMRUN with RFLAGS.IF=1 is already +possible. I.e. if there's kernel code that is confused by running with +RFLAGS.IF=1, then it's already a problem. In practice, since GIF=0 also +blocks NMIs, the only change in exposure to non-KVM code (relative to +surrounding VMRUN with STI+CLI) is exception handling code, and except for +the kvm_rebooting=1 case, all exception in the core VM-Enter/VM-Exit path +are fatal. + +Use the "raw" variants to enable/disable IRQs to avoid tracing in the +"no instrumentation" code; the guest state helpers also take care of +tracing IRQ state. + +Oppurtunstically document why KVM needs to do STI in the first place. 
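+
+In rough outline, the entry path after this change looks like the sketch
+below (simplified from the svm.c hunk; not the verbatim code):
+
+    guest_state_enter_irqoff();
+    raw_local_irq_enable();      /* RFLAGS.IF=1; GIF=0 still masks IRQs */
+    __svm_vcpu_run(svm, spec_ctrl_intercepted);  /* VMRUN, no STI shadow */
+    raw_local_irq_disable();
+    guest_state_exit_irqoff();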
+ +Reported-by: Doug Covelli +Closes: https://lore.kernel.org/all/CADH9ctBs1YPmE4aCfGPNBwA10cA8RuAk2gO7542DjMZgs4uzJQ@mail.gmail.com +Fixes: f14eec0a3203 ("KVM: SVM: move more vmentry code to assembly") +Cc: stable@vger.kernel.org +Reviewed-by: Jim Mattson +Link: https://lore.kernel.org/r/20250224165442.2338294-2-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: resolve minor syntatic conflict in __svm_sev_es_vcpu_run()] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/svm/svm.c | 14 ++++++++++++++ + arch/x86/kvm/svm/vmenter.S | 9 +-------- + 2 files changed, 15 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index b6bbd0dc4e65..c95a84afc35f 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -3982,6 +3982,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in + + guest_state_enter_irqoff(); + ++ /* ++ * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of ++ * VMRUN controls whether or not physical IRQs are masked (KVM always ++ * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the ++ * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow ++ * into guest state if delivery of an event during VMRUN triggers a ++ * #VMEXIT, and the guest_state transitions already tell lockdep that ++ * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of ++ * this path, so IRQs aren't actually unmasked while running host code. ++ */ ++ raw_local_irq_enable(); ++ + amd_clear_divider(); + + if (sev_es_guest(vcpu->kvm)) +@@ -3989,6 +4001,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in + else + __svm_vcpu_run(svm, spec_ctrl_intercepted); + ++ raw_local_irq_disable(); ++ + guest_state_exit_irqoff(); + } + +diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S +index 42824f9b06a2..48b72625cc45 100644 +--- a/arch/x86/kvm/svm/vmenter.S ++++ b/arch/x86/kvm/svm/vmenter.S +@@ -170,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run) + VM_CLEAR_CPU_BUFFERS + + /* Enter guest mode */ +- sti +- + 3: vmrun %_ASM_AX + 4: +- cli +- + /* Pop @svm to RAX while it's the only available register. */ + pop %_ASM_AX + +@@ -343,11 +339,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) + VM_CLEAR_CPU_BUFFERS + + /* Enter guest mode */ +- sti +- + 1: vmrun %_ASM_AX +- +-2: cli ++2: + + /* Pop @svm to RDI, guest registers have been saved already. */ + pop %_ASM_DI +-- +2.50.1 + diff --git a/queue-6.1/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch b/queue-6.1/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch new file mode 100644 index 0000000000..eb432649b1 --- /dev/null +++ b/queue-6.1/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch @@ -0,0 +1,63 @@ +From a0343421f8ed3cfa76b9719e3d3f1d575d5dd176 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:12:01 -0700 +Subject: KVM: VMX: Allow guest to set DEBUGCTL.RTM_DEBUG if RTM is supported + +From: Sean Christopherson + +[ Upstream commit 17ec2f965344ee3fd6620bef7ef68792f4ac3af0 ] + +Let the guest set DEBUGCTL.RTM_DEBUG if RTM is supported according to the +guest CPUID model, as debug support is supposed to be available if RTM is +supported, and there are no known downsides to letting the guest debug RTM +aborts. 
+ +Note, there are no known bug reports related to RTM_DEBUG, the primary +motivation is to reduce the probability of breaking existing guests when a +future change adds a missing consistency check on vmcs12.GUEST_DEBUGCTL +(KVM currently lets L2 run with whatever hardware supports; whoops). + +Note #2, KVM already emulates DR6.RTM, and doesn't restrict access to +DR7.RTM. + +Fixes: 83c529151ab0 ("KVM: x86: expose Intel cpu new features (HLE, RTM) to guest") +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20250610232010.162191-5-seanjc@google.com +Signed-off-by: Sasha Levin +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/msr-index.h | 1 + + arch/x86/kvm/vmx/vmx.c | 4 ++++ + 2 files changed, 5 insertions(+) + +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index 727947ed5e5e..afd65c815043 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -379,6 +379,7 @@ + #define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12) + #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14 + #define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT) ++#define DEBUGCTLMSR_RTM_DEBUG BIT(15) + + #define MSR_PEBS_FRONTEND 0x000003f7 + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 60d1ff3fca45..9445def2b3d2 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -2064,6 +2064,10 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated + (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) + debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + ++ if (boot_cpu_has(X86_FEATURE_RTM) && ++ (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_RTM))) ++ debugctl |= DEBUGCTLMSR_RTM_DEBUG; ++ + return debugctl; + } + +-- +2.50.1 + diff --git a/queue-6.1/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch b/queue-6.1/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch new file mode 100644 index 0000000000..3177e70799 --- /dev/null +++ b/queue-6.1/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch @@ -0,0 +1,90 @@ +From ada33297c8f7efa38a5100d5dde191508fc0254b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:12:02 -0700 +Subject: KVM: VMX: Extract checking of guest's DEBUGCTL into helper + +From: Sean Christopherson + +[ Upstream commit 8a4351ac302cd8c19729ba2636acfd0467c22ae8 ] + +Move VMX's logic to check DEBUGCTL values into a standalone helper so that +the code can be used by nested VM-Enter to apply the same logic to the +value being loaded from vmcs12. + +KVM needs to explicitly check vmcs12->guest_ia32_debugctl on nested +VM-Enter, as hardware may support features that KVM does not, i.e. relying +on hardware to detect invalid guest state will result in false negatives. +Unfortunately, that means applying KVM's funky suppression of BTF and LBR +to vmcs12 so as not to break existing guests. + +No functional change intended. 
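+
+As a sketch of the intended reuse (the nested VM-Enter call site below is
+purely illustrative and is not part of this patch):
+
+    if (!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))
+        return -EINVAL;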
+ +Reviewed-by: Dapeng Mi +Link: https://lore.kernel.org/r/20250610232010.162191-6-seanjc@google.com +Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs") +Signed-off-by: Sasha Levin +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/vmx.c | 29 +++++++++++++++++------------ + 1 file changed, 17 insertions(+), 12 deletions(-) + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 9445def2b3d2..6517b9d929bf 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -2071,6 +2071,19 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated + return debugctl; + } + ++static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, ++ bool host_initiated) ++{ ++ u64 invalid; ++ ++ invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated); ++ if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) { ++ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); ++ invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR); ++ } ++ return !invalid; ++} ++ + /* + * Writes msr value into the appropriate "register". + * Returns 0 on success, non-0 otherwise. +@@ -2139,19 +2152,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + } + vmcs_writel(GUEST_SYSENTER_ESP, data); + break; +- case MSR_IA32_DEBUGCTLMSR: { +- u64 invalid; +- +- invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); +- if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { +- kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); +- data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); +- invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); +- } +- +- if (invalid) ++ case MSR_IA32_DEBUGCTLMSR: ++ if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated)) + return 1; + ++ data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); ++ + if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & + VM_EXIT_SAVE_DEBUG_CONTROLS) + get_vmcs12(vcpu)->guest_ia32_debugctl = data; +@@ -2161,7 +2167,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + (data & DEBUGCTLMSR_LBR)) + intel_pmu_create_guest_lbr_event(vcpu); + return 0; +- } + case MSR_IA32_BNDCFGS: + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && +-- +2.50.1 + diff --git a/queue-6.1/kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch b/queue-6.1/kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch new file mode 100644 index 0000000000..077191824a --- /dev/null +++ b/queue-6.1/kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch @@ -0,0 +1,56 @@ +From 318a7d25fb1c4671eb3c0e5ead8980801321fd3a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:11:55 -0700 +Subject: KVM: VMX: Handle forced exit due to preemption timer in fastpath + +From: Sean Christopherson + +[ Upstream commit 11776aa0cfa7d007ad1799b1553bdcbd830e5010 ] + +Handle VMX preemption timer VM-Exits due to KVM forcing an exit in the +exit fastpath, i.e. avoid calling back into handle_preemption_timer() for +the same exit. There is no work to be done for forced exits, as the name +suggests the goal is purely to get control back in KVM. + +In addition to shaving a few cycles, this will allow cleanly separating +handle_fastpath_preemption_timer() from handle_preemption_timer(), e.g. 
+it's not immediately obvious why _apparently_ calling +handle_fastpath_preemption_timer() twice on a "slow" exit is necessary: +the "slow" call is necessary to handle exits from L2, which are excluded +from the fastpath by vmx_vcpu_run(). + +Link: https://lore.kernel.org/r/20240110012705.506918-4-seanjc@google.com +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/vmx.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 96bbccd9477c..c804ad001a79 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -5941,12 +5941,15 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) + if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) + return EXIT_FASTPATH_REENTER_GUEST; + +- if (!vmx->req_immediate_exit) { +- kvm_lapic_expired_hv_timer(vcpu); +- return EXIT_FASTPATH_REENTER_GUEST; +- } ++ /* ++ * If the timer expired because KVM used it to force an immediate exit, ++ * then mission accomplished. ++ */ ++ if (vmx->req_immediate_exit) ++ return EXIT_FASTPATH_EXIT_HANDLED; + +- return EXIT_FASTPATH_NONE; ++ kvm_lapic_expired_hv_timer(vcpu); ++ return EXIT_FASTPATH_REENTER_GUEST; + } + + static int handle_preemption_timer(struct kvm_vcpu *vcpu) +-- +2.50.1 + diff --git a/queue-6.1/kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch b/queue-6.1/kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch new file mode 100644 index 0000000000..f71c1ee30b --- /dev/null +++ b/queue-6.1/kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch @@ -0,0 +1,74 @@ +From d21ac42171b150d6870e91a395f72845982311ff Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:11:57 -0700 +Subject: KVM: VMX: Handle KVM-induced preemption timer exits in fastpath for + L2 + +From: Sean Christopherson + +[ Upstream commit 7b3d1bbf8d68d76fb21210932a5e8ed8ea80dbcc ] + +Eat VMX treemption timer exits in the fastpath regardless of whether L1 or +L2 is active. The VM-Exit is 100% KVM-induced, i.e. there is nothing +directly related to the exit that KVM needs to do on behalf of the guest, +thus there is no reason to wait until the slow path to do nothing. + +Opportunistically add comments explaining why preemption timer exits for +emulating the guest's APIC timer need to go down the slow path. + +Link: https://lore.kernel.org/r/20240110012705.506918-6-seanjc@google.com +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/vmx.c | 22 ++++++++++++++++++++-- + 1 file changed, 20 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 18ceed9046a9..4db9d41d988c 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -5948,13 +5948,26 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) + if (vmx->req_immediate_exit) + return EXIT_FASTPATH_EXIT_HANDLED; + ++ /* ++ * If L2 is active, go down the slow path as emulating the guest timer ++ * expiration likely requires synthesizing a nested VM-Exit. ++ */ ++ if (is_guest_mode(vcpu)) ++ return EXIT_FASTPATH_NONE; ++ + kvm_lapic_expired_hv_timer(vcpu); + return EXIT_FASTPATH_REENTER_GUEST; + } + + static int handle_preemption_timer(struct kvm_vcpu *vcpu) + { +- handle_fastpath_preemption_timer(vcpu); ++ /* ++ * This non-fastpath handler is reached if and only if the preemption ++ * timer was being used to emulate a guest timer while L2 is active. 
++ * All other scenarios are supposed to be handled in the fastpath. ++ */ ++ WARN_ON_ONCE(!is_guest_mode(vcpu)); ++ kvm_lapic_expired_hv_timer(vcpu); + return 1; + } + +@@ -7138,7 +7151,12 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, + + static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) + { +- if (is_guest_mode(vcpu)) ++ /* ++ * If L2 is active, some VMX preemption timer exits can be handled in ++ * the fastpath even, all other exits must use the slow path. ++ */ ++ if (is_guest_mode(vcpu) && ++ to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER) + return EXIT_FASTPATH_NONE; + + switch (to_vmx(vcpu)->exit_reason.basic) { +-- +2.50.1 + diff --git a/queue-6.1/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch b/queue-6.1/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch new file mode 100644 index 0000000000..94781bd2e9 --- /dev/null +++ b/queue-6.1/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch @@ -0,0 +1,191 @@ +From 94d2d32566130542daf6fc1a32f0c8b615def9bd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:12:05 -0700 +Subject: KVM: VMX: Preserve host's DEBUGCTLMSR_FREEZE_IN_SMM while running the + guest + +From: Maxim Levitsky + +[ Upstream commit 6b1dd26544d045f6a79e8c73572c0c0db3ef3c1a ] + +Set/clear DEBUGCTLMSR_FREEZE_IN_SMM in GUEST_IA32_DEBUGCTL based on the +host's pre-VM-Enter value, i.e. preserve the host's FREEZE_IN_SMM setting +while running the guest. When running with the "default treatment of SMIs" +in effect (the only mode KVM supports), SMIs do not generate a VM-Exit that +is visible to host (non-SMM) software, and instead transitions directly +from VMX non-root to SMM. And critically, DEBUGCTL isn't context switched +by hardware on SMI or RSM, i.e. SMM will run with whatever value was +resident in hardware at the time of the SMI. + +Failure to preserve FREEZE_IN_SMM results in the PMU unexpectedly counting +events while the CPU is executing in SMM, which can pollute profiling and +potentially leak information into the guest. + +Check for changes in FREEZE_IN_SMM prior to every entry into KVM's inner +run loop, as the bit can be toggled in IRQ context via IPI callback (SMP +function call), by way of /sys/devices/cpu/freeze_on_smi. + +Add a field in kvm_x86_ops to communicate which DEBUGCTL bits need to be +preserved, as FREEZE_IN_SMM is only supported and defined for Intel CPUs, +i.e. explicitly checking FREEZE_IN_SMM in common x86 is at best weird, and +at worst could lead to undesirable behavior in the future if AMD CPUs ever +happened to pick up a collision with the bit. + +Exempt TDX vCPUs, i.e. protected guests, from the check, as the TDX Module +owns and controls GUEST_IA32_DEBUGCTL. + +WARN in SVM if KVM_RUN_LOAD_DEBUGCTL is set, mostly to document that the +lack of handling isn't a KVM bug (TDX already WARNs on any run_flag). + +Lastly, explicitly reload GUEST_IA32_DEBUGCTL on a VM-Fail that is missed +by KVM but detected by hardware, i.e. in nested_vmx_restore_host_state(). +Doing so avoids the need to track host_debugctl on a per-VMCS basis, as +GUEST_IA32_DEBUGCTL is unconditionally written by prepare_vmcs02() and +load_vmcs12_host_state(). For the VM-Fail case, even though KVM won't +have actually entered the guest, vcpu_enter_guest() will have run with +vmcs02 active and thus could result in vmcs01 being run with a stale value. 
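+
+Roughly, the accessor behaviour added below boils down to (a sketch, not
+the literal code):
+
+    /* value loaded into vmcs.GUEST_IA32_DEBUGCTL on the guest's behalf */
+    hw_val = guest_val | (vcpu->arch.host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM);
+
+    /* value reported back on reads, i.e. what the guest (and vmcs12) sees */
+    guest_val = vmcs_read64(GUEST_IA32_DEBUGCTL) & ~DEBUGCTLMSR_FREEZE_IN_SMM;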
+ +Cc: stable@vger.kernel.org +Signed-off-by: Maxim Levitsky +Co-developed-by: Sean Christopherson +Link: https://lore.kernel.org/r/20250610232010.162191-9-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: move vmx/main.c change to vmx/vmx.c] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm_host.h | 7 +++++++ + arch/x86/kvm/vmx/nested.c | 3 +++ + arch/x86/kvm/vmx/vmx.c | 5 +++++ + arch/x86/kvm/vmx/vmx.h | 15 ++++++++++++++- + arch/x86/kvm/x86.c | 14 ++++++++++++-- + 5 files changed, 41 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index c8fc4f2acf69..d0229323ca63 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1459,6 +1459,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) + enum kvm_x86_run_flags { + KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0), + KVM_RUN_LOAD_GUEST_DR6 = BIT(1), ++ KVM_RUN_LOAD_DEBUGCTL = BIT(2), + }; + + struct kvm_x86_ops { +@@ -1484,6 +1485,12 @@ struct kvm_x86_ops { + void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); + void (*vcpu_put)(struct kvm_vcpu *vcpu); + ++ /* ++ * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to ++ * match the host's value even while the guest is active. ++ */ ++ const u64 HOST_OWNED_DEBUGCTL; ++ + void (*update_exception_bitmap)(struct kvm_vcpu *vcpu); + int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); + int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); +diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c +index a220770644e1..2c3cf4351c4c 100644 +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -4627,6 +4627,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) + WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); + } + ++ /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */ ++ vmx_reload_guest_debugctl(vcpu); ++ + /* + * Note that calling vmx_set_{efer,cr0,cr4} is important as they + * handle a variety of side effects to KVM's software model. +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index e470a294b22d..3fef4e14abc6 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -7258,6 +7258,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) + if (run_flags & KVM_RUN_LOAD_GUEST_DR6) + set_debugreg(vcpu->arch.dr6, 6); + ++ if (run_flags & KVM_RUN_LOAD_DEBUGCTL) ++ vmx_reload_guest_debugctl(vcpu); ++ + /* + * Refresh vmcs.HOST_CR3 if necessary. 
This must be done immediately + * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time +@@ -8197,6 +8200,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { + .vcpu_load = vmx_vcpu_load, + .vcpu_put = vmx_vcpu_put, + ++ .HOST_OWNED_DEBUGCTL = DEBUGCTLMSR_FREEZE_IN_SMM, ++ + .update_exception_bitmap = vmx_update_exception_bitmap, + .get_msr_feature = vmx_get_msr_feature, + .get_msr = vmx_get_msr, +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index b7ae263cde7b..dc6f06326648 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -447,12 +447,25 @@ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated) + + static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val) + { ++ WARN_ON_ONCE(val & DEBUGCTLMSR_FREEZE_IN_SMM); ++ ++ val |= vcpu->arch.host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM; + vmcs_write64(GUEST_IA32_DEBUGCTL, val); + } + + static inline u64 vmx_guest_debugctl_read(void) + { +- return vmcs_read64(GUEST_IA32_DEBUGCTL); ++ return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~DEBUGCTLMSR_FREEZE_IN_SMM; ++} ++ ++static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu) ++{ ++ u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL); ++ ++ if (!((val ^ vcpu->arch.host_debugctl) & DEBUGCTLMSR_FREEZE_IN_SMM)) ++ return; ++ ++ vmx_guest_debugctl_write(vcpu, val & ~DEBUGCTLMSR_FREEZE_IN_SMM); + } + + /* +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 9d66830d594c..dfecf5ba5aa7 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10591,7 +10591,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + dm_request_for_irq_injection(vcpu) && + kvm_cpu_accept_dm_intr(vcpu); + fastpath_t exit_fastpath; +- u64 run_flags; ++ u64 run_flags, debug_ctl; + + bool req_immediate_exit = false; + +@@ -10838,7 +10838,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + set_debugreg(0, 7); + } + +- vcpu->arch.host_debugctl = get_debugctlmsr(); ++ /* ++ * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL ++ * can be modified in IRQ context, e.g. via SMP function calls. Inform ++ * vendor code if any host-owned bits were changed, e.g. so that the ++ * value loaded into hardware while running the guest can be updated. ++ */ ++ debug_ctl = get_debugctlmsr(); ++ if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL && ++ !vcpu->arch.guest_state_protected) ++ run_flags |= KVM_RUN_LOAD_DEBUGCTL; ++ vcpu->arch.host_debugctl = debug_ctl; + + guest_timing_enter_irqoff(); + +-- +2.50.1 + diff --git a/queue-6.1/kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch b/queue-6.1/kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch new file mode 100644 index 0000000000..d873837ce6 --- /dev/null +++ b/queue-6.1/kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch @@ -0,0 +1,49 @@ +From 46e5f37d619ea0a3b02610d32be90ddab43d9393 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:11:54 -0700 +Subject: KVM: VMX: Re-enter guest in fastpath for "spurious" preemption timer + exits + +From: Sean Christopherson + +[ Upstream commit e6b5d16bbd2d4c8259ad76aa33de80d561aba5f9 ] + +Re-enter the guest in the fast path if VMX preeemption timer VM-Exit was +"spurious", i.e. if KVM "soft disabled" the timer by writing -1u and by +some miracle the timer expired before any other VM-Exit occurred. 
This is +just an intermediate step to cleaning up the preemption timer handling, +optimizing these types of spurious VM-Exits is not interesting as they are +extremely rare/infrequent. + +Link: https://lore.kernel.org/r/20240110012705.506918-3-seanjc@google.com +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/vmx.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 0b495979a02b..96bbccd9477c 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -5933,8 +5933,15 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); + +- if (!vmx->req_immediate_exit && +- !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) { ++ /* ++ * In the *extremely* unlikely scenario that this is a spurious VM-Exit ++ * due to the timer expiring while it was "soft" disabled, just eat the ++ * exit and re-enter the guest. ++ */ ++ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) ++ return EXIT_FASTPATH_REENTER_GUEST; ++ ++ if (!vmx->req_immediate_exit) { + kvm_lapic_expired_hv_timer(vcpu); + return EXIT_FASTPATH_REENTER_GUEST; + } +-- +2.50.1 + diff --git a/queue-6.1/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch b/queue-6.1/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch new file mode 100644 index 0000000000..d108a9e43c --- /dev/null +++ b/queue-6.1/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch @@ -0,0 +1,162 @@ +From 495f4d2993192a89076ae3ae03216019fc88fa55 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:12:04 -0700 +Subject: KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs + +From: Maxim Levitsky + +[ Upstream commit 7d0cce6cbe71af6e9c1831bff101a2b9c249c4a2 ] + +Introduce vmx_guest_debugctl_{read,write}() to handle all accesses to +vmcs.GUEST_IA32_DEBUGCTL. This will allow stuffing FREEZE_IN_SMM into +GUEST_IA32_DEBUGCTL based on the host setting without bleeding the state +into the guest, and without needing to copy+paste the FREEZE_IN_SMM +logic into every patch that accesses GUEST_IA32_DEBUGCTL. + +No functional change intended. 
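+
+The conversion itself is mechanical; as a before/after sketch (see the
+hunks below for the real call sites):
+
+    /* before */
+    data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+    vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+
+    /* after */
+    data = vmx_guest_debugctl_read();
+    vmx_guest_debugctl_write(vcpu, data);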
+ +Cc: stable@vger.kernel.org +Signed-off-by: Maxim Levitsky +[sean: massage changelog, make inline, use in all prepare_vmcs02() cases] +Reviewed-by: Dapeng Mi +Link: https://lore.kernel.org/r/20250610232010.162191-8-seanjc@google.com +Signed-off-by: Sasha Levin +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/nested.c | 10 +++++----- + arch/x86/kvm/vmx/pmu_intel.c | 8 ++++---- + arch/x86/kvm/vmx/vmx.c | 8 +++++--- + arch/x86/kvm/vmx/vmx.h | 10 ++++++++++ + 4 files changed, 24 insertions(+), 12 deletions(-) + +diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c +index da129e12cff9..a220770644e1 100644 +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -2532,11 +2532,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); +- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl & +- vmx_get_supported_debugctl(vcpu, false)); ++ vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl & ++ vmx_get_supported_debugctl(vcpu, false)); + } else { + kvm_set_dr(vcpu, 7, vcpu->arch.dr7); +- vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); ++ vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); + } + if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) +@@ -3404,7 +3404,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) +- vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); ++ vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); + if (kvm_mpx_supported() && + (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) +@@ -4572,7 +4572,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, + __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); + + kvm_set_dr(vcpu, 7, 0x400); +- vmcs_write64(GUEST_IA32_DEBUGCTL, 0); ++ vmx_guest_debugctl_write(vcpu, 0); + + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, + vmcs12->vm_exit_msr_load_count)) +diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c +index 220cdbe1e286..76d3ed8abf6a 100644 +--- a/arch/x86/kvm/vmx/pmu_intel.c ++++ b/arch/x86/kvm/vmx/pmu_intel.c +@@ -672,11 +672,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) + */ + static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu) + { +- u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL); ++ u64 data = vmx_guest_debugctl_read(); + + if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) { + data &= ~DEBUGCTLMSR_LBR; +- vmcs_write64(GUEST_IA32_DEBUGCTL, data); ++ vmx_guest_debugctl_write(vcpu, data); + } + } + +@@ -746,7 +746,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) + + if (!lbr_desc->event) { + vmx_disable_lbr_msrs_passthrough(vcpu); +- if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR) ++ if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR) + goto warn; + if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use)) + goto warn; +@@ -769,7 +769,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) + + static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) + { +- if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)) ++ if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)) + 
intel_pmu_release_guest_lbr_event(vcpu); + } + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 0b37e21d55b1..e470a294b22d 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -2027,7 +2027,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; + break; + case MSR_IA32_DEBUGCTLMSR: +- msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); ++ msr_info->data = vmx_guest_debugctl_read(); + break; + default: + find_uret_msr: +@@ -2161,7 +2161,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + VM_EXIT_SAVE_DEBUG_CONTROLS) + get_vmcs12(vcpu)->guest_ia32_debugctl = data; + +- vmcs_write64(GUEST_IA32_DEBUGCTL, data); ++ vmx_guest_debugctl_write(vcpu, data); ++ + if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && + (data & DEBUGCTLMSR_LBR)) + intel_pmu_create_guest_lbr_event(vcpu); +@@ -4751,7 +4752,8 @@ static void init_vmcs(struct vcpu_vmx *vmx) + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); +- vmcs_write64(GUEST_IA32_DEBUGCTL, 0); ++ ++ vmx_guest_debugctl_write(&vmx->vcpu, 0); + + if (cpu_has_vmx_tpr_shadow()) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index 99e3f46de2ec..b7ae263cde7b 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -445,6 +445,16 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); + u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated); + bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated); + ++static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val) ++{ ++ vmcs_write64(GUEST_IA32_DEBUGCTL, val); ++} ++ ++static inline u64 vmx_guest_debugctl_read(void) ++{ ++ return vmcs_read64(GUEST_IA32_DEBUGCTL); ++} ++ + /* + * Note, early Intel manuals have the write-low and read-high bitmap offsets + * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and +-- +2.50.1 + diff --git a/queue-6.1/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch b/queue-6.1/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch new file mode 100644 index 0000000000..bdc82eb3c9 --- /dev/null +++ b/queue-6.1/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch @@ -0,0 +1,138 @@ +From 36f7addde5e161c3ad08eccfdaaf6d318b6e6461 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:11:59 -0700 +Subject: KVM: x86: Convert vcpu_run()'s immediate exit param into a generic + bitmap + +From: Sean Christopherson + +[ Upstream commit 2478b1b220c49d25cb1c3f061ec4f9b351d9a131 ] + +Convert kvm_x86_ops.vcpu_run()'s "force_immediate_exit" boolean parameter +into an a generic bitmap so that similar "take action" information can be +passed to vendor code without creating a pile of boolean parameters. + +This will allow dropping kvm_x86_ops.set_dr6() in favor of a new flag, and +will also allow for adding similar functionality for re-loading debugctl +in the active VMCS. + +Opportunistically massage the TDX WARN and comment to prepare for adding +more run_flags, all of which are expected to be mutually exclusive with +TDX, i.e. should be WARNed on. + +No functional change intended. 
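+
+As a sketch of the resulting calling convention (the real code is in the
+x86.c hunk below):
+
+    u64 run_flags = 0;
+
+    if (req_immediate_exit)
+        run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
+
+    exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, run_flags);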
+ +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20250610232010.162191-3-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: drop TDX crud, account for lack of kvm_x86_call()] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm_host.h | 6 +++++- + arch/x86/kvm/svm/svm.c | 4 ++-- + arch/x86/kvm/vmx/vmx.c | 3 ++- + arch/x86/kvm/x86.c | 10 ++++++++-- + 4 files changed, 17 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 86f3bd6601e7..1383f5e5238a 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1456,6 +1456,10 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) + return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; + } + ++enum kvm_x86_run_flags { ++ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0), ++}; ++ + struct kvm_x86_ops { + const char *name; + +@@ -1529,7 +1533,7 @@ struct kvm_x86_ops { + + int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); + enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu, +- bool force_immediate_exit); ++ u64 run_flags); + int (*handle_exit)(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath); + int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index 12de50db401f..dc8a1b72d8ec 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4008,9 +4008,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in + guest_state_exit_irqoff(); + } + +-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, +- bool force_immediate_exit) ++static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) + { ++ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; + struct vcpu_svm *svm = to_svm(vcpu); + bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 179747d04edc..382f42200688 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -7204,8 +7204,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, + guest_state_exit_irqoff(); + } + +-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) ++static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) + { ++ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long cr3, cr4; + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 400a6e9fb0be..83e5e823cbae 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10591,6 +10591,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + dm_request_for_irq_injection(vcpu) && + kvm_cpu_accept_dm_intr(vcpu); + fastpath_t exit_fastpath; ++ u64 run_flags; + + bool req_immediate_exit = false; + +@@ -10811,8 +10812,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + goto cancel_injection; + } + +- if (req_immediate_exit) ++ run_flags = 0; ++ if (req_immediate_exit) { ++ run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT; + kvm_make_request(KVM_REQ_EVENT, vcpu); ++ } + + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) +@@ -10848,7 +10852,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); + +- exit_fastpath = 
static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit); ++ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, run_flags); + if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) + break; + +@@ -10860,6 +10864,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + break; + } + ++ run_flags = 0; ++ + /* Note, VM-Exits that go down the "slow" path are accounted below. */ + ++vcpu->stat.exits; + } +-- +2.50.1 + diff --git a/queue-6.1/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch b/queue-6.1/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch new file mode 100644 index 0000000000..6c0c8781d0 --- /dev/null +++ b/queue-6.1/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch @@ -0,0 +1,144 @@ +From 3a65689ab6b232b205f7e1d222883025eacb62d1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:12:00 -0700 +Subject: KVM: x86: Drop kvm_x86_ops.set_dr6() in favor of a new KVM_RUN flag + +From: Sean Christopherson + +[ Upstream commit 80c64c7afea1da6a93ebe88d3d29d8a60377ef80 ] + +Instruct vendor code to load the guest's DR6 into hardware via a new +KVM_RUN flag, and remove kvm_x86_ops.set_dr6(), whose sole purpose was to +load vcpu->arch.dr6 into hardware when DR6 can be read/written directly +by the guest. + +Note, TDX already WARNs on any run_flag being set, i.e. will yell if KVM +thinks DR6 needs to be reloaded. TDX vCPUs force KVM_DEBUGREG_AUTO_SWITCH +and never clear the flag, i.e. should never observe KVM_RUN_LOAD_GUEST_DR6. + +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20250610232010.162191-4-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: account for lack of vmx/main.c] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm-x86-ops.h | 1 - + arch/x86/include/asm/kvm_host.h | 2 +- + arch/x86/kvm/svm/svm.c | 10 ++++++---- + arch/x86/kvm/vmx/vmx.c | 10 +++------- + arch/x86/kvm/x86.c | 2 +- + 5 files changed, 11 insertions(+), 14 deletions(-) + +diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h +index 0e5ae3b0c867..c068565fe954 100644 +--- a/arch/x86/include/asm/kvm-x86-ops.h ++++ b/arch/x86/include/asm/kvm-x86-ops.h +@@ -47,7 +47,6 @@ KVM_X86_OP(set_idt) + KVM_X86_OP(get_gdt) + KVM_X86_OP(set_gdt) + KVM_X86_OP(sync_dirty_debug_regs) +-KVM_X86_OP(set_dr6) + KVM_X86_OP(set_dr7) + KVM_X86_OP(cache_reg) + KVM_X86_OP(get_rflags) +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 1383f5e5238a..c8fc4f2acf69 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1458,6 +1458,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) + + enum kvm_x86_run_flags { + KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0), ++ KVM_RUN_LOAD_GUEST_DR6 = BIT(1), + }; + + struct kvm_x86_ops { +@@ -1504,7 +1505,6 @@ struct kvm_x86_ops { + void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); + void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); + void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); +- void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); + void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); + void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); + unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index dc8a1b72d8ec..5a6bd9d5cceb 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4052,10 +4052,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu 
*vcpu, u64 run_flags) + svm_hv_update_vp_id(svm->vmcb, vcpu); + + /* +- * Run with all-zero DR6 unless needed, so that we can get the exact cause +- * of a #DB. ++ * Run with all-zero DR6 unless the guest can write DR6 freely, so that ++ * KVM can get the exact cause of a #DB. Note, loading guest DR6 from ++ * KVM's snapshot is only necessary when DR accesses won't exit. + */ +- if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) ++ if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6)) ++ svm_set_dr6(vcpu, vcpu->arch.dr6); ++ else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); + + clgi(); +@@ -4822,7 +4825,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { + .set_idt = svm_set_idt, + .get_gdt = svm_get_gdt, + .set_gdt = svm_set_gdt, +- .set_dr6 = svm_set_dr6, + .set_dr7 = svm_set_dr7, + .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, + .cache_reg = svm_cache_reg, +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 382f42200688..60d1ff3fca45 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -5530,12 +5530,6 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) + set_debugreg(DR6_RESERVED, 6); + } + +-static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +-{ +- lockdep_assert_irqs_disabled(); +- set_debugreg(vcpu->arch.dr6, 6); +-} +- + static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) + { + vmcs_writel(GUEST_DR7, val); +@@ -7251,6 +7245,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + vcpu->arch.regs_dirty = 0; + ++ if (run_flags & KVM_RUN_LOAD_GUEST_DR6) ++ set_debugreg(vcpu->arch.dr6, 6); ++ + /* + * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately + * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time +@@ -8208,7 +8205,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { + .set_idt = vmx_set_idt, + .get_gdt = vmx_get_gdt, + .set_gdt = vmx_set_gdt, +- .set_dr6 = vmx_set_dr6, + .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, + .cache_reg = vmx_cache_reg, +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 83e5e823cbae..9d66830d594c 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10833,7 +10833,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + set_debugreg(vcpu->arch.eff_db[3], 3); + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. 
*/ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) +- static_call(kvm_x86_set_dr6)(vcpu, vcpu->arch.dr6); ++ run_flags |= KVM_RUN_LOAD_GUEST_DR6; + } else if (unlikely(hw_breakpoint_active())) { + set_debugreg(0, 7); + } +-- +2.50.1 + diff --git a/queue-6.1/kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch b/queue-6.1/kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch new file mode 100644 index 0000000000..5c676853cb --- /dev/null +++ b/queue-6.1/kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch @@ -0,0 +1,265 @@ +From b596c99630a856d3912ec549084a96dd2546752f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:11:58 -0700 +Subject: KVM: x86: Fully defer to vendor code to decide how to force immediate + exit + +From: Sean Christopherson + +[ Upstream commit 0ec3d6d1f169baa7fc512ae4b78d17e7c94b7763 ] + +Now that vmx->req_immediate_exit is used only in the scope of +vmx_vcpu_run(), use force_immediate_exit to detect that KVM should usurp +the VMX preemption to force a VM-Exit and let vendor code fully handle +forcing a VM-Exit. + +Opportunsitically drop __kvm_request_immediate_exit() and just have +vendor code call smp_send_reschedule() directly. SVM already does this +when injecting an event while also trying to single-step an IRET, i.e. +it's not exactly secret knowledge that KVM uses a reschedule IPI to force +an exit. + +Link: https://lore.kernel.org/r/20240110012705.506918-7-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: resolve absurd conflict due to funky kvm_x86_ops.sched_in prototype] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm-x86-ops.h | 1 - + arch/x86/include/asm/kvm_host.h | 3 --- + arch/x86/kvm/svm/svm.c | 7 ++++--- + arch/x86/kvm/vmx/vmx.c | 32 +++++++++++++----------------- + arch/x86/kvm/vmx/vmx.h | 2 -- + arch/x86/kvm/x86.c | 10 +--------- + 6 files changed, 19 insertions(+), 36 deletions(-) + +diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h +index 29bef25ac77c..0e5ae3b0c867 100644 +--- a/arch/x86/include/asm/kvm-x86-ops.h ++++ b/arch/x86/include/asm/kvm-x86-ops.h +@@ -100,7 +100,6 @@ KVM_X86_OP(write_tsc_multiplier) + KVM_X86_OP(get_exit_info) + KVM_X86_OP(check_intercept) + KVM_X86_OP(handle_exit_irqoff) +-KVM_X86_OP(request_immediate_exit) + KVM_X86_OP(sched_in) + KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging) + KVM_X86_OP_OPTIONAL(vcpu_blocking) +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 93f523762854..86f3bd6601e7 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1590,8 +1590,6 @@ struct kvm_x86_ops { + struct x86_exception *exception); + void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); + +- void (*request_immediate_exit)(struct kvm_vcpu *vcpu); +- + void (*sched_in)(struct kvm_vcpu *kvm, int cpu); + + /* +@@ -2059,7 +2057,6 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); + + int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); + int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); +-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); + + void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, + u32 size); +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index 337a304d211b..12de50db401f 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4033,9 +4033,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, + * is 
enough to force an immediate vmexit. + */ + disable_nmi_singlestep(svm); +- smp_send_reschedule(vcpu->cpu); ++ force_immediate_exit = true; + } + ++ if (force_immediate_exit) ++ smp_send_reschedule(vcpu->cpu); ++ + pre_svm_run(vcpu); + + sync_lapic_to_cr8(vcpu); +@@ -4874,8 +4877,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { + .check_intercept = svm_check_intercept, + .handle_exit_irqoff = svm_handle_exit_irqoff, + +- .request_immediate_exit = __kvm_request_immediate_exit, +- + .sched_in = svm_sched_in, + + .nested_ops = &svm_nested_ops, +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 4db9d41d988c..179747d04edc 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -49,6 +49,8 @@ + #include + #include + ++#include ++ + #include "capabilities.h" + #include "cpuid.h" + #include "evmcs.h" +@@ -1223,8 +1225,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) + u16 fs_sel, gs_sel; + int i; + +- vmx->req_immediate_exit = false; +- + /* + * Note that guest MSRs to be saved/restored can also be changed + * when guest state is loaded. This happens when guest transitions +@@ -5929,7 +5929,8 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) + return 1; + } + +-static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) ++static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, ++ bool force_immediate_exit) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); + +@@ -5945,7 +5946,7 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) + * If the timer expired because KVM used it to force an immediate exit, + * then mission accomplished. + */ +- if (vmx->req_immediate_exit) ++ if (force_immediate_exit) + return EXIT_FASTPATH_EXIT_HANDLED; + + /* +@@ -7090,13 +7091,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) + msrs[i].host, false); + } + +-static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) ++static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 tscl; + u32 delta_tsc; + +- if (vmx->req_immediate_exit) { ++ if (force_immediate_exit) { + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); + vmx->loaded_vmcs->hv_timer_soft_disabled = false; + } else if (vmx->hv_deadline_tsc != -1) { +@@ -7149,7 +7150,8 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, + barrier_nospec(); + } + +-static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) ++static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, ++ bool force_immediate_exit) + { + /* + * If L2 is active, some VMX preemption timer exits can be handled in +@@ -7163,7 +7165,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) + case EXIT_REASON_MSR_WRITE: + return handle_fastpath_set_msr_irqoff(vcpu); + case EXIT_REASON_PREEMPTION_TIMER: +- return handle_fastpath_preemption_timer(vcpu); ++ return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); + default: + return EXIT_FASTPATH_NONE; + } +@@ -7284,7 +7286,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) + vmx_passthrough_lbr_msrs(vcpu); + + if (enable_preemption_timer) +- vmx_update_hv_timer(vcpu); ++ vmx_update_hv_timer(vcpu, force_immediate_exit); ++ else if (force_immediate_exit) ++ smp_send_reschedule(vcpu->cpu); + + kvm_wait_lapic_expire(vcpu); + +@@ -7358,7 +7362,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) + vmx_recover_nmi_blocking(vmx); + 
vmx_complete_interrupts(vmx); + +- return vmx_exit_handlers_fastpath(vcpu); ++ return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit); + } + + static void vmx_vcpu_free(struct kvm_vcpu *vcpu) +@@ -7865,11 +7869,6 @@ static __init void vmx_set_cpu_caps(void) + kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); + } + +-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) +-{ +- to_vmx(vcpu)->req_immediate_exit = true; +-} +- + static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info) + { +@@ -8275,8 +8274,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { + .check_intercept = vmx_check_intercept, + .handle_exit_irqoff = vmx_handle_exit_irqoff, + +- .request_immediate_exit = vmx_request_immediate_exit, +- + .sched_in = vmx_sched_in, + + .cpu_dirty_log_size = PML_ENTITY_NUM, +@@ -8533,7 +8530,6 @@ static __init int hardware_setup(void) + if (!enable_preemption_timer) { + vmx_x86_ops.set_hv_timer = NULL; + vmx_x86_ops.cancel_hv_timer = NULL; +- vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; + } + + kvm_caps.supported_mce_cap |= MCG_LMCE_P; +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index 357819872d80..ddbe73958d7f 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -343,8 +343,6 @@ struct vcpu_vmx { + unsigned int ple_window; + bool ple_window_dirty; + +- bool req_immediate_exit; +- + /* Support for PML */ + #define PML_ENTITY_NUM 512 + struct page *pml_pg; +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 08c3da88f402..400a6e9fb0be 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10578,12 +10578,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) + static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu); + } + +-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) +-{ +- smp_send_reschedule(vcpu->cpu); +-} +-EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); +- + /* + * Called within kvm->srcu read side. + * Returns 1 to let vcpu_run() continue the guest execution loop without +@@ -10817,10 +10811,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + goto cancel_injection; + } + +- if (req_immediate_exit) { ++ if (req_immediate_exit) + kvm_make_request(KVM_REQ_EVENT, vcpu); +- static_call(kvm_x86_request_immediate_exit)(vcpu); +- } + + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) +-- +2.50.1 + diff --git a/queue-6.1/kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch b/queue-6.1/kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch new file mode 100644 index 0000000000..2c2f2c1a4e --- /dev/null +++ b/queue-6.1/kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch @@ -0,0 +1,82 @@ +From 2ce55c36cca09ff95c3ba4cdb09407fc864500b0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:11:56 -0700 +Subject: KVM: x86: Move handling of is_guest_mode() into fastpath exit + handlers + +From: Sean Christopherson + +[ Upstream commit bf1a49436ea37b98dd2f37c57608951d0e28eecc ] + +Let the fastpath code decide which exits can/can't be handled in the +fastpath when L2 is active, e.g. when KVM generates a VMX preemption +timer exit to forcefully regain control, there is no "work" to be done and +so such exits can be handled in the fastpath regardless of whether L1 or +L2 is active. + +Moving the is_guest_mode() check into the fastpath code also makes it +easier to see that L2 isn't allowed to use the fastpath in most cases, +e.g. 
it's not immediately obvious why handle_fastpath_preemption_timer() +is called from the fastpath and the normal path. + +Link: https://lore.kernel.org/r/20240110012705.506918-5-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: resolve syntactic conflict in svm_exit_handlers_fastpath()] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/svm/svm.c | 6 +++--- + arch/x86/kvm/vmx/vmx.c | 6 +++--- + 2 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index b4283c2358a6..337a304d211b 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -3964,6 +3964,9 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) + { + struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + ++ if (is_guest_mode(vcpu)) ++ return EXIT_FASTPATH_NONE; ++ + /* + * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM + * can't read guest memory (dereference memslots) to decode the WRMSR. +@@ -4127,9 +4130,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, + + svm_complete_interrupts(vcpu); + +- if (is_guest_mode(vcpu)) +- return EXIT_FASTPATH_NONE; +- + return svm_exit_handlers_fastpath(vcpu); + } + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index c804ad001a79..18ceed9046a9 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -7138,6 +7138,9 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, + + static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) + { ++ if (is_guest_mode(vcpu)) ++ return EXIT_FASTPATH_NONE; ++ + switch (to_vmx(vcpu)->exit_reason.basic) { + case EXIT_REASON_MSR_WRITE: + return handle_fastpath_set_msr_irqoff(vcpu); +@@ -7337,9 +7340,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) + vmx_recover_nmi_blocking(vmx); + vmx_complete_interrupts(vmx); + +- if (is_guest_mode(vcpu)) +- return EXIT_FASTPATH_NONE; +- + return vmx_exit_handlers_fastpath(vcpu); + } + +-- +2.50.1 + diff --git a/queue-6.1/kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch b/queue-6.1/kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch new file mode 100644 index 0000000000..cac06bc481 --- /dev/null +++ b/queue-6.1/kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch @@ -0,0 +1,130 @@ +From b8df9da8aaf5d2d743800536dbd0bf0ec684f320 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:11:53 -0700 +Subject: KVM: x86: Plumb "force_immediate_exit" into kvm_entry() tracepoint + +From: Sean Christopherson + +[ Upstream commit 9c9025ea003a03f967affd690f39b4ef3452c0f5 ] + +Annotate the kvm_entry() tracepoint with "immediate exit" when KVM is +forcing a VM-Exit immediately after VM-Enter, e.g. when KVM wants to +inject an event but needs to first complete some other operation. +Knowing that KVM is (or isn't) forcing an exit is useful information when +debugging issues related to event injection. 
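+
+With the annotation in place, the trace output looks roughly like the
+following (RIP values are made up; the format comes from the TP_printk
+below):
+
+    kvm_entry: vcpu 0, rip 0xffffffff81234567
+    kvm_entry: vcpu 0, rip 0xffffffff81234567[immediate exit]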
+ +Suggested-by: Maxim Levitsky +Link: https://lore.kernel.org/r/20240110012705.506918-2-seanjc@google.com +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm_host.h | 3 ++- + arch/x86/kvm/svm/svm.c | 5 +++-- + arch/x86/kvm/trace.h | 9 ++++++--- + arch/x86/kvm/vmx/vmx.c | 4 ++-- + arch/x86/kvm/x86.c | 2 +- + 5 files changed, 14 insertions(+), 9 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 555c7bf35e28..93f523762854 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1528,7 +1528,8 @@ struct kvm_x86_ops { + void (*flush_tlb_guest)(struct kvm_vcpu *vcpu); + + int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); +- enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu); ++ enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu, ++ bool force_immediate_exit); + int (*handle_exit)(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath); + int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index 2c0f9c7d1242..b4283c2358a6 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4005,12 +4005,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in + guest_state_exit_irqoff(); + } + +-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) ++static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, ++ bool force_immediate_exit) + { + struct vcpu_svm *svm = to_svm(vcpu); + bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); + +- trace_kvm_entry(vcpu); ++ trace_kvm_entry(vcpu, force_immediate_exit); + + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; +diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h +index 6c1dcf44c4fa..ab407bc00d84 100644 +--- a/arch/x86/kvm/trace.h ++++ b/arch/x86/kvm/trace.h +@@ -15,20 +15,23 @@ + * Tracepoint for guest mode entry. + */ + TRACE_EVENT(kvm_entry, +- TP_PROTO(struct kvm_vcpu *vcpu), +- TP_ARGS(vcpu), ++ TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit), ++ TP_ARGS(vcpu, force_immediate_exit), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( unsigned long, rip ) ++ __field( bool, immediate_exit ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->rip = kvm_rip_read(vcpu); ++ __entry->immediate_exit = force_immediate_exit; + ), + +- TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip) ++ TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip, ++ __entry->immediate_exit ? 
"[immediate exit]" : "") + ); + + /* +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 390af16d9a67..0b495979a02b 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -7171,7 +7171,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, + guest_state_exit_irqoff(); + } + +-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) ++static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long cr3, cr4; +@@ -7198,7 +7198,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) + return EXIT_FASTPATH_NONE; + } + +- trace_kvm_entry(vcpu); ++ trace_kvm_entry(vcpu, force_immediate_exit); + + if (vmx->ple_window_dirty) { + vmx->ple_window_dirty = false; +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index d224180c56f5..08c3da88f402 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10856,7 +10856,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); + +- exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); ++ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit); + if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) + break; + +-- +2.50.1 + diff --git a/queue-6.1/kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch b/queue-6.1/kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch new file mode 100644 index 0000000000..7ad4d28931 --- /dev/null +++ b/queue-6.1/kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch @@ -0,0 +1,104 @@ +From 8aadc6631ffd7b08508de7b053eb6e237402d947 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:11:47 -0700 +Subject: KVM: x86: Plumb in the vCPU to kvm_x86_ops.hwapic_isr_update() + +From: Sean Christopherson + +[ Upstream commit 76bce9f10162cd4b36ac0b7889649b22baf70ebd ] + +Pass the target vCPU to the hwapic_isr_update() vendor hook so that VMX +can defer the update until after nested VM-Exit if an EOI for L1's vAPIC +occurs while L2 is active. + +Note, commit d39850f57d21 ("KVM: x86: Drop @vcpu parameter from +kvm_x86_ops.hwapic_isr_update()") removed the parameter with the +justification that doing so "allows for a decent amount of (future) +cleanup in the APIC code", but it's not at all clear what cleanup was +intended, or if it was ever realized. + +No functional change intended. 
+ +Cc: stable@vger.kernel.org +Reviewed-by: Chao Gao +Tested-by: Chao Gao +Link: https://lore.kernel.org/r/20241128000010.4051275-2-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: account for lack of kvm_x86_call(), drop vmx/x86_ops.h change] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm_host.h | 2 +- + arch/x86/kvm/lapic.c | 8 ++++---- + arch/x86/kvm/vmx/vmx.c | 2 +- + 3 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 17b4e61a52b9..6db42ee82032 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1552,7 +1552,7 @@ struct kvm_x86_ops { + bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason); + void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); + void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); +- void (*hwapic_isr_update)(int isr); ++ void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); + bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); + void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); + void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); +diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c +index 42eec987ac3d..3d65d6a023c9 100644 +--- a/arch/x86/kvm/lapic.c ++++ b/arch/x86/kvm/lapic.c +@@ -587,7 +587,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) + * just set SVI. + */ + if (unlikely(apic->apicv_active)) +- static_call_cond(kvm_x86_hwapic_isr_update)(vec); ++ static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, vec); + else { + ++apic->isr_count; + BUG_ON(apic->isr_count > MAX_APIC_VECTOR); +@@ -632,7 +632,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) + * and must be left alone. 
+ */ + if (unlikely(apic->apicv_active)) +- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); ++ static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic)); + else { + --apic->isr_count; + BUG_ON(apic->isr_count < 0); +@@ -2554,7 +2554,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) + if (apic->apicv_active) { + static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); + static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1); +- static_call_cond(kvm_x86_hwapic_isr_update)(-1); ++ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1); + } + + vcpu->arch.apic_arb_prio = 0; +@@ -2847,7 +2847,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) + if (apic->apicv_active) { + static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); + static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); +- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); ++ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); + } + kvm_make_request(KVM_REQ_EVENT, vcpu); + if (ioapic_in_kernel(vcpu->kvm)) +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 9a5cb896229f..721ba6ddb121 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -6708,7 +6708,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) + put_page(page); + } + +-static void vmx_hwapic_isr_update(int max_isr) ++static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) + { + u16 status; + u8 old; +-- +2.50.1 + diff --git a/queue-6.1/kvm-x86-pmu-gate-all-unimplemented-msr-prints-on-rep.patch b/queue-6.1/kvm-x86-pmu-gate-all-unimplemented-msr-prints-on-rep.patch new file mode 100644 index 0000000000..9d8f871812 --- /dev/null +++ b/queue-6.1/kvm-x86-pmu-gate-all-unimplemented-msr-prints-on-rep.patch @@ -0,0 +1,222 @@ +From 7a29b546168ea7252a909633d7860beb1599191b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:11:52 -0700 +Subject: KVM: x86/pmu: Gate all "unimplemented MSR" prints on + report_ignored_msrs + +From: Sean Christopherson + +[ Upstream commit e76ae52747a82a548742107b4100e90da41a624d ] + +Add helpers to print unimplemented MSR accesses and condition all such +prints on report_ignored_msrs, i.e. honor userspace's request to not +print unimplemented MSRs. Even though vcpu_unimpl() is ratelimited, +printing can still be problematic, e.g. if a print gets stalled when host +userspace is writing MSRs during live migration, an effective stall can +result in very noticeable disruption in the guest. + +E.g. the profile below was taken while calling KVM_SET_MSRS on the PMU +counters while the PMU was disabled in KVM. + + - 99.75% 0.00% [.] 
__ioctl + - __ioctl + - 99.74% entry_SYSCALL_64_after_hwframe + do_syscall_64 + sys_ioctl + - do_vfs_ioctl + - 92.48% kvm_vcpu_ioctl + - kvm_arch_vcpu_ioctl + - 85.12% kvm_set_msr_ignored_check + svm_set_msr + kvm_set_msr_common + printk + vprintk_func + vprintk_default + vprintk_emit + console_unlock + call_console_drivers + univ8250_console_write + serial8250_console_write + uart_console_write + +Reported-by: Aaron Lewis +Reviewed-by: Vitaly Kuznetsov +Link: https://lore.kernel.org/r/20230124234905.3774678-3-seanjc@google.com +Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs") +Signed-off-by: Sasha Levin +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/hyperv.c | 10 ++++------ + arch/x86/kvm/svm/svm.c | 5 ++--- + arch/x86/kvm/vmx/vmx.c | 4 +--- + arch/x86/kvm/x86.c | 18 +++++------------- + arch/x86/kvm/x86.h | 12 ++++++++++++ + 5 files changed, 24 insertions(+), 25 deletions(-) + +diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c +index 28555bbd52e8..cb0a531e13c5 100644 +--- a/arch/x86/kvm/hyperv.c ++++ b/arch/x86/kvm/hyperv.c +@@ -1406,8 +1406,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return syndbg_set_msr(vcpu, msr, data, host); + default: +- vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", +- msr, data); ++ kvm_pr_unimpl_wrmsr(vcpu, msr, data); + return 1; + } + return 0; +@@ -1528,8 +1527,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) + return 1; + break; + default: +- vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", +- msr, data); ++ kvm_pr_unimpl_wrmsr(vcpu, msr, data); + return 1; + } + +@@ -1581,7 +1579,7 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, + case HV_X64_MSR_SYNDBG_CONTROL ... 
HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return syndbg_get_msr(vcpu, msr, pdata, host); + default: +- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); ++ kvm_pr_unimpl_rdmsr(vcpu, msr); + return 1; + } + +@@ -1646,7 +1644,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, + data = APIC_BUS_FREQUENCY; + break; + default: +- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); ++ kvm_pr_unimpl_rdmsr(vcpu, msr); + return 1; + } + *pdata = data; +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index b922f31d1415..2c0f9c7d1242 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -3035,8 +3035,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) + break; + case MSR_IA32_DEBUGCTLMSR: + if (!lbrv) { +- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", +- __func__, data); ++ kvm_pr_unimpl_wrmsr(vcpu, ecx, data); + break; + } + +@@ -3077,7 +3076,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) + case MSR_VM_CR: + return svm_set_vm_cr(vcpu, data); + case MSR_VM_IGNNE: +- vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); ++ kvm_pr_unimpl_wrmsr(vcpu, ecx, data); + break; + case MSR_AMD64_DE_CFG: { + struct kvm_msr_entry msr_entry; +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index c24da2cff208..390af16d9a67 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -2140,9 +2140,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + + invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); + if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { +- if (report_ignored_msrs) +- vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", +- __func__, data); ++ kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); + data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); + invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); + } +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index b0ae61ba9b99..d224180c56f5 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -3573,7 +3573,6 @@ static void record_steal_time(struct kvm_vcpu *vcpu) + + int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + { +- bool pr = false; + u32 msr = msr_info->index; + u64 data = msr_info->data; + +@@ -3625,15 +3624,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + if (data == BIT_ULL(18)) { + vcpu->arch.msr_hwcr = data; + } else if (data != 0) { +- vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", +- data); ++ kvm_pr_unimpl_wrmsr(vcpu, msr, data); + return 1; + } + break; + case MSR_FAM10H_MMIO_CONF_BASE: + if (data != 0) { +- vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " +- "0x%llx\n", data); ++ kvm_pr_unimpl_wrmsr(vcpu, msr, data); + return 1; + } + break; +@@ -3813,16 +3810,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + + case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: + case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: +- pr = true; +- fallthrough; + case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: + case MSR_P6_EVNTSEL0 ... 
MSR_P6_EVNTSEL1: + if (kvm_pmu_is_valid_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr_info); + +- if (pr || data != 0) +- vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " +- "0x%x data 0x%llx\n", msr, data); ++ if (data) ++ kvm_pr_unimpl_wrmsr(vcpu, msr, data); + break; + case MSR_K7_CLK_CTL: + /* +@@ -3849,9 +3843,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + /* Drop writes to this legacy MSR -- see rdmsr + * counterpart for further detail. + */ +- if (report_ignored_msrs) +- vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", +- msr, data); ++ kvm_pr_unimpl_wrmsr(vcpu, msr, data); + break; + case MSR_AMD64_OSVW_ID_LENGTH: + if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) +diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h +index 9de72586f406..f3554bf05201 100644 +--- a/arch/x86/kvm/x86.h ++++ b/arch/x86/kvm/x86.h +@@ -331,6 +331,18 @@ extern bool report_ignored_msrs; + + extern bool eager_page_split; + ++static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) ++{ ++ if (report_ignored_msrs) ++ vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data); ++} ++ ++static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr) ++{ ++ if (report_ignored_msrs) ++ vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr); ++} ++ + static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) + { + return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, +-- +2.50.1 + diff --git a/queue-6.1/kvm-x86-re-split-x2apic-icr-into-icr-icr2-for-amd-x2.patch b/queue-6.1/kvm-x86-re-split-x2apic-icr-into-icr-icr2-for-amd-x2.patch new file mode 100644 index 0000000000..fa413c4e05 --- /dev/null +++ b/queue-6.1/kvm-x86-re-split-x2apic-icr-into-icr-icr2-for-amd-x2.patch @@ -0,0 +1,162 @@ +From c53c4c4220e372f9a392cb4dd337b2ddd5b5596a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:11:46 -0700 +Subject: KVM: x86: Re-split x2APIC ICR into ICR+ICR2 for AMD (x2AVIC) + +From: Sean Christopherson + +[ Upstream commit 73b42dc69be8564d4951a14d00f827929fe5ef79 ] + +Re-introduce the "split" x2APIC ICR storage that KVM used prior to Intel's +IPI virtualization support, but only for AMD. While not stated anywhere +in the APM, despite stating the ICR is a single 64-bit register, AMD CPUs +store the 64-bit ICR as two separate 32-bit values in ICR and ICR2. When +IPI virtualization (IPIv on Intel, all AVIC flavors on AMD) is enabled, +KVM needs to match CPU behavior as some ICR ICR writes will be handled by +the CPU, not by KVM. + +Add a kvm_x86_ops knob to control the underlying format used by the CPU to +store the x2APIC ICR, and tune it to AMD vs. Intel regardless of whether +or not x2AVIC is enabled. If KVM is handling all ICR writes, the storage +format for x2APIC mode doesn't matter, and having the behavior follow AMD +versus Intel will provide better test coverage and ease debugging. 
+ +Fixes: 4d1d7942e36a ("KVM: SVM: Introduce logic to (de)activate x2AVIC mode") +Cc: stable@vger.kernel.org +Cc: Maxim Levitsky +Cc: Suravee Suthikulpanit +Link: https://lore.kernel.org/r/20240719235107.3023592-4-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: resolve minor syntatic conflicts] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm_host.h | 2 ++ + arch/x86/kvm/lapic.c | 42 +++++++++++++++++++++++---------- + arch/x86/kvm/svm/svm.c | 2 ++ + arch/x86/kvm/vmx/vmx.c | 2 ++ + 4 files changed, 36 insertions(+), 12 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index eb06c2f68314..17b4e61a52b9 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1547,6 +1547,8 @@ struct kvm_x86_ops { + void (*enable_nmi_window)(struct kvm_vcpu *vcpu); + void (*enable_irq_window)(struct kvm_vcpu *vcpu); + void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); ++ ++ const bool x2apic_icr_is_split; + bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason); + void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); + void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); +diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c +index 7f57dce5c828..42eec987ac3d 100644 +--- a/arch/x86/kvm/lapic.c ++++ b/arch/x86/kvm/lapic.c +@@ -2315,11 +2315,25 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) + data &= ~APIC_ICR_BUSY; + + kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); +- kvm_lapic_set_reg64(apic, APIC_ICR, data); ++ if (kvm_x86_ops.x2apic_icr_is_split) { ++ kvm_lapic_set_reg(apic, APIC_ICR, data); ++ kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32); ++ } else { ++ kvm_lapic_set_reg64(apic, APIC_ICR, data); ++ } + trace_kvm_apic_write(APIC_ICR, data); + return 0; + } + ++static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic) ++{ ++ if (kvm_x86_ops.x2apic_icr_is_split) ++ return (u64)kvm_lapic_get_reg(apic, APIC_ICR) | ++ (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32; ++ ++ return kvm_lapic_get_reg64(apic, APIC_ICR); ++} ++ + /* emulate APIC access in a trap manner */ + void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) + { +@@ -2337,7 +2351,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) + * maybe-unecessary write, and both are in the noise anyways. + */ + if (apic_x2apic_mode(apic) && offset == APIC_ICR) +- WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR))); ++ WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic))); + else + kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); + } +@@ -2760,18 +2774,22 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, + + /* + * In x2APIC mode, the LDR is fixed and based on the id. And +- * ICR is internally a single 64-bit register, but needs to be +- * split to ICR+ICR2 in userspace for backwards compatibility. ++ * if the ICR is _not_ split, ICR is internally a single 64-bit ++ * register, but needs to be split to ICR+ICR2 in userspace for ++ * backwards compatibility. 
+ */ +- if (set) { ++ if (set) + *ldr = kvm_apic_calc_x2apic_ldr(*id); + +- icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | +- (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; +- __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); +- } else { +- icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); +- __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); ++ if (!kvm_x86_ops.x2apic_icr_is_split) { ++ if (set) { ++ icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | ++ (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; ++ __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); ++ } else { ++ icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); ++ __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); ++ } + } + } + +@@ -2971,7 +2989,7 @@ static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) + u32 low; + + if (reg == APIC_ICR) { +- *data = kvm_lapic_get_reg64(apic, APIC_ICR); ++ *data = kvm_x2apic_icr_read(apic); + return 0; + } + +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index c95a84afc35f..b922f31d1415 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4851,6 +4851,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { + .enable_nmi_window = svm_enable_nmi_window, + .enable_irq_window = svm_enable_irq_window, + .update_cr8_intercept = svm_update_cr8_intercept, ++ ++ .x2apic_icr_is_split = true, + .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, + .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, + .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index fbe26b88f731..9a5cb896229f 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -8202,6 +8202,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { + .enable_nmi_window = vmx_enable_nmi_window, + .enable_irq_window = vmx_enable_irq_window, + .update_cr8_intercept = vmx_update_cr8_intercept, ++ ++ .x2apic_icr_is_split = false, + .set_virtual_apic_mode = vmx_set_virtual_apic_mode, + .set_apic_access_page_addr = vmx_set_apic_access_page_addr, + .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, +-- +2.50.1 + diff --git a/queue-6.1/kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch b/queue-6.1/kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch new file mode 100644 index 0000000000..f70020876f --- /dev/null +++ b/queue-6.1/kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch @@ -0,0 +1,48 @@ +From ac35d395216d2db6535082fde4a62a3ee3849d40 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:11:51 -0700 +Subject: KVM: x86: Snapshot the host's DEBUGCTL after disabling IRQs + +From: Sean Christopherson + +[ Upstream commit 189ecdb3e112da703ac0699f4ec76aa78122f911 ] + +Snapshot the host's DEBUGCTL after disabling IRQs, as perf can toggle +debugctl bits from IRQ context, e.g. when enabling/disabling events via +smp_call_function_single(). Taking the snapshot (long) before IRQs are +disabled could result in KVM effectively clobbering DEBUGCTL due to using +a stale snapshot. 
+ +Cc: stable@vger.kernel.org +Reviewed-and-tested-by: Ravi Bangoria +Link: https://lore.kernel.org/r/20250227222411.3490595-6-seanjc@google.com +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/x86.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index ba24bb50af57..b0ae61ba9b99 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -4742,7 +4742,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + + /* Save host pkru register if supported */ + vcpu->arch.host_pkru = read_pkru(); +- vcpu->arch.host_debugctl = get_debugctlmsr(); + + /* Apply any externally detected TSC adjustments (due to suspend) */ + if (unlikely(vcpu->arch.tsc_offset_adjustment)) { +@@ -10851,6 +10850,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + set_debugreg(0, 7); + } + ++ vcpu->arch.host_debugctl = get_debugctlmsr(); ++ + guest_timing_enter_irqoff(); + + for (;;) { +-- +2.50.1 + diff --git a/queue-6.1/kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch b/queue-6.1/kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch new file mode 100644 index 0000000000..94ab9eac53 --- /dev/null +++ b/queue-6.1/kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch @@ -0,0 +1,100 @@ +From 52e78074c894adecdf2fb1d987959707ce46beed Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:11:50 -0700 +Subject: KVM: x86: Snapshot the host's DEBUGCTL in common x86 + +From: Sean Christopherson + +[ Upstream commit fb71c795935652fa20eaf9517ca9547f5af99a76 ] + +Move KVM's snapshot of DEBUGCTL to kvm_vcpu_arch and take the snapshot in +common x86, so that SVM can also use the snapshot. + +Opportunistically change the field to a u64. While bits 63:32 are reserved +on AMD, not mentioned at all in Intel's SDM, and managed as an "unsigned +long" by the kernel, DEBUGCTL is an MSR and therefore a 64-bit value. + +Reviewed-by: Xiaoyao Li +Cc: stable@vger.kernel.org +Reviewed-and-tested-by: Ravi Bangoria +Link: https://lore.kernel.org/r/20250227222411.3490595-4-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: resolve minor syntatic conflict in vmx_vcpu_load()] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm_host.h | 1 + + arch/x86/kvm/vmx/vmx.c | 8 ++------ + arch/x86/kvm/vmx/vmx.h | 2 -- + arch/x86/kvm/x86.c | 1 + + 4 files changed, 4 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 6db42ee82032..555c7bf35e28 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -677,6 +677,7 @@ struct kvm_vcpu_arch { + u32 pkru; + u32 hflags; + u64 efer; ++ u64 host_debugctl; + u64 apic_base; + struct kvm_lapic *apic; /* kernel irqchip context */ + bool load_eoi_exitmap_pending; +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 7b87fbc69b21..c24da2cff208 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -1418,13 +1418,9 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, + */ + static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + { +- struct vcpu_vmx *vmx = to_vmx(vcpu); +- + vmx_vcpu_load_vmcs(vcpu, cpu, NULL); + + vmx_vcpu_pi_load(vcpu, cpu); +- +- vmx->host_debugctlmsr = get_debugctlmsr(); + } + + static void vmx_vcpu_put(struct kvm_vcpu *vcpu) +@@ -7275,8 +7271,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) + } + + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. 
Restore it if needed */ +- if (vmx->host_debugctlmsr) +- update_debugctlmsr(vmx->host_debugctlmsr); ++ if (vcpu->arch.host_debugctl) ++ update_debugctlmsr(vcpu->arch.host_debugctl); + + #ifndef CONFIG_X86_64 + /* +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index 8b4b149bd9c1..357819872d80 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -352,8 +352,6 @@ struct vcpu_vmx { + /* apic deadline value in host tsc */ + u64 hv_deadline_tsc; + +- unsigned long host_debugctlmsr; +- + /* + * Only bits masked by msr_ia32_feature_control_valid_bits can be set in + * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index a6dc8f662fa4..ba24bb50af57 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -4742,6 +4742,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + + /* Save host pkru register if supported */ + vcpu->arch.host_pkru = read_pkru(); ++ vcpu->arch.host_debugctl = get_debugctlmsr(); + + /* Apply any externally detected TSC adjustments (due to suspend) */ + if (unlikely(vcpu->arch.tsc_offset_adjustment)) { +-- +2.50.1 + diff --git a/queue-6.1/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-6.1/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch new file mode 100644 index 0000000000..95b6bda6e3 --- /dev/null +++ b/queue-6.1/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch @@ -0,0 +1,129 @@ +From 70d909202444ad2c328a4944d265dc9ad7efe92a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 1 Aug 2025 17:25:08 +0200 +Subject: netfilter: ctnetlink: fix refcount leak on table dump + +From: Florian Westphal + +[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ] + +There is a reference count leak in ctnetlink_dump_table(): + if (res < 0) { + nf_conntrack_get(&ct->ct_general); // HERE + cb->args[1] = (unsigned long)ct; + ... + +While its very unlikely, its possible that ct == last. +If this happens, then the refcount of ct was already incremented. +This 2nd increment is never undone. + +This prevents the conntrack object from being released, which in turn +keeps prevents cnet->count from dropping back to 0. + +This will then block the netns dismantle (or conntrack rmmod) as +nf_conntrack_cleanup_net_list() will wait forever. + +This can be reproduced by running conntrack_resize.sh selftest in a loop. +It takes ~20 minutes for me on a preemptible kernel on average before +I see a runaway kworker spinning in nf_conntrack_cleanup_net_list. + +One fix would to change this to: + if (res < 0) { + if (ct != last) + nf_conntrack_get(&ct->ct_general); + +But this reference counting isn't needed in the first place. +We can just store a cookie value instead. + +A followup patch will do the same for ctnetlink_exp_dump_table, +it looks to me as if this has the same problem and like +ctnetlink_dump_table, we only need a 'skip hint', not the actual +object so we can apply the same cookie strategy there as well. 
+ +Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c +index 2cf58a8b8e4d..d3e28574ceb9 100644 +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -859,8 +859,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) + + static int ctnetlink_done(struct netlink_callback *cb) + { +- if (cb->args[1]) +- nf_ct_put((struct nf_conn *)cb->args[1]); + kfree(cb->data); + return 0; + } +@@ -1175,19 +1173,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) + return 0; + } + ++static unsigned long ctnetlink_get_id(const struct nf_conn *ct) ++{ ++ unsigned long id = nf_ct_get_id(ct); ++ ++ return id ? id : 1; ++} ++ + static int + ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + { + unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0; + struct net *net = sock_net(skb->sk); +- struct nf_conn *ct, *last; ++ unsigned long last_id = cb->args[1]; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nf_conn *nf_ct_evict[8]; ++ struct nf_conn *ct; + int res, i; + spinlock_t *lockp; + +- last = (struct nf_conn *)cb->args[1]; + i = 0; + + local_bh_disable(); +@@ -1224,7 +1229,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + continue; + + if (cb->args[1]) { +- if (ct != last) ++ if (ctnetlink_get_id(ct) != last_id) + continue; + cb->args[1] = 0; + } +@@ -1237,8 +1242,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct, true, flags); + if (res < 0) { +- nf_conntrack_get(&ct->ct_general); +- cb->args[1] = (unsigned long)ct; ++ cb->args[1] = ctnetlink_get_id(ct); + spin_unlock(lockp); + goto out; + } +@@ -1251,12 +1255,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + } + out: + local_bh_enable(); +- if (last) { ++ if (last_id) { + /* nf ct hash resize happened, now clear the leftover. 
*/ +- if ((struct nf_conn *)cb->args[1] == last) ++ if (cb->args[1] == last_id) + cb->args[1] = 0; +- +- nf_ct_put(last); + } + + while (i) { +-- +2.50.1 + diff --git a/queue-6.1/series b/queue-6.1/series index 3df0a6ae83..c1c590bc03 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -24,3 +24,25 @@ eventpoll-fix-semi-unbounded-recursion.patch documentation-acpi-fix-parent-device-references.patch acpi-processor-perflib-fix-initial-_ppc-limit-application.patch acpi-processor-perflib-move-problematic-pr-performance-check.patch +kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch +kvm-x86-re-split-x2apic-icr-into-icr-icr2-for-amd-x2.patch +kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch +kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch +kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch +kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch +kvm-x86-pmu-gate-all-unimplemented-msr-prints-on-rep.patch +kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch +kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch +kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch +kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch +kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch +kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch +kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch +kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch +kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch +kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch +kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch +kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch +kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch +udp-also-consider-secpath-when-evaluating-ipsec-use-.patch +netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch diff --git a/queue-6.1/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-6.1/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch new file mode 100644 index 0000000000..bdf07f42d0 --- /dev/null +++ b/queue-6.1/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch @@ -0,0 +1,51 @@ +From 21b9dfb2ec919b8b4561d84dd45c0ee4799c62d0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Aug 2025 11:26:27 +0200 +Subject: udp: also consider secpath when evaluating ipsec use for checksumming + +From: Sabrina Dubroca + +[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ] + +Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in +IPsec case") tried to fix checksumming in UFO when the packets are +going through IPsec, so that we can't rely on offloads because the UDP +header and payload will be encrypted. + +But when doing a TCP test over VXLAN going through IPsec transport +mode with GSO enabled (esp4_offload module loaded), I'm seeing broken +UDP checksums on the encap after successful decryption. + +The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via +__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this +point we've already dropped the dst (unless the device sets +IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and +we proceed with checksum offload. + +Make need_ipsec also check the secpath, which is not dropped on this +callpath. 
+ +Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case") +Signed-off-by: Sabrina Dubroca +Signed-off-by: Steffen Klassert +Signed-off-by: Sasha Levin +--- + net/ipv4/udp_offload.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c +index 1a51c4b44c00..593108049ab7 100644 +--- a/net/ipv4/udp_offload.c ++++ b/net/ipv4/udp_offload.c +@@ -60,7 +60,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, + remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); + skb->remcsum_offload = remcsum; + +- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); ++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb); + /* Try to offload checksum if possible */ + offload_csum = !!(need_csum && + !need_ipsec && +-- +2.50.1 + diff --git a/queue-6.12/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch b/queue-6.12/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch new file mode 100644 index 0000000000..246baaa7bf --- /dev/null +++ b/queue-6.12/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch @@ -0,0 +1,91 @@ +From a05287bd1654c451e1eb7b9e28de5ef9f1b9d901 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 11 Aug 2025 17:03:11 +0200 +Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data + +From: Rafael J. Wysocki + +[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ] + +Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid +discarding useful information") caused the number of wakeup interrupts +to increase on an idle system [1], which was not expected to happen +after merely allowing shallower idle states to be selected by the +governor in some cases. + +However, on the system in question, all of the idle states deeper than +WFI are rejected by the driver due to a firmware issue [2]. This causes +the governor to only consider the recent interval duriation data +corresponding to attempts to enter WFI that are successful and the +recent invervals table is filled with values lower than the scheduler +tick period. Consequently, the governor predicts an idle duration +below the scheduler tick period length and avoids stopping the tick +more often which leads to the observed symptom. + +Address it by modifying the governor to update the recent intervals +table also when entering the previously selected idle state fails, so +it knows that the short idle intervals might have been the minority +had the selected idle states been actually entered every time. + +Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information") +Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1] +Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2] +Signed-off-by: Rafael J. 
Wysocki +Tested-by: Christian Loehle +Tested-by: Marc Zyngier +Reviewed-by: Christian Loehle +Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki +Signed-off-by: Sasha Levin +--- + drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++---- + 1 file changed, 17 insertions(+), 4 deletions(-) + +diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c +index 97ffadc7e57a..01322a905414 100644 +--- a/drivers/cpuidle/governors/menu.c ++++ b/drivers/cpuidle/governors/menu.c +@@ -153,6 +153,14 @@ static inline int performance_multiplier(unsigned int nr_iowaiters) + + static DEFINE_PER_CPU(struct menu_device, menu_devices); + ++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us) ++{ ++ /* Update the repeating-pattern data. */ ++ data->intervals[data->interval_ptr++] = interval_us; ++ if (data->interval_ptr >= INTERVALS) ++ data->interval_ptr = 0; ++} ++ + static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); + + /* +@@ -277,6 +285,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, + if (data->needs_update) { + menu_update(drv, dev); + data->needs_update = 0; ++ } else if (!dev->last_residency_ns) { ++ /* ++ * This happens when the driver rejects the previously selected ++ * idle state and returns an error, so update the recent ++ * intervals table to prevent invalid information from being ++ * used going forward. ++ */ ++ menu_update_intervals(data, UINT_MAX); + } + + nr_iowaiters = nr_iowait_cpu(dev->cpu); +@@ -546,10 +562,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) + + data->correction_factor[data->bucket] = new_factor; + +- /* update the repeating-pattern data */ +- data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns); +- if (data->interval_ptr >= INTERVALS) +- data->interval_ptr = 0; ++ menu_update_intervals(data, ktime_to_us(measured_ns)); + } + + /** +-- +2.50.1 + diff --git a/queue-6.12/habanalabs-fix-uaf-in-export_dmabuf.patch b/queue-6.12/habanalabs-fix-uaf-in-export_dmabuf.patch new file mode 100644 index 0000000000..852df8f157 --- /dev/null +++ b/queue-6.12/habanalabs-fix-uaf-in-export_dmabuf.patch @@ -0,0 +1,96 @@ +From b5a874d6221e42baa1685f2af96f79fd75b92995 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 12 Jul 2025 06:02:31 +0100 +Subject: habanalabs: fix UAF in export_dmabuf() + +From: Al Viro + +[ Upstream commit 33927f3d0ecdcff06326d6e4edb6166aed42811c ] + +As soon as we'd inserted a file reference into descriptor table, another +thread could close it. That's fine for the case when all we are doing is +returning that descriptor to userland (it's a race, but it's a userland +race and there's nothing the kernel can do about it). However, if we +follow fd_install() with any kind of access to objects that would be +destroyed on close (be it the struct file itself or anything destroyed +by its ->release()), we have a UAF. + +dma_buf_fd() is a combination of reserving a descriptor and fd_install(). +habanalabs export_dmabuf() calls it and then proceeds to access the +objects destroyed on close. In particular, it grabs an extra reference to +another struct file that will be dropped as part of ->release() for ours; +that "will be" is actually "might have already been". + +Fix that by reserving descriptor before anything else and do fd_install() +only when everything had been set up. 
As a side benefit, we no longer +have the failure exit with file already created, but reference to +underlying file (as well as ->dmabuf_export_cnt, etc.) not grabbed yet; +unlike dma_buf_fd(), fd_install() can't fail. + +Fixes: db1a8dd916aa ("habanalabs: add support for dma-buf exporter") +Signed-off-by: Al Viro +Signed-off-by: Sasha Levin +--- + drivers/accel/habanalabs/common/memory.c | 23 +++++++---------------- + 1 file changed, 7 insertions(+), 16 deletions(-) + +diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c +index 3348ad12c237..11c55fd76db5 100644 +--- a/drivers/accel/habanalabs/common/memory.c ++++ b/drivers/accel/habanalabs/common/memory.c +@@ -1829,9 +1829,6 @@ static void hl_release_dmabuf(struct dma_buf *dmabuf) + struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv; + struct hl_ctx *ctx; + +- if (!hl_dmabuf) +- return; +- + ctx = hl_dmabuf->ctx; + + if (hl_dmabuf->memhash_hnode) +@@ -1859,7 +1856,12 @@ static int export_dmabuf(struct hl_ctx *ctx, + { + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct hl_device *hdev = ctx->hdev; +- int rc, fd; ++ CLASS(get_unused_fd, fd)(flags); ++ ++ if (fd < 0) { ++ dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd); ++ return fd; ++ } + + exp_info.ops = &habanalabs_dmabuf_ops; + exp_info.size = total_size; +@@ -1872,13 +1874,6 @@ static int export_dmabuf(struct hl_ctx *ctx, + return PTR_ERR(hl_dmabuf->dmabuf); + } + +- fd = dma_buf_fd(hl_dmabuf->dmabuf, flags); +- if (fd < 0) { +- dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd); +- rc = fd; +- goto err_dma_buf_put; +- } +- + hl_dmabuf->ctx = ctx; + hl_ctx_get(hl_dmabuf->ctx); + atomic_inc(&ctx->hdev->dmabuf_export_cnt); +@@ -1890,13 +1885,9 @@ static int export_dmabuf(struct hl_ctx *ctx, + get_file(ctx->hpriv->file_priv->filp); + + *dmabuf_fd = fd; ++ fd_install(take_fd(fd), hl_dmabuf->dmabuf->file); + + return 0; +- +-err_dma_buf_put: +- hl_dmabuf->dmabuf->priv = NULL; +- dma_buf_put(hl_dmabuf->dmabuf); +- return rc; + } + + static int validate_export_params_common(struct hl_device *hdev, u64 addr, u64 size, u64 offset) +-- +2.50.1 + diff --git a/queue-6.12/intel_idle-allow-loading-acpi-tables-for-any-family.patch b/queue-6.12/intel_idle-allow-loading-acpi-tables-for-any-family.patch new file mode 100644 index 0000000000..4cc26eab14 --- /dev/null +++ b/queue-6.12/intel_idle-allow-loading-acpi-tables-for-any-family.patch @@ -0,0 +1,41 @@ +From b77cd82a6114ef64d2a6bf354fbd8a7e91c721fd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 8 Aug 2025 15:37:14 -0400 +Subject: intel_idle: Allow loading ACPI tables for any family + +From: Len Brown + +[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ] + +There is no reason to limit intel_idle's loading of ACPI tables to +family 6. Upcoming Intel processors are not in family 6. + +Below "Fixes" really means "applies cleanly until". +That syntax commit didn't change the previous logic, +but shows this patch applies back 5-years. + +Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros") +Signed-off-by: Len Brown +Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com +Signed-off-by: Rafael J. 
Wysocki +Signed-off-by: Sasha Levin +--- + drivers/idle/intel_idle.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 524ed143f875..4506e1cc4b65 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -1608,7 +1608,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { + }; + + static const struct x86_cpu_id intel_mwait_ids[] __initconst = { +- X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL), ++ X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL), + {} + }; + +-- +2.50.1 + diff --git a/queue-6.12/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch b/queue-6.12/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch new file mode 100644 index 0000000000..34ddf0c4e9 --- /dev/null +++ b/queue-6.12/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch @@ -0,0 +1,117 @@ +From c61650533f1bd7068592df158f48962bfcd8bd98 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:57:23 -0700 +Subject: KVM: nVMX: Check vmcs12->guest_ia32_debugctl on nested VM-Enter + +From: Maxim Levitsky + +[ Upstream commit 095686e6fcb4150f0a55b1a25987fad3d8af58d6 ] + +Add a consistency check for L2's guest_ia32_debugctl, as KVM only supports +a subset of hardware functionality, i.e. KVM can't rely on hardware to +detect illegal/unsupported values. Failure to check the vmcs12 value +would allow the guest to load any harware-supported value while running L2. + +Take care to exempt BTF and LBR from the validity check in order to match +KVM's behavior for writes via WRMSR, but without clobbering vmcs12. Even +if VM_EXIT_SAVE_DEBUG_CONTROLS is set in vmcs12, L1 can reasonably expect +that vmcs12->guest_ia32_debugctl will not be modified if writes to the MSR +are being intercepted. + +Arguably, KVM _should_ update vmcs12 if VM_EXIT_SAVE_DEBUG_CONTROLS is set +*and* writes to MSR_IA32_DEBUGCTLMSR are not being intercepted by L1, but +that would incur non-trivial complexity and wouldn't change the fact that +KVM's handling of DEBUGCTL is blatantly broken. I.e. the extra complexity +is not worth carrying. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Maxim Levitsky +Co-developed-by: Sean Christopherson +Link: https://lore.kernel.org/r/20250610232010.162191-7-seanjc@google.com +Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs") +Signed-off-by: Sasha Levin +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/nested.c | 12 ++++++++++-- + arch/x86/kvm/vmx/vmx.c | 5 ++--- + arch/x86/kvm/vmx/vmx.h | 3 +++ + 3 files changed, 15 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c +index 903e874041ac..1e0b9f92ff18 100644 +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -2653,7 +2653,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); +- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); ++ vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl & ++ vmx_get_supported_debugctl(vcpu, false)); + } else { + kvm_set_dr(vcpu, 7, vcpu->arch.dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); +@@ -3135,7 +3136,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, + return -EINVAL; + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && +- CC(!kvm_dr7_valid(vmcs12->guest_dr7))) ++ (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || ++ CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) + return -EINVAL; + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && +@@ -4576,6 +4578,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) + (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | + (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); + ++ /* ++ * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. ++ * Writes to DEBUGCTL that aren't intercepted by L1 are immediately ++ * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into ++ * vmcs02 doesn't strictly track vmcs12. 
++ */ + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) + vmcs12->guest_dr7 = vcpu->arch.dr7; + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index ff61093e9af7..50d45c18fce9 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -2173,7 +2173,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, + return (unsigned long)data; + } + +-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) ++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) + { + u64 debugctl = 0; + +@@ -2192,8 +2192,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated + return debugctl; + } + +-static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, +- bool host_initiated) ++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated) + { + u64 invalid; + +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index cf57fbf12104..ee330d14089d 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -435,6 +435,9 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, + + void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); + ++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated); ++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated); ++ + /* + * Note, early Intel manuals have the write-low and read-high bitmap offsets + * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and +-- +2.50.1 + diff --git a/queue-6.12/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch b/queue-6.12/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch new file mode 100644 index 0000000000..9117d3e144 --- /dev/null +++ b/queue-6.12/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch @@ -0,0 +1,63 @@ +From 442fe2ed58d95e8ffd4c75c29b7f1d884bce1d02 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:57:21 -0700 +Subject: KVM: VMX: Allow guest to set DEBUGCTL.RTM_DEBUG if RTM is supported + +From: Sean Christopherson + +[ Upstream commit 17ec2f965344ee3fd6620bef7ef68792f4ac3af0 ] + +Let the guest set DEBUGCTL.RTM_DEBUG if RTM is supported according to the +guest CPUID model, as debug support is supposed to be available if RTM is +supported, and there are no known downsides to letting the guest debug RTM +aborts. + +Note, there are no known bug reports related to RTM_DEBUG, the primary +motivation is to reduce the probability of breaking existing guests when a +future change adds a missing consistency check on vmcs12.GUEST_DEBUGCTL +(KVM currently lets L2 run with whatever hardware supports; whoops). + +Note #2, KVM already emulates DR6.RTM, and doesn't restrict access to +DR7.RTM. 
+ +Fixes: 83c529151ab0 ("KVM: x86: expose Intel cpu new features (HLE, RTM) to guest") +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20250610232010.162191-5-seanjc@google.com +Signed-off-by: Sasha Levin +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/msr-index.h | 1 + + arch/x86/kvm/vmx/vmx.c | 4 ++++ + 2 files changed, 5 insertions(+) + +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index 7ebe76f69417..2b6e3127ef4e 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -417,6 +417,7 @@ + #define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12) + #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14 + #define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT) ++#define DEBUGCTLMSR_RTM_DEBUG BIT(15) + + #define MSR_PEBS_FRONTEND 0x000003f7 + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index b9c7940feac6..529a10bba056 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -2185,6 +2185,10 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated + (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) + debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + ++ if (boot_cpu_has(X86_FEATURE_RTM) && ++ (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_RTM))) ++ debugctl |= DEBUGCTLMSR_RTM_DEBUG; ++ + return debugctl; + } + +-- +2.50.1 + diff --git a/queue-6.12/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch b/queue-6.12/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch new file mode 100644 index 0000000000..031bb7681b --- /dev/null +++ b/queue-6.12/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch @@ -0,0 +1,90 @@ +From 9169769cd413b64e64d5b12b3b21446c9d1340a5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:57:22 -0700 +Subject: KVM: VMX: Extract checking of guest's DEBUGCTL into helper + +From: Sean Christopherson + +[ Upstream commit 8a4351ac302cd8c19729ba2636acfd0467c22ae8 ] + +Move VMX's logic to check DEBUGCTL values into a standalone helper so that +the code can be used by nested VM-Enter to apply the same logic to the +value being loaded from vmcs12. + +KVM needs to explicitly check vmcs12->guest_ia32_debugctl on nested +VM-Enter, as hardware may support features that KVM does not, i.e. relying +on hardware to detect invalid guest state will result in false negatives. +Unfortunately, that means applying KVM's funky suppression of BTF and LBR +to vmcs12 so as not to break existing guests. + +No functional change intended. 
+ +Reviewed-by: Dapeng Mi +Link: https://lore.kernel.org/r/20250610232010.162191-6-seanjc@google.com +Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs") +Signed-off-by: Sasha Levin +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/vmx.c | 29 +++++++++++++++++------------ + 1 file changed, 17 insertions(+), 12 deletions(-) + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 529a10bba056..ff61093e9af7 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -2192,6 +2192,19 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated + return debugctl; + } + ++static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, ++ bool host_initiated) ++{ ++ u64 invalid; ++ ++ invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated); ++ if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) { ++ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); ++ invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR); ++ } ++ return !invalid; ++} ++ + /* + * Writes msr value into the appropriate "register". + * Returns 0 on success, non-0 otherwise. +@@ -2260,19 +2273,12 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + } + vmcs_writel(GUEST_SYSENTER_ESP, data); + break; +- case MSR_IA32_DEBUGCTLMSR: { +- u64 invalid; +- +- invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); +- if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { +- kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); +- data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); +- invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); +- } +- +- if (invalid) ++ case MSR_IA32_DEBUGCTLMSR: ++ if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated)) + return 1; + ++ data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); ++ + if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & + VM_EXIT_SAVE_DEBUG_CONTROLS) + get_vmcs12(vcpu)->guest_ia32_debugctl = data; +@@ -2282,7 +2288,6 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + (data & DEBUGCTLMSR_LBR)) + intel_pmu_create_guest_lbr_event(vcpu); + return 0; +- } + case MSR_IA32_BNDCFGS: + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && +-- +2.50.1 + diff --git a/queue-6.12/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch b/queue-6.12/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch new file mode 100644 index 0000000000..ac1e857db1 --- /dev/null +++ b/queue-6.12/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch @@ -0,0 +1,196 @@ +From 064fd232cf9cd7db42a4842d7bec28e315b2ac1b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:57:25 -0700 +Subject: KVM: VMX: Preserve host's DEBUGCTLMSR_FREEZE_IN_SMM while running the + guest + +From: Maxim Levitsky + +[ Upstream commit 6b1dd26544d045f6a79e8c73572c0c0db3ef3c1a ] + +Set/clear DEBUGCTLMSR_FREEZE_IN_SMM in GUEST_IA32_DEBUGCTL based on the +host's pre-VM-Enter value, i.e. preserve the host's FREEZE_IN_SMM setting +while running the guest. When running with the "default treatment of SMIs" +in effect (the only mode KVM supports), SMIs do not generate a VM-Exit that +is visible to host (non-SMM) software, and instead transitions directly +from VMX non-root to SMM. And critically, DEBUGCTL isn't context switched +by hardware on SMI or RSM, i.e. SMM will run with whatever value was +resident in hardware at the time of the SMI. 
+ +Failure to preserve FREEZE_IN_SMM results in the PMU unexpectedly counting +events while the CPU is executing in SMM, which can pollute profiling and +potentially leak information into the guest. + +Check for changes in FREEZE_IN_SMM prior to every entry into KVM's inner +run loop, as the bit can be toggled in IRQ context via IPI callback (SMP +function call), by way of /sys/devices/cpu/freeze_on_smi. + +Add a field in kvm_x86_ops to communicate which DEBUGCTL bits need to be +preserved, as FREEZE_IN_SMM is only supported and defined for Intel CPUs, +i.e. explicitly checking FREEZE_IN_SMM in common x86 is at best weird, and +at worst could lead to undesirable behavior in the future if AMD CPUs ever +happened to pick up a collision with the bit. + +Exempt TDX vCPUs, i.e. protected guests, from the check, as the TDX Module +owns and controls GUEST_IA32_DEBUGCTL. + +WARN in SVM if KVM_RUN_LOAD_DEBUGCTL is set, mostly to document that the +lack of handling isn't a KVM bug (TDX already WARNs on any run_flag). + +Lastly, explicitly reload GUEST_IA32_DEBUGCTL on a VM-Fail that is missed +by KVM but detected by hardware, i.e. in nested_vmx_restore_host_state(). +Doing so avoids the need to track host_debugctl on a per-VMCS basis, as +GUEST_IA32_DEBUGCTL is unconditionally written by prepare_vmcs02() and +load_vmcs12_host_state(). For the VM-Fail case, even though KVM won't +have actually entered the guest, vcpu_enter_guest() will have run with +vmcs02 active and thus could result in vmcs01 being run with a stale value. + +Cc: stable@vger.kernel.org +Signed-off-by: Maxim Levitsky +Co-developed-by: Sean Christopherson +Link: https://lore.kernel.org/r/20250610232010.162191-9-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: resolve syntactic conflict in vt_x86_ops definition] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm_host.h | 7 +++++++ + arch/x86/kvm/vmx/main.c | 2 ++ + arch/x86/kvm/vmx/nested.c | 3 +++ + arch/x86/kvm/vmx/vmx.c | 3 +++ + arch/x86/kvm/vmx/vmx.h | 15 ++++++++++++++- + arch/x86/kvm/x86.c | 14 ++++++++++++-- + 6 files changed, 41 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 2ed05925d9d5..d27df86aa62c 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1630,6 +1630,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) + enum kvm_x86_run_flags { + KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0), + KVM_RUN_LOAD_GUEST_DR6 = BIT(1), ++ KVM_RUN_LOAD_DEBUGCTL = BIT(2), + }; + + struct kvm_x86_ops { +@@ -1659,6 +1660,12 @@ struct kvm_x86_ops { + void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); + void (*vcpu_put)(struct kvm_vcpu *vcpu); + ++ /* ++ * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to ++ * match the host's value even while the guest is active. 
++ */ ++ const u64 HOST_OWNED_DEBUGCTL; ++ + void (*update_exception_bitmap)(struct kvm_vcpu *vcpu); + int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); + int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); +diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c +index 7668e2fb8043..3f83e36a657b 100644 +--- a/arch/x86/kvm/vmx/main.c ++++ b/arch/x86/kvm/vmx/main.c +@@ -42,6 +42,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = { + .vcpu_load = vmx_vcpu_load, + .vcpu_put = vmx_vcpu_put, + ++ .HOST_OWNED_DEBUGCTL = DEBUGCTLMSR_FREEZE_IN_SMM, ++ + .update_exception_bitmap = vmx_update_exception_bitmap, + .get_feature_msr = vmx_get_feature_msr, + .get_msr = vmx_get_msr, +diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c +index 9a336f661fc6..60bd2791d933 100644 +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -4829,6 +4829,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) + WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); + } + ++ /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */ ++ vmx_reload_guest_debugctl(vcpu); ++ + /* + * Note that calling vmx_set_{efer,cr0,cr4} is important as they + * handle a variety of side effects to KVM's software model. +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 4bb25519e7ce..6c185a260c5b 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -7407,6 +7407,9 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) + if (run_flags & KVM_RUN_LOAD_GUEST_DR6) + set_debugreg(vcpu->arch.dr6, 6); + ++ if (run_flags & KVM_RUN_LOAD_DEBUGCTL) ++ vmx_reload_guest_debugctl(vcpu); ++ + /* + * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately + * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index 5b2c5cb5e32e..a7e2de50d27f 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -440,12 +440,25 @@ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated) + + static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val) + { ++ WARN_ON_ONCE(val & DEBUGCTLMSR_FREEZE_IN_SMM); ++ ++ val |= vcpu->arch.host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM; + vmcs_write64(GUEST_IA32_DEBUGCTL, val); + } + + static inline u64 vmx_guest_debugctl_read(void) + { +- return vmcs_read64(GUEST_IA32_DEBUGCTL); ++ return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~DEBUGCTLMSR_FREEZE_IN_SMM; ++} ++ ++static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu) ++{ ++ u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL); ++ ++ if (!((val ^ vcpu->arch.host_debugctl) & DEBUGCTLMSR_FREEZE_IN_SMM)) ++ return; ++ ++ vmx_guest_debugctl_write(vcpu, val & ~DEBUGCTLMSR_FREEZE_IN_SMM); + } + + /* +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 7beea8fb6ea6..dbd295ef3eba 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10711,7 +10711,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + dm_request_for_irq_injection(vcpu) && + kvm_cpu_accept_dm_intr(vcpu); + fastpath_t exit_fastpath; +- u64 run_flags; ++ u64 run_flags, debug_ctl; + + bool req_immediate_exit = false; + +@@ -10982,7 +10982,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + set_debugreg(DR7_FIXED_1, 7); + } + +- vcpu->arch.host_debugctl = get_debugctlmsr(); ++ /* ++ * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL ++ * can be modified in IRQ context, e.g. via SMP function calls. 
Inform ++ * vendor code if any host-owned bits were changed, e.g. so that the ++ * value loaded into hardware while running the guest can be updated. ++ */ ++ debug_ctl = get_debugctlmsr(); ++ if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL && ++ !vcpu->arch.guest_state_protected) ++ run_flags |= KVM_RUN_LOAD_DEBUGCTL; ++ vcpu->arch.host_debugctl = debug_ctl; + + guest_timing_enter_irqoff(); + +-- +2.50.1 + diff --git a/queue-6.12/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch b/queue-6.12/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch new file mode 100644 index 0000000000..6767b6d361 --- /dev/null +++ b/queue-6.12/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch @@ -0,0 +1,162 @@ +From 23fe0561dff1a54e2d0cadace8e98dc9775bd0b3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:57:24 -0700 +Subject: KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs + +From: Maxim Levitsky + +[ Upstream commit 7d0cce6cbe71af6e9c1831bff101a2b9c249c4a2 ] + +Introduce vmx_guest_debugctl_{read,write}() to handle all accesses to +vmcs.GUEST_IA32_DEBUGCTL. This will allow stuffing FREEZE_IN_SMM into +GUEST_IA32_DEBUGCTL based on the host setting without bleeding the state +into the guest, and without needing to copy+paste the FREEZE_IN_SMM +logic into every patch that accesses GUEST_IA32_DEBUGCTL. + +No functional change intended. + +Cc: stable@vger.kernel.org +Signed-off-by: Maxim Levitsky +[sean: massage changelog, make inline, use in all prepare_vmcs02() cases] +Reviewed-by: Dapeng Mi +Link: https://lore.kernel.org/r/20250610232010.162191-8-seanjc@google.com +Signed-off-by: Sasha Levin +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/nested.c | 10 +++++----- + arch/x86/kvm/vmx/pmu_intel.c | 8 ++++---- + arch/x86/kvm/vmx/vmx.c | 8 +++++--- + arch/x86/kvm/vmx/vmx.h | 10 ++++++++++ + 4 files changed, 24 insertions(+), 12 deletions(-) + +diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c +index 1e0b9f92ff18..9a336f661fc6 100644 +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -2653,11 +2653,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); +- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl & +- vmx_get_supported_debugctl(vcpu, false)); ++ vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl & ++ vmx_get_supported_debugctl(vcpu, false)); + } else { + kvm_set_dr(vcpu, 7, vcpu->arch.dr7); +- vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); ++ vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); + } + if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) +@@ -3527,7 +3527,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) +- vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); ++ vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); + if (kvm_mpx_supported() && + (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) +@@ -4774,7 +4774,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, + __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); + + 
kvm_set_dr(vcpu, 7, 0x400); +- vmcs_write64(GUEST_IA32_DEBUGCTL, 0); ++ vmx_guest_debugctl_write(vcpu, 0); + + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, + vmcs12->vm_exit_msr_load_count)) +diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c +index 9c9d4a336166..a5edc623166a 100644 +--- a/arch/x86/kvm/vmx/pmu_intel.c ++++ b/arch/x86/kvm/vmx/pmu_intel.c +@@ -605,11 +605,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) + */ + static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu) + { +- u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL); ++ u64 data = vmx_guest_debugctl_read(); + + if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) { + data &= ~DEBUGCTLMSR_LBR; +- vmcs_write64(GUEST_IA32_DEBUGCTL, data); ++ vmx_guest_debugctl_write(vcpu, data); + } + } + +@@ -679,7 +679,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) + + if (!lbr_desc->event) { + vmx_disable_lbr_msrs_passthrough(vcpu); +- if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR) ++ if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR) + goto warn; + if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use)) + goto warn; +@@ -701,7 +701,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) + + static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) + { +- if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)) ++ if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)) + intel_pmu_release_guest_lbr_event(vcpu); + } + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 50d45c18fce9..4bb25519e7ce 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -2148,7 +2148,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; + break; + case MSR_IA32_DEBUGCTLMSR: +- msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); ++ msr_info->data = vmx_guest_debugctl_read(); + break; + default: + find_uret_msr: +@@ -2282,7 +2282,8 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + VM_EXIT_SAVE_DEBUG_CONTROLS) + get_vmcs12(vcpu)->guest_ia32_debugctl = data; + +- vmcs_write64(GUEST_IA32_DEBUGCTL, data); ++ vmx_guest_debugctl_write(vcpu, data); ++ + if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && + (data & DEBUGCTLMSR_LBR)) + intel_pmu_create_guest_lbr_event(vcpu); +@@ -4831,7 +4832,8 @@ static void init_vmcs(struct vcpu_vmx *vmx) + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); +- vmcs_write64(GUEST_IA32_DEBUGCTL, 0); ++ ++ vmx_guest_debugctl_write(&vmx->vcpu, 0); + + if (cpu_has_vmx_tpr_shadow()) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index ee330d14089d..5b2c5cb5e32e 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -438,6 +438,16 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); + u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated); + bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated); + ++static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val) ++{ ++ vmcs_write64(GUEST_IA32_DEBUGCTL, val); ++} ++ ++static inline u64 vmx_guest_debugctl_read(void) ++{ ++ return vmcs_read64(GUEST_IA32_DEBUGCTL); ++} ++ + /* + * Note, early Intel manuals have the write-low and read-high bitmap offsets + * the wrong way round. 
The bitmaps control MSRs 0x00000000-0x00001fff and +-- +2.50.1 + diff --git a/queue-6.12/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch b/queue-6.12/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch new file mode 100644 index 0000000000..a83f1b711e --- /dev/null +++ b/queue-6.12/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch @@ -0,0 +1,153 @@ +From d43a98921ac0ceecd8840b7a5d4dc24377a1c4d9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:57:19 -0700 +Subject: KVM: x86: Convert vcpu_run()'s immediate exit param into a generic + bitmap + +From: Sean Christopherson + +[ Upstream commit 2478b1b220c49d25cb1c3f061ec4f9b351d9a131 ] + +Convert kvm_x86_ops.vcpu_run()'s "force_immediate_exit" boolean parameter +into an a generic bitmap so that similar "take action" information can be +passed to vendor code without creating a pile of boolean parameters. + +This will allow dropping kvm_x86_ops.set_dr6() in favor of a new flag, and +will also allow for adding similar functionality for re-loading debugctl +in the active VMCS. + +Opportunistically massage the TDX WARN and comment to prepare for adding +more run_flags, all of which are expected to be mutually exclusive with +TDX, i.e. should be WARNed on. + +No functional change intended. + +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20250610232010.162191-3-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: drop TDX changes] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm_host.h | 6 +++++- + arch/x86/kvm/svm/svm.c | 4 ++-- + arch/x86/kvm/vmx/vmx.c | 3 ++- + arch/x86/kvm/vmx/x86_ops.h | 2 +- + arch/x86/kvm/x86.c | 11 ++++++++--- + 5 files changed, 18 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 0caa3293f6db..cccc8cbe72db 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1627,6 +1627,10 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) + return dest_mode_logical ? 
APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; + } + ++enum kvm_x86_run_flags { ++ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0), ++}; ++ + struct kvm_x86_ops { + const char *name; + +@@ -1706,7 +1710,7 @@ struct kvm_x86_ops { + + int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); + enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu, +- bool force_immediate_exit); ++ u64 run_flags); + int (*handle_exit)(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath); + int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index 1f42a71b15c0..7d1b871cfc02 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4226,9 +4226,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in + guest_state_exit_irqoff(); + } + +-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, +- bool force_immediate_exit) ++static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) + { ++ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; + struct vcpu_svm *svm = to_svm(vcpu); + bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 9a4ebf3dfbfc..2a977cdfcd0c 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -7353,8 +7353,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, + guest_state_exit_irqoff(); + } + +-fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) ++fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) + { ++ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long cr3, cr4; + +diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h +index 4aba200f435d..5e4ce13ab305 100644 +--- a/arch/x86/kvm/vmx/x86_ops.h ++++ b/arch/x86/kvm/vmx/x86_ops.h +@@ -21,7 +21,7 @@ void vmx_vm_destroy(struct kvm *kvm); + int vmx_vcpu_precreate(struct kvm *kvm); + int vmx_vcpu_create(struct kvm_vcpu *vcpu); + int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu); +-fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit); ++fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags); + void vmx_vcpu_free(struct kvm_vcpu *vcpu); + void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); + void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 213af0fda768..44ab46f2a2d2 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10711,6 +10711,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + dm_request_for_irq_injection(vcpu) && + kvm_cpu_accept_dm_intr(vcpu); + fastpath_t exit_fastpath; ++ u64 run_flags; + + bool req_immediate_exit = false; + +@@ -10955,8 +10956,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + goto cancel_injection; + } + +- if (req_immediate_exit) ++ run_flags = 0; ++ if (req_immediate_exit) { ++ run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT; + kvm_make_request(KVM_REQ_EVENT, vcpu); ++ } + + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) +@@ -10992,8 +10996,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); + +- exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, +- req_immediate_exit); ++ exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, run_flags); + if 
(likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) + break; + +@@ -11005,6 +11008,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + break; + } + ++ run_flags = 0; ++ + /* Note, VM-Exits that go down the "slow" path are accounted below. */ + ++vcpu->stat.exits; + } +-- +2.50.1 + diff --git a/queue-6.12/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch b/queue-6.12/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch new file mode 100644 index 0000000000..3a7650569a --- /dev/null +++ b/queue-6.12/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch @@ -0,0 +1,149 @@ +From 60ac0019cd78125bddc4cc6b46b022c333b534cb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:57:20 -0700 +Subject: KVM: x86: Drop kvm_x86_ops.set_dr6() in favor of a new KVM_RUN flag + +From: Sean Christopherson + +[ Upstream commit 80c64c7afea1da6a93ebe88d3d29d8a60377ef80 ] + +Instruct vendor code to load the guest's DR6 into hardware via a new +KVM_RUN flag, and remove kvm_x86_ops.set_dr6(), whose sole purpose was to +load vcpu->arch.dr6 into hardware when DR6 can be read/written directly +by the guest. + +Note, TDX already WARNs on any run_flag being set, i.e. will yell if KVM +thinks DR6 needs to be reloaded. TDX vCPUs force KVM_DEBUGREG_AUTO_SWITCH +and never clear the flag, i.e. should never observe KVM_RUN_LOAD_GUEST_DR6. + +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20250610232010.162191-4-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: drop TDX changes] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm-x86-ops.h | 1 - + arch/x86/include/asm/kvm_host.h | 2 +- + arch/x86/kvm/svm/svm.c | 10 ++++++---- + arch/x86/kvm/vmx/main.c | 1 - + arch/x86/kvm/vmx/vmx.c | 9 +++------ + arch/x86/kvm/x86.c | 2 +- + 6 files changed, 11 insertions(+), 14 deletions(-) + +diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h +index cfb22f8c451a..861d080ed4c6 100644 +--- a/arch/x86/include/asm/kvm-x86-ops.h ++++ b/arch/x86/include/asm/kvm-x86-ops.h +@@ -47,7 +47,6 @@ KVM_X86_OP(set_idt) + KVM_X86_OP(get_gdt) + KVM_X86_OP(set_gdt) + KVM_X86_OP(sync_dirty_debug_regs) +-KVM_X86_OP(set_dr6) + KVM_X86_OP(set_dr7) + KVM_X86_OP(cache_reg) + KVM_X86_OP(get_rflags) +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index cccc8cbe72db..2ed05925d9d5 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1629,6 +1629,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) + + enum kvm_x86_run_flags { + KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0), ++ KVM_RUN_LOAD_GUEST_DR6 = BIT(1), + }; + + struct kvm_x86_ops { +@@ -1679,7 +1680,6 @@ struct kvm_x86_ops { + void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); + void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); + void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); +- void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); + void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); + void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); + unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index 7d1b871cfc02..800f781475c0 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4270,10 +4270,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) + svm_hv_update_vp_id(svm->vmcb, vcpu); + + /* +- * Run with all-zero DR6 unless 
needed, so that we can get the exact cause +- * of a #DB. ++ * Run with all-zero DR6 unless the guest can write DR6 freely, so that ++ * KVM can get the exact cause of a #DB. Note, loading guest DR6 from ++ * KVM's snapshot is only necessary when DR accesses won't exit. + */ +- if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) ++ if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6)) ++ svm_set_dr6(vcpu, vcpu->arch.dr6); ++ else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); + + clgi(); +@@ -5084,7 +5087,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { + .set_idt = svm_set_idt, + .get_gdt = svm_get_gdt, + .set_gdt = svm_set_gdt, +- .set_dr6 = svm_set_dr6, + .set_dr7 = svm_set_dr7, + .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, + .cache_reg = svm_cache_reg, +diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c +index 47476fcc179a..7668e2fb8043 100644 +--- a/arch/x86/kvm/vmx/main.c ++++ b/arch/x86/kvm/vmx/main.c +@@ -60,7 +60,6 @@ struct kvm_x86_ops vt_x86_ops __initdata = { + .set_idt = vmx_set_idt, + .get_gdt = vmx_get_gdt, + .set_gdt = vmx_set_gdt, +- .set_dr6 = vmx_set_dr6, + .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, + .cache_reg = vmx_cache_reg, +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 2a977cdfcd0c..b9c7940feac6 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -5630,12 +5630,6 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) + set_debugreg(DR6_RESERVED, 6); + } + +-void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +-{ +- lockdep_assert_irqs_disabled(); +- set_debugreg(vcpu->arch.dr6, 6); +-} +- + void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) + { + vmcs_writel(GUEST_DR7, val); +@@ -7400,6 +7394,9 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + vcpu->arch.regs_dirty = 0; + ++ if (run_flags & KVM_RUN_LOAD_GUEST_DR6) ++ set_debugreg(vcpu->arch.dr6, 6); ++ + /* + * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately + * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 44ab46f2a2d2..7beea8fb6ea6 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10977,7 +10977,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + set_debugreg(vcpu->arch.eff_db[3], 3); + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) +- kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); ++ run_flags |= KVM_RUN_LOAD_GUEST_DR6; + } else if (unlikely(hw_breakpoint_active())) { + set_debugreg(DR7_FIXED_1, 7); + } +-- +2.50.1 + diff --git a/queue-6.12/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch b/queue-6.12/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch new file mode 100644 index 0000000000..4bdc787f3f --- /dev/null +++ b/queue-6.12/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch @@ -0,0 +1,78 @@ +From e14e8193de61d485369bd36f87d887c94c48751d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 24 Jul 2025 17:09:56 +0800 +Subject: mm/smaps: fix race between smaps_hugetlb_range and migration + +From: Jinjiang Tu + +[ Upstream commit 45d19b4b6c2d422771c29b83462d84afcbb33f01 ] + +smaps_hugetlb_range() handles the pte without holdling ptl, and may be +concurrenct with migration, leaing to BUG_ON in pfn_swap_entry_to_page(). 
+The race is as follows. + +smaps_hugetlb_range migrate_pages + huge_ptep_get + remove_migration_ptes + folio_unlock + pfn_swap_entry_folio + BUG_ON + +To fix it, hold ptl lock in smaps_hugetlb_range(). + +Link: https://lkml.kernel.org/r/20250724090958.455887-1-tujinjiang@huawei.com +Link: https://lkml.kernel.org/r/20250724090958.455887-2-tujinjiang@huawei.com +Fixes: 25ee01a2fca0 ("mm: hugetlb: proc: add hugetlb-related fields to /proc/PID/smaps") +Signed-off-by: Jinjiang Tu +Acked-by: David Hildenbrand +Cc: Andrei Vagin +Cc: Andrii Nakryiko +Cc: Baolin Wang +Cc: Brahmajit Das +Cc: Catalin Marinas +Cc: Christophe Leroy +Cc: David Rientjes +Cc: Dev Jain +Cc: Hugh Dickins +Cc: Joern Engel +Cc: Kefeng Wang +Cc: Lorenzo Stoakes +Cc: Michal Hocko +Cc: Ryan Roberts +Cc: Thiago Jung Bauermann +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + fs/proc/task_mmu.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c +index 72a58681f031..2257bf52fb2a 100644 +--- a/fs/proc/task_mmu.c ++++ b/fs/proc/task_mmu.c +@@ -1007,10 +1007,13 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, + { + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; +- pte_t ptent = huge_ptep_get(walk->mm, addr, pte); + struct folio *folio = NULL; + bool present = false; ++ spinlock_t *ptl; ++ pte_t ptent; + ++ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); ++ ptent = huge_ptep_get(walk->mm, addr, pte); + if (pte_present(ptent)) { + folio = page_folio(pte_page(ptent)); + present = true; +@@ -1029,6 +1032,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, + else + mss->private_hugetlb += huge_page_size(hstate_vma(vma)); + } ++ spin_unlock(ptl); + return 0; + } + #else +-- +2.50.1 + diff --git a/queue-6.12/net-kcm-fix-race-condition-in-kcm_unattach.patch b/queue-6.12/net-kcm-fix-race-condition-in-kcm_unattach.patch new file mode 100644 index 0000000000..c8a4938657 --- /dev/null +++ b/queue-6.12/net-kcm-fix-race-condition-in-kcm_unattach.patch @@ -0,0 +1,88 @@ +From 6ece36736d8033ce02a676412c51e99271b4ef6a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Aug 2025 21:18:03 +0200 +Subject: net: kcm: Fix race condition in kcm_unattach() + +From: Sven Stegemann + +[ Upstream commit 52565a935213cd6a8662ddb8efe5b4219343a25d ] + +syzbot found a race condition when kcm_unattach(psock) +and kcm_release(kcm) are executed at the same time. + +kcm_unattach() is missing a check of the flag +kcm->tx_stopped before calling queue_work(). + +If the kcm has a reserved psock, kcm_unattach() might get executed +between cancel_work_sync() and unreserve_psock() in kcm_release(), +requeuing kcm->tx_work right before kcm gets freed in kcm_done(). + +Remove kcm->tx_stopped and replace it by the less +error-prone disable_work_sync(). 
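The key property is that disable_work_sync() does not merely cancel pending work: it also makes later queue_work() calls on that item no-ops until the work is re-enabled. A simplified sketch of the release ordering this allows (illustrative only, details of kcm_release() omitted):

	/* Illustrative sketch: after this point a racing queue_work() from
	 * psock_write_space() or kcm_unattach() returns without queueing,
	 * so tx_work can no longer be re-armed behind our back. */
	disable_work_sync(&kcm->tx_work);

	/* ... unreserve the psock and drop the remaining references ... */
	kcm_done(kcm);		/* safe: no further tx_work can run */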
+ +Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") +Reported-by: syzbot+e62c9db591c30e174662@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=e62c9db591c30e174662 +Reported-by: syzbot+d199b52665b6c3069b94@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=d199b52665b6c3069b94 +Reported-by: syzbot+be6b1fdfeae512726b4e@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=be6b1fdfeae512726b4e +Signed-off-by: Sven Stegemann +Link: https://patch.msgid.link/20250812191810.27777-1-sven@stegemann.de +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + include/net/kcm.h | 1 - + net/kcm/kcmsock.c | 10 ++-------- + 2 files changed, 2 insertions(+), 9 deletions(-) + +diff --git a/include/net/kcm.h b/include/net/kcm.h +index 441e993be634..d9c35e71ecea 100644 +--- a/include/net/kcm.h ++++ b/include/net/kcm.h +@@ -71,7 +71,6 @@ struct kcm_sock { + struct list_head wait_psock_list; + struct sk_buff *seq_skb; + struct mutex tx_mutex; +- u32 tx_stopped : 1; + + /* Don't use bit fields here, these are set under different locks */ + bool tx_wait; +diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c +index d4118c796290..1d37b26ea2ef 100644 +--- a/net/kcm/kcmsock.c ++++ b/net/kcm/kcmsock.c +@@ -429,7 +429,7 @@ static void psock_write_space(struct sock *sk) + + /* Check if the socket is reserved so someone is waiting for sending. */ + kcm = psock->tx_kcm; +- if (kcm && !unlikely(kcm->tx_stopped)) ++ if (kcm) + queue_work(kcm_wq, &kcm->tx_work); + + spin_unlock_bh(&mux->lock); +@@ -1696,12 +1696,6 @@ static int kcm_release(struct socket *sock) + */ + __skb_queue_purge(&sk->sk_write_queue); + +- /* Set tx_stopped. This is checked when psock is bound to a kcm and we +- * get a writespace callback. This prevents further work being queued +- * from the callback (unbinding the psock occurs after canceling work. +- */ +- kcm->tx_stopped = 1; +- + release_sock(sk); + + spin_lock_bh(&mux->lock); +@@ -1717,7 +1711,7 @@ static int kcm_release(struct socket *sock) + /* Cancel work. After this point there should be no outside references + * to the kcm socket. + */ +- cancel_work_sync(&kcm->tx_work); ++ disable_work_sync(&kcm->tx_work); + + lock_sock(sk); + psock = kcm->tx_psock; +-- +2.50.1 + diff --git a/queue-6.12/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch b/queue-6.12/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch new file mode 100644 index 0000000000..697eaa71e2 --- /dev/null +++ b/queue-6.12/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch @@ -0,0 +1,44 @@ +From 2f916039451174e3ab687b9a37e3c5231e2ed92a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 5 Aug 2025 07:23:18 -0700 +Subject: net: ti: icss-iep: Fix incorrect type for return value in + extts_enable() + +From: Alok Tiwari + +[ Upstream commit 5f1d1d14db7dabce9c815e7d7cd351f8d58b8585 ] + +The variable ret in icss_iep_extts_enable() was incorrectly declared +as u32, while the function returns int and may return negative error +codes. This will cause sign extension issues and incorrect error +propagation. Update ret to be int to fix error handling. + +This change corrects the declaration to avoid potential type mismatch. 
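A minimal user-space illustration of the pitfall (hypothetical values; the fix simply declares ret as int):

#include <stdio.h>

int main(void)
{
	unsigned int ret_u32 = -22;	/* -EINVAL stored in an unsigned local */
	int ret_int = -22;

	/* An unsigned variable can never be negative, so "ret < 0" style
	 * error checks silently pass when ret is declared u32. */
	printf("u32 sees the error: %s\n", ret_u32 < 0 ? "yes" : "no");	/* "no" */
	printf("int sees the error: %s\n", ret_int < 0 ? "yes" : "no");	/* "yes" */
	return 0;
}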
+ +Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver") +Signed-off-by: Alok Tiwari +Reviewed-by: Andrew Lunn +Link: https://patch.msgid.link/20250805142323.1949406-1-alok.a.tiwari@oracle.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/ti/icssg/icss_iep.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c +index 50bfbc2779e4..d8c9fe1d98c4 100644 +--- a/drivers/net/ethernet/ti/icssg/icss_iep.c ++++ b/drivers/net/ethernet/ti/icssg/icss_iep.c +@@ -621,7 +621,8 @@ static int icss_iep_pps_enable(struct icss_iep *iep, int on) + + static int icss_iep_extts_enable(struct icss_iep *iep, u32 index, int on) + { +- u32 val, cap, ret = 0; ++ u32 val, cap; ++ int ret = 0; + + mutex_lock(&iep->ptp_clk_mutex); + +-- +2.50.1 + diff --git a/queue-6.12/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch b/queue-6.12/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch new file mode 100644 index 0000000000..3f1c6d7bab --- /dev/null +++ b/queue-6.12/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch @@ -0,0 +1,56 @@ +From ff2cbb791d9045e359020bf8dcdb70db907b394d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 5 Aug 2025 23:08:12 +0530 +Subject: net: ti: icssg-prueth: Fix emac link speed handling + +From: MD Danish Anwar + +[ Upstream commit 06feac15406f4f66f4c0c6ea60b10d44775d4133 ] + +When link settings are changed emac->speed is populated by +emac_adjust_link(). The link speed and other settings are then written into +the DRAM. However if both ports are brought down after this and brought up +again or if the operating mode is changed and a firmware reload is needed, +the DRAM is cleared by icssg_config(). As a result the link settings are +lost. + +Fix this by calling emac_adjust_link() after icssg_config(). This re +populates the settings in the DRAM after a new firmware load. 
+ +Fixes: 9facce84f406 ("net: ti: icssg-prueth: Fix firmware load sequence.") +Signed-off-by: MD Danish Anwar +Reviewed-by: Andrew Lunn +Message-ID: <20250805173812.2183161-1-danishanwar@ti.com> +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/ti/icssg/icssg_prueth.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c +index 0769e1ade30b..ddbc4624ae88 100644 +--- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c ++++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c +@@ -50,6 +50,8 @@ + /* CTRLMMR_ICSSG_RGMII_CTRL register bits */ + #define ICSSG_CTRL_RGMII_ID_MODE BIT(24) + ++static void emac_adjust_link(struct net_device *ndev); ++ + static int emac_get_tx_ts(struct prueth_emac *emac, + struct emac_tx_ts_response *rsp) + { +@@ -266,6 +268,10 @@ static int prueth_emac_common_start(struct prueth *prueth) + ret = icssg_config(prueth, emac, slice); + if (ret) + goto disable_class; ++ ++ mutex_lock(&emac->ndev->phydev->lock); ++ emac_adjust_link(emac->ndev); ++ mutex_unlock(&emac->ndev->phydev->lock); + } + + ret = prueth_emac_start(prueth); +-- +2.50.1 + diff --git a/queue-6.12/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-6.12/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch new file mode 100644 index 0000000000..9c046fd511 --- /dev/null +++ b/queue-6.12/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch @@ -0,0 +1,129 @@ +From e029781097349b203ded1588deab6713cbf6a350 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 1 Aug 2025 17:25:08 +0200 +Subject: netfilter: ctnetlink: fix refcount leak on table dump + +From: Florian Westphal + +[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ] + +There is a reference count leak in ctnetlink_dump_table(): + if (res < 0) { + nf_conntrack_get(&ct->ct_general); // HERE + cb->args[1] = (unsigned long)ct; + ... + +While its very unlikely, its possible that ct == last. +If this happens, then the refcount of ct was already incremented. +This 2nd increment is never undone. + +This prevents the conntrack object from being released, which in turn +keeps prevents cnet->count from dropping back to 0. + +This will then block the netns dismantle (or conntrack rmmod) as +nf_conntrack_cleanup_net_list() will wait forever. + +This can be reproduced by running conntrack_resize.sh selftest in a loop. +It takes ~20 minutes for me on a preemptible kernel on average before +I see a runaway kworker spinning in nf_conntrack_cleanup_net_list. + +One fix would to change this to: + if (res < 0) { + if (ct != last) + nf_conntrack_get(&ct->ct_general); + +But this reference counting isn't needed in the first place. +We can just store a cookie value instead. + +A followup patch will do the same for ctnetlink_exp_dump_table, +it looks to me as if this has the same problem and like +ctnetlink_dump_table, we only need a 'skip hint', not the actual +object so we can apply the same cookie strategy there as well. 
+ +Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c +index 6a1239433830..18a91c031554 100644 +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -860,8 +860,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) + + static int ctnetlink_done(struct netlink_callback *cb) + { +- if (cb->args[1]) +- nf_ct_put((struct nf_conn *)cb->args[1]); + kfree(cb->data); + return 0; + } +@@ -1184,19 +1182,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) + return 0; + } + ++static unsigned long ctnetlink_get_id(const struct nf_conn *ct) ++{ ++ unsigned long id = nf_ct_get_id(ct); ++ ++ return id ? id : 1; ++} ++ + static int + ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + { + unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0; + struct net *net = sock_net(skb->sk); +- struct nf_conn *ct, *last; ++ unsigned long last_id = cb->args[1]; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nf_conn *nf_ct_evict[8]; ++ struct nf_conn *ct; + int res, i; + spinlock_t *lockp; + +- last = (struct nf_conn *)cb->args[1]; + i = 0; + + local_bh_disable(); +@@ -1233,7 +1238,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + continue; + + if (cb->args[1]) { +- if (ct != last) ++ if (ctnetlink_get_id(ct) != last_id) + continue; + cb->args[1] = 0; + } +@@ -1246,8 +1251,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct, true, flags); + if (res < 0) { +- nf_conntrack_get(&ct->ct_general); +- cb->args[1] = (unsigned long)ct; ++ cb->args[1] = ctnetlink_get_id(ct); + spin_unlock(lockp); + goto out; + } +@@ -1260,12 +1264,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + } + out: + local_bh_enable(); +- if (last) { ++ if (last_id) { + /* nf ct hash resize happened, now clear the leftover. */ +- if ((struct nf_conn *)cb->args[1] == last) ++ if (cb->args[1] == last_id) + cb->args[1] = 0; +- +- nf_ct_put(last); + } + + while (i) { +-- +2.50.1 + diff --git a/queue-6.12/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch b/queue-6.12/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch new file mode 100644 index 0000000000..56beb5a026 --- /dev/null +++ b/queue-6.12/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch @@ -0,0 +1,103 @@ +From 311ad70a27210004849b7d07dc87eb8eec7af3b0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 28 Jul 2025 15:26:49 +0900 +Subject: ptp: prevent possible ABBA deadlock in ptp_clock_freerun() + +From: Jeongjun Park + +[ Upstream commit 2efe41234dbd0a83fdb7cd38226c2f70039a2cd3 ] + +syzbot reported the following ABBA deadlock: + + CPU0 CPU1 + ---- ---- + n_vclocks_store() + lock(&ptp->n_vclocks_mux) [1] + (physical clock) + pc_clock_adjtime() + lock(&clk->rwsem) [2] + (physical clock) + ... 
+ ptp_clock_freerun() + ptp_vclock_in_use() + lock(&ptp->n_vclocks_mux) [3] + (physical clock) + ptp_clock_unregister() + posix_clock_unregister() + lock(&clk->rwsem) [4] + (virtual clock) + +Since ptp virtual clock is registered only under ptp physical clock, both +ptp_clock and posix_clock must be physical clocks for ptp_vclock_in_use() +to lock &ptp->n_vclocks_mux and check ptp->n_vclocks. + +However, when unregistering vclocks in n_vclocks_store(), the locking +ptp->n_vclocks_mux is a physical clock lock, but clk->rwsem of +ptp_clock_unregister() called through device_for_each_child_reverse() +is a virtual clock lock. + +Therefore, clk->rwsem used in CPU0 and clk->rwsem used in CPU1 are +different locks, but in lockdep, a false positive occurs because the +possibility of deadlock is determined through lock-class. + +To solve this, lock subclass annotation must be added to the posix_clock +rwsem of the vclock. + +Reported-by: syzbot+7cfb66a237c4a5fb22ad@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=7cfb66a237c4a5fb22ad +Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion") +Signed-off-by: Jeongjun Park +Acked-by: Richard Cochran +Reviewed-by: Vladimir Oltean +Link: https://patch.msgid.link/20250728062649.469882-1-aha310510@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/ptp/ptp_private.h | 5 +++++ + drivers/ptp/ptp_vclock.c | 7 +++++++ + 2 files changed, 12 insertions(+) + +diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h +index a6aad743c282..b352df4cd3f9 100644 +--- a/drivers/ptp/ptp_private.h ++++ b/drivers/ptp/ptp_private.h +@@ -24,6 +24,11 @@ + #define PTP_DEFAULT_MAX_VCLOCKS 20 + #define PTP_MAX_CHANNELS 2048 + ++enum { ++ PTP_LOCK_PHYSICAL = 0, ++ PTP_LOCK_VIRTUAL, ++}; ++ + struct timestamp_event_queue { + struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS]; + int head; +diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c +index 7febfdcbde8b..8ed4b8598924 100644 +--- a/drivers/ptp/ptp_vclock.c ++++ b/drivers/ptp/ptp_vclock.c +@@ -154,6 +154,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp) + return PTP_VCLOCK_REFRESH_INTERVAL; + } + ++static void ptp_vclock_set_subclass(struct ptp_clock *ptp) ++{ ++ lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL); ++} ++ + static const struct ptp_clock_info ptp_vclock_info = { + .owner = THIS_MODULE, + .name = "ptp virtual clock", +@@ -213,6 +218,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock) + return NULL; + } + ++ ptp_vclock_set_subclass(vclock->clock); ++ + timecounter_init(&vclock->tc, &vclock->cc, 0); + ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL); + +-- +2.50.1 + diff --git a/queue-6.12/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch b/queue-6.12/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch new file mode 100644 index 0000000000..66b427f572 --- /dev/null +++ b/queue-6.12/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch @@ -0,0 +1,73 @@ +From 7a09b3640b9f599fabc4fa354e9ea99af238d33c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Aug 2025 15:40:11 -0400 +Subject: sctp: linearize cloned gso packets in sctp_rcv + +From: Xin Long + +[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ] + +A cloned head skb still shares these frag skbs in fraglist with the +original head skb. It's not safe to access these frag skbs. 
+ +syzbot reported two use-of-uninitialized-memory bugs caused by this: + + BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 + sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 + sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998 + sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88 + sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331 + sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122 + __release_sock+0x1da/0x330 net/core/sock.c:3106 + release_sock+0x6b/0x250 net/core/sock.c:3660 + sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360 + sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885 + sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031 + inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851 + sock_sendmsg_nosec net/socket.c:718 [inline] + +and + + BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 + sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 + sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88 + sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331 + sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148 + __release_sock+0x1d3/0x330 net/core/sock.c:3213 + release_sock+0x6b/0x270 net/core/sock.c:3767 + sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367 + sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886 + sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032 + inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851 + sock_sendmsg_nosec net/socket.c:712 [inline] + +This patch fixes it by linearizing cloned gso packets in sctp_rcv(). + +Fixes: 90017accff61 ("sctp: Add GSO support") +Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com +Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com +Signed-off-by: Xin Long +Reviewed-by: Marcelo Ricardo Leitner +Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/sctp/input.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/sctp/input.c b/net/sctp/input.c +index a8a254a5008e..032a10d82302 100644 +--- a/net/sctp/input.c ++++ b/net/sctp/input.c +@@ -117,7 +117,7 @@ int sctp_rcv(struct sk_buff *skb) + * it's better to just linearize it otherwise crc computing + * takes longer. 
+ */ +- if ((!is_gso && skb_linearize(skb)) || ++ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) || + !pskb_may_pull(skb, sizeof(struct sctphdr))) + goto discard_it; + +-- +2.50.1 + diff --git a/queue-6.12/series b/queue-6.12/series index 61295a552b..d5f954c3af 100644 --- a/queue-6.12/series +++ b/queue-6.12/series @@ -40,3 +40,22 @@ acpi-processor-perflib-fix-initial-_ppc-limit-application.patch acpi-processor-perflib-move-problematic-pr-performance-check.patch block-make-req_op_zone_finish-a-write-operation.patch mm-memory-tier-fix-abstract-distance-calculation-overflow.patch +kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch +kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch +kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch +kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch +kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch +kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch +kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch +habanalabs-fix-uaf-in-export_dmabuf.patch +mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch +udp-also-consider-secpath-when-evaluating-ipsec-use-.patch +netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch +net-ti-icssg-prueth-fix-emac-link-speed-handling.patch +net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch +sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch +intel_idle-allow-loading-acpi-tables-for-any-family.patch +cpuidle-governors-menu-avoid-using-invalid-recent-in.patch +ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch +tls-handle-data-disappearing-from-under-the-tls-ulp.patch +net-kcm-fix-race-condition-in-kcm_unattach.patch diff --git a/queue-6.12/tls-handle-data-disappearing-from-under-the-tls-ulp.patch b/queue-6.12/tls-handle-data-disappearing-from-under-the-tls-ulp.patch new file mode 100644 index 0000000000..326d669fa6 --- /dev/null +++ b/queue-6.12/tls-handle-data-disappearing-from-under-the-tls-ulp.patch @@ -0,0 +1,106 @@ +From 6d4442b6803ab0cdf8929963a5e6113ae219f06e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Aug 2025 16:29:06 -0700 +Subject: tls: handle data disappearing from under the TLS ULP + +From: Jakub Kicinski + +[ Upstream commit 6db015fc4b5d5f63a64a193f65d98da3a7fc811d ] + +TLS expects that it owns the receive queue of the TCP socket. +This cannot be guaranteed in case the reader of the TCP socket +entered before the TLS ULP was installed, or uses some non-standard +read API (eg. zerocopy ones). Replace the WARN_ON() and a buggy +early exit (which leaves anchor pointing to a freed skb) with real +error handling. Wipe the parsing state and tell the reader to retry. + +We already reload the anchor every time we (re)acquire the socket lock, +so the only condition we need to avoid is an out of bounds read +(not having enough bytes in the socket for previously parsed record len). + +If some data was read from under TLS but there's enough in the queue +we'll reload and decrypt what is most likely not a valid TLS record. +Leading to some undefined behavior from TLS perspective (corrupting +a stream? missing an alert? missing an attack?) but no kernel crash +should take place. 
+ +Reported-by: William Liu +Reported-by: Savino Dicanosa +Link: https://lore.kernel.org/tFjq_kf7sWIG3A7CrCg_egb8CVsT_gsmHAK0_wxDPJXfIzxFAMxqmLwp3MlU5EHiet0AwwJldaaFdgyHpeIUCS-3m3llsmRzp9xIOBR4lAI=@syst3mfailure.io +Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") +Reviewed-by: Eric Dumazet +Link: https://patch.msgid.link/20250807232907.600366-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/tls/tls.h | 2 +- + net/tls/tls_strp.c | 11 ++++++++--- + net/tls/tls_sw.c | 3 ++- + 3 files changed, 11 insertions(+), 5 deletions(-) + +diff --git a/net/tls/tls.h b/net/tls/tls.h +index e5e47452308a..e1eaf12b3742 100644 +--- a/net/tls/tls.h ++++ b/net/tls/tls.h +@@ -195,7 +195,7 @@ void tls_strp_msg_done(struct tls_strparser *strp); + int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb); + void tls_rx_msg_ready(struct tls_strparser *strp); + +-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh); ++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh); + int tls_strp_msg_cow(struct tls_sw_context_rx *ctx); + struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx); + int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst); +diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c +index 095cf31bae0b..d71643b494a1 100644 +--- a/net/tls/tls_strp.c ++++ b/net/tls/tls_strp.c +@@ -475,7 +475,7 @@ static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len) + strp->stm.offset = offset; + } + +-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) ++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) + { + struct strp_msg *rxm; + struct tls_msg *tlm; +@@ -484,8 +484,11 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) + DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len); + + if (!strp->copy_mode && force_refresh) { +- if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len)) +- return; ++ if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) { ++ WRITE_ONCE(strp->msg_ready, 0); ++ memset(&strp->stm, 0, sizeof(strp->stm)); ++ return false; ++ } + + tls_strp_load_anchor_with_queue(strp, strp->stm.full_len); + } +@@ -495,6 +498,8 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) + rxm->offset = strp->stm.offset; + tlm = tls_msg(strp->anchor); + tlm->control = strp->mark; ++ ++ return true; + } + + /* Called with lock held on lower socket */ +diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c +index 1d7caadd0cbc..6385329ef98d 100644 +--- a/net/tls/tls_sw.c ++++ b/net/tls/tls_sw.c +@@ -1380,7 +1380,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, + return sock_intr_errno(timeo); + } + +- tls_strp_msg_load(&ctx->strp, released); ++ if (unlikely(!tls_strp_msg_load(&ctx->strp, released))) ++ return tls_rx_rec_wait(sk, psock, nonblock, false); + + return 1; + } +-- +2.50.1 + diff --git a/queue-6.12/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-6.12/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch new file mode 100644 index 0000000000..9f55170bd9 --- /dev/null +++ b/queue-6.12/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch @@ -0,0 +1,51 @@ +From f4d9b128db5250a96a548994f339a395c002e13f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Aug 2025 11:26:27 +0200 +Subject: udp: also consider secpath when evaluating ipsec use for checksumming + +From: Sabrina Dubroca + +[ Upstream commit 
1118aaa3b35157777890fffab91d8c1da841b20b ] + +Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in +IPsec case") tried to fix checksumming in UFO when the packets are +going through IPsec, so that we can't rely on offloads because the UDP +header and payload will be encrypted. + +But when doing a TCP test over VXLAN going through IPsec transport +mode with GSO enabled (esp4_offload module loaded), I'm seeing broken +UDP checksums on the encap after successful decryption. + +The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via +__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this +point we've already dropped the dst (unless the device sets +IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and +we proceed with checksum offload. + +Make need_ipsec also check the secpath, which is not dropped on this +callpath. + +Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case") +Signed-off-by: Sabrina Dubroca +Signed-off-by: Steffen Klassert +Signed-off-by: Sasha Levin +--- + net/ipv4/udp_offload.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c +index 5de47dd5e909..12ba1a8db93a 100644 +--- a/net/ipv4/udp_offload.c ++++ b/net/ipv4/udp_offload.c +@@ -61,7 +61,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, + remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); + skb->remcsum_offload = remcsum; + +- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); ++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb); + /* Try to offload checksum if possible */ + offload_csum = !!(need_csum && + !need_ipsec && +-- +2.50.1 + diff --git a/queue-6.15/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch b/queue-6.15/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch new file mode 100644 index 0000000000..bd224848d6 --- /dev/null +++ b/queue-6.15/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch @@ -0,0 +1,91 @@ +From 5e058a0b161a48cd29cded0776081d5e31d66472 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 11 Aug 2025 17:03:11 +0200 +Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data + +From: Rafael J. Wysocki + +[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ] + +Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid +discarding useful information") caused the number of wakeup interrupts +to increase on an idle system [1], which was not expected to happen +after merely allowing shallower idle states to be selected by the +governor in some cases. + +However, on the system in question, all of the idle states deeper than +WFI are rejected by the driver due to a firmware issue [2]. This causes +the governor to only consider the recent interval duriation data +corresponding to attempts to enter WFI that are successful and the +recent invervals table is filled with values lower than the scheduler +tick period. Consequently, the governor predicts an idle duration +below the scheduler tick period length and avoids stopping the tick +more often which leads to the observed symptom. + +Address it by modifying the governor to update the recent intervals +table also when entering the previously selected idle state fails, so +it knows that the short idle intervals might have been the minority +had the selected idle states been actually entered every time. 
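A toy average shows the intent: once failed attempts are recorded as UINT_MAX, a run of short successful WFI residencies no longer looks like a reliable pattern. (The real governor uses a variance-based typical-interval estimate, so this is only a sketch of the idea.)

#include <stdio.h>
#include <limits.h>

#define INTERVALS 8

static unsigned long long avg_us(const unsigned int *v, int n)
{
	unsigned long long sum = 0;

	for (int i = 0; i < n; i++)
		sum += v[i];
	return sum / n;
}

int main(void)
{
	/* Only successful shallow-state residencies get recorded. */
	unsigned int ok_only[INTERVALS]  = { 50, 60, 40, 55, 45, 50, 65, 60 };
	/* Failed attempts recorded as UINT_MAX sentinels. */
	unsigned int with_max[INTERVALS] = { 50, 60, 40, 55, UINT_MAX, 50, UINT_MAX, 60 };

	printf("successes only: %llu us\n", avg_us(ok_only, INTERVALS));
	printf("with sentinels: %llu us\n", avg_us(with_max, INTERVALS));
	return 0;
}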
+ +Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information") +Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1] +Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2] +Signed-off-by: Rafael J. Wysocki +Tested-by: Christian Loehle +Tested-by: Marc Zyngier +Reviewed-by: Christian Loehle +Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki +Signed-off-by: Sasha Levin +--- + drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++---- + 1 file changed, 17 insertions(+), 4 deletions(-) + +diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c +index 39aa0aea61c6..711517bd43a1 100644 +--- a/drivers/cpuidle/governors/menu.c ++++ b/drivers/cpuidle/governors/menu.c +@@ -97,6 +97,14 @@ static inline int which_bucket(u64 duration_ns) + + static DEFINE_PER_CPU(struct menu_device, menu_devices); + ++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us) ++{ ++ /* Update the repeating-pattern data. */ ++ data->intervals[data->interval_ptr++] = interval_us; ++ if (data->interval_ptr >= INTERVALS) ++ data->interval_ptr = 0; ++} ++ + static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); + + /* +@@ -222,6 +230,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, + if (data->needs_update) { + menu_update(drv, dev); + data->needs_update = 0; ++ } else if (!dev->last_residency_ns) { ++ /* ++ * This happens when the driver rejects the previously selected ++ * idle state and returns an error, so update the recent ++ * intervals table to prevent invalid information from being ++ * used going forward. ++ */ ++ menu_update_intervals(data, UINT_MAX); + } + + /* Find the shortest expected idle interval. */ +@@ -482,10 +498,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) + + data->correction_factor[data->bucket] = new_factor; + +- /* update the repeating-pattern data */ +- data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns); +- if (data->interval_ptr >= INTERVALS) +- data->interval_ptr = 0; ++ menu_update_intervals(data, ktime_to_us(measured_ns)); + } + + /** +-- +2.50.1 + diff --git a/queue-6.15/erofs-fix-block-count-report-when-48-bit-layout-is-o.patch b/queue-6.15/erofs-fix-block-count-report-when-48-bit-layout-is-o.patch new file mode 100644 index 0000000000..84b618c868 --- /dev/null +++ b/queue-6.15/erofs-fix-block-count-report-when-48-bit-layout-is-o.patch @@ -0,0 +1,37 @@ +From eb0b60837981894893c566d7ac0d81ad2b5d8126 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Aug 2025 16:20:19 +0800 +Subject: erofs: fix block count report when 48-bit layout is on + +From: Gao Xiang + +[ Upstream commit 0b96d9bed324a1c1b7d02bfb9596351ef178428d ] + +Fix incorrect shift order when combining the 48-bit block count. 
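A worked example with arbitrary values makes the ordering mistake visible: the high 16 bits must be shifted up, not the already-read low 32 bits.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t blocks_lo = 0x00000100;	/* low 32 bits, already read */
	uint16_t blocks_hi = 0x0001;		/* upper 16 bits of the 48-bit count */

	uint64_t wrong = (blocks_lo << 32) | blocks_hi;		   /* 0x0000010000000001 */
	uint64_t right = blocks_lo | ((uint64_t)blocks_hi << 32); /* 0x0000000100000100 */

	printf("wrong: 0x%016llx\n", (unsigned long long)wrong);
	printf("right: 0x%016llx\n", (unsigned long long)right);
	return 0;
}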
+ +Fixes: 2e1473d5195f ("erofs: implement 48-bit block addressing for unencoded inodes") +Signed-off-by: Gao Xiang +Link: https://lore.kernel.org/r/20250807082019.3093539-1-hsiangkao@linux.alibaba.com +Signed-off-by: Sasha Levin +--- + fs/erofs/super.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/erofs/super.c b/fs/erofs/super.c +index 6e57b9cc6ed2..cfe454dbf415 100644 +--- a/fs/erofs/super.c ++++ b/fs/erofs/super.c +@@ -313,8 +313,8 @@ static int erofs_read_superblock(struct super_block *sb) + sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); + if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) { + sbi->root_nid = le64_to_cpu(dsb->rootnid_8b); +- sbi->dif0.blocks = (sbi->dif0.blocks << 32) | +- le16_to_cpu(dsb->rb.blocks_hi); ++ sbi->dif0.blocks = sbi->dif0.blocks | ++ ((u64)le16_to_cpu(dsb->rb.blocks_hi) << 32); + } else { + sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b); + } +-- +2.50.1 + diff --git a/queue-6.15/habanalabs-fix-uaf-in-export_dmabuf.patch b/queue-6.15/habanalabs-fix-uaf-in-export_dmabuf.patch new file mode 100644 index 0000000000..0b4f8a9e42 --- /dev/null +++ b/queue-6.15/habanalabs-fix-uaf-in-export_dmabuf.patch @@ -0,0 +1,96 @@ +From 89ee3cca075191f343cb997a8c8f9baefda963f1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 12 Jul 2025 06:02:31 +0100 +Subject: habanalabs: fix UAF in export_dmabuf() + +From: Al Viro + +[ Upstream commit 33927f3d0ecdcff06326d6e4edb6166aed42811c ] + +As soon as we'd inserted a file reference into descriptor table, another +thread could close it. That's fine for the case when all we are doing is +returning that descriptor to userland (it's a race, but it's a userland +race and there's nothing the kernel can do about it). However, if we +follow fd_install() with any kind of access to objects that would be +destroyed on close (be it the struct file itself or anything destroyed +by its ->release()), we have a UAF. + +dma_buf_fd() is a combination of reserving a descriptor and fd_install(). +habanalabs export_dmabuf() calls it and then proceeds to access the +objects destroyed on close. In particular, it grabs an extra reference to +another struct file that will be dropped as part of ->release() for ours; +that "will be" is actually "might have already been". + +Fix that by reserving descriptor before anything else and do fd_install() +only when everything had been set up. As a side benefit, we no longer +have the failure exit with file already created, but reference to +underlying file (as well as ->dmabuf_export_cnt, etc.) not grabbed yet; +unlike dma_buf_fd(), fd_install() can't fail. 
+ +Fixes: db1a8dd916aa ("habanalabs: add support for dma-buf exporter") +Signed-off-by: Al Viro +Signed-off-by: Sasha Levin +--- + drivers/accel/habanalabs/common/memory.c | 23 +++++++---------------- + 1 file changed, 7 insertions(+), 16 deletions(-) + +diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c +index 601fdbe70179..61472a381904 100644 +--- a/drivers/accel/habanalabs/common/memory.c ++++ b/drivers/accel/habanalabs/common/memory.c +@@ -1829,9 +1829,6 @@ static void hl_release_dmabuf(struct dma_buf *dmabuf) + struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv; + struct hl_ctx *ctx; + +- if (!hl_dmabuf) +- return; +- + ctx = hl_dmabuf->ctx; + + if (hl_dmabuf->memhash_hnode) +@@ -1859,7 +1856,12 @@ static int export_dmabuf(struct hl_ctx *ctx, + { + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct hl_device *hdev = ctx->hdev; +- int rc, fd; ++ CLASS(get_unused_fd, fd)(flags); ++ ++ if (fd < 0) { ++ dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd); ++ return fd; ++ } + + exp_info.ops = &habanalabs_dmabuf_ops; + exp_info.size = total_size; +@@ -1872,13 +1874,6 @@ static int export_dmabuf(struct hl_ctx *ctx, + return PTR_ERR(hl_dmabuf->dmabuf); + } + +- fd = dma_buf_fd(hl_dmabuf->dmabuf, flags); +- if (fd < 0) { +- dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd); +- rc = fd; +- goto err_dma_buf_put; +- } +- + hl_dmabuf->ctx = ctx; + hl_ctx_get(hl_dmabuf->ctx); + atomic_inc(&ctx->hdev->dmabuf_export_cnt); +@@ -1890,13 +1885,9 @@ static int export_dmabuf(struct hl_ctx *ctx, + get_file(ctx->hpriv->file_priv->filp); + + *dmabuf_fd = fd; ++ fd_install(take_fd(fd), hl_dmabuf->dmabuf->file); + + return 0; +- +-err_dma_buf_put: +- hl_dmabuf->dmabuf->priv = NULL; +- dma_buf_put(hl_dmabuf->dmabuf); +- return rc; + } + + static int validate_export_params_common(struct hl_device *hdev, u64 addr, u64 size, u64 offset) +-- +2.50.1 + diff --git a/queue-6.15/hamradio-ignore-ops-locked-netdevs.patch b/queue-6.15/hamradio-ignore-ops-locked-netdevs.patch new file mode 100644 index 0000000000..90399e0fb4 --- /dev/null +++ b/queue-6.15/hamradio-ignore-ops-locked-netdevs.patch @@ -0,0 +1,62 @@ +From d23f33c1a9c34b07bd4781c90f234a9a1cbeaa8b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 6 Aug 2025 14:37:26 -0700 +Subject: hamradio: ignore ops-locked netdevs + +From: Stanislav Fomichev + +[ Upstream commit c64237960819aee1766d03f446ae6de94b1e3f73 ] + +Syzkaller managed to trigger lock dependency in xsk_notify via +register_netdevice. As discussed in [0], using register_netdevice +in the notifiers is problematic so skip adding hamradio for ops-locked +devices. 
+ + xsk_notifier+0x89/0x230 net/xdp/xsk.c:1664 + notifier_call_chain+0x1b6/0x3e0 kernel/notifier.c:85 + call_netdevice_notifiers_extack net/core/dev.c:2267 [inline] + call_netdevice_notifiers net/core/dev.c:2281 [inline] + unregister_netdevice_many_notify+0x14d7/0x1ff0 net/core/dev.c:12156 + unregister_netdevice_many net/core/dev.c:12219 [inline] + unregister_netdevice_queue+0x33c/0x380 net/core/dev.c:12063 + register_netdevice+0x1689/0x1ae0 net/core/dev.c:11241 + bpq_new_device drivers/net/hamradio/bpqether.c:481 [inline] + bpq_device_event+0x491/0x600 drivers/net/hamradio/bpqether.c:523 + notifier_call_chain+0x1b6/0x3e0 kernel/notifier.c:85 + call_netdevice_notifiers_extack net/core/dev.c:2267 [inline] + call_netdevice_notifiers net/core/dev.c:2281 [inline] + __dev_notify_flags+0x18d/0x2e0 net/core/dev.c:-1 + netif_change_flags+0xe8/0x1a0 net/core/dev.c:9608 + dev_change_flags+0x130/0x260 net/core/dev_api.c:68 + devinet_ioctl+0xbb4/0x1b50 net/ipv4/devinet.c:1200 + inet_ioctl+0x3c0/0x4c0 net/ipv4/af_inet.c:1001 + +0: https://lore.kernel.org/netdev/20250625140357.6203d0af@kernel.org/ +Fixes: 4c975fd70002 ("net: hold instance lock during NETDEV_REGISTER/UP") +Suggested-by: Jakub Kicinski +Reported-by: syzbot+e6300f66a999a6612477@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=e6300f66a999a6612477 +Signed-off-by: Stanislav Fomichev +Link: https://patch.msgid.link/20250806213726.1383379-2-sdf@fomichev.me +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/hamradio/bpqether.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c +index 0e0fe32d2da4..045c5177262e 100644 +--- a/drivers/net/hamradio/bpqether.c ++++ b/drivers/net/hamradio/bpqether.c +@@ -138,7 +138,7 @@ static inline struct net_device *bpq_get_ax25_dev(struct net_device *dev) + + static inline int dev_is_ethdev(struct net_device *dev) + { +- return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5); ++ return dev->type == ARPHRD_ETHER && !netdev_need_ops_lock(dev); + } + + /* ------------------------------------------------------------------------ */ +-- +2.50.1 + diff --git a/queue-6.15/intel_idle-allow-loading-acpi-tables-for-any-family.patch b/queue-6.15/intel_idle-allow-loading-acpi-tables-for-any-family.patch new file mode 100644 index 0000000000..2fb426266a --- /dev/null +++ b/queue-6.15/intel_idle-allow-loading-acpi-tables-for-any-family.patch @@ -0,0 +1,41 @@ +From ceb238ac9f661be4f5172183ee62b9858f74ad67 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 8 Aug 2025 15:37:14 -0400 +Subject: intel_idle: Allow loading ACPI tables for any family + +From: Len Brown + +[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ] + +There is no reason to limit intel_idle's loading of ACPI tables to +family 6. Upcoming Intel processors are not in family 6. + +Below "Fixes" really means "applies cleanly until". +That syntax commit didn't change the previous logic, +but shows this patch applies back 5-years. + +Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros") +Signed-off-by: Len Brown +Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com +Signed-off-by: Rafael J. 
Wysocki +Signed-off-by: Sasha Levin +--- + drivers/idle/intel_idle.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 976f5be54e36..039dc42dd509 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -1665,7 +1665,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { + }; + + static const struct x86_cpu_id intel_mwait_ids[] __initconst = { +- X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL), ++ X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL), + {} + }; + +-- +2.50.1 + diff --git a/queue-6.15/ipvs-fix-estimator-kthreads-preferred-affinity.patch b/queue-6.15/ipvs-fix-estimator-kthreads-preferred-affinity.patch new file mode 100644 index 0000000000..5af8ee77a2 --- /dev/null +++ b/queue-6.15/ipvs-fix-estimator-kthreads-preferred-affinity.patch @@ -0,0 +1,90 @@ +From f981586f1a7a73248f159d3d77b4516449f568ed Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 29 Jul 2025 14:26:11 +0200 +Subject: ipvs: Fix estimator kthreads preferred affinity + +From: Frederic Weisbecker + +[ Upstream commit c0a23bbc98e93704a1f4fb5e7e7bb2d7c0fb6eb3 ] + +The estimator kthreads' affinity are defined by sysctl overwritten +preferences and applied through a plain call to the scheduler's affinity +API. + +However since the introduction of managed kthreads preferred affinity, +such a practice shortcuts the kthreads core code which eventually +overwrites the target to the default unbound affinity. + +Fix this with using the appropriate kthread's API. + +Fixes: d1a89197589c ("kthread: Default affine kthread to its preferred NUMA node") +Signed-off-by: Frederic Weisbecker +Acked-by: Julian Anastasov +Signed-off-by: Florian Westphal +Signed-off-by: Sasha Levin +--- + include/net/ip_vs.h | 13 +++++++++++++ + kernel/kthread.c | 1 + + net/netfilter/ipvs/ip_vs_est.c | 3 ++- + 3 files changed, 16 insertions(+), 1 deletion(-) + +diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h +index ff406ef4fd4a..29a36709e7f3 100644 +--- a/include/net/ip_vs.h ++++ b/include/net/ip_vs.h +@@ -1163,6 +1163,14 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) + return housekeeping_cpumask(HK_TYPE_KTHREAD); + } + ++static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs) ++{ ++ if (ipvs->est_cpulist_valid) ++ return ipvs->sysctl_est_cpulist; ++ else ++ return NULL; ++} ++ + static inline int sysctl_est_nice(struct netns_ipvs *ipvs) + { + return ipvs->sysctl_est_nice; +@@ -1270,6 +1278,11 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) + return housekeeping_cpumask(HK_TYPE_KTHREAD); + } + ++static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs) ++{ ++ return NULL; ++} ++ + static inline int sysctl_est_nice(struct netns_ipvs *ipvs) + { + return IPVS_EST_NICE; +diff --git a/kernel/kthread.c b/kernel/kthread.c +index 77c44924cf54..800c8fc46b08 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -894,6 +894,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) + + return ret; + } ++EXPORT_SYMBOL_GPL(kthread_affine_preferred); + + /* + * Re-affine kthreads according to their preferences +diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c +index f821ad2e19b3..15049b826732 100644 +--- a/net/netfilter/ipvs/ip_vs_est.c ++++ b/net/netfilter/ipvs/ip_vs_est.c +@@ -265,7 +265,8 @@ int 
ip_vs_est_kthread_start(struct netns_ipvs *ipvs, + } + + set_user_nice(kd->task, sysctl_est_nice(ipvs)); +- set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs)); ++ if (sysctl_est_preferred_cpulist(ipvs)) ++ kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs)); + + pr_info("starting estimator thread %d...\n", kd->id); + wake_up_process(kd->task); +-- +2.50.1 + diff --git a/queue-6.15/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch b/queue-6.15/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch new file mode 100644 index 0000000000..c36a2fd198 --- /dev/null +++ b/queue-6.15/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch @@ -0,0 +1,78 @@ +From 9e464b22810b43dd7989a0886ce28831fb986189 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 24 Jul 2025 17:09:56 +0800 +Subject: mm/smaps: fix race between smaps_hugetlb_range and migration + +From: Jinjiang Tu + +[ Upstream commit 45d19b4b6c2d422771c29b83462d84afcbb33f01 ] + +smaps_hugetlb_range() handles the pte without holdling ptl, and may be +concurrenct with migration, leaing to BUG_ON in pfn_swap_entry_to_page(). +The race is as follows. + +smaps_hugetlb_range migrate_pages + huge_ptep_get + remove_migration_ptes + folio_unlock + pfn_swap_entry_folio + BUG_ON + +To fix it, hold ptl lock in smaps_hugetlb_range(). + +Link: https://lkml.kernel.org/r/20250724090958.455887-1-tujinjiang@huawei.com +Link: https://lkml.kernel.org/r/20250724090958.455887-2-tujinjiang@huawei.com +Fixes: 25ee01a2fca0 ("mm: hugetlb: proc: add hugetlb-related fields to /proc/PID/smaps") +Signed-off-by: Jinjiang Tu +Acked-by: David Hildenbrand +Cc: Andrei Vagin +Cc: Andrii Nakryiko +Cc: Baolin Wang +Cc: Brahmajit Das +Cc: Catalin Marinas +Cc: Christophe Leroy +Cc: David Rientjes +Cc: Dev Jain +Cc: Hugh Dickins +Cc: Joern Engel +Cc: Kefeng Wang +Cc: Lorenzo Stoakes +Cc: Michal Hocko +Cc: Ryan Roberts +Cc: Thiago Jung Bauermann +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + fs/proc/task_mmu.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c +index e57e323817e7..3b8eaa7722c8 100644 +--- a/fs/proc/task_mmu.c ++++ b/fs/proc/task_mmu.c +@@ -1020,10 +1020,13 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, + { + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; +- pte_t ptent = huge_ptep_get(walk->mm, addr, pte); + struct folio *folio = NULL; + bool present = false; ++ spinlock_t *ptl; ++ pte_t ptent; + ++ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); ++ ptent = huge_ptep_get(walk->mm, addr, pte); + if (pte_present(ptent)) { + folio = page_folio(pte_page(ptent)); + present = true; +@@ -1042,6 +1045,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, + else + mss->private_hugetlb += huge_page_size(hstate_vma(vma)); + } ++ spin_unlock(ptl); + return 0; + } + #else +-- +2.50.1 + diff --git a/queue-6.15/net-hibmcge-fix-rtnl-deadlock-issue.patch b/queue-6.15/net-hibmcge-fix-rtnl-deadlock-issue.patch new file mode 100644 index 0000000000..757639b2b9 --- /dev/null +++ b/queue-6.15/net-hibmcge-fix-rtnl-deadlock-issue.patch @@ -0,0 +1,122 @@ +From ed1da7003cf22b8dda5eafc827e4981942bcc092 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 6 Aug 2025 18:27:56 +0800 +Subject: net: hibmcge: fix rtnl deadlock issue + +From: Jijie Shao + +[ Upstream commit c875503a9b9082928d7d3fc60b5400d16fbfae4e ] + +Currently, the hibmcge netdev acquires the rtnl_lock in 
+pci_error_handlers.reset_prepare() and releases it in +pci_error_handlers.reset_done(). + +However, in the PCI framework: +pci_reset_bus - __pci_reset_slot - pci_slot_save_and_disable_locked - + pci_dev_save_and_disable - err_handler->reset_prepare(dev); + +In pci_slot_save_and_disable_locked(): + list_for_each_entry(dev, &slot->bus->devices, bus_list) { + if (!dev->slot || dev->slot!= slot) + continue; + pci_dev_save_and_disable(dev); + if (dev->subordinate) + pci_bus_save_and_disable_locked(dev->subordinate); + } + +This will iterate through all devices under the current bus and execute +err_handler->reset_prepare(), causing two devices of the hibmcge driver +to sequentially request the rtnl_lock, leading to a deadlock. + +Since the driver now executes netif_device_detach() +before the reset process, it will not concurrently with +other netdev APIs, so there is no need to hold the rtnl_lock now. + +Therefore, this patch removes the rtnl_lock during the reset process and +adjusts the position of HBG_NIC_STATE_RESETTING to ensure +that multiple resets are not executed concurrently. + +Fixes: 3f5a61f6d504f ("net: hibmcge: Add reset supported in this module") +Signed-off-by: Jijie Shao +Reviewed-by: Simon Horman +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c | 14 +++++--------- + 1 file changed, 5 insertions(+), 9 deletions(-) + +diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c +index ff3295b60a69..dee1e8681157 100644 +--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c ++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c +@@ -53,9 +53,11 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type) + { + int ret; + +- ASSERT_RTNL(); ++ if (test_and_set_bit(HBG_NIC_STATE_RESETTING, &priv->state)) ++ return -EBUSY; + + if (netif_running(priv->netdev)) { ++ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state); + dev_warn(&priv->pdev->dev, + "failed to reset because port is up\n"); + return -EBUSY; +@@ -64,7 +66,6 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type) + netif_device_detach(priv->netdev); + + priv->reset_type = type; +- set_bit(HBG_NIC_STATE_RESETTING, &priv->state); + clear_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state); + ret = hbg_hw_event_notify(priv, HBG_HW_EVENT_RESET); + if (ret) { +@@ -83,28 +84,25 @@ static int hbg_reset_done(struct hbg_priv *priv, enum hbg_reset_type type) + type != priv->reset_type) + return 0; + +- ASSERT_RTNL(); +- +- clear_bit(HBG_NIC_STATE_RESETTING, &priv->state); + ret = hbg_rebuild(priv); + if (ret) { + set_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state); ++ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state); + dev_err(&priv->pdev->dev, "failed to rebuild after reset\n"); + return ret; + } + + netif_device_attach(priv->netdev); ++ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state); + + dev_info(&priv->pdev->dev, "reset done\n"); + return ret; + } + +-/* must be protected by rtnl lock */ + int hbg_reset(struct hbg_priv *priv) + { + int ret; + +- ASSERT_RTNL(); + ret = hbg_reset_prepare(priv, HBG_RESET_TYPE_FUNCTION); + if (ret) + return ret; +@@ -169,7 +167,6 @@ static void hbg_pci_err_reset_prepare(struct pci_dev *pdev) + struct net_device *netdev = pci_get_drvdata(pdev); + struct hbg_priv *priv = netdev_priv(netdev); + +- rtnl_lock(); + hbg_reset_prepare(priv, HBG_RESET_TYPE_FLR); + } + +@@ -179,7 +176,6 @@ static void hbg_pci_err_reset_done(struct pci_dev *pdev) + struct 
hbg_priv *priv = netdev_priv(netdev); + + hbg_reset_done(priv, HBG_RESET_TYPE_FLR); +- rtnl_unlock(); + } + + static const struct pci_error_handlers hbg_pci_err_handler = { +-- +2.50.1 + diff --git a/queue-6.15/net-hibmcge-fix-the-division-by-zero-issue.patch b/queue-6.15/net-hibmcge-fix-the-division-by-zero-issue.patch new file mode 100644 index 0000000000..f98955e738 --- /dev/null +++ b/queue-6.15/net-hibmcge-fix-the-division-by-zero-issue.patch @@ -0,0 +1,46 @@ +From 2870ddbdb6f9caf8329f3c51d2f9946a176eda6c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 6 Aug 2025 18:27:57 +0800 +Subject: net: hibmcge: fix the division by zero issue + +From: Jijie Shao + +[ Upstream commit 7004b26f0b64331143eb0b312e77a357a11427ce ] + +When the network port is down, the queue is released, and ring->len is 0. +In debugfs, hbg_get_queue_used_num() will be called, +which may lead to a division by zero issue. + +This patch adds a check, if ring->len is 0, +hbg_get_queue_used_num() directly returns 0. + +Fixes: 40735e7543f9 ("net: hibmcge: Implement .ndo_start_xmit function") +Signed-off-by: Jijie Shao +Reviewed-by: Simon Horman +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h +index 2883a5899ae2..8b6110599e10 100644 +--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h ++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h +@@ -29,7 +29,12 @@ static inline bool hbg_fifo_is_full(struct hbg_priv *priv, enum hbg_dir dir) + + static inline u32 hbg_get_queue_used_num(struct hbg_ring *ring) + { +- return (ring->ntu + ring->len - ring->ntc) % ring->len; ++ u32 len = READ_ONCE(ring->len); ++ ++ if (!len) ++ return 0; ++ ++ return (READ_ONCE(ring->ntu) + len - READ_ONCE(ring->ntc)) % len; + } + + netdev_tx_t hbg_net_start_xmit(struct sk_buff *skb, struct net_device *netdev); +-- +2.50.1 + diff --git a/queue-6.15/net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch b/queue-6.15/net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch new file mode 100644 index 0000000000..c84fb4c379 --- /dev/null +++ b/queue-6.15/net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch @@ -0,0 +1,68 @@ +From 1e8262925f1a14e833eb34376195aa9aa71ca95e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 6 Aug 2025 18:27:58 +0800 +Subject: net: hibmcge: fix the np_link_fail error reporting issue + +From: Jijie Shao + +[ Upstream commit 62c50180ffda01468e640ac14925503796f255e2 ] + +Currently, after modifying device port mode, the np_link_ok state +is immediately checked. At this point, the device may not yet ready, +leading to the querying of an intermediate state. + +This patch will poll to check if np_link is ok after +modifying device port mode, and only report np_link_fail upon timeout. 
+ +Fixes: e0306637e85d ("net: hibmcge: Add support for mac link exception handling feature") +Signed-off-by: Jijie Shao +Reviewed-by: Simon Horman +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c | 15 +++++++++++++-- + 1 file changed, 13 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c +index 9b65eef62b3f..2844124f306d 100644 +--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c ++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c +@@ -12,6 +12,8 @@ + + #define HBG_HW_EVENT_WAIT_TIMEOUT_US (2 * 1000 * 1000) + #define HBG_HW_EVENT_WAIT_INTERVAL_US (10 * 1000) ++#define HBG_MAC_LINK_WAIT_TIMEOUT_US (500 * 1000) ++#define HBG_MAC_LINK_WAIT_INTERVAL_US (5 * 1000) + /* little endian or big endian. + * ctrl means packet description, data means skb packet data + */ +@@ -213,6 +215,9 @@ void hbg_hw_fill_buffer(struct hbg_priv *priv, u32 buffer_dma_addr) + + void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex) + { ++ u32 link_status; ++ int ret; ++ + hbg_hw_mac_enable(priv, HBG_STATUS_DISABLE); + + hbg_reg_write_field(priv, HBG_REG_PORT_MODE_ADDR, +@@ -224,8 +229,14 @@ void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex) + + hbg_hw_mac_enable(priv, HBG_STATUS_ENABLE); + +- if (!hbg_reg_read_field(priv, HBG_REG_AN_NEG_STATE_ADDR, +- HBG_REG_AN_NEG_STATE_NP_LINK_OK_B)) ++ /* wait MAC link up */ ++ ret = readl_poll_timeout(priv->io_base + HBG_REG_AN_NEG_STATE_ADDR, ++ link_status, ++ FIELD_GET(HBG_REG_AN_NEG_STATE_NP_LINK_OK_B, ++ link_status), ++ HBG_MAC_LINK_WAIT_INTERVAL_US, ++ HBG_MAC_LINK_WAIT_TIMEOUT_US); ++ if (ret) + hbg_np_link_fail_task_schedule(priv); + } + +-- +2.50.1 + diff --git a/queue-6.15/net-kcm-fix-race-condition-in-kcm_unattach.patch b/queue-6.15/net-kcm-fix-race-condition-in-kcm_unattach.patch new file mode 100644 index 0000000000..9de4428f16 --- /dev/null +++ b/queue-6.15/net-kcm-fix-race-condition-in-kcm_unattach.patch @@ -0,0 +1,88 @@ +From 8bee886e735d51eab3303b93156daeda9571fef3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Aug 2025 21:18:03 +0200 +Subject: net: kcm: Fix race condition in kcm_unattach() + +From: Sven Stegemann + +[ Upstream commit 52565a935213cd6a8662ddb8efe5b4219343a25d ] + +syzbot found a race condition when kcm_unattach(psock) +and kcm_release(kcm) are executed at the same time. + +kcm_unattach() is missing a check of the flag +kcm->tx_stopped before calling queue_work(). + +If the kcm has a reserved psock, kcm_unattach() might get executed +between cancel_work_sync() and unreserve_psock() in kcm_release(), +requeuing kcm->tx_work right before kcm gets freed in kcm_done(). + +Remove kcm->tx_stopped and replace it by the less +error-prone disable_work_sync(). 
+ +Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") +Reported-by: syzbot+e62c9db591c30e174662@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=e62c9db591c30e174662 +Reported-by: syzbot+d199b52665b6c3069b94@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=d199b52665b6c3069b94 +Reported-by: syzbot+be6b1fdfeae512726b4e@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=be6b1fdfeae512726b4e +Signed-off-by: Sven Stegemann +Link: https://patch.msgid.link/20250812191810.27777-1-sven@stegemann.de +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + include/net/kcm.h | 1 - + net/kcm/kcmsock.c | 10 ++-------- + 2 files changed, 2 insertions(+), 9 deletions(-) + +diff --git a/include/net/kcm.h b/include/net/kcm.h +index 441e993be634..d9c35e71ecea 100644 +--- a/include/net/kcm.h ++++ b/include/net/kcm.h +@@ -71,7 +71,6 @@ struct kcm_sock { + struct list_head wait_psock_list; + struct sk_buff *seq_skb; + struct mutex tx_mutex; +- u32 tx_stopped : 1; + + /* Don't use bit fields here, these are set under different locks */ + bool tx_wait; +diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c +index 24aec295a51c..8c0577cd764f 100644 +--- a/net/kcm/kcmsock.c ++++ b/net/kcm/kcmsock.c +@@ -429,7 +429,7 @@ static void psock_write_space(struct sock *sk) + + /* Check if the socket is reserved so someone is waiting for sending. */ + kcm = psock->tx_kcm; +- if (kcm && !unlikely(kcm->tx_stopped)) ++ if (kcm) + queue_work(kcm_wq, &kcm->tx_work); + + spin_unlock_bh(&mux->lock); +@@ -1688,12 +1688,6 @@ static int kcm_release(struct socket *sock) + */ + __skb_queue_purge(&sk->sk_write_queue); + +- /* Set tx_stopped. This is checked when psock is bound to a kcm and we +- * get a writespace callback. This prevents further work being queued +- * from the callback (unbinding the psock occurs after canceling work. +- */ +- kcm->tx_stopped = 1; +- + release_sock(sk); + + spin_lock_bh(&mux->lock); +@@ -1709,7 +1703,7 @@ static int kcm_release(struct socket *sock) + /* Cancel work. After this point there should be no outside references + * to the kcm socket. + */ +- cancel_work_sync(&kcm->tx_work); ++ disable_work_sync(&kcm->tx_work); + + lock_sock(sk); + psock = kcm->tx_psock; +-- +2.50.1 + diff --git a/queue-6.15/net-lapbether-ignore-ops-locked-netdevs.patch b/queue-6.15/net-lapbether-ignore-ops-locked-netdevs.patch new file mode 100644 index 0000000000..121320662e --- /dev/null +++ b/queue-6.15/net-lapbether-ignore-ops-locked-netdevs.patch @@ -0,0 +1,64 @@ +From cdf8a27cb2b49bb5d7d3fd82a048319d4cf78cba Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 6 Aug 2025 14:37:25 -0700 +Subject: net: lapbether: ignore ops-locked netdevs + +From: Stanislav Fomichev + +[ Upstream commit 53898ebabe843bfa7baea9dae152797d5d0563c9 ] + +Syzkaller managed to trigger lock dependency in xsk_notify via +register_netdevice. As discussed in [0], using register_netdevice +in the notifiers is problematic so skip adding lapbeth for ops-locked +devices. 
+ + xsk_notifier+0xa4/0x280 net/xdp/xsk.c:1645 + notifier_call_chain+0xbc/0x410 kernel/notifier.c:85 + call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:2230 + call_netdevice_notifiers_extack net/core/dev.c:2268 [inline] + call_netdevice_notifiers net/core/dev.c:2282 [inline] + unregister_netdevice_many_notify+0xf9d/0x2700 net/core/dev.c:12077 + unregister_netdevice_many net/core/dev.c:12140 [inline] + unregister_netdevice_queue+0x305/0x3f0 net/core/dev.c:11984 + register_netdevice+0x18f1/0x2270 net/core/dev.c:11149 + lapbeth_new_device drivers/net/wan/lapbether.c:420 [inline] + lapbeth_device_event+0x5b1/0xbe0 drivers/net/wan/lapbether.c:462 + notifier_call_chain+0xbc/0x410 kernel/notifier.c:85 + call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:2230 + call_netdevice_notifiers_extack net/core/dev.c:2268 [inline] + call_netdevice_notifiers net/core/dev.c:2282 [inline] + __dev_notify_flags+0x12c/0x2e0 net/core/dev.c:9497 + netif_change_flags+0x108/0x160 net/core/dev.c:9526 + dev_change_flags+0xba/0x250 net/core/dev_api.c:68 + devinet_ioctl+0x11d5/0x1f50 net/ipv4/devinet.c:1200 + inet_ioctl+0x3a7/0x3f0 net/ipv4/af_inet.c:1001 + +0: https://lore.kernel.org/netdev/20250625140357.6203d0af@kernel.org/ +Fixes: 4c975fd70002 ("net: hold instance lock during NETDEV_REGISTER/UP") +Suggested-by: Jakub Kicinski +Reported-by: syzbot+e67ea9c235b13b4f0020@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=e67ea9c235b13b4f0020 +Signed-off-by: Stanislav Fomichev +Link: https://patch.msgid.link/20250806213726.1383379-1-sdf@fomichev.me +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/wan/lapbether.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c +index 995a7207bdf8..f357a7ac70ac 100644 +--- a/drivers/net/wan/lapbether.c ++++ b/drivers/net/wan/lapbether.c +@@ -81,7 +81,7 @@ static struct lapbethdev *lapbeth_get_x25_dev(struct net_device *dev) + + static __inline__ int dev_is_ethdev(struct net_device *dev) + { +- return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5); ++ return dev->type == ARPHRD_ETHER && !netdev_need_ops_lock(dev); + } + + /* ------------------------------------------------------------------------ */ +-- +2.50.1 + diff --git a/queue-6.15/net-page_pool-allow-enabling-recycling-late-fix-fals.patch b/queue-6.15/net-page_pool-allow-enabling-recycling-late-fix-fals.patch new file mode 100644 index 0000000000..f6b8e63c0e --- /dev/null +++ b/queue-6.15/net-page_pool-allow-enabling-recycling-late-fix-fals.patch @@ -0,0 +1,174 @@ +From 50c290becd59110bb55ee279fd703e866f2814a0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Aug 2025 17:36:54 -0700 +Subject: net: page_pool: allow enabling recycling late, fix false positive + warning + +From: Jakub Kicinski + +[ Upstream commit 64fdaa94bfe0cca3a0f4b2dd922486c5f59fe678 ] + +Page pool can have pages "directly" (locklessly) recycled to it, +if the NAPI that owns the page pool is scheduled to run on the same CPU. +To make this safe we check that the NAPI is disabled while we destroy +the page pool. In most cases NAPI and page pool lifetimes are tied +together so this happens naturally. + +The queue API expects the following order of calls: + -> mem_alloc + alloc new pp + -> stop + napi_disable + -> start + napi_enable + -> mem_free + free old pp + +Here we allocate the page pool in ->mem_alloc and free in ->mem_free. +But the NAPIs are only stopped between ->stop and ->start. 
We created +page_pool_disable_direct_recycling() to safely shut down the recycling +in ->stop. This way the page_pool_destroy() call in ->mem_free doesn't +have to worry about recycling any more. + +Unfortunately, the page_pool_disable_direct_recycling() is not enough +to deal with failures which necessitate freeing the _new_ page pool. +If we hit a failure in ->mem_alloc or ->stop the new page pool has +to be freed while the NAPI is active (assuming driver attaches the +page pool to an existing NAPI instance and doesn't reallocate NAPIs). + +Freeing the new page pool is technically safe because it hasn't been +used for any packets, yet, so there can be no recycling. But the check +in napi_assert_will_not_race() has no way of knowing that. We could +check if page pool is empty but that'd make the check much less likely +to trigger during development. + +Add page_pool_enable_direct_recycling(), pairing with +page_pool_disable_direct_recycling(). It will allow us to create the new +page pools in "disabled" state and only enable recycling when we know +the reconfig operation will not fail. + +Coincidentally it will also let us re-enable the recycling for the old +pool, if the reconfig failed: + + -> mem_alloc (new) + -> stop (old) + # disables direct recycling for old + -> start (new) + # fail!! + -> start (old) + # go back to old pp but direct recycling is lost :( + -> mem_free (new) + +The new helper is idempotent to make the life easier for drivers, +which can operate in HDS mode and support zero-copy Rx. +The driver can call the helper twice whether there are two pools +or it has multiple references to a single pool. + +Fixes: 40eca00ae605 ("bnxt_en: unlink page pool when stopping Rx queue") +Tested-by: David Wei +Link: https://patch.msgid.link/20250805003654.2944974-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 ++++++- + include/net/page_pool/types.h | 2 ++ + net/core/page_pool.c | 29 +++++++++++++++++++++++ + 3 files changed, 39 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +index d66519ce57af..8021d97f3f22 100644 +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +@@ -3779,7 +3779,6 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, + if (BNXT_RX_PAGE_MODE(bp)) + pp.pool_size += bp->rx_ring_size; + pp.nid = numa_node; +- pp.napi = &rxr->bnapi->napi; + pp.netdev = bp->dev; + pp.dev = &bp->pdev->dev; + pp.dma_dir = bp->rx_dir; +@@ -3807,6 +3806,12 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, + return PTR_ERR(pool); + } + ++static void bnxt_enable_rx_page_pool(struct bnxt_rx_ring_info *rxr) ++{ ++ page_pool_enable_direct_recycling(rxr->head_pool, &rxr->bnapi->napi); ++ page_pool_enable_direct_recycling(rxr->page_pool, &rxr->bnapi->napi); ++} ++ + static int bnxt_alloc_rx_agg_bmap(struct bnxt *bp, struct bnxt_rx_ring_info *rxr) + { + u16 mem_size; +@@ -3845,6 +3850,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp) + rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node); + if (rc) + return rc; ++ bnxt_enable_rx_page_pool(rxr); + + rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i, 0); + if (rc < 0) +@@ -15998,6 +16004,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx) + goto err_reset; + } + ++ bnxt_enable_rx_page_pool(rxr); + napi_enable_locked(&bnapi->napi); + bnxt_db_nq_arm(bp, &cpr->cp_db, cpr->cp_raw_cons); + +diff --git 
a/include/net/page_pool/types.h b/include/net/page_pool/types.h +index 431b593de709..1509a536cb85 100644 +--- a/include/net/page_pool/types.h ++++ b/include/net/page_pool/types.h +@@ -265,6 +265,8 @@ struct page_pool *page_pool_create_percpu(const struct page_pool_params *params, + struct xdp_mem_info; + + #ifdef CONFIG_PAGE_POOL ++void page_pool_enable_direct_recycling(struct page_pool *pool, ++ struct napi_struct *napi); + void page_pool_disable_direct_recycling(struct page_pool *pool); + void page_pool_destroy(struct page_pool *pool); + void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), +diff --git a/net/core/page_pool.c b/net/core/page_pool.c +index 3eabe78c93f4..ef870c21e854 100644 +--- a/net/core/page_pool.c ++++ b/net/core/page_pool.c +@@ -1201,6 +1201,35 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), + pool->xdp_mem_id = mem->id; + } + ++/** ++ * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI ++ * @pool: page pool to modify ++ * @napi: NAPI instance to associate the page pool with ++ * ++ * Associate a page pool with a NAPI instance for lockless page recycling. ++ * This is useful when a new page pool has to be added to a NAPI instance ++ * without disabling that NAPI instance, to mark the point at which control ++ * path "hands over" the page pool to the NAPI instance. In most cases driver ++ * can simply set the @napi field in struct page_pool_params, and does not ++ * have to call this helper. ++ * ++ * The function is idempotent, but does not implement any refcounting. ++ * Single page_pool_disable_direct_recycling() will disable recycling, ++ * no matter how many times enable was called. ++ */ ++void page_pool_enable_direct_recycling(struct page_pool *pool, ++ struct napi_struct *napi) ++{ ++ if (READ_ONCE(pool->p.napi) == napi) ++ return; ++ WARN_ON(!napi || pool->p.napi); ++ ++ mutex_lock(&page_pools_lock); ++ WRITE_ONCE(pool->p.napi, napi); ++ mutex_unlock(&page_pools_lock); ++} ++EXPORT_SYMBOL(page_pool_enable_direct_recycling); ++ + void page_pool_disable_direct_recycling(struct page_pool *pool) + { + /* Disable direct recycling based on pool->cpuid. +-- +2.50.1 + diff --git a/queue-6.15/net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch b/queue-6.15/net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch new file mode 100644 index 0000000000..ae5cb5dc03 --- /dev/null +++ b/queue-6.15/net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch @@ -0,0 +1,69 @@ +From fb06884923f5b6158bb457592100bd9b1bb0ecbd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 8 Aug 2025 09:36:55 +0000 +Subject: net: stmmac: thead: Get and enable APB clock on initialization + +From: Yao Zi + +[ Upstream commit 4cc339ce482ba78589a2d5cbe1c84b735d263383 ] + +It's necessary to adjust the MAC TX clock when the linkspeed changes, +but it's noted such adjustment always fails on TH1520 SoC, and reading +back from APB glue registers that control clock generation results in +garbage, causing broken link. + +With some testing, it's found a clock must be ungated for access to APB +glue registers. Without any consumer, the clock is automatically +disabled during late kernel startup. Let's get and enable it if it's +described in devicetree. + +For backward compatibility with older devicetrees, probing won't fail if +the APB clock isn't found. In this case, we emit a warning since the +link will break if the speed changes. 
+ +Fixes: 33a1a01e3afa ("net: stmmac: Add glue layer for T-HEAD TH1520 SoC") +Signed-off-by: Yao Zi +Tested-by: Drew Fustini +Reviewed-by: Drew Fustini +Link: https://patch.msgid.link/20250808093655.48074-4-ziyao@disroot.org +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c +index c72ee759aae5..f2946bea0bc2 100644 +--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c ++++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c +@@ -211,6 +211,7 @@ static int thead_dwmac_probe(struct platform_device *pdev) + struct stmmac_resources stmmac_res; + struct plat_stmmacenet_data *plat; + struct thead_dwmac *dwmac; ++ struct clk *apb_clk; + void __iomem *apb; + int ret; + +@@ -224,6 +225,19 @@ static int thead_dwmac_probe(struct platform_device *pdev) + return dev_err_probe(&pdev->dev, PTR_ERR(plat), + "dt configuration failed\n"); + ++ /* ++ * The APB clock is essential for accessing glue registers. However, ++ * old devicetrees don't describe it correctly. We continue to probe ++ * and emit a warning if it isn't present. ++ */ ++ apb_clk = devm_clk_get_enabled(&pdev->dev, "apb"); ++ if (PTR_ERR(apb_clk) == -ENOENT) ++ dev_warn(&pdev->dev, ++ "cannot get apb clock, link may break after speed changes\n"); ++ else if (IS_ERR(apb_clk)) ++ return dev_err_probe(&pdev->dev, PTR_ERR(apb_clk), ++ "failed to get apb clock\n"); ++ + dwmac = devm_kzalloc(&pdev->dev, sizeof(*dwmac), GFP_KERNEL); + if (!dwmac) + return -ENOMEM; +-- +2.50.1 + diff --git a/queue-6.15/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch b/queue-6.15/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch new file mode 100644 index 0000000000..ab0957fe48 --- /dev/null +++ b/queue-6.15/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch @@ -0,0 +1,44 @@ +From 35bc060346d46b09199ed15886a9e6f60c6691ab Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 5 Aug 2025 07:23:18 -0700 +Subject: net: ti: icss-iep: Fix incorrect type for return value in + extts_enable() + +From: Alok Tiwari + +[ Upstream commit 5f1d1d14db7dabce9c815e7d7cd351f8d58b8585 ] + +The variable ret in icss_iep_extts_enable() was incorrectly declared +as u32, while the function returns int and may return negative error +codes. This will cause sign extension issues and incorrect error +propagation. Update ret to be int to fix error handling. + +This change corrects the declaration to avoid potential type mismatch. 
+ +Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver") +Signed-off-by: Alok Tiwari +Reviewed-by: Andrew Lunn +Link: https://patch.msgid.link/20250805142323.1949406-1-alok.a.tiwari@oracle.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/ti/icssg/icss_iep.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c +index 50bfbc2779e4..d8c9fe1d98c4 100644 +--- a/drivers/net/ethernet/ti/icssg/icss_iep.c ++++ b/drivers/net/ethernet/ti/icssg/icss_iep.c +@@ -621,7 +621,8 @@ static int icss_iep_pps_enable(struct icss_iep *iep, int on) + + static int icss_iep_extts_enable(struct icss_iep *iep, u32 index, int on) + { +- u32 val, cap, ret = 0; ++ u32 val, cap; ++ int ret = 0; + + mutex_lock(&iep->ptp_clk_mutex); + +-- +2.50.1 + diff --git a/queue-6.15/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch b/queue-6.15/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch new file mode 100644 index 0000000000..74e7ee86cf --- /dev/null +++ b/queue-6.15/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch @@ -0,0 +1,56 @@ +From 9d6910e013cfc66f3ee4de8d062a7ea32c989c2c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 5 Aug 2025 23:08:12 +0530 +Subject: net: ti: icssg-prueth: Fix emac link speed handling + +From: MD Danish Anwar + +[ Upstream commit 06feac15406f4f66f4c0c6ea60b10d44775d4133 ] + +When link settings are changed emac->speed is populated by +emac_adjust_link(). The link speed and other settings are then written into +the DRAM. However if both ports are brought down after this and brought up +again or if the operating mode is changed and a firmware reload is needed, +the DRAM is cleared by icssg_config(). As a result the link settings are +lost. + +Fix this by calling emac_adjust_link() after icssg_config(). This re +populates the settings in the DRAM after a new firmware load. 
+ +Fixes: 9facce84f406 ("net: ti: icssg-prueth: Fix firmware load sequence.") +Signed-off-by: MD Danish Anwar +Reviewed-by: Andrew Lunn +Message-ID: <20250805173812.2183161-1-danishanwar@ti.com> +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/ti/icssg/icssg_prueth.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c +index 2f5c4335dec3..008d77727400 100644 +--- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c ++++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c +@@ -50,6 +50,8 @@ + /* CTRLMMR_ICSSG_RGMII_CTRL register bits */ + #define ICSSG_CTRL_RGMII_ID_MODE BIT(24) + ++static void emac_adjust_link(struct net_device *ndev); ++ + static int emac_get_tx_ts(struct prueth_emac *emac, + struct emac_tx_ts_response *rsp) + { +@@ -266,6 +268,10 @@ static int prueth_emac_common_start(struct prueth *prueth) + ret = icssg_config(prueth, emac, slice); + if (ret) + goto disable_class; ++ ++ mutex_lock(&emac->ndev->phydev->lock); ++ emac_adjust_link(emac->ndev); ++ mutex_unlock(&emac->ndev->phydev->lock); + } + + ret = prueth_emac_start(prueth); +-- +2.50.1 + diff --git a/queue-6.15/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-6.15/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch new file mode 100644 index 0000000000..ffee4f20f9 --- /dev/null +++ b/queue-6.15/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch @@ -0,0 +1,129 @@ +From 1061094cf0f7026f7919fac281a9d2e9cf45d5b0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 1 Aug 2025 17:25:08 +0200 +Subject: netfilter: ctnetlink: fix refcount leak on table dump + +From: Florian Westphal + +[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ] + +There is a reference count leak in ctnetlink_dump_table(): + if (res < 0) { + nf_conntrack_get(&ct->ct_general); // HERE + cb->args[1] = (unsigned long)ct; + ... + +While its very unlikely, its possible that ct == last. +If this happens, then the refcount of ct was already incremented. +This 2nd increment is never undone. + +This prevents the conntrack object from being released, which in turn +keeps prevents cnet->count from dropping back to 0. + +This will then block the netns dismantle (or conntrack rmmod) as +nf_conntrack_cleanup_net_list() will wait forever. + +This can be reproduced by running conntrack_resize.sh selftest in a loop. +It takes ~20 minutes for me on a preemptible kernel on average before +I see a runaway kworker spinning in nf_conntrack_cleanup_net_list. + +One fix would to change this to: + if (res < 0) { + if (ct != last) + nf_conntrack_get(&ct->ct_general); + +But this reference counting isn't needed in the first place. +We can just store a cookie value instead. + +A followup patch will do the same for ctnetlink_exp_dump_table, +it looks to me as if this has the same problem and like +ctnetlink_dump_table, we only need a 'skip hint', not the actual +object so we can apply the same cookie strategy there as well. 
+ +Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c +index 2cc0fde23344..5fdcae45e0bc 100644 +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -884,8 +884,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) + + static int ctnetlink_done(struct netlink_callback *cb) + { +- if (cb->args[1]) +- nf_ct_put((struct nf_conn *)cb->args[1]); + kfree(cb->data); + return 0; + } +@@ -1208,19 +1206,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) + return 0; + } + ++static unsigned long ctnetlink_get_id(const struct nf_conn *ct) ++{ ++ unsigned long id = nf_ct_get_id(ct); ++ ++ return id ? id : 1; ++} ++ + static int + ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + { + unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0; + struct net *net = sock_net(skb->sk); +- struct nf_conn *ct, *last; ++ unsigned long last_id = cb->args[1]; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nf_conn *nf_ct_evict[8]; ++ struct nf_conn *ct; + int res, i; + spinlock_t *lockp; + +- last = (struct nf_conn *)cb->args[1]; + i = 0; + + local_bh_disable(); +@@ -1257,7 +1262,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + continue; + + if (cb->args[1]) { +- if (ct != last) ++ if (ctnetlink_get_id(ct) != last_id) + continue; + cb->args[1] = 0; + } +@@ -1270,8 +1275,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct, true, flags); + if (res < 0) { +- nf_conntrack_get(&ct->ct_general); +- cb->args[1] = (unsigned long)ct; ++ cb->args[1] = ctnetlink_get_id(ct); + spin_unlock(lockp); + goto out; + } +@@ -1284,12 +1288,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + } + out: + local_bh_enable(); +- if (last) { ++ if (last_id) { + /* nf ct hash resize happened, now clear the leftover. */ +- if ((struct nf_conn *)cb->args[1] == last) ++ if (cb->args[1] == last_id) + cb->args[1] = 0; +- +- nf_ct_put(last); + } + + while (i) { +-- +2.50.1 + diff --git a/queue-6.15/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch b/queue-6.15/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch new file mode 100644 index 0000000000..a89f30360a --- /dev/null +++ b/queue-6.15/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch @@ -0,0 +1,103 @@ +From cdcbf5a86ed33261d6360945f9d9033691f6bda8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 28 Jul 2025 15:26:49 +0900 +Subject: ptp: prevent possible ABBA deadlock in ptp_clock_freerun() + +From: Jeongjun Park + +[ Upstream commit 2efe41234dbd0a83fdb7cd38226c2f70039a2cd3 ] + +syzbot reported the following ABBA deadlock: + + CPU0 CPU1 + ---- ---- + n_vclocks_store() + lock(&ptp->n_vclocks_mux) [1] + (physical clock) + pc_clock_adjtime() + lock(&clk->rwsem) [2] + (physical clock) + ... 
+ ptp_clock_freerun() + ptp_vclock_in_use() + lock(&ptp->n_vclocks_mux) [3] + (physical clock) + ptp_clock_unregister() + posix_clock_unregister() + lock(&clk->rwsem) [4] + (virtual clock) + +Since ptp virtual clock is registered only under ptp physical clock, both +ptp_clock and posix_clock must be physical clocks for ptp_vclock_in_use() +to lock &ptp->n_vclocks_mux and check ptp->n_vclocks. + +However, when unregistering vclocks in n_vclocks_store(), the locking +ptp->n_vclocks_mux is a physical clock lock, but clk->rwsem of +ptp_clock_unregister() called through device_for_each_child_reverse() +is a virtual clock lock. + +Therefore, clk->rwsem used in CPU0 and clk->rwsem used in CPU1 are +different locks, but in lockdep, a false positive occurs because the +possibility of deadlock is determined through lock-class. + +To solve this, lock subclass annotation must be added to the posix_clock +rwsem of the vclock. + +Reported-by: syzbot+7cfb66a237c4a5fb22ad@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=7cfb66a237c4a5fb22ad +Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion") +Signed-off-by: Jeongjun Park +Acked-by: Richard Cochran +Reviewed-by: Vladimir Oltean +Link: https://patch.msgid.link/20250728062649.469882-1-aha310510@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/ptp/ptp_private.h | 5 +++++ + drivers/ptp/ptp_vclock.c | 7 +++++++ + 2 files changed, 12 insertions(+) + +diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h +index a6aad743c282..b352df4cd3f9 100644 +--- a/drivers/ptp/ptp_private.h ++++ b/drivers/ptp/ptp_private.h +@@ -24,6 +24,11 @@ + #define PTP_DEFAULT_MAX_VCLOCKS 20 + #define PTP_MAX_CHANNELS 2048 + ++enum { ++ PTP_LOCK_PHYSICAL = 0, ++ PTP_LOCK_VIRTUAL, ++}; ++ + struct timestamp_event_queue { + struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS]; + int head; +diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c +index 7febfdcbde8b..8ed4b8598924 100644 +--- a/drivers/ptp/ptp_vclock.c ++++ b/drivers/ptp/ptp_vclock.c +@@ -154,6 +154,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp) + return PTP_VCLOCK_REFRESH_INTERVAL; + } + ++static void ptp_vclock_set_subclass(struct ptp_clock *ptp) ++{ ++ lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL); ++} ++ + static const struct ptp_clock_info ptp_vclock_info = { + .owner = THIS_MODULE, + .name = "ptp virtual clock", +@@ -213,6 +218,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock) + return NULL; + } + ++ ptp_vclock_set_subclass(vclock->clock); ++ + timecounter_init(&vclock->tc, &vclock->cc, 0); + ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL); + +-- +2.50.1 + diff --git a/queue-6.15/riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch b/queue-6.15/riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch new file mode 100644 index 0000000000..d7a412657c --- /dev/null +++ b/queue-6.15/riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch @@ -0,0 +1,54 @@ +From 9340f5b60b7593637159f924e6d0f92ffc7effa9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 8 Aug 2025 09:36:56 +0000 +Subject: riscv: dts: thead: Add APB clocks for TH1520 GMACs + +From: Yao Zi + +[ Upstream commit a7f75e2883c4bd57b12c3be61bb926929adad9c0 ] + +Describe perisys-apb4-hclk as the APB clock for TH1520 SoC, which is +essential for accessing GMAC glue registers. 
+ +Fixes: 7e756671a664 ("riscv: dts: thead: Add TH1520 ethernet nodes") +Signed-off-by: Yao Zi +Reviewed-by: Drew Fustini +Tested-by: Drew Fustini +Link: https://patch.msgid.link/20250808093655.48074-5-ziyao@disroot.org +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + arch/riscv/boot/dts/thead/th1520.dtsi | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/arch/riscv/boot/dts/thead/th1520.dtsi b/arch/riscv/boot/dts/thead/th1520.dtsi +index 527336417765..0aae4e6a5b33 100644 +--- a/arch/riscv/boot/dts/thead/th1520.dtsi ++++ b/arch/riscv/boot/dts/thead/th1520.dtsi +@@ -286,8 +286,9 @@ gmac1: ethernet@ffe7060000 { + reg-names = "dwmac", "apb"; + interrupts = <67 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq"; +- clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC1>; +- clock-names = "stmmaceth", "pclk"; ++ clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC1>, ++ <&clk CLK_PERISYS_APB4_HCLK>; ++ clock-names = "stmmaceth", "pclk", "apb"; + snps,pbl = <32>; + snps,fixed-burst; + snps,multicast-filter-bins = <64>; +@@ -308,8 +309,9 @@ gmac0: ethernet@ffe7070000 { + reg-names = "dwmac", "apb"; + interrupts = <66 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq"; +- clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC0>; +- clock-names = "stmmaceth", "pclk"; ++ clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC0>, ++ <&clk CLK_PERISYS_APB4_HCLK>; ++ clock-names = "stmmaceth", "pclk", "apb"; + snps,pbl = <32>; + snps,fixed-burst; + snps,multicast-filter-bins = <64>; +-- +2.50.1 + diff --git a/queue-6.15/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch b/queue-6.15/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch new file mode 100644 index 0000000000..b0337e3e17 --- /dev/null +++ b/queue-6.15/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch @@ -0,0 +1,73 @@ +From bccdcc26cdd1a2db18a98c8314555d471f0ea68b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Aug 2025 15:40:11 -0400 +Subject: sctp: linearize cloned gso packets in sctp_rcv + +From: Xin Long + +[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ] + +A cloned head skb still shares these frag skbs in fraglist with the +original head skb. It's not safe to access these frag skbs. 
+ +syzbot reported two use-of-uninitialized-memory bugs caused by this: + + BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 + sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 + sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998 + sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88 + sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331 + sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122 + __release_sock+0x1da/0x330 net/core/sock.c:3106 + release_sock+0x6b/0x250 net/core/sock.c:3660 + sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360 + sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885 + sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031 + inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851 + sock_sendmsg_nosec net/socket.c:718 [inline] + +and + + BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 + sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 + sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88 + sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331 + sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148 + __release_sock+0x1d3/0x330 net/core/sock.c:3213 + release_sock+0x6b/0x270 net/core/sock.c:3767 + sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367 + sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886 + sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032 + inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851 + sock_sendmsg_nosec net/socket.c:712 [inline] + +This patch fixes it by linearizing cloned gso packets in sctp_rcv(). + +Fixes: 90017accff61 ("sctp: Add GSO support") +Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com +Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com +Signed-off-by: Xin Long +Reviewed-by: Marcelo Ricardo Leitner +Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/sctp/input.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/sctp/input.c b/net/sctp/input.c +index 0c0d2757f6f8..6fcdcaeed40e 100644 +--- a/net/sctp/input.c ++++ b/net/sctp/input.c +@@ -117,7 +117,7 @@ int sctp_rcv(struct sk_buff *skb) + * it's better to just linearize it otherwise crc computing + * takes longer. 
+ */ +- if ((!is_gso && skb_linearize(skb)) || ++ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) || + !pskb_may_pull(skb, sizeof(struct sctphdr))) + goto discard_it; + +-- +2.50.1 + diff --git a/queue-6.15/series b/queue-6.15/series index 2f0e67372d..143d8e99f4 100644 --- a/queue-6.15/series +++ b/queue-6.15/series @@ -50,3 +50,26 @@ acpi-processor-perflib-move-problematic-pr-performance-check.patch block-make-req_op_zone_finish-a-write-operation.patch mm-memory-tier-fix-abstract-distance-calculation-overflow.patch mfd-cros_ec-separate-charge-control-probing-from-usb-pd.patch +habanalabs-fix-uaf-in-export_dmabuf.patch +mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch +xfrm-restore-gso-for-sw-crypto.patch +udp-also-consider-secpath-when-evaluating-ipsec-use-.patch +netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch +net-hibmcge-fix-rtnl-deadlock-issue.patch +net-hibmcge-fix-the-division-by-zero-issue.patch +net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch +net-ti-icssg-prueth-fix-emac-link-speed-handling.patch +net-page_pool-allow-enabling-recycling-late-fix-fals.patch +net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch +sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch +net-lapbether-ignore-ops-locked-netdevs.patch +hamradio-ignore-ops-locked-netdevs.patch +erofs-fix-block-count-report-when-48-bit-layout-is-o.patch +intel_idle-allow-loading-acpi-tables-for-any-family.patch +cpuidle-governors-menu-avoid-using-invalid-recent-in.patch +net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch +riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch +ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch +tls-handle-data-disappearing-from-under-the-tls-ulp.patch +ipvs-fix-estimator-kthreads-preferred-affinity.patch +net-kcm-fix-race-condition-in-kcm_unattach.patch diff --git a/queue-6.15/tls-handle-data-disappearing-from-under-the-tls-ulp.patch b/queue-6.15/tls-handle-data-disappearing-from-under-the-tls-ulp.patch new file mode 100644 index 0000000000..0a9a3b481c --- /dev/null +++ b/queue-6.15/tls-handle-data-disappearing-from-under-the-tls-ulp.patch @@ -0,0 +1,106 @@ +From 2f6ca8c2086da5826a2e90788e7990e6a81f6da8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Aug 2025 16:29:06 -0700 +Subject: tls: handle data disappearing from under the TLS ULP + +From: Jakub Kicinski + +[ Upstream commit 6db015fc4b5d5f63a64a193f65d98da3a7fc811d ] + +TLS expects that it owns the receive queue of the TCP socket. +This cannot be guaranteed in case the reader of the TCP socket +entered before the TLS ULP was installed, or uses some non-standard +read API (eg. zerocopy ones). Replace the WARN_ON() and a buggy +early exit (which leaves anchor pointing to a freed skb) with real +error handling. Wipe the parsing state and tell the reader to retry. + +We already reload the anchor every time we (re)acquire the socket lock, +so the only condition we need to avoid is an out of bounds read +(not having enough bytes in the socket for previously parsed record len). + +If some data was read from under TLS but there's enough in the queue +we'll reload and decrypt what is most likely not a valid TLS record. +Leading to some undefined behavior from TLS perspective (corrupting +a stream? missing an alert? missing an attack?) but no kernel crash +should take place. 
+ +Reported-by: William Liu +Reported-by: Savino Dicanosa +Link: https://lore.kernel.org/tFjq_kf7sWIG3A7CrCg_egb8CVsT_gsmHAK0_wxDPJXfIzxFAMxqmLwp3MlU5EHiet0AwwJldaaFdgyHpeIUCS-3m3llsmRzp9xIOBR4lAI=@syst3mfailure.io +Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") +Reviewed-by: Eric Dumazet +Link: https://patch.msgid.link/20250807232907.600366-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/tls/tls.h | 2 +- + net/tls/tls_strp.c | 11 ++++++++--- + net/tls/tls_sw.c | 3 ++- + 3 files changed, 11 insertions(+), 5 deletions(-) + +diff --git a/net/tls/tls.h b/net/tls/tls.h +index 774859b63f0d..4e077068e6d9 100644 +--- a/net/tls/tls.h ++++ b/net/tls/tls.h +@@ -196,7 +196,7 @@ void tls_strp_msg_done(struct tls_strparser *strp); + int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb); + void tls_rx_msg_ready(struct tls_strparser *strp); + +-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh); ++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh); + int tls_strp_msg_cow(struct tls_sw_context_rx *ctx); + struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx); + int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst); +diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c +index 095cf31bae0b..d71643b494a1 100644 +--- a/net/tls/tls_strp.c ++++ b/net/tls/tls_strp.c +@@ -475,7 +475,7 @@ static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len) + strp->stm.offset = offset; + } + +-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) ++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) + { + struct strp_msg *rxm; + struct tls_msg *tlm; +@@ -484,8 +484,11 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) + DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len); + + if (!strp->copy_mode && force_refresh) { +- if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len)) +- return; ++ if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) { ++ WRITE_ONCE(strp->msg_ready, 0); ++ memset(&strp->stm, 0, sizeof(strp->stm)); ++ return false; ++ } + + tls_strp_load_anchor_with_queue(strp, strp->stm.full_len); + } +@@ -495,6 +498,8 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) + rxm->offset = strp->stm.offset; + tlm = tls_msg(strp->anchor); + tlm->control = strp->mark; ++ ++ return true; + } + + /* Called with lock held on lower socket */ +diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c +index 549d1ea01a72..51c98a007dda 100644 +--- a/net/tls/tls_sw.c ++++ b/net/tls/tls_sw.c +@@ -1384,7 +1384,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, + return sock_intr_errno(timeo); + } + +- tls_strp_msg_load(&ctx->strp, released); ++ if (unlikely(!tls_strp_msg_load(&ctx->strp, released))) ++ return tls_rx_rec_wait(sk, psock, nonblock, false); + + return 1; + } +-- +2.50.1 + diff --git a/queue-6.15/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-6.15/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch new file mode 100644 index 0000000000..7dd6f0bf43 --- /dev/null +++ b/queue-6.15/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch @@ -0,0 +1,51 @@ +From b41a25fd1983a12413c6fe2ac51da32c519ef08e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Aug 2025 11:26:27 +0200 +Subject: udp: also consider secpath when evaluating ipsec use for checksumming + +From: Sabrina Dubroca + +[ Upstream commit 
1118aaa3b35157777890fffab91d8c1da841b20b ] + +Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in +IPsec case") tried to fix checksumming in UFO when the packets are +going through IPsec, so that we can't rely on offloads because the UDP +header and payload will be encrypted. + +But when doing a TCP test over VXLAN going through IPsec transport +mode with GSO enabled (esp4_offload module loaded), I'm seeing broken +UDP checksums on the encap after successful decryption. + +The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via +__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this +point we've already dropped the dst (unless the device sets +IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and +we proceed with checksum offload. + +Make need_ipsec also check the secpath, which is not dropped on this +callpath. + +Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case") +Signed-off-by: Sabrina Dubroca +Signed-off-by: Steffen Klassert +Signed-off-by: Sasha Levin +--- + net/ipv4/udp_offload.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c +index a1aca6308677..4245522d4201 100644 +--- a/net/ipv4/udp_offload.c ++++ b/net/ipv4/udp_offload.c +@@ -61,7 +61,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, + remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); + skb->remcsum_offload = remcsum; + +- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); ++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb); + /* Try to offload checksum if possible */ + offload_csum = !!(need_csum && + !need_ipsec && +-- +2.50.1 + diff --git a/queue-6.15/xfrm-restore-gso-for-sw-crypto.patch b/queue-6.15/xfrm-restore-gso-for-sw-crypto.patch new file mode 100644 index 0000000000..2a345249f6 --- /dev/null +++ b/queue-6.15/xfrm-restore-gso-for-sw-crypto.patch @@ -0,0 +1,58 @@ +From c4a6ec2c44c573d9dca08240a0c2e0c8ba20a461 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Aug 2025 11:26:25 +0200 +Subject: xfrm: restore GSO for SW crypto + +From: Sabrina Dubroca + +[ Upstream commit 234d1eff5d4987024be9d40ac07b918a5ae8db1a ] + +Commit 49431af6c4ef incorrectly assumes that the GSO path is only used +by HW offload, but it's also useful for SW crypto. + +This patch re-enables GSO for SW crypto. It's not an exact revert to +preserve the other changes made to xfrm_dev_offload_ok afterwards, but +it reverts all of its effects. 
+ +Fixes: 49431af6c4ef ("xfrm: rely on XFRM offload") +Signed-off-by: Sabrina Dubroca +Reviewed-by: Leon Romanovsky +Reviewed-by: Zhu Yanjun +Signed-off-by: Steffen Klassert +Signed-off-by: Sasha Levin +--- + net/xfrm/xfrm_device.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c +index a2d3a5f3b485..a6c289858401 100644 +--- a/net/xfrm/xfrm_device.c ++++ b/net/xfrm/xfrm_device.c +@@ -415,10 +415,12 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) + struct net_device *dev = x->xso.dev; + bool check_tunnel_size; + +- if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED) ++ if (!x->type_offload || ++ (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED && x->encap)) + return false; + +- if ((dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) { ++ if ((!dev || dev == xfrm_dst_path(dst)->dev) && ++ !xdst->child->xfrm) { + mtu = xfrm_state_mtu(x, xdst->child_mtu_cached); + if (skb->len <= mtu) + goto ok; +@@ -430,6 +432,9 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) + return false; + + ok: ++ if (!dev) ++ return true; ++ + check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET && + x->props.mode == XFRM_MODE_TUNNEL; + switch (x->props.family) { +-- +2.50.1 + diff --git a/queue-6.16/bnxt-fill-data-page-pool-with-frags-if-page_size-bnx.patch b/queue-6.16/bnxt-fill-data-page-pool-with-frags-if-page_size-bnx.patch new file mode 100644 index 0000000000..a04ed34333 --- /dev/null +++ b/queue-6.16/bnxt-fill-data-page-pool-with-frags-if-page_size-bnx.patch @@ -0,0 +1,69 @@ +From dddfb590c42c3c450b2cac204c7b69ea3c79f1d1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Aug 2025 11:29:07 -0700 +Subject: bnxt: fill data page pool with frags if PAGE_SIZE > BNXT_RX_PAGE_SIZE + +From: David Wei + +[ Upstream commit 39f8fcda2088382a4aa70b258d6f7225aa386f11 ] + +The data page pool always fills the HW rx ring with pages. On arm64 with +64K pages, this will waste _at least_ 32K of memory per entry in the rx +ring. + +Fix by fragmenting the pages if PAGE_SIZE > BNXT_RX_PAGE_SIZE. This +makes the data page pool the same as the header pool. + +Tested with iperf3 with a small (64 entries) rx ring to encourage buffer +circulation. 
+ +Fixes: cd1fafe7da1f ("eth: bnxt: add support rx side device memory TCP") +Reviewed-by: Michael Chan +Signed-off-by: David Wei +Link: https://patch.msgid.link/20250812182907.1540755-1-dw@davidwei.uk +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 12 +++++++++--- + 1 file changed, 9 insertions(+), 3 deletions(-) + +diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +index 8d950b43846e..e165490af6ac 100644 +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +@@ -921,15 +921,21 @@ static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, + + static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping, + struct bnxt_rx_ring_info *rxr, ++ unsigned int *offset, + gfp_t gfp) + { + netmem_ref netmem; + +- netmem = page_pool_alloc_netmems(rxr->page_pool, gfp); ++ if (PAGE_SIZE > BNXT_RX_PAGE_SIZE) { ++ netmem = page_pool_alloc_frag_netmem(rxr->page_pool, offset, BNXT_RX_PAGE_SIZE, gfp); ++ } else { ++ netmem = page_pool_alloc_netmems(rxr->page_pool, gfp); ++ *offset = 0; ++ } + if (!netmem) + return 0; + +- *mapping = page_pool_get_dma_addr_netmem(netmem); ++ *mapping = page_pool_get_dma_addr_netmem(netmem) + *offset; + return netmem; + } + +@@ -1024,7 +1030,7 @@ static int bnxt_alloc_rx_netmem(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, + dma_addr_t mapping; + netmem_ref netmem; + +- netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, gfp); ++ netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, &offset, gfp); + if (!netmem) + return -ENOMEM; + +-- +2.50.1 + diff --git a/queue-6.16/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch b/queue-6.16/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch new file mode 100644 index 0000000000..f36bad1a44 --- /dev/null +++ b/queue-6.16/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch @@ -0,0 +1,91 @@ +From 7cde332159d0e7a0dd2d95c8374ad52850b4db14 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 11 Aug 2025 17:03:11 +0200 +Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data + +From: Rafael J. Wysocki + +[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ] + +Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid +discarding useful information") caused the number of wakeup interrupts +to increase on an idle system [1], which was not expected to happen +after merely allowing shallower idle states to be selected by the +governor in some cases. + +However, on the system in question, all of the idle states deeper than +WFI are rejected by the driver due to a firmware issue [2]. This causes +the governor to only consider the recent interval duriation data +corresponding to attempts to enter WFI that are successful and the +recent invervals table is filled with values lower than the scheduler +tick period. Consequently, the governor predicts an idle duration +below the scheduler tick period length and avoids stopping the tick +more often which leads to the observed symptom. + +Address it by modifying the governor to update the recent intervals +table also when entering the previously selected idle state fails, so +it knows that the short idle intervals might have been the minority +had the selected idle states been actually entered every time. 
+ +Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information") +Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1] +Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2] +Signed-off-by: Rafael J. Wysocki +Tested-by: Christian Loehle +Tested-by: Marc Zyngier +Reviewed-by: Christian Loehle +Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki +Signed-off-by: Sasha Levin +--- + drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++---- + 1 file changed, 17 insertions(+), 4 deletions(-) + +diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c +index 52d5d26fc7c6..81306612a5c6 100644 +--- a/drivers/cpuidle/governors/menu.c ++++ b/drivers/cpuidle/governors/menu.c +@@ -97,6 +97,14 @@ static inline int which_bucket(u64 duration_ns) + + static DEFINE_PER_CPU(struct menu_device, menu_devices); + ++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us) ++{ ++ /* Update the repeating-pattern data. */ ++ data->intervals[data->interval_ptr++] = interval_us; ++ if (data->interval_ptr >= INTERVALS) ++ data->interval_ptr = 0; ++} ++ + static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); + + /* +@@ -222,6 +230,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, + if (data->needs_update) { + menu_update(drv, dev); + data->needs_update = 0; ++ } else if (!dev->last_residency_ns) { ++ /* ++ * This happens when the driver rejects the previously selected ++ * idle state and returns an error, so update the recent ++ * intervals table to prevent invalid information from being ++ * used going forward. ++ */ ++ menu_update_intervals(data, UINT_MAX); + } + + /* Find the shortest expected idle interval. */ +@@ -482,10 +498,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) + + data->correction_factor[data->bucket] = new_factor; + +- /* update the repeating-pattern data */ +- data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns); +- if (data->interval_ptr >= INTERVALS) +- data->interval_ptr = 0; ++ menu_update_intervals(data, ktime_to_us(measured_ns)); + } + + /** +-- +2.50.1 + diff --git a/queue-6.16/erofs-fix-block-count-report-when-48-bit-layout-is-o.patch b/queue-6.16/erofs-fix-block-count-report-when-48-bit-layout-is-o.patch new file mode 100644 index 0000000000..4e340df7a0 --- /dev/null +++ b/queue-6.16/erofs-fix-block-count-report-when-48-bit-layout-is-o.patch @@ -0,0 +1,37 @@ +From 04b932e34aabb02993b5947cbdae75426353973a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Aug 2025 16:20:19 +0800 +Subject: erofs: fix block count report when 48-bit layout is on + +From: Gao Xiang + +[ Upstream commit 0b96d9bed324a1c1b7d02bfb9596351ef178428d ] + +Fix incorrect shift order when combining the 48-bit block count. 
+ +Fixes: 2e1473d5195f ("erofs: implement 48-bit block addressing for unencoded inodes") +Signed-off-by: Gao Xiang +Link: https://lore.kernel.org/r/20250807082019.3093539-1-hsiangkao@linux.alibaba.com +Signed-off-by: Sasha Levin +--- + fs/erofs/super.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/erofs/super.c b/fs/erofs/super.c +index e1e9f06e8342..799fef437aa8 100644 +--- a/fs/erofs/super.c ++++ b/fs/erofs/super.c +@@ -313,8 +313,8 @@ static int erofs_read_superblock(struct super_block *sb) + sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); + if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) { + sbi->root_nid = le64_to_cpu(dsb->rootnid_8b); +- sbi->dif0.blocks = (sbi->dif0.blocks << 32) | +- le16_to_cpu(dsb->rb.blocks_hi); ++ sbi->dif0.blocks = sbi->dif0.blocks | ++ ((u64)le16_to_cpu(dsb->rb.blocks_hi) << 32); + } else { + sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b); + } +-- +2.50.1 + diff --git a/queue-6.16/habanalabs-fix-uaf-in-export_dmabuf.patch b/queue-6.16/habanalabs-fix-uaf-in-export_dmabuf.patch new file mode 100644 index 0000000000..480d832a15 --- /dev/null +++ b/queue-6.16/habanalabs-fix-uaf-in-export_dmabuf.patch @@ -0,0 +1,96 @@ +From 94bb4ee76202eae8bd46a9e2051579b0bd2c8aa8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 12 Jul 2025 06:02:31 +0100 +Subject: habanalabs: fix UAF in export_dmabuf() + +From: Al Viro + +[ Upstream commit 33927f3d0ecdcff06326d6e4edb6166aed42811c ] + +As soon as we'd inserted a file reference into descriptor table, another +thread could close it. That's fine for the case when all we are doing is +returning that descriptor to userland (it's a race, but it's a userland +race and there's nothing the kernel can do about it). However, if we +follow fd_install() with any kind of access to objects that would be +destroyed on close (be it the struct file itself or anything destroyed +by its ->release()), we have a UAF. + +dma_buf_fd() is a combination of reserving a descriptor and fd_install(). +habanalabs export_dmabuf() calls it and then proceeds to access the +objects destroyed on close. In particular, it grabs an extra reference to +another struct file that will be dropped as part of ->release() for ours; +that "will be" is actually "might have already been". + +Fix that by reserving descriptor before anything else and do fd_install() +only when everything had been set up. As a side benefit, we no longer +have the failure exit with file already created, but reference to +underlying file (as well as ->dmabuf_export_cnt, etc.) not grabbed yet; +unlike dma_buf_fd(), fd_install() can't fail. 
+ +Fixes: db1a8dd916aa ("habanalabs: add support for dma-buf exporter") +Signed-off-by: Al Viro +Signed-off-by: Sasha Levin +--- + drivers/accel/habanalabs/common/memory.c | 23 +++++++---------------- + 1 file changed, 7 insertions(+), 16 deletions(-) + +diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c +index 601fdbe70179..61472a381904 100644 +--- a/drivers/accel/habanalabs/common/memory.c ++++ b/drivers/accel/habanalabs/common/memory.c +@@ -1829,9 +1829,6 @@ static void hl_release_dmabuf(struct dma_buf *dmabuf) + struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv; + struct hl_ctx *ctx; + +- if (!hl_dmabuf) +- return; +- + ctx = hl_dmabuf->ctx; + + if (hl_dmabuf->memhash_hnode) +@@ -1859,7 +1856,12 @@ static int export_dmabuf(struct hl_ctx *ctx, + { + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct hl_device *hdev = ctx->hdev; +- int rc, fd; ++ CLASS(get_unused_fd, fd)(flags); ++ ++ if (fd < 0) { ++ dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd); ++ return fd; ++ } + + exp_info.ops = &habanalabs_dmabuf_ops; + exp_info.size = total_size; +@@ -1872,13 +1874,6 @@ static int export_dmabuf(struct hl_ctx *ctx, + return PTR_ERR(hl_dmabuf->dmabuf); + } + +- fd = dma_buf_fd(hl_dmabuf->dmabuf, flags); +- if (fd < 0) { +- dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd); +- rc = fd; +- goto err_dma_buf_put; +- } +- + hl_dmabuf->ctx = ctx; + hl_ctx_get(hl_dmabuf->ctx); + atomic_inc(&ctx->hdev->dmabuf_export_cnt); +@@ -1890,13 +1885,9 @@ static int export_dmabuf(struct hl_ctx *ctx, + get_file(ctx->hpriv->file_priv->filp); + + *dmabuf_fd = fd; ++ fd_install(take_fd(fd), hl_dmabuf->dmabuf->file); + + return 0; +- +-err_dma_buf_put: +- hl_dmabuf->dmabuf->priv = NULL; +- dma_buf_put(hl_dmabuf->dmabuf); +- return rc; + } + + static int validate_export_params_common(struct hl_device *hdev, u64 addr, u64 size, u64 offset) +-- +2.50.1 + diff --git a/queue-6.16/hamradio-ignore-ops-locked-netdevs.patch b/queue-6.16/hamradio-ignore-ops-locked-netdevs.patch new file mode 100644 index 0000000000..f0ac3d5cff --- /dev/null +++ b/queue-6.16/hamradio-ignore-ops-locked-netdevs.patch @@ -0,0 +1,62 @@ +From f83f4f27bb385d16cb1c9541d6b03c8088d15f0d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 6 Aug 2025 14:37:26 -0700 +Subject: hamradio: ignore ops-locked netdevs + +From: Stanislav Fomichev + +[ Upstream commit c64237960819aee1766d03f446ae6de94b1e3f73 ] + +Syzkaller managed to trigger lock dependency in xsk_notify via +register_netdevice. As discussed in [0], using register_netdevice +in the notifiers is problematic so skip adding hamradio for ops-locked +devices. 
+ + xsk_notifier+0x89/0x230 net/xdp/xsk.c:1664 + notifier_call_chain+0x1b6/0x3e0 kernel/notifier.c:85 + call_netdevice_notifiers_extack net/core/dev.c:2267 [inline] + call_netdevice_notifiers net/core/dev.c:2281 [inline] + unregister_netdevice_many_notify+0x14d7/0x1ff0 net/core/dev.c:12156 + unregister_netdevice_many net/core/dev.c:12219 [inline] + unregister_netdevice_queue+0x33c/0x380 net/core/dev.c:12063 + register_netdevice+0x1689/0x1ae0 net/core/dev.c:11241 + bpq_new_device drivers/net/hamradio/bpqether.c:481 [inline] + bpq_device_event+0x491/0x600 drivers/net/hamradio/bpqether.c:523 + notifier_call_chain+0x1b6/0x3e0 kernel/notifier.c:85 + call_netdevice_notifiers_extack net/core/dev.c:2267 [inline] + call_netdevice_notifiers net/core/dev.c:2281 [inline] + __dev_notify_flags+0x18d/0x2e0 net/core/dev.c:-1 + netif_change_flags+0xe8/0x1a0 net/core/dev.c:9608 + dev_change_flags+0x130/0x260 net/core/dev_api.c:68 + devinet_ioctl+0xbb4/0x1b50 net/ipv4/devinet.c:1200 + inet_ioctl+0x3c0/0x4c0 net/ipv4/af_inet.c:1001 + +0: https://lore.kernel.org/netdev/20250625140357.6203d0af@kernel.org/ +Fixes: 4c975fd70002 ("net: hold instance lock during NETDEV_REGISTER/UP") +Suggested-by: Jakub Kicinski +Reported-by: syzbot+e6300f66a999a6612477@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=e6300f66a999a6612477 +Signed-off-by: Stanislav Fomichev +Link: https://patch.msgid.link/20250806213726.1383379-2-sdf@fomichev.me +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/hamradio/bpqether.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c +index 0e0fe32d2da4..045c5177262e 100644 +--- a/drivers/net/hamradio/bpqether.c ++++ b/drivers/net/hamradio/bpqether.c +@@ -138,7 +138,7 @@ static inline struct net_device *bpq_get_ax25_dev(struct net_device *dev) + + static inline int dev_is_ethdev(struct net_device *dev) + { +- return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5); ++ return dev->type == ARPHRD_ETHER && !netdev_need_ops_lock(dev); + } + + /* ------------------------------------------------------------------------ */ +-- +2.50.1 + diff --git a/queue-6.16/intel_idle-allow-loading-acpi-tables-for-any-family.patch b/queue-6.16/intel_idle-allow-loading-acpi-tables-for-any-family.patch new file mode 100644 index 0000000000..aacb76e92b --- /dev/null +++ b/queue-6.16/intel_idle-allow-loading-acpi-tables-for-any-family.patch @@ -0,0 +1,41 @@ +From 5bc45712394018e4bf1a06c92753e90d4d00252a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 8 Aug 2025 15:37:14 -0400 +Subject: intel_idle: Allow loading ACPI tables for any family + +From: Len Brown + +[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ] + +There is no reason to limit intel_idle's loading of ACPI tables to +family 6. Upcoming Intel processors are not in family 6. + +Below "Fixes" really means "applies cleanly until". +That syntax commit didn't change the previous logic, +but shows this patch applies back 5-years. + +Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros") +Signed-off-by: Len Brown +Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com +Signed-off-by: Rafael J. 
Wysocki +Signed-off-by: Sasha Levin +--- + drivers/idle/intel_idle.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 73747d20df85..91a7b7e7c0c8 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -1679,7 +1679,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { + }; + + static const struct x86_cpu_id intel_mwait_ids[] __initconst = { +- X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL), ++ X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL), + {} + }; + +-- +2.50.1 + diff --git a/queue-6.16/ipvs-fix-estimator-kthreads-preferred-affinity.patch b/queue-6.16/ipvs-fix-estimator-kthreads-preferred-affinity.patch new file mode 100644 index 0000000000..e2b91cf208 --- /dev/null +++ b/queue-6.16/ipvs-fix-estimator-kthreads-preferred-affinity.patch @@ -0,0 +1,90 @@ +From f4ed6bb5279fbef65bd65697b4e54aa15facbcaa Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 29 Jul 2025 14:26:11 +0200 +Subject: ipvs: Fix estimator kthreads preferred affinity + +From: Frederic Weisbecker + +[ Upstream commit c0a23bbc98e93704a1f4fb5e7e7bb2d7c0fb6eb3 ] + +The estimator kthreads' affinity are defined by sysctl overwritten +preferences and applied through a plain call to the scheduler's affinity +API. + +However since the introduction of managed kthreads preferred affinity, +such a practice shortcuts the kthreads core code which eventually +overwrites the target to the default unbound affinity. + +Fix this with using the appropriate kthread's API. + +Fixes: d1a89197589c ("kthread: Default affine kthread to its preferred NUMA node") +Signed-off-by: Frederic Weisbecker +Acked-by: Julian Anastasov +Signed-off-by: Florian Westphal +Signed-off-by: Sasha Levin +--- + include/net/ip_vs.h | 13 +++++++++++++ + kernel/kthread.c | 1 + + net/netfilter/ipvs/ip_vs_est.c | 3 ++- + 3 files changed, 16 insertions(+), 1 deletion(-) + +diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h +index ff406ef4fd4a..29a36709e7f3 100644 +--- a/include/net/ip_vs.h ++++ b/include/net/ip_vs.h +@@ -1163,6 +1163,14 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) + return housekeeping_cpumask(HK_TYPE_KTHREAD); + } + ++static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs) ++{ ++ if (ipvs->est_cpulist_valid) ++ return ipvs->sysctl_est_cpulist; ++ else ++ return NULL; ++} ++ + static inline int sysctl_est_nice(struct netns_ipvs *ipvs) + { + return ipvs->sysctl_est_nice; +@@ -1270,6 +1278,11 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) + return housekeeping_cpumask(HK_TYPE_KTHREAD); + } + ++static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs) ++{ ++ return NULL; ++} ++ + static inline int sysctl_est_nice(struct netns_ipvs *ipvs) + { + return IPVS_EST_NICE; +diff --git a/kernel/kthread.c b/kernel/kthread.c +index 85fc068f0083..8d5e87b03d1e 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -894,6 +894,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) + + return ret; + } ++EXPORT_SYMBOL_GPL(kthread_affine_preferred); + + /* + * Re-affine kthreads according to their preferences +diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c +index f821ad2e19b3..15049b826732 100644 +--- a/net/netfilter/ipvs/ip_vs_est.c ++++ b/net/netfilter/ipvs/ip_vs_est.c +@@ -265,7 +265,8 @@ int 
ip_vs_est_kthread_start(struct netns_ipvs *ipvs, + } + + set_user_nice(kd->task, sysctl_est_nice(ipvs)); +- set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs)); ++ if (sysctl_est_preferred_cpulist(ipvs)) ++ kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs)); + + pr_info("starting estimator thread %d...\n", kd->id); + wake_up_process(kd->task); +-- +2.50.1 + diff --git a/queue-6.16/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch b/queue-6.16/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch new file mode 100644 index 0000000000..e95b025b3a --- /dev/null +++ b/queue-6.16/mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch @@ -0,0 +1,78 @@ +From fbc4eefd9eaf2473aa9cf85fa2deda55d8ec654d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 24 Jul 2025 17:09:56 +0800 +Subject: mm/smaps: fix race between smaps_hugetlb_range and migration + +From: Jinjiang Tu + +[ Upstream commit 45d19b4b6c2d422771c29b83462d84afcbb33f01 ] + +smaps_hugetlb_range() handles the pte without holdling ptl, and may be +concurrenct with migration, leaing to BUG_ON in pfn_swap_entry_to_page(). +The race is as follows. + +smaps_hugetlb_range migrate_pages + huge_ptep_get + remove_migration_ptes + folio_unlock + pfn_swap_entry_folio + BUG_ON + +To fix it, hold ptl lock in smaps_hugetlb_range(). + +Link: https://lkml.kernel.org/r/20250724090958.455887-1-tujinjiang@huawei.com +Link: https://lkml.kernel.org/r/20250724090958.455887-2-tujinjiang@huawei.com +Fixes: 25ee01a2fca0 ("mm: hugetlb: proc: add hugetlb-related fields to /proc/PID/smaps") +Signed-off-by: Jinjiang Tu +Acked-by: David Hildenbrand +Cc: Andrei Vagin +Cc: Andrii Nakryiko +Cc: Baolin Wang +Cc: Brahmajit Das +Cc: Catalin Marinas +Cc: Christophe Leroy +Cc: David Rientjes +Cc: Dev Jain +Cc: Hugh Dickins +Cc: Joern Engel +Cc: Kefeng Wang +Cc: Lorenzo Stoakes +Cc: Michal Hocko +Cc: Ryan Roberts +Cc: Thiago Jung Bauermann +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + fs/proc/task_mmu.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c +index 751479eb128f..0102ab3aaec1 100644 +--- a/fs/proc/task_mmu.c ++++ b/fs/proc/task_mmu.c +@@ -1020,10 +1020,13 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, + { + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; +- pte_t ptent = huge_ptep_get(walk->mm, addr, pte); + struct folio *folio = NULL; + bool present = false; ++ spinlock_t *ptl; ++ pte_t ptent; + ++ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); ++ ptent = huge_ptep_get(walk->mm, addr, pte); + if (pte_present(ptent)) { + folio = page_folio(pte_page(ptent)); + present = true; +@@ -1042,6 +1045,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, + else + mss->private_hugetlb += huge_page_size(hstate_vma(vma)); + } ++ spin_unlock(ptl); + return 0; + } + #else +-- +2.50.1 + diff --git a/queue-6.16/net-hibmcge-fix-rtnl-deadlock-issue.patch b/queue-6.16/net-hibmcge-fix-rtnl-deadlock-issue.patch new file mode 100644 index 0000000000..f8558b7140 --- /dev/null +++ b/queue-6.16/net-hibmcge-fix-rtnl-deadlock-issue.patch @@ -0,0 +1,122 @@ +From f88d08d5c14c6994d3611fc3adc2f16564729220 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 6 Aug 2025 18:27:56 +0800 +Subject: net: hibmcge: fix rtnl deadlock issue + +From: Jijie Shao + +[ Upstream commit c875503a9b9082928d7d3fc60b5400d16fbfae4e ] + +Currently, the hibmcge netdev acquires the rtnl_lock in 
+pci_error_handlers.reset_prepare() and releases it in +pci_error_handlers.reset_done(). + +However, in the PCI framework: +pci_reset_bus - __pci_reset_slot - pci_slot_save_and_disable_locked - + pci_dev_save_and_disable - err_handler->reset_prepare(dev); + +In pci_slot_save_and_disable_locked(): + list_for_each_entry(dev, &slot->bus->devices, bus_list) { + if (!dev->slot || dev->slot!= slot) + continue; + pci_dev_save_and_disable(dev); + if (dev->subordinate) + pci_bus_save_and_disable_locked(dev->subordinate); + } + +This will iterate through all devices under the current bus and execute +err_handler->reset_prepare(), causing two devices of the hibmcge driver +to sequentially request the rtnl_lock, leading to a deadlock. + +Since the driver now executes netif_device_detach() +before the reset process, it will not concurrently with +other netdev APIs, so there is no need to hold the rtnl_lock now. + +Therefore, this patch removes the rtnl_lock during the reset process and +adjusts the position of HBG_NIC_STATE_RESETTING to ensure +that multiple resets are not executed concurrently. + +Fixes: 3f5a61f6d504f ("net: hibmcge: Add reset supported in this module") +Signed-off-by: Jijie Shao +Reviewed-by: Simon Horman +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c | 14 +++++--------- + 1 file changed, 5 insertions(+), 9 deletions(-) + +diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c +index ff3295b60a69..dee1e8681157 100644 +--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c ++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c +@@ -53,9 +53,11 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type) + { + int ret; + +- ASSERT_RTNL(); ++ if (test_and_set_bit(HBG_NIC_STATE_RESETTING, &priv->state)) ++ return -EBUSY; + + if (netif_running(priv->netdev)) { ++ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state); + dev_warn(&priv->pdev->dev, + "failed to reset because port is up\n"); + return -EBUSY; +@@ -64,7 +66,6 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type) + netif_device_detach(priv->netdev); + + priv->reset_type = type; +- set_bit(HBG_NIC_STATE_RESETTING, &priv->state); + clear_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state); + ret = hbg_hw_event_notify(priv, HBG_HW_EVENT_RESET); + if (ret) { +@@ -83,28 +84,25 @@ static int hbg_reset_done(struct hbg_priv *priv, enum hbg_reset_type type) + type != priv->reset_type) + return 0; + +- ASSERT_RTNL(); +- +- clear_bit(HBG_NIC_STATE_RESETTING, &priv->state); + ret = hbg_rebuild(priv); + if (ret) { + set_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state); ++ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state); + dev_err(&priv->pdev->dev, "failed to rebuild after reset\n"); + return ret; + } + + netif_device_attach(priv->netdev); ++ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state); + + dev_info(&priv->pdev->dev, "reset done\n"); + return ret; + } + +-/* must be protected by rtnl lock */ + int hbg_reset(struct hbg_priv *priv) + { + int ret; + +- ASSERT_RTNL(); + ret = hbg_reset_prepare(priv, HBG_RESET_TYPE_FUNCTION); + if (ret) + return ret; +@@ -169,7 +167,6 @@ static void hbg_pci_err_reset_prepare(struct pci_dev *pdev) + struct net_device *netdev = pci_get_drvdata(pdev); + struct hbg_priv *priv = netdev_priv(netdev); + +- rtnl_lock(); + hbg_reset_prepare(priv, HBG_RESET_TYPE_FLR); + } + +@@ -179,7 +176,6 @@ static void hbg_pci_err_reset_done(struct pci_dev *pdev) + struct 
hbg_priv *priv = netdev_priv(netdev); + + hbg_reset_done(priv, HBG_RESET_TYPE_FLR); +- rtnl_unlock(); + } + + static const struct pci_error_handlers hbg_pci_err_handler = { +-- +2.50.1 + diff --git a/queue-6.16/net-hibmcge-fix-the-division-by-zero-issue.patch b/queue-6.16/net-hibmcge-fix-the-division-by-zero-issue.patch new file mode 100644 index 0000000000..67eff14c98 --- /dev/null +++ b/queue-6.16/net-hibmcge-fix-the-division-by-zero-issue.patch @@ -0,0 +1,46 @@ +From 2d5cc1e9320bffb1c936a0e982dce8ab8803a836 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 6 Aug 2025 18:27:57 +0800 +Subject: net: hibmcge: fix the division by zero issue + +From: Jijie Shao + +[ Upstream commit 7004b26f0b64331143eb0b312e77a357a11427ce ] + +When the network port is down, the queue is released, and ring->len is 0. +In debugfs, hbg_get_queue_used_num() will be called, +which may lead to a division by zero issue. + +This patch adds a check, if ring->len is 0, +hbg_get_queue_used_num() directly returns 0. + +Fixes: 40735e7543f9 ("net: hibmcge: Implement .ndo_start_xmit function") +Signed-off-by: Jijie Shao +Reviewed-by: Simon Horman +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h +index 2883a5899ae2..8b6110599e10 100644 +--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h ++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h +@@ -29,7 +29,12 @@ static inline bool hbg_fifo_is_full(struct hbg_priv *priv, enum hbg_dir dir) + + static inline u32 hbg_get_queue_used_num(struct hbg_ring *ring) + { +- return (ring->ntu + ring->len - ring->ntc) % ring->len; ++ u32 len = READ_ONCE(ring->len); ++ ++ if (!len) ++ return 0; ++ ++ return (READ_ONCE(ring->ntu) + len - READ_ONCE(ring->ntc)) % len; + } + + netdev_tx_t hbg_net_start_xmit(struct sk_buff *skb, struct net_device *netdev); +-- +2.50.1 + diff --git a/queue-6.16/net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch b/queue-6.16/net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch new file mode 100644 index 0000000000..53b232d11f --- /dev/null +++ b/queue-6.16/net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch @@ -0,0 +1,68 @@ +From d8162f09505cdd9f80470c636561332cc9e2e7d0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 6 Aug 2025 18:27:58 +0800 +Subject: net: hibmcge: fix the np_link_fail error reporting issue + +From: Jijie Shao + +[ Upstream commit 62c50180ffda01468e640ac14925503796f255e2 ] + +Currently, after modifying device port mode, the np_link_ok state +is immediately checked. At this point, the device may not yet ready, +leading to the querying of an intermediate state. + +This patch will poll to check if np_link is ok after +modifying device port mode, and only report np_link_fail upon timeout. 
+ +Fixes: e0306637e85d ("net: hibmcge: Add support for mac link exception handling feature") +Signed-off-by: Jijie Shao +Reviewed-by: Simon Horman +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c | 15 +++++++++++++-- + 1 file changed, 13 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c +index 9b65eef62b3f..2844124f306d 100644 +--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c ++++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c +@@ -12,6 +12,8 @@ + + #define HBG_HW_EVENT_WAIT_TIMEOUT_US (2 * 1000 * 1000) + #define HBG_HW_EVENT_WAIT_INTERVAL_US (10 * 1000) ++#define HBG_MAC_LINK_WAIT_TIMEOUT_US (500 * 1000) ++#define HBG_MAC_LINK_WAIT_INTERVAL_US (5 * 1000) + /* little endian or big endian. + * ctrl means packet description, data means skb packet data + */ +@@ -213,6 +215,9 @@ void hbg_hw_fill_buffer(struct hbg_priv *priv, u32 buffer_dma_addr) + + void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex) + { ++ u32 link_status; ++ int ret; ++ + hbg_hw_mac_enable(priv, HBG_STATUS_DISABLE); + + hbg_reg_write_field(priv, HBG_REG_PORT_MODE_ADDR, +@@ -224,8 +229,14 @@ void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex) + + hbg_hw_mac_enable(priv, HBG_STATUS_ENABLE); + +- if (!hbg_reg_read_field(priv, HBG_REG_AN_NEG_STATE_ADDR, +- HBG_REG_AN_NEG_STATE_NP_LINK_OK_B)) ++ /* wait MAC link up */ ++ ret = readl_poll_timeout(priv->io_base + HBG_REG_AN_NEG_STATE_ADDR, ++ link_status, ++ FIELD_GET(HBG_REG_AN_NEG_STATE_NP_LINK_OK_B, ++ link_status), ++ HBG_MAC_LINK_WAIT_INTERVAL_US, ++ HBG_MAC_LINK_WAIT_TIMEOUT_US); ++ if (ret) + hbg_np_link_fail_task_schedule(priv); + } + +-- +2.50.1 + diff --git a/queue-6.16/net-kcm-fix-race-condition-in-kcm_unattach.patch b/queue-6.16/net-kcm-fix-race-condition-in-kcm_unattach.patch new file mode 100644 index 0000000000..6246393f90 --- /dev/null +++ b/queue-6.16/net-kcm-fix-race-condition-in-kcm_unattach.patch @@ -0,0 +1,88 @@ +From b6830a257156db66722ed8d61507ba2528f6c0a0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Aug 2025 21:18:03 +0200 +Subject: net: kcm: Fix race condition in kcm_unattach() + +From: Sven Stegemann + +[ Upstream commit 52565a935213cd6a8662ddb8efe5b4219343a25d ] + +syzbot found a race condition when kcm_unattach(psock) +and kcm_release(kcm) are executed at the same time. + +kcm_unattach() is missing a check of the flag +kcm->tx_stopped before calling queue_work(). + +If the kcm has a reserved psock, kcm_unattach() might get executed +between cancel_work_sync() and unreserve_psock() in kcm_release(), +requeuing kcm->tx_work right before kcm gets freed in kcm_done(). + +Remove kcm->tx_stopped and replace it by the less +error-prone disable_work_sync(). 
+ +Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") +Reported-by: syzbot+e62c9db591c30e174662@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=e62c9db591c30e174662 +Reported-by: syzbot+d199b52665b6c3069b94@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=d199b52665b6c3069b94 +Reported-by: syzbot+be6b1fdfeae512726b4e@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=be6b1fdfeae512726b4e +Signed-off-by: Sven Stegemann +Link: https://patch.msgid.link/20250812191810.27777-1-sven@stegemann.de +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + include/net/kcm.h | 1 - + net/kcm/kcmsock.c | 10 ++-------- + 2 files changed, 2 insertions(+), 9 deletions(-) + +diff --git a/include/net/kcm.h b/include/net/kcm.h +index 441e993be634..d9c35e71ecea 100644 +--- a/include/net/kcm.h ++++ b/include/net/kcm.h +@@ -71,7 +71,6 @@ struct kcm_sock { + struct list_head wait_psock_list; + struct sk_buff *seq_skb; + struct mutex tx_mutex; +- u32 tx_stopped : 1; + + /* Don't use bit fields here, these are set under different locks */ + bool tx_wait; +diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c +index c05047dad62d..d0a001ebabfe 100644 +--- a/net/kcm/kcmsock.c ++++ b/net/kcm/kcmsock.c +@@ -430,7 +430,7 @@ static void psock_write_space(struct sock *sk) + + /* Check if the socket is reserved so someone is waiting for sending. */ + kcm = psock->tx_kcm; +- if (kcm && !unlikely(kcm->tx_stopped)) ++ if (kcm) + queue_work(kcm_wq, &kcm->tx_work); + + spin_unlock_bh(&mux->lock); +@@ -1694,12 +1694,6 @@ static int kcm_release(struct socket *sock) + */ + __skb_queue_purge(&sk->sk_write_queue); + +- /* Set tx_stopped. This is checked when psock is bound to a kcm and we +- * get a writespace callback. This prevents further work being queued +- * from the callback (unbinding the psock occurs after canceling work. +- */ +- kcm->tx_stopped = 1; +- + release_sock(sk); + + spin_lock_bh(&mux->lock); +@@ -1715,7 +1709,7 @@ static int kcm_release(struct socket *sock) + /* Cancel work. After this point there should be no outside references + * to the kcm socket. + */ +- cancel_work_sync(&kcm->tx_work); ++ disable_work_sync(&kcm->tx_work); + + lock_sock(sk); + psock = kcm->tx_psock; +-- +2.50.1 + diff --git a/queue-6.16/net-lapbether-ignore-ops-locked-netdevs.patch b/queue-6.16/net-lapbether-ignore-ops-locked-netdevs.patch new file mode 100644 index 0000000000..0b2d2dfa16 --- /dev/null +++ b/queue-6.16/net-lapbether-ignore-ops-locked-netdevs.patch @@ -0,0 +1,64 @@ +From 3fdafaf1ab15d2a63c9237a845b9c1448d472fe2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 6 Aug 2025 14:37:25 -0700 +Subject: net: lapbether: ignore ops-locked netdevs + +From: Stanislav Fomichev + +[ Upstream commit 53898ebabe843bfa7baea9dae152797d5d0563c9 ] + +Syzkaller managed to trigger lock dependency in xsk_notify via +register_netdevice. As discussed in [0], using register_netdevice +in the notifiers is problematic so skip adding lapbeth for ops-locked +devices. 
+ + xsk_notifier+0xa4/0x280 net/xdp/xsk.c:1645 + notifier_call_chain+0xbc/0x410 kernel/notifier.c:85 + call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:2230 + call_netdevice_notifiers_extack net/core/dev.c:2268 [inline] + call_netdevice_notifiers net/core/dev.c:2282 [inline] + unregister_netdevice_many_notify+0xf9d/0x2700 net/core/dev.c:12077 + unregister_netdevice_many net/core/dev.c:12140 [inline] + unregister_netdevice_queue+0x305/0x3f0 net/core/dev.c:11984 + register_netdevice+0x18f1/0x2270 net/core/dev.c:11149 + lapbeth_new_device drivers/net/wan/lapbether.c:420 [inline] + lapbeth_device_event+0x5b1/0xbe0 drivers/net/wan/lapbether.c:462 + notifier_call_chain+0xbc/0x410 kernel/notifier.c:85 + call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:2230 + call_netdevice_notifiers_extack net/core/dev.c:2268 [inline] + call_netdevice_notifiers net/core/dev.c:2282 [inline] + __dev_notify_flags+0x12c/0x2e0 net/core/dev.c:9497 + netif_change_flags+0x108/0x160 net/core/dev.c:9526 + dev_change_flags+0xba/0x250 net/core/dev_api.c:68 + devinet_ioctl+0x11d5/0x1f50 net/ipv4/devinet.c:1200 + inet_ioctl+0x3a7/0x3f0 net/ipv4/af_inet.c:1001 + +0: https://lore.kernel.org/netdev/20250625140357.6203d0af@kernel.org/ +Fixes: 4c975fd70002 ("net: hold instance lock during NETDEV_REGISTER/UP") +Suggested-by: Jakub Kicinski +Reported-by: syzbot+e67ea9c235b13b4f0020@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=e67ea9c235b13b4f0020 +Signed-off-by: Stanislav Fomichev +Link: https://patch.msgid.link/20250806213726.1383379-1-sdf@fomichev.me +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/wan/lapbether.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c +index 995a7207bdf8..f357a7ac70ac 100644 +--- a/drivers/net/wan/lapbether.c ++++ b/drivers/net/wan/lapbether.c +@@ -81,7 +81,7 @@ static struct lapbethdev *lapbeth_get_x25_dev(struct net_device *dev) + + static __inline__ int dev_is_ethdev(struct net_device *dev) + { +- return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5); ++ return dev->type == ARPHRD_ETHER && !netdev_need_ops_lock(dev); + } + + /* ------------------------------------------------------------------------ */ +-- +2.50.1 + diff --git a/queue-6.16/net-mdiobus-release-reset_gpio-in-mdiobus_unregister.patch b/queue-6.16/net-mdiobus-release-reset_gpio-in-mdiobus_unregister.patch new file mode 100644 index 0000000000..b11abd13c6 --- /dev/null +++ b/queue-6.16/net-mdiobus-release-reset_gpio-in-mdiobus_unregister.patch @@ -0,0 +1,62 @@ +From 133eb2c7ce6134ab45bff3f7632a07aa5bf086a5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Aug 2025 15:54:49 +0200 +Subject: net: mdiobus: release reset_gpio in mdiobus_unregister_device() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Buday Csaba + +[ Upstream commit 8ea25274ebaf2f6be8be374633b2ed8348ec0e70 ] + +reset_gpio is claimed in mdiobus_register_device(), but it is not +released in mdiobus_unregister_device(). It is instead only +released when the whole MDIO bus is unregistered. +When a device uses the reset_gpio property, it becomes impossible +to unregister it and register it again, because the GPIO remains +claimed. +This patch resolves that issue. 
+ +Fixes: bafbdd527d56 ("phylib: Add device reset GPIO support") # see notes +Reviewed-by: Andrew Lunn +Cc: Csókás Bence +[ csokas.bence: Resolve rebase conflict and clarify msg ] +Signed-off-by: Buday Csaba +Link: https://patch.msgid.link/20250807135449.254254-2-csokas.bence@prolan.hu +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/phy/mdio_bus.c | 1 + + drivers/net/phy/mdio_bus_provider.c | 3 --- + 2 files changed, 1 insertion(+), 3 deletions(-) + +diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c +index fda2e27c1810..cad6ed3aa10b 100644 +--- a/drivers/net/phy/mdio_bus.c ++++ b/drivers/net/phy/mdio_bus.c +@@ -91,6 +91,7 @@ int mdiobus_unregister_device(struct mdio_device *mdiodev) + if (mdiodev->bus->mdio_map[mdiodev->addr] != mdiodev) + return -EINVAL; + ++ gpiod_put(mdiodev->reset_gpio); + reset_control_put(mdiodev->reset_ctrl); + + mdiodev->bus->mdio_map[mdiodev->addr] = NULL; +diff --git a/drivers/net/phy/mdio_bus_provider.c b/drivers/net/phy/mdio_bus_provider.c +index 65850e36284d..5401170f14e5 100644 +--- a/drivers/net/phy/mdio_bus_provider.c ++++ b/drivers/net/phy/mdio_bus_provider.c +@@ -444,9 +444,6 @@ void mdiobus_unregister(struct mii_bus *bus) + if (!mdiodev) + continue; + +- if (mdiodev->reset_gpio) +- gpiod_put(mdiodev->reset_gpio); +- + mdiodev->device_remove(mdiodev); + mdiodev->device_free(mdiodev); + } +-- +2.50.1 + diff --git a/queue-6.16/net-page_pool-allow-enabling-recycling-late-fix-fals.patch b/queue-6.16/net-page_pool-allow-enabling-recycling-late-fix-fals.patch new file mode 100644 index 0000000000..29d91c0b0f --- /dev/null +++ b/queue-6.16/net-page_pool-allow-enabling-recycling-late-fix-fals.patch @@ -0,0 +1,174 @@ +From a2759ceb0274ed9b8379560e6e516170a71b2101 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Aug 2025 17:36:54 -0700 +Subject: net: page_pool: allow enabling recycling late, fix false positive + warning + +From: Jakub Kicinski + +[ Upstream commit 64fdaa94bfe0cca3a0f4b2dd922486c5f59fe678 ] + +Page pool can have pages "directly" (locklessly) recycled to it, +if the NAPI that owns the page pool is scheduled to run on the same CPU. +To make this safe we check that the NAPI is disabled while we destroy +the page pool. In most cases NAPI and page pool lifetimes are tied +together so this happens naturally. + +The queue API expects the following order of calls: + -> mem_alloc + alloc new pp + -> stop + napi_disable + -> start + napi_enable + -> mem_free + free old pp + +Here we allocate the page pool in ->mem_alloc and free in ->mem_free. +But the NAPIs are only stopped between ->stop and ->start. We created +page_pool_disable_direct_recycling() to safely shut down the recycling +in ->stop. This way the page_pool_destroy() call in ->mem_free doesn't +have to worry about recycling any more. + +Unfortunately, the page_pool_disable_direct_recycling() is not enough +to deal with failures which necessitate freeing the _new_ page pool. +If we hit a failure in ->mem_alloc or ->stop the new page pool has +to be freed while the NAPI is active (assuming driver attaches the +page pool to an existing NAPI instance and doesn't reallocate NAPIs). + +Freeing the new page pool is technically safe because it hasn't been +used for any packets, yet, so there can be no recycling. But the check +in napi_assert_will_not_race() has no way of knowing that. We could +check if page pool is empty but that'd make the check much less likely +to trigger during development. 
+ +Add page_pool_enable_direct_recycling(), pairing with +page_pool_disable_direct_recycling(). It will allow us to create the new +page pools in "disabled" state and only enable recycling when we know +the reconfig operation will not fail. + +Coincidentally it will also let us re-enable the recycling for the old +pool, if the reconfig failed: + + -> mem_alloc (new) + -> stop (old) + # disables direct recycling for old + -> start (new) + # fail!! + -> start (old) + # go back to old pp but direct recycling is lost :( + -> mem_free (new) + +The new helper is idempotent to make the life easier for drivers, +which can operate in HDS mode and support zero-copy Rx. +The driver can call the helper twice whether there are two pools +or it has multiple references to a single pool. + +Fixes: 40eca00ae605 ("bnxt_en: unlink page pool when stopping Rx queue") +Tested-by: David Wei +Link: https://patch.msgid.link/20250805003654.2944974-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 ++++++- + include/net/page_pool/types.h | 2 ++ + net/core/page_pool.c | 29 +++++++++++++++++++++++ + 3 files changed, 39 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +index 243cb13cb01c..8d950b43846e 100644 +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +@@ -3810,7 +3810,6 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, + if (BNXT_RX_PAGE_MODE(bp)) + pp.pool_size += bp->rx_ring_size; + pp.nid = numa_node; +- pp.napi = &rxr->bnapi->napi; + pp.netdev = bp->dev; + pp.dev = &bp->pdev->dev; + pp.dma_dir = bp->rx_dir; +@@ -3842,6 +3841,12 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, + return PTR_ERR(pool); + } + ++static void bnxt_enable_rx_page_pool(struct bnxt_rx_ring_info *rxr) ++{ ++ page_pool_enable_direct_recycling(rxr->head_pool, &rxr->bnapi->napi); ++ page_pool_enable_direct_recycling(rxr->page_pool, &rxr->bnapi->napi); ++} ++ + static int bnxt_alloc_rx_agg_bmap(struct bnxt *bp, struct bnxt_rx_ring_info *rxr) + { + u16 mem_size; +@@ -3880,6 +3885,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp) + rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node); + if (rc) + return rc; ++ bnxt_enable_rx_page_pool(rxr); + + rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i, 0); + if (rc < 0) +@@ -16042,6 +16048,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx) + goto err_reset; + } + ++ bnxt_enable_rx_page_pool(rxr); + napi_enable_locked(&bnapi->napi); + bnxt_db_nq_arm(bp, &cpr->cp_db, cpr->cp_raw_cons); + +diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h +index 431b593de709..1509a536cb85 100644 +--- a/include/net/page_pool/types.h ++++ b/include/net/page_pool/types.h +@@ -265,6 +265,8 @@ struct page_pool *page_pool_create_percpu(const struct page_pool_params *params, + struct xdp_mem_info; + + #ifdef CONFIG_PAGE_POOL ++void page_pool_enable_direct_recycling(struct page_pool *pool, ++ struct napi_struct *napi); + void page_pool_disable_direct_recycling(struct page_pool *pool); + void page_pool_destroy(struct page_pool *pool); + void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), +diff --git a/net/core/page_pool.c b/net/core/page_pool.c +index ba7cf3e3c32f..368412baad26 100644 +--- a/net/core/page_pool.c ++++ b/net/core/page_pool.c +@@ -1201,6 +1201,35 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void 
*), + pool->xdp_mem_id = mem->id; + } + ++/** ++ * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI ++ * @pool: page pool to modify ++ * @napi: NAPI instance to associate the page pool with ++ * ++ * Associate a page pool with a NAPI instance for lockless page recycling. ++ * This is useful when a new page pool has to be added to a NAPI instance ++ * without disabling that NAPI instance, to mark the point at which control ++ * path "hands over" the page pool to the NAPI instance. In most cases driver ++ * can simply set the @napi field in struct page_pool_params, and does not ++ * have to call this helper. ++ * ++ * The function is idempotent, but does not implement any refcounting. ++ * Single page_pool_disable_direct_recycling() will disable recycling, ++ * no matter how many times enable was called. ++ */ ++void page_pool_enable_direct_recycling(struct page_pool *pool, ++ struct napi_struct *napi) ++{ ++ if (READ_ONCE(pool->p.napi) == napi) ++ return; ++ WARN_ON(!napi || pool->p.napi); ++ ++ mutex_lock(&page_pools_lock); ++ WRITE_ONCE(pool->p.napi, napi); ++ mutex_unlock(&page_pools_lock); ++} ++EXPORT_SYMBOL(page_pool_enable_direct_recycling); ++ + void page_pool_disable_direct_recycling(struct page_pool *pool) + { + /* Disable direct recycling based on pool->cpuid. +-- +2.50.1 + diff --git a/queue-6.16/net-phy-nxp-c45-tja11xx-fix-the-phy-id-mismatch-issu.patch b/queue-6.16/net-phy-nxp-c45-tja11xx-fix-the-phy-id-mismatch-issu.patch new file mode 100644 index 0000000000..ec00381c56 --- /dev/null +++ b/queue-6.16/net-phy-nxp-c45-tja11xx-fix-the-phy-id-mismatch-issu.patch @@ -0,0 +1,75 @@ +From 03536af7c40cc0517826bb09f63fd02513d39540 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Aug 2025 12:08:32 +0800 +Subject: net: phy: nxp-c45-tja11xx: fix the PHY ID mismatch issue when using + C45 + +From: Clark Wang + +[ Upstream commit 8ee90742cf29427683294a6a80f1e2b7f4af1cff ] + +TJA1103/04/20/21 support both C22 and C45 accessing methods. + +The TJA11xx driver has implemented the match_phy_device() API. +However, it does not handle the C45 ID. If C45 was used to access +TJA11xx, match_phy_device() would always return false due to +phydev->phy_id only used by C22 being empty, resulting in the +generic phy driver being used for TJA11xx PHYs. + +Therefore, check phydev->c45_ids.device_ids[MDIO_MMD_PMAPMD] when +using C45. + +Fixes: 1b76b2497aba ("net: phy: nxp-c45-tja11xx: simplify .match_phy_device OP") +Signed-off-by: Clark Wang +Link: https://patch.msgid.link/20250807040832.2455306-1-xiaoning.wang@nxp.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/phy/nxp-c45-tja11xx.c | 23 +++++++++++++---------- + 1 file changed, 13 insertions(+), 10 deletions(-) + +diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c +index 4c6d905f0a9f..87adb6508017 100644 +--- a/drivers/net/phy/nxp-c45-tja11xx.c ++++ b/drivers/net/phy/nxp-c45-tja11xx.c +@@ -1965,24 +1965,27 @@ static int nxp_c45_macsec_ability(struct phy_device *phydev) + return macsec_ability; + } + ++static bool tja11xx_phy_id_compare(struct phy_device *phydev, ++ const struct phy_driver *phydrv) ++{ ++ u32 id = phydev->is_c45 ? 
phydev->c45_ids.device_ids[MDIO_MMD_PMAPMD] : ++ phydev->phy_id; ++ ++ return phy_id_compare(id, phydrv->phy_id, phydrv->phy_id_mask); ++} ++ + static int tja11xx_no_macsec_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) + { +- if (!phy_id_compare(phydev->phy_id, phydrv->phy_id, +- phydrv->phy_id_mask)) +- return 0; +- +- return !nxp_c45_macsec_ability(phydev); ++ return tja11xx_phy_id_compare(phydev, phydrv) && ++ !nxp_c45_macsec_ability(phydev); + } + + static int tja11xx_macsec_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) + { +- if (!phy_id_compare(phydev->phy_id, phydrv->phy_id, +- phydrv->phy_id_mask)) +- return 0; +- +- return nxp_c45_macsec_ability(phydev); ++ return tja11xx_phy_id_compare(phydev, phydrv) && ++ nxp_c45_macsec_ability(phydev); + } + + static const struct nxp_c45_regmap tja1120_regmap = { +-- +2.50.1 + diff --git a/queue-6.16/net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch b/queue-6.16/net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch new file mode 100644 index 0000000000..3fbac86a5d --- /dev/null +++ b/queue-6.16/net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch @@ -0,0 +1,69 @@ +From c53fdb12017debb6f4da4eac9248b6aa4ecc5bb5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 8 Aug 2025 09:36:55 +0000 +Subject: net: stmmac: thead: Get and enable APB clock on initialization + +From: Yao Zi + +[ Upstream commit 4cc339ce482ba78589a2d5cbe1c84b735d263383 ] + +It's necessary to adjust the MAC TX clock when the linkspeed changes, +but it's noted such adjustment always fails on TH1520 SoC, and reading +back from APB glue registers that control clock generation results in +garbage, causing broken link. + +With some testing, it's found a clock must be ungated for access to APB +glue registers. Without any consumer, the clock is automatically +disabled during late kernel startup. Let's get and enable it if it's +described in devicetree. + +For backward compatibility with older devicetrees, probing won't fail if +the APB clock isn't found. In this case, we emit a warning since the +link will break if the speed changes. + +Fixes: 33a1a01e3afa ("net: stmmac: Add glue layer for T-HEAD TH1520 SoC") +Signed-off-by: Yao Zi +Tested-by: Drew Fustini +Reviewed-by: Drew Fustini +Link: https://patch.msgid.link/20250808093655.48074-4-ziyao@disroot.org +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c +index c72ee759aae5..f2946bea0bc2 100644 +--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c ++++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c +@@ -211,6 +211,7 @@ static int thead_dwmac_probe(struct platform_device *pdev) + struct stmmac_resources stmmac_res; + struct plat_stmmacenet_data *plat; + struct thead_dwmac *dwmac; ++ struct clk *apb_clk; + void __iomem *apb; + int ret; + +@@ -224,6 +225,19 @@ static int thead_dwmac_probe(struct platform_device *pdev) + return dev_err_probe(&pdev->dev, PTR_ERR(plat), + "dt configuration failed\n"); + ++ /* ++ * The APB clock is essential for accessing glue registers. However, ++ * old devicetrees don't describe it correctly. We continue to probe ++ * and emit a warning if it isn't present. 
++ */ ++ apb_clk = devm_clk_get_enabled(&pdev->dev, "apb"); ++ if (PTR_ERR(apb_clk) == -ENOENT) ++ dev_warn(&pdev->dev, ++ "cannot get apb clock, link may break after speed changes\n"); ++ else if (IS_ERR(apb_clk)) ++ return dev_err_probe(&pdev->dev, PTR_ERR(apb_clk), ++ "failed to get apb clock\n"); ++ + dwmac = devm_kzalloc(&pdev->dev, sizeof(*dwmac), GFP_KERNEL); + if (!dwmac) + return -ENOMEM; +-- +2.50.1 + diff --git a/queue-6.16/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch b/queue-6.16/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch new file mode 100644 index 0000000000..e264d95d17 --- /dev/null +++ b/queue-6.16/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch @@ -0,0 +1,44 @@ +From 10472084e5247a182caea03db3b210864d598363 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 5 Aug 2025 07:23:18 -0700 +Subject: net: ti: icss-iep: Fix incorrect type for return value in + extts_enable() + +From: Alok Tiwari + +[ Upstream commit 5f1d1d14db7dabce9c815e7d7cd351f8d58b8585 ] + +The variable ret in icss_iep_extts_enable() was incorrectly declared +as u32, while the function returns int and may return negative error +codes. This will cause sign extension issues and incorrect error +propagation. Update ret to be int to fix error handling. + +This change corrects the declaration to avoid potential type mismatch. + +Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver") +Signed-off-by: Alok Tiwari +Reviewed-by: Andrew Lunn +Link: https://patch.msgid.link/20250805142323.1949406-1-alok.a.tiwari@oracle.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/ti/icssg/icss_iep.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c +index 50bfbc2779e4..d8c9fe1d98c4 100644 +--- a/drivers/net/ethernet/ti/icssg/icss_iep.c ++++ b/drivers/net/ethernet/ti/icssg/icss_iep.c +@@ -621,7 +621,8 @@ static int icss_iep_pps_enable(struct icss_iep *iep, int on) + + static int icss_iep_extts_enable(struct icss_iep *iep, u32 index, int on) + { +- u32 val, cap, ret = 0; ++ u32 val, cap; ++ int ret = 0; + + mutex_lock(&iep->ptp_clk_mutex); + +-- +2.50.1 + diff --git a/queue-6.16/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch b/queue-6.16/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch new file mode 100644 index 0000000000..9ef95521f9 --- /dev/null +++ b/queue-6.16/net-ti-icssg-prueth-fix-emac-link-speed-handling.patch @@ -0,0 +1,56 @@ +From be96c5c83b7e3f06515946d993a3bbb28a66ad22 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 5 Aug 2025 23:08:12 +0530 +Subject: net: ti: icssg-prueth: Fix emac link speed handling + +From: MD Danish Anwar + +[ Upstream commit 06feac15406f4f66f4c0c6ea60b10d44775d4133 ] + +When link settings are changed emac->speed is populated by +emac_adjust_link(). The link speed and other settings are then written into +the DRAM. However if both ports are brought down after this and brought up +again or if the operating mode is changed and a firmware reload is needed, +the DRAM is cleared by icssg_config(). As a result the link settings are +lost. + +Fix this by calling emac_adjust_link() after icssg_config(). This re +populates the settings in the DRAM after a new firmware load. 
+ +Fixes: 9facce84f406 ("net: ti: icssg-prueth: Fix firmware load sequence.") +Signed-off-by: MD Danish Anwar +Reviewed-by: Andrew Lunn +Message-ID: <20250805173812.2183161-1-danishanwar@ti.com> +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/ti/icssg/icssg_prueth.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c +index 2f5c4335dec3..008d77727400 100644 +--- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c ++++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c +@@ -50,6 +50,8 @@ + /* CTRLMMR_ICSSG_RGMII_CTRL register bits */ + #define ICSSG_CTRL_RGMII_ID_MODE BIT(24) + ++static void emac_adjust_link(struct net_device *ndev); ++ + static int emac_get_tx_ts(struct prueth_emac *emac, + struct emac_tx_ts_response *rsp) + { +@@ -266,6 +268,10 @@ static int prueth_emac_common_start(struct prueth *prueth) + ret = icssg_config(prueth, emac, slice); + if (ret) + goto disable_class; ++ ++ mutex_lock(&emac->ndev->phydev->lock); ++ emac_adjust_link(emac->ndev); ++ mutex_unlock(&emac->ndev->phydev->lock); + } + + ret = prueth_emac_start(prueth); +-- +2.50.1 + diff --git a/queue-6.16/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-6.16/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch new file mode 100644 index 0000000000..0b6a44ba97 --- /dev/null +++ b/queue-6.16/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch @@ -0,0 +1,129 @@ +From 2a6a033f9923cd580fdd3ad1f09e4155d839f3ea Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 1 Aug 2025 17:25:08 +0200 +Subject: netfilter: ctnetlink: fix refcount leak on table dump + +From: Florian Westphal + +[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ] + +There is a reference count leak in ctnetlink_dump_table(): + if (res < 0) { + nf_conntrack_get(&ct->ct_general); // HERE + cb->args[1] = (unsigned long)ct; + ... + +While its very unlikely, its possible that ct == last. +If this happens, then the refcount of ct was already incremented. +This 2nd increment is never undone. + +This prevents the conntrack object from being released, which in turn +keeps prevents cnet->count from dropping back to 0. + +This will then block the netns dismantle (or conntrack rmmod) as +nf_conntrack_cleanup_net_list() will wait forever. + +This can be reproduced by running conntrack_resize.sh selftest in a loop. +It takes ~20 minutes for me on a preemptible kernel on average before +I see a runaway kworker spinning in nf_conntrack_cleanup_net_list. + +One fix would to change this to: + if (res < 0) { + if (ct != last) + nf_conntrack_get(&ct->ct_general); + +But this reference counting isn't needed in the first place. +We can just store a cookie value instead. + +A followup patch will do the same for ctnetlink_exp_dump_table, +it looks to me as if this has the same problem and like +ctnetlink_dump_table, we only need a 'skip hint', not the actual +object so we can apply the same cookie strategy there as well. 
+ +Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c +index 2cc0fde23344..5fdcae45e0bc 100644 +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -884,8 +884,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) + + static int ctnetlink_done(struct netlink_callback *cb) + { +- if (cb->args[1]) +- nf_ct_put((struct nf_conn *)cb->args[1]); + kfree(cb->data); + return 0; + } +@@ -1208,19 +1206,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) + return 0; + } + ++static unsigned long ctnetlink_get_id(const struct nf_conn *ct) ++{ ++ unsigned long id = nf_ct_get_id(ct); ++ ++ return id ? id : 1; ++} ++ + static int + ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + { + unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0; + struct net *net = sock_net(skb->sk); +- struct nf_conn *ct, *last; ++ unsigned long last_id = cb->args[1]; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nf_conn *nf_ct_evict[8]; ++ struct nf_conn *ct; + int res, i; + spinlock_t *lockp; + +- last = (struct nf_conn *)cb->args[1]; + i = 0; + + local_bh_disable(); +@@ -1257,7 +1262,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + continue; + + if (cb->args[1]) { +- if (ct != last) ++ if (ctnetlink_get_id(ct) != last_id) + continue; + cb->args[1] = 0; + } +@@ -1270,8 +1275,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct, true, flags); + if (res < 0) { +- nf_conntrack_get(&ct->ct_general); +- cb->args[1] = (unsigned long)ct; ++ cb->args[1] = ctnetlink_get_id(ct); + spin_unlock(lockp); + goto out; + } +@@ -1284,12 +1288,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + } + out: + local_bh_enable(); +- if (last) { ++ if (last_id) { + /* nf ct hash resize happened, now clear the leftover. */ +- if ((struct nf_conn *)cb->args[1] == last) ++ if (cb->args[1] == last_id) + cb->args[1] = 0; +- +- nf_ct_put(last); + } + + while (i) { +-- +2.50.1 + diff --git a/queue-6.16/netfilter-ctnetlink-remove-refcounting-in-expectatio.patch b/queue-6.16/netfilter-ctnetlink-remove-refcounting-in-expectatio.patch new file mode 100644 index 0000000000..fff54e3444 --- /dev/null +++ b/queue-6.16/netfilter-ctnetlink-remove-refcounting-in-expectatio.patch @@ -0,0 +1,164 @@ +From b5625a8250316d9830487dc5d90c7f8bd77a3a8e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 1 Aug 2025 17:25:09 +0200 +Subject: netfilter: ctnetlink: remove refcounting in expectation dumpers + +From: Florian Westphal + +[ Upstream commit 1492e3dcb2be3aa46d1963da96aa9593e4e4db5a ] + +Same pattern as previous patch: do not keep the expectation object +alive via refcount, only store a cookie value and then use that +as the skip hint for dump resumption. + +AFAICS this has the same issue as the one resolved in the conntrack +dumper, when we do + if (!refcount_inc_not_zero(&exp->use)) + +to increment the refcount, there is a chance that exp == last, which +causes a double-increment of the refcount and subsequent memory leak. 
+ +Fixes: cf6994c2b981 ("[NETFILTER]: nf_conntrack_netlink: sync expectation dumping with conntrack table dumping") +Fixes: e844a928431f ("netfilter: ctnetlink: allow to dump expectation per master conntrack") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_conntrack_netlink.c | 41 ++++++++++++---------------- + 1 file changed, 17 insertions(+), 24 deletions(-) + +diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c +index 5fdcae45e0bc..2273ead8102f 100644 +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -3171,23 +3171,27 @@ ctnetlink_expect_event(unsigned int events, const struct nf_exp_event *item) + return 0; + } + #endif +-static int ctnetlink_exp_done(struct netlink_callback *cb) ++ ++static unsigned long ctnetlink_exp_id(const struct nf_conntrack_expect *exp) + { +- if (cb->args[1]) +- nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]); +- return 0; ++ unsigned long id = (unsigned long)exp; ++ ++ id += nf_ct_get_id(exp->master); ++ id += exp->class; ++ ++ return id ? id : 1; + } + + static int + ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + { + struct net *net = sock_net(skb->sk); +- struct nf_conntrack_expect *exp, *last; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + u_int8_t l3proto = nfmsg->nfgen_family; ++ unsigned long last_id = cb->args[1]; ++ struct nf_conntrack_expect *exp; + + rcu_read_lock(); +- last = (struct nf_conntrack_expect *)cb->args[1]; + for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { + restart: + hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]], +@@ -3199,7 +3203,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + continue; + + if (cb->args[1]) { +- if (exp != last) ++ if (ctnetlink_exp_id(exp) != last_id) + continue; + cb->args[1] = 0; + } +@@ -3208,9 +3212,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + cb->nlh->nlmsg_seq, + IPCTNL_MSG_EXP_NEW, + exp) < 0) { +- if (!refcount_inc_not_zero(&exp->use)) +- continue; +- cb->args[1] = (unsigned long)exp; ++ cb->args[1] = ctnetlink_exp_id(exp); + goto out; + } + } +@@ -3221,32 +3223,30 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + } + out: + rcu_read_unlock(); +- if (last) +- nf_ct_expect_put(last); +- + return skb->len; + } + + static int + ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + { +- struct nf_conntrack_expect *exp, *last; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nf_conn *ct = cb->data; + struct nf_conn_help *help = nfct_help(ct); + u_int8_t l3proto = nfmsg->nfgen_family; ++ unsigned long last_id = cb->args[1]; ++ struct nf_conntrack_expect *exp; + + if (cb->args[0]) + return 0; + + rcu_read_lock(); +- last = (struct nf_conntrack_expect *)cb->args[1]; ++ + restart: + hlist_for_each_entry_rcu(exp, &help->expectations, lnode) { + if (l3proto && exp->tuple.src.l3num != l3proto) + continue; + if (cb->args[1]) { +- if (exp != last) ++ if (ctnetlink_exp_id(exp) != last_id) + continue; + cb->args[1] = 0; + } +@@ -3254,9 +3254,7 @@ ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + cb->nlh->nlmsg_seq, + IPCTNL_MSG_EXP_NEW, + exp) < 0) { +- if (!refcount_inc_not_zero(&exp->use)) +- continue; +- cb->args[1] = (unsigned long)exp; ++ cb->args[1] = ctnetlink_exp_id(exp); + goto out; + } + } +@@ -3267,9 +3265,6 @@ 
ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + cb->args[0] = 1; + out: + rcu_read_unlock(); +- if (last) +- nf_ct_expect_put(last); +- + return skb->len; + } + +@@ -3288,7 +3283,6 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, + struct nf_conntrack_zone zone; + struct netlink_dump_control c = { + .dump = ctnetlink_exp_ct_dump_table, +- .done = ctnetlink_exp_done, + }; + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, +@@ -3338,7 +3332,6 @@ static int ctnetlink_get_expect(struct sk_buff *skb, + else { + struct netlink_dump_control c = { + .dump = ctnetlink_exp_dump_table, +- .done = ctnetlink_exp_done, + }; + return netlink_dump_start(info->sk, skb, info->nlh, &c); + } +-- +2.50.1 + diff --git a/queue-6.16/netfilter-nf_tables-reject-duplicate-device-on-updat.patch b/queue-6.16/netfilter-nf_tables-reject-duplicate-device-on-updat.patch new file mode 100644 index 0000000000..41df7dae02 --- /dev/null +++ b/queue-6.16/netfilter-nf_tables-reject-duplicate-device-on-updat.patch @@ -0,0 +1,98 @@ +From 41a492b44d333b69f99787abd80e483c77ed08f0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 13 Aug 2025 02:38:50 +0200 +Subject: netfilter: nf_tables: reject duplicate device on updates + +From: Pablo Neira Ayuso + +[ Upstream commit cf5fb87fcdaaaafec55dcc0dc5a9e15ead343973 ] + +A chain/flowtable update with duplicated devices in the same batch is +possible. Unfortunately, netdev event path only removes the first +device that is found, leaving unregistered the hook of the duplicated +device. + +Check if a duplicated device exists in the transaction batch, bail out +with EEXIST in such case. + +WARNING is hit when unregistering the hook: + + [49042.221275] WARNING: CPU: 4 PID: 8425 at net/netfilter/core.c:340 nf_hook_entry_head+0xaa/0x150 + [49042.221375] CPU: 4 UID: 0 PID: 8425 Comm: nft Tainted: G S 6.16.0+ #170 PREEMPT(full) + [...] 
+ [49042.221382] RIP: 0010:nf_hook_entry_head+0xaa/0x150 + +Fixes: 78d9f48f7f44 ("netfilter: nf_tables: add devices to existing flowtable") +Fixes: b9703ed44ffb ("netfilter: nf_tables: support for adding new devices to an existing netdev chain") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Florian Westphal +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_tables_api.c | 30 ++++++++++++++++++++++++++++++ + 1 file changed, 30 insertions(+) + +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 064f18792d98..46ca725d6538 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -2790,6 +2790,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, + struct nft_chain *chain = ctx->chain; + struct nft_chain_hook hook = {}; + struct nft_stats __percpu *stats = NULL; ++ struct nftables_pernet *nft_net; + struct nft_hook *h, *next; + struct nf_hook_ops *ops; + struct nft_trans *trans; +@@ -2832,6 +2833,20 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, + if (nft_hook_list_find(&basechain->hook_list, h)) { + list_del(&h->list); + nft_netdev_hook_free(h); ++ continue; ++ } ++ ++ nft_net = nft_pernet(ctx->net); ++ list_for_each_entry(trans, &nft_net->commit_list, list) { ++ if (trans->msg_type != NFT_MSG_NEWCHAIN || ++ trans->table != ctx->table || ++ !nft_trans_chain_update(trans)) ++ continue; ++ ++ if (nft_hook_list_find(&nft_trans_chain_hooks(trans), h)) { ++ nft_chain_release_hook(&hook); ++ return -EEXIST; ++ } + } + } + } else { +@@ -9033,6 +9048,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, + { + const struct nlattr * const *nla = ctx->nla; + struct nft_flowtable_hook flowtable_hook; ++ struct nftables_pernet *nft_net; + struct nft_hook *hook, *next; + struct nf_hook_ops *ops; + struct nft_trans *trans; +@@ -9049,6 +9065,20 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, + if (nft_hook_list_find(&flowtable->hook_list, hook)) { + list_del(&hook->list); + nft_netdev_hook_free(hook); ++ continue; ++ } ++ ++ nft_net = nft_pernet(ctx->net); ++ list_for_each_entry(trans, &nft_net->commit_list, list) { ++ if (trans->msg_type != NFT_MSG_NEWFLOWTABLE || ++ trans->table != ctx->table || ++ !nft_trans_flowtable_update(trans)) ++ continue; ++ ++ if (nft_hook_list_find(&nft_trans_flowtable_hooks(trans), hook)) { ++ err = -EEXIST; ++ goto err_flowtable_update_hook; ++ } + } + } + +-- +2.50.1 + diff --git a/queue-6.16/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch b/queue-6.16/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch new file mode 100644 index 0000000000..fe92f1ffce --- /dev/null +++ b/queue-6.16/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch @@ -0,0 +1,103 @@ +From 6618a73f0c48d3ab0731910efbb5972d5afde30f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 28 Jul 2025 15:26:49 +0900 +Subject: ptp: prevent possible ABBA deadlock in ptp_clock_freerun() + +From: Jeongjun Park + +[ Upstream commit 2efe41234dbd0a83fdb7cd38226c2f70039a2cd3 ] + +syzbot reported the following ABBA deadlock: + + CPU0 CPU1 + ---- ---- + n_vclocks_store() + lock(&ptp->n_vclocks_mux) [1] + (physical clock) + pc_clock_adjtime() + lock(&clk->rwsem) [2] + (physical clock) + ... 
+ ptp_clock_freerun() + ptp_vclock_in_use() + lock(&ptp->n_vclocks_mux) [3] + (physical clock) + ptp_clock_unregister() + posix_clock_unregister() + lock(&clk->rwsem) [4] + (virtual clock) + +Since ptp virtual clock is registered only under ptp physical clock, both +ptp_clock and posix_clock must be physical clocks for ptp_vclock_in_use() +to lock &ptp->n_vclocks_mux and check ptp->n_vclocks. + +However, when unregistering vclocks in n_vclocks_store(), the locking +ptp->n_vclocks_mux is a physical clock lock, but clk->rwsem of +ptp_clock_unregister() called through device_for_each_child_reverse() +is a virtual clock lock. + +Therefore, clk->rwsem used in CPU0 and clk->rwsem used in CPU1 are +different locks, but in lockdep, a false positive occurs because the +possibility of deadlock is determined through lock-class. + +To solve this, lock subclass annotation must be added to the posix_clock +rwsem of the vclock. + +Reported-by: syzbot+7cfb66a237c4a5fb22ad@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=7cfb66a237c4a5fb22ad +Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion") +Signed-off-by: Jeongjun Park +Acked-by: Richard Cochran +Reviewed-by: Vladimir Oltean +Link: https://patch.msgid.link/20250728062649.469882-1-aha310510@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/ptp/ptp_private.h | 5 +++++ + drivers/ptp/ptp_vclock.c | 7 +++++++ + 2 files changed, 12 insertions(+) + +diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h +index a6aad743c282..b352df4cd3f9 100644 +--- a/drivers/ptp/ptp_private.h ++++ b/drivers/ptp/ptp_private.h +@@ -24,6 +24,11 @@ + #define PTP_DEFAULT_MAX_VCLOCKS 20 + #define PTP_MAX_CHANNELS 2048 + ++enum { ++ PTP_LOCK_PHYSICAL = 0, ++ PTP_LOCK_VIRTUAL, ++}; ++ + struct timestamp_event_queue { + struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS]; + int head; +diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c +index 7febfdcbde8b..8ed4b8598924 100644 +--- a/drivers/ptp/ptp_vclock.c ++++ b/drivers/ptp/ptp_vclock.c +@@ -154,6 +154,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp) + return PTP_VCLOCK_REFRESH_INTERVAL; + } + ++static void ptp_vclock_set_subclass(struct ptp_clock *ptp) ++{ ++ lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL); ++} ++ + static const struct ptp_clock_info ptp_vclock_info = { + .owner = THIS_MODULE, + .name = "ptp virtual clock", +@@ -213,6 +218,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock) + return NULL; + } + ++ ptp_vclock_set_subclass(vclock->clock); ++ + timecounter_init(&vclock->tc, &vclock->cc, 0); + ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL); + +-- +2.50.1 + diff --git a/queue-6.16/riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch b/queue-6.16/riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch new file mode 100644 index 0000000000..e5332e4e3e --- /dev/null +++ b/queue-6.16/riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch @@ -0,0 +1,54 @@ +From 85ec46b8224074065b1bc397b3fb3ccff51c5ca8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 8 Aug 2025 09:36:56 +0000 +Subject: riscv: dts: thead: Add APB clocks for TH1520 GMACs + +From: Yao Zi + +[ Upstream commit a7f75e2883c4bd57b12c3be61bb926929adad9c0 ] + +Describe perisys-apb4-hclk as the APB clock for TH1520 SoC, which is +essential for accessing GMAC glue registers. 
+ +Fixes: 7e756671a664 ("riscv: dts: thead: Add TH1520 ethernet nodes") +Signed-off-by: Yao Zi +Reviewed-by: Drew Fustini +Tested-by: Drew Fustini +Link: https://patch.msgid.link/20250808093655.48074-5-ziyao@disroot.org +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + arch/riscv/boot/dts/thead/th1520.dtsi | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/arch/riscv/boot/dts/thead/th1520.dtsi b/arch/riscv/boot/dts/thead/th1520.dtsi +index 1db0054c4e09..93135e0f5a77 100644 +--- a/arch/riscv/boot/dts/thead/th1520.dtsi ++++ b/arch/riscv/boot/dts/thead/th1520.dtsi +@@ -294,8 +294,9 @@ gmac1: ethernet@ffe7060000 { + reg-names = "dwmac", "apb"; + interrupts = <67 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq"; +- clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC1>; +- clock-names = "stmmaceth", "pclk"; ++ clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC1>, ++ <&clk CLK_PERISYS_APB4_HCLK>; ++ clock-names = "stmmaceth", "pclk", "apb"; + snps,pbl = <32>; + snps,fixed-burst; + snps,multicast-filter-bins = <64>; +@@ -316,8 +317,9 @@ gmac0: ethernet@ffe7070000 { + reg-names = "dwmac", "apb"; + interrupts = <66 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq"; +- clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC0>; +- clock-names = "stmmaceth", "pclk"; ++ clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC0>, ++ <&clk CLK_PERISYS_APB4_HCLK>; ++ clock-names = "stmmaceth", "pclk", "apb"; + snps,pbl = <32>; + snps,fixed-burst; + snps,multicast-filter-bins = <64>; +-- +2.50.1 + diff --git a/queue-6.16/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch b/queue-6.16/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch new file mode 100644 index 0000000000..97ef3efaeb --- /dev/null +++ b/queue-6.16/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch @@ -0,0 +1,73 @@ +From 3e9fc7914a86dfe7937c516e67ff01c5544bec23 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Aug 2025 15:40:11 -0400 +Subject: sctp: linearize cloned gso packets in sctp_rcv + +From: Xin Long + +[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ] + +A cloned head skb still shares these frag skbs in fraglist with the +original head skb. It's not safe to access these frag skbs. 
+ +syzbot reported two use-of-uninitialized-memory bugs caused by this: + + BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 + sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 + sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998 + sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88 + sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331 + sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122 + __release_sock+0x1da/0x330 net/core/sock.c:3106 + release_sock+0x6b/0x250 net/core/sock.c:3660 + sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360 + sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885 + sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031 + inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851 + sock_sendmsg_nosec net/socket.c:718 [inline] + +and + + BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 + sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 + sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88 + sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331 + sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148 + __release_sock+0x1d3/0x330 net/core/sock.c:3213 + release_sock+0x6b/0x270 net/core/sock.c:3767 + sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367 + sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886 + sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032 + inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851 + sock_sendmsg_nosec net/socket.c:712 [inline] + +This patch fixes it by linearizing cloned gso packets in sctp_rcv(). + +Fixes: 90017accff61 ("sctp: Add GSO support") +Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com +Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com +Signed-off-by: Xin Long +Reviewed-by: Marcelo Ricardo Leitner +Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/sctp/input.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/sctp/input.c b/net/sctp/input.c +index 0c0d2757f6f8..6fcdcaeed40e 100644 +--- a/net/sctp/input.c ++++ b/net/sctp/input.c +@@ -117,7 +117,7 @@ int sctp_rcv(struct sk_buff *skb) + * it's better to just linearize it otherwise crc computing + * takes longer. 
+ */ +- if ((!is_gso && skb_linearize(skb)) || ++ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) || + !pskb_may_pull(skb, sizeof(struct sctphdr))) + goto discard_it; + +-- +2.50.1 + diff --git a/queue-6.16/series b/queue-6.16/series index 8a30aa84eb..1c26cd75b2 100644 --- a/queue-6.16/series +++ b/queue-6.16/series @@ -54,3 +54,33 @@ acpi-processor-perflib-move-problematic-pr-performance-check.patch block-make-req_op_zone_finish-a-write-operation.patch mm-memory-tier-fix-abstract-distance-calculation-overflow.patch mfd-cros_ec-separate-charge-control-probing-from-usb-pd.patch +habanalabs-fix-uaf-in-export_dmabuf.patch +mm-smaps-fix-race-between-smaps_hugetlb_range-and-mi.patch +xfrm-flush-all-states-in-xfrm_state_fini.patch +xfrm-restore-gso-for-sw-crypto.patch +xfrm-bring-back-device-check-in-validate_xmit_xfrm.patch +udp-also-consider-secpath-when-evaluating-ipsec-use-.patch +netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch +netfilter-ctnetlink-remove-refcounting-in-expectatio.patch +net-hibmcge-fix-rtnl-deadlock-issue.patch +net-hibmcge-fix-the-division-by-zero-issue.patch +net-hibmcge-fix-the-np_link_fail-error-reporting-iss.patch +net-ti-icssg-prueth-fix-emac-link-speed-handling.patch +net-page_pool-allow-enabling-recycling-late-fix-fals.patch +net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch +sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch +net-lapbether-ignore-ops-locked-netdevs.patch +hamradio-ignore-ops-locked-netdevs.patch +erofs-fix-block-count-report-when-48-bit-layout-is-o.patch +intel_idle-allow-loading-acpi-tables-for-any-family.patch +cpuidle-governors-menu-avoid-using-invalid-recent-in.patch +net-phy-nxp-c45-tja11xx-fix-the-phy-id-mismatch-issu.patch +net-mdiobus-release-reset_gpio-in-mdiobus_unregister.patch +net-stmmac-thead-get-and-enable-apb-clock-on-initial.patch +riscv-dts-thead-add-apb-clocks-for-th1520-gmacs.patch +ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch +tls-handle-data-disappearing-from-under-the-tls-ulp.patch +ipvs-fix-estimator-kthreads-preferred-affinity.patch +netfilter-nf_tables-reject-duplicate-device-on-updat.patch +bnxt-fill-data-page-pool-with-frags-if-page_size-bnx.patch +net-kcm-fix-race-condition-in-kcm_unattach.patch diff --git a/queue-6.16/tls-handle-data-disappearing-from-under-the-tls-ulp.patch b/queue-6.16/tls-handle-data-disappearing-from-under-the-tls-ulp.patch new file mode 100644 index 0000000000..42dc41a26a --- /dev/null +++ b/queue-6.16/tls-handle-data-disappearing-from-under-the-tls-ulp.patch @@ -0,0 +1,106 @@ +From 370e01dc4b6dee8ec7f8338bd058e954edaed079 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Aug 2025 16:29:06 -0700 +Subject: tls: handle data disappearing from under the TLS ULP + +From: Jakub Kicinski + +[ Upstream commit 6db015fc4b5d5f63a64a193f65d98da3a7fc811d ] + +TLS expects that it owns the receive queue of the TCP socket. +This cannot be guaranteed in case the reader of the TCP socket +entered before the TLS ULP was installed, or uses some non-standard +read API (eg. zerocopy ones). Replace the WARN_ON() and a buggy +early exit (which leaves anchor pointing to a freed skb) with real +error handling. Wipe the parsing state and tell the reader to retry. + +We already reload the anchor every time we (re)acquire the socket lock, +so the only condition we need to avoid is an out of bounds read +(not having enough bytes in the socket for previously parsed record len). 
+ +If some data was read from under TLS but there's enough in the queue +we'll reload and decrypt what is most likely not a valid TLS record. +Leading to some undefined behavior from TLS perspective (corrupting +a stream? missing an alert? missing an attack?) but no kernel crash +should take place. + +Reported-by: William Liu +Reported-by: Savino Dicanosa +Link: https://lore.kernel.org/tFjq_kf7sWIG3A7CrCg_egb8CVsT_gsmHAK0_wxDPJXfIzxFAMxqmLwp3MlU5EHiet0AwwJldaaFdgyHpeIUCS-3m3llsmRzp9xIOBR4lAI=@syst3mfailure.io +Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") +Reviewed-by: Eric Dumazet +Link: https://patch.msgid.link/20250807232907.600366-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/tls/tls.h | 2 +- + net/tls/tls_strp.c | 11 ++++++++--- + net/tls/tls_sw.c | 3 ++- + 3 files changed, 11 insertions(+), 5 deletions(-) + +diff --git a/net/tls/tls.h b/net/tls/tls.h +index 774859b63f0d..4e077068e6d9 100644 +--- a/net/tls/tls.h ++++ b/net/tls/tls.h +@@ -196,7 +196,7 @@ void tls_strp_msg_done(struct tls_strparser *strp); + int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb); + void tls_rx_msg_ready(struct tls_strparser *strp); + +-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh); ++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh); + int tls_strp_msg_cow(struct tls_sw_context_rx *ctx); + struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx); + int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst); +diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c +index 095cf31bae0b..d71643b494a1 100644 +--- a/net/tls/tls_strp.c ++++ b/net/tls/tls_strp.c +@@ -475,7 +475,7 @@ static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len) + strp->stm.offset = offset; + } + +-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) ++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) + { + struct strp_msg *rxm; + struct tls_msg *tlm; +@@ -484,8 +484,11 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) + DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len); + + if (!strp->copy_mode && force_refresh) { +- if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len)) +- return; ++ if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) { ++ WRITE_ONCE(strp->msg_ready, 0); ++ memset(&strp->stm, 0, sizeof(strp->stm)); ++ return false; ++ } + + tls_strp_load_anchor_with_queue(strp, strp->stm.full_len); + } +@@ -495,6 +498,8 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) + rxm->offset = strp->stm.offset; + tlm = tls_msg(strp->anchor); + tlm->control = strp->mark; ++ ++ return true; + } + + /* Called with lock held on lower socket */ +diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c +index 549d1ea01a72..51c98a007dda 100644 +--- a/net/tls/tls_sw.c ++++ b/net/tls/tls_sw.c +@@ -1384,7 +1384,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, + return sock_intr_errno(timeo); + } + +- tls_strp_msg_load(&ctx->strp, released); ++ if (unlikely(!tls_strp_msg_load(&ctx->strp, released))) ++ return tls_rx_rec_wait(sk, psock, nonblock, false); + + return 1; + } +-- +2.50.1 + diff --git a/queue-6.16/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-6.16/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch new file mode 100644 index 0000000000..7ab062f0bd --- /dev/null +++ b/queue-6.16/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch @@ -0,0 
+1,51 @@ +From db3787d79bafca5f191f3fc9d0d8f261e0c90bba Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Aug 2025 11:26:27 +0200 +Subject: udp: also consider secpath when evaluating ipsec use for checksumming + +From: Sabrina Dubroca + +[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ] + +Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in +IPsec case") tried to fix checksumming in UFO when the packets are +going through IPsec, so that we can't rely on offloads because the UDP +header and payload will be encrypted. + +But when doing a TCP test over VXLAN going through IPsec transport +mode with GSO enabled (esp4_offload module loaded), I'm seeing broken +UDP checksums on the encap after successful decryption. + +The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via +__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this +point we've already dropped the dst (unless the device sets +IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and +we proceed with checksum offload. + +Make need_ipsec also check the secpath, which is not dropped on this +callpath. + +Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case") +Signed-off-by: Sabrina Dubroca +Signed-off-by: Steffen Klassert +Signed-off-by: Sasha Levin +--- + net/ipv4/udp_offload.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c +index e0a6bfa95118..eeac86bacdba 100644 +--- a/net/ipv4/udp_offload.c ++++ b/net/ipv4/udp_offload.c +@@ -224,7 +224,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, + remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); + skb->remcsum_offload = remcsum; + +- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); ++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb); + /* Try to offload checksum if possible */ + offload_csum = !!(need_csum && + !need_ipsec && +-- +2.50.1 + diff --git a/queue-6.16/xfrm-bring-back-device-check-in-validate_xmit_xfrm.patch b/queue-6.16/xfrm-bring-back-device-check-in-validate_xmit_xfrm.patch new file mode 100644 index 0000000000..8549767a29 --- /dev/null +++ b/queue-6.16/xfrm-bring-back-device-check-in-validate_xmit_xfrm.patch @@ -0,0 +1,42 @@ +From 813b078a896269b17b6d904ba2cae892c2cc6d03 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Aug 2025 11:26:26 +0200 +Subject: xfrm: bring back device check in validate_xmit_xfrm + +From: Sabrina Dubroca + +[ Upstream commit 65f079a6c446a939eefe71e6d5957d5d6365fcf9 ] + +This is partial revert of commit d53dda291bbd993a29b84d358d282076e3d01506. + +This change causes traffic using GSO with SW crypto running through a +NIC capable of HW offload to no longer get segmented during +validate_xmit_xfrm, and is unrelated to the bonding use case mentioned +in the commit. 
+ +Fixes: d53dda291bbd ("xfrm: Remove unneeded device check from validate_xmit_xfrm") +Signed-off-by: Sabrina Dubroca +Reviewed-by: Cosmin Ratiu +Signed-off-by: Steffen Klassert +Signed-off-by: Sasha Levin +--- + net/xfrm/xfrm_device.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c +index 1f88472aaac0..c7a1f080d2de 100644 +--- a/net/xfrm/xfrm_device.c ++++ b/net/xfrm/xfrm_device.c +@@ -155,7 +155,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur + return skb; + } + +- if (skb_is_gso(skb) && unlikely(xmit_xfrm_check_overflow(skb))) { ++ if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) || ++ unlikely(xmit_xfrm_check_overflow(skb)))) { + struct sk_buff *segs; + + /* Packet got rerouted, fixup features and segment it. */ +-- +2.50.1 + diff --git a/queue-6.16/xfrm-flush-all-states-in-xfrm_state_fini.patch b/queue-6.16/xfrm-flush-all-states-in-xfrm_state_fini.patch new file mode 100644 index 0000000000..499845f043 --- /dev/null +++ b/queue-6.16/xfrm-flush-all-states-in-xfrm_state_fini.patch @@ -0,0 +1,61 @@ +From 72312704ac548ae1474e05e1017aeb91aceb8a58 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Aug 2025 11:05:43 +0200 +Subject: xfrm: flush all states in xfrm_state_fini + +From: Sabrina Dubroca + +[ Upstream commit 42e42562c9cfcdacf000f1b42284a4fad24f8546 ] + +While reverting commit f75a2804da39 ("xfrm: destroy xfrm_state +synchronously on net exit path"), I incorrectly changed +xfrm_state_flush's "proto" argument back to IPSEC_PROTO_ANY. This +reverts some of the changes in commit dbb2483b2a46 ("xfrm: clean up +xfrm protocol checks"), and leads to some states not being removed +when we exit the netns. + +Pass 0 instead of IPSEC_PROTO_ANY from both xfrm_state_fini +xfrm6_tunnel_net_exit, so that xfrm_state_flush deletes all states. 
+ +Fixes: 2a198bbec691 ("Revert "xfrm: destroy xfrm_state synchronously on net exit path"") +Reported-by: syzbot+6641a61fe0e2e89ae8c5@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=6641a61fe0e2e89ae8c5 +Tested-by: syzbot+6641a61fe0e2e89ae8c5@syzkaller.appspotmail.com +Signed-off-by: Sabrina Dubroca +Reviewed-by: Simon Horman +Signed-off-by: Steffen Klassert +Signed-off-by: Sasha Levin +--- + net/ipv6/xfrm6_tunnel.c | 2 +- + net/xfrm/xfrm_state.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c +index 5120a763da0d..0a0eeaed0591 100644 +--- a/net/ipv6/xfrm6_tunnel.c ++++ b/net/ipv6/xfrm6_tunnel.c +@@ -334,7 +334,7 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + unsigned int i; + +- xfrm_state_flush(net, IPSEC_PROTO_ANY, false); ++ xfrm_state_flush(net, 0, false); + xfrm_flush_gc(); + + for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) +diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c +index 97ff756191ba..5f1da305eea8 100644 +--- a/net/xfrm/xfrm_state.c ++++ b/net/xfrm/xfrm_state.c +@@ -3278,7 +3278,7 @@ void xfrm_state_fini(struct net *net) + unsigned int sz; + + flush_work(&net->xfrm.state_hash_work); +- xfrm_state_flush(net, IPSEC_PROTO_ANY, false); ++ xfrm_state_flush(net, 0, false); + flush_work(&xfrm_state_gc_work); + + WARN_ON(!list_empty(&net->xfrm.state_all)); +-- +2.50.1 + diff --git a/queue-6.16/xfrm-restore-gso-for-sw-crypto.patch b/queue-6.16/xfrm-restore-gso-for-sw-crypto.patch new file mode 100644 index 0000000000..eff6391b0b --- /dev/null +++ b/queue-6.16/xfrm-restore-gso-for-sw-crypto.patch @@ -0,0 +1,58 @@ +From da846e56cf21a09f2ac3e5528405cfcdb9f956cc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Aug 2025 11:26:25 +0200 +Subject: xfrm: restore GSO for SW crypto + +From: Sabrina Dubroca + +[ Upstream commit 234d1eff5d4987024be9d40ac07b918a5ae8db1a ] + +Commit 49431af6c4ef incorrectly assumes that the GSO path is only used +by HW offload, but it's also useful for SW crypto. + +This patch re-enables GSO for SW crypto. It's not an exact revert to +preserve the other changes made to xfrm_dev_offload_ok afterwards, but +it reverts all of its effects. 
+ +Fixes: 49431af6c4ef ("xfrm: rely on XFRM offload") +Signed-off-by: Sabrina Dubroca +Reviewed-by: Leon Romanovsky +Reviewed-by: Zhu Yanjun +Signed-off-by: Steffen Klassert +Signed-off-by: Sasha Levin +--- + net/xfrm/xfrm_device.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c +index d2819baea414..1f88472aaac0 100644 +--- a/net/xfrm/xfrm_device.c ++++ b/net/xfrm/xfrm_device.c +@@ -415,10 +415,12 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) + struct net_device *dev = x->xso.dev; + bool check_tunnel_size; + +- if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED) ++ if (!x->type_offload || ++ (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED && x->encap)) + return false; + +- if ((dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) { ++ if ((!dev || dev == xfrm_dst_path(dst)->dev) && ++ !xdst->child->xfrm) { + mtu = xfrm_state_mtu(x, xdst->child_mtu_cached); + if (skb->len <= mtu) + goto ok; +@@ -430,6 +432,9 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) + return false; + + ok: ++ if (!dev) ++ return true; ++ + check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET && + x->props.mode == XFRM_MODE_TUNNEL; + switch (x->props.family) { +-- +2.50.1 + diff --git a/queue-6.6/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch b/queue-6.6/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch new file mode 100644 index 0000000000..589aa2319b --- /dev/null +++ b/queue-6.6/cpuidle-governors-menu-avoid-using-invalid-recent-in.patch @@ -0,0 +1,91 @@ +From a8a27555c9f7f82114edc7e731fd752b9483bf61 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 11 Aug 2025 17:03:11 +0200 +Subject: cpuidle: governors: menu: Avoid using invalid recent intervals data + +From: Rafael J. Wysocki + +[ Upstream commit fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed ] + +Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid +discarding useful information") caused the number of wakeup interrupts +to increase on an idle system [1], which was not expected to happen +after merely allowing shallower idle states to be selected by the +governor in some cases. + +However, on the system in question, all of the idle states deeper than +WFI are rejected by the driver due to a firmware issue [2]. This causes +the governor to only consider the recent interval duriation data +corresponding to attempts to enter WFI that are successful and the +recent invervals table is filled with values lower than the scheduler +tick period. Consequently, the governor predicts an idle duration +below the scheduler tick period length and avoids stopping the tick +more often which leads to the observed symptom. + +Address it by modifying the governor to update the recent intervals +table also when entering the previously selected idle state fails, so +it knows that the short idle intervals might have been the minority +had the selected idle states been actually entered every time. + +Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information") +Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1] +Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2] +Signed-off-by: Rafael J. 
Wysocki +Tested-by: Christian Loehle +Tested-by: Marc Zyngier +Reviewed-by: Christian Loehle +Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki +Signed-off-by: Sasha Levin +--- + drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++---- + 1 file changed, 17 insertions(+), 4 deletions(-) + +diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c +index edd9a8fb9878..f3a071ac3b2a 100644 +--- a/drivers/cpuidle/governors/menu.c ++++ b/drivers/cpuidle/governors/menu.c +@@ -160,6 +160,14 @@ static inline int performance_multiplier(unsigned int nr_iowaiters) + + static DEFINE_PER_CPU(struct menu_device, menu_devices); + ++static void menu_update_intervals(struct menu_device *data, unsigned int interval_us) ++{ ++ /* Update the repeating-pattern data. */ ++ data->intervals[data->interval_ptr++] = interval_us; ++ if (data->interval_ptr >= INTERVALS) ++ data->interval_ptr = 0; ++} ++ + static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); + + /* +@@ -284,6 +292,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, + if (data->needs_update) { + menu_update(drv, dev); + data->needs_update = 0; ++ } else if (!dev->last_residency_ns) { ++ /* ++ * This happens when the driver rejects the previously selected ++ * idle state and returns an error, so update the recent ++ * intervals table to prevent invalid information from being ++ * used going forward. ++ */ ++ menu_update_intervals(data, UINT_MAX); + } + + nr_iowaiters = nr_iowait_cpu(dev->cpu); +@@ -553,10 +569,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) + + data->correction_factor[data->bucket] = new_factor; + +- /* update the repeating-pattern data */ +- data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns); +- if (data->interval_ptr >= INTERVALS) +- data->interval_ptr = 0; ++ menu_update_intervals(data, ktime_to_us(measured_ns)); + } + + /** +-- +2.50.1 + diff --git a/queue-6.6/intel_idle-allow-loading-acpi-tables-for-any-family.patch b/queue-6.6/intel_idle-allow-loading-acpi-tables-for-any-family.patch new file mode 100644 index 0000000000..606f124ccc --- /dev/null +++ b/queue-6.6/intel_idle-allow-loading-acpi-tables-for-any-family.patch @@ -0,0 +1,41 @@ +From 49fe61784622f7ba647e5db6df74aa4619ee0dd9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 8 Aug 2025 15:37:14 -0400 +Subject: intel_idle: Allow loading ACPI tables for any family + +From: Len Brown + +[ Upstream commit e91a158b694d7f4bd937763dde79ed0afa472d8a ] + +There is no reason to limit intel_idle's loading of ACPI tables to +family 6. Upcoming Intel processors are not in family 6. + +Below "Fixes" really means "applies cleanly until". +That syntax commit didn't change the previous logic, +but shows this patch applies back 5-years. + +Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros") +Signed-off-by: Len Brown +Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com +Signed-off-by: Rafael J. 
Wysocki +Signed-off-by: Sasha Levin +--- + drivers/idle/intel_idle.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 44842f243f40..6908052dea77 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -1432,7 +1432,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { + }; + + static const struct x86_cpu_id intel_mwait_ids[] __initconst = { +- X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL), ++ X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL), + {} + }; + +-- +2.50.1 + diff --git a/queue-6.6/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch b/queue-6.6/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch new file mode 100644 index 0000000000..4b906e2f7c --- /dev/null +++ b/queue-6.6/kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch @@ -0,0 +1,117 @@ +From 430f336058e516557bd4d0e04e7597307b4b537c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:38 -0700 +Subject: KVM: nVMX: Check vmcs12->guest_ia32_debugctl on nested VM-Enter + +From: Maxim Levitsky + +[ Upstream commit 095686e6fcb4150f0a55b1a25987fad3d8af58d6 ] + +Add a consistency check for L2's guest_ia32_debugctl, as KVM only supports +a subset of hardware functionality, i.e. KVM can't rely on hardware to +detect illegal/unsupported values. Failure to check the vmcs12 value +would allow the guest to load any harware-supported value while running L2. + +Take care to exempt BTF and LBR from the validity check in order to match +KVM's behavior for writes via WRMSR, but without clobbering vmcs12. Even +if VM_EXIT_SAVE_DEBUG_CONTROLS is set in vmcs12, L1 can reasonably expect +that vmcs12->guest_ia32_debugctl will not be modified if writes to the MSR +are being intercepted. + +Arguably, KVM _should_ update vmcs12 if VM_EXIT_SAVE_DEBUG_CONTROLS is set +*and* writes to MSR_IA32_DEBUGCTLMSR are not being intercepted by L1, but +that would incur non-trivial complexity and wouldn't change the fact that +KVM's handling of DEBUGCTL is blatantly broken. I.e. the extra complexity +is not worth carrying. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Maxim Levitsky +Co-developed-by: Sean Christopherson +Link: https://lore.kernel.org/r/20250610232010.162191-7-seanjc@google.com +Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs") +Signed-off-by: Sasha Levin +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/nested.c | 12 ++++++++++-- + arch/x86/kvm/vmx/vmx.c | 5 ++--- + arch/x86/kvm/vmx/vmx.h | 3 +++ + 3 files changed, 15 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c +index fdf7503491f9..10236ecdad95 100644 +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -2564,7 +2564,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); +- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); ++ vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl & ++ vmx_get_supported_debugctl(vcpu, false)); + } else { + kvm_set_dr(vcpu, 7, vcpu->arch.dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); +@@ -3045,7 +3046,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, + return -EINVAL; + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && +- CC(!kvm_dr7_valid(vmcs12->guest_dr7))) ++ (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || ++ CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) + return -EINVAL; + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && +@@ -4435,6 +4437,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) + (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | + (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); + ++ /* ++ * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. ++ * Writes to DEBUGCTL that aren't intercepted by L1 are immediately ++ * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into ++ * vmcs02 doesn't strictly track vmcs12. 
++ */ + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) + kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index aaa767ed170e..32f1a38a1010 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -2149,7 +2149,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, + return (unsigned long)data; + } + +-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) ++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) + { + u64 debugctl = 0; + +@@ -2168,8 +2168,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated + return debugctl; + } + +-static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, +- bool host_initiated) ++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated) + { + u64 invalid; + +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index 50d32d830890..5816fdd2dfa8 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -429,6 +429,9 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, + + void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); + ++u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated); ++bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated); ++ + /* + * Note, early Intel manuals have the write-low and read-high bitmap offsets + * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and +-- +2.50.1 + diff --git a/queue-6.6/kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch b/queue-6.6/kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch new file mode 100644 index 0000000000..06d5948e9d --- /dev/null +++ b/queue-6.6/kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch @@ -0,0 +1,156 @@ +From dbe73b19d1fdc8dd7627b1c3710fd6f9191a1429 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:24 -0700 +Subject: KVM: nVMX: Defer SVI update to vmcs01 on EOI when L2 is active w/o + VID +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Chao Gao + +[ Upstream commit 04bc93cf49d16d01753b95ddb5d4f230b809a991 ] + +If KVM emulates an EOI for L1's virtual APIC while L2 is active, defer +updating GUEST_INTERUPT_STATUS.SVI, i.e. the VMCS's cache of the highest +in-service IRQ, until L1 is active, as vmcs01, not vmcs02, needs to track +vISR. The missed SVI update for vmcs01 can result in L1 interrupts being +incorrectly blocked, e.g. if there is a pending interrupt with lower +priority than the interrupt that was EOI'd. + +This bug only affects use cases where L1's vAPIC is effectively passed +through to L2, e.g. in a pKVM scenario where L2 is L1's depriveleged host, +as KVM will only emulate an EOI for L1's vAPIC if Virtual Interrupt +Delivery (VID) is disabled in vmc12, and L1 isn't intercepting L2 accesses +to its (virtual) APIC page (or if x2APIC is enabled, the EOI MSR). + +WARN() if KVM updates L1's ISR while L2 is active with VID enabled, as an +EOI from L2 is supposed to affect L2's vAPIC, but still defer the update, +to try to keep L1 alive. 
Specifically, KVM forwards all APICv-related +VM-Exits to L1 via nested_vmx_l1_wants_exit(): + + case EXIT_REASON_APIC_ACCESS: + case EXIT_REASON_APIC_WRITE: + case EXIT_REASON_EOI_INDUCED: + /* + * The controls for "virtualize APIC accesses," "APIC- + * register virtualization," and "virtual-interrupt + * delivery" only come from vmcs12. + */ + return true; + +Fixes: c7c9c56ca26f ("x86, apicv: add virtual interrupt delivery support") +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/kvm/20230312180048.1778187-1-jason.cj.chen@intel.com +Reported-by: Markku Ahvenjärvi +Closes: https://lore.kernel.org/all/20240920080012.74405-1-mankku@gmail.com +Cc: Janne Karhunen +Signed-off-by: Chao Gao +[sean: drop request, handle in VMX, write changelog] +Tested-by: Chao Gao +Link: https://lore.kernel.org/r/20241128000010.4051275-3-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: resolve minor syntactic conflict in lapic.h, account for lack of + kvm_x86_call(), drop sanity check due to lack of wants_to_run] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/lapic.c | 11 +++++++++++ + arch/x86/kvm/lapic.h | 1 + + arch/x86/kvm/vmx/nested.c | 5 +++++ + arch/x86/kvm/vmx/vmx.c | 16 ++++++++++++++++ + arch/x86/kvm/vmx/vmx.h | 1 + + 5 files changed, 34 insertions(+) + +diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c +index cbf85a1ffb74..ba1c2a7f74f7 100644 +--- a/arch/x86/kvm/lapic.c ++++ b/arch/x86/kvm/lapic.c +@@ -803,6 +803,17 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) + } + } + ++void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu) ++{ ++ struct kvm_lapic *apic = vcpu->arch.apic; ++ ++ if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active) ++ return; ++ ++ static_call(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); ++} ++EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr); ++ + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) + { + /* This may race with setting of irr in __apic_accept_irq() and +diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h +index 0a0ea4b5dd8c..0dd069b8d6d1 100644 +--- a/arch/x86/kvm/lapic.h ++++ b/arch/x86/kvm/lapic.h +@@ -124,6 +124,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); + int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); + int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); + enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu); ++void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu); + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); + + u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); +diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c +index d3e346a574f1..fdf7503491f9 100644 +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -4900,6 +4900,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + } + ++ if (vmx->nested.update_vmcs01_hwapic_isr) { ++ vmx->nested.update_vmcs01_hwapic_isr = false; ++ kvm_apic_update_hwapic_isr(vcpu); ++ } ++ + if ((vm_exit_reason != -1) && + (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) + vmx->nested.need_vmcs12_to_shadow_sync = true; +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index cde01eb1f5e3..4563e7a9a851 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -6839,6 +6839,22 @@ static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) + u16 status; + u8 
old;
+ 
++	/*
++	 * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
++	 * is only relevant for if and only if Virtual Interrupt Delivery is
++	 * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's
++	 * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested
++	 * VM-Exit, otherwise L1 with run with a stale SVI.
++	 */
++	if (is_guest_mode(vcpu)) {
++		/*
++		 * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
++		 * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
++		 */
++		to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
++		return;
++	}
++
+ 	if (max_isr == -1)
+ 		max_isr = 0;
+ 
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 6be1627d888e..88c5b7ebf9d3 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -177,6 +177,7 @@ struct nested_vmx {
+ 	bool reload_vmcs01_apic_access_page;
+ 	bool update_vmcs01_cpu_dirty_logging;
+ 	bool update_vmcs01_apicv_status;
++	bool update_vmcs01_hwapic_isr;
+ 
+ 	/*
+ 	 * Enlightened VMCS has been enabled. It does not mean that L1 has to
+-- 
+2.50.1
+
diff --git a/queue-6.6/kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch b/queue-6.6/kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch
new file mode 100644
index 0000000000..440e714ac0
--- /dev/null
+++ b/queue-6.6/kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch
@@ -0,0 +1,123 @@
+From 5b61b1298fba19f1de5269df84bb30ea8bbb71f4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Thu, 14 Aug 2025 17:25:22 -0700
+Subject: KVM: SVM: Set RFLAGS.IF=1 in C code, to get VMRUN out of the STI
+ shadow
+
+From: Sean Christopherson
+
+[ Upstream commit be45bc4eff33d9a7dae84a2150f242a91a617402 ]
+
+Enable/disable local IRQs, i.e. set/clear RFLAGS.IF, in the common
+svm_vcpu_enter_exit() just after/before guest_state_{enter,exit}_irqoff()
+so that VMRUN is not executed in an STI shadow. AMD CPUs have a quirk
+(some would say "bug"), where the STI shadow bleeds into the guest's
+intr_state field if a #VMEXIT occurs during injection of an event, i.e. if
+the VMRUN doesn't complete before the subsequent #VMEXIT.
+
+The spurious "interrupts masked" state is relatively benign, as it only
+occurs during event injection and is transient. Because KVM is already
+injecting an event, the guest can't be in HLT, and if KVM is querying IRQ
+blocking for injection, then KVM would need to force an immediate exit
+anyways since injecting multiple events is impossible.
+
+However, because KVM copies int_state verbatim from vmcb02 to vmcb12, the
+spurious STI shadow is visible to L1 when running a nested VM, which can
+trip sanity checks, e.g. in VMware's VMM.
+
+Hoist the STI+CLI all the way to C code, as the aforementioned calls to
+guest_state_{enter,exit}_irqoff() already inform lockdep that IRQs are
+enabled/disabled, and taking a fault on VMRUN with RFLAGS.IF=1 is already
+possible. I.e. if there's kernel code that is confused by running with
+RFLAGS.IF=1, then it's already a problem. In practice, since GIF=0 also
+blocks NMIs, the only change in exposure to non-KVM code (relative to
+surrounding VMRUN with STI+CLI) is exception handling code, and except for
+the kvm_rebooting=1 case, all exceptions in the core VM-Enter/VM-Exit path
+are fatal.
+
+Use the "raw" variants to enable/disable IRQs to avoid tracing in the
+"no instrumentation" code; the guest state helpers also take care of
+tracing IRQ state.
+
+Opportunistically document why KVM needs to do STI in the first place. 
+ +Reported-by: Doug Covelli +Closes: https://lore.kernel.org/all/CADH9ctBs1YPmE4aCfGPNBwA10cA8RuAk2gO7542DjMZgs4uzJQ@mail.gmail.com +Fixes: f14eec0a3203 ("KVM: SVM: move more vmentry code to assembly") +Cc: stable@vger.kernel.org +Reviewed-by: Jim Mattson +Link: https://lore.kernel.org/r/20250224165442.2338294-2-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: resolve minor syntatic conflict in __svm_sev_es_vcpu_run()] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/svm/svm.c | 14 ++++++++++++++ + arch/x86/kvm/svm/vmenter.S | 9 +-------- + 2 files changed, 15 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index 86c50747e158..abbb84ddfe02 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4170,6 +4170,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in + + guest_state_enter_irqoff(); + ++ /* ++ * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of ++ * VMRUN controls whether or not physical IRQs are masked (KVM always ++ * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the ++ * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow ++ * into guest state if delivery of an event during VMRUN triggers a ++ * #VMEXIT, and the guest_state transitions already tell lockdep that ++ * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of ++ * this path, so IRQs aren't actually unmasked while running host code. ++ */ ++ raw_local_irq_enable(); ++ + amd_clear_divider(); + + if (sev_es_guest(vcpu->kvm)) +@@ -4177,6 +4189,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in + else + __svm_vcpu_run(svm, spec_ctrl_intercepted); + ++ raw_local_irq_disable(); ++ + guest_state_exit_irqoff(); + } + +diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S +index 56fe34d9397f..81ecb9e1101d 100644 +--- a/arch/x86/kvm/svm/vmenter.S ++++ b/arch/x86/kvm/svm/vmenter.S +@@ -171,12 +171,8 @@ SYM_FUNC_START(__svm_vcpu_run) + VM_CLEAR_CPU_BUFFERS + + /* Enter guest mode */ +- sti +- + 3: vmrun %_ASM_AX + 4: +- cli +- + /* Pop @svm to RAX while it's the only available register. */ + pop %_ASM_AX + +@@ -341,11 +337,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) + VM_CLEAR_CPU_BUFFERS + + /* Enter guest mode */ +- sti +- + 1: vmrun %_ASM_AX +- +-2: cli ++2: + + /* Pop @svm to RDI, guest registers have been saved already. */ + pop %_ASM_DI +-- +2.50.1 + diff --git a/queue-6.6/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch b/queue-6.6/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch new file mode 100644 index 0000000000..cbe96943c1 --- /dev/null +++ b/queue-6.6/kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch @@ -0,0 +1,63 @@ +From f4b282d7e8425cc2b1c48f385fa3c049a29e137b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:36 -0700 +Subject: KVM: VMX: Allow guest to set DEBUGCTL.RTM_DEBUG if RTM is supported + +From: Sean Christopherson + +[ Upstream commit 17ec2f965344ee3fd6620bef7ef68792f4ac3af0 ] + +Let the guest set DEBUGCTL.RTM_DEBUG if RTM is supported according to the +guest CPUID model, as debug support is supposed to be available if RTM is +supported, and there are no known downsides to letting the guest debug RTM +aborts. 
+ +Note, there are no known bug reports related to RTM_DEBUG, the primary +motivation is to reduce the probability of breaking existing guests when a +future change adds a missing consistency check on vmcs12.GUEST_DEBUGCTL +(KVM currently lets L2 run with whatever hardware supports; whoops). + +Note #2, KVM already emulates DR6.RTM, and doesn't restrict access to +DR7.RTM. + +Fixes: 83c529151ab0 ("KVM: x86: expose Intel cpu new features (HLE, RTM) to guest") +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20250610232010.162191-5-seanjc@google.com +Signed-off-by: Sasha Levin +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/msr-index.h | 1 + + arch/x86/kvm/vmx/vmx.c | 4 ++++ + 2 files changed, 5 insertions(+) + +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index 033855457581..723e48b57bd0 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -380,6 +380,7 @@ + #define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12) + #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14 + #define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT) ++#define DEBUGCTLMSR_RTM_DEBUG BIT(15) + + #define MSR_PEBS_FRONTEND 0x000003f7 + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 08ca218ee858..359c3b7f52a1 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -2161,6 +2161,10 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated + (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) + debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + ++ if (boot_cpu_has(X86_FEATURE_RTM) && ++ (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_RTM))) ++ debugctl |= DEBUGCTLMSR_RTM_DEBUG; ++ + return debugctl; + } + +-- +2.50.1 + diff --git a/queue-6.6/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch b/queue-6.6/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch new file mode 100644 index 0000000000..33b3b739db --- /dev/null +++ b/queue-6.6/kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch @@ -0,0 +1,90 @@ +From b170b1c7fa1f5907611a190f0e1c6fa6d1ae712e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:37 -0700 +Subject: KVM: VMX: Extract checking of guest's DEBUGCTL into helper + +From: Sean Christopherson + +[ Upstream commit 8a4351ac302cd8c19729ba2636acfd0467c22ae8 ] + +Move VMX's logic to check DEBUGCTL values into a standalone helper so that +the code can be used by nested VM-Enter to apply the same logic to the +value being loaded from vmcs12. + +KVM needs to explicitly check vmcs12->guest_ia32_debugctl on nested +VM-Enter, as hardware may support features that KVM does not, i.e. relying +on hardware to detect invalid guest state will result in false negatives. +Unfortunately, that means applying KVM's funky suppression of BTF and LBR +to vmcs12 so as not to break existing guests. + +No functional change intended. 
+ +Reviewed-by: Dapeng Mi +Link: https://lore.kernel.org/r/20250610232010.162191-6-seanjc@google.com +Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs") +Signed-off-by: Sasha Levin +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/vmx.c | 29 +++++++++++++++++------------ + 1 file changed, 17 insertions(+), 12 deletions(-) + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 359c3b7f52a1..aaa767ed170e 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -2168,6 +2168,19 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated + return debugctl; + } + ++static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, ++ bool host_initiated) ++{ ++ u64 invalid; ++ ++ invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated); ++ if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) { ++ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); ++ invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR); ++ } ++ return !invalid; ++} ++ + /* + * Writes msr value into the appropriate "register". + * Returns 0 on success, non-0 otherwise. +@@ -2236,19 +2249,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + } + vmcs_writel(GUEST_SYSENTER_ESP, data); + break; +- case MSR_IA32_DEBUGCTLMSR: { +- u64 invalid; +- +- invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); +- if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { +- kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); +- data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); +- invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); +- } +- +- if (invalid) ++ case MSR_IA32_DEBUGCTLMSR: ++ if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated)) + return 1; + ++ data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); ++ + if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & + VM_EXIT_SAVE_DEBUG_CONTROLS) + get_vmcs12(vcpu)->guest_ia32_debugctl = data; +@@ -2258,7 +2264,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + (data & DEBUGCTLMSR_LBR)) + intel_pmu_create_guest_lbr_event(vcpu); + return 0; +- } + case MSR_IA32_BNDCFGS: + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && +-- +2.50.1 + diff --git a/queue-6.6/kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch b/queue-6.6/kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch new file mode 100644 index 0000000000..5f2148284c --- /dev/null +++ b/queue-6.6/kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch @@ -0,0 +1,56 @@ +From 10389ae08622b1effe126e28b1a647b66752a860 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:30 -0700 +Subject: KVM: VMX: Handle forced exit due to preemption timer in fastpath + +From: Sean Christopherson + +[ Upstream commit 11776aa0cfa7d007ad1799b1553bdcbd830e5010 ] + +Handle VMX preemption timer VM-Exits due to KVM forcing an exit in the +exit fastpath, i.e. avoid calling back into handle_preemption_timer() for +the same exit. There is no work to be done for forced exits, as the name +suggests the goal is purely to get control back in KVM. + +In addition to shaving a few cycles, this will allow cleanly separating +handle_fastpath_preemption_timer() from handle_preemption_timer(), e.g. 
+it's not immediately obvious why _apparently_ calling
+handle_fastpath_preemption_timer() twice on a "slow" exit is necessary:
+the "slow" call is necessary to handle exits from L2, which are excluded
+from the fastpath by vmx_vcpu_run().
+
+Link: https://lore.kernel.org/r/20240110012705.506918-4-seanjc@google.com
+Signed-off-by: Sean Christopherson
+Signed-off-by: Sasha Levin
+---
+ arch/x86/kvm/vmx/vmx.c | 13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 32b792387271..631fdd4a575a 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6027,12 +6027,15 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+ 	if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
+ 		return EXIT_FASTPATH_REENTER_GUEST;
+ 
+-	if (!vmx->req_immediate_exit) {
+-		kvm_lapic_expired_hv_timer(vcpu);
+-		return EXIT_FASTPATH_REENTER_GUEST;
+-	}
++	/*
++	 * If the timer expired because KVM used it to force an immediate exit,
++	 * then mission accomplished.
++	 */
++	if (vmx->req_immediate_exit)
++		return EXIT_FASTPATH_EXIT_HANDLED;
+ 
+-	return EXIT_FASTPATH_NONE;
++	kvm_lapic_expired_hv_timer(vcpu);
++	return EXIT_FASTPATH_REENTER_GUEST;
+ }
+ 
+ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+-- 
+2.50.1
+
diff --git a/queue-6.6/kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch b/queue-6.6/kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch
new file mode 100644
index 0000000000..fbc9dd4d02
--- /dev/null
+++ b/queue-6.6/kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch
@@ -0,0 +1,74 @@
+From 346070b6afc211b7d9c548666678021841dbbc67 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Thu, 14 Aug 2025 17:25:32 -0700
+Subject: KVM: VMX: Handle KVM-induced preemption timer exits in fastpath for
+ L2
+
+From: Sean Christopherson
+
+[ Upstream commit 7b3d1bbf8d68d76fb21210932a5e8ed8ea80dbcc ]
+
+Eat VMX preemption timer exits in the fastpath regardless of whether L1 or
+L2 is active. The VM-Exit is 100% KVM-induced, i.e. there is nothing
+directly related to the exit that KVM needs to do on behalf of the guest,
+thus there is no reason to wait until the slow path to do nothing.
+
+Opportunistically add comments explaining why preemption timer exits for
+emulating the guest's APIC timer need to go down the slow path.
+
+Link: https://lore.kernel.org/r/20240110012705.506918-6-seanjc@google.com
+Signed-off-by: Sean Christopherson
+Signed-off-by: Sasha Levin
+---
+ arch/x86/kvm/vmx/vmx.c | 22 ++++++++++++++++++++--
+ 1 file changed, 20 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 4c991d514015..0ecc0e996386 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6034,13 +6034,26 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+ 	if (vmx->req_immediate_exit)
+ 		return EXIT_FASTPATH_EXIT_HANDLED;
+ 
++	/*
++	 * If L2 is active, go down the slow path as emulating the guest timer
++	 * expiration likely requires synthesizing a nested VM-Exit.
++	 */
++	if (is_guest_mode(vcpu))
++		return EXIT_FASTPATH_NONE;
++
+ 	kvm_lapic_expired_hv_timer(vcpu);
+ 	return EXIT_FASTPATH_REENTER_GUEST;
+ }
+ 
+ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+ {
+-	handle_fastpath_preemption_timer(vcpu);
++	/*
++	 * This non-fastpath handler is reached if and only if the preemption
++	 * timer was being used to emulate a guest timer while L2 is active. 
++ * All other scenarios are supposed to be handled in the fastpath. ++ */ ++ WARN_ON_ONCE(!is_guest_mode(vcpu)); ++ kvm_lapic_expired_hv_timer(vcpu); + return 1; + } + +@@ -7258,7 +7271,12 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, + + static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) + { +- if (is_guest_mode(vcpu)) ++ /* ++ * If L2 is active, some VMX preemption timer exits can be handled in ++ * the fastpath even, all other exits must use the slow path. ++ */ ++ if (is_guest_mode(vcpu) && ++ to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER) + return EXIT_FASTPATH_NONE; + + switch (to_vmx(vcpu)->exit_reason.basic) { +-- +2.50.1 + diff --git a/queue-6.6/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch b/queue-6.6/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch new file mode 100644 index 0000000000..ffe7a41fa7 --- /dev/null +++ b/queue-6.6/kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch @@ -0,0 +1,191 @@ +From 40b4fc9e84bf81654f1ef6040150a04d2e2fc2fc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:40 -0700 +Subject: KVM: VMX: Preserve host's DEBUGCTLMSR_FREEZE_IN_SMM while running the + guest + +From: Maxim Levitsky + +[ Upstream commit 6b1dd26544d045f6a79e8c73572c0c0db3ef3c1a ] + +Set/clear DEBUGCTLMSR_FREEZE_IN_SMM in GUEST_IA32_DEBUGCTL based on the +host's pre-VM-Enter value, i.e. preserve the host's FREEZE_IN_SMM setting +while running the guest. When running with the "default treatment of SMIs" +in effect (the only mode KVM supports), SMIs do not generate a VM-Exit that +is visible to host (non-SMM) software, and instead transitions directly +from VMX non-root to SMM. And critically, DEBUGCTL isn't context switched +by hardware on SMI or RSM, i.e. SMM will run with whatever value was +resident in hardware at the time of the SMI. + +Failure to preserve FREEZE_IN_SMM results in the PMU unexpectedly counting +events while the CPU is executing in SMM, which can pollute profiling and +potentially leak information into the guest. + +Check for changes in FREEZE_IN_SMM prior to every entry into KVM's inner +run loop, as the bit can be toggled in IRQ context via IPI callback (SMP +function call), by way of /sys/devices/cpu/freeze_on_smi. + +Add a field in kvm_x86_ops to communicate which DEBUGCTL bits need to be +preserved, as FREEZE_IN_SMM is only supported and defined for Intel CPUs, +i.e. explicitly checking FREEZE_IN_SMM in common x86 is at best weird, and +at worst could lead to undesirable behavior in the future if AMD CPUs ever +happened to pick up a collision with the bit. + +Exempt TDX vCPUs, i.e. protected guests, from the check, as the TDX Module +owns and controls GUEST_IA32_DEBUGCTL. + +WARN in SVM if KVM_RUN_LOAD_DEBUGCTL is set, mostly to document that the +lack of handling isn't a KVM bug (TDX already WARNs on any run_flag). + +Lastly, explicitly reload GUEST_IA32_DEBUGCTL on a VM-Fail that is missed +by KVM but detected by hardware, i.e. in nested_vmx_restore_host_state(). +Doing so avoids the need to track host_debugctl on a per-VMCS basis, as +GUEST_IA32_DEBUGCTL is unconditionally written by prepare_vmcs02() and +load_vmcs12_host_state(). For the VM-Fail case, even though KVM won't +have actually entered the guest, vcpu_enter_guest() will have run with +vmcs02 active and thus could result in vmcs01 being run with a stale value. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Maxim Levitsky +Co-developed-by: Sean Christopherson +Link: https://lore.kernel.org/r/20250610232010.162191-9-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: move vmx/main.c change to vmx/vmx.c] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm_host.h | 7 +++++++ + arch/x86/kvm/vmx/nested.c | 3 +++ + arch/x86/kvm/vmx/vmx.c | 5 +++++ + arch/x86/kvm/vmx/vmx.h | 15 ++++++++++++++- + arch/x86/kvm/x86.c | 14 ++++++++++++-- + 5 files changed, 41 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 7373b22c02a7..813887324d52 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1553,6 +1553,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) + enum kvm_x86_run_flags { + KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0), + KVM_RUN_LOAD_GUEST_DR6 = BIT(1), ++ KVM_RUN_LOAD_DEBUGCTL = BIT(2), + }; + + struct kvm_x86_ops { +@@ -1580,6 +1581,12 @@ struct kvm_x86_ops { + void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); + void (*vcpu_put)(struct kvm_vcpu *vcpu); + ++ /* ++ * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to ++ * match the host's value even while the guest is active. ++ */ ++ const u64 HOST_OWNED_DEBUGCTL; ++ + void (*update_exception_bitmap)(struct kvm_vcpu *vcpu); + int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); + int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); +diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c +index 2ce39ffbcefb..d2fa192d7ce7 100644 +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -4688,6 +4688,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) + WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); + } + ++ /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */ ++ vmx_reload_guest_debugctl(vcpu); ++ + /* + * Note that calling vmx_set_{efer,cr0,cr4} is important as they + * handle a variety of side effects to KVM's software model. +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index d0973bd7853c..9b1f22bcb716 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -7399,6 +7399,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) + if (run_flags & KVM_RUN_LOAD_GUEST_DR6) + set_debugreg(vcpu->arch.dr6, 6); + ++ if (run_flags & KVM_RUN_LOAD_DEBUGCTL) ++ vmx_reload_guest_debugctl(vcpu); ++ + /* + * Refresh vmcs.HOST_CR3 if necessary. 
This must be done immediately + * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time +@@ -8326,6 +8329,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { + .vcpu_load = vmx_vcpu_load, + .vcpu_put = vmx_vcpu_put, + ++ .HOST_OWNED_DEBUGCTL = DEBUGCTLMSR_FREEZE_IN_SMM, ++ + .update_exception_bitmap = vmx_update_exception_bitmap, + .get_msr_feature = vmx_get_msr_feature, + .get_msr = vmx_get_msr, +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index 769e70fd142c..5d73d3e570d7 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -434,12 +434,25 @@ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated) + + static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val) + { ++ WARN_ON_ONCE(val & DEBUGCTLMSR_FREEZE_IN_SMM); ++ ++ val |= vcpu->arch.host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM; + vmcs_write64(GUEST_IA32_DEBUGCTL, val); + } + + static inline u64 vmx_guest_debugctl_read(void) + { +- return vmcs_read64(GUEST_IA32_DEBUGCTL); ++ return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~DEBUGCTLMSR_FREEZE_IN_SMM; ++} ++ ++static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu) ++{ ++ u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL); ++ ++ if (!((val ^ vcpu->arch.host_debugctl) & DEBUGCTLMSR_FREEZE_IN_SMM)) ++ return; ++ ++ vmx_guest_debugctl_write(vcpu, val & ~DEBUGCTLMSR_FREEZE_IN_SMM); + } + + /* +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index fbb2e70e3031..fc2cafc33b37 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10518,7 +10518,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + dm_request_for_irq_injection(vcpu) && + kvm_cpu_accept_dm_intr(vcpu); + fastpath_t exit_fastpath; +- u64 run_flags; ++ u64 run_flags, debug_ctl; + + bool req_immediate_exit = false; + +@@ -10777,7 +10777,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + set_debugreg(0, 7); + } + +- vcpu->arch.host_debugctl = get_debugctlmsr(); ++ /* ++ * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL ++ * can be modified in IRQ context, e.g. via SMP function calls. Inform ++ * vendor code if any host-owned bits were changed, e.g. so that the ++ * value loaded into hardware while running the guest can be updated. ++ */ ++ debug_ctl = get_debugctlmsr(); ++ if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL && ++ !vcpu->arch.guest_state_protected) ++ run_flags |= KVM_RUN_LOAD_DEBUGCTL; ++ vcpu->arch.host_debugctl = debug_ctl; + + guest_timing_enter_irqoff(); + +-- +2.50.1 + diff --git a/queue-6.6/kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch b/queue-6.6/kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch new file mode 100644 index 0000000000..82cd3d27bd --- /dev/null +++ b/queue-6.6/kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch @@ -0,0 +1,49 @@ +From 010b3aed9b879bc35a20a52e2435f99c018fd9bc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:29 -0700 +Subject: KVM: VMX: Re-enter guest in fastpath for "spurious" preemption timer + exits + +From: Sean Christopherson + +[ Upstream commit e6b5d16bbd2d4c8259ad76aa33de80d561aba5f9 ] + +Re-enter the guest in the fast path if VMX preeemption timer VM-Exit was +"spurious", i.e. if KVM "soft disabled" the timer by writing -1u and by +some miracle the timer expired before any other VM-Exit occurred. 
This is +just an intermediate step to cleaning up the preemption timer handling, +optimizing these types of spurious VM-Exits is not interesting as they are +extremely rare/infrequent. + +Link: https://lore.kernel.org/r/20240110012705.506918-3-seanjc@google.com +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/vmx.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index ee501871ddb0..32b792387271 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -6019,8 +6019,15 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); + +- if (!vmx->req_immediate_exit && +- !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) { ++ /* ++ * In the *extremely* unlikely scenario that this is a spurious VM-Exit ++ * due to the timer expiring while it was "soft" disabled, just eat the ++ * exit and re-enter the guest. ++ */ ++ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) ++ return EXIT_FASTPATH_REENTER_GUEST; ++ ++ if (!vmx->req_immediate_exit) { + kvm_lapic_expired_hv_timer(vcpu); + return EXIT_FASTPATH_REENTER_GUEST; + } +-- +2.50.1 + diff --git a/queue-6.6/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch b/queue-6.6/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch new file mode 100644 index 0000000000..3dfeefdd5d --- /dev/null +++ b/queue-6.6/kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch @@ -0,0 +1,162 @@ +From 2277369c1b499bf85b3b553e281b264495bb2514 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:39 -0700 +Subject: KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs + +From: Maxim Levitsky + +[ Upstream commit 7d0cce6cbe71af6e9c1831bff101a2b9c249c4a2 ] + +Introduce vmx_guest_debugctl_{read,write}() to handle all accesses to +vmcs.GUEST_IA32_DEBUGCTL. This will allow stuffing FREEZE_IN_SMM into +GUEST_IA32_DEBUGCTL based on the host setting without bleeding the state +into the guest, and without needing to copy+paste the FREEZE_IN_SMM +logic into every patch that accesses GUEST_IA32_DEBUGCTL. + +No functional change intended. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Maxim Levitsky +[sean: massage changelog, make inline, use in all prepare_vmcs02() cases] +Reviewed-by: Dapeng Mi +Link: https://lore.kernel.org/r/20250610232010.162191-8-seanjc@google.com +Signed-off-by: Sasha Levin +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/nested.c | 10 +++++----- + arch/x86/kvm/vmx/pmu_intel.c | 8 ++++---- + arch/x86/kvm/vmx/vmx.c | 8 +++++--- + arch/x86/kvm/vmx/vmx.h | 10 ++++++++++ + 4 files changed, 24 insertions(+), 12 deletions(-) + +diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c +index 10236ecdad95..2ce39ffbcefb 100644 +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -2564,11 +2564,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); +- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl & +- vmx_get_supported_debugctl(vcpu, false)); ++ vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl & ++ vmx_get_supported_debugctl(vcpu, false)); + } else { + kvm_set_dr(vcpu, 7, vcpu->arch.dr7); +- vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); ++ vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); + } + if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) +@@ -3433,7 +3433,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) +- vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); ++ vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); + if (kvm_mpx_supported() && + (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) +@@ -4633,7 +4633,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, + __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); + + kvm_set_dr(vcpu, 7, 0x400); +- vmcs_write64(GUEST_IA32_DEBUGCTL, 0); ++ vmx_guest_debugctl_write(vcpu, 0); + + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, + vmcs12->vm_exit_msr_load_count)) +diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c +index 48a2f77f62ef..50364e00e4e9 100644 +--- a/arch/x86/kvm/vmx/pmu_intel.c ++++ b/arch/x86/kvm/vmx/pmu_intel.c +@@ -633,11 +633,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) + */ + static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu) + { +- u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL); ++ u64 data = vmx_guest_debugctl_read(); + + if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) { + data &= ~DEBUGCTLMSR_LBR; +- vmcs_write64(GUEST_IA32_DEBUGCTL, data); ++ vmx_guest_debugctl_write(vcpu, data); + } + } + +@@ -707,7 +707,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) + + if (!lbr_desc->event) { + vmx_disable_lbr_msrs_passthrough(vcpu); +- if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR) ++ if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR) + goto warn; + if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use)) + goto warn; +@@ -729,7 +729,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) + + static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) + { +- if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)) ++ if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)) + 
intel_pmu_release_guest_lbr_event(vcpu); + } + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 32f1a38a1010..d0973bd7853c 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -2124,7 +2124,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; + break; + case MSR_IA32_DEBUGCTLMSR: +- msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); ++ msr_info->data = vmx_guest_debugctl_read(); + break; + default: + find_uret_msr: +@@ -2258,7 +2258,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + VM_EXIT_SAVE_DEBUG_CONTROLS) + get_vmcs12(vcpu)->guest_ia32_debugctl = data; + +- vmcs_write64(GUEST_IA32_DEBUGCTL, data); ++ vmx_guest_debugctl_write(vcpu, data); ++ + if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && + (data & DEBUGCTLMSR_LBR)) + intel_pmu_create_guest_lbr_event(vcpu); +@@ -4826,7 +4827,8 @@ static void init_vmcs(struct vcpu_vmx *vmx) + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); +- vmcs_write64(GUEST_IA32_DEBUGCTL, 0); ++ ++ vmx_guest_debugctl_write(&vmx->vcpu, 0); + + if (cpu_has_vmx_tpr_shadow()) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index 5816fdd2dfa8..769e70fd142c 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -432,6 +432,16 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); + u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated); + bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated); + ++static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val) ++{ ++ vmcs_write64(GUEST_IA32_DEBUGCTL, val); ++} ++ ++static inline u64 vmx_guest_debugctl_read(void) ++{ ++ return vmcs_read64(GUEST_IA32_DEBUGCTL); ++} ++ + /* + * Note, early Intel manuals have the write-low and read-high bitmap offsets + * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and +-- +2.50.1 + diff --git a/queue-6.6/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch b/queue-6.6/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch new file mode 100644 index 0000000000..b337a35472 --- /dev/null +++ b/queue-6.6/kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch @@ -0,0 +1,138 @@ +From ab60a5a234aeb79d78d4830caee1d001313cd5e5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:34 -0700 +Subject: KVM: x86: Convert vcpu_run()'s immediate exit param into a generic + bitmap + +From: Sean Christopherson + +[ Upstream commit 2478b1b220c49d25cb1c3f061ec4f9b351d9a131 ] + +Convert kvm_x86_ops.vcpu_run()'s "force_immediate_exit" boolean parameter +into an a generic bitmap so that similar "take action" information can be +passed to vendor code without creating a pile of boolean parameters. + +This will allow dropping kvm_x86_ops.set_dr6() in favor of a new flag, and +will also allow for adding similar functionality for re-loading debugctl +in the active VMCS. + +Opportunistically massage the TDX WARN and comment to prepare for adding +more run_flags, all of which are expected to be mutually exclusive with +TDX, i.e. should be WARNed on. + +No functional change intended. 
+ +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20250610232010.162191-3-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: drop TDX crud, account for lack of kvm_x86_call()] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm_host.h | 6 +++++- + arch/x86/kvm/svm/svm.c | 4 ++-- + arch/x86/kvm/vmx/vmx.c | 3 ++- + arch/x86/kvm/x86.c | 10 ++++++++-- + 4 files changed, 17 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 8898ad8cb3de..aa6d04cd9ee6 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1550,6 +1550,10 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) + return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; + } + ++enum kvm_x86_run_flags { ++ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0), ++}; ++ + struct kvm_x86_ops { + const char *name; + +@@ -1625,7 +1629,7 @@ struct kvm_x86_ops { + + int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); + enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu, +- bool force_immediate_exit); ++ u64 run_flags); + int (*handle_exit)(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath); + int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index 4a53b38ea386..61e5e261cde2 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4197,9 +4197,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in + guest_state_exit_irqoff(); + } + +-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, +- bool force_immediate_exit) ++static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) + { ++ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; + struct vcpu_svm *svm = to_svm(vcpu); + bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 704e5a552b4f..065aac2f4bce 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -7345,8 +7345,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, + guest_state_exit_irqoff(); + } + +-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) ++static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) + { ++ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long cr3, cr4; + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index f3150d9a1918..ecc151397341 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10518,6 +10518,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + dm_request_for_irq_injection(vcpu) && + kvm_cpu_accept_dm_intr(vcpu); + fastpath_t exit_fastpath; ++ u64 run_flags; + + bool req_immediate_exit = false; + +@@ -10750,8 +10751,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + goto cancel_injection; + } + +- if (req_immediate_exit) ++ run_flags = 0; ++ if (req_immediate_exit) { ++ run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT; + kvm_make_request(KVM_REQ_EVENT, vcpu); ++ } + + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) +@@ -10787,7 +10791,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); + +- exit_fastpath = 
static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit); ++ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, run_flags); + if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) + break; + +@@ -10799,6 +10803,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + break; + } + ++ run_flags = 0; ++ + /* Note, VM-Exits that go down the "slow" path are accounted below. */ + ++vcpu->stat.exits; + } +-- +2.50.1 + diff --git a/queue-6.6/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch b/queue-6.6/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch new file mode 100644 index 0000000000..124fa5c1c5 --- /dev/null +++ b/queue-6.6/kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch @@ -0,0 +1,144 @@ +From 22c51f0290ecf799d1bb5992d6add57aaa64597f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:35 -0700 +Subject: KVM: x86: Drop kvm_x86_ops.set_dr6() in favor of a new KVM_RUN flag + +From: Sean Christopherson + +[ Upstream commit 80c64c7afea1da6a93ebe88d3d29d8a60377ef80 ] + +Instruct vendor code to load the guest's DR6 into hardware via a new +KVM_RUN flag, and remove kvm_x86_ops.set_dr6(), whose sole purpose was to +load vcpu->arch.dr6 into hardware when DR6 can be read/written directly +by the guest. + +Note, TDX already WARNs on any run_flag being set, i.e. will yell if KVM +thinks DR6 needs to be reloaded. TDX vCPUs force KVM_DEBUGREG_AUTO_SWITCH +and never clear the flag, i.e. should never observe KVM_RUN_LOAD_GUEST_DR6. + +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20250610232010.162191-4-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: account for lack of vmx/main.c] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm-x86-ops.h | 1 - + arch/x86/include/asm/kvm_host.h | 2 +- + arch/x86/kvm/svm/svm.c | 10 ++++++---- + arch/x86/kvm/vmx/vmx.c | 10 +++------- + arch/x86/kvm/x86.c | 2 +- + 5 files changed, 11 insertions(+), 14 deletions(-) + +diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h +index 8fe6667d945f..a0a4fc684e63 100644 +--- a/arch/x86/include/asm/kvm-x86-ops.h ++++ b/arch/x86/include/asm/kvm-x86-ops.h +@@ -48,7 +48,6 @@ KVM_X86_OP(set_idt) + KVM_X86_OP(get_gdt) + KVM_X86_OP(set_gdt) + KVM_X86_OP(sync_dirty_debug_regs) +-KVM_X86_OP(set_dr6) + KVM_X86_OP(set_dr7) + KVM_X86_OP(cache_reg) + KVM_X86_OP(get_rflags) +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index aa6d04cd9ee6..7373b22c02a7 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1552,6 +1552,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) + + enum kvm_x86_run_flags { + KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0), ++ KVM_RUN_LOAD_GUEST_DR6 = BIT(1), + }; + + struct kvm_x86_ops { +@@ -1600,7 +1601,6 @@ struct kvm_x86_ops { + void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); + void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); + void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); +- void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); + void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); + void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); + unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index 61e5e261cde2..abff6d45ae33 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4241,10 +4241,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu 
*vcpu, u64 run_flags) + svm_hv_update_vp_id(svm->vmcb, vcpu); + + /* +- * Run with all-zero DR6 unless needed, so that we can get the exact cause +- * of a #DB. ++ * Run with all-zero DR6 unless the guest can write DR6 freely, so that ++ * KVM can get the exact cause of a #DB. Note, loading guest DR6 from ++ * KVM's snapshot is only necessary when DR accesses won't exit. + */ +- if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) ++ if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6)) ++ svm_set_dr6(vcpu, vcpu->arch.dr6); ++ else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); + + clgi(); +@@ -5021,7 +5024,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { + .set_idt = svm_set_idt, + .get_gdt = svm_get_gdt, + .set_gdt = svm_set_gdt, +- .set_dr6 = svm_set_dr6, + .set_dr7 = svm_set_dr7, + .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, + .cache_reg = svm_cache_reg, +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 065aac2f4bce..08ca218ee858 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -5616,12 +5616,6 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) + set_debugreg(DR6_RESERVED, 6); + } + +-static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +-{ +- lockdep_assert_irqs_disabled(); +- set_debugreg(vcpu->arch.dr6, 6); +-} +- + static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) + { + vmcs_writel(GUEST_DR7, val); +@@ -7392,6 +7386,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + vcpu->arch.regs_dirty = 0; + ++ if (run_flags & KVM_RUN_LOAD_GUEST_DR6) ++ set_debugreg(vcpu->arch.dr6, 6); ++ + /* + * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately + * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time +@@ -8337,7 +8334,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { + .set_idt = vmx_set_idt, + .get_gdt = vmx_get_gdt, + .set_gdt = vmx_set_gdt, +- .set_dr6 = vmx_set_dr6, + .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, + .cache_reg = vmx_cache_reg, +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index ecc151397341..fbb2e70e3031 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10772,7 +10772,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + set_debugreg(vcpu->arch.eff_db[3], 3); + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. 
*/ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) +- static_call(kvm_x86_set_dr6)(vcpu, vcpu->arch.dr6); ++ run_flags |= KVM_RUN_LOAD_GUEST_DR6; + } else if (unlikely(hw_breakpoint_active())) { + set_debugreg(0, 7); + } +-- +2.50.1 + diff --git a/queue-6.6/kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch b/queue-6.6/kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch new file mode 100644 index 0000000000..53bf15f83e --- /dev/null +++ b/queue-6.6/kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch @@ -0,0 +1,265 @@ +From f141c80dbc877633ec0fb299da98a44a81d7c5aa Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:33 -0700 +Subject: KVM: x86: Fully defer to vendor code to decide how to force immediate + exit + +From: Sean Christopherson + +[ Upstream commit 0ec3d6d1f169baa7fc512ae4b78d17e7c94b7763 ] + +Now that vmx->req_immediate_exit is used only in the scope of +vmx_vcpu_run(), use force_immediate_exit to detect that KVM should usurp +the VMX preemption to force a VM-Exit and let vendor code fully handle +forcing a VM-Exit. + +Opportunsitically drop __kvm_request_immediate_exit() and just have +vendor code call smp_send_reschedule() directly. SVM already does this +when injecting an event while also trying to single-step an IRET, i.e. +it's not exactly secret knowledge that KVM uses a reschedule IPI to force +an exit. + +Link: https://lore.kernel.org/r/20240110012705.506918-7-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: resolve absurd conflict due to funky kvm_x86_ops.sched_in prototype] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm-x86-ops.h | 1 - + arch/x86/include/asm/kvm_host.h | 3 --- + arch/x86/kvm/svm/svm.c | 7 ++++--- + arch/x86/kvm/vmx/vmx.c | 32 +++++++++++++----------------- + arch/x86/kvm/vmx/vmx.h | 2 -- + arch/x86/kvm/x86.c | 10 +--------- + 6 files changed, 19 insertions(+), 36 deletions(-) + +diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h +index e59ded976166..8fe6667d945f 100644 +--- a/arch/x86/include/asm/kvm-x86-ops.h ++++ b/arch/x86/include/asm/kvm-x86-ops.h +@@ -102,7 +102,6 @@ KVM_X86_OP(write_tsc_multiplier) + KVM_X86_OP(get_exit_info) + KVM_X86_OP(check_intercept) + KVM_X86_OP(handle_exit_irqoff) +-KVM_X86_OP(request_immediate_exit) + KVM_X86_OP(sched_in) + KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging) + KVM_X86_OP_OPTIONAL(vcpu_blocking) +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 5703600a454e..8898ad8cb3de 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1695,8 +1695,6 @@ struct kvm_x86_ops { + struct x86_exception *exception); + void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); + +- void (*request_immediate_exit)(struct kvm_vcpu *vcpu); +- + void (*sched_in)(struct kvm_vcpu *kvm, int cpu); + + /* +@@ -2182,7 +2180,6 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); + + int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); + int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); +-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); + + void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, + u32 size); +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index f42c6ef7dc20..4a53b38ea386 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4222,9 +4222,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, + * is 
enough to force an immediate vmexit. + */ + disable_nmi_singlestep(svm); +- smp_send_reschedule(vcpu->cpu); ++ force_immediate_exit = true; + } + ++ if (force_immediate_exit) ++ smp_send_reschedule(vcpu->cpu); ++ + pre_svm_run(vcpu); + + sync_lapic_to_cr8(vcpu); +@@ -5075,8 +5078,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { + .check_intercept = svm_check_intercept, + .handle_exit_irqoff = svm_handle_exit_irqoff, + +- .request_immediate_exit = __kvm_request_immediate_exit, +- + .sched_in = svm_sched_in, + + .nested_ops = &svm_nested_ops, +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 0ecc0e996386..704e5a552b4f 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -49,6 +49,8 @@ + #include + #include + ++#include ++ + #include "capabilities.h" + #include "cpuid.h" + #include "hyperv.h" +@@ -1304,8 +1306,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) + u16 fs_sel, gs_sel; + int i; + +- vmx->req_immediate_exit = false; +- + /* + * Note that guest MSRs to be saved/restored can also be changed + * when guest state is loaded. This happens when guest transitions +@@ -6015,7 +6015,8 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) + return 1; + } + +-static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) ++static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, ++ bool force_immediate_exit) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); + +@@ -6031,7 +6032,7 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) + * If the timer expired because KVM used it to force an immediate exit, + * then mission accomplished. + */ +- if (vmx->req_immediate_exit) ++ if (force_immediate_exit) + return EXIT_FASTPATH_EXIT_HANDLED; + + /* +@@ -7210,13 +7211,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) + msrs[i].host, false); + } + +-static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) ++static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 tscl; + u32 delta_tsc; + +- if (vmx->req_immediate_exit) { ++ if (force_immediate_exit) { + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); + vmx->loaded_vmcs->hv_timer_soft_disabled = false; + } else if (vmx->hv_deadline_tsc != -1) { +@@ -7269,7 +7270,8 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, + barrier_nospec(); + } + +-static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) ++static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, ++ bool force_immediate_exit) + { + /* + * If L2 is active, some VMX preemption timer exits can be handled in +@@ -7283,7 +7285,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) + case EXIT_REASON_MSR_WRITE: + return handle_fastpath_set_msr_irqoff(vcpu); + case EXIT_REASON_PREEMPTION_TIMER: +- return handle_fastpath_preemption_timer(vcpu); ++ return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); + default: + return EXIT_FASTPATH_NONE; + } +@@ -7425,7 +7427,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) + vmx_passthrough_lbr_msrs(vcpu); + + if (enable_preemption_timer) +- vmx_update_hv_timer(vcpu); ++ vmx_update_hv_timer(vcpu, force_immediate_exit); ++ else if (force_immediate_exit) ++ smp_send_reschedule(vcpu->cpu); + + kvm_wait_lapic_expire(vcpu); + +@@ -7489,7 +7493,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) + vmx_recover_nmi_blocking(vmx); + 
vmx_complete_interrupts(vmx); + +- return vmx_exit_handlers_fastpath(vcpu); ++ return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit); + } + + static void vmx_vcpu_free(struct kvm_vcpu *vcpu) +@@ -7988,11 +7992,6 @@ static __init void vmx_set_cpu_caps(void) + kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); + } + +-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) +-{ +- to_vmx(vcpu)->req_immediate_exit = true; +-} +- + static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info) + { +@@ -8404,8 +8403,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { + .check_intercept = vmx_check_intercept, + .handle_exit_irqoff = vmx_handle_exit_irqoff, + +- .request_immediate_exit = vmx_request_immediate_exit, +- + .sched_in = vmx_sched_in, + + .cpu_dirty_log_size = PML_ENTITY_NUM, +@@ -8663,7 +8660,6 @@ static __init int hardware_setup(void) + if (!enable_preemption_timer) { + vmx_x86_ops.set_hv_timer = NULL; + vmx_x86_ops.cancel_hv_timer = NULL; +- vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; + } + + kvm_caps.supported_mce_cap |= MCG_LMCE_P; +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index fb36bde2dd87..50d32d830890 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -331,8 +331,6 @@ struct vcpu_vmx { + unsigned int ple_window; + bool ple_window_dirty; + +- bool req_immediate_exit; +- + /* Support for PML */ + #define PML_ENTITY_NUM 512 + struct page *pml_pg; +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index d04066099567..f3150d9a1918 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10505,12 +10505,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) + static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu); + } + +-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) +-{ +- smp_send_reschedule(vcpu->cpu); +-} +-EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); +- + /* + * Called within kvm->srcu read side. + * Returns 1 to let vcpu_run() continue the guest execution loop without +@@ -10756,10 +10750,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + goto cancel_injection; + } + +- if (req_immediate_exit) { ++ if (req_immediate_exit) + kvm_make_request(KVM_REQ_EVENT, vcpu); +- static_call(kvm_x86_request_immediate_exit)(vcpu); +- } + + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) +-- +2.50.1 + diff --git a/queue-6.6/kvm-x86-hyper-v-skip-non-canonical-addresses-during-.patch b/queue-6.6/kvm-x86-hyper-v-skip-non-canonical-addresses-during-.patch new file mode 100644 index 0000000000..853ce19a8c --- /dev/null +++ b/queue-6.6/kvm-x86-hyper-v-skip-non-canonical-addresses-during-.patch @@ -0,0 +1,75 @@ +From 21d37a330aba310a5c2dc24ee8eea174acdfb829 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:21 -0700 +Subject: KVM: x86/hyper-v: Skip non-canonical addresses during PV TLB flush + +From: Manuel Andreas + +[ Upstream commit fa787ac07b3ceb56dd88a62d1866038498e96230 ] + +In KVM guests with Hyper-V hypercalls enabled, the hypercalls +HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST and HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX +allow a guest to request invalidation of portions of a virtual TLB. +For this, the hypercall parameter includes a list of GVAs that are supposed +to be invalidated. + +However, when non-canonical GVAs are passed, there is currently no +filtering in place and they are eventually passed to checked invocations of +INVVPID on Intel / INVLPGA on AMD. 
While AMD's INVLPGA silently ignores +non-canonical addresses (effectively a no-op), Intel's INVVPID explicitly +signals VM-Fail and ultimately triggers the WARN_ONCE in invvpid_error(): + + invvpid failed: ext=0x0 vpid=1 gva=0xaaaaaaaaaaaaa000 + WARNING: CPU: 6 PID: 326 at arch/x86/kvm/vmx/vmx.c:482 + invvpid_error+0x91/0xa0 [kvm_intel] + Modules linked in: kvm_intel kvm 9pnet_virtio irqbypass fuse + CPU: 6 UID: 0 PID: 326 Comm: kvm-vm Not tainted 6.15.0 #14 PREEMPT(voluntary) + RIP: 0010:invvpid_error+0x91/0xa0 [kvm_intel] + Call Trace: + vmx_flush_tlb_gva+0x320/0x490 [kvm_intel] + kvm_hv_vcpu_flush_tlb+0x24f/0x4f0 [kvm] + kvm_arch_vcpu_ioctl_run+0x3013/0x5810 [kvm] + +Hyper-V documents that invalid GVAs (those that are beyond a partition's +GVA space) are to be ignored. While not completely clear whether this +ruling also applies to non-canonical GVAs, it is likely fine to make that +assumption, and manual testing on Azure confirms "real" Hyper-V interprets +the specification in the same way. + +Skip non-canonical GVAs when processing the list of address to avoid +tripping the INVVPID failure. Alternatively, KVM could filter out "bad" +GVAs before inserting into the FIFO, but practically speaking the only +downside of pushing validation to the final processing is that doing so +is suboptimal for the guest, and no well-behaved guest will request TLB +flushes for non-canonical addresses. + +Fixes: 260970862c88 ("KVM: x86: hyper-v: Handle HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST{,EX} calls gently") +Cc: stable@vger.kernel.org +Signed-off-by: Manuel Andreas +Suggested-by: Vitaly Kuznetsov +Link: https://lore.kernel.org/r/c090efb3-ef82-499f-a5e0-360fc8420fb7@tum.de +Signed-off-by: Sean Christopherson +[sean: use plain is_noncanonical_address()] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/hyperv.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c +index bd3fbd5be5da..223f4fa6a849 100644 +--- a/arch/x86/kvm/hyperv.c ++++ b/arch/x86/kvm/hyperv.c +@@ -1929,6 +1929,9 @@ int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) + if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY) + goto out_flush_all; + ++ if (is_noncanonical_address(entries[i], vcpu)) ++ continue; ++ + /* + * Lower 12 bits of 'address' encode the number of additional + * pages to flush. +-- +2.50.1 + diff --git a/queue-6.6/kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch b/queue-6.6/kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch new file mode 100644 index 0000000000..3594804c57 --- /dev/null +++ b/queue-6.6/kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch @@ -0,0 +1,80 @@ +From 03a5de01f1a2bd29e4c24999991e84839d3b1fa1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:31 -0700 +Subject: KVM: x86: Move handling of is_guest_mode() into fastpath exit + handlers + +From: Sean Christopherson + +[ Upstream commit bf1a49436ea37b98dd2f37c57608951d0e28eecc ] + +Let the fastpath code decide which exits can/can't be handled in the +fastpath when L2 is active, e.g. when KVM generates a VMX preemption +timer exit to forcefully regain control, there is no "work" to be done and +so such exits can be handled in the fastpath regardless of whether L1 or +L2 is active. + +Moving the is_guest_mode() check into the fastpath code also makes it +easier to see that L2 isn't allowed to use the fastpath in most cases, +e.g. 
it's not immediately obvious why handle_fastpath_preemption_timer() +is called from the fastpath and the normal path. + +Link: https://lore.kernel.org/r/20240110012705.506918-5-seanjc@google.com +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/svm/svm.c | 6 +++--- + arch/x86/kvm/vmx/vmx.c | 6 +++--- + 2 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index 5a230be224d1..f42c6ef7dc20 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4157,6 +4157,9 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) + + static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) + { ++ if (is_guest_mode(vcpu)) ++ return EXIT_FASTPATH_NONE; ++ + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + to_svm(vcpu)->vmcb->control.exit_info_1) + return handle_fastpath_set_msr_irqoff(vcpu); +@@ -4315,9 +4318,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, + + svm_complete_interrupts(vcpu); + +- if (is_guest_mode(vcpu)) +- return EXIT_FASTPATH_NONE; +- + return svm_exit_handlers_fastpath(vcpu); + } + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 631fdd4a575a..4c991d514015 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -7258,6 +7258,9 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, + + static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) + { ++ if (is_guest_mode(vcpu)) ++ return EXIT_FASTPATH_NONE; ++ + switch (to_vmx(vcpu)->exit_reason.basic) { + case EXIT_REASON_MSR_WRITE: + return handle_fastpath_set_msr_irqoff(vcpu); +@@ -7468,9 +7471,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) + vmx_recover_nmi_blocking(vmx); + vmx_complete_interrupts(vmx); + +- if (is_guest_mode(vcpu)) +- return EXIT_FASTPATH_NONE; +- + return vmx_exit_handlers_fastpath(vcpu); + } + +-- +2.50.1 + diff --git a/queue-6.6/kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch b/queue-6.6/kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch new file mode 100644 index 0000000000..f7f8e8663f --- /dev/null +++ b/queue-6.6/kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch @@ -0,0 +1,130 @@ +From c1681ed16b27201a312047cb6a8038088fdc0608 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:28 -0700 +Subject: KVM: x86: Plumb "force_immediate_exit" into kvm_entry() tracepoint + +From: Sean Christopherson + +[ Upstream commit 9c9025ea003a03f967affd690f39b4ef3452c0f5 ] + +Annotate the kvm_entry() tracepoint with "immediate exit" when KVM is +forcing a VM-Exit immediately after VM-Enter, e.g. when KVM wants to +inject an event but needs to first complete some other operation. +Knowing that KVM is (or isn't) forcing an exit is useful information when +debugging issues related to event injection. 
+ +Suggested-by: Maxim Levitsky +Link: https://lore.kernel.org/r/20240110012705.506918-2-seanjc@google.com +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm_host.h | 3 ++- + arch/x86/kvm/svm/svm.c | 5 +++-- + arch/x86/kvm/trace.h | 9 ++++++--- + arch/x86/kvm/vmx/vmx.c | 4 ++-- + arch/x86/kvm/x86.c | 2 +- + 5 files changed, 14 insertions(+), 9 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index b5210505abfa..5703600a454e 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1624,7 +1624,8 @@ struct kvm_x86_ops { + void (*flush_tlb_guest)(struct kvm_vcpu *vcpu); + + int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); +- enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu); ++ enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu, ++ bool force_immediate_exit); + int (*handle_exit)(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath); + int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index abbb84ddfe02..5a230be224d1 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4194,12 +4194,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in + guest_state_exit_irqoff(); + } + +-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) ++static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, ++ bool force_immediate_exit) + { + struct vcpu_svm *svm = to_svm(vcpu); + bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); + +- trace_kvm_entry(vcpu); ++ trace_kvm_entry(vcpu, force_immediate_exit); + + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; +diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h +index b82e6ed4f024..c6b4b1728006 100644 +--- a/arch/x86/kvm/trace.h ++++ b/arch/x86/kvm/trace.h +@@ -15,20 +15,23 @@ + * Tracepoint for guest mode entry. + */ + TRACE_EVENT(kvm_entry, +- TP_PROTO(struct kvm_vcpu *vcpu), +- TP_ARGS(vcpu), ++ TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit), ++ TP_ARGS(vcpu, force_immediate_exit), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( unsigned long, rip ) ++ __field( bool, immediate_exit ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->rip = kvm_rip_read(vcpu); ++ __entry->immediate_exit = force_immediate_exit; + ), + +- TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip) ++ TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip, ++ __entry->immediate_exit ? 
"[immediate exit]" : "") + ); + + /* +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 9ba4baf2a9e9..ee501871ddb0 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -7312,7 +7312,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, + guest_state_exit_irqoff(); + } + +-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) ++static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long cr3, cr4; +@@ -7339,7 +7339,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) + return EXIT_FASTPATH_NONE; + } + +- trace_kvm_entry(vcpu); ++ trace_kvm_entry(vcpu, force_immediate_exit); + + if (vmx->ple_window_dirty) { + vmx->ple_window_dirty = false; +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 9944b32b0b30..d04066099567 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10795,7 +10795,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); + +- exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); ++ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit); + if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) + break; + +-- +2.50.1 + diff --git a/queue-6.6/kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch b/queue-6.6/kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch new file mode 100644 index 0000000000..42f3463dcc --- /dev/null +++ b/queue-6.6/kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch @@ -0,0 +1,104 @@ +From 8fd23c953af487158937416f6ea3a2e16c6c7503 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:23 -0700 +Subject: KVM: x86: Plumb in the vCPU to kvm_x86_ops.hwapic_isr_update() + +From: Sean Christopherson + +[ Upstream commit 76bce9f10162cd4b36ac0b7889649b22baf70ebd ] + +Pass the target vCPU to the hwapic_isr_update() vendor hook so that VMX +can defer the update until after nested VM-Exit if an EOI for L1's vAPIC +occurs while L2 is active. + +Note, commit d39850f57d21 ("KVM: x86: Drop @vcpu parameter from +kvm_x86_ops.hwapic_isr_update()") removed the parameter with the +justification that doing so "allows for a decent amount of (future) +cleanup in the APIC code", but it's not at all clear what cleanup was +intended, or if it was ever realized. + +No functional change intended. 
+ +Cc: stable@vger.kernel.org +Reviewed-by: Chao Gao +Tested-by: Chao Gao +Link: https://lore.kernel.org/r/20241128000010.4051275-2-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: account for lack of kvm_x86_call(), drop vmx/x86_ops.h change] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm_host.h | 2 +- + arch/x86/kvm/lapic.c | 8 ++++---- + arch/x86/kvm/vmx/vmx.c | 2 +- + 3 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 5dfb8cc9616e..5fc89d255550 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1657,7 +1657,7 @@ struct kvm_x86_ops { + bool allow_apicv_in_x2apic_without_x2apic_virtualization; + void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); + void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); +- void (*hwapic_isr_update)(int isr); ++ void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); + bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); + void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); + void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); +diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c +index 66c7f2367bb3..cbf85a1ffb74 100644 +--- a/arch/x86/kvm/lapic.c ++++ b/arch/x86/kvm/lapic.c +@@ -750,7 +750,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) + * just set SVI. + */ + if (unlikely(apic->apicv_active)) +- static_call_cond(kvm_x86_hwapic_isr_update)(vec); ++ static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, vec); + else { + ++apic->isr_count; + BUG_ON(apic->isr_count > MAX_APIC_VECTOR); +@@ -795,7 +795,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) + * and must be left alone. 
+ */ + if (unlikely(apic->apicv_active)) +- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); ++ static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic)); + else { + --apic->isr_count; + BUG_ON(apic->isr_count < 0); +@@ -2772,7 +2772,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) + if (apic->apicv_active) { + static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); + static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1); +- static_call_cond(kvm_x86_hwapic_isr_update)(-1); ++ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1); + } + + vcpu->arch.apic_arb_prio = 0; +@@ -3072,7 +3072,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) + if (apic->apicv_active) { + static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); + static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); +- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); ++ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); + } + kvm_make_request(KVM_REQ_EVENT, vcpu); + if (ioapic_in_kernel(vcpu->kvm)) +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index e53620e18925..cde01eb1f5e3 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -6834,7 +6834,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) + kvm_release_pfn_clean(pfn); + } + +-static void vmx_hwapic_isr_update(int max_isr) ++static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) + { + u16 status; + u8 old; +-- +2.50.1 + diff --git a/queue-6.6/kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch b/queue-6.6/kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch new file mode 100644 index 0000000000..b933ff3c31 --- /dev/null +++ b/queue-6.6/kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch @@ -0,0 +1,48 @@ +From 5bfa7e5a50ee261faf4f40ec4bae020fe4f2e08b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:27 -0700 +Subject: KVM: x86: Snapshot the host's DEBUGCTL after disabling IRQs + +From: Sean Christopherson + +[ Upstream commit 189ecdb3e112da703ac0699f4ec76aa78122f911 ] + +Snapshot the host's DEBUGCTL after disabling IRQs, as perf can toggle +debugctl bits from IRQ context, e.g. when enabling/disabling events via +smp_call_function_single(). Taking the snapshot (long) before IRQs are +disabled could result in KVM effectively clobbering DEBUGCTL due to using +a stale snapshot. 
+ +Cc: stable@vger.kernel.org +Reviewed-and-tested-by: Ravi Bangoria +Link: https://lore.kernel.org/r/20250227222411.3490595-6-seanjc@google.com +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/x86.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 7aff0fe469c3..9944b32b0b30 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -4823,7 +4823,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + + /* Save host pkru register if supported */ + vcpu->arch.host_pkru = read_pkru(); +- vcpu->arch.host_debugctl = get_debugctlmsr(); + + /* Apply any externally detected TSC adjustments (due to suspend) */ + if (unlikely(vcpu->arch.tsc_offset_adjustment)) { +@@ -10782,6 +10781,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + set_debugreg(0, 7); + } + ++ vcpu->arch.host_debugctl = get_debugctlmsr(); ++ + guest_timing_enter_irqoff(); + + for (;;) { +-- +2.50.1 + diff --git a/queue-6.6/kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch b/queue-6.6/kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch new file mode 100644 index 0000000000..fdac54d1f7 --- /dev/null +++ b/queue-6.6/kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch @@ -0,0 +1,100 @@ +From c711192f36c4c41ec0716b1b0a20448a9cc2194f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Aug 2025 17:25:26 -0700 +Subject: KVM: x86: Snapshot the host's DEBUGCTL in common x86 + +From: Sean Christopherson + +[ Upstream commit fb71c795935652fa20eaf9517ca9547f5af99a76 ] + +Move KVM's snapshot of DEBUGCTL to kvm_vcpu_arch and take the snapshot in +common x86, so that SVM can also use the snapshot. + +Opportunistically change the field to a u64. While bits 63:32 are reserved +on AMD, not mentioned at all in Intel's SDM, and managed as an "unsigned +long" by the kernel, DEBUGCTL is an MSR and therefore a 64-bit value. + +Reviewed-by: Xiaoyao Li +Cc: stable@vger.kernel.org +Reviewed-and-tested-by: Ravi Bangoria +Link: https://lore.kernel.org/r/20250227222411.3490595-4-seanjc@google.com +Signed-off-by: Sean Christopherson +[sean: resolve minor syntatic conflict in vmx_vcpu_load()] +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm_host.h | 1 + + arch/x86/kvm/vmx/vmx.c | 8 ++------ + arch/x86/kvm/vmx/vmx.h | 2 -- + arch/x86/kvm/x86.c | 1 + + 4 files changed, 4 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 5fc89d255550..b5210505abfa 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -733,6 +733,7 @@ struct kvm_vcpu_arch { + u32 pkru; + u32 hflags; + u64 efer; ++ u64 host_debugctl; + u64 apic_base; + struct kvm_lapic *apic; /* kernel irqchip context */ + bool load_eoi_exitmap_pending; +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 4563e7a9a851..9ba4baf2a9e9 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -1499,13 +1499,9 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, + */ + static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + { +- struct vcpu_vmx *vmx = to_vmx(vcpu); +- + vmx_vcpu_load_vmcs(vcpu, cpu, NULL); + + vmx_vcpu_pi_load(vcpu, cpu); +- +- vmx->host_debugctlmsr = get_debugctlmsr(); + } + + static void vmx_vcpu_put(struct kvm_vcpu *vcpu) +@@ -7414,8 +7410,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) + } + + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. 
Restore it if needed */ +- if (vmx->host_debugctlmsr) +- update_debugctlmsr(vmx->host_debugctlmsr); ++ if (vcpu->arch.host_debugctl) ++ update_debugctlmsr(vcpu->arch.host_debugctl); + + #ifndef CONFIG_X86_64 + /* +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index 88c5b7ebf9d3..fb36bde2dd87 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -340,8 +340,6 @@ struct vcpu_vmx { + /* apic deadline value in host tsc */ + u64 hv_deadline_tsc; + +- unsigned long host_debugctlmsr; +- + /* + * Only bits masked by msr_ia32_feature_control_valid_bits can be set in + * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 55185670e0e5..7aff0fe469c3 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -4823,6 +4823,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + + /* Save host pkru register if supported */ + vcpu->arch.host_pkru = read_pkru(); ++ vcpu->arch.host_debugctl = get_debugctlmsr(); + + /* Apply any externally detected TSC adjustments (due to suspend) */ + if (unlikely(vcpu->arch.tsc_offset_adjustment)) { +-- +2.50.1 + diff --git a/queue-6.6/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch b/queue-6.6/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch new file mode 100644 index 0000000000..a63a43713b --- /dev/null +++ b/queue-6.6/net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch @@ -0,0 +1,44 @@ +From 420eaab27f40ffc253c0d624df6b87a47a58e99c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 5 Aug 2025 07:23:18 -0700 +Subject: net: ti: icss-iep: Fix incorrect type for return value in + extts_enable() + +From: Alok Tiwari + +[ Upstream commit 5f1d1d14db7dabce9c815e7d7cd351f8d58b8585 ] + +The variable ret in icss_iep_extts_enable() was incorrectly declared +as u32, while the function returns int and may return negative error +codes. This will cause sign extension issues and incorrect error +propagation. Update ret to be int to fix error handling. + +This change corrects the declaration to avoid potential type mismatch. 
+ +Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver") +Signed-off-by: Alok Tiwari +Reviewed-by: Andrew Lunn +Link: https://patch.msgid.link/20250805142323.1949406-1-alok.a.tiwari@oracle.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/ti/icssg/icss_iep.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c +index 8ed72c8b210f..e7306ed52922 100644 +--- a/drivers/net/ethernet/ti/icssg/icss_iep.c ++++ b/drivers/net/ethernet/ti/icssg/icss_iep.c +@@ -638,7 +638,8 @@ static int icss_iep_pps_enable(struct icss_iep *iep, int on) + + static int icss_iep_extts_enable(struct icss_iep *iep, u32 index, int on) + { +- u32 val, cap, ret = 0; ++ u32 val, cap; ++ int ret = 0; + + mutex_lock(&iep->ptp_clk_mutex); + +-- +2.50.1 + diff --git a/queue-6.6/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch b/queue-6.6/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch new file mode 100644 index 0000000000..32449be97e --- /dev/null +++ b/queue-6.6/netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch @@ -0,0 +1,129 @@ +From a6dce037ccb6be527d9dd4d896d9612980da17e6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 1 Aug 2025 17:25:08 +0200 +Subject: netfilter: ctnetlink: fix refcount leak on table dump + +From: Florian Westphal + +[ Upstream commit de788b2e6227462b6dcd0e07474e72c089008f74 ] + +There is a reference count leak in ctnetlink_dump_table(): + if (res < 0) { + nf_conntrack_get(&ct->ct_general); // HERE + cb->args[1] = (unsigned long)ct; + ... + +While its very unlikely, its possible that ct == last. +If this happens, then the refcount of ct was already incremented. +This 2nd increment is never undone. + +This prevents the conntrack object from being released, which in turn +keeps prevents cnet->count from dropping back to 0. + +This will then block the netns dismantle (or conntrack rmmod) as +nf_conntrack_cleanup_net_list() will wait forever. + +This can be reproduced by running conntrack_resize.sh selftest in a loop. +It takes ~20 minutes for me on a preemptible kernel on average before +I see a runaway kworker spinning in nf_conntrack_cleanup_net_list. + +One fix would to change this to: + if (res < 0) { + if (ct != last) + nf_conntrack_get(&ct->ct_general); + +But this reference counting isn't needed in the first place. +We can just store a cookie value instead. + +A followup patch will do the same for ctnetlink_exp_dump_table, +it looks to me as if this has the same problem and like +ctnetlink_dump_table, we only need a 'skip hint', not the actual +object so we can apply the same cookie strategy there as well. 
+ +Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c +index 282e9644f6fd..928bd2013289 100644 +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -859,8 +859,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) + + static int ctnetlink_done(struct netlink_callback *cb) + { +- if (cb->args[1]) +- nf_ct_put((struct nf_conn *)cb->args[1]); + kfree(cb->data); + return 0; + } +@@ -1175,19 +1173,26 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) + return 0; + } + ++static unsigned long ctnetlink_get_id(const struct nf_conn *ct) ++{ ++ unsigned long id = nf_ct_get_id(ct); ++ ++ return id ? id : 1; ++} ++ + static int + ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + { + unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0; + struct net *net = sock_net(skb->sk); +- struct nf_conn *ct, *last; ++ unsigned long last_id = cb->args[1]; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nf_conn *nf_ct_evict[8]; ++ struct nf_conn *ct; + int res, i; + spinlock_t *lockp; + +- last = (struct nf_conn *)cb->args[1]; + i = 0; + + local_bh_disable(); +@@ -1224,7 +1229,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + continue; + + if (cb->args[1]) { +- if (ct != last) ++ if (ctnetlink_get_id(ct) != last_id) + continue; + cb->args[1] = 0; + } +@@ -1237,8 +1242,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct, true, flags); + if (res < 0) { +- nf_conntrack_get(&ct->ct_general); +- cb->args[1] = (unsigned long)ct; ++ cb->args[1] = ctnetlink_get_id(ct); + spin_unlock(lockp); + goto out; + } +@@ -1251,12 +1255,10 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + } + out: + local_bh_enable(); +- if (last) { ++ if (last_id) { + /* nf ct hash resize happened, now clear the leftover. */ +- if ((struct nf_conn *)cb->args[1] == last) ++ if (cb->args[1] == last_id) + cb->args[1] = 0; +- +- nf_ct_put(last); + } + + while (i) { +-- +2.50.1 + diff --git a/queue-6.6/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch b/queue-6.6/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch new file mode 100644 index 0000000000..3d616638f5 --- /dev/null +++ b/queue-6.6/ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch @@ -0,0 +1,103 @@ +From 5bca481007ef80b38edd17f64f35c01f02fae3f0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 28 Jul 2025 15:26:49 +0900 +Subject: ptp: prevent possible ABBA deadlock in ptp_clock_freerun() + +From: Jeongjun Park + +[ Upstream commit 2efe41234dbd0a83fdb7cd38226c2f70039a2cd3 ] + +syzbot reported the following ABBA deadlock: + + CPU0 CPU1 + ---- ---- + n_vclocks_store() + lock(&ptp->n_vclocks_mux) [1] + (physical clock) + pc_clock_adjtime() + lock(&clk->rwsem) [2] + (physical clock) + ... 
+ ptp_clock_freerun() + ptp_vclock_in_use() + lock(&ptp->n_vclocks_mux) [3] + (physical clock) + ptp_clock_unregister() + posix_clock_unregister() + lock(&clk->rwsem) [4] + (virtual clock) + +Since ptp virtual clock is registered only under ptp physical clock, both +ptp_clock and posix_clock must be physical clocks for ptp_vclock_in_use() +to lock &ptp->n_vclocks_mux and check ptp->n_vclocks. + +However, when unregistering vclocks in n_vclocks_store(), the locking +ptp->n_vclocks_mux is a physical clock lock, but clk->rwsem of +ptp_clock_unregister() called through device_for_each_child_reverse() +is a virtual clock lock. + +Therefore, clk->rwsem used in CPU0 and clk->rwsem used in CPU1 are +different locks, but in lockdep, a false positive occurs because the +possibility of deadlock is determined through lock-class. + +To solve this, lock subclass annotation must be added to the posix_clock +rwsem of the vclock. + +Reported-by: syzbot+7cfb66a237c4a5fb22ad@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=7cfb66a237c4a5fb22ad +Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion") +Signed-off-by: Jeongjun Park +Acked-by: Richard Cochran +Reviewed-by: Vladimir Oltean +Link: https://patch.msgid.link/20250728062649.469882-1-aha310510@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/ptp/ptp_private.h | 5 +++++ + drivers/ptp/ptp_vclock.c | 7 +++++++ + 2 files changed, 12 insertions(+) + +diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h +index a54124269c2f..3fbd1d68a9bc 100644 +--- a/drivers/ptp/ptp_private.h ++++ b/drivers/ptp/ptp_private.h +@@ -20,6 +20,11 @@ + #define PTP_BUF_TIMESTAMPS 30 + #define PTP_DEFAULT_MAX_VCLOCKS 20 + ++enum { ++ PTP_LOCK_PHYSICAL = 0, ++ PTP_LOCK_VIRTUAL, ++}; ++ + struct timestamp_event_queue { + struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS]; + int head; +diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c +index dcf752c9e045..7d08ff3b30fc 100644 +--- a/drivers/ptp/ptp_vclock.c ++++ b/drivers/ptp/ptp_vclock.c +@@ -154,6 +154,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp) + return PTP_VCLOCK_REFRESH_INTERVAL; + } + ++static void ptp_vclock_set_subclass(struct ptp_clock *ptp) ++{ ++ lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL); ++} ++ + static const struct ptp_clock_info ptp_vclock_info = { + .owner = THIS_MODULE, + .name = "ptp virtual clock", +@@ -213,6 +218,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock) + return NULL; + } + ++ ptp_vclock_set_subclass(vclock->clock); ++ + timecounter_init(&vclock->tc, &vclock->cc, 0); + ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL); + +-- +2.50.1 + diff --git a/queue-6.6/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch b/queue-6.6/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch new file mode 100644 index 0000000000..bae5dd4671 --- /dev/null +++ b/queue-6.6/sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch @@ -0,0 +1,73 @@ +From 61292c12f981631257858fa1a7e22814646ed11e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Aug 2025 15:40:11 -0400 +Subject: sctp: linearize cloned gso packets in sctp_rcv + +From: Xin Long + +[ Upstream commit fd60d8a086191fe33c2d719732d2482052fa6805 ] + +A cloned head skb still shares these frag skbs in fraglist with the +original head skb. It's not safe to access these frag skbs. 
+ +syzbot reported two use-of-uninitialized-memory bugs caused by this: + + BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 + sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 + sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998 + sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88 + sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331 + sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122 + __release_sock+0x1da/0x330 net/core/sock.c:3106 + release_sock+0x6b/0x250 net/core/sock.c:3660 + sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360 + sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885 + sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031 + inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851 + sock_sendmsg_nosec net/socket.c:718 [inline] + +and + + BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 + sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 + sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88 + sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331 + sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148 + __release_sock+0x1d3/0x330 net/core/sock.c:3213 + release_sock+0x6b/0x270 net/core/sock.c:3767 + sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367 + sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886 + sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032 + inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851 + sock_sendmsg_nosec net/socket.c:712 [inline] + +This patch fixes it by linearizing cloned gso packets in sctp_rcv(). + +Fixes: 90017accff61 ("sctp: Add GSO support") +Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com +Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com +Signed-off-by: Xin Long +Reviewed-by: Marcelo Ricardo Leitner +Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/sctp/input.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/sctp/input.c b/net/sctp/input.c +index a8a254a5008e..032a10d82302 100644 +--- a/net/sctp/input.c ++++ b/net/sctp/input.c +@@ -117,7 +117,7 @@ int sctp_rcv(struct sk_buff *skb) + * it's better to just linearize it otherwise crc computing + * takes longer. 
+ */ +- if ((!is_gso && skb_linearize(skb)) || ++ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) || + !pskb_may_pull(skb, sizeof(struct sctphdr))) + goto discard_it; + +-- +2.50.1 + diff --git a/queue-6.6/series b/queue-6.6/series index e397222673..f481a6f7f3 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -30,3 +30,30 @@ eventpoll-fix-semi-unbounded-recursion.patch documentation-acpi-fix-parent-device-references.patch acpi-processor-perflib-fix-initial-_ppc-limit-application.patch acpi-processor-perflib-move-problematic-pr-performance-check.patch +kvm-x86-hyper-v-skip-non-canonical-addresses-during-.patch +kvm-svm-set-rflags.if-1-in-c-code-to-get-vmrun-out-o.patch +kvm-x86-plumb-in-the-vcpu-to-kvm_x86_ops.hwapic_isr_.patch +kvm-nvmx-defer-svi-update-to-vmcs01-on-eoi-when-l2-i.patch +kvm-x86-snapshot-the-host-s-debugctl-in-common-x86.patch +kvm-x86-snapshot-the-host-s-debugctl-after-disabling.patch +kvm-x86-plumb-force_immediate_exit-into-kvm_entry-tr.patch +kvm-vmx-re-enter-guest-in-fastpath-for-spurious-pree.patch +kvm-vmx-handle-forced-exit-due-to-preemption-timer-i.patch +kvm-x86-move-handling-of-is_guest_mode-into-fastpath.patch +kvm-vmx-handle-kvm-induced-preemption-timer-exits-in.patch +kvm-x86-fully-defer-to-vendor-code-to-decide-how-to-.patch +kvm-x86-convert-vcpu_run-s-immediate-exit-param-into.patch +kvm-x86-drop-kvm_x86_ops.set_dr6-in-favor-of-a-new-k.patch +kvm-vmx-allow-guest-to-set-debugctl.rtm_debug-if-rtm.patch +kvm-vmx-extract-checking-of-guest-s-debugctl-into-he.patch +kvm-nvmx-check-vmcs12-guest_ia32_debugctl-on-nested-.patch +kvm-vmx-wrap-all-accesses-to-ia32_debugctl-with-gett.patch +kvm-vmx-preserve-host-s-debugctlmsr_freeze_in_smm-wh.patch +udp-also-consider-secpath-when-evaluating-ipsec-use-.patch +netfilter-ctnetlink-fix-refcount-leak-on-table-dump.patch +net-ti-icss-iep-fix-incorrect-type-for-return-value-.patch +sctp-linearize-cloned-gso-packets-in-sctp_rcv.patch +intel_idle-allow-loading-acpi-tables-for-any-family.patch +cpuidle-governors-menu-avoid-using-invalid-recent-in.patch +ptp-prevent-possible-abba-deadlock-in-ptp_clock_free.patch +tls-handle-data-disappearing-from-under-the-tls-ulp.patch diff --git a/queue-6.6/tls-handle-data-disappearing-from-under-the-tls-ulp.patch b/queue-6.6/tls-handle-data-disappearing-from-under-the-tls-ulp.patch new file mode 100644 index 0000000000..fffc41bc35 --- /dev/null +++ b/queue-6.6/tls-handle-data-disappearing-from-under-the-tls-ulp.patch @@ -0,0 +1,106 @@ +From 7b3746b0fb7bce25102c2ab1f5d3c2d406a17e0a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Aug 2025 16:29:06 -0700 +Subject: tls: handle data disappearing from under the TLS ULP + +From: Jakub Kicinski + +[ Upstream commit 6db015fc4b5d5f63a64a193f65d98da3a7fc811d ] + +TLS expects that it owns the receive queue of the TCP socket. +This cannot be guaranteed in case the reader of the TCP socket +entered before the TLS ULP was installed, or uses some non-standard +read API (eg. zerocopy ones). Replace the WARN_ON() and a buggy +early exit (which leaves anchor pointing to a freed skb) with real +error handling. Wipe the parsing state and tell the reader to retry. + +We already reload the anchor every time we (re)acquire the socket lock, +so the only condition we need to avoid is an out of bounds read +(not having enough bytes in the socket for previously parsed record len). + +If some data was read from under TLS but there's enough in the queue +we'll reload and decrypt what is most likely not a valid TLS record. 
+Leading to some undefined behavior from TLS perspective (corrupting +a stream? missing an alert? missing an attack?) but no kernel crash +should take place. + +Reported-by: William Liu +Reported-by: Savino Dicanosa +Link: https://lore.kernel.org/tFjq_kf7sWIG3A7CrCg_egb8CVsT_gsmHAK0_wxDPJXfIzxFAMxqmLwp3MlU5EHiet0AwwJldaaFdgyHpeIUCS-3m3llsmRzp9xIOBR4lAI=@syst3mfailure.io +Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") +Reviewed-by: Eric Dumazet +Link: https://patch.msgid.link/20250807232907.600366-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/tls/tls.h | 2 +- + net/tls/tls_strp.c | 11 ++++++++--- + net/tls/tls_sw.c | 3 ++- + 3 files changed, 11 insertions(+), 5 deletions(-) + +diff --git a/net/tls/tls.h b/net/tls/tls.h +index 02038d0381b7..5dc61c85c076 100644 +--- a/net/tls/tls.h ++++ b/net/tls/tls.h +@@ -192,7 +192,7 @@ void tls_strp_msg_done(struct tls_strparser *strp); + int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb); + void tls_rx_msg_ready(struct tls_strparser *strp); + +-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh); ++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh); + int tls_strp_msg_cow(struct tls_sw_context_rx *ctx); + struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx); + int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst); +diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c +index bea60b0160d1..6ce64a6e4495 100644 +--- a/net/tls/tls_strp.c ++++ b/net/tls/tls_strp.c +@@ -474,7 +474,7 @@ static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len) + strp->stm.offset = offset; + } + +-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) ++bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) + { + struct strp_msg *rxm; + struct tls_msg *tlm; +@@ -483,8 +483,11 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) + DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len); + + if (!strp->copy_mode && force_refresh) { +- if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len)) +- return; ++ if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) { ++ WRITE_ONCE(strp->msg_ready, 0); ++ memset(&strp->stm, 0, sizeof(strp->stm)); ++ return false; ++ } + + tls_strp_load_anchor_with_queue(strp, strp->stm.full_len); + } +@@ -494,6 +497,8 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) + rxm->offset = strp->stm.offset; + tlm = tls_msg(strp->anchor); + tlm->control = strp->mark; ++ ++ return true; + } + + /* Called with lock held on lower socket */ +diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c +index 4905a81c4ac1..c9b53472e955 100644 +--- a/net/tls/tls_sw.c ++++ b/net/tls/tls_sw.c +@@ -1380,7 +1380,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, + return sock_intr_errno(timeo); + } + +- tls_strp_msg_load(&ctx->strp, released); ++ if (unlikely(!tls_strp_msg_load(&ctx->strp, released))) ++ return tls_rx_rec_wait(sk, psock, nonblock, false); + + return 1; + } +-- +2.50.1 + diff --git a/queue-6.6/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch b/queue-6.6/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch new file mode 100644 index 0000000000..e3533e5e64 --- /dev/null +++ b/queue-6.6/udp-also-consider-secpath-when-evaluating-ipsec-use-.patch @@ -0,0 +1,51 @@ +From d605402dabbd308ab13f66983c4babc3e4773210 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Aug 2025 11:26:27 +0200 +Subject: 
udp: also consider secpath when evaluating ipsec use for checksumming + +From: Sabrina Dubroca + +[ Upstream commit 1118aaa3b35157777890fffab91d8c1da841b20b ] + +Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in +IPsec case") tried to fix checksumming in UFO when the packets are +going through IPsec, so that we can't rely on offloads because the UDP +header and payload will be encrypted. + +But when doing a TCP test over VXLAN going through IPsec transport +mode with GSO enabled (esp4_offload module loaded), I'm seeing broken +UDP checksums on the encap after successful decryption. + +The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via +__dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this +point we've already dropped the dst (unless the device sets +IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and +we proceed with checksum offload. + +Make need_ipsec also check the secpath, which is not dropped on this +callpath. + +Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case") +Signed-off-by: Sabrina Dubroca +Signed-off-by: Steffen Klassert +Signed-off-by: Sasha Levin +--- + net/ipv4/udp_offload.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c +index 3870b59f5400..9be9df2caf65 100644 +--- a/net/ipv4/udp_offload.c ++++ b/net/ipv4/udp_offload.c +@@ -61,7 +61,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, + remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); + skb->remcsum_offload = remcsum; + +- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); ++ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb); + /* Try to offload checksum if possible */ + offload_csum = !!(need_csum && + !need_ipsec && +-- +2.50.1 +
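
The sketch below is a minimal userspace model, in plain C, of the checksum-offload decision that the UDP/IPsec patch above adjusts; it is an illustration only, not kernel code. struct fake_skb, can_offload_inner_csum() and their fields are invented stand-ins for the real sk_buff helpers (skb_dst()/dst_xfrm()/skb_sec_path()). The point it demonstrates: a packet is eligible for inner UDP checksum offload only when it will not be IPsec-transformed, and after the fix that test also accounts for a surviving secpath even when the dst has already been dropped on the xmit path.

```c
/*
 * Illustrative sketch (not kernel code): models the need_ipsec /
 * offload_csum decision in __skb_udp_tunnel_segment() after the fix.
 * All types and helpers here are simplified stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_skb {
	bool has_dst_xfrm;   /* stand-in for skb_dst(skb) && dst_xfrm(skb_dst(skb)) */
	bool has_secpath;    /* stand-in for skb_sec_path(skb) */
	bool need_csum;      /* inner UDP checksum still needs to be filled in */
};

/* Offload is only usable when the packet will not be IPsec-transformed. */
static bool can_offload_inner_csum(const struct fake_skb *skb)
{
	bool need_ipsec = skb->has_dst_xfrm || skb->has_secpath;

	return skb->need_csum && !need_ipsec;
}

int main(void)
{
	/* dst already dropped by validate_xmit_skb(), but the secpath survives */
	struct fake_skb after_decrypt = {
		.has_dst_xfrm = false,
		.has_secpath  = true,
		.need_csum    = true,
	};

	printf("offload inner csum: %s\n",
	       can_offload_inner_csum(&after_decrypt) ? "yes" : "no");
	return 0;
}
```

With only the dst-based test, the example above would wrongly report "yes" for a decrypted GSO packet whose dst is gone; including the secpath makes it fall back to software checksumming, matching the behaviour the patch restores.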