From: Sasha Levin Date: Sat, 13 Jun 2026 14:51:50 +0000 (-0400) Subject: Fixes for all trees X-Git-Url: http://git.ipfire.org/gitweb/?a=commitdiff_plain;h=7f5de74092633301ae347ae23a05c2fcf04b3f39;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for all trees Signed-off-by: Sasha Levin --- diff --git a/queue-6.1/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch b/queue-6.1/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch new file mode 100644 index 0000000000..f71ad7b937 --- /dev/null +++ b/queue-6.1/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch @@ -0,0 +1,221 @@ +From 07756cb6faa6228c3f491e96f7b0aebd62ec4bc9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Jun 2026 20:39:06 +0000 +Subject: netfilter: ctnetlink: ensure safe access to master conntrack + +From: Pablo Neira Ayuso + +[ Upstream commit bffcaad9afdfe45d7fc777397d3b83c1e3ebffe5 ] + +Holding reference on the expectation is not sufficient, the master +conntrack object can just go away, making exp->master invalid. + +To access exp->master safely: + +- Grab the nf_conntrack_expect_lock, this gets serialized with + clean_from_lists() which also holds this lock when the master + conntrack goes away. + +- Hold reference on master conntrack via nf_conntrack_find_get(). + Not so easy since the master tuple to look up for the master conntrack + is not available in the existing problematic paths. + +This patch goes for extending the nf_conntrack_expect_lock section +to address this issue for simplicity, in the cases that are described +below this is just slightly extending the lock section. + +The add expectation command already holds a reference to the master +conntrack from ctnetlink_create_expect(). + +However, the delete expectation command needs to grab the spinlock +before looking up for the expectation. Expand the existing spinlock +section to address this to cover the expectation lookup. 
Note that, +the nf_ct_expect_iterate_net() calls already grab the spinlock while +iterating over the expectation table, which is correct. + +The get expectation command needs to grab the spinlock to ensure master +conntrack does not go away. This also expands the existing spinlock +section to cover the expectation lookup too. I needed to move the +netlink skb allocation out of the spinlock to keep it GFP_KERNEL. + +For the expectation events, the IPEXP_DESTROY event is already delivered +under the spinlock, just move the delivery of IPEXP_NEW under the +spinlock too because the master conntrack event cache is reached through +exp->master. + +While at it, add lockdep notations to help identify what codepaths need +to grab the spinlock. + +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +[ fix timer_delete -> del_timer in diff context lines since 8fa7292 +("treewide: Switch/rename to timer_delete[_sync]()") landed in 6.15 ] +Signed-off-by: Mark Bundschuh +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_conntrack_core.h | 5 ++++ + net/netfilter/nf_conntrack_ecache.c | 2 ++ + net/netfilter/nf_conntrack_expect.c | 10 +++++++- + net/netfilter/nf_conntrack_netlink.c | 28 +++++++++++++++-------- + 4 files changed, 35 insertions(+), 10 deletions(-) + +diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h +index a36f87af415c22..8ea16b0ba1c982 100644 +--- a/include/net/netfilter/nf_conntrack_core.h ++++ b/include/net/netfilter/nf_conntrack_core.h +@@ -84,6 +84,11 @@ void nf_conntrack_lock(spinlock_t *lock); + + extern spinlock_t nf_conntrack_expect_lock; + ++static inline void lockdep_nfct_expect_lock_held(void) ++{ ++ lockdep_assert_held(&nf_conntrack_expect_lock); ++} ++ + /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ + + static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout) +diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c 
+index 69948e1d6974e3..6526bdcca580fd 100644 +--- a/net/netfilter/nf_conntrack_ecache.c ++++ b/net/netfilter/nf_conntrack_ecache.c +@@ -237,6 +237,8 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, + struct nf_ct_event_notifier *notify; + struct nf_conntrack_ecache *e; + ++ lockdep_nfct_expect_lock_held(); ++ + rcu_read_lock(); + notify = rcu_dereference(net->ct.nf_conntrack_event_cb); + if (!notify) +diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c +index 70bcddfc17ccc2..379711ea5ab67e 100644 +--- a/net/netfilter/nf_conntrack_expect.c ++++ b/net/netfilter/nf_conntrack_expect.c +@@ -51,6 +51,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, + struct net *net = nf_ct_exp_net(exp); + struct nf_conntrack_net *cnet; + ++ lockdep_nfct_expect_lock_held(); + WARN_ON(!master_help); + WARN_ON(timer_pending(&exp->timeout)); + +@@ -118,6 +119,8 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, + + bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) + { ++ lockdep_nfct_expect_lock_held(); ++ + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); +@@ -177,6 +180,8 @@ nf_ct_find_expectation(struct net *net, + struct nf_conntrack_expect *i, *exp = NULL; + unsigned int h; + ++ lockdep_nfct_expect_lock_held(); ++ + if (!cnet->expect_count) + return NULL; + +@@ -459,6 +464,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, + unsigned int h; + int ret = 0; + ++ lockdep_nfct_expect_lock_held(); ++ + if (!master_help) { + ret = -ESHUTDOWN; + goto out; +@@ -515,8 +522,9 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, + + nf_ct_expect_insert(expect); + +- spin_unlock_bh(&nf_conntrack_expect_lock); + nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); ++ spin_unlock_bh(&nf_conntrack_expect_lock); ++ + return 0; + out: + spin_unlock_bh(&nf_conntrack_expect_lock); +diff --git 
a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c +index bcbd77608365a9..f6e9d9bc18864a 100644 +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -3330,31 +3330,37 @@ static int ctnetlink_get_expect(struct sk_buff *skb, + if (err < 0) + return err; + ++ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); ++ if (!skb2) ++ return -ENOMEM; ++ ++ spin_lock_bh(&nf_conntrack_expect_lock); + exp = nf_ct_expect_find_get(info->net, &zone, &tuple); +- if (!exp) ++ if (!exp) { ++ spin_unlock_bh(&nf_conntrack_expect_lock); ++ kfree_skb(skb2); + return -ENOENT; ++ } + + if (cda[CTA_EXPECT_ID]) { + __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); + + if (id != nf_expect_get_id(exp)) { + nf_ct_expect_put(exp); ++ spin_unlock_bh(&nf_conntrack_expect_lock); ++ kfree_skb(skb2); + return -ENOENT; + } + } + +- skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +- if (!skb2) { +- nf_ct_expect_put(exp); +- return -ENOMEM; +- } +- + rcu_read_lock(); + err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid, + info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, + exp); + rcu_read_unlock(); + nf_ct_expect_put(exp); ++ spin_unlock_bh(&nf_conntrack_expect_lock); ++ + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; +@@ -3401,22 +3407,26 @@ static int ctnetlink_del_expect(struct sk_buff *skb, + if (err < 0) + return err; + ++ spin_lock_bh(&nf_conntrack_expect_lock); ++ + /* bump usage count to 2 */ + exp = nf_ct_expect_find_get(info->net, &zone, &tuple); +- if (!exp) ++ if (!exp) { ++ spin_unlock_bh(&nf_conntrack_expect_lock); + return -ENOENT; ++ } + + if (cda[CTA_EXPECT_ID]) { + __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); + + if (id != nf_expect_get_id(exp)) { + nf_ct_expect_put(exp); ++ spin_unlock_bh(&nf_conntrack_expect_lock); + return -ENOENT; + } + } + + /* after list removal, usage count == 1 */ +- spin_lock_bh(&nf_conntrack_expect_lock); + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, 
NETLINK_CB(skb).portid, + nlmsg_report(info->nlh)); +-- +2.53.0 + diff --git a/queue-6.1/series b/queue-6.1/series index e8ca96465e..28392a84ce 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -261,3 +261,4 @@ net-mvpp2-limit-xdp-frame-size-to-the-rx-buffer.patch net-mvpp2-add-metadata-support-for-xdp-mode.patch net-mvpp2-refill-rx-buffers-before-xdp-or-skb-use.patch net-mvpp2-build-skb-from-xdp-adjusted-data-on-xdp_pa.patch +netfilter-ctnetlink-ensure-safe-access-to-master-con.patch diff --git a/queue-6.12/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch b/queue-6.12/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch new file mode 100644 index 0000000000..8d3264df8d --- /dev/null +++ b/queue-6.12/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch @@ -0,0 +1,221 @@ +From 2cd4803c24ecf6e069ec3ca6b04719906fe45815 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Jun 2026 19:07:05 +0000 +Subject: netfilter: ctnetlink: ensure safe access to master conntrack + +From: Pablo Neira Ayuso + +[ Upstream commit bffcaad9afdfe45d7fc777397d3b83c1e3ebffe5 ] + +Holding reference on the expectation is not sufficient, the master +conntrack object can just go away, making exp->master invalid. + +To access exp->master safely: + +- Grab the nf_conntrack_expect_lock, this gets serialized with + clean_from_lists() which also holds this lock when the master + conntrack goes away. + +- Hold reference on master conntrack via nf_conntrack_find_get(). + Not so easy since the master tuple to look up for the master conntrack + is not available in the existing problematic paths. + +This patch goes for extending the nf_conntrack_expect_lock section +to address this issue for simplicity, in the cases that are described +below this is just slightly extending the lock section. + +The add expectation command already holds a reference to the master +conntrack from ctnetlink_create_expect(). 
+ +However, the delete expectation command needs to grab the spinlock +before looking up for the expectation. Expand the existing spinlock +section to address this to cover the expectation lookup. Note that, +the nf_ct_expect_iterate_net() calls already grab the spinlock while +iterating over the expectation table, which is correct. + +The get expectation command needs to grab the spinlock to ensure master +conntrack does not go away. This also expands the existing spinlock +section to cover the expectation lookup too. I needed to move the +netlink skb allocation out of the spinlock to keep it GFP_KERNEL. + +For the expectation events, the IPEXP_DESTROY event is already delivered +under the spinlock, just move the delivery of IPEXP_NEW under the +spinlock too because the master conntrack event cache is reached through +exp->master. + +While at it, add lockdep notations to help identify what codepaths need +to grab the spinlock. + +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +[ fix timer_delete -> del_timer in diff context lines since 8fa7292 +("treewide: Switch/rename to timer_delete[_sync]()") landed in 6.15 ] +Signed-off-by: Mark Bundschuh +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_conntrack_core.h | 5 ++++ + net/netfilter/nf_conntrack_ecache.c | 2 ++ + net/netfilter/nf_conntrack_expect.c | 10 +++++++- + net/netfilter/nf_conntrack_netlink.c | 28 +++++++++++++++-------- + 4 files changed, 35 insertions(+), 10 deletions(-) + +diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h +index 3384859a892101..8883575adcc1e7 100644 +--- a/include/net/netfilter/nf_conntrack_core.h ++++ b/include/net/netfilter/nf_conntrack_core.h +@@ -83,6 +83,11 @@ void nf_conntrack_lock(spinlock_t *lock); + + extern spinlock_t nf_conntrack_expect_lock; + ++static inline void lockdep_nfct_expect_lock_held(void) ++{ ++ lockdep_assert_held(&nf_conntrack_expect_lock); ++} ++ + /* ctnetlink code shared by both 
ctnetlink and nf_conntrack_bpf */ + + static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout) +diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c +index 69948e1d6974e3..6526bdcca580fd 100644 +--- a/net/netfilter/nf_conntrack_ecache.c ++++ b/net/netfilter/nf_conntrack_ecache.c +@@ -237,6 +237,8 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, + struct nf_ct_event_notifier *notify; + struct nf_conntrack_ecache *e; + ++ lockdep_nfct_expect_lock_held(); ++ + rcu_read_lock(); + notify = rcu_dereference(net->ct.nf_conntrack_event_cb); + if (!notify) +diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c +index f5c45989df5736..bb8b87f9ee50da 100644 +--- a/net/netfilter/nf_conntrack_expect.c ++++ b/net/netfilter/nf_conntrack_expect.c +@@ -51,6 +51,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, + struct net *net = nf_ct_exp_net(exp); + struct nf_conntrack_net *cnet; + ++ lockdep_nfct_expect_lock_held(); + WARN_ON(!master_help); + WARN_ON(timer_pending(&exp->timeout)); + +@@ -118,6 +119,8 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, + + bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) + { ++ lockdep_nfct_expect_lock_held(); ++ + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); +@@ -177,6 +180,8 @@ nf_ct_find_expectation(struct net *net, + struct nf_conntrack_expect *i, *exp = NULL; + unsigned int h; + ++ lockdep_nfct_expect_lock_held(); ++ + if (!cnet->expect_count) + return NULL; + +@@ -459,6 +464,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, + unsigned int h; + int ret = 0; + ++ lockdep_nfct_expect_lock_held(); ++ + if (!master_help) { + ret = -ESHUTDOWN; + goto out; +@@ -515,8 +522,9 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, + + nf_ct_expect_insert(expect); + +- spin_unlock_bh(&nf_conntrack_expect_lock); + 
nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); ++ spin_unlock_bh(&nf_conntrack_expect_lock); ++ + return 0; + out: + spin_unlock_bh(&nf_conntrack_expect_lock); +diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c +index f51cdfba68fbdb..507f17722f375b 100644 +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -3332,31 +3332,37 @@ static int ctnetlink_get_expect(struct sk_buff *skb, + if (err < 0) + return err; + ++ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); ++ if (!skb2) ++ return -ENOMEM; ++ ++ spin_lock_bh(&nf_conntrack_expect_lock); + exp = nf_ct_expect_find_get(info->net, &zone, &tuple); +- if (!exp) ++ if (!exp) { ++ spin_unlock_bh(&nf_conntrack_expect_lock); ++ kfree_skb(skb2); + return -ENOENT; ++ } + + if (cda[CTA_EXPECT_ID]) { + __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); + + if (id != nf_expect_get_id(exp)) { + nf_ct_expect_put(exp); ++ spin_unlock_bh(&nf_conntrack_expect_lock); ++ kfree_skb(skb2); + return -ENOENT; + } + } + +- skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +- if (!skb2) { +- nf_ct_expect_put(exp); +- return -ENOMEM; +- } +- + rcu_read_lock(); + err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid, + info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, + exp); + rcu_read_unlock(); + nf_ct_expect_put(exp); ++ spin_unlock_bh(&nf_conntrack_expect_lock); ++ + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; +@@ -3403,22 +3409,26 @@ static int ctnetlink_del_expect(struct sk_buff *skb, + if (err < 0) + return err; + ++ spin_lock_bh(&nf_conntrack_expect_lock); ++ + /* bump usage count to 2 */ + exp = nf_ct_expect_find_get(info->net, &zone, &tuple); +- if (!exp) ++ if (!exp) { ++ spin_unlock_bh(&nf_conntrack_expect_lock); + return -ENOENT; ++ } + + if (cda[CTA_EXPECT_ID]) { + __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); + + if (id != nf_expect_get_id(exp)) { + nf_ct_expect_put(exp); ++ spin_unlock_bh(&nf_conntrack_expect_lock); + return 
-ENOENT; + } + } + + /* after list removal, usage count == 1 */ +- spin_lock_bh(&nf_conntrack_expect_lock); + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, + nlmsg_report(info->nlh)); +-- +2.53.0 + diff --git a/queue-6.12/series b/queue-6.12/series index b73b762168..e792c43bb2 100644 --- a/queue-6.12/series +++ b/queue-6.12/series @@ -104,3 +104,8 @@ net-mvpp2-add-metadata-support-for-xdp-mode.patch net-mvpp2-refill-rx-buffers-before-xdp-or-skb-use.patch net-mvpp2-build-skb-from-xdp-adjusted-data-on-xdp_pa.patch ipv6-fix-a-potential-npd-in-cleanup_prefix_route.patch +netfilter-ctnetlink-ensure-safe-access-to-master-con.patch +writeback-avoid-contention-on-wb-list_lock-when-swit.patch +writeback-fix-use-after-free-in-inode_switch_wbs_wor.patch +xfrm-hold-device-only-for-the-asynchronous-decryptio.patch +xfrm-hold-dev-ref-until-after-transport_finish-nf_ho.patch diff --git a/queue-6.12/writeback-avoid-contention-on-wb-list_lock-when-swit.patch b/queue-6.12/writeback-avoid-contention-on-wb-list_lock-when-swit.patch new file mode 100644 index 0000000000..40c4a0faa6 --- /dev/null +++ b/queue-6.12/writeback-avoid-contention-on-wb-list_lock-when-swit.patch @@ -0,0 +1,300 @@ +From cc73bfdc8a332fe69957503aa0ada8507b9fd7bf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 9 Apr 2025 17:12:59 +0200 +Subject: writeback: Avoid contention on wb->list_lock when switching inodes + +From: Jan Kara + +[ Upstream commit e1b849cfa6b61f1c866a908c9e8dd9b5aaab820b ] + +There can be multiple inode switch works that are trying to switch +inodes to / from the same wb. This can happen in particular if some +cgroup which owns many (thousands of) inodes exits and we need to switch +them all. In this case several inode_switch_wbs_work_fn() instances will +just be spinning on the same wb->list_lock while only one of them makes +forward progress. This wastes CPU cycles and quickly leads to softlockup +reports and an unusable system. 
+ +Instead of running several inode_switch_wbs_work_fn() instances in +parallel switching to the same wb and contending on wb->list_lock, run +just one work item per wb and manage a queue of isw items switching to +this wb. + +Acked-by: Tejun Heo +Signed-off-by: Jan Kara +Signed-off-by: Sasha Levin +--- + fs/fs-writeback.c | 99 ++++++++++++++++++++------------ + include/linux/backing-dev-defs.h | 4 ++ + include/linux/writeback.h | 2 + + mm/backing-dev.c | 5 ++ + 4 files changed, 74 insertions(+), 36 deletions(-) + +diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c +index 45e90338fbb2df..a8d21a5f354859 100644 +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -369,7 +369,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) + } + + struct inode_switch_wbs_context { +- struct rcu_work work; ++ /* List of queued switching contexts for the wb */ ++ struct llist_node list; + + /* + * Multiple inodes can be switched at once. The switching procedure +@@ -379,7 +380,6 @@ struct inode_switch_wbs_context { + * array embedded into struct inode_switch_wbs_context. Otherwise + * an inode could be left in a non-consistent state. 
+ */ +- struct bdi_writeback *new_wb; + struct inode *inodes[]; + }; + +@@ -488,13 +488,11 @@ static bool inode_do_switch_wbs(struct inode *inode, + return switched; + } + +-static void inode_switch_wbs_work_fn(struct work_struct *work) ++static void process_inode_switch_wbs(struct bdi_writeback *new_wb, ++ struct inode_switch_wbs_context *isw) + { +- struct inode_switch_wbs_context *isw = +- container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); + struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); + struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; +- struct bdi_writeback *new_wb = isw->new_wb; + unsigned long nr_switched = 0; + struct inode **inodep; + +@@ -554,6 +552,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + atomic_dec(&isw_nr_in_flight); + } + ++void inode_switch_wbs_work_fn(struct work_struct *work) ++{ ++ struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback, ++ switch_work); ++ struct inode_switch_wbs_context *isw, *next_isw; ++ struct llist_node *list; ++ ++ /* ++ * Grab out reference to wb so that it cannot get freed under us ++ * after we process all the isw items. ++ */ ++ wb_get(new_wb); ++ while (1) { ++ list = llist_del_all(&new_wb->switch_wbs_ctxs); ++ /* Nothing to do? */ ++ if (!list) ++ break; ++ /* ++ * In addition to synchronizing among switchers, I_WB_SWITCH ++ * tells the RCU protected stat update paths to grab the i_page ++ * lock so that stat transfer can synchronize against them. ++ * Let's continue after I_WB_SWITCH is guaranteed to be ++ * visible. 
++ */ ++ synchronize_rcu(); ++ ++ llist_for_each_entry_safe(isw, next_isw, list, list) ++ process_inode_switch_wbs(new_wb, isw); ++ } ++ wb_put(new_wb); ++} ++ + static bool inode_prepare_wbs_switch(struct inode *inode, + struct bdi_writeback *new_wb) + { +@@ -583,6 +613,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode, + return true; + } + ++static void wb_queue_isw(struct bdi_writeback *wb, ++ struct inode_switch_wbs_context *isw) ++{ ++ if (llist_add(&isw->list, &wb->switch_wbs_ctxs)) ++ queue_work(isw_wq, &wb->switch_work); ++} ++ + /** + * inode_switch_wbs - change the wb association of an inode + * @inode: target inode +@@ -596,6 +633,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct cgroup_subsys_state *memcg_css; + struct inode_switch_wbs_context *isw; ++ struct bdi_writeback *new_wb = NULL; + + /* noop if seems to be already in progress */ + if (inode->i_state & I_WB_SWITCH) +@@ -620,40 +658,34 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) + if (!memcg_css) + goto out_free; + +- isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); ++ new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + css_put(memcg_css); +- if (!isw->new_wb) ++ if (!new_wb) + goto out_free; + +- if (!inode_prepare_wbs_switch(inode, isw->new_wb)) ++ if (!inode_prepare_wbs_switch(inode, new_wb)) + goto out_free; + + isw->inodes[0] = inode; + +- /* +- * In addition to synchronizing among switchers, I_WB_SWITCH tells +- * the RCU protected stat update paths to grab the i_page +- * lock so that stat transfer can synchronize against them. +- * Let's continue after I_WB_SWITCH is guaranteed to be visible. 
+- */ +- INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); +- queue_rcu_work(isw_wq, &isw->work); ++ wb_queue_isw(new_wb, isw); + return; + + out_free: + atomic_dec(&isw_nr_in_flight); +- if (isw->new_wb) +- wb_put(isw->new_wb); ++ if (new_wb) ++ wb_put(new_wb); + kfree(isw); + } + +-static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, ++static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb, ++ struct inode_switch_wbs_context *isw, + struct list_head *list, int *nr) + { + struct inode *inode; + + list_for_each_entry(inode, list, i_io_list) { +- if (!inode_prepare_wbs_switch(inode, isw->new_wb)) ++ if (!inode_prepare_wbs_switch(inode, new_wb)) + continue; + + isw->inodes[*nr] = inode; +@@ -677,6 +709,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) + { + struct cgroup_subsys_state *memcg_css; + struct inode_switch_wbs_context *isw; ++ struct bdi_writeback *new_wb; + int nr; + bool restart = false; + +@@ -689,12 +722,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) + + for (memcg_css = wb->memcg_css->parent; memcg_css; + memcg_css = memcg_css->parent) { +- isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); +- if (isw->new_wb) ++ new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); ++ if (new_wb) + break; + } +- if (unlikely(!isw->new_wb)) +- isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ ++ if (unlikely(!new_wb)) ++ new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ + + nr = 0; + spin_lock(&wb->list_lock); +@@ -706,27 +739,21 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) + * bandwidth restrictions, as writeback of inode metadata is not + * accounted for. 
+ */ +- restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); ++ restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr); + if (!restart) +- restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); ++ restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time, ++ &nr); + spin_unlock(&wb->list_lock); + + /* no attached inodes? bail out */ + if (nr == 0) { + atomic_dec(&isw_nr_in_flight); +- wb_put(isw->new_wb); ++ wb_put(new_wb); + kfree(isw); + return restart; + } + +- /* +- * In addition to synchronizing among switchers, I_WB_SWITCH tells +- * the RCU protected stat update paths to grab the i_page +- * lock so that stat transfer can synchronize against them. +- * Let's continue after I_WB_SWITCH is guaranteed to be visible. +- */ +- INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); +- queue_rcu_work(isw_wq, &isw->work); ++ wb_queue_isw(new_wb, isw); + + return restart; + } +diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h +index 2ad261082bba5f..c5c9d89c73edcc 100644 +--- a/include/linux/backing-dev-defs.h ++++ b/include/linux/backing-dev-defs.h +@@ -152,6 +152,10 @@ struct bdi_writeback { + struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ + struct list_head b_attached; /* attached inodes, protected by list_lock */ + struct list_head offline_node; /* anchored at offline_cgwbs */ ++ struct work_struct switch_work; /* work used to perform inode switching ++ * to this wb */ ++ struct llist_head switch_wbs_ctxs; /* queued contexts for ++ * writeback switching */ + + union { + struct work_struct release_work; +diff --git a/include/linux/writeback.h b/include/linux/writeback.h +index 641a057e041329..b6bf90a7052599 100644 +--- a/include/linux/writeback.h ++++ b/include/linux/writeback.h +@@ -293,6 +293,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) + bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); + } + ++void inode_switch_wbs_work_fn(struct 
work_struct *work); ++ + #else /* CONFIG_CGROUP_WRITEBACK */ + + static inline void inode_attach_wb(struct inode *inode, struct folio *folio) +diff --git a/mm/backing-dev.c b/mm/backing-dev.c +index bf0594ceb3ff87..956a7e23b5d634 100644 +--- a/mm/backing-dev.c ++++ b/mm/backing-dev.c +@@ -634,6 +634,7 @@ static void cgwb_release_workfn(struct work_struct *work) + wb_exit(wb); + bdi_put(bdi); + WARN_ON_ONCE(!list_empty(&wb->b_attached)); ++ WARN_ON_ONCE(work_pending(&wb->switch_work)); + call_rcu(&wb->rcu, cgwb_free_rcu); + } + +@@ -710,6 +711,8 @@ static int cgwb_create(struct backing_dev_info *bdi, + wb->memcg_css = memcg_css; + wb->blkcg_css = blkcg_css; + INIT_LIST_HEAD(&wb->b_attached); ++ INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn); ++ init_llist_head(&wb->switch_wbs_ctxs); + INIT_WORK(&wb->release_work, cgwb_release_workfn); + set_bit(WB_registered, &wb->state); + bdi_get(bdi); +@@ -840,6 +843,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) + if (!ret) { + bdi->wb.memcg_css = &root_mem_cgroup->css; + bdi->wb.blkcg_css = blkcg_root_css; ++ INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn); ++ init_llist_head(&bdi->wb.switch_wbs_ctxs); + } + return ret; + } +-- +2.53.0 + diff --git a/queue-6.12/writeback-fix-use-after-free-in-inode_switch_wbs_wor.patch b/queue-6.12/writeback-fix-use-after-free-in-inode_switch_wbs_wor.patch new file mode 100644 index 0000000000..204da2d508 --- /dev/null +++ b/queue-6.12/writeback-fix-use-after-free-in-inode_switch_wbs_wor.patch @@ -0,0 +1,109 @@ +From f765f27fb5a2ce650ddda2e3de068ff0875778e8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 13 Apr 2026 11:36:19 +0200 +Subject: writeback: Fix use after free in inode_switch_wbs_work_fn() + +From: Jan Kara + +[ Upstream commit 6689f01d6740cf358932b3e97ee968c6099800d9 ] + +inode_switch_wbs_work_fn() has a loop like: + + wb_get(new_wb); + while (1) { + list = llist_del_all(&new_wb->switch_wbs_ctxs); + /* Nothing to do? 
*/ + if (!list) + break; + ... process the items ... + } + +Now, adding items to the list looks like: + +wb_queue_isw() + if (llist_add(&isw->list, &wb->switch_wbs_ctxs)) + queue_work(isw_wq, &wb->switch_work); + +Because inode_switch_wbs_work_fn() loops when processing isw items, it +can happen that wb->switch_work is pending while wb->switch_wbs_ctxs is +empty. This is a problem because in that case wb can get freed (no isw +items -> no wb reference) while the work is still pending, causing +use-after-free issues. + +We cannot just fix this by cancelling work when freeing wb because that +could still trigger problematic 0 -> 1 transitions on wb refcount due to +wb_get() in inode_switch_wbs_work_fn(). It could all be handled with +more careful code but that seems unnecessarily complex so let's avoid +that until it is proven that the looping actually brings practical +benefit. Just remove the loop from inode_switch_wbs_work_fn() instead. +That way when wb_queue_isw() queues work, we are guaranteed we have +added the first item to wb->switch_wbs_ctxs and nobody is going to +remove it (and drop the wb reference it holds) until the queued work +runs. 
+ +Fixes: e1b849cfa6b6 ("writeback: Avoid contention on wb->list_lock when switching inodes") +CC: stable@vger.kernel.org +Signed-off-by: Jan Kara +Link: https://patch.msgid.link/20260413093618.17244-2-jack@suse.cz +Acked-by: Tejun Heo +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/fs-writeback.c | 36 +++++++++++++++++++----------------- + 1 file changed, 19 insertions(+), 17 deletions(-) + +diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c +index a8d21a5f354859..e8afd4fd26f98e 100644 +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -559,28 +559,30 @@ void inode_switch_wbs_work_fn(struct work_struct *work) + struct inode_switch_wbs_context *isw, *next_isw; + struct llist_node *list; + ++ list = llist_del_all(&new_wb->switch_wbs_ctxs); + /* +- * Grab out reference to wb so that it cannot get freed under us ++ * Nothing to do? That would be a problem as references held by isw ++ * items protect wb from freeing... ++ */ ++ if (WARN_ON_ONCE(!list)) ++ return; ++ ++ /* ++ * Grab our reference to wb so that it cannot get freed under us + * after we process all the isw items. + */ + wb_get(new_wb); +- while (1) { +- list = llist_del_all(&new_wb->switch_wbs_ctxs); +- /* Nothing to do? */ +- if (!list) +- break; +- /* +- * In addition to synchronizing among switchers, I_WB_SWITCH +- * tells the RCU protected stat update paths to grab the i_page +- * lock so that stat transfer can synchronize against them. +- * Let's continue after I_WB_SWITCH is guaranteed to be +- * visible. +- */ +- synchronize_rcu(); ++ /* ++ * In addition to synchronizing among switchers, I_WB_SWITCH ++ * tells the RCU protected stat update paths to grab the i_page ++ * lock so that stat transfer can synchronize against them. ++ * Let's continue after I_WB_SWITCH is guaranteed to be ++ * visible. 
++ */ ++ synchronize_rcu(); + +- llist_for_each_entry_safe(isw, next_isw, list, list) +- process_inode_switch_wbs(new_wb, isw); +- } ++ llist_for_each_entry_safe(isw, next_isw, list, list) ++ process_inode_switch_wbs(new_wb, isw); + wb_put(new_wb); + } + +-- +2.53.0 + diff --git a/queue-6.12/xfrm-hold-dev-ref-until-after-transport_finish-nf_ho.patch b/queue-6.12/xfrm-hold-dev-ref-until-after-transport_finish-nf_ho.patch new file mode 100644 index 0000000000..57f584da9a --- /dev/null +++ b/queue-6.12/xfrm-hold-dev-ref-until-after-transport_finish-nf_ho.patch @@ -0,0 +1,145 @@ +From 196fd569f080d6d4b0c74e505c2ca49e9e448e3e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Jun 2026 11:13:27 +0000 +Subject: xfrm: hold dev ref until after transport_finish NF_HOOK + +From: Qi Tang + +[ Upstream commit 1c428b03840094410c5fb6a5db30640486bbbfcb ] + +After async crypto completes, xfrm_input_resume() calls dev_put() +immediately on re-entry before the skb reaches transport_finish. +The skb->dev pointer is then used inside NF_HOOK and its okfn, +which can race with device teardown. + +Remove the dev_put from the async resumption entry and instead +drop the reference after the NF_HOOK call in transport_finish, +using a saved device pointer since NF_HOOK may consume the skb. +This covers NF_DROP, NF_QUEUE and NF_STOLEN paths that skip +the okfn. + +For non-transport exits (decaps, gro, drop) and secondary +async return points, release the reference inline when +async is set. + +Suggested-by: Florian Westphal +Fixes: acf568ee859f ("xfrm: Reinject transport-mode packets through tasklet") +Cc: stable@vger.kernel.org +Signed-off-by: Qi Tang +Signed-off-by: Steffen Klassert +[ xfrm_inner_mode_input() always completes synchronously in this kernel +version and cannot return -EINPROGRESS. 
That requires +7ac64f4598b4 ("xfrm: add mode_cbs module functionality"), which is not +present, so the async dev_put path is unreachable and the hunk was +omitted ] +Signed-off-by: Simon Liebold +Signed-off-by: Sasha Levin +--- + net/ipv4/xfrm4_input.c | 5 ++++- + net/ipv6/xfrm6_input.c | 5 ++++- + net/xfrm/xfrm_input.c | 12 ++++++++++-- + 3 files changed, 18 insertions(+), 4 deletions(-) + +diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c +index 12a1a0f421956c..adf21d6b6076c1 100644 +--- a/net/ipv4/xfrm4_input.c ++++ b/net/ipv4/xfrm4_input.c +@@ -50,6 +50,7 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) + { + struct xfrm_offload *xo = xfrm_offload(skb); + struct iphdr *iph = ip_hdr(skb); ++ struct net_device *dev = skb->dev; + + iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; + +@@ -73,8 +74,10 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) + } + + NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, +- dev_net(skb->dev), NULL, skb, skb->dev, NULL, ++ dev_net(dev), NULL, skb, dev, NULL, + xfrm4_rcv_encap_finish); ++ if (async) ++ dev_put(dev); + return 0; + } + +diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c +index 9005fc156a20e6..699a001ac16629 100644 +--- a/net/ipv6/xfrm6_input.c ++++ b/net/ipv6/xfrm6_input.c +@@ -43,6 +43,7 @@ static int xfrm6_transport_finish2(struct net *net, struct sock *sk, + int xfrm6_transport_finish(struct sk_buff *skb, int async) + { + struct xfrm_offload *xo = xfrm_offload(skb); ++ struct net_device *dev = skb->dev; + int nhlen = -skb_network_offset(skb); + + skb_network_header(skb)[IP6CB(skb)->nhoff] = +@@ -68,8 +69,10 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) + } + + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, +- dev_net(skb->dev), NULL, skb, skb->dev, NULL, ++ dev_net(dev), NULL, skb, dev, NULL, + xfrm6_transport_finish2); ++ if (async) ++ dev_put(dev); + return 0; + } + +diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c +index 90a79558dca259..5d3633ce6ba329 
100644 +--- a/net/xfrm/xfrm_input.c ++++ b/net/xfrm/xfrm_input.c +@@ -492,7 +492,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) + /* An encap_type of -1 indicates async resumption. */ + if (encap_type == -1) { + async = 1; +- dev_put(skb->dev); + seq = XFRM_SKB_CB(skb)->seq.input.low; + goto resume; + } +@@ -645,8 +644,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) + dev_hold(skb->dev); + + nexthdr = x->type->input(x, skb); +- if (nexthdr == -EINPROGRESS) ++ if (nexthdr == -EINPROGRESS) { ++ if (async) ++ dev_put(skb->dev); + return 0; ++ } + + dev_put(skb->dev); + } +@@ -717,6 +719,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) + sp->olen = 0; + if (skb_valid_dst(skb)) + skb_dst_drop(skb); ++ if (async) ++ dev_put(skb->dev); + gro_cells_receive(&gro_cells, skb); + return 0; + } else { +@@ -736,6 +740,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) + sp->olen = 0; + if (skb_valid_dst(skb)) + skb_dst_drop(skb); ++ if (async) ++ dev_put(skb->dev); + gro_cells_receive(&gro_cells, skb); + return err; + } +@@ -746,6 +752,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) + drop_unlock: + spin_unlock(&x->lock); + drop: ++ if (async) ++ dev_put(skb->dev); + xfrm_rcv_cb(skb, family, x && x->type ? 
x->type->proto : nexthdr, -1); + kfree_skb(skb); + return 0; +-- +2.53.0 + diff --git a/queue-6.12/xfrm-hold-device-only-for-the-asynchronous-decryptio.patch b/queue-6.12/xfrm-hold-device-only-for-the-asynchronous-decryptio.patch new file mode 100644 index 0000000000..c6066554e7 --- /dev/null +++ b/queue-6.12/xfrm-hold-device-only-for-the-asynchronous-decryptio.patch @@ -0,0 +1,73 @@ +From 84bb266d43de21ea4f6f7a4ea0259e4a453ca999 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Jun 2026 11:13:26 +0000 +Subject: xfrm: hold device only for the asynchronous decryption + +From: Jianbo Liu + +[ Upstream commit b05d42eefac737ce3cd80114d3579111023941b8 ] + +The dev_hold() on skb->dev during packet reception was originally +added to prevent the device from being released prematurely during +asynchronous decryption operations. + +As current hardware can offload decryption, this asynchronous path is +not always utilized. This often results in a pattern of dev_hold() +immediately followed by dev_put() for each packet, creating +unnecessary reference counting overhead detrimental to performance. + +This patch optimizes this by skipping the dev_hold() and subsequent +dev_put() when asynchronous decryption is not being performed. + +Signed-off-by: Jianbo Liu +Reviewed-by: Cosmin Ratiu +Signed-off-by: Steffen Klassert +Stable-dep-of: 1c428b038400 ("xfrm: hold dev ref until after transport_finish NF_HOOK") +Signed-off-by: Simon Liebold +Signed-off-by: Sasha Levin +--- + net/xfrm/xfrm_input.c | 17 +++++++++-------- + 1 file changed, 9 insertions(+), 8 deletions(-) + +diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c +index 8edcb32735e595..90a79558dca259 100644 +--- a/net/xfrm/xfrm_input.c ++++ b/net/xfrm/xfrm_input.c +@@ -492,6 +492,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) + /* An encap_type of -1 indicates async resumption. 
*/ + if (encap_type == -1) { + async = 1; ++ dev_put(skb->dev); + seq = XFRM_SKB_CB(skb)->seq.input.low; + goto resume; + } +@@ -638,18 +639,18 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) + XFRM_SKB_CB(skb)->seq.input.low = seq; + XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; + +- dev_hold(skb->dev); +- +- if (crypto_done) ++ if (crypto_done) { + nexthdr = x->type_offload->input_tail(x, skb); +- else ++ } else { ++ dev_hold(skb->dev); ++ + nexthdr = x->type->input(x, skb); ++ if (nexthdr == -EINPROGRESS) ++ return 0; + +- if (nexthdr == -EINPROGRESS) +- return 0; ++ dev_put(skb->dev); ++ } + resume: +- dev_put(skb->dev); +- + spin_lock(&x->lock); + if (nexthdr < 0) { + if (nexthdr == -EBADMSG) { +-- +2.53.0 + diff --git a/queue-6.18/kvm-vmx-update-svi-during-runtime-apicv-activation.patch b/queue-6.18/kvm-vmx-update-svi-during-runtime-apicv-activation.patch new file mode 100644 index 0000000000..88b0cd0eba --- /dev/null +++ b/queue-6.18/kvm-vmx-update-svi-during-runtime-apicv-activation.patch @@ -0,0 +1,156 @@ +From 6bcc61ad9ab1c9a039d5c6601cca2f09b3871e95 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Jun 2026 14:10:01 -0700 +Subject: KVM: VMX: Update SVI during runtime APICv activation + +From: Dongli Zhang + +commit b2849bec936be642b5420801f902337f2507648e upstream. + +The APICv (apic->apicv_active) can be activated or deactivated at runtime, +for instance, because of APICv inhibit reasons. Intel VMX employs different +mechanisms to virtualize LAPIC based on whether APICv is active. + +When APICv is activated at runtime, GUEST_INTR_STATUS is used to configure +and report the current pending IRR and ISR states. Unless a specific vector +is explicitly included in EOI_EXIT_BITMAP, its EOI will not be trapped to +KVM. Intel VMX automatically clears the corresponding ISR bit based on the +GUEST_INTR_STATUS.SVI field. 
+ +When APICv is deactivated at runtime, the VM_ENTRY_INTR_INFO_FIELD is used +to specify the next interrupt vector to invoke upon VM-entry. The +VMX IDT_VECTORING_INFO_FIELD is used to report un-invoked vectors on +VM-exit. EOIs are always trapped to KVM, so the software can manually clear +pending ISR bits. + +There are scenarios where, with APICv activated at runtime, a guest-issued +EOI may not be able to clear the pending ISR bit. + +Taking vector 236 as an example, here is one scenario. + +1. Suppose APICv is inactive. Vector 236 is pending in the IRR. +2. To handle KVM_REQ_EVENT, KVM moves vector 236 from the IRR to the ISR, +and configures the VM_ENTRY_INTR_INFO_FIELD via vmx_inject_irq(). +3. After VM-entry, vector 236 is invoked through the guest IDT. At this +point, the data in VM_ENTRY_INTR_INFO_FIELD is no longer valid. The guest +interrupt handler for vector 236 is invoked. +4. Suppose a VM exit occurs very early in the guest interrupt handler, +before the EOI is issued. +5. Nothing is reported through the IDT_VECTORING_INFO_FIELD because +vector 236 has already been invoked in the guest. +6. Now, suppose APICv is activated. Before the next VM-entry, KVM calls +kvm_vcpu_update_apicv() to activate APICv. +7. Unfortunately, GUEST_INTR_STATUS.SVI is not configured, although +vector 236 is still pending in the ISR. +8. After VM-entry, the guest finally issues the EOI for vector 236. +However, because SVI is not configured, vector 236 is not cleared. +9. ISR is stalled forever on vector 236. + +Here is another scenario. + +1. Suppose APICv is inactive. Vector 236 is pending in the IRR. +2. To handle KVM_REQ_EVENT, KVM moves vector 236 from the IRR to the ISR, +and configures the VM_ENTRY_INTR_INFO_FIELD via vmx_inject_irq(). +3. VM-exit occurs immediately after the next VM-entry. The vector 236 is +not invoked through the guest IDT. Instead, it is saved to the +IDT_VECTORING_INFO_FIELD during the VM-exit. +4. 
KVM calls kvm_queue_interrupt() to re-queue the un-invoked vector 236 +into vcpu->arch.interrupt. A KVM_REQ_EVENT is requested. +5. Now, suppose APICv is activated. Before the next VM-entry, KVM calls +kvm_vcpu_update_apicv() to activate APICv. +6. Although APICv is now active, KVM still uses the legacy +VM_ENTRY_INTR_INFO_FIELD to re-inject vector 236. GUEST_INTR_STATUS.SVI is +not configured. +7. After the next VM-entry, vector 236 is invoked through the guest IDT. +Finally, an EOI occurs. However, due to the lack of GUEST_INTR_STATUS.SVI +configuration, vector 236 is not cleared from the ISR. +8. ISR is stalled forever on vector 236. + +Using QEMU as an example, vector 236 is stuck in ISR forever. + +(qemu) info lapic 1 +dumping local APIC state for CPU 1 + +LVT0 0x00010700 active-hi edge masked ExtINT (vec 0) +LVT1 0x00010400 active-hi edge masked NMI +LVTPC 0x00000400 active-hi edge NMI +LVTERR 0x000000fe active-hi edge Fixed (vec 254) +LVTTHMR 0x00010000 active-hi edge masked Fixed (vec 0) +LVTT 0x000400ec active-hi edge tsc-deadline Fixed (vec 236) +Timer DCR=0x0 (divide by 2) initial_count = 0 current_count = 0 +SPIV 0x000001ff APIC enabled, focus=off, spurious vec 255 +ICR 0x000000fd physical edge de-assert no-shorthand +ICR2 0x00000000 cpu 0 (X2APIC ID) +ESR 0x00000000 +ISR 236 +IRR 37(level) 236 + +The issue isn't applicable to AMD SVM as KVM simply writes vmcb01 directly +irrespective of whether L1 (vmcs01) or L2 (vmcb02) is active (unlike VMX, +there is no need/cost to switch between VMCBs). In addition, +APICV_INHIBIT_REASON_IRQWIN ensures AMD SVM AVIC is not activated until +the last interrupt is EOI'd. + +Fix the bug by configuring Intel VMX GUEST_INTR_STATUS.SVI if APICv is +activated at runtime. 
+ +Signed-off-by: Dongli Zhang +Reviewed-by: Chao Gao +Link: https://patch.msgid.link/20251110063212.34902-1-dongli.zhang@oracle.com +[sean: call out that SVM writes vmcb01 directly, tweak comment] +Link: https://patch.msgid.link/20251205231913.441872-2-seanjc@google.com +Signed-off-by: Sean Christopherson +(cherry picked from commit b2849bec936be642b5420801f902337f2507648e) +Cc: stable@vger.kernel.org # 6.6.x and above +Cc: Gulshan Gabel +Signed-off-by: Jon Kohler +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/vmx.c | 9 --------- + arch/x86/kvm/x86.c | 7 +++++++ + 2 files changed, 7 insertions(+), 9 deletions(-) + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index c084f48e2b0b98..b7798ced7b505c 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -6886,15 +6886,6 @@ void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) + * VM-Exit, otherwise L1 with run with a stale SVI. + */ + if (is_guest_mode(vcpu)) { +- /* +- * KVM is supposed to forward intercepted L2 EOIs to L1 if VID +- * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC. +- * Note, userspace can stuff state while L2 is active; assert +- * that VID is disabled if and only if the vCPU is in KVM_RUN +- * to avoid false positives if userspace is setting APIC state. +- */ +- WARN_ON_ONCE(vcpu->wants_to_run && +- nested_cpu_has_vid(get_vmcs12(vcpu))); + to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true; + return; + } +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index ad2b7158b9c8ea..a21ebe04aa23a8 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10950,9 +10950,16 @@ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) + * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was + * still active when the interrupt got accepted. Make sure + * kvm_check_and_inject_events() is called to check for that. 
++ * ++ * Update SVI when APICv gets enabled, otherwise SVI won't reflect the ++ * highest bit in vISR and the next accelerated EOI in the guest won't ++ * be virtualized correctly (the CPU uses SVI to determine which vISR ++ * vector to clear). + */ + if (!apic->apicv_active) + kvm_make_request(KVM_REQ_EVENT, vcpu); ++ else ++ kvm_apic_update_hwapic_isr(vcpu); + + out: + preempt_enable(); +-- +2.53.0 + diff --git a/queue-6.18/series b/queue-6.18/series index 8646c259cb..f5057e7178 100644 --- a/queue-6.18/series +++ b/queue-6.18/series @@ -134,3 +134,4 @@ net-txgbe-support-cr-modules-for-aml-devices.patch net-txgbe-rename-the-sfp-related.patch net-txgbe-initialize-module-info-buffer.patch ipv6-fix-a-potential-npd-in-cleanup_prefix_route.patch +kvm-vmx-update-svi-during-runtime-apicv-activation.patch diff --git a/queue-6.6/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch b/queue-6.6/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch new file mode 100644 index 0000000000..ebfd696dc2 --- /dev/null +++ b/queue-6.6/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch @@ -0,0 +1,221 @@ +From 56ce9e8fa76f29b011850684b9bc25901e6071dc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Jun 2026 20:24:08 +0000 +Subject: netfilter: ctnetlink: ensure safe access to master conntrack + +From: Pablo Neira Ayuso + +[ Upstream commit bffcaad9afdfe45d7fc777397d3b83c1e3ebffe5 ] + +Holding reference on the expectation is not sufficient, the master +conntrack object can just go away, making exp->master invalid. + +To access exp->master safely: + +- Grab the nf_conntrack_expect_lock, this gets serialized with + clean_from_lists() which also holds this lock when the master + conntrack goes away. + +- Hold reference on master conntrack via nf_conntrack_find_get(). + Not so easy since the master tuple to look up for the master conntrack + is not available in the existing problematic paths. 
+ +This patch goes for extending the nf_conntrack_expect_lock section +to address this issue for simplicity; in the cases that are described +below this is just slightly extending the lock section. + +The add expectation command already holds a reference to the master +conntrack from ctnetlink_create_expect(). + +However, the delete expectation command needs to grab the spinlock +before looking up the expectation. Expand the existing spinlock +section to address this to cover the expectation lookup. Note that +the nf_ct_expect_iterate_net() calls already grab the spinlock while +iterating over the expectation table, which is correct. + +The get expectation command needs to grab the spinlock to ensure master +conntrack does not go away. This also expands the existing spinlock +section to cover the expectation lookup too. I needed to move the +netlink skb allocation out of the spinlock to keep it GFP_KERNEL. + +For the expectation events, the IPEXP_DESTROY event is already delivered +under the spinlock; just move the delivery of IPEXP_NEW under the +spinlock too because the master conntrack event cache is reached through +exp->master. + +While at it, add lockdep annotations to help identify what codepaths need +to grab the spinlock. 
+ +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +[ fix timer_delete -> del_timer in diff context lines since 8fa7292 +("treewide: Switch/rename to timer_delete[_sync]()") landed in 6.15 ] +Signed-off-by: Mark Bundschuh +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_conntrack_core.h | 5 ++++ + net/netfilter/nf_conntrack_ecache.c | 2 ++ + net/netfilter/nf_conntrack_expect.c | 10 +++++++- + net/netfilter/nf_conntrack_netlink.c | 28 +++++++++++++++-------- + 4 files changed, 35 insertions(+), 10 deletions(-) + +diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h +index 3384859a892101..8883575adcc1e7 100644 +--- a/include/net/netfilter/nf_conntrack_core.h ++++ b/include/net/netfilter/nf_conntrack_core.h +@@ -83,6 +83,11 @@ void nf_conntrack_lock(spinlock_t *lock); + + extern spinlock_t nf_conntrack_expect_lock; + ++static inline void lockdep_nfct_expect_lock_held(void) ++{ ++ lockdep_assert_held(&nf_conntrack_expect_lock); ++} ++ + /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ + + static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout) +diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c +index 69948e1d6974e3..6526bdcca580fd 100644 +--- a/net/netfilter/nf_conntrack_ecache.c ++++ b/net/netfilter/nf_conntrack_ecache.c +@@ -237,6 +237,8 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, + struct nf_ct_event_notifier *notify; + struct nf_conntrack_ecache *e; + ++ lockdep_nfct_expect_lock_held(); ++ + rcu_read_lock(); + notify = rcu_dereference(net->ct.nf_conntrack_event_cb); + if (!notify) +diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c +index 70bcddfc17ccc2..379711ea5ab67e 100644 +--- a/net/netfilter/nf_conntrack_expect.c ++++ b/net/netfilter/nf_conntrack_expect.c +@@ -51,6 +51,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, + struct net 
*net = nf_ct_exp_net(exp); + struct nf_conntrack_net *cnet; + ++ lockdep_nfct_expect_lock_held(); + WARN_ON(!master_help); + WARN_ON(timer_pending(&exp->timeout)); + +@@ -118,6 +119,8 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, + + bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) + { ++ lockdep_nfct_expect_lock_held(); ++ + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); +@@ -177,6 +180,8 @@ nf_ct_find_expectation(struct net *net, + struct nf_conntrack_expect *i, *exp = NULL; + unsigned int h; + ++ lockdep_nfct_expect_lock_held(); ++ + if (!cnet->expect_count) + return NULL; + +@@ -459,6 +464,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, + unsigned int h; + int ret = 0; + ++ lockdep_nfct_expect_lock_held(); ++ + if (!master_help) { + ret = -ESHUTDOWN; + goto out; +@@ -515,8 +522,9 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, + + nf_ct_expect_insert(expect); + +- spin_unlock_bh(&nf_conntrack_expect_lock); + nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); ++ spin_unlock_bh(&nf_conntrack_expect_lock); ++ + return 0; + out: + spin_unlock_bh(&nf_conntrack_expect_lock); +diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c +index 255996f43d854c..eff5008f5e9d4e 100644 +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -3326,31 +3326,37 @@ static int ctnetlink_get_expect(struct sk_buff *skb, + if (err < 0) + return err; + ++ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); ++ if (!skb2) ++ return -ENOMEM; ++ ++ spin_lock_bh(&nf_conntrack_expect_lock); + exp = nf_ct_expect_find_get(info->net, &zone, &tuple); +- if (!exp) ++ if (!exp) { ++ spin_unlock_bh(&nf_conntrack_expect_lock); ++ kfree_skb(skb2); + return -ENOENT; ++ } + + if (cda[CTA_EXPECT_ID]) { + __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); + + if (id != nf_expect_get_id(exp)) { + nf_ct_expect_put(exp); ++ 
spin_unlock_bh(&nf_conntrack_expect_lock); ++ kfree_skb(skb2); + return -ENOENT; + } + } + +- skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +- if (!skb2) { +- nf_ct_expect_put(exp); +- return -ENOMEM; +- } +- + rcu_read_lock(); + err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid, + info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, + exp); + rcu_read_unlock(); + nf_ct_expect_put(exp); ++ spin_unlock_bh(&nf_conntrack_expect_lock); ++ + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; +@@ -3397,22 +3403,26 @@ static int ctnetlink_del_expect(struct sk_buff *skb, + if (err < 0) + return err; + ++ spin_lock_bh(&nf_conntrack_expect_lock); ++ + /* bump usage count to 2 */ + exp = nf_ct_expect_find_get(info->net, &zone, &tuple); +- if (!exp) ++ if (!exp) { ++ spin_unlock_bh(&nf_conntrack_expect_lock); + return -ENOENT; ++ } + + if (cda[CTA_EXPECT_ID]) { + __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); + + if (id != nf_expect_get_id(exp)) { + nf_ct_expect_put(exp); ++ spin_unlock_bh(&nf_conntrack_expect_lock); + return -ENOENT; + } + } + + /* after list removal, usage count == 1 */ +- spin_lock_bh(&nf_conntrack_expect_lock); + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, + nlmsg_report(info->nlh)); +-- +2.53.0 + diff --git a/queue-6.6/series b/queue-6.6/series index ce46b5c3fb..ffa706ab8d 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -283,3 +283,4 @@ net-mvpp2-add-metadata-support-for-xdp-mode.patch net-mvpp2-refill-rx-buffers-before-xdp-or-skb-use.patch net-mvpp2-build-skb-from-xdp-adjusted-data-on-xdp_pa.patch ipv6-fix-a-potential-npd-in-cleanup_prefix_route.patch +netfilter-ctnetlink-ensure-safe-access-to-master-con.patch