git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for all trees
author Sasha Levin <sashal@kernel.org>
Sat, 13 Jun 2026 14:51:50 +0000 (10:51 -0400)
committer Sasha Levin <sashal@kernel.org>
Sat, 13 Jun 2026 14:51:50 +0000 (10:51 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
12 files changed:
queue-6.1/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch [new file with mode: 0644]
queue-6.1/series
queue-6.12/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch [new file with mode: 0644]
queue-6.12/series
queue-6.12/writeback-avoid-contention-on-wb-list_lock-when-swit.patch [new file with mode: 0644]
queue-6.12/writeback-fix-use-after-free-in-inode_switch_wbs_wor.patch [new file with mode: 0644]
queue-6.12/xfrm-hold-dev-ref-until-after-transport_finish-nf_ho.patch [new file with mode: 0644]
queue-6.12/xfrm-hold-device-only-for-the-asynchronous-decryptio.patch [new file with mode: 0644]
queue-6.18/kvm-vmx-update-svi-during-runtime-apicv-activation.patch [new file with mode: 0644]
queue-6.18/series
queue-6.6/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch [new file with mode: 0644]
queue-6.6/series

diff --git a/queue-6.1/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch b/queue-6.1/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch
new file mode 100644 (file)
index 0000000..f71ad7b
--- /dev/null
@@ -0,0 +1,221 @@
+From 07756cb6faa6228c3f491e96f7b0aebd62ec4bc9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 12 Jun 2026 20:39:06 +0000
+Subject: netfilter: ctnetlink: ensure safe access to master conntrack
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+[ Upstream commit bffcaad9afdfe45d7fc777397d3b83c1e3ebffe5 ]
+
+Holding reference on the expectation is not sufficient, the master
+conntrack object can just go away, making exp->master invalid.
+
+To access exp->master safely:
+
+- Grab the nf_conntrack_expect_lock, this gets serialized with
+  clean_from_lists() which also holds this lock when the master
+  conntrack goes away.
+
+- Hold reference on master conntrack via nf_conntrack_find_get().
+  Not so easy since the master tuple to look up for the master conntrack
+  is not available in the existing problematic paths.
+
+This patch goes for extending the nf_conntrack_expect_lock section
+to address this issue for simplicity, in the cases that are described
+below this is just slightly extending the lock section.
+
+The add expectation command already holds a reference to the master
+conntrack from ctnetlink_create_expect().
+
+However, the delete expectation command needs to grab the spinlock
+before looking up the expectation. Expand the existing spinlock
+section so that it also covers the expectation lookup. Note that
+the nf_ct_expect_iterate_net() calls already grab the spinlock while
+iterating over the expectation table, which is correct.
+
+The get expectation command needs to grab the spinlock to ensure master
+conntrack does not go away. This also expands the existing spinlock
+section to cover the expectation lookup too. I needed to move the
+netlink skb allocation out of the spinlock to keep it GFP_KERNEL.
+
+For the expectation events, the IPEXP_DESTROY event is already delivered
+under the spinlock, just move the delivery of IPEXP_NEW under the
+spinlock too because the master conntrack event cache is reached through
+exp->master.
+
+While at it, add lockdep notations to help identify what codepaths need
+to grab the spinlock.
+
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+[ fix timer_delete -> del_timer in diff context lines since 8fa7292
+("treewide: Switch/rename to timer_delete[_sync]()") landed in 6.15 ]
+Signed-off-by: Mark Bundschuh <mkbund@amazon.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/netfilter/nf_conntrack_core.h |  5 ++++
+ net/netfilter/nf_conntrack_ecache.c       |  2 ++
+ net/netfilter/nf_conntrack_expect.c       | 10 +++++++-
+ net/netfilter/nf_conntrack_netlink.c      | 28 +++++++++++++++--------
+ 4 files changed, 35 insertions(+), 10 deletions(-)
+
+diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
+index a36f87af415c22..8ea16b0ba1c982 100644
+--- a/include/net/netfilter/nf_conntrack_core.h
++++ b/include/net/netfilter/nf_conntrack_core.h
+@@ -84,6 +84,11 @@ void nf_conntrack_lock(spinlock_t *lock);
+ extern spinlock_t nf_conntrack_expect_lock;
++static inline void lockdep_nfct_expect_lock_held(void)
++{
++      lockdep_assert_held(&nf_conntrack_expect_lock);
++}
++
+ /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
+ static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout)
+diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
+index 69948e1d6974e3..6526bdcca580fd 100644
+--- a/net/netfilter/nf_conntrack_ecache.c
++++ b/net/netfilter/nf_conntrack_ecache.c
+@@ -237,6 +237,8 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
+       struct nf_ct_event_notifier *notify;
+       struct nf_conntrack_ecache *e;
++      lockdep_nfct_expect_lock_held();
++
+       rcu_read_lock();
+       notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
+       if (!notify)
+diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
+index 70bcddfc17ccc2..379711ea5ab67e 100644
+--- a/net/netfilter/nf_conntrack_expect.c
++++ b/net/netfilter/nf_conntrack_expect.c
+@@ -51,6 +51,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
+       struct net *net = nf_ct_exp_net(exp);
+       struct nf_conntrack_net *cnet;
++      lockdep_nfct_expect_lock_held();
+       WARN_ON(!master_help);
+       WARN_ON(timer_pending(&exp->timeout));
+@@ -118,6 +119,8 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple,
+ bool nf_ct_remove_expect(struct nf_conntrack_expect *exp)
+ {
++      lockdep_nfct_expect_lock_held();
++
+       if (del_timer(&exp->timeout)) {
+               nf_ct_unlink_expect(exp);
+               nf_ct_expect_put(exp);
+@@ -177,6 +180,8 @@ nf_ct_find_expectation(struct net *net,
+       struct nf_conntrack_expect *i, *exp = NULL;
+       unsigned int h;
++      lockdep_nfct_expect_lock_held();
++
+       if (!cnet->expect_count)
+               return NULL;
+@@ -459,6 +464,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
+       unsigned int h;
+       int ret = 0;
++      lockdep_nfct_expect_lock_held();
++
+       if (!master_help) {
+               ret = -ESHUTDOWN;
+               goto out;
+@@ -515,8 +522,9 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
+       nf_ct_expect_insert(expect);
+-      spin_unlock_bh(&nf_conntrack_expect_lock);
+       nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
++      spin_unlock_bh(&nf_conntrack_expect_lock);
++
+       return 0;
+ out:
+       spin_unlock_bh(&nf_conntrack_expect_lock);
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index bcbd77608365a9..f6e9d9bc18864a 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -3330,31 +3330,37 @@ static int ctnetlink_get_expect(struct sk_buff *skb,
+       if (err < 0)
+               return err;
++      skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
++      if (!skb2)
++              return -ENOMEM;
++
++      spin_lock_bh(&nf_conntrack_expect_lock);
+       exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
+-      if (!exp)
++      if (!exp) {
++              spin_unlock_bh(&nf_conntrack_expect_lock);
++              kfree_skb(skb2);
+               return -ENOENT;
++      }
+       if (cda[CTA_EXPECT_ID]) {
+               __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
+               if (id != nf_expect_get_id(exp)) {
+                       nf_ct_expect_put(exp);
++                      spin_unlock_bh(&nf_conntrack_expect_lock);
++                      kfree_skb(skb2);
+                       return -ENOENT;
+               }
+       }
+-      skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+-      if (!skb2) {
+-              nf_ct_expect_put(exp);
+-              return -ENOMEM;
+-      }
+-
+       rcu_read_lock();
+       err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid,
+                                     info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
+                                     exp);
+       rcu_read_unlock();
+       nf_ct_expect_put(exp);
++      spin_unlock_bh(&nf_conntrack_expect_lock);
++
+       if (err <= 0) {
+               kfree_skb(skb2);
+               return -ENOMEM;
+@@ -3401,22 +3407,26 @@ static int ctnetlink_del_expect(struct sk_buff *skb,
+               if (err < 0)
+                       return err;
++              spin_lock_bh(&nf_conntrack_expect_lock);
++
+               /* bump usage count to 2 */
+               exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
+-              if (!exp)
++              if (!exp) {
++                      spin_unlock_bh(&nf_conntrack_expect_lock);
+                       return -ENOENT;
++              }
+               if (cda[CTA_EXPECT_ID]) {
+                       __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
+                       if (id != nf_expect_get_id(exp)) {
+                               nf_ct_expect_put(exp);
++                              spin_unlock_bh(&nf_conntrack_expect_lock);
+                               return -ENOENT;
+                       }
+               }
+               /* after list removal, usage count == 1 */
+-              spin_lock_bh(&nf_conntrack_expect_lock);
+               if (del_timer(&exp->timeout)) {
+                       nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
+                                                  nlmsg_report(info->nlh));
+-- 
+2.53.0
+
index e8ca96465edcb5ba7bd43d3812b0cb49abad3656..28392a84cef57260863b3e6cbbe781e07d95c268 100644 (file)
@@ -261,3 +261,4 @@ net-mvpp2-limit-xdp-frame-size-to-the-rx-buffer.patch
 net-mvpp2-add-metadata-support-for-xdp-mode.patch
 net-mvpp2-refill-rx-buffers-before-xdp-or-skb-use.patch
 net-mvpp2-build-skb-from-xdp-adjusted-data-on-xdp_pa.patch
+netfilter-ctnetlink-ensure-safe-access-to-master-con.patch
diff --git a/queue-6.12/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch b/queue-6.12/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch
new file mode 100644 (file)
index 0000000..8d3264d
--- /dev/null
@@ -0,0 +1,221 @@
+From 2cd4803c24ecf6e069ec3ca6b04719906fe45815 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 12 Jun 2026 19:07:05 +0000
+Subject: netfilter: ctnetlink: ensure safe access to master conntrack
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+[ Upstream commit bffcaad9afdfe45d7fc777397d3b83c1e3ebffe5 ]
+
+Holding reference on the expectation is not sufficient, the master
+conntrack object can just go away, making exp->master invalid.
+
+To access exp->master safely:
+
+- Grab the nf_conntrack_expect_lock, this gets serialized with
+  clean_from_lists() which also holds this lock when the master
+  conntrack goes away.
+
+- Hold reference on master conntrack via nf_conntrack_find_get().
+  Not so easy since the master tuple to look up for the master conntrack
+  is not available in the existing problematic paths.
+
+This patch goes for extending the nf_conntrack_expect_lock section
+to address this issue for simplicity, in the cases that are described
+below this is just slightly extending the lock section.
+
+The add expectation command already holds a reference to the master
+conntrack from ctnetlink_create_expect().
+
+However, the delete expectation command needs to grab the spinlock
+before looking up the expectation. Expand the existing spinlock
+section so that it also covers the expectation lookup. Note that
+the nf_ct_expect_iterate_net() calls already grab the spinlock while
+iterating over the expectation table, which is correct.
+
+The get expectation command needs to grab the spinlock to ensure master
+conntrack does not go away. This also expands the existing spinlock
+section to cover the expectation lookup too. I needed to move the
+netlink skb allocation out of the spinlock to keep it GFP_KERNEL.
+
+For the expectation events, the IPEXP_DESTROY event is already delivered
+under the spinlock, just move the delivery of IPEXP_NEW under the
+spinlock too because the master conntrack event cache is reached through
+exp->master.
+
+While at it, add lockdep notations to help identify what codepaths need
+to grab the spinlock.
+
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+[ fix timer_delete -> del_timer in diff context lines since 8fa7292
+("treewide: Switch/rename to timer_delete[_sync]()") landed in 6.15 ]
+Signed-off-by: Mark Bundschuh <mkbund@amazon.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/netfilter/nf_conntrack_core.h |  5 ++++
+ net/netfilter/nf_conntrack_ecache.c       |  2 ++
+ net/netfilter/nf_conntrack_expect.c       | 10 +++++++-
+ net/netfilter/nf_conntrack_netlink.c      | 28 +++++++++++++++--------
+ 4 files changed, 35 insertions(+), 10 deletions(-)
+
+diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
+index 3384859a892101..8883575adcc1e7 100644
+--- a/include/net/netfilter/nf_conntrack_core.h
++++ b/include/net/netfilter/nf_conntrack_core.h
+@@ -83,6 +83,11 @@ void nf_conntrack_lock(spinlock_t *lock);
+ extern spinlock_t nf_conntrack_expect_lock;
++static inline void lockdep_nfct_expect_lock_held(void)
++{
++      lockdep_assert_held(&nf_conntrack_expect_lock);
++}
++
+ /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
+ static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout)
+diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
+index 69948e1d6974e3..6526bdcca580fd 100644
+--- a/net/netfilter/nf_conntrack_ecache.c
++++ b/net/netfilter/nf_conntrack_ecache.c
+@@ -237,6 +237,8 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
+       struct nf_ct_event_notifier *notify;
+       struct nf_conntrack_ecache *e;
++      lockdep_nfct_expect_lock_held();
++
+       rcu_read_lock();
+       notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
+       if (!notify)
+diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
+index f5c45989df5736..bb8b87f9ee50da 100644
+--- a/net/netfilter/nf_conntrack_expect.c
++++ b/net/netfilter/nf_conntrack_expect.c
+@@ -51,6 +51,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
+       struct net *net = nf_ct_exp_net(exp);
+       struct nf_conntrack_net *cnet;
++      lockdep_nfct_expect_lock_held();
+       WARN_ON(!master_help);
+       WARN_ON(timer_pending(&exp->timeout));
+@@ -118,6 +119,8 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple,
+ bool nf_ct_remove_expect(struct nf_conntrack_expect *exp)
+ {
++      lockdep_nfct_expect_lock_held();
++
+       if (del_timer(&exp->timeout)) {
+               nf_ct_unlink_expect(exp);
+               nf_ct_expect_put(exp);
+@@ -177,6 +180,8 @@ nf_ct_find_expectation(struct net *net,
+       struct nf_conntrack_expect *i, *exp = NULL;
+       unsigned int h;
++      lockdep_nfct_expect_lock_held();
++
+       if (!cnet->expect_count)
+               return NULL;
+@@ -459,6 +464,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
+       unsigned int h;
+       int ret = 0;
++      lockdep_nfct_expect_lock_held();
++
+       if (!master_help) {
+               ret = -ESHUTDOWN;
+               goto out;
+@@ -515,8 +522,9 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
+       nf_ct_expect_insert(expect);
+-      spin_unlock_bh(&nf_conntrack_expect_lock);
+       nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
++      spin_unlock_bh(&nf_conntrack_expect_lock);
++
+       return 0;
+ out:
+       spin_unlock_bh(&nf_conntrack_expect_lock);
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index f51cdfba68fbdb..507f17722f375b 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -3332,31 +3332,37 @@ static int ctnetlink_get_expect(struct sk_buff *skb,
+       if (err < 0)
+               return err;
++      skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
++      if (!skb2)
++              return -ENOMEM;
++
++      spin_lock_bh(&nf_conntrack_expect_lock);
+       exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
+-      if (!exp)
++      if (!exp) {
++              spin_unlock_bh(&nf_conntrack_expect_lock);
++              kfree_skb(skb2);
+               return -ENOENT;
++      }
+       if (cda[CTA_EXPECT_ID]) {
+               __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
+               if (id != nf_expect_get_id(exp)) {
+                       nf_ct_expect_put(exp);
++                      spin_unlock_bh(&nf_conntrack_expect_lock);
++                      kfree_skb(skb2);
+                       return -ENOENT;
+               }
+       }
+-      skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+-      if (!skb2) {
+-              nf_ct_expect_put(exp);
+-              return -ENOMEM;
+-      }
+-
+       rcu_read_lock();
+       err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid,
+                                     info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
+                                     exp);
+       rcu_read_unlock();
+       nf_ct_expect_put(exp);
++      spin_unlock_bh(&nf_conntrack_expect_lock);
++
+       if (err <= 0) {
+               kfree_skb(skb2);
+               return -ENOMEM;
+@@ -3403,22 +3409,26 @@ static int ctnetlink_del_expect(struct sk_buff *skb,
+               if (err < 0)
+                       return err;
++              spin_lock_bh(&nf_conntrack_expect_lock);
++
+               /* bump usage count to 2 */
+               exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
+-              if (!exp)
++              if (!exp) {
++                      spin_unlock_bh(&nf_conntrack_expect_lock);
+                       return -ENOENT;
++              }
+               if (cda[CTA_EXPECT_ID]) {
+                       __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
+                       if (id != nf_expect_get_id(exp)) {
+                               nf_ct_expect_put(exp);
++                              spin_unlock_bh(&nf_conntrack_expect_lock);
+                               return -ENOENT;
+                       }
+               }
+               /* after list removal, usage count == 1 */
+-              spin_lock_bh(&nf_conntrack_expect_lock);
+               if (del_timer(&exp->timeout)) {
+                       nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
+                                                  nlmsg_report(info->nlh));
+-- 
+2.53.0
+
index b73b76216809dc40eced0fd111243cc810a6726e..e792c43bb2b55ac4b77c59b646634aa9c3a3f5d1 100644 (file)
@@ -104,3 +104,8 @@ net-mvpp2-add-metadata-support-for-xdp-mode.patch
 net-mvpp2-refill-rx-buffers-before-xdp-or-skb-use.patch
 net-mvpp2-build-skb-from-xdp-adjusted-data-on-xdp_pa.patch
 ipv6-fix-a-potential-npd-in-cleanup_prefix_route.patch
+netfilter-ctnetlink-ensure-safe-access-to-master-con.patch
+writeback-avoid-contention-on-wb-list_lock-when-swit.patch
+writeback-fix-use-after-free-in-inode_switch_wbs_wor.patch
+xfrm-hold-device-only-for-the-asynchronous-decryptio.patch
+xfrm-hold-dev-ref-until-after-transport_finish-nf_ho.patch
diff --git a/queue-6.12/writeback-avoid-contention-on-wb-list_lock-when-swit.patch b/queue-6.12/writeback-avoid-contention-on-wb-list_lock-when-swit.patch
new file mode 100644 (file)
index 0000000..40c4a0f
--- /dev/null
@@ -0,0 +1,300 @@
+From cc73bfdc8a332fe69957503aa0ada8507b9fd7bf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 9 Apr 2025 17:12:59 +0200
+Subject: writeback: Avoid contention on wb->list_lock when switching inodes
+
+From: Jan Kara <jack@suse.cz>
+
+[ Upstream commit e1b849cfa6b61f1c866a908c9e8dd9b5aaab820b ]
+
+There can be multiple inode switch works that are trying to switch
+inodes to / from the same wb. This can happen in particular if some
+cgroup exits which owns many (thousands) inodes and we need to switch
+them all. In this case several inode_switch_wbs_work_fn() instances will
+be just spinning on the same wb->list_lock while only one of them makes
+forward progress. This wastes CPU cycles and quickly leads to softlockup
+reports and an unusable system.
+
+Instead of running several inode_switch_wbs_work_fn() instances in
+parallel switching to the same wb and contending on wb->list_lock, run
+just one work item per wb and manage a queue of isw items switching to
+this wb.
+
+Acked-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/fs-writeback.c                | 99 ++++++++++++++++++++------------
+ include/linux/backing-dev-defs.h |  4 ++
+ include/linux/writeback.h        |  2 +
+ mm/backing-dev.c                 |  5 ++
+ 4 files changed, 74 insertions(+), 36 deletions(-)
+
+diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
+index 45e90338fbb2df..a8d21a5f354859 100644
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -369,7 +369,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+ }
+ struct inode_switch_wbs_context {
+-      struct rcu_work         work;
++      /* List of queued switching contexts for the wb */
++      struct llist_node       list;
+       /*
+        * Multiple inodes can be switched at once.  The switching procedure
+@@ -379,7 +380,6 @@ struct inode_switch_wbs_context {
+        * array embedded into struct inode_switch_wbs_context.  Otherwise
+        * an inode could be left in a non-consistent state.
+        */
+-      struct bdi_writeback    *new_wb;
+       struct inode            *inodes[];
+ };
+@@ -488,13 +488,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
+       return switched;
+ }
+-static void inode_switch_wbs_work_fn(struct work_struct *work)
++static void process_inode_switch_wbs(struct bdi_writeback *new_wb,
++                                   struct inode_switch_wbs_context *isw)
+ {
+-      struct inode_switch_wbs_context *isw =
+-              container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
+       struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
+       struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
+-      struct bdi_writeback *new_wb = isw->new_wb;
+       unsigned long nr_switched = 0;
+       struct inode **inodep;
+@@ -554,6 +552,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
+       atomic_dec(&isw_nr_in_flight);
+ }
++void inode_switch_wbs_work_fn(struct work_struct *work)
++{
++      struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback,
++                                                  switch_work);
++      struct inode_switch_wbs_context *isw, *next_isw;
++      struct llist_node *list;
++
++      /*
++       * Grab out reference to wb so that it cannot get freed under us
++       * after we process all the isw items.
++       */
++      wb_get(new_wb);
++      while (1) {
++              list = llist_del_all(&new_wb->switch_wbs_ctxs);
++              /* Nothing to do? */
++              if (!list)
++                      break;
++              /*
++               * In addition to synchronizing among switchers, I_WB_SWITCH
++               * tells the RCU protected stat update paths to grab the i_page
++               * lock so that stat transfer can synchronize against them.
++               * Let's continue after I_WB_SWITCH is guaranteed to be
++               * visible.
++               */
++              synchronize_rcu();
++
++              llist_for_each_entry_safe(isw, next_isw, list, list)
++                      process_inode_switch_wbs(new_wb, isw);
++      }
++      wb_put(new_wb);
++}
++
+ static bool inode_prepare_wbs_switch(struct inode *inode,
+                                    struct bdi_writeback *new_wb)
+ {
+@@ -583,6 +613,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
+       return true;
+ }
++static void wb_queue_isw(struct bdi_writeback *wb,
++                       struct inode_switch_wbs_context *isw)
++{
++      if (llist_add(&isw->list, &wb->switch_wbs_ctxs))
++              queue_work(isw_wq, &wb->switch_work);
++}
++
+ /**
+  * inode_switch_wbs - change the wb association of an inode
+  * @inode: target inode
+@@ -596,6 +633,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
+       struct backing_dev_info *bdi = inode_to_bdi(inode);
+       struct cgroup_subsys_state *memcg_css;
+       struct inode_switch_wbs_context *isw;
++      struct bdi_writeback *new_wb = NULL;
+       /* noop if seems to be already in progress */
+       if (inode->i_state & I_WB_SWITCH)
+@@ -620,40 +658,34 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
+       if (!memcg_css)
+               goto out_free;
+-      isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
++      new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+       css_put(memcg_css);
+-      if (!isw->new_wb)
++      if (!new_wb)
+               goto out_free;
+-      if (!inode_prepare_wbs_switch(inode, isw->new_wb))
++      if (!inode_prepare_wbs_switch(inode, new_wb))
+               goto out_free;
+       isw->inodes[0] = inode;
+-      /*
+-       * In addition to synchronizing among switchers, I_WB_SWITCH tells
+-       * the RCU protected stat update paths to grab the i_page
+-       * lock so that stat transfer can synchronize against them.
+-       * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+-       */
+-      INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
+-      queue_rcu_work(isw_wq, &isw->work);
++      wb_queue_isw(new_wb, isw);
+       return;
+ out_free:
+       atomic_dec(&isw_nr_in_flight);
+-      if (isw->new_wb)
+-              wb_put(isw->new_wb);
++      if (new_wb)
++              wb_put(new_wb);
+       kfree(isw);
+ }
+-static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
++static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb,
++                                 struct inode_switch_wbs_context *isw,
+                                  struct list_head *list, int *nr)
+ {
+       struct inode *inode;
+       list_for_each_entry(inode, list, i_io_list) {
+-              if (!inode_prepare_wbs_switch(inode, isw->new_wb))
++              if (!inode_prepare_wbs_switch(inode, new_wb))
+                       continue;
+               isw->inodes[*nr] = inode;
+@@ -677,6 +709,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
+ {
+       struct cgroup_subsys_state *memcg_css;
+       struct inode_switch_wbs_context *isw;
++      struct bdi_writeback *new_wb;
+       int nr;
+       bool restart = false;
+@@ -689,12 +722,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
+       for (memcg_css = wb->memcg_css->parent; memcg_css;
+            memcg_css = memcg_css->parent) {
+-              isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
+-              if (isw->new_wb)
++              new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
++              if (new_wb)
+                       break;
+       }
+-      if (unlikely(!isw->new_wb))
+-              isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
++      if (unlikely(!new_wb))
++              new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
+       nr = 0;
+       spin_lock(&wb->list_lock);
+@@ -706,27 +739,21 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
+        * bandwidth restrictions, as writeback of inode metadata is not
+        * accounted for.
+        */
+-      restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
++      restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr);
+       if (!restart)
+-              restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
++              restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time,
++                                               &nr);
+       spin_unlock(&wb->list_lock);
+       /* no attached inodes? bail out */
+       if (nr == 0) {
+               atomic_dec(&isw_nr_in_flight);
+-              wb_put(isw->new_wb);
++              wb_put(new_wb);
+               kfree(isw);
+               return restart;
+       }
+-      /*
+-       * In addition to synchronizing among switchers, I_WB_SWITCH tells
+-       * the RCU protected stat update paths to grab the i_page
+-       * lock so that stat transfer can synchronize against them.
+-       * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+-       */
+-      INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
+-      queue_rcu_work(isw_wq, &isw->work);
++      wb_queue_isw(new_wb, isw);
+       return restart;
+ }
+diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
+index 2ad261082bba5f..c5c9d89c73edcc 100644
+--- a/include/linux/backing-dev-defs.h
++++ b/include/linux/backing-dev-defs.h
+@@ -152,6 +152,10 @@ struct bdi_writeback {
+       struct list_head blkcg_node;    /* anchored at blkcg->cgwb_list */
+       struct list_head b_attached;    /* attached inodes, protected by list_lock */
+       struct list_head offline_node;  /* anchored at offline_cgwbs */
++      struct work_struct switch_work; /* work used to perform inode switching
++                                       * to this wb */
++      struct llist_head switch_wbs_ctxs;      /* queued contexts for
++                                               * writeback switching */
+       union {
+               struct work_struct release_work;
+diff --git a/include/linux/writeback.h b/include/linux/writeback.h
+index 641a057e041329..b6bf90a7052599 100644
+--- a/include/linux/writeback.h
++++ b/include/linux/writeback.h
+@@ -293,6 +293,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
+               bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
+ }
++void inode_switch_wbs_work_fn(struct work_struct *work);
++
+ #else /* CONFIG_CGROUP_WRITEBACK */
+ static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
+diff --git a/mm/backing-dev.c b/mm/backing-dev.c
+index bf0594ceb3ff87..956a7e23b5d634 100644
+--- a/mm/backing-dev.c
++++ b/mm/backing-dev.c
+@@ -634,6 +634,7 @@ static void cgwb_release_workfn(struct work_struct *work)
+       wb_exit(wb);
+       bdi_put(bdi);
+       WARN_ON_ONCE(!list_empty(&wb->b_attached));
++      WARN_ON_ONCE(work_pending(&wb->switch_work));
+       call_rcu(&wb->rcu, cgwb_free_rcu);
+ }
+@@ -710,6 +711,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
+       wb->memcg_css = memcg_css;
+       wb->blkcg_css = blkcg_css;
+       INIT_LIST_HEAD(&wb->b_attached);
++      INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn);
++      init_llist_head(&wb->switch_wbs_ctxs);
+       INIT_WORK(&wb->release_work, cgwb_release_workfn);
+       set_bit(WB_registered, &wb->state);
+       bdi_get(bdi);
+@@ -840,6 +843,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
+       if (!ret) {
+               bdi->wb.memcg_css = &root_mem_cgroup->css;
+               bdi->wb.blkcg_css = blkcg_root_css;
++              INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn);
++              init_llist_head(&bdi->wb.switch_wbs_ctxs);
+       }
+       return ret;
+ }
+-- 
+2.53.0
+
diff --git a/queue-6.12/writeback-fix-use-after-free-in-inode_switch_wbs_wor.patch b/queue-6.12/writeback-fix-use-after-free-in-inode_switch_wbs_wor.patch
new file mode 100644 (file)
index 0000000..204da2d
--- /dev/null
@@ -0,0 +1,109 @@
+From f765f27fb5a2ce650ddda2e3de068ff0875778e8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 13 Apr 2026 11:36:19 +0200
+Subject: writeback: Fix use after free in inode_switch_wbs_work_fn()
+
+From: Jan Kara <jack@suse.cz>
+
+[ Upstream commit 6689f01d6740cf358932b3e97ee968c6099800d9 ]
+
+inode_switch_wbs_work_fn() has a loop like:
+
+  wb_get(new_wb);
+  while (1) {
+    list = llist_del_all(&new_wb->switch_wbs_ctxs);
+    /* Nothing to do? */
+    if (!list)
+      break;
+    ... process the items ...
+  }
+
+Now adding of items to the list looks like:
+
+wb_queue_isw()
+  if (llist_add(&isw->list, &wb->switch_wbs_ctxs))
+    queue_work(isw_wq, &wb->switch_work);
+
+Because inode_switch_wbs_work_fn() loops when processing isw items, it
+can happen that wb->switch_work is pending while wb->switch_wbs_ctxs is
+empty. This is a problem because in that case wb can get freed (no isw
+items -> no wb reference) while the work is still pending causing
+use-after-free issues.
+
+We cannot just fix this by cancelling work when freeing wb because that
+could still trigger problematic 0 -> 1 transitions on wb refcount due to
+wb_get() in inode_switch_wbs_work_fn(). It could be all handled with
+more careful code but that seems unnecessarily complex so let's avoid
+that until it is proven that the looping actually brings practical
+benefit. Just remove the loop from inode_switch_wbs_work_fn() instead.
+That way when wb_queue_isw() queues work, we are guaranteed we have
+added the first item to wb->switch_wbs_ctxs and nobody is going to
+remove it (and drop the wb reference it holds) until the queued work
+runs.
+
+Fixes: e1b849cfa6b6 ("writeback: Avoid contention on wb->list_lock when switching inodes")
+CC: stable@vger.kernel.org
+Signed-off-by: Jan Kara <jack@suse.cz>
+Link: https://patch.msgid.link/20260413093618.17244-2-jack@suse.cz
+Acked-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/fs-writeback.c | 36 +++++++++++++++++++-----------------
+ 1 file changed, 19 insertions(+), 17 deletions(-)
+
+diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
+index a8d21a5f354859..e8afd4fd26f98e 100644
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -559,28 +559,30 @@ void inode_switch_wbs_work_fn(struct work_struct *work)
+       struct inode_switch_wbs_context *isw, *next_isw;
+       struct llist_node *list;
++      list = llist_del_all(&new_wb->switch_wbs_ctxs);
+       /*
+-       * Grab out reference to wb so that it cannot get freed under us
++       * Nothing to do? That would be a problem as references held by isw
++       * items protect wb from freeing...
++       */
++      if (WARN_ON_ONCE(!list))
++              return;
++
++      /*
++       * Grab our reference to wb so that it cannot get freed under us
+        * after we process all the isw items.
+        */
+       wb_get(new_wb);
+-      while (1) {
+-              list = llist_del_all(&new_wb->switch_wbs_ctxs);
+-              /* Nothing to do? */
+-              if (!list)
+-                      break;
+-              /*
+-               * In addition to synchronizing among switchers, I_WB_SWITCH
+-               * tells the RCU protected stat update paths to grab the i_page
+-               * lock so that stat transfer can synchronize against them.
+-               * Let's continue after I_WB_SWITCH is guaranteed to be
+-               * visible.
+-               */
+-              synchronize_rcu();
++      /*
++       * In addition to synchronizing among switchers, I_WB_SWITCH
++       * tells the RCU protected stat update paths to grab the i_page
++       * lock so that stat transfer can synchronize against them.
++       * Let's continue after I_WB_SWITCH is guaranteed to be
++       * visible.
++       */
++      synchronize_rcu();
+-              llist_for_each_entry_safe(isw, next_isw, list, list)
+-                      process_inode_switch_wbs(new_wb, isw);
+-      }
++      llist_for_each_entry_safe(isw, next_isw, list, list)
++              process_inode_switch_wbs(new_wb, isw);
+       wb_put(new_wb);
+ }
+-- 
+2.53.0
+
diff --git a/queue-6.12/xfrm-hold-dev-ref-until-after-transport_finish-nf_ho.patch b/queue-6.12/xfrm-hold-dev-ref-until-after-transport_finish-nf_ho.patch
new file mode 100644 (file)
index 0000000..57f584d
--- /dev/null
@@ -0,0 +1,145 @@
+From 196fd569f080d6d4b0c74e505c2ca49e9e448e3e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 12 Jun 2026 11:13:27 +0000
+Subject: xfrm: hold dev ref until after transport_finish NF_HOOK
+
+From: Qi Tang <tpluszz77@gmail.com>
+
+[ Upstream commit 1c428b03840094410c5fb6a5db30640486bbbfcb ]
+
+After async crypto completes, xfrm_input_resume() calls dev_put()
+immediately on re-entry before the skb reaches transport_finish.
+The skb->dev pointer is then used inside NF_HOOK and its okfn,
+which can race with device teardown.
+
+Remove the dev_put from the async resumption entry and instead
+drop the reference after the NF_HOOK call in transport_finish,
+using a saved device pointer since NF_HOOK may consume the skb.
+This covers NF_DROP, NF_QUEUE and NF_STOLEN paths that skip
+the okfn.
+
+For non-transport exits (decaps, gro, drop) and secondary
+async return points, release the reference inline when
+async is set.
+
+Suggested-by: Florian Westphal <fw@strlen.de>
+Fixes: acf568ee859f ("xfrm: Reinject transport-mode packets through tasklet")
+Cc: stable@vger.kernel.org
+Signed-off-by: Qi Tang <tpluszz77@gmail.com>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+[ xfrm_inner_mode_input() always completes synchronously in this kernel
+version and cannot return -EINPROGRESS. That requires
+7ac64f4598b4 ("xfrm: add mode_cbs module functionality"), which is not
+present, so the async dev_put path is unreachable and the hunk was
+omitted ]
+Signed-off-by: Simon Liebold <simonlie@amazon.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/xfrm4_input.c |  5 ++++-
+ net/ipv6/xfrm6_input.c |  5 ++++-
+ net/xfrm/xfrm_input.c  | 12 ++++++++++--
+ 3 files changed, 18 insertions(+), 4 deletions(-)
+
+diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
+index 12a1a0f421956c..adf21d6b6076c1 100644
+--- a/net/ipv4/xfrm4_input.c
++++ b/net/ipv4/xfrm4_input.c
+@@ -50,6 +50,7 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
+ {
+       struct xfrm_offload *xo = xfrm_offload(skb);
+       struct iphdr *iph = ip_hdr(skb);
++      struct net_device *dev = skb->dev;
+       iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
+@@ -73,8 +74,10 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
+       }
+       NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+-              dev_net(skb->dev), NULL, skb, skb->dev, NULL,
++              dev_net(dev), NULL, skb, dev, NULL,
+               xfrm4_rcv_encap_finish);
++      if (async)
++              dev_put(dev);
+       return 0;
+ }
+diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
+index 9005fc156a20e6..699a001ac16629 100644
+--- a/net/ipv6/xfrm6_input.c
++++ b/net/ipv6/xfrm6_input.c
+@@ -43,6 +43,7 @@ static int xfrm6_transport_finish2(struct net *net, struct sock *sk,
+ int xfrm6_transport_finish(struct sk_buff *skb, int async)
+ {
+       struct xfrm_offload *xo = xfrm_offload(skb);
++      struct net_device *dev = skb->dev;
+       int nhlen = -skb_network_offset(skb);
+       skb_network_header(skb)[IP6CB(skb)->nhoff] =
+@@ -68,8 +69,10 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
+       }
+       NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
+-              dev_net(skb->dev), NULL, skb, skb->dev, NULL,
++              dev_net(dev), NULL, skb, dev, NULL,
+               xfrm6_transport_finish2);
++      if (async)
++              dev_put(dev);
+       return 0;
+ }
+diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
+index 90a79558dca259..5d3633ce6ba329 100644
+--- a/net/xfrm/xfrm_input.c
++++ b/net/xfrm/xfrm_input.c
+@@ -492,7 +492,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+               /* An encap_type of -1 indicates async resumption. */
+               if (encap_type == -1) {
+                       async = 1;
+-                      dev_put(skb->dev);
+                       seq = XFRM_SKB_CB(skb)->seq.input.low;
+                       goto resume;
+               }
+@@ -645,8 +644,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+                       dev_hold(skb->dev);
+                       nexthdr = x->type->input(x, skb);
+-                      if (nexthdr == -EINPROGRESS)
++                      if (nexthdr == -EINPROGRESS) {
++                              if (async)
++                                      dev_put(skb->dev);
+                               return 0;
++                      }
+                       dev_put(skb->dev);
+               }
+@@ -717,6 +719,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+                       sp->olen = 0;
+               if (skb_valid_dst(skb))
+                       skb_dst_drop(skb);
++              if (async)
++                      dev_put(skb->dev);
+               gro_cells_receive(&gro_cells, skb);
+               return 0;
+       } else {
+@@ -736,6 +740,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+                               sp->olen = 0;
+                       if (skb_valid_dst(skb))
+                               skb_dst_drop(skb);
++                      if (async)
++                              dev_put(skb->dev);
+                       gro_cells_receive(&gro_cells, skb);
+                       return err;
+               }
+@@ -746,6 +752,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+ drop_unlock:
+       spin_unlock(&x->lock);
+ drop:
++      if (async)
++              dev_put(skb->dev);
+       xfrm_rcv_cb(skb, family, x && x->type ? x->type->proto : nexthdr, -1);
+       kfree_skb(skb);
+       return 0;
+-- 
+2.53.0
+
diff --git a/queue-6.12/xfrm-hold-device-only-for-the-asynchronous-decryptio.patch b/queue-6.12/xfrm-hold-device-only-for-the-asynchronous-decryptio.patch
new file mode 100644 (file)
index 0000000..c606655
--- /dev/null
@@ -0,0 +1,73 @@
+From 84bb266d43de21ea4f6f7a4ea0259e4a453ca999 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 12 Jun 2026 11:13:26 +0000
+Subject: xfrm: hold device only for the asynchronous decryption
+
+From: Jianbo Liu <jianbol@nvidia.com>
+
+[ Upstream commit b05d42eefac737ce3cd80114d3579111023941b8 ]
+
+The dev_hold() on skb->dev during packet reception was originally
+added to prevent the device from being released prematurely during
+asynchronous decryption operations.
+
+As current hardware can offload decryption, this asynchronous path is
+not always utilized. This often results in a pattern of dev_hold()
+immediately followed by dev_put() for each packet, creating
+unnecessary reference counting overhead detrimental to performance.
+
+This patch optimizes this by skipping the dev_hold() and subsequent
+dev_put() when asynchronous decryption is not being performed.
+
+Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
+Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
+Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
+Stable-dep-of: 1c428b038400 ("xfrm: hold dev ref until after transport_finish NF_HOOK")
+Signed-off-by: Simon Liebold <simonlie@amazon.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/xfrm/xfrm_input.c | 17 +++++++++--------
+ 1 file changed, 9 insertions(+), 8 deletions(-)
+
+diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
+index 8edcb32735e595..90a79558dca259 100644
+--- a/net/xfrm/xfrm_input.c
++++ b/net/xfrm/xfrm_input.c
+@@ -492,6 +492,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+               /* An encap_type of -1 indicates async resumption. */
+               if (encap_type == -1) {
+                       async = 1;
++                      dev_put(skb->dev);
+                       seq = XFRM_SKB_CB(skb)->seq.input.low;
+                       goto resume;
+               }
+@@ -638,18 +639,18 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+               XFRM_SKB_CB(skb)->seq.input.low = seq;
+               XFRM_SKB_CB(skb)->seq.input.hi = seq_hi;
+-              dev_hold(skb->dev);
+-
+-              if (crypto_done)
++              if (crypto_done) {
+                       nexthdr = x->type_offload->input_tail(x, skb);
+-              else
++              } else {
++                      dev_hold(skb->dev);
++
+                       nexthdr = x->type->input(x, skb);
++                      if (nexthdr == -EINPROGRESS)
++                              return 0;
+-              if (nexthdr == -EINPROGRESS)
+-                      return 0;
++                      dev_put(skb->dev);
++              }
+ resume:
+-              dev_put(skb->dev);
+-
+               spin_lock(&x->lock);
+               if (nexthdr < 0) {
+                       if (nexthdr == -EBADMSG) {
+-- 
+2.53.0
+
diff --git a/queue-6.18/kvm-vmx-update-svi-during-runtime-apicv-activation.patch b/queue-6.18/kvm-vmx-update-svi-during-runtime-apicv-activation.patch
new file mode 100644 (file)
index 0000000..88b0cd0
--- /dev/null
@@ -0,0 +1,156 @@
+From 6bcc61ad9ab1c9a039d5c6601cca2f09b3871e95 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 12 Jun 2026 14:10:01 -0700
+Subject: KVM: VMX: Update SVI during runtime APICv activation
+
+From: Dongli Zhang <dongli.zhang@oracle.com>
+
+commit b2849bec936be642b5420801f902337f2507648e upstream.
+
+The APICv (apic->apicv_active) can be activated or deactivated at runtime,
+for instance, because of APICv inhibit reasons. Intel VMX employs different
+mechanisms to virtualize LAPIC based on whether APICv is active.
+
+When APICv is activated at runtime, GUEST_INTR_STATUS is used to configure
+and report the current pending IRR and ISR states. Unless a specific vector
+is explicitly included in EOI_EXIT_BITMAP, its EOI will not be trapped to
+KVM. Intel VMX automatically clears the corresponding ISR bit based on the
+GUEST_INTR_STATUS.SVI field.
+
+When APICv is deactivated at runtime, the VM_ENTRY_INTR_INFO_FIELD is used
+to specify the next interrupt vector to invoke upon VM-entry. The
+VMX IDT_VECTORING_INFO_FIELD is used to report un-invoked vectors on
+VM-exit. EOIs are always trapped to KVM, so the software can manually clear
+pending ISR bits.
+
+There are scenarios where, with APICv activated at runtime, a guest-issued
+EOI may not be able to clear the pending ISR bit.
+
+Taking vector 236 as an example, here is one scenario.
+
+1. Suppose APICv is inactive. Vector 236 is pending in the IRR.
+2. To handle KVM_REQ_EVENT, KVM moves vector 236 from the IRR to the ISR,
+and configures the VM_ENTRY_INTR_INFO_FIELD via vmx_inject_irq().
+3. After VM-entry, vector 236 is invoked through the guest IDT. At this
+point, the data in VM_ENTRY_INTR_INFO_FIELD is no longer valid. The guest
+interrupt handler for vector 236 is invoked.
+4. Suppose a VM exit occurs very early in the guest interrupt handler,
+before the EOI is issued.
+5. Nothing is reported through the IDT_VECTORING_INFO_FIELD because
+vector 236 has already been invoked in the guest.
+6. Now, suppose APICv is activated. Before the next VM-entry, KVM calls
+kvm_vcpu_update_apicv() to activate APICv.
+7. Unfortunately, GUEST_INTR_STATUS.SVI is not configured, although
+vector 236 is still pending in the ISR.
+8. After VM-entry, the guest finally issues the EOI for vector 236.
+However, because SVI is not configured, vector 236 is not cleared.
+9. ISR is stalled forever on vector 236.
+
+Here is another scenario.
+
+1. Suppose APICv is inactive. Vector 236 is pending in the IRR.
+2. To handle KVM_REQ_EVENT, KVM moves vector 236 from the IRR to the ISR,
+and configures the VM_ENTRY_INTR_INFO_FIELD via vmx_inject_irq().
+3. VM-exit occurs immediately after the next VM-entry. The vector 236 is
+not invoked through the guest IDT. Instead, it is saved to the
+IDT_VECTORING_INFO_FIELD during the VM-exit.
+4. KVM calls kvm_queue_interrupt() to re-queue the un-invoked vector 236
+into vcpu->arch.interrupt. A KVM_REQ_EVENT is requested.
+5. Now, suppose APICv is activated. Before the next VM-entry, KVM calls
+kvm_vcpu_update_apicv() to activate APICv.
+6. Although APICv is now active, KVM still uses the legacy
+VM_ENTRY_INTR_INFO_FIELD to re-inject vector 236. GUEST_INTR_STATUS.SVI is
+not configured.
+7. After the next VM-entry, vector 236 is invoked through the guest IDT.
+Finally, an EOI occurs. However, due to the lack of GUEST_INTR_STATUS.SVI
+configuration, vector 236 is not cleared from the ISR.
+8. ISR is stalled forever on vector 236.
+
+Using QEMU as an example, vector 236 is stuck in ISR forever.
+
+(qemu) info lapic 1
+dumping local APIC state for CPU 1
+
+LVT0    0x00010700 active-hi edge  masked                      ExtINT (vec 0)
+LVT1    0x00010400 active-hi edge  masked                      NMI
+LVTPC   0x00000400 active-hi edge                              NMI
+LVTERR  0x000000fe active-hi edge                              Fixed  (vec 254)
+LVTTHMR         0x00010000 active-hi edge  masked                      Fixed  (vec 0)
+LVTT    0x000400ec active-hi edge                 tsc-deadline Fixed  (vec 236)
+Timer   DCR=0x0 (divide by 2) initial_count = 0 current_count = 0
+SPIV    0x000001ff APIC enabled, focus=off, spurious vec 255
+ICR     0x000000fd physical edge de-assert no-shorthand
+ICR2    0x00000000 cpu 0 (X2APIC ID)
+ESR     0x00000000
+ISR     236
+IRR     37(level) 236
+
+The issue isn't applicable to AMD SVM as KVM simply writes vmcb01 directly
+irrespective of whether L1 (vmcb01) or L2 (vmcb02) is active (unlike VMX,
+there is no need/cost to switch between VMCBs).  In addition,
+APICV_INHIBIT_REASON_IRQWIN ensures AMD SVM AVIC is not activated until
+the last interrupt is EOI'd.
+
+Fix the bug by configuring Intel VMX GUEST_INTR_STATUS.SVI if APICv is
+activated at runtime.
+
+Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
+Reviewed-by: Chao Gao <chao.gao@intel.com>
+Link: https://patch.msgid.link/20251110063212.34902-1-dongli.zhang@oracle.com
+[sean: call out that SVM writes vmcb01 directly, tweak comment]
+Link: https://patch.msgid.link/20251205231913.441872-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+(cherry picked from commit b2849bec936be642b5420801f902337f2507648e)
+Cc: stable@vger.kernel.org # 6.6.x and above
+Cc: Gulshan Gabel <gulshan.gabel@nutanix.com>
+Signed-off-by: Jon Kohler <jon@nutanix.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 9 ---------
+ arch/x86/kvm/x86.c     | 7 +++++++
+ 2 files changed, 7 insertions(+), 9 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index c084f48e2b0b98..b7798ced7b505c 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6886,15 +6886,6 @@ void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+        * VM-Exit, otherwise L1 with run with a stale SVI.
+        */
+       if (is_guest_mode(vcpu)) {
+-              /*
+-               * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
+-               * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
+-               * Note, userspace can stuff state while L2 is active; assert
+-               * that VID is disabled if and only if the vCPU is in KVM_RUN
+-               * to avoid false positives if userspace is setting APIC state.
+-               */
+-              WARN_ON_ONCE(vcpu->wants_to_run &&
+-                           nested_cpu_has_vid(get_vmcs12(vcpu)));
+               to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
+               return;
+       }
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index ad2b7158b9c8ea..a21ebe04aa23a8 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10950,9 +10950,16 @@ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+        * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
+        * still active when the interrupt got accepted. Make sure
+        * kvm_check_and_inject_events() is called to check for that.
++       *
++       * Update SVI when APICv gets enabled, otherwise SVI won't reflect the
++       * highest bit in vISR and the next accelerated EOI in the guest won't
++       * be virtualized correctly (the CPU uses SVI to determine which vISR
++       * vector to clear).
+        */
+       if (!apic->apicv_active)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
++      else
++              kvm_apic_update_hwapic_isr(vcpu);
+ out:
+       preempt_enable();
+-- 
+2.53.0
+
index 8646c259cbfa5c65e4113f3978c9c00d679efea7..f5057e71780ca0882edb183385a0cde22af4a3a3 100644 (file)
@@ -134,3 +134,4 @@ net-txgbe-support-cr-modules-for-aml-devices.patch
 net-txgbe-rename-the-sfp-related.patch
 net-txgbe-initialize-module-info-buffer.patch
 ipv6-fix-a-potential-npd-in-cleanup_prefix_route.patch
+kvm-vmx-update-svi-during-runtime-apicv-activation.patch
diff --git a/queue-6.6/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch b/queue-6.6/netfilter-ctnetlink-ensure-safe-access-to-master-con.patch
new file mode 100644 (file)
index 0000000..ebfd696
--- /dev/null
@@ -0,0 +1,221 @@
+From 56ce9e8fa76f29b011850684b9bc25901e6071dc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 12 Jun 2026 20:24:08 +0000
+Subject: netfilter: ctnetlink: ensure safe access to master conntrack
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+[ Upstream commit bffcaad9afdfe45d7fc777397d3b83c1e3ebffe5 ]
+
+Holding reference on the expectation is not sufficient, the master
+conntrack object can just go away, making exp->master invalid.
+
+To access exp->master safely:
+
+- Grab the nf_conntrack_expect_lock, this gets serialized with
+  clean_from_lists() which also holds this lock when the master
+  conntrack goes away.
+
+- Hold a reference on the master conntrack via nf_conntrack_find_get().
+  This is not so easy, since the master tuple needed to look up the
+  master conntrack is not available in the existing problematic paths.
+
+This patch goes for extending the nf_conntrack_expect_lock section
+to address this issue for simplicity, in the cases that are described
+below this is just slightly extending the lock section.
+
+The add expectation command already holds a reference to the master
+conntrack from ctnetlink_create_expect().
+
+However, the delete expectation command needs to grab the spinlock
+before looking up the expectation. Expand the existing spinlock
+section so that it also covers the expectation lookup. Note that
+the nf_ct_expect_iterate_net() calls already grab the spinlock while
+iterating over the expectation table, which is correct.
+
+The get expectation command needs to grab the spinlock to ensure master
+conntrack does not go away. This also expands the existing spinlock
+section to cover the expectation lookup too. I needed to move the
+netlink skb allocation out of the spinlock to keep it GFP_KERNEL.
+
+For the expectation events, the IPEXP_DESTROY event is already delivered
+under the spinlock, just move the delivery of IPEXP_NEW under the
+spinlock too because the master conntrack event cache is reached through
+exp->master.
+
+While at it, add lockdep notations to help identify what codepaths need
+to grab the spinlock.
+
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+[ fix timer_delete -> del_timer in diff context lines since 8fa7292
+("treewide: Switch/rename to timer_delete[_sync]()") landed in 6.15 ]
+Signed-off-by: Mark Bundschuh <mkbund@amazon.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/netfilter/nf_conntrack_core.h |  5 ++++
+ net/netfilter/nf_conntrack_ecache.c       |  2 ++
+ net/netfilter/nf_conntrack_expect.c       | 10 +++++++-
+ net/netfilter/nf_conntrack_netlink.c      | 28 +++++++++++++++--------
+ 4 files changed, 35 insertions(+), 10 deletions(-)
+
+diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
+index 3384859a892101..8883575adcc1e7 100644
+--- a/include/net/netfilter/nf_conntrack_core.h
++++ b/include/net/netfilter/nf_conntrack_core.h
+@@ -83,6 +83,11 @@ void nf_conntrack_lock(spinlock_t *lock);
+ extern spinlock_t nf_conntrack_expect_lock;
++static inline void lockdep_nfct_expect_lock_held(void)
++{
++      lockdep_assert_held(&nf_conntrack_expect_lock);
++}
++
+ /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
+ static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout)
+diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
+index 69948e1d6974e3..6526bdcca580fd 100644
+--- a/net/netfilter/nf_conntrack_ecache.c
++++ b/net/netfilter/nf_conntrack_ecache.c
+@@ -237,6 +237,8 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
+       struct nf_ct_event_notifier *notify;
+       struct nf_conntrack_ecache *e;
++      lockdep_nfct_expect_lock_held();
++
+       rcu_read_lock();
+       notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
+       if (!notify)
+diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
+index 70bcddfc17ccc2..379711ea5ab67e 100644
+--- a/net/netfilter/nf_conntrack_expect.c
++++ b/net/netfilter/nf_conntrack_expect.c
+@@ -51,6 +51,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
+       struct net *net = nf_ct_exp_net(exp);
+       struct nf_conntrack_net *cnet;
++      lockdep_nfct_expect_lock_held();
+       WARN_ON(!master_help);
+       WARN_ON(timer_pending(&exp->timeout));
+@@ -118,6 +119,8 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple,
+ bool nf_ct_remove_expect(struct nf_conntrack_expect *exp)
+ {
++      lockdep_nfct_expect_lock_held();
++
+       if (del_timer(&exp->timeout)) {
+               nf_ct_unlink_expect(exp);
+               nf_ct_expect_put(exp);
+@@ -177,6 +180,8 @@ nf_ct_find_expectation(struct net *net,
+       struct nf_conntrack_expect *i, *exp = NULL;
+       unsigned int h;
++      lockdep_nfct_expect_lock_held();
++
+       if (!cnet->expect_count)
+               return NULL;
+@@ -459,6 +464,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
+       unsigned int h;
+       int ret = 0;
++      lockdep_nfct_expect_lock_held();
++
+       if (!master_help) {
+               ret = -ESHUTDOWN;
+               goto out;
+@@ -515,8 +522,9 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
+       nf_ct_expect_insert(expect);
+-      spin_unlock_bh(&nf_conntrack_expect_lock);
+       nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
++      spin_unlock_bh(&nf_conntrack_expect_lock);
++
+       return 0;
+ out:
+       spin_unlock_bh(&nf_conntrack_expect_lock);
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 255996f43d854c..eff5008f5e9d4e 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -3326,31 +3326,37 @@ static int ctnetlink_get_expect(struct sk_buff *skb,
+       if (err < 0)
+               return err;
++      skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
++      if (!skb2)
++              return -ENOMEM;
++
++      spin_lock_bh(&nf_conntrack_expect_lock);
+       exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
+-      if (!exp)
++      if (!exp) {
++              spin_unlock_bh(&nf_conntrack_expect_lock);
++              kfree_skb(skb2);
+               return -ENOENT;
++      }
+       if (cda[CTA_EXPECT_ID]) {
+               __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
+               if (id != nf_expect_get_id(exp)) {
+                       nf_ct_expect_put(exp);
++                      spin_unlock_bh(&nf_conntrack_expect_lock);
++                      kfree_skb(skb2);
+                       return -ENOENT;
+               }
+       }
+-      skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+-      if (!skb2) {
+-              nf_ct_expect_put(exp);
+-              return -ENOMEM;
+-      }
+-
+       rcu_read_lock();
+       err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid,
+                                     info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
+                                     exp);
+       rcu_read_unlock();
+       nf_ct_expect_put(exp);
++      spin_unlock_bh(&nf_conntrack_expect_lock);
++
+       if (err <= 0) {
+               kfree_skb(skb2);
+               return -ENOMEM;
+@@ -3397,22 +3403,26 @@ static int ctnetlink_del_expect(struct sk_buff *skb,
+               if (err < 0)
+                       return err;
++              spin_lock_bh(&nf_conntrack_expect_lock);
++
+               /* bump usage count to 2 */
+               exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
+-              if (!exp)
++              if (!exp) {
++                      spin_unlock_bh(&nf_conntrack_expect_lock);
+                       return -ENOENT;
++              }
+               if (cda[CTA_EXPECT_ID]) {
+                       __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
+                       if (id != nf_expect_get_id(exp)) {
+                               nf_ct_expect_put(exp);
++                              spin_unlock_bh(&nf_conntrack_expect_lock);
+                               return -ENOENT;
+                       }
+               }
+               /* after list removal, usage count == 1 */
+-              spin_lock_bh(&nf_conntrack_expect_lock);
+               if (del_timer(&exp->timeout)) {
+                       nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
+                                                  nlmsg_report(info->nlh));
+-- 
+2.53.0
+
index ce46b5c3fbabd9fb0d2fa60212c5133b9c4ef23e..ffa706ab8df5f2dd309c5a738f21db99c93bb55d 100644 (file)
@@ -283,3 +283,4 @@ net-mvpp2-add-metadata-support-for-xdp-mode.patch
 net-mvpp2-refill-rx-buffers-before-xdp-or-skb-use.patch
 net-mvpp2-build-skb-from-xdp-adjusted-data-on-xdp_pa.patch
 ipv6-fix-a-potential-npd-in-cleanup_prefix_route.patch
+netfilter-ctnetlink-ensure-safe-access-to-master-con.patch