git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
bonding: Fix multiple long standing offload races
author Cosmin Ratiu <cratiu@nvidia.com>
Fri, 11 Apr 2025 07:49:58 +0000 (10:49 +0300)
committer Steffen Klassert <steffen.klassert@secunet.com>
Wed, 16 Apr 2025 09:02:49 +0000 (11:02 +0200)
Refactor the bonding ipsec offload operations to fix a number of
long-standing control plane races between state migration and user
deletion and a few other issues.

xfrm state deletion can happen concurrently with a
bond_change_active_slave() operation. This manifests itself as a
bond_ipsec_del_sa() call with x->lock held, followed a bit later by a
bond_ipsec_free_sa() call from a workqueue. The alternate path, in which
these calls come from xfrm_dev_state_flush(), can't happen, as that
needs the RTNL lock and bond_change_active_slave() already holds it.

1. bond_ipsec_del_sa_all() might call xdo_dev_state_delete() a second
   time on an xfrm state that was concurrently killed. This is bad.
2. bond_ipsec_add_sa_all() can add a state on the new device, but
   pending bond_ipsec_free_sa() calls from the old device will then hit
   the WARN_ON() and then, worse, call xdo_dev_state_free() on the new
   device without a corresponding xdo_dev_state_delete().
3. The list cleanup in bond_ipsec_del_sa() sleeps in atomic context, a
   bug introduced by the mentioned "Fixes" commit.

bond_ipsec_del_sa_all() and bond_ipsec_add_sa_all() now acquire x->lock
and check for x->km.state to help with problems 1 and 2. And since
xso.real_dev is now a private pointer managed by the bonding driver in
xfrm state, make better use of it to fully fix problems 1 and 2. In
bond_ipsec_del_sa_all(), set xso.real_dev to NULL while holding both the
mutex and x->lock, which makes sure that neither bond_ipsec_del_sa() nor
bond_ipsec_free_sa() could run concurrently.

Fix problem 3 by moving the list cleanup (which requires the mutex) from
bond_ipsec_del_sa() (called from atomic context) to bond_ipsec_free_sa(),
which runs from a workqueue and is allowed to sleep.

Finally, simplify bond_ipsec_del_sa() and bond_ipsec_free_sa() by using
xso->real_dev directly, since it's now protected by locks and can be
trusted to always reflect the offload device.

Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex")
Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Reviewed-by: Hangbin Liu <liuhangbin@gmail.com>
Tested-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
drivers/net/bonding/bond_main.c
include/net/xfrm.h

index 14f7c9712ad4178e73ed13f60f81b1830912a1e5..8ed8c29659a0f5fb78991f5d892a02d532a7054e 100644 (file)
@@ -545,7 +545,20 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
                        slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__);
                        continue;
                }
+
+               spin_lock_bh(&ipsec->xs->lock);
+               /* xs might have been killed by the user during the migration
+                * to the new dev, but bond_ipsec_del_sa() should have done
+                * nothing, as xso.real_dev is NULL.
+                * Delete it from the device we just added it to. The pending
+                * bond_ipsec_free_sa() call will do the rest of the cleanup.
+                */
+               if (ipsec->xs->km.state == XFRM_STATE_DEAD &&
+                   real_dev->xfrmdev_ops->xdo_dev_state_delete)
+                       real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev,
+                                                                   ipsec->xs);
                ipsec->xs->xso.real_dev = real_dev;
+               spin_unlock_bh(&ipsec->xs->lock);
        }
 out:
        mutex_unlock(&bond->ipsec_lock);
@@ -560,48 +573,20 @@ static void bond_ipsec_del_sa(struct net_device *bond_dev,
                              struct xfrm_state *xs)
 {
        struct net_device *real_dev;
-       netdevice_tracker tracker;
-       struct bond_ipsec *ipsec;
-       struct bonding *bond;
-       struct slave *slave;
 
-       if (!bond_dev)
+       if (!bond_dev || !xs->xso.real_dev)
                return;
 
-       rcu_read_lock();
-       bond = netdev_priv(bond_dev);
-       slave = rcu_dereference(bond->curr_active_slave);
-       real_dev = slave ? slave->dev : NULL;
-       netdev_hold(real_dev, &tracker, GFP_ATOMIC);
-       rcu_read_unlock();
-
-       if (!slave)
-               goto out;
-
-       if (!xs->xso.real_dev)
-               goto out;
-
-       WARN_ON(xs->xso.real_dev != real_dev);
+       real_dev = xs->xso.real_dev;
 
        if (!real_dev->xfrmdev_ops ||
            !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
            netif_is_bond_master(real_dev)) {
                slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_delete\n", __func__);
-               goto out;
+               return;
        }
 
        real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, xs);
-out:
-       netdev_put(real_dev, &tracker);
-       mutex_lock(&bond->ipsec_lock);
-       list_for_each_entry(ipsec, &bond->ipsec_list, list) {
-               if (ipsec->xs == xs) {
-                       list_del(&ipsec->list);
-                       kfree(ipsec);
-                       break;
-               }
-       }
-       mutex_unlock(&bond->ipsec_lock);
 }
 
 static void bond_ipsec_del_sa_all(struct bonding *bond)
@@ -629,9 +614,15 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
                                   __func__);
                        continue;
                }
+
+               spin_lock_bh(&ipsec->xs->lock);
                ipsec->xs->xso.real_dev = NULL;
-               real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev,
-                                                           ipsec->xs);
+               /* Don't double delete states killed by the user. */
+               if (ipsec->xs->km.state != XFRM_STATE_DEAD)
+                       real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev,
+                                                                   ipsec->xs);
+               spin_unlock_bh(&ipsec->xs->lock);
+
                if (real_dev->xfrmdev_ops->xdo_dev_state_free)
                        real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev,
                                                                  ipsec->xs);
@@ -643,34 +634,33 @@ static void bond_ipsec_free_sa(struct net_device *bond_dev,
                               struct xfrm_state *xs)
 {
        struct net_device *real_dev;
-       netdevice_tracker tracker;
+       struct bond_ipsec *ipsec;
        struct bonding *bond;
-       struct slave *slave;
 
        if (!bond_dev)
                return;
 
-       rcu_read_lock();
        bond = netdev_priv(bond_dev);
-       slave = rcu_dereference(bond->curr_active_slave);
-       real_dev = slave ? slave->dev : NULL;
-       netdev_hold(real_dev, &tracker, GFP_ATOMIC);
-       rcu_read_unlock();
-
-       if (!slave)
-               goto out;
 
+       mutex_lock(&bond->ipsec_lock);
        if (!xs->xso.real_dev)
                goto out;
 
-       WARN_ON(xs->xso.real_dev != real_dev);
+       real_dev = xs->xso.real_dev;
 
        xs->xso.real_dev = NULL;
-       if (real_dev && real_dev->xfrmdev_ops &&
+       if (real_dev->xfrmdev_ops &&
            real_dev->xfrmdev_ops->xdo_dev_state_free)
                real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev, xs);
 out:
-       netdev_put(real_dev, &tracker);
+       list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+               if (ipsec->xs == xs) {
+                       list_del(&ipsec->list);
+                       kfree(ipsec);
+                       break;
+               }
+       }
+       mutex_unlock(&bond->ipsec_lock);
 }
 
 /**
index 3d2f6c8793114fa8ae85c83f79e64b1fc9daf782..b7e8f3f4962734533a898dfdc218248415eeb832 100644 (file)
@@ -154,8 +154,11 @@ struct xfrm_dev_offload {
         */
        struct net_device       *dev;
        netdevice_tracker       dev_tracker;
-       /* This is a private pointer used by the bonding driver.
-        * Device drivers should not use it.
+       /* This is a private pointer used by the bonding driver (and eventually
+        * should be moved there). Device drivers should not use it.
+        * Protected by xfrm_state.lock AND bond.ipsec_lock in most cases,
+        * except in the .xdo_dev_state_del() flow, where only xfrm_state.lock
+        * is held.
         */
        struct net_device       *real_dev;
        unsigned long           offload_handle;