]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
net/mlx5e: SD, Fix race condition in secondary device probe/remove
authorShay Drory <shayd@nvidia.com>
Mon, 4 May 2026 18:02:06 +0000 (21:02 +0300)
committerJakub Kicinski <kuba@kernel.org>
Wed, 6 May 2026 02:13:09 +0000 (19:13 -0700)
When utilizing Socket-Direct single netdev functionality the driver
resolves the actual auxiliary device using mlx5_sd_get_adev(). However,
the current implementation returns the primary ETH auxiliary device
without holding the device lock, leading to a potential race condition
where the ETH device could be unbound or removed concurrently during
probe, suspend, resume, or remove operations.[1]

Fix this by introducing mlx5_sd_put_adev() and updating
mlx5_sd_get_adev() so that secondaries devices would get a ref and
acquire the device lock of the returned auxiliary device. After the lock
is acquired, a second devcom check is needed[2].
In addition, update The callers to pair the get operation with the new
put operation, ensuring the lock is held while the auxiliary device is
being operated on and released afterwards.

The "primary" designation is determined once in sd_register(). It's set
before devcom is marked ready, and it never changes after that.
In Addition, The primary path never locks a secondary: When the primary
device invoke mlx5_sd_get_adev(), it sees dev == primary and returns.
no additional lock is taken.
Therefore lock ordering is always: secondary_lock -> primary_lock. The
reverse never happens, so ABBA deadlock is impossible.

[1]
for example:
BUG: kernel NULL pointer dereference, address: 0000000000000370
PGD 0 P4D 0
Oops: Oops: 0000 [#1] SMP
CPU: 4 UID: 0 PID: 3945 Comm: bash Not tainted 6.19.0-rc3+ #1 NONE
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:mlx5e_dcbnl_dscp_app+0x23/0x100 [mlx5_core]
Call Trace:
 <TASK>
 mlx5e_remove+0x82/0x12a [mlx5_core]
 device_release_driver_internal+0x194/0x1f0
 bus_remove_device+0xc6/0x140
 device_del+0x159/0x3c0
 ? devl_param_driverinit_value_get+0x29/0x80
 mlx5_rescan_drivers_locked+0x92/0x160 [mlx5_core]
 mlx5_unregister_device+0x34/0x50 [mlx5_core]
 mlx5_uninit_one+0x43/0xb0 [mlx5_core]
 remove_one+0x4e/0xc0 [mlx5_core]
 pci_device_remove+0x39/0xa0
 device_release_driver_internal+0x194/0x1f0
 unbind_store+0x99/0xa0
 kernfs_fop_write_iter+0x12e/0x1e0
 vfs_write+0x215/0x3d0
 ksys_write+0x5f/0xd0
 do_syscall_64+0x55/0xe90
 entry_SYSCALL_64_after_hwframe+0x4b/0x53

[2]
    CPU0 (primary)                     CPU1 (secondary)
==========================================================================
mlx5e_remove() (device_lock held)
                                     mlx5e_remove() (2nd device_lock held)
                                      mlx5_sd_get_adev()
                                       mlx5_devcom_comp_is_ready() => true
                                       device_lock(primary)
 mlx5_sd_get_adev() ==> ret adev
 _mlx5e_remove()
 mlx5_sd_cleanup()
 // mlx5e_remove finished
 // releasing device_lock
                                       //need another check here...
                                       mlx5_devcom_comp_is_ready() => false

Fixes: 381978d28317 ("net/mlx5e: Create single netdev per SD group")
Signed-off-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20260504180206.268568-5-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h

index 62b70334a13d1c641cbe97e926d979294c5878dd..8f2b3abe00921442347bf1708fcd2cfd84f5570e 100644 (file)
@@ -6774,8 +6774,10 @@ static int mlx5e_resume(struct auxiliary_device *adev)
                return err;
 
        actual_adev = mlx5_sd_get_adev(mdev, adev, edev->idx);
-       if (actual_adev)
+       if (actual_adev) {
                err = _mlx5e_resume(actual_adev);
+               mlx5_sd_put_adev(actual_adev, adev);
+       }
        return err;
 }
 
@@ -6815,6 +6817,8 @@ static int mlx5e_suspend(struct auxiliary_device *adev, pm_message_t state)
                err = _mlx5e_suspend(actual_adev, false);
 
        mlx5_sd_cleanup(mdev);
+       if (actual_adev)
+               mlx5_sd_put_adev(actual_adev, adev);
        return err;
 }
 
@@ -6916,11 +6920,14 @@ static int mlx5e_probe(struct auxiliary_device *adev,
                err = _mlx5e_probe(actual_adev);
                if (err)
                        goto sd_cleanup;
+               mlx5_sd_put_adev(actual_adev, adev);
        }
        return 0;
 
 sd_cleanup:
        mlx5_sd_cleanup(mdev);
+       if (actual_adev)
+               mlx5_sd_put_adev(actual_adev, adev);
        return err;
 }
 
@@ -6973,6 +6980,8 @@ static void mlx5e_remove(struct auxiliary_device *adev)
                _mlx5e_remove(actual_adev);
 
        mlx5_sd_cleanup(mdev);
+       if (actual_adev)
+               mlx5_sd_put_adev(actual_adev, adev);
 }
 
 static const struct auxiliary_device_id mlx5e_id_table[] = {
index 89b7e4d67303c041019cfc1fbd884b19ade776ba..6e199161b0083af954c9fcbf53c3255c255951c7 100644 (file)
@@ -562,22 +562,55 @@ out_unlock:
        sd_cleanup(dev);
 }
 
+/* Lock order:
+ *   primary:   actual_adev_lock -> SD devcom comp lock
+ *   secondary: SD devcom comp lock -> (drop) -> actual_adev_lock
+ * The two locks are never held together, so no ABBA.
+ */
 struct auxiliary_device *mlx5_sd_get_adev(struct mlx5_core_dev *dev,
                                          struct auxiliary_device *adev,
                                          int idx)
 {
        struct mlx5_sd *sd = mlx5_get_sd(dev);
        struct mlx5_core_dev *primary;
+       struct mlx5_adev *primary_adev;
 
        if (!sd)
                return adev;
 
-       if (!mlx5_devcom_comp_is_ready(sd->devcom))
+       mlx5_devcom_comp_lock(sd->devcom);
+       if (!mlx5_devcom_comp_is_ready(sd->devcom)) {
+               mlx5_devcom_comp_unlock(sd->devcom);
                return NULL;
+       }
 
        primary = mlx5_sd_get_primary(dev);
-       if (dev == primary)
+       if (!primary || dev == primary) {
+               mlx5_devcom_comp_unlock(sd->devcom);
                return adev;
+       }
+
+       primary_adev = primary->priv.adev[idx];
+       get_device(&primary_adev->adev.dev);
+       mlx5_devcom_comp_unlock(sd->devcom);
 
-       return &primary->priv.adev[idx]->adev;
+       device_lock(&primary_adev->adev.dev);
+       /* Primary may have completed remove between dropping devcom and
+        * acquiring device_lock; recheck.
+        */
+       if (!mlx5_devcom_comp_is_ready(sd->devcom)) {
+               device_unlock(&primary_adev->adev.dev);
+               put_device(&primary_adev->adev.dev);
+               return NULL;
+       }
+       return &primary_adev->adev;
+}
+
+void mlx5_sd_put_adev(struct auxiliary_device *actual_adev,
+                     struct auxiliary_device *adev)
+{
+       if (actual_adev != adev) {
+               device_unlock(&actual_adev->dev);
+               put_device(&actual_adev->dev);
+       }
 }
index 137efaf9aabcc010cbc5237fdb19aba8abe542a3..9bfd5b9756b507d8c7e3c2ccebd5ac1b13c4e18b 100644 (file)
@@ -15,6 +15,8 @@ struct mlx5_core_dev *mlx5_sd_ch_ix_get_dev(struct mlx5_core_dev *primary, int c
 struct auxiliary_device *mlx5_sd_get_adev(struct mlx5_core_dev *dev,
                                          struct auxiliary_device *adev,
                                          int idx);
+void mlx5_sd_put_adev(struct auxiliary_device *actual_adev,
+                     struct auxiliary_device *adev);
 
 int mlx5_sd_init(struct mlx5_core_dev *dev);
 void mlx5_sd_cleanup(struct mlx5_core_dev *dev);