git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
net/mlx5: Propagate LAG effective max_tx_speed to vports
authorOr Har-Toov <ohartoov@nvidia.com>
Thu, 18 Dec 2025 15:58:05 +0000 (17:58 +0200)
committerLeon Romanovsky <leon@kernel.org>
Mon, 5 Jan 2026 07:38:17 +0000 (02:38 -0500)
Currently, vports report only their parent's uplink speed, which in LAG
setups does not reflect the true aggregated bandwidth. This makes it
hard for upper-layer software to optimize load balancing decisions
based on accurate bandwidth information.

Fix the issue by calculating the maximum possible speed of a LAG as
the sum of the speeds of all active uplinks that are part of the LAG.
Propagate this effective max speed to vports associated with the LAG
whenever a relevant event occurs, such as physical port link state
changes or LAG creation/modification.

With this change, upper-layer components receive accurate bandwidth
information corresponding to the active members of the LAG and can
make better load balancing decisions.

Signed-off-by: Or Har-Toov <ohartoov@nvidia.com>
Reviewed-by: Maher Sanalla <msanalla@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
drivers/net/ethernet/mellanox/mlx5/core/port.c
drivers/net/ethernet/mellanox/mlx5/core/vport.c
include/linux/mlx5/vport.h

index a459a30f36cae6675740468ad69b4affa6c73f7a..c9d943a230b590ff97aae7a943e49941f4e2c410 100644 (file)
@@ -996,6 +996,126 @@ static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond)
               ldev->mode != MLX5_LAG_MODE_MPESW;
 }
 
+#ifdef CONFIG_MLX5_ESWITCH
+static int
+mlx5_lag_sum_devices_speed(struct mlx5_lag *ldev, u32 *sum_speed,
+                          int (*get_speed)(struct mlx5_core_dev *, u32 *))
+{
+       struct mlx5_core_dev *pf_mdev;
+       int pf_idx;
+       u32 speed;
+       int ret;
+
+       *sum_speed = 0;
+       mlx5_ldev_for_each(pf_idx, 0, ldev) {
+               pf_mdev = ldev->pf[pf_idx].dev;
+               if (!pf_mdev)
+                       continue;
+
+               ret = get_speed(pf_mdev, &speed);
+               if (ret) {
+                       mlx5_core_dbg(pf_mdev,
+                                     "Failed to get device speed using %ps. Device %s speed is not available (err=%d)\n",
+                                     get_speed, dev_name(pf_mdev->device),
+                                     ret);
+                       return ret;
+               }
+
+               *sum_speed += speed;
+       }
+
+       return 0;
+}
+
+static int mlx5_lag_sum_devices_max_speed(struct mlx5_lag *ldev, u32 *max_speed)
+{
+       return mlx5_lag_sum_devices_speed(ldev, max_speed,
+                                         mlx5_port_max_linkspeed);
+}
+
+static void mlx5_lag_modify_device_vports_speed(struct mlx5_core_dev *mdev,
+                                               u32 speed)
+{
+       u16 op_mod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT;
+       struct mlx5_eswitch *esw = mdev->priv.eswitch;
+       struct mlx5_vport *vport;
+       unsigned long i;
+       int ret;
+
+       if (!esw)
+               return;
+
+       if (!MLX5_CAP_ESW(mdev, esw_vport_state_max_tx_speed))
+               return;
+
+       mlx5_esw_for_each_vport(esw, i, vport) {
+               if (!vport)
+                       continue;
+
+               if (vport->vport == MLX5_VPORT_UPLINK)
+                       continue;
+
+               ret = mlx5_modify_vport_max_tx_speed(mdev, op_mod,
+                                                    vport->vport, true, speed);
+               if (ret)
+                       mlx5_core_dbg(mdev,
+                                     "Failed to set vport %d speed %d, err=%d\n",
+                                     vport->vport, speed, ret);
+       }
+}
+
+void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev)
+{
+       struct mlx5_core_dev *mdev;
+       u32 speed;
+       int pf_idx;
+
+       speed = ldev->tracker.bond_speed_mbps;
+
+       if (speed == SPEED_UNKNOWN)
+               return;
+
+       /* If speed is not set, use the sum of max speeds of all PFs */
+       if (!speed && mlx5_lag_sum_devices_max_speed(ldev, &speed))
+               return;
+
+       speed = speed / MLX5_MAX_TX_SPEED_UNIT;
+
+       mlx5_ldev_for_each(pf_idx, 0, ldev) {
+               mdev = ldev->pf[pf_idx].dev;
+               if (!mdev)
+                       continue;
+
+               mlx5_lag_modify_device_vports_speed(mdev, speed);
+       }
+}
+
+void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev)
+{
+       struct mlx5_core_dev *mdev;
+       u32 speed;
+       int pf_idx;
+       int ret;
+
+       mlx5_ldev_for_each(pf_idx, 0, ldev) {
+               mdev = ldev->pf[pf_idx].dev;
+               if (!mdev)
+                       continue;
+
+               ret = mlx5_port_oper_linkspeed(mdev, &speed);
+               if (ret) {
+                       mlx5_core_dbg(mdev,
+                                     "Failed to reset vports speed for device %s. Oper speed is not available (err=%d)\n",
+                                     dev_name(mdev->device), ret);
+                       continue;
+               }
+
+               speed = speed / MLX5_MAX_TX_SPEED_UNIT;
+               mlx5_lag_modify_device_vports_speed(mdev, speed);
+       }
+}
+#endif
+
 static void mlx5_do_bond(struct mlx5_lag *ldev)
 {
        int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
@@ -1083,9 +1203,12 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
                                                     ndev);
                        dev_put(ndev);
                }
+               mlx5_lag_set_vports_agg_speed(ldev);
        } else if (mlx5_lag_should_modify_lag(ldev, do_bond)) {
                mlx5_modify_lag(ldev, &tracker);
+               mlx5_lag_set_vports_agg_speed(ldev);
        } else if (mlx5_lag_should_disable_lag(ldev, do_bond)) {
+               mlx5_lag_reset_vports_speed(ldev);
                mlx5_disable_lag(ldev);
        }
 }
@@ -1286,6 +1409,38 @@ static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev,
        return 1;
 }
 
+static void mlx5_lag_update_tracker_speed(struct lag_tracker *tracker,
+                                         struct net_device *ndev)
+{
+       struct ethtool_link_ksettings lksettings;
+       struct net_device *bond_dev;
+       int err;
+
+       if (netif_is_lag_master(ndev))
+               bond_dev = ndev;
+       else
+               bond_dev = netdev_master_upper_dev_get(ndev);
+
+       if (!bond_dev) {
+               tracker->bond_speed_mbps = SPEED_UNKNOWN;
+               return;
+       }
+
+       err = __ethtool_get_link_ksettings(bond_dev, &lksettings);
+       if (err) {
+               netdev_dbg(bond_dev,
+                          "Failed to get speed for bond dev %s, err=%d\n",
+                          bond_dev->name, err);
+               tracker->bond_speed_mbps = SPEED_UNKNOWN;
+               return;
+       }
+
+       if (lksettings.base.speed == SPEED_UNKNOWN)
+               tracker->bond_speed_mbps = 0;
+       else
+               tracker->bond_speed_mbps = lksettings.base.speed;
+}
+
 /* this handler is always registered to netdev events */
 static int mlx5_lag_netdev_event(struct notifier_block *this,
                                 unsigned long event, void *ptr)
@@ -1317,6 +1472,9 @@ static int mlx5_lag_netdev_event(struct notifier_block *this,
                break;
        }
 
+       if (changed)
+               mlx5_lag_update_tracker_speed(&tracker, ndev);
+
        ldev->tracker = tracker;
 
        if (changed)
index 4918eee2b3da5cb9db1610ba41a9683628d68d5a..8de5640a01617993f7cf890010b9d1b177e8be53 100644 (file)
@@ -48,6 +48,7 @@ struct lag_tracker {
        unsigned int is_bonded:1;
        unsigned int has_inactive:1;
        enum netdev_lag_hash hash_type;
+       u32 bond_speed_mbps;
 };
 
 /* LAG data of a ConnectX card.
@@ -116,6 +117,14 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev);
 void mlx5_lag_add_devices(struct mlx5_lag *ldev);
 struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev);
 
+#ifdef CONFIG_MLX5_ESWITCH
+void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev);
+void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev);
+#else
+static inline void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) {}
+static inline void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev) {}
+#endif
+
 static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev)
 {
        if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
index cfebc110c02fd94d7f3b7796f6f0702e999e8c76..9fdb9a543cf13b73ce9fda5fc17b435c0122a005 100644 (file)
@@ -381,6 +381,7 @@ const struct mlx5_link_info *mlx5_port_ptys2info(struct mlx5_core_dev *mdev,
 u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev,
                             struct mlx5_link_info *info,
                             bool force_legacy);
+int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
 int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
 
 #define MLX5_PPS_CAP(mdev) (MLX5_CAP_GEN((mdev), pps) &&               \
index 85a9e534f44274e84f0a04f2f266523d581bcd75..83044c9b6b4193683d2d6532cad736c437228658 100644 (file)
@@ -1200,6 +1200,30 @@ u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev,
        return link_modes;
 }
 
+int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
+{
+       const struct mlx5_link_info *table;
+       struct mlx5_port_eth_proto eproto;
+       u32 oper_speed = 0;
+       u32 max_size;
+       bool ext;
+       int err;
+       int i;
+
+       ext = mlx5_ptys_ext_supported(mdev);
+       err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto);
+       if (err)
+               return err;
+
+       mlx5e_port_get_link_mode_info_arr(mdev, &table, &max_size, false);
+       for (i = 0; i < max_size; ++i)
+               if (eproto.oper & MLX5E_PROT_MASK(i))
+                       oper_speed = max(oper_speed, table[i].speed);
+
+       *speed = oper_speed;
+       return 0;
+}
+
 int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
 {
        const struct mlx5_link_info *table;
index 306affbcfd3b09c0b92b81a381f44d5684062025..78b1b291cfa44eb7892ae534f62622e2c1ec2338 100644 (file)
@@ -62,6 +62,28 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
        return MLX5_GET(query_vport_state_out, out, state);
 }
 
+static int mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
+                                       u16 vport, u8 other_vport,
+                                       u8 *admin_state)
+{
+       u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
+       u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
+       int err;
+
+       MLX5_SET(query_vport_state_in, in, opcode,
+                MLX5_CMD_OP_QUERY_VPORT_STATE);
+       MLX5_SET(query_vport_state_in, in, op_mod, opmod);
+       MLX5_SET(query_vport_state_in, in, vport_number, vport);
+       MLX5_SET(query_vport_state_in, in, other_vport, other_vport);
+
+       err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
+       if (err)
+               return err;
+
+       *admin_state = MLX5_GET(query_vport_state_out, out, admin_state);
+       return 0;
+}
+
 int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
                                  u16 vport, u8 other_vport, u8 state)
 {
@@ -77,6 +99,29 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
        return mlx5_cmd_exec_in(mdev, modify_vport_state, in);
 }
 
+int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod,
+                                  u16 vport, u8 other_vport, u16 max_tx_speed)
+{
+       u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {};
+       u8 admin_state;
+       int err;
+
+       err = mlx5_query_vport_admin_state(mdev, opmod, vport, other_vport,
+                                          &admin_state);
+       if (err)
+               return err;
+
+       MLX5_SET(modify_vport_state_in, in, opcode,
+                MLX5_CMD_OP_MODIFY_VPORT_STATE);
+       MLX5_SET(modify_vport_state_in, in, op_mod, opmod);
+       MLX5_SET(modify_vport_state_in, in, vport_number, vport);
+       MLX5_SET(modify_vport_state_in, in, other_vport, other_vport);
+       MLX5_SET(modify_vport_state_in, in, admin_state, admin_state);
+       MLX5_SET(modify_vport_state_in, in, max_tx_speed, max_tx_speed);
+
+       return mlx5_cmd_exec_in(mdev, modify_vport_state, in);
+}
+
 static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport,
                                        bool other_vport, u32 *out)
 {
index f876bfc0669c8c280e050a2100648b5b74b0707e..2acf10e9f60a2f9077984b7593666dd62ba39103 100644 (file)
@@ -41,6 +41,8 @@
         (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&   \
         mlx5_core_is_pf(mdev))
 
+#define MLX5_MAX_TX_SPEED_UNIT 100
+
 enum {
        MLX5_CAP_INLINE_MODE_L2,
        MLX5_CAP_INLINE_MODE_VPORT_CONTEXT,
@@ -58,6 +60,8 @@ enum {
 u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport);
 int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
                                  u16 vport, u8 other_vport, u8 state);
+int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod,
+                                  u16 vport, u8 other_vport, u16 max_tx_speed);
 int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
                                     u16 vport, bool other, u8 *addr);
 int mlx5_query_mac_address(struct mlx5_core_dev *mdev, u8 *addr);