Merge tag 'mlx5-updates-2021-06-14' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux

Saeed Mahameed says: ==================== mlx5-updates-2021-06-14 1) Trivial Lag refactroing in preparation for upcomming Single FDB lag feature - First 3 patches 2) Scalable IRQ distriburion for Sub-functions A subfunction (SF) is a lightweight function that has a parent PCI function (PF) on which it is deployed. Currently, mlx5 subfunction is sharing the IRQs (MSI-X) with their parent PCI function. Before this series the PF allocates enough IRQs to cover all the cores in a system, Newly created SFs will re-use all the IRQs that the PF has allocated for itself. Hence, the more SFs are created, there are more EQs per IRQs. Therefore, whenever we handle an interrupt, we need to pull all SFs EQs and PF EQs instead of PF EQs without SFs on the system. This leads to a hard impact on the performance of SFs and PF. For example, on machine with: Intel(R) Xeon(R) CPU E5-2697 v3 @ 2.60GHz with 56 cores. PCI Express 3 with BW of 126 Gb/s. ConnectX-5 Ex; EDR IB (100Gb/s) and 100GbE; dual-port QSFP28; PCIe4.0 x16. test case: iperf TX BW single CPU, affinity of app and IRQ are the same. PF only: no SFs on the system, 56 IRQs. SF (before), 250 SFs Sharing the same 56 IRQs . SF (now), 250 SFs + 255 avaiable IRQs for the NIC. (please see IRQ spread scheme below). application SF-IRQ channel BW(Gb/sec) interrupts/sec iperf TX affinity PF only cpu={0} cpu={0} cpu={0} 79 8200 SF (before) cpu={0} cpu={0} cpu={0} 51.3 (-35%) 9500 SF (now) cpu={0} cpu={0} cpu={0} 78 (-2%) 8200 command: $ taskset -c 0 iperf -c 11.1.1.1 -P 3 -i 6 -t 30 | grep SUM The different between the SF examples is that before this series we allocate num_cpus (56) IRQs, and all of them were shared among the PF and the SFs. And after this series, we allocate 255 IRQs, and we spread the SFs among the above IRQs. This have significantly decreased the load on each IRQ and the number of EQs per IRQ is down by 95% (251->11). In this patchset the solution proposed is to have a dedicated IRQ pool for SFs to use. the pool will allocate a large number of IRQs for SFs to grab from in order to minimize irq sharing between the different SFs. IRQs will not be requested from the OS until they are 1st requested by an SF consumer, and will be eventually released when the last SF consumer releases them. For the detailed IRQ spread and allocation scheme please see last patch: ("net/mlx5: Round-Robin EQs over IRQs") ==================== Signed-off-by: David S. Miller <davem@davemloft.net>

Merge tag 'mlx5-updates-2021-06-14' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux
Saeed Mahameed says: ==================== mlx5-updates-2021-06-14 1) Trivial Lag refactroing in preparation for upcomming Single FDB lag feature - First 3 patches 2) Scalable IRQ distriburion for Sub-functions A subfunction (SF) is a lightweight function that has a parent PCI function (PF) on which it is deployed. Currently, mlx5 subfunction is sharing the IRQs (MSI-X) with their parent PCI function. Before this series the PF allocates enough IRQs to cover all the cores in a system, Newly created SFs will re-use all the IRQs that the PF has allocated for itself. Hence, the more SFs are created, there are more EQs per IRQs. Therefore, whenever we handle an interrupt, we need to pull all SFs EQs and PF EQs instead of PF EQs without SFs on the system. This leads to a hard impact on the performance of SFs and PF. For example, on machine with: Intel(R) Xeon(R) CPU E5-2697 v3 @ 2.60GHz with 56 cores. PCI Express 3 with BW of 126 Gb/s. ConnectX-5 Ex; EDR IB (100Gb/s) and 100GbE; dual-port QSFP28; PCIe4.0 x16. test case: iperf TX BW single CPU, affinity of app and IRQ are the same. PF only: no SFs on the system, 56 IRQs. SF (before), 250 SFs Sharing the same 56 IRQs . SF (now), 250 SFs + 255 avaiable IRQs for the NIC. (please see IRQ spread scheme below). application SF-IRQ channel BW(Gb/sec) interrupts/sec iperf TX affinity PF only cpu={0} cpu={0} cpu={0} 79 8200 SF (before) cpu={0} cpu={0} cpu={0} 51.3 (-35%) 9500 SF (now) cpu={0} cpu={0} cpu={0} 78 (-2%) 8200 command: $ taskset -c 0 iperf -c 11.1.1.1 -P 3 -i 6 -t 30 | grep SUM The different between the SF examples is that before this series we allocate num_cpus (56) IRQs, and all of them were shared among the PF and the SFs. And after this series, we allocate 255 IRQs, and we spread the SFs among the above IRQs. This have significantly decreased the load on each IRQ and the number of EQs per IRQ is down by 95% (251->11). In this patchset the solution proposed is to have a dedicated IRQ pool for SFs to use. the pool will allocate a large number of IRQs for SFs to grab from in order to minimize irq sharing between the different SFs. IRQs will not be requested from the OS until they are 1st requested by an SF consumer, and will be eventually released when the last SF consumer releases them. For the detailed IRQ spread and allocation scheme please see last patch: ("net/mlx5: Round-Robin EQs over IRQs") ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
f0c227c7 · David S. Miller · 08ab4d74 · c36326d3 · f0c227c7 · f0c227c7
Commit f0c227c7 authored Jun 15, 2021 by David S. Miller
17 changed files
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1559,12 +1559,16 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 	}

 	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
-	param = (struct mlx5_eq_param){
-		.irq_index = 0,
+	param = (struct mlx5_eq_param) {
 		.nent = MLX5_IB_NUM_PF_EQE,
 	};
 	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
+	if (!zalloc_cpumask_var(&param.affinity, GFP_KERNEL)) {
+		err = -ENOMEM;
+		goto err_wq;
+	}
 	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
+	free_cpumask_var(param.affinity);
 	if (IS_ERR(eq->core)) {
 		err = PTR_ERR(eq->core);
 		goto err_wq;

--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -5114,7 +5114,7 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv)
 	mlx5e_set_netdev_mtu_boundaries(priv);
 	mlx5e_set_dev_port_mtu(priv);

-	mlx5_lag_add(mdev, netdev);
+	mlx5_lag_add_netdev(mdev, netdev);

 	mlx5e_enable_async_events(priv);
 	mlx5e_enable_blocking_events(priv);
@@ -5162,7 +5162,7 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv)
 		priv->en_trap = NULL;
 	}
 	mlx5e_disable_async_events(priv);
-	mlx5_lag_remove(mdev);
+	mlx5_lag_remove_netdev(mdev, priv->netdev);
 	mlx5_vxlan_reset_to_default(mdev->vxlan);
 }


--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -976,7 +976,7 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv)
 	if (MLX5_CAP_GEN(mdev, uplink_follow))
 		mlx5_modify_vport_admin_state(mdev, MLX5_VPORT_STATE_OP_MOD_UPLINK,
 					      0, 0, MLX5_VPORT_ADMIN_STATE_AUTO);
-	mlx5_lag_add(mdev, netdev);
+	mlx5_lag_add_netdev(mdev, netdev);
 	priv->events_nb.notifier_call = uplink_rep_async_event;
 	mlx5_notifier_register(mdev, &priv->events_nb);
 	mlx5e_dcbnl_initialize(priv);
@@ -1009,7 +1009,7 @@ static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv)
 	mlx5e_dcbnl_delete_app(priv);
 	mlx5_notifier_unregister(mdev, &priv->events_nb);
 	mlx5e_rep_tc_disable(priv);
-	mlx5_lag_remove(mdev);
+	mlx5_lag_remove_netdev(mdev, priv->netdev);
 }

 static MLX5E_DEFINE_STATS_GRP(sw_rep, 0);

--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
@@ -93,6 +93,64 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
 }
 EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag);

+static int mlx5_lag_netdev_event(struct notifier_block *this,
+				 unsigned long event, void *ptr);
+static void mlx5_do_bond_work(struct work_struct *work);
+
+static void mlx5_ldev_free(struct kref *ref)
+{
+	struct mlx5_lag *ldev = container_of(ref, struct mlx5_lag, ref);
+
+	if (ldev->nb.notifier_call)
+		unregister_netdevice_notifier_net(&init_net, &ldev->nb);
+	mlx5_lag_mp_cleanup(ldev);
+	cancel_delayed_work_sync(&ldev->bond_work);
+	destroy_workqueue(ldev->wq);
+	kfree(ldev);
+}
+
+static void mlx5_ldev_put(struct mlx5_lag *ldev)
+{
+	kref_put(&ldev->ref, mlx5_ldev_free);
+}
+
+static void mlx5_ldev_get(struct mlx5_lag *ldev)
+{
+	kref_get(&ldev->ref);
+}
+
+static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev)
+{
+	struct mlx5_lag *ldev;
+	int err;
+
+	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
+	if (!ldev)
+		return NULL;
+
+	ldev->wq = create_singlethread_workqueue("mlx5_lag");
+	if (!ldev->wq) {
+		kfree(ldev);
+		return NULL;
+	}
+
+	kref_init(&ldev->ref);
+	INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);
+
+	ldev->nb.notifier_call = mlx5_lag_netdev_event;
+	if (register_netdevice_notifier_net(&init_net, &ldev->nb)) {
+		ldev->nb.notifier_call = NULL;
+		mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
+	}
+
+	err = mlx5_lag_mp_init(ldev);
+	if (err)
+		mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
+			      err);
+
+	return ldev;
+}
+
 int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
 				struct net_device *ndev)
 {
@@ -258,6 +316,10 @@ static void mlx5_lag_add_devices(struct mlx5_lag *ldev)
 		if (!ldev->pf[i].dev)
 			continue;

+		if (ldev->pf[i].dev->priv.flags &
+		    MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
+			continue;
+
 		ldev->pf[i].dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
 		mlx5_rescan_drivers_locked(ldev->pf[i].dev);
 	}
@@ -276,6 +338,31 @@ static void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
 	}
 }

+static void mlx5_disable_lag(struct mlx5_lag *ldev)
+{
+	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
+	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
+	bool roce_lag;
+	int err;
+
+	roce_lag = __mlx5_lag_is_roce(ldev);
+
+	if (roce_lag) {
+		if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)) {
+			dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
+			mlx5_rescan_drivers_locked(dev0);
+		}
+		mlx5_nic_vport_disable_roce(dev1);
+	}
+
+	err = mlx5_deactivate_lag(ldev);
+	if (err)
+		return;
+
+	if (roce_lag)
+		mlx5_lag_add_devices(ldev);
+}
+
 static void mlx5_do_bond(struct mlx5_lag *ldev)
 {
 	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
@@ -322,20 +409,7 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
 	} else if (do_bond && __mlx5_lag_is_active(ldev)) {
 		mlx5_modify_lag(ldev, &tracker);
 	} else if (!do_bond && __mlx5_lag_is_active(ldev)) {
-		roce_lag = __mlx5_lag_is_roce(ldev);
-
-		if (roce_lag) {
-			dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
-			mlx5_rescan_drivers_locked(dev0);
-			mlx5_nic_vport_disable_roce(dev1);
-		}
-
-		err = mlx5_deactivate_lag(ldev);
-		if (err)
-			return;
-
-		if (roce_lag)
-			mlx5_lag_add_devices(ldev);
+		mlx5_disable_lag(ldev);
 	}
 }

@@ -495,54 +569,51 @@ static int mlx5_lag_netdev_event(struct notifier_block *this,
 	return NOTIFY_DONE;
 }

-static struct mlx5_lag *mlx5_lag_dev_alloc(void)
-{
-	struct mlx5_lag *ldev;
-
-	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
-	if (!ldev)
-		return NULL;
-
-	ldev->wq = create_singlethread_workqueue("mlx5_lag");
-	if (!ldev->wq) {
-		kfree(ldev);
-		return NULL;
-	}
-
-	INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);
-
-	return ldev;
-}
-
-static void mlx5_lag_dev_free(struct mlx5_lag *ldev)
-{
-	destroy_workqueue(ldev->wq);
-	kfree(ldev);
-}
-
-static int mlx5_lag_dev_add_pf(struct mlx5_lag *ldev,
+static void mlx5_ldev_add_netdev(struct mlx5_lag *ldev,
 				 struct mlx5_core_dev *dev,
 				 struct net_device *netdev)
 {
 	unsigned int fn = PCI_FUNC(dev->pdev->devfn);

 	if (fn >= MLX5_MAX_PORTS)
-		return -EPERM;
+		return;

 	spin_lock(&lag_lock);
-	ldev->pf[fn].dev    = dev;
 	ldev->pf[fn].netdev = netdev;
 	ldev->tracker.netdev_state[fn].link_up = 0;
 	ldev->tracker.netdev_state[fn].tx_enabled = 0;
+	spin_unlock(&lag_lock);
+}

-	dev->priv.lag = ldev;
+static void mlx5_ldev_remove_netdev(struct mlx5_lag *ldev,
+				    struct net_device *netdev)
+{
+	int i;

+	spin_lock(&lag_lock);
+	for (i = 0; i < MLX5_MAX_PORTS; i++) {
+		if (ldev->pf[i].netdev == netdev) {
+			ldev->pf[i].netdev = NULL;
+			break;
+		}
+	}
 	spin_unlock(&lag_lock);
+}
+
+static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
+			       struct mlx5_core_dev *dev)
+{
+	unsigned int fn = PCI_FUNC(dev->pdev->devfn);
+
+	if (fn >= MLX5_MAX_PORTS)
+		return;

-	return fn;
+	ldev->pf[fn].dev = dev;
+	dev->priv.lag = ldev;
 }

-static void mlx5_lag_dev_remove_pf(struct mlx5_lag *ldev,
+/* Must be called with intf_mutex held */
+static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev,
 				  struct mlx5_core_dev *dev)
 {
 	int i;
@@ -554,19 +625,15 @@ static void mlx5_lag_dev_remove_pf(struct mlx5_lag *ldev,
 	if (i == MLX5_MAX_PORTS)
 		return;

-	spin_lock(&lag_lock);
-	memset(&ldev->pf[i], 0, sizeof(*ldev->pf));
-
+	ldev->pf[i].dev = NULL;
 	dev->priv.lag = NULL;
-	spin_unlock(&lag_lock);
 }

 /* Must be called with intf_mutex held */
-void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
+static void __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
 {
 	struct mlx5_lag *ldev = NULL;
 	struct mlx5_core_dev *tmp_dev;
-	int i, err;

 	if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
 	    !MLX5_CAP_GEN(dev, lag_master) ||
@@ -578,67 +645,77 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
 		ldev = tmp_dev->priv.lag;

 	if (!ldev) {
-		ldev = mlx5_lag_dev_alloc();
+		ldev = mlx5_lag_dev_alloc(dev);
 		if (!ldev) {
 			mlx5_core_err(dev, "Failed to alloc lag dev\n");
 			return;
 		}
+	} else {
+		mlx5_ldev_get(ldev);
 	}

-	if (mlx5_lag_dev_add_pf(ldev, dev, netdev) < 0)
+	mlx5_ldev_add_mdev(ldev, dev);
+
 	return;
+}

-	for (i = 0; i < MLX5_MAX_PORTS; i++)
-		if (!ldev->pf[i].dev)
-			break;
+void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev)
+{
+	struct mlx5_lag *ldev;

-	if (i >= MLX5_MAX_PORTS)
-		ldev->flags |= MLX5_LAG_FLAG_READY;
+	ldev = mlx5_lag_dev(dev);
+	if (!ldev)
+		return;

-	if (!ldev->nb.notifier_call) {
-		ldev->nb.notifier_call = mlx5_lag_netdev_event;
-		if (register_netdevice_notifier_net(&init_net, &ldev->nb)) {
-			ldev->nb.notifier_call = NULL;
-			mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
-		}
-	}
+	mlx5_dev_list_lock();
+	mlx5_ldev_remove_mdev(ldev, dev);
+	mlx5_dev_list_unlock();
+	mlx5_ldev_put(ldev);
+}

-	err = mlx5_lag_mp_init(ldev);
-	if (err)
-		mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
-			      err);
+void mlx5_lag_add_mdev(struct mlx5_core_dev *dev)
+{
+	mlx5_dev_list_lock();
+	__mlx5_lag_dev_add_mdev(dev);
+	mlx5_dev_list_unlock();
 }

 /* Must be called with intf_mutex held */
-void mlx5_lag_remove(struct mlx5_core_dev *dev)
+void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev,
+			    struct net_device *netdev)
 {
 	struct mlx5_lag *ldev;
-	int i;

-	ldev = mlx5_lag_dev_get(dev);
+	ldev = mlx5_lag_dev(dev);
 	if (!ldev)
 		return;

 	if (__mlx5_lag_is_active(ldev))
-		mlx5_deactivate_lag(ldev);
-
-	mlx5_lag_dev_remove_pf(ldev, dev);
+		mlx5_disable_lag(ldev);

+	mlx5_ldev_remove_netdev(ldev, netdev);
 	ldev->flags &= ~MLX5_LAG_FLAG_READY;
+}
+
+/* Must be called with intf_mutex held */
+void mlx5_lag_add_netdev(struct mlx5_core_dev *dev,
+			 struct net_device *netdev)
+{
+	struct mlx5_lag *ldev;
+	int i;
+
+	ldev = mlx5_lag_dev(dev);
+	if (!ldev)
+		return;
+
+	mlx5_ldev_add_netdev(ldev, dev, netdev);

 	for (i = 0; i < MLX5_MAX_PORTS; i++)
-		if (ldev->pf[i].dev)
+		if (!ldev->pf[i].dev)
 			break;

-	if (i == MLX5_MAX_PORTS) {
-		if (ldev->nb.notifier_call) {
-			unregister_netdevice_notifier_net(&init_net, &ldev->nb);
-			ldev->nb.notifier_call = NULL;
-		}
-		mlx5_lag_mp_cleanup(ldev);
-		cancel_delayed_work_sync(&ldev->bond_work);
-		mlx5_lag_dev_free(ldev);
-	}
+	if (i >= MLX5_MAX_PORTS)
+		ldev->flags |= MLX5_LAG_FLAG_READY;
 }

 bool mlx5_lag_is_roce(struct mlx5_core_dev *dev)
@@ -647,7 +724,7 @@ bool mlx5_lag_is_roce(struct mlx5_core_dev *dev)
 	bool res;

 	spin_lock(&lag_lock);
-	ldev = mlx5_lag_dev_get(dev);
+	ldev = mlx5_lag_dev(dev);
 	res  = ldev && __mlx5_lag_is_roce(ldev);
 	spin_unlock(&lag_lock);

@@ -661,7 +738,7 @@ bool mlx5_lag_is_active(struct mlx5_core_dev *dev)
 	bool res;

 	spin_lock(&lag_lock);
-	ldev = mlx5_lag_dev_get(dev);
+	ldev = mlx5_lag_dev(dev);
 	res  = ldev && __mlx5_lag_is_active(ldev);
 	spin_unlock(&lag_lock);

@@ -675,7 +752,7 @@ bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev)
 	bool res;

 	spin_lock(&lag_lock);
-	ldev = mlx5_lag_dev_get(dev);
+	ldev = mlx5_lag_dev(dev);
 	res  = ldev && __mlx5_lag_is_sriov(ldev);
 	spin_unlock(&lag_lock);

@@ -688,7 +765,7 @@ void mlx5_lag_update(struct mlx5_core_dev *dev)
 	struct mlx5_lag *ldev;

 	mlx5_dev_list_lock();
-	ldev = mlx5_lag_dev_get(dev);
+	ldev = mlx5_lag_dev(dev);
 	if (!ldev)
 		goto unlock;

@@ -704,7 +781,7 @@ struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev)
 	struct mlx5_lag *ldev;

 	spin_lock(&lag_lock);
-	ldev = mlx5_lag_dev_get(dev);
+	ldev = mlx5_lag_dev(dev);

 	if (!(ldev && __mlx5_lag_is_roce(ldev)))
 		goto unlock;
@@ -733,7 +810,7 @@ u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev,
 	u8 port = 0;

 	spin_lock(&lag_lock);
-	ldev = mlx5_lag_dev_get(dev);
+	ldev = mlx5_lag_dev(dev);
 	if (!(ldev && __mlx5_lag_is_roce(ldev)))
 		goto unlock;

@@ -769,7 +846,7 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
 	memset(values, 0, sizeof(*values) * num_counters);

 	spin_lock(&lag_lock);
-	ldev = mlx5_lag_dev_get(dev);
+	ldev = mlx5_lag_dev(dev);
 	if (ldev && __mlx5_lag_is_active(ldev)) {
 		num_ports = MLX5_MAX_PORTS;
 		mdev[MLX5_LAG_P1] = ldev->pf[MLX5_LAG_P1].dev;

--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.h
@@ -40,6 +40,7 @@ struct lag_tracker {
 struct mlx5_lag {
 	u8                        flags;
 	u8                        v2p_map[MLX5_MAX_PORTS];
+	struct kref               ref;
 	struct lag_func           pf[MLX5_MAX_PORTS];
 	struct lag_tracker        tracker;
 	struct workqueue_struct   *wq;
@@ -49,7 +50,7 @@ struct mlx5_lag {
 };

 static inline struct mlx5_lag *
-mlx5_lag_dev_get(struct mlx5_core_dev *dev)
+mlx5_lag_dev(struct mlx5_core_dev *dev)
 {
 	return dev->priv.lag;
 }

--- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
@@ -28,7 +28,7 @@ bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev)
 	struct mlx5_lag *ldev;
 	bool res;

-	ldev = mlx5_lag_dev_get(dev);
+	ldev = mlx5_lag_dev(dev);
 	res  = ldev && __mlx5_lag_is_multipath(ldev);

 	return res;

--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
-/* Copyright (c) 2018 Mellanox Technologies */
+/* Copyright (c) 2018-2021, Mellanox Technologies inc.  All rights reserved. */

 #ifndef __LIB_MLX5_EQ_H__
 #define __LIB_MLX5_EQ_H__
@@ -32,6 +32,7 @@ struct mlx5_eq {
 	unsigned int            irqn;
 	u8                      eqn;
 	struct mlx5_rsc_debug   *dbg;
+	struct mlx5_irq         *irq;
 };

 struct mlx5_eq_async {

--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sf.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sf.h
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2021 Mellanox Technologies Ltd */
+
+#ifndef __LIB_MLX5_SF_H__
+#define __LIB_MLX5_SF_H__
+
+#include <linux/mlx5/driver.h>
+
+static inline u16 mlx5_sf_start_function_id(const struct mlx5_core_dev *dev)
+{
+	return MLX5_CAP_GEN(dev, sf_base_id);
+}
+
+#ifdef CONFIG_MLX5_SF
+
+static inline bool mlx5_sf_supported(const struct mlx5_core_dev *dev)
+{
+	return MLX5_CAP_GEN(dev, sf);
+}
+
+static inline u16 mlx5_sf_max_functions(const struct mlx5_core_dev *dev)
+{
+	if (!mlx5_sf_supported(dev))
+		return 0;
+	if (MLX5_CAP_GEN(dev, max_num_sf))
+		return MLX5_CAP_GEN(dev, max_num_sf);
+	else
+		return 1 << MLX5_CAP_GEN(dev, log_max_sf);
+}
+
+#else
+
+static inline bool mlx5_sf_supported(const struct mlx5_core_dev *dev)
+{
+	return false;
+}
+
+static inline u16 mlx5_sf_max_functions(const struct mlx5_core_dev *dev)
+{
+	return 0;
+}
+
+#endif
+
+#endif
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -76,6 +76,7 @@
 #include "sf/vhca_event.h"
 #include "sf/dev/dev.h"
 #include "sf/sf.h"
+#include "mlx5_irq.h"

 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
 MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver");
@@ -1185,6 +1186,7 @@ static int mlx5_load(struct mlx5_core_dev *dev)
 	}

 	mlx5_sf_dev_table_create(dev);
+	mlx5_lag_add_mdev(dev);

 	return 0;

@@ -1219,6 +1221,7 @@ static int mlx5_load(struct mlx5_core_dev *dev)

 static void mlx5_unload(struct mlx5_core_dev *dev)
 {
+	mlx5_lag_remove_mdev(dev);
 	mlx5_sf_dev_table_destroy(dev);
 	mlx5_sriov_detach(dev);
 	mlx5_ec_cleanup(dev);

--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -164,27 +164,10 @@ int mlx5_query_mcam_reg(struct mlx5_core_dev *dev, u32 *mcap, u8 feature_group,
 int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam,
 			u8 feature_group, u8 access_reg_group);

-void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev);
-void mlx5_lag_remove(struct mlx5_core_dev *dev);
-
-int mlx5_irq_table_init(struct mlx5_core_dev *dev);
-void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev);
-int mlx5_irq_table_create(struct mlx5_core_dev *dev);
-void mlx5_irq_table_destroy(struct mlx5_core_dev *dev);
-int mlx5_irq_attach_nb(struct mlx5_irq_table *irq_table, int vecidx,
-		       struct notifier_block *nb);
-int mlx5_irq_detach_nb(struct mlx5_irq_table *irq_table, int vecidx,
-		       struct notifier_block *nb);
-
-int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int devfn,
-			    int msix_vec_count);
-int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs);
-
-struct cpumask *
-mlx5_irq_get_affinity_mask(struct mlx5_irq_table *irq_table, int vecidx);
-struct cpu_rmap *mlx5_irq_get_rmap(struct mlx5_irq_table *table);
-int mlx5_irq_get_num_comp(struct mlx5_irq_table *table);
-struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev);
+void mlx5_lag_add_netdev(struct mlx5_core_dev *dev, struct net_device *netdev);
+void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev, struct net_device *netdev);
+void mlx5_lag_add_mdev(struct mlx5_core_dev *dev);
+void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev);

 int mlx5_events_init(struct mlx5_core_dev *dev);
 void mlx5_events_cleanup(struct mlx5_core_dev *dev);

--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2021 Mellanox Technologies. */
+
+#ifndef __MLX5_IRQ_H__
+#define __MLX5_IRQ_H__
+
+#include <linux/mlx5/driver.h>
+
+#define MLX5_COMP_EQS_PER_SF 8
+
+#define MLX5_IRQ_EQ_CTRL (0)
+
+struct mlx5_irq;
+
+int mlx5_irq_table_init(struct mlx5_core_dev *dev);
+void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev);
+int mlx5_irq_table_create(struct mlx5_core_dev *dev);
+void mlx5_irq_table_destroy(struct mlx5_core_dev *dev);
+int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table);
+int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table);
+struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev);
+
+int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int devfn,
+			    int msix_vec_count);
+int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs);
+
+struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
+				  struct cpumask *affinity);
+void mlx5_irq_release(struct mlx5_irq *irq);
+int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb);
+int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb);
+struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq);
+int mlx5_irq_get_index(struct mlx5_irq *irq);
+
+#endif /* __MLX5_IRQ_H__ */
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
@@ -5,42 +5,7 @@
 #define __MLX5_SF_H__

 #include <linux/mlx5/driver.h>
-
-static inline u16 mlx5_sf_start_function_id(const struct mlx5_core_dev *dev)
-{
-	return MLX5_CAP_GEN(dev, sf_base_id);
-}
-
-#ifdef CONFIG_MLX5_SF
-
-static inline bool mlx5_sf_supported(const struct mlx5_core_dev *dev)
-{
-	return MLX5_CAP_GEN(dev, sf);
-}
-
-static inline u16 mlx5_sf_max_functions(const struct mlx5_core_dev *dev)
-{
-	if (!mlx5_sf_supported(dev))
-		return 0;
-	if (MLX5_CAP_GEN(dev, max_num_sf))
-		return MLX5_CAP_GEN(dev, max_num_sf);
-	else
-		return 1 << MLX5_CAP_GEN(dev, log_max_sf);
-}
-
-#else
-
-static inline bool mlx5_sf_supported(const struct mlx5_core_dev *dev)
-{
-	return false;
-}
-
-static inline u16 mlx5_sf_max_functions(const struct mlx5_core_dev *dev)
-{
-	return 0;
-}
-
-#endif
+#include "lib/sf.h"

 #ifdef CONFIG_MLX5_SF_MANAGER


--- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
@@ -34,6 +34,7 @@
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/vport.h>
 #include "mlx5_core.h"
+#include "mlx5_irq.h"
 #include "eswitch.h"

 static int sriov_restore_guids(struct mlx5_core_dev *dev, int vf)

--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -16,6 +16,7 @@ struct mlx5_eq_param {
 	u8             irq_index;
 	int            nent;
 	u64            mask[4];
+	cpumask_var_t  affinity;
 };

 struct mlx5_eq *

--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -3806,8 +3806,8 @@ struct mlx5_ifc_eqc_bits {

 	u8         reserved_at_80[0x20];

-	u8         reserved_at_a0[0x18];
-	u8         intr[0x8];
+	u8         reserved_at_a0[0x14];
+	u8         intr[0xc];

 	u8         reserved_at_c0[0x3];
 	u8         log_page_size[0x5];