Merge tag 'mlx5-updates-2022-01-06' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux

Saeed Mahameed says: ==================== mlx5-updates-2022-01-06 1) Expose FEC per lane block counters via ethtool 2) Trivial fixes/updates/cleanup to mlx5e netdev driver 3) Fix htmldoc build warning 4) Spread mlx5 SFs (sub-functions) to all available CPU cores: Commits 1..5 Shay Drory Says: ================ Before this patchset, mlx5 subfunction shared the same IRQs (MSI-X) with their peers subfunctions, causing them to use same CPU cores. In large scale, this is very undesirable, SFs use small number of cpu cores and all of them will be packed on the same CPU cores, not utilizing all CPU cores in the system. In this patchset we want to achieve two things. a) Spread IRQs used by SFs to all cpu cores b) Pack less SFs in the same IRQ, will result in multiple IRQs per core. In this patchset, we spread SFs over all online cpus available to mlx5 irqs in Round-Robin manner. e.g.: Whenever a SF is created, pick the next CPU core with least number of SF IRQs bound to it, SFs will share IRQs on the same core until a certain limit, when such limit is reached, we request a new IRQ and add it to that CPU core IRQ pool, when out of IRQs, pick any IRQ with least number of SF users. This enhancement is done in order to achieve a better distribution of the SFs over all the available CPUs, which reduces application latency, as shown bellow. Machine details: Intel(R) Xeon(R) CPU E5-2697 v3 @ 2.60GHz with 56 cores. PCI Express 3 with BW of 126 Gb/s. ConnectX-5 Ex; EDR IB (100Gb/s) and 100GbE; dual-port QSFP28; PCIe4.0 x16. Base line test description: Single SF on the system. One instance of netperf is running on-top the SF. Numbers: latency = 15.136 usec, CPU Util = 35% Test description: There are 250 SFs on the system. There are 3 instances of netperf running, on-top three different SFs, in parallel. Perf numbers: # netperf SFs latency(usec) latency CPU utilization affinity affinity (lower is better) increase % 1 cpu=0 cpu={0} ~23 (app 1-3) 35% 75% 2 cpu=0,2,4 cpu={0} app 1: 21.625 30% 68% (CPU 0) app 2-3: 16.5 9% 15% (CPU 2,4) 3 cpu=0 cpu={0,2,4} app 1: ~16 7% 84% (CPU 0) app 2-3: ~17.9 14% 22% (CPU 2,4) 4 cpu=0,2,4 cpu={0,2,4} 15.2 (app 1-3) 0% 33% (CPU 0,2,4) - The first two entries (#1 and #2) show current state. e.g.: SFs are using the same CPU. The last two entries (#3 and #4) shows the latency reduction improvement of this patch. e.g.: SFs are on different CPUs. - Whenever we use several CPUs, in case there is a different CPU utilization, write the utilization of each CPU separately. - Whenever the latency result of the netperf instances were different, write the latency of each netperf instances separately. Commands: - for netperf CPU=0: $ for i in {1..3}; do taskset -c 0 netperf -H 1${i}.1.1.1 -t TCP_RR -- \ -o RT_LATENCY -r8 & done - for netperf CPU=0,2,4 $ for i in {1..3}; do taskset -c $(( ($i - 1) * 2 )) netperf -H \ 1${i}.1.1.1 -t TCP_RR -- -o RT_LATENCY -r8 & done ================ ==================== Signed-off-by: David S. Miller <davem@davemloft.net>

Merge tag 'mlx5-updates-2022-01-06' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux
Saeed Mahameed says: ==================== mlx5-updates-2022-01-06 1) Expose FEC per lane block counters via ethtool 2) Trivial fixes/updates/cleanup to mlx5e netdev driver 3) Fix htmldoc build warning 4) Spread mlx5 SFs (sub-functions) to all available CPU cores: Commits 1..5 Shay Drory Says: ================ Before this patchset, mlx5 subfunction shared the same IRQs (MSI-X) with their peers subfunctions, causing them to use same CPU cores. In large scale, this is very undesirable, SFs use small number of cpu cores and all of them will be packed on the same CPU cores, not utilizing all CPU cores in the system. In this patchset we want to achieve two things. a) Spread IRQs used by SFs to all cpu cores b) Pack less SFs in the same IRQ, will result in multiple IRQs per core. In this patchset, we spread SFs over all online cpus available to mlx5 irqs in Round-Robin manner. e.g.: Whenever a SF is created, pick the next CPU core with least number of SF IRQs bound to it, SFs will share IRQs on the same core until a certain limit, when such limit is reached, we request a new IRQ and add it to that CPU core IRQ pool, when out of IRQs, pick any IRQ with least number of SF users. This enhancement is done in order to achieve a better distribution of the SFs over all the available CPUs, which reduces application latency, as shown bellow. Machine details: Intel(R) Xeon(R) CPU E5-2697 v3 @ 2.60GHz with 56 cores. PCI Express 3 with BW of 126 Gb/s. ConnectX-5 Ex; EDR IB (100Gb/s) and 100GbE; dual-port QSFP28; PCIe4.0 x16. Base line test description: Single SF on the system. One instance of netperf is running on-top the SF. Numbers: latency = 15.136 usec, CPU Util = 35% Test description: There are 250 SFs on the system. There are 3 instances of netperf running, on-top three different SFs, in parallel. Perf numbers: # netperf SFs latency(usec) latency CPU utilization affinity affinity (lower is better) increase % 1 cpu=0 cpu={0} ~23 (app 1-3) 35% 75% 2 cpu=0,2,4 cpu={0} app 1: 21.625 30% 68% (CPU 0) app 2-3: 16.5 9% 15% (CPU 2,4) 3 cpu=0 cpu={0,2,4} app 1: ~16 7% 84% (CPU 0) app 2-3: ~17.9 14% 22% (CPU 2,4) 4 cpu=0,2,4 cpu={0,2,4} 15.2 (app 1-3) 0% 33% (CPU 0,2,4) - The first two entries (#1 and #2) show current state. e.g.: SFs are using the same CPU. The last two entries (#3 and #4) shows the latency reduction improvement of this patch. e.g.: SFs are on different CPUs. - Whenever we use several CPUs, in case there is a different CPU utilization, write the utilization of each CPU separately. - Whenever the latency result of the netperf instances were different, write the latency of each netperf instances separately. Commands: - for netperf CPU=0: $ for i in {1..3}; do taskset -c 0 netperf -H 1${i}.1.1.1 -t TCP_RR -- \ -o RT_LATENCY -r8 & done - for netperf CPU=0,2,4 $ for i in {1..3}; do taskset -c $(( ($i - 1) * 2 )) netperf -H \ 1${i}.1.1.1 -t TCP_RR -- -o RT_LATENCY -r8 & done ================ ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
26abf15c · David S. Miller · e4a3d6a6 · 745a1306 · 26abf15c · 26abf15c
Commit 26abf15c authored Jan 07, 2022 by David S. Miller
19 changed files
--- a/Documentation/networking/devlink/mlx5.rst
+++ b/Documentation/networking/devlink/mlx5.rst
@@ -17,6 +17,7 @@ Parameters
     - Validation
   * - ``enable_roce``
     - driverinit
+     - Type: Boolean
   * - ``io_eq_size``
     - driverinit
     - The range is between 64 and 4096.

--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1541,16 +1541,10 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
 	param = (struct mlx5_eq_param) {
-		.irq_index = MLX5_IRQ_EQ_CTRL,
 		.nent = MLX5_IB_NUM_PF_EQE,
 	};
 	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
-	if (!zalloc_cpumask_var(&param.affinity, GFP_KERNEL)) {
-		err = -ENOMEM;
-		goto err_wq;
-	}
 	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
-	free_cpumask_var(param.affinity);
 	if (IS_ERR(eq->core)) {
 		err = PTR_ERR(eq->core);
 		goto err_wq;

--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -109,7 +109,7 @@ mlx5_core-$(CONFIG_MLX5_SW_STEERING) += steering/dr_domain.o steering/dr_table.o
 #
 # SF device
 #
-mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o sf/dev/driver.o
+mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o sf/dev/driver.o irq_affinity.o
 #
 # SF manager

--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -984,7 +984,7 @@ struct mlx5e_profile {
 };
 #define mlx5e_profile_feature_cap(profile, feature)	\
-	((profile)->features & (MLX5E_PROFILE_FEATURE_## feature))
+	((profile)->features & BIT(MLX5E_PROFILE_FEATURE_##feature))
 void mlx5e_build_ptys2ethtool_map(void);

--- a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
@@ -120,14 +120,14 @@ static void mlx5e_hv_vhca_stats_cleanup(struct mlx5_hv_vhca_agent *agent)
 	cancel_delayed_work_sync(&priv->stats_agent.work);
 }
-int mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv)
+void mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv)
 {
 	int buf_len = mlx5e_hv_vhca_stats_buf_size(priv);
 	struct mlx5_hv_vhca_agent *agent;
 	priv->stats_agent.buf = kvzalloc(buf_len, GFP_KERNEL);
 	if (!priv->stats_agent.buf)
-		return -ENOMEM;
+		return;
 	agent = mlx5_hv_vhca_agent_create(priv->mdev->hv_vhca,
 					  MLX5_HV_VHCA_AGENT_STATS,
@@ -142,13 +142,11 @@ int mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv)
 				    PTR_ERR(agent));
 		kvfree(priv->stats_agent.buf);
-		return IS_ERR_OR_NULL(agent);
+		return;
 	}
 	priv->stats_agent.agent = agent;
 	INIT_DELAYED_WORK(&priv->stats_agent.work, mlx5e_hv_vhca_stats_work);
-	return 0;
 }
 void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv)

--- a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
@@ -7,19 +7,12 @@
 #if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
-int mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv);
+void mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv);
 void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv);
 #else
+static inline void mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv) {}
-static inline int mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv)
+static inline void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv) {}
-{
-	return 0;
-}
-static inline void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv)
-{
-}
 #endif
 #endif /* __MLX5_EN_STATS_VHCA_H__ */
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c
@@ -45,14 +45,10 @@ verify_uplink_forwarding(struct mlx5e_priv *priv,
 					termination_table_raw_traffic)) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "devices are both uplink, can't offload forwarding");
-			pr_err("devices %s %s are both uplink, can't offload forwarding\n",
-			       priv->netdev->name, out_dev->name);
 			return -EOPNOTSUPP;
 	} else if (out_dev != rep_priv->netdev) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "devices are not the same uplink, can't offload forwarding");
-		pr_err("devices %s %s are both uplink but not the same, can't offload forwarding\n",
-		       priv->netdev->name, out_dev->name);
 		return -EOPNOTSUPP;
 	}
 	return 0;
@@ -160,10 +156,6 @@ tc_act_can_offload_mirred(struct mlx5e_tc_act_parse_state *parse_state,
 	}
 	NL_SET_ERR_MSG_MOD(extack, "devices are not on same switch HW, can't offload forwarding");
-	netdev_warn(priv->netdev,
-		    "devices %s %s not on same switch HW, can't offload forwarding\n",
-		    netdev_name(priv->netdev),
-		    out_dev->name);
 	return false;
 }

--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1883,24 +1883,19 @@ static int set_pflag_cqe_based_moder(struct net_device *netdev, bool enable,
 				     bool is_rx_cq)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
-	struct mlx5_core_dev *mdev = priv->mdev;
-	struct mlx5e_params new_params;
-	bool mode_changed;
 	u8 cq_period_mode, current_cq_period_mode;
+	struct mlx5e_params new_params;
+	if (enable && !MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe))
+		return -EOPNOTSUPP;
+	cq_period_mode = cqe_mode_to_period_mode(enable);
-	cq_period_mode = enable ?
-		MLX5_CQ_PERIOD_MODE_START_FROM_CQE :
-		MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
 	current_cq_period_mode = is_rx_cq ?
 		priv->channels.params.rx_cq_moderation.cq_period_mode :
 		priv->channels.params.tx_cq_moderation.cq_period_mode;
-	mode_changed = cq_period_mode != current_cq_period_mode;
-	if (cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE &&
-	    !MLX5_CAP_GEN(mdev, cq_period_start_from_cqe))
-		return -EOPNOTSUPP;
-	if (!mode_changed)
+	if (cq_period_mode == current_cq_period_mode)
 		return 0;
 	new_params = priv->channels.params;

--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3605,11 +3605,6 @@ static int set_feature_hw_gro(struct net_device *netdev, bool enable)
 	new_params = priv->channels.params;
 	if (enable) {
-		if (MLX5E_GET_PFLAG(&new_params, MLX5E_PFLAG_RX_CQE_COMPRESS)) {
-			netdev_warn(netdev, "Can't set HW-GRO when CQE compress is active\n");
-			err = -EINVAL;
-			goto out;
-		}
 		new_params.packet_merge.type = MLX5E_PACKET_MERGE_SHAMPO;
 		new_params.packet_merge.shampo.match_criteria_type =
 			MLX5_RQC_SHAMPO_MATCH_CRITERIA_TYPE_EXTENDED;
@@ -3871,6 +3866,11 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev,
 		features &= ~NETIF_F_RXHASH;
 		if (netdev->features & NETIF_F_RXHASH)
 			netdev_warn(netdev, "Disabling rxhash, not supported when CQE compress is active\n");
+		if (features & NETIF_F_GRO_HW) {
+			netdev_warn(netdev, "Disabling HW-GRO, not supported when CQE compress is active\n");
+			features &= ~NETIF_F_GRO_HW;
+		}
 	}
 	if (mlx5e_is_uplink_rep(priv))

--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -1603,6 +1603,12 @@ static void trigger_report(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	}
 }
+static void mlx5e_handle_rx_err_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	trigger_report(rq, cqe);
+	rq->stats->wqe_err++;
+}
 static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
@@ -1616,8 +1622,7 @@ static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
-		trigger_report(rq, cqe);
+		mlx5e_handle_rx_err_cqe(rq, cqe);
-		rq->stats->wqe_err++;
 		goto free_wqe;
 	}
@@ -1670,7 +1675,7 @@ static void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
-		rq->stats->wqe_err++;
+		mlx5e_handle_rx_err_cqe(rq, cqe);
 		goto free_wqe;
 	}
@@ -1719,8 +1724,7 @@ static void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq, struct mlx5_cqe64
 	wi->consumed_strides += cstrides;
 	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
-		trigger_report(rq, cqe);
+		mlx5e_handle_rx_err_cqe(rq, cqe);
-		rq->stats->wqe_err++;
 		goto mpwrq_cqe_out;
 	}
@@ -1988,8 +1992,7 @@ static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cq
 	wi->consumed_strides += cstrides;
 	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
-		trigger_report(rq, cqe);
+		mlx5e_handle_rx_err_cqe(rq, cqe);
-		stats->wqe_err++;
 		goto mpwrq_cqe_out;
 	}
@@ -2058,8 +2061,7 @@ static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cq
 	wi->consumed_strides += cstrides;
 	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
-		trigger_report(rq, cqe);
+		mlx5e_handle_rx_err_cqe(rq, cqe);
-		rq->stats->wqe_err++;
 		goto mpwrq_cqe_out;
 	}

--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -35,6 +35,7 @@
 #include "en_accel/tls.h"
 #include "en_accel/en_accel.h"
 #include "en/ptp.h"
+#include "en/port.h"
 static unsigned int stats_grps_num(struct mlx5e_priv *priv)
 {
@@ -1158,12 +1159,99 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(phy)
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 }
-void mlx5e_stats_fec_get(struct mlx5e_priv *priv,
+static int fec_num_lanes(struct mlx5_core_dev *dev)
-			 struct ethtool_fec_stats *fec_stats)
+{
+	u32 out[MLX5_ST_SZ_DW(pmlp_reg)] = {};
+	u32 in[MLX5_ST_SZ_DW(pmlp_reg)] = {};
+	int err;
+	MLX5_SET(pmlp_reg, in, local_port, 1);
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_PMLP, 0, 0);
+	if (err)
+		return 0;
+	return MLX5_GET(pmlp_reg, out, width);
+}
+static int fec_active_mode(struct mlx5_core_dev *mdev)
+{
+	unsigned long fec_active_long;
+	u32 fec_active;
+	if (mlx5e_get_fec_mode(mdev, &fec_active, NULL))
+		return MLX5E_FEC_NOFEC;
+	fec_active_long = fec_active;
+	return find_first_bit(&fec_active_long, sizeof(unsigned long) * BITS_PER_BYTE);
+}
+#define MLX5E_STATS_SET_FEC_BLOCK(idx) ({ \
+	fec_stats->corrected_blocks.lanes[(idx)] = \
+		MLX5E_READ_CTR64_BE_F(ppcnt, phys_layer_cntrs, \
+				      fc_fec_corrected_blocks_lane##idx); \
+	fec_stats->uncorrectable_blocks.lanes[(idx)] = \
+		MLX5E_READ_CTR64_BE_F(ppcnt, phys_layer_cntrs, \
+				      fc_fec_uncorrectable_blocks_lane##idx); \
+})
+static void fec_set_fc_stats(struct ethtool_fec_stats *fec_stats,
+			     u32 *ppcnt, u8 lanes)
+{
+	if (lanes > 3) { /* 4 lanes */
+		MLX5E_STATS_SET_FEC_BLOCK(3);
+		MLX5E_STATS_SET_FEC_BLOCK(2);
+	}
+	if (lanes > 1) /* 2 lanes */
+		MLX5E_STATS_SET_FEC_BLOCK(1);
+	if (lanes > 0) /* 1 lane */
+		MLX5E_STATS_SET_FEC_BLOCK(0);
+}
+static void fec_set_rs_stats(struct ethtool_fec_stats *fec_stats, u32 *ppcnt)
+{
+	fec_stats->corrected_blocks.total =
+		MLX5E_READ_CTR64_BE_F(ppcnt, phys_layer_cntrs,
+				      rs_fec_corrected_blocks);
+	fec_stats->uncorrectable_blocks.total =
+		MLX5E_READ_CTR64_BE_F(ppcnt, phys_layer_cntrs,
+				      rs_fec_uncorrectable_blocks);
+}
+static void fec_set_block_stats(struct mlx5e_priv *priv,
+				struct ethtool_fec_stats *fec_stats)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 out[MLX5_ST_SZ_DW(ppcnt_reg)] = {};
+	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {};
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+	int mode = fec_active_mode(mdev);
+	if (mode == MLX5E_FEC_NOFEC)
+		return;
+	MLX5_SET(ppcnt_reg, in, local_port, 1);
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP);
+	if (mlx5_core_access_reg(mdev, in, sz, outl, sz, MLX5_REG_PPCNT, 0, 0))
+		return;
+	switch (mode) {
+	case MLX5E_FEC_RS_528_514:
+	case MLX5E_FEC_RS_544_514:
+	case MLX5E_FEC_LLRS_272_257_1:
+		fec_set_rs_stats(fec_stats, out);
+		return;
+	case MLX5E_FEC_FIRECODE:
+		fec_set_fc_stats(fec_stats, out, fec_num_lanes(mdev));
+	}
+}
+static void fec_set_corrected_bits_total(struct mlx5e_priv *priv,
+					 struct ethtool_fec_stats *fec_stats)
 {
 	u32 ppcnt_phy_statistical[MLX5_ST_SZ_DW(ppcnt_reg)];
 	struct mlx5_core_dev *mdev = priv->mdev;
-	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0};
+	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {};
 	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
 	if (!MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group))
@@ -1181,6 +1269,13 @@ void mlx5e_stats_fec_get(struct mlx5e_priv *priv,
 				      phy_corrected_bits);
 }
+void mlx5e_stats_fec_get(struct mlx5e_priv *priv,
+			 struct ethtool_fec_stats *fec_stats)
+{
+	fec_set_corrected_bits_total(priv, fec_stats);
+	fec_set_block_stats(priv, fec_stats);
+}
 #define PPORT_ETH_EXT_OFF(c) \
 	MLX5_BYTE_OFF(ppcnt_reg, \
 		      counter_set.eth_extended_cntrs_grp_data_layout.c##_high)

--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -59,6 +59,8 @@ struct mlx5_eq_table {
 	struct mutex            lock; /* sync async eqs creations */
 	int			num_comp_eqs;
 	struct mlx5_irq_table	*irq_table;
+	struct mlx5_irq         **comp_irqs;
+	struct mlx5_irq         *ctrl_irq;
 #ifdef CONFIG_RFS_ACCEL
 	struct cpu_rmap		*rmap;
 #endif
@@ -266,8 +268,8 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
 	u8 log_eq_stride = ilog2(MLX5_EQE_SIZE);
 	struct mlx5_priv *priv = &dev->priv;
-	u16 vecidx = param->irq_index;
 	__be64 *pas;
+	u16 vecidx;
 	void *eqc;
 	int inlen;
 	u32 *in;
@@ -289,20 +291,16 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	mlx5_init_fbc(eq->frag_buf.frags, log_eq_stride, log_eq_size, &eq->fbc);
 	init_eq_buf(eq);
-	eq->irq = mlx5_irq_request(dev, vecidx, param->affinity);
+	eq->irq = param->irq;
-	if (IS_ERR(eq->irq)) {
-		err = PTR_ERR(eq->irq);
-		goto err_buf;
-	}
 	vecidx = mlx5_irq_get_index(eq->irq);
 	inlen = MLX5_ST_SZ_BYTES(create_eq_in) +
 		MLX5_FLD_SZ_BYTES(create_eq_in, pas[0]) * eq->frag_buf.npages;
 	in = kvzalloc(inlen, GFP_KERNEL);
 	if (!in) {
 		err = -ENOMEM;
-		goto err_irq;
+		goto err_buf;
 	}
 	pas = (__be64 *)MLX5_ADDR_OF(create_eq_in, in, pas);
@@ -346,8 +344,6 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 err_in:
 	kvfree(in);
-err_irq:
-	mlx5_irq_release(eq->irq);
 err_buf:
 	mlx5_frag_buf_free(dev, &eq->frag_buf);
 	return err;
@@ -401,7 +397,6 @@ static int destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 	if (err)
 		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
 			       eq->eqn);
-	mlx5_irq_release(eq->irq);
 	mlx5_frag_buf_free(dev, &eq->frag_buf);
 	return err;
@@ -594,11 +589,8 @@ setup_async_eq(struct mlx5_core_dev *dev, struct mlx5_eq_async *eq,
 	eq->irq_nb.notifier_call = mlx5_eq_async_int;
 	spin_lock_init(&eq->lock);
-	if (!zalloc_cpumask_var(&param->affinity, GFP_KERNEL))
-		return -ENOMEM;
 	err = create_async_eq(dev, &eq->core, param);
-	free_cpumask_var(param->affinity);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create %s EQ %d\n", name, err);
 		return err;
@@ -643,11 +635,18 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 	struct mlx5_eq_param param = {};
 	int err;
+	/* All the async_eqs are using single IRQ, request one IRQ and share its
+	 * index among all the async_eqs of this device.
+	 */
+	table->ctrl_irq = mlx5_ctrl_irq_request(dev);
+	if (IS_ERR(table->ctrl_irq))
+		return PTR_ERR(table->ctrl_irq);
 	MLX5_NB_INIT(&table->cq_err_nb, cq_err_event_notifier, CQ_ERROR);
 	mlx5_eq_notifier_register(dev, &table->cq_err_nb);
 	param = (struct mlx5_eq_param) {
-		.irq_index = MLX5_IRQ_EQ_CTRL,
+		.irq = table->ctrl_irq,
 		.nent = MLX5_NUM_CMD_EQE,
 		.mask[0] = 1ull << MLX5_EVENT_TYPE_CMD,
 	};
@@ -660,7 +659,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 	mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL);
 	param = (struct mlx5_eq_param) {
-		.irq_index = MLX5_IRQ_EQ_CTRL,
+		.irq = table->ctrl_irq,
 		.nent = async_eq_depth_devlink_param_get(dev),
 	};
@@ -670,7 +669,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 		goto err2;
 	param = (struct mlx5_eq_param) {
-		.irq_index = MLX5_IRQ_EQ_CTRL,
+		.irq = table->ctrl_irq,
 		.nent = /* TODO: sriov max_vf + */ 1,
 		.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_REQUEST,
 	};
@@ -689,6 +688,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 err1:
 	mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL);
 	mlx5_eq_notifier_unregister(dev, &table->cq_err_nb);
+	mlx5_ctrl_irq_release(table->ctrl_irq);
 	return err;
 }
@@ -703,6 +703,7 @@ static void destroy_async_eqs(struct mlx5_core_dev *dev)
 	cleanup_async_eq(dev, &table->cmd_eq, "cmd");
 	mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL);
 	mlx5_eq_notifier_unregister(dev, &table->cq_err_nb);
+	mlx5_ctrl_irq_release(table->ctrl_irq);
 }
 struct mlx5_eq *mlx5_get_async_eq(struct mlx5_core_dev *dev)
@@ -730,12 +731,10 @@ mlx5_eq_create_generic(struct mlx5_core_dev *dev,
 	struct mlx5_eq *eq = kvzalloc(sizeof(*eq), GFP_KERNEL);
 	int err;
-	if (!cpumask_available(param->affinity))
-		return ERR_PTR(-EINVAL);
 	if (!eq)
 		return ERR_PTR(-ENOMEM);
+	param->irq = dev->priv.eq_table->ctrl_irq;
 	err = create_async_eq(dev, eq, param);
 	if (err) {
 		kvfree(eq);
@@ -795,6 +794,54 @@ void mlx5_eq_update_ci(struct mlx5_eq *eq, u32 cc, bool arm)
 }
 EXPORT_SYMBOL(mlx5_eq_update_ci);
+static void comp_irqs_release(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = dev->priv.eq_table;
+	if (mlx5_core_is_sf(dev))
+		mlx5_irq_affinity_irqs_release(dev, table->comp_irqs, table->num_comp_eqs);
+	else
+		mlx5_irqs_release_vectors(table->comp_irqs, table->num_comp_eqs);
+	kfree(table->comp_irqs);
+}
+static int comp_irqs_request(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = dev->priv.eq_table;
+	int ncomp_eqs = table->num_comp_eqs;
+	u16 *cpus;
+	int ret;
+	int i;
+	ncomp_eqs = table->num_comp_eqs;
+	table->comp_irqs = kcalloc(ncomp_eqs, sizeof(*table->comp_irqs), GFP_KERNEL);
+	if (!table->comp_irqs)
+		return -ENOMEM;
+	if (mlx5_core_is_sf(dev)) {
+		ret = mlx5_irq_affinity_irqs_request_auto(dev, ncomp_eqs, table->comp_irqs);
+		if (ret < 0)
+			goto free_irqs;
+		return ret;
+	}
+	cpus = kcalloc(ncomp_eqs, sizeof(*cpus), GFP_KERNEL);
+	if (!cpus) {
+		ret = -ENOMEM;
+		goto free_irqs;
+	}
+	for (i = 0; i < ncomp_eqs; i++)
+		cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
+	ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs);
+	kfree(cpus);
+	if (ret < 0)
+		goto free_irqs;
+	return ret;
+free_irqs:
+	kfree(table->comp_irqs);
+	return ret;
+}
 static void destroy_comp_eqs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;
@@ -809,6 +856,7 @@ static void destroy_comp_eqs(struct mlx5_core_dev *dev)
 		tasklet_disable(&eq->tasklet_ctx.task);
 		kfree(eq);
 	}
+	comp_irqs_release(dev);
 }
 static u16 comp_eq_depth_devlink_param_get(struct mlx5_core_dev *dev)
@@ -835,12 +883,13 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 	int err;
 	int i;
+	ncomp_eqs = comp_irqs_request(dev);
+	if (ncomp_eqs < 0)
+		return ncomp_eqs;
 	INIT_LIST_HEAD(&table->comp_eqs_list);
-	ncomp_eqs = table->num_comp_eqs;
 	nent = comp_eq_depth_devlink_param_get(dev);
 	for (i = 0; i < ncomp_eqs; i++) {
 		struct mlx5_eq_param param = {};
-		int vecidx = i;
 		eq = kzalloc(sizeof(*eq), GFP_KERNEL);
 		if (!eq) {
@@ -855,18 +904,11 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 		eq->irq_nb.notifier_call = mlx5_eq_comp_int;
 		param = (struct mlx5_eq_param) {
-			.irq_index = vecidx,
+			.irq = table->comp_irqs[i],
 			.nent = nent,
 		};
-		if (!zalloc_cpumask_var(&param.affinity, GFP_KERNEL)) {
-			err = -ENOMEM;
-			goto clean_eq;
-		}
-		cpumask_set_cpu(cpumask_local_spread(i, dev->priv.numa_node),
-				param.affinity);
 		err = create_map_eq(dev, &eq->core, &param);
-		free_cpumask_var(param.affinity);
 		if (err)
 			goto clean_eq;
 		err = mlx5_eq_enable(dev, &eq->core, &eq->irq_nb);
@@ -880,7 +922,9 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 		list_add_tail(&eq->list, &table->comp_eqs_list);
 	}
+	table->num_comp_eqs = ncomp_eqs;
 	return 0;
 clean_eq:
 	kfree(eq);
 clean:

--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c
@@ -431,7 +431,7 @@ int mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw,
 	int err = 0;
 	if (!mlx5_esw_allowed(esw))
-		return -EPERM;
+		return vlan ? -EPERM : 0;
 	if (vlan || qos)
 		set_flags = SET_VLAN_STRIP | SET_VLAN_INSERT;

--- a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+#include "mlx5_core.h"
+#include "mlx5_irq.h"
+#include "pci_irq.h"
+static void cpu_put(struct mlx5_irq_pool *pool, int cpu)
+{
+	pool->irqs_per_cpu[cpu]--;
+}
+static void cpu_get(struct mlx5_irq_pool *pool, int cpu)
+{
+	pool->irqs_per_cpu[cpu]++;
+}
+/* Gets the least loaded CPU. e.g.: the CPU with least IRQs bound to it */
+static int cpu_get_least_loaded(struct mlx5_irq_pool *pool,
+				const struct cpumask *req_mask)
+{
+	int best_cpu = -1;
+	int cpu;
+	for_each_cpu_and(cpu, req_mask, cpu_online_mask) {
+		/* CPU has zero IRQs on it. No need to search any more CPUs. */
+		if (!pool->irqs_per_cpu[cpu]) {
+			best_cpu = cpu;
+			break;
+		}
+		if (best_cpu < 0)
+			best_cpu = cpu;
+		if (pool->irqs_per_cpu[cpu] < pool->irqs_per_cpu[best_cpu])
+			best_cpu = cpu;
+	}
+	if (best_cpu == -1) {
+		/* There isn't online CPUs in req_mask */
+		mlx5_core_err(pool->dev, "NO online CPUs in req_mask (%*pbl)\n",
+			      cpumask_pr_args(req_mask));
+		best_cpu = cpumask_first(cpu_online_mask);
+	}
+	pool->irqs_per_cpu[best_cpu]++;
+	return best_cpu;
+}
+/* Creating an IRQ from irq_pool */
+static struct mlx5_irq *
+irq_pool_request_irq(struct mlx5_irq_pool *pool, const struct cpumask *req_mask)
+{
+	cpumask_var_t auto_mask;
+	struct mlx5_irq *irq;
+	u32 irq_index;
+	int err;
+	if (!zalloc_cpumask_var(&auto_mask, GFP_KERNEL))
+		return ERR_PTR(-ENOMEM);
+	err = xa_alloc(&pool->irqs, &irq_index, NULL, pool->xa_num_irqs, GFP_KERNEL);
+	if (err)
+		return ERR_PTR(err);
+	if (pool->irqs_per_cpu) {
+		if (cpumask_weight(req_mask) > 1)
+			/* if req_mask contain more then one CPU, set the least loadad CPU
+			 * of req_mask
+			 */
+			cpumask_set_cpu(cpu_get_least_loaded(pool, req_mask), auto_mask);
+		else
+			cpu_get(pool, cpumask_first(req_mask));
+	}
+	irq = mlx5_irq_alloc(pool, irq_index, cpumask_empty(auto_mask) ? req_mask : auto_mask);
+	free_cpumask_var(auto_mask);
+	return irq;
+}
+/* Looking for the IRQ with the smallest refcount that fits req_mask.
+ * If pool is sf_comp_pool, then we are looking for an IRQ with any of the
+ * requested CPUs in req_mask.
+ * for example: req_mask = 0xf, irq0_mask = 0x10, irq1_mask = 0x1. irq0_mask
+ * isn't subset of req_mask, so we will skip it. irq1_mask is subset of req_mask,
+ * we don't skip it.
+ * If pool is sf_ctrl_pool, then all IRQs have the same mask, so any IRQ will
+ * fit. And since mask is subset of itself, we will pass the first if bellow.
+ */
+static struct mlx5_irq *
+irq_pool_find_least_loaded(struct mlx5_irq_pool *pool, const struct cpumask *req_mask)
+{
+	int start = pool->xa_num_irqs.min;
+	int end = pool->xa_num_irqs.max;
+	struct mlx5_irq *irq = NULL;
+	struct mlx5_irq *iter;
+	int irq_refcount = 0;
+	unsigned long index;
+	lockdep_assert_held(&pool->lock);
+	xa_for_each_range(&pool->irqs, index, iter, start, end) {
+		struct cpumask *iter_mask = mlx5_irq_get_affinity_mask(iter);
+		int iter_refcount = mlx5_irq_read_locked(iter);
+		if (!cpumask_subset(iter_mask, req_mask))
+			/* skip IRQs with a mask which is not subset of req_mask */
+			continue;
+		if (iter_refcount < pool->min_threshold)
+			/* If we found an IRQ with less than min_thres, return it */
+			return iter;
+		if (!irq || iter_refcount < irq_refcount) {
+			/* In case we won't find an IRQ with less than min_thres,
+			 * keep a pointer to the least used IRQ
+			 */
+			irq_refcount = iter_refcount;
+			irq = iter;
+		}
+	}
+	return irq;
+}
+/**
+ * mlx5_irq_affinity_request - request an IRQ according to the given mask.
+ * @pool: IRQ pool to request from.
+ * @req_mask: cpumask requested for this IRQ.
+ *
+ * This function returns a pointer to IRQ, or ERR_PTR in case of error.
+ */
+struct mlx5_irq *
+mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, const struct cpumask *req_mask)
+{
+	struct mlx5_irq *least_loaded_irq, *new_irq;
+	mutex_lock(&pool->lock);
+	least_loaded_irq = irq_pool_find_least_loaded(pool, req_mask);
+	if (least_loaded_irq &&
+	    mlx5_irq_read_locked(least_loaded_irq) < pool->min_threshold)
+		goto out;
+	/* We didn't find an IRQ with less than min_thres, try to allocate a new IRQ */
+	new_irq = irq_pool_request_irq(pool, req_mask);
+	if (IS_ERR(new_irq)) {
+		if (!least_loaded_irq) {
+			/* We failed to create an IRQ and we didn't find an IRQ */
+			mlx5_core_err(pool->dev, "Didn't find a matching IRQ. err = %ld\n",
+				      PTR_ERR(new_irq));
+			mutex_unlock(&pool->lock);
+			return new_irq;
+		}
+		/* We failed to create a new IRQ for the requested affinity,
+		 * sharing existing IRQ.
+		 */
+		goto out;
+	}
+	least_loaded_irq = new_irq;
+	goto unlock;
+out:
+	mlx5_irq_get_locked(least_loaded_irq);
+	if (mlx5_irq_read_locked(least_loaded_irq) > pool->max_threshold)
+		mlx5_core_dbg(pool->dev, "IRQ %u overloaded, pool_name: %s, %u EQs on this irq\n",
+			      pci_irq_vector(pool->dev->pdev,
+					     mlx5_irq_get_index(least_loaded_irq)), pool->name,
+			      mlx5_irq_read_locked(least_loaded_irq) / MLX5_EQ_REFS_PER_IRQ);
+unlock:
+	mutex_unlock(&pool->lock);
+	return least_loaded_irq;
+}
+void mlx5_irq_affinity_irqs_release(struct mlx5_core_dev *dev, struct mlx5_irq **irqs,
+				    int num_irqs)
+{
+	struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev);
+	int i;
+	for (i = 0; i < num_irqs; i++) {
+		int cpu = cpumask_first(mlx5_irq_get_affinity_mask(irqs[i]));
+		synchronize_irq(pci_irq_vector(pool->dev->pdev,
+					       mlx5_irq_get_index(irqs[i])));
+		if (mlx5_irq_put(irqs[i]))
+			if (pool->irqs_per_cpu)
+				cpu_put(pool, cpu);
+	}
+}
+/**
+ * mlx5_irq_affinity_irqs_request_auto - request one or more IRQs for mlx5 device.
+ * @dev: mlx5 device that is requesting the IRQs.
+ * @nirqs: number of IRQs to request.
+ * @irqs: an output array of IRQs pointers.
+ *
+ * Each IRQ is bounded to at most 1 CPU.
+ * This function is requesting IRQs according to the default assignment.
+ * The default assignment policy is:
+ * - in each iteration, request the least loaded IRQ which is not bound to any
+ *   CPU of the previous IRQs requested.
+ *
+ * This function returns the number of IRQs requested, (which might be smaller than
+ * @nirqs), if successful, or a negative error code in case of an error.
+ */
+int mlx5_irq_affinity_irqs_request_auto(struct mlx5_core_dev *dev, int nirqs,
+					struct mlx5_irq **irqs)
+{
+	struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev);
+	cpumask_var_t req_mask;
+	struct mlx5_irq *irq;
+	int i = 0;
+	if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_copy(req_mask, cpu_online_mask);
+	for (i = 0; i < nirqs; i++) {
+		if (mlx5_irq_pool_is_sf_pool(pool))
+			irq = mlx5_irq_affinity_request(pool, req_mask);
+		else
+			/* In case SF pool doesn't exists, fallback to the PF IRQs.
+			 * The PF IRQs are already allocated and binded to CPU
+			 * at this point. Hence, only an index is needed.
+			 */
+			irq = mlx5_irq_request(dev, i, NULL);
+		if (IS_ERR(irq))
+			break;
+		irqs[i] = irq;
+		cpumask_clear_cpu(cpumask_first(mlx5_irq_get_affinity_mask(irq)), req_mask);
+		mlx5_core_dbg(pool->dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n",
+			      pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)),
+			      cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)),
+			      mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ);
+	}
+	free_cpumask_var(req_mask);
+	if (!i)
+		return PTR_ERR(irq);
+	return i;
+}
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -98,6 +98,8 @@ enum {
 	MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS = 0x1,
 };
+#define LOG_MAX_SUPPORTED_QPS 0xff
 static struct mlx5_profile profile[] = {
 	[0] = {
 		.mask           = 0,
@@ -109,7 +111,7 @@ static struct mlx5_profile profile[] = {
 	[2] = {
 		.mask		= MLX5_PROF_MASK_QP_SIZE |
 				  MLX5_PROF_MASK_MR_CACHE,
-		.log_max_qp	= 18,
+		.log_max_qp	= LOG_MAX_SUPPORTED_QPS,
 		.mr_cache[0]	= {
 			.size	= 500,
 			.limit	= 250
@@ -523,7 +525,9 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx)
 		 to_fw_pkey_sz(dev, 128));
 	/* Check log_max_qp from HCA caps to set in current profile */
-	if (MLX5_CAP_GEN_MAX(dev, log_max_qp) < prof->log_max_qp) {
+	if (prof->log_max_qp == LOG_MAX_SUPPORTED_QPS) {
+		prof->log_max_qp = MLX5_CAP_GEN_MAX(dev, log_max_qp);
+	} else if (MLX5_CAP_GEN_MAX(dev, log_max_qp) < prof->log_max_qp) {
 		mlx5_core_warn(dev, "log_max_qp value in current profile is %d, changing it to HCA capability limit (%d)\n",
 			       prof->log_max_qp,
 			       MLX5_CAP_GEN_MAX(dev, log_max_qp));

--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
@@ -22,12 +22,40 @@ int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int devfn,
 			    int msix_vec_count);
 int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs);
+struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev);
+void mlx5_ctrl_irq_release(struct mlx5_irq *ctrl_irq);
 struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
 				  struct cpumask *affinity);
-void mlx5_irq_release(struct mlx5_irq *irq);
+int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs,
+			      struct mlx5_irq **irqs);
+void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs);
 int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb);
 int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb);
 struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq);
 int mlx5_irq_get_index(struct mlx5_irq *irq);
+struct mlx5_irq_pool;
+#ifdef CONFIG_MLX5_SF
+int mlx5_irq_affinity_irqs_request_auto(struct mlx5_core_dev *dev, int nirqs,
+					struct mlx5_irq **irqs);
+struct mlx5_irq *mlx5_irq_affinity_request(struct mlx5_irq_pool *pool,
+					   const struct cpumask *req_mask);
+void mlx5_irq_affinity_irqs_release(struct mlx5_core_dev *dev, struct mlx5_irq **irqs,
+				    int num_irqs);
+#else
+static inline int mlx5_irq_affinity_irqs_request_auto(struct mlx5_core_dev *dev, int nirqs,
+						      struct mlx5_irq **irqs)
+{
+	return -EOPNOTSUPP;
+}
+static inline struct mlx5_irq *
+mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, const struct cpumask *req_mask)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+static inline void mlx5_irq_affinity_irqs_release(struct mlx5_core_dev *dev,
+						  struct mlx5_irq **irqs, int num_irqs) {}
+#endif
 #endif /* __MLX5_IRQ_H__ */
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+#ifndef __PCI_IRQ_H__
+#define __PCI_IRQ_H__
+#include <linux/mlx5/driver.h>
+#define MLX5_MAX_IRQ_NAME (32)
+/* max irq_index is 2047, so four chars */
+#define MLX5_MAX_IRQ_IDX_CHARS (4)
+#define MLX5_EQ_REFS_PER_IRQ (2)
+struct mlx5_irq;
+struct mlx5_irq_pool {
+	char name[MLX5_MAX_IRQ_NAME - MLX5_MAX_IRQ_IDX_CHARS];
+	struct xa_limit xa_num_irqs;
+	struct mutex lock; /* sync IRQs creations */
+	struct xarray irqs;
+	u32 max_threshold;
+	u32 min_threshold;
+	u16 *irqs_per_cpu;
+	struct mlx5_core_dev *dev;
+};
+struct mlx5_irq_pool *mlx5_irq_pool_get(struct mlx5_core_dev *dev);
+static inline bool mlx5_irq_pool_is_sf_pool(struct mlx5_irq_pool *pool)
+{
+	return !strncmp("mlx5_sf", pool->name, strlen("mlx5_sf"));
+}
+struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
+				const struct cpumask *affinity);
+int mlx5_irq_get_locked(struct mlx5_irq *irq);
+int mlx5_irq_read_locked(struct mlx5_irq *irq);
+int mlx5_irq_put(struct mlx5_irq *irq);
+#endif /* __PCI_IRQ_H__ */
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -9,13 +9,13 @@
 #define MLX5_NUM_SPARE_EQE (0x80)
 struct mlx5_eq;
+struct mlx5_irq;
 struct mlx5_core_dev;
 struct mlx5_eq_param {
-	u8             irq_index;
 	int            nent;
 	u64            mask[4];
-	cpumask_var_t  affinity;
+	struct mlx5_irq *irq;
 };
 struct mlx5_eq *