Commit 7da375e2 authored by Jakub Kicinski's avatar Jakub Kicinski

Merge branch 'net-mlx5e-shampo-enable-hw-gro-once-more'

Tariq Toukan says:

====================
net/mlx5e: SHAMPO, Enable HW GRO once more

This series enables hardware GRO for ConnectX-7 and newer NICs.
SHAMPO stands for Split Header And Merge Payload Offload.

The first part of the series contains important fixes and improvements.

The second part reworks the HW GRO counters.

Lastly, HW GRO is perf optimized and enabled.

Here are the bandwidth numbers for a simple iperf3 test over a single rq
where the application and irq are pinned to the same CPU:

+---------+--------+--------+-----------+-------------+
| streams | SW GRO | HW GRO | Unit      | Improvement |
+---------+--------+--------+-----------+-------------+
| 1       | 36     | 57     | Gbits/sec |    1.6 x    |
| 4       | 34     | 50     | Gbits/sec |    1.5 x    |
| 8       | 31     | 43     | Gbits/sec |    1.4 x    |
+---------+--------+--------+-----------+-------------+

Benchmark details:
VM based setup
CPU: Intel(R) Xeon(R) Platinum 8380 CPU, 24 cores
NIC: ConnectX-7 100GbE
iperf3 and irq running on same CPU over a single receive queue
====================

Link: https://lore.kernel.org/r/20240603212219.1037656-1-tariqt@nvidia.comSigned-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents ed20142e 14ae2fd1
......@@ -189,22 +189,19 @@ the software port.
* - `rx[i]_gro_packets`
- Number of received packets processed using hardware-accelerated GRO. The
number of hardware GRO offloaded packets received on ring i.
number of hardware GRO offloaded packets received on ring i. Only true GRO
packets are counted: only packets that are in an SKB with a GRO count > 1.
- Acceleration
* - `rx[i]_gro_bytes`
- Number of received bytes processed using hardware-accelerated GRO. The
number of hardware GRO offloaded bytes received on ring i.
number of hardware GRO offloaded bytes received on ring i. Only true GRO
packets are counted: only packets that are in an SKB with a GRO count > 1.
- Acceleration
* - `rx[i]_gro_skbs`
- The number of receive SKBs constructed while performing
hardware-accelerated GRO.
- Informative
* - `rx[i]_gro_match_packets`
- Number of received packets processed using hardware-accelerated GRO that
met the flow table match criteria.
- The number of GRO SKBs constructed from hardware-accelerated GRO. Only SKBs
with a GRO count > 1 are counted.
- Informative
* - `rx[i]_gro_large_hds`
......@@ -212,6 +209,15 @@ the software port.
headers that require additional memory to be allocated.
- Informative
* - `rx[i]_hds_nodata_packets`
- Number of header only packets in header/data split mode [#accel]_.
- Informative
* - `rx[i]_hds_nodata_bytes`
- Number of bytes for header only packets in header/data split mode
[#accel]_.
- Informative
* - `rx[i]_lro_packets`
- The number of LRO packets received on ring i [#accel]_.
- Acceleration
......
......@@ -80,6 +80,7 @@ struct page_pool;
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
#define MLX5E_RX_MAX_HEAD (256)
#define MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE (8)
#define MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE (9)
#define MLX5E_SHAMPO_WQ_HEADER_PER_PAGE (PAGE_SIZE >> MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE)
#define MLX5E_SHAMPO_WQ_BASE_HEAD_ENTRY_SIZE (64)
......@@ -146,25 +147,6 @@ struct page_pool;
#define MLX5E_TX_XSK_POLL_BUDGET 64
#define MLX5E_SQ_RECOVER_MIN_INTERVAL 500 /* msecs */
#define MLX5E_KLM_UMR_WQE_SZ(sgl_len)\
(sizeof(struct mlx5e_umr_wqe) +\
(sizeof(struct mlx5_klm) * (sgl_len)))
#define MLX5E_KLM_UMR_WQEBBS(klm_entries) \
(DIV_ROUND_UP(MLX5E_KLM_UMR_WQE_SZ(klm_entries), MLX5_SEND_WQE_BB))
#define MLX5E_KLM_UMR_DS_CNT(klm_entries)\
(DIV_ROUND_UP(MLX5E_KLM_UMR_WQE_SZ(klm_entries), MLX5_SEND_WQE_DS))
#define MLX5E_KLM_MAX_ENTRIES_PER_WQE(wqe_size)\
(((wqe_size) - sizeof(struct mlx5e_umr_wqe)) / sizeof(struct mlx5_klm))
#define MLX5E_KLM_ENTRIES_PER_WQE(wqe_size)\
ALIGN_DOWN(MLX5E_KLM_MAX_ENTRIES_PER_WQE(wqe_size), MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT)
#define MLX5E_MAX_KLM_PER_WQE(mdev) \
MLX5E_KLM_ENTRIES_PER_WQE(MLX5_SEND_WQE_BB * mlx5e_get_max_sq_aligned_wqebbs(mdev))
#define mlx5e_state_dereference(priv, p) \
rcu_dereference_protected((p), lockdep_is_held(&(priv)->state_lock))
......@@ -1014,7 +996,7 @@ void mlx5e_build_ptys2ethtool_map(void);
bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev, u8 page_shift,
enum mlx5e_mpwrq_umr_mode umr_mode);
void mlx5e_shampo_dealloc_hd(struct mlx5e_rq *rq, u16 len, u16 start, bool close);
void mlx5e_shampo_dealloc_hd(struct mlx5e_rq *rq);
void mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats);
void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s);
......
......@@ -1071,18 +1071,18 @@ static u32 mlx5e_shampo_icosq_sz(struct mlx5_core_dev *mdev,
struct mlx5e_params *params,
struct mlx5e_rq_param *rq_param)
{
int max_num_of_umr_per_wqe, max_hd_per_wqe, max_klm_per_umr, rest;
int max_num_of_umr_per_wqe, max_hd_per_wqe, max_ksm_per_umr, rest;
void *wqc = MLX5_ADDR_OF(rqc, rq_param->rqc, wq);
int wq_size = BIT(MLX5_GET(wq, wqc, log_wq_sz));
u32 wqebbs;
max_klm_per_umr = MLX5E_MAX_KLM_PER_WQE(mdev);
max_ksm_per_umr = MLX5E_MAX_KSM_PER_WQE(mdev);
max_hd_per_wqe = mlx5e_shampo_hd_per_wqe(mdev, params, rq_param);
max_num_of_umr_per_wqe = max_hd_per_wqe / max_klm_per_umr;
rest = max_hd_per_wqe % max_klm_per_umr;
wqebbs = MLX5E_KLM_UMR_WQEBBS(max_klm_per_umr) * max_num_of_umr_per_wqe;
max_num_of_umr_per_wqe = max_hd_per_wqe / max_ksm_per_umr;
rest = max_hd_per_wqe % max_ksm_per_umr;
wqebbs = MLX5E_KSM_UMR_WQEBBS(max_ksm_per_umr) * max_num_of_umr_per_wqe;
if (rest)
wqebbs += MLX5E_KLM_UMR_WQEBBS(rest);
wqebbs += MLX5E_KSM_UMR_WQEBBS(rest);
wqebbs *= wq_size;
return wqebbs;
}
......
......@@ -34,6 +34,25 @@
#define MLX5E_RX_ERR_CQE(cqe) (get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)
#define MLX5E_KSM_UMR_WQE_SZ(sgl_len)\
(sizeof(struct mlx5e_umr_wqe) +\
(sizeof(struct mlx5_ksm) * (sgl_len)))
#define MLX5E_KSM_UMR_WQEBBS(ksm_entries) \
(DIV_ROUND_UP(MLX5E_KSM_UMR_WQE_SZ(ksm_entries), MLX5_SEND_WQE_BB))
#define MLX5E_KSM_UMR_DS_CNT(ksm_entries)\
(DIV_ROUND_UP(MLX5E_KSM_UMR_WQE_SZ(ksm_entries), MLX5_SEND_WQE_DS))
#define MLX5E_KSM_MAX_ENTRIES_PER_WQE(wqe_size)\
(((wqe_size) - sizeof(struct mlx5e_umr_wqe)) / sizeof(struct mlx5_ksm))
#define MLX5E_KSM_ENTRIES_PER_WQE(wqe_size)\
ALIGN_DOWN(MLX5E_KSM_MAX_ENTRIES_PER_WQE(wqe_size), MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT)
#define MLX5E_MAX_KSM_PER_WQE(mdev) \
MLX5E_KSM_ENTRIES_PER_WQE(MLX5_SEND_WQE_BB * mlx5e_get_max_sq_aligned_wqebbs(mdev))
static inline
ktime_t mlx5e_cqe_ts_to_ns(cqe_ts_to_ns func, struct mlx5_clock *clock, u64 cqe_ts)
{
......
......@@ -74,6 +74,27 @@
#include "lib/devcom.h"
#include "lib/sd.h"
static bool mlx5e_hw_gro_supported(struct mlx5_core_dev *mdev)
{
if (!MLX5_CAP_GEN(mdev, shampo))
return false;
/* Our HW-GRO implementation relies on "KSM Mkey" for
* SHAMPO headers buffer mapping
*/
if (!MLX5_CAP_GEN(mdev, fixed_buffer_size))
return false;
if (!MLX5_CAP_GEN_2(mdev, min_mkey_log_entity_size_fixed_buffer_valid))
return false;
if (MLX5_CAP_GEN_2(mdev, min_mkey_log_entity_size_fixed_buffer) >
MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE)
return false;
return true;
}
bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev, u8 page_shift,
enum mlx5e_mpwrq_umr_mode umr_mode)
{
......@@ -504,8 +525,8 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
return err;
}
static int mlx5e_create_umr_klm_mkey(struct mlx5_core_dev *mdev,
u64 nentries,
static int mlx5e_create_umr_ksm_mkey(struct mlx5_core_dev *mdev,
u64 nentries, u8 log_entry_size,
u32 *umr_mkey)
{
int inlen;
......@@ -525,12 +546,13 @@ static int mlx5e_create_umr_klm_mkey(struct mlx5_core_dev *mdev,
MLX5_SET(mkc, mkc, umr_en, 1);
MLX5_SET(mkc, mkc, lw, 1);
MLX5_SET(mkc, mkc, lr, 1);
MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KSM);
mlx5e_mkey_set_relaxed_ordering(mdev, mkc);
MLX5_SET(mkc, mkc, qpn, 0xffffff);
MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
MLX5_SET(mkc, mkc, translations_octword_size, nentries);
MLX5_SET(mkc, mkc, length64, 1);
MLX5_SET(mkc, mkc, log_page_size, log_entry_size);
MLX5_SET64(mkc, mkc, len, nentries << log_entry_size);
err = mlx5_core_create_mkey(mdev, umr_mkey, in, inlen);
kvfree(in);
......@@ -565,14 +587,16 @@ static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq
static int mlx5e_create_rq_hd_umr_mkey(struct mlx5_core_dev *mdev,
struct mlx5e_rq *rq)
{
u32 max_klm_size = BIT(MLX5_CAP_GEN(mdev, log_max_klm_list_size));
u32 max_ksm_size = BIT(MLX5_CAP_GEN(mdev, log_max_klm_list_size));
if (max_klm_size < rq->mpwqe.shampo->hd_per_wq) {
mlx5_core_err(mdev, "max klm list size 0x%x is smaller than shampo header buffer list size 0x%x\n",
max_klm_size, rq->mpwqe.shampo->hd_per_wq);
if (max_ksm_size < rq->mpwqe.shampo->hd_per_wq) {
mlx5_core_err(mdev, "max ksm list size 0x%x is smaller than shampo header buffer list size 0x%x\n",
max_ksm_size, rq->mpwqe.shampo->hd_per_wq);
return -EINVAL;
}
return mlx5e_create_umr_klm_mkey(mdev, rq->mpwqe.shampo->hd_per_wq,
return mlx5e_create_umr_ksm_mkey(mdev, rq->mpwqe.shampo->hd_per_wq,
MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE,
&rq->mpwqe.shampo->mkey);
}
......@@ -1208,15 +1232,6 @@ void mlx5e_free_rx_missing_descs(struct mlx5e_rq *rq)
head = mlx5_wq_ll_get_wqe_next_ix(wq, head);
}
if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) {
u16 len;
len = (rq->mpwqe.shampo->pi - rq->mpwqe.shampo->ci) &
(rq->mpwqe.shampo->hd_per_wq - 1);
mlx5e_shampo_dealloc_hd(rq, len, rq->mpwqe.shampo->ci, false);
rq->mpwqe.shampo->pi = rq->mpwqe.shampo->ci;
}
rq->mpwqe.actual_wq_head = wq->head;
rq->mpwqe.umr_in_progress = 0;
rq->mpwqe.umr_completed = 0;
......@@ -1244,8 +1259,7 @@ void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
}
if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state))
mlx5e_shampo_dealloc_hd(rq, rq->mpwqe.shampo->hd_per_wq,
0, true);
mlx5e_shampo_dealloc_hd(rq);
} else {
struct mlx5_wq_cyc *wq = &rq->wqe.wq;
u16 missing = mlx5_wq_cyc_missing(wq);
......@@ -4259,13 +4273,19 @@ int mlx5e_set_features(struct net_device *netdev, netdev_features_t features)
#define MLX5E_HANDLE_FEATURE(feature, handler) \
mlx5e_handle_feature(netdev, &oper_features, feature, handler)
if (features & (NETIF_F_GRO_HW | NETIF_F_LRO)) {
err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXFCS, set_feature_rx_fcs);
err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro);
err |= MLX5E_HANDLE_FEATURE(NETIF_F_GRO_HW, set_feature_hw_gro);
} else {
err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro);
err |= MLX5E_HANDLE_FEATURE(NETIF_F_GRO_HW, set_feature_hw_gro);
err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXFCS, set_feature_rx_fcs);
}
err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_FILTER,
set_feature_cvlan_filter);
err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_hw_tc);
err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXALL, set_feature_rx_all);
err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXFCS, set_feature_rx_fcs);
err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_RX, set_feature_rx_vlan);
#ifdef CONFIG_MLX5_EN_ARFS
err |= MLX5E_HANDLE_FEATURE(NETIF_F_NTUPLE, set_feature_arfs);
......@@ -5332,6 +5352,11 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER;
netdev->hw_features |= NETIF_F_HW_VLAN_STAG_TX;
if (mlx5e_hw_gro_supported(mdev) &&
mlx5e_check_fragmented_striding_rq_cap(mdev, PAGE_SHIFT,
MLX5E_MPWRQ_UMR_MODE_ALIGNED))
netdev->hw_features |= NETIF_F_GRO_HW;
if (mlx5e_tunnel_any_tx_proto_supported(mdev)) {
netdev->hw_enc_features |= NETIF_F_HW_CSUM;
netdev->hw_enc_features |= NETIF_F_TSO;
......
......@@ -141,7 +141,6 @@ static const struct counter_desc sw_stats_desc[] = {
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_gro_packets) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_gro_bytes) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_gro_skbs) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_gro_match_packets) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_gro_large_hds) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_ecn_mark) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_removed_vlan_packets) },
......@@ -343,8 +342,9 @@ static void mlx5e_stats_grp_sw_update_stats_rq_stats(struct mlx5e_sw_stats *s,
s->rx_gro_packets += rq_stats->gro_packets;
s->rx_gro_bytes += rq_stats->gro_bytes;
s->rx_gro_skbs += rq_stats->gro_skbs;
s->rx_gro_match_packets += rq_stats->gro_match_packets;
s->rx_gro_large_hds += rq_stats->gro_large_hds;
s->rx_hds_nodata_packets += rq_stats->hds_nodata_packets;
s->rx_hds_nodata_bytes += rq_stats->hds_nodata_bytes;
s->rx_ecn_mark += rq_stats->ecn_mark;
s->rx_removed_vlan_packets += rq_stats->removed_vlan_packets;
s->rx_csum_none += rq_stats->csum_none;
......@@ -2057,8 +2057,9 @@ static const struct counter_desc rq_stats_desc[] = {
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, gro_packets) },
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, gro_bytes) },
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, gro_skbs) },
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, gro_match_packets) },
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, gro_large_hds) },
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, hds_nodata_packets) },
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, hds_nodata_bytes) },
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, ecn_mark) },
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, removed_vlan_packets) },
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, wqe_err) },
......
......@@ -153,8 +153,9 @@ struct mlx5e_sw_stats {
u64 rx_gro_packets;
u64 rx_gro_bytes;
u64 rx_gro_skbs;
u64 rx_gro_match_packets;
u64 rx_gro_large_hds;
u64 rx_hds_nodata_packets;
u64 rx_hds_nodata_bytes;
u64 rx_mcast_packets;
u64 rx_ecn_mark;
u64 rx_removed_vlan_packets;
......@@ -352,8 +353,9 @@ struct mlx5e_rq_stats {
u64 gro_packets;
u64 gro_bytes;
u64 gro_skbs;
u64 gro_match_packets;
u64 gro_large_hds;
u64 hds_nodata_packets;
u64 hds_nodata_bytes;
u64 mcast_packets;
u64 ecn_mark;
u64 removed_vlan_packets;
......
......@@ -294,6 +294,7 @@ enum {
#define MLX5_UMR_FLEX_ALIGNMENT 0x40
#define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_mtt))
#define MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_klm))
#define MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_ksm))
#define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8)
......
......@@ -1526,8 +1526,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 ts_cqe_to_dest_cqn[0x1];
u8 reserved_at_b3[0x6];
u8 go_back_n[0x1];
u8 shampo[0x1];
u8 reserved_at_bb[0x5];
u8 reserved_at_ba[0x6];
u8 max_sgl_for_optimized_performance[0x8];
u8 log_max_cq_sz[0x8];
......@@ -1744,7 +1743,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 reserved_at_280[0x10];
u8 max_wqe_sz_sq[0x10];
u8 reserved_at_2a0[0x10];
u8 reserved_at_2a0[0xb];
u8 shampo[0x1];
u8 reserved_at_2ac[0x4];
u8 max_wqe_sz_rq[0x10];
u8 max_flow_counter_31_16[0x10];
......@@ -2017,7 +2018,8 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
u8 reserved_at_250[0x10];
u8 reserved_at_260[0x120];
u8 reserved_at_380[0x10];
u8 reserved_at_380[0xb];
u8 min_mkey_log_entity_size_fixed_buffer[0x5];
u8 ec_vf_vport_base[0x10];
u8 reserved_at_3a0[0x10];
......@@ -2029,7 +2031,11 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
u8 pcc_ifa2[0x1];
u8 reserved_at_3f1[0xf];
u8 reserved_at_400[0x400];
u8 reserved_at_400[0x1];
u8 min_mkey_log_entity_size_fixed_buffer_valid[0x1];
u8 reserved_at_402[0x1e];
u8 reserved_at_420[0x3e0];
};
enum mlx5_ifc_flow_destination_type {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment