Commit f28186d6 authored by Tariq Toukan, committed by David S. Miller

net/mlx4_en: Improve stack xmit function

Several small code and performance improvements in stack TX datapath,
including:
- Compiler branch predictor hints.
- Minimize variables scope.
- Move tx_info non-inline flow handling to a separate function.
- Calculate data_offset at compile time rather than at runtime
  (for the !lso_header_size branch; see the sketch after this list).
- Avoid the ternary operator ("?:") when the value can be preset in a
  matching branch.
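
A minimal userspace sketch of two of the points above (the branch-predictor
hint and the compile-time data_offset), under simplified, assumed types:
struct toy_tx_desc and toy_xmit() are hypothetical stand-ins for the real
mlx4 structures, and unlikely() is defined here on top of __builtin_expect()
the way the kernel does it.

/* Userspace sketch, not driver code: branch hint + offsetof()-based offset. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define unlikely(x) __builtin_expect(!!(x), 0)

struct toy_tx_desc {
	uint32_t ctrl[4];	/* control segment (placeholder) */
	uint32_t lso[4];	/* LSO segment (placeholder) */
	uint32_t data[4];	/* first data segment (placeholder) */
};

static int toy_xmit(int port_up, int lso_header_size)
{
	uint8_t data_offset;

	/* Hint the rare error path so the compiler keeps the hot path on
	 * the fall-through, mirroring if (unlikely(!priv->port_up)). */
	if (unlikely(!port_up))
		return -1;

	if (!lso_header_size) {
		/* offsetof() is an integer constant expression, folded at
		 * compile time; no pointer arithmetic at runtime. */
		data_offset = offsetof(struct toy_tx_desc, data);
	} else {
		/* Only the LSO branch still needs runtime arithmetic
		 * (simplified here; the driver aligns the header size). */
		data_offset = offsetof(struct toy_tx_desc, lso) + lso_header_size;
	}

	printf("data segment starts at byte %u\n", (unsigned)data_offset);
	return 0;
}

int main(void)
{
	toy_xmit(1, 0);		/* non-LSO path: offset known at compile time */
	toy_xmit(1, 16);	/* LSO path */
	return 0;
}

Compile and run with any C compiler, e.g. gcc sketch.c && ./a.out.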

Performance tests:
Tested on ConnectX3Pro, Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz

Gain is too small to be measurable; no degradation was observed.
Results are similar for IPv4 and IPv6.
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
Cc: kernel-team@fb.com
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent cc26a490
@@ -774,37 +774,101 @@ static void mlx4_en_tx_write_desc(struct mlx4_en_tx_ring *ring,
 	}
 }
 
+static bool mlx4_en_build_dma_wqe(struct mlx4_en_priv *priv,
+				  struct skb_shared_info *shinfo,
+				  struct mlx4_wqe_data_seg *data,
+				  struct sk_buff *skb,
+				  int lso_header_size,
+				  __be32 mr_key,
+				  struct mlx4_en_tx_info *tx_info)
+{
+	struct device *ddev = priv->ddev;
+	dma_addr_t dma = 0;
+	u32 byte_count = 0;
+	int i_frag;
+
+	/* Map fragments if any */
+	for (i_frag = shinfo->nr_frags - 1; i_frag >= 0; i_frag--) {
+		const struct skb_frag_struct *frag;
+
+		frag = &shinfo->frags[i_frag];
+		byte_count = skb_frag_size(frag);
+		dma = skb_frag_dma_map(ddev, frag,
+				       0, byte_count,
+				       DMA_TO_DEVICE);
+		if (dma_mapping_error(ddev, dma))
+			goto tx_drop_unmap;
+
+		data->addr = cpu_to_be64(dma);
+		data->lkey = mr_key;
+		dma_wmb();
+		data->byte_count = cpu_to_be32(byte_count);
+		--data;
+	}
+
+	/* Map linear part if needed */
+	if (tx_info->linear) {
+		byte_count = skb_headlen(skb) - lso_header_size;
+
+		dma = dma_map_single(ddev, skb->data +
+				     lso_header_size, byte_count,
+				     PCI_DMA_TODEVICE);
+		if (dma_mapping_error(ddev, dma))
+			goto tx_drop_unmap;
+
+		data->addr = cpu_to_be64(dma);
+		data->lkey = mr_key;
+		dma_wmb();
+		data->byte_count = cpu_to_be32(byte_count);
+	}
+
+	/* tx completion can avoid cache line miss for common cases */
+	tx_info->map0_dma = dma;
+	tx_info->map0_byte_count = byte_count;
+
+	return true;
+
+tx_drop_unmap:
+	en_err(priv, "DMA mapping error\n");
+
+	while (++i_frag < shinfo->nr_frags) {
+		++data;
+		dma_unmap_page(ddev, (dma_addr_t)be64_to_cpu(data->addr),
+			       be32_to_cpu(data->byte_count),
+			       PCI_DMA_TODEVICE);
+	}
+
+	return false;
+}
+
 netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	union mlx4_wqe_qpn_vlan qpn_vlan = {};
-	struct device *ddev = priv->ddev;
 	struct mlx4_en_tx_ring *ring;
 	struct mlx4_en_tx_desc *tx_desc;
 	struct mlx4_wqe_data_seg *data;
 	struct mlx4_en_tx_info *tx_info;
-	int tx_ind = 0;
+	int tx_ind;
 	int nr_txbb;
 	int desc_size;
 	int real_size;
 	u32 index, bf_index;
 	__be32 op_own;
-	u16 vlan_proto = 0;
-	int i_frag;
 	int lso_header_size;
 	void *fragptr = NULL;
 	bool bounce = false;
 	bool send_doorbell;
 	bool stop_queue;
 	bool inline_ok;
+	u8 data_offset;
 	u32 ring_cons;
 	bool bf_ok;
 
 	tx_ind = skb_get_queue_mapping(skb);
 	ring = priv->tx_ring[TX][tx_ind];
 
-	if (!priv->port_up)
+	if (unlikely(!priv->port_up))
 		goto tx_drop;
 
 	/* fetch ring->cons far ahead before needing it to avoid stall */
@@ -826,6 +890,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	bf_ok = ring->bf_enabled;
 
 	if (skb_vlan_tag_present(skb)) {
+		u16 vlan_proto;
+
 		qpn_vlan.vlan_tag = cpu_to_be16(skb_vlan_tag_get(skb));
 		vlan_proto = be16_to_cpu(skb->vlan_proto);
 		if (vlan_proto == ETH_P_8021AD)
@@ -862,64 +928,31 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	tx_info->skb = skb;
 	tx_info->nr_txbb = nr_txbb;
 
-	data = &tx_desc->data;
-	if (lso_header_size)
-		data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4,
-						      DS_SIZE));
+	if (!lso_header_size) {
+		data = &tx_desc->data;
+		data_offset = offsetof(struct mlx4_en_tx_desc, data);
+	} else {
+		int lso_align = ALIGN(lso_header_size + 4, DS_SIZE);
+
+		data = (void *)&tx_desc->lso + lso_align;
+		data_offset = offsetof(struct mlx4_en_tx_desc, lso) + lso_align;
+	}
 
 	/* valid only for none inline segments */
-	tx_info->data_offset = (void *)data - (void *)tx_desc;
+	tx_info->data_offset = data_offset;
 
 	tx_info->inl = inline_ok;
 
-	tx_info->linear = (lso_header_size < skb_headlen(skb) &&
-			   !inline_ok) ? 1 : 0;
+	tx_info->linear = lso_header_size < skb_headlen(skb) && !inline_ok;
 
 	tx_info->nr_maps = shinfo->nr_frags + tx_info->linear;
 	data += tx_info->nr_maps - 1;
 
-	if (!tx_info->inl) {
-		dma_addr_t dma = 0;
-		u32 byte_count = 0;
-
-		/* Map fragments if any */
-		for (i_frag = shinfo->nr_frags - 1; i_frag >= 0; i_frag--) {
-			const struct skb_frag_struct *frag;
-
-			frag = &shinfo->frags[i_frag];
-			byte_count = skb_frag_size(frag);
-			dma = skb_frag_dma_map(ddev, frag,
-					       0, byte_count,
-					       DMA_TO_DEVICE);
-			if (dma_mapping_error(ddev, dma))
-				goto tx_drop_unmap;
-
-			data->addr = cpu_to_be64(dma);
-			data->lkey = ring->mr_key;
-			dma_wmb();
-			data->byte_count = cpu_to_be32(byte_count);
-			--data;
-		}
-
-		/* Map linear part if needed */
-		if (tx_info->linear) {
-			byte_count = skb_headlen(skb) - lso_header_size;
-
-			dma = dma_map_single(ddev, skb->data +
-					     lso_header_size, byte_count,
-					     PCI_DMA_TODEVICE);
-			if (dma_mapping_error(ddev, dma))
-				goto tx_drop_unmap;
-
-			data->addr = cpu_to_be64(dma);
-			data->lkey = ring->mr_key;
-			dma_wmb();
-			data->byte_count = cpu_to_be32(byte_count);
-		}
-
-		/* tx completion can avoid cache line miss for common cases */
-		tx_info->map0_dma = dma;
-		tx_info->map0_byte_count = byte_count;
-	}
+	if (!tx_info->inl)
+		if (!mlx4_en_build_dma_wqe(priv, shinfo, data, skb,
+					   lso_header_size, ring->mr_key,
+					   tx_info))
+			goto tx_drop_count;
 
 	/*
 	 * For timestamping add flag to skb_shinfo and
@@ -1055,16 +1088,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 	return NETDEV_TX_OK;
 
-tx_drop_unmap:
-	en_err(priv, "DMA mapping error\n");
-
-	while (++i_frag < shinfo->nr_frags) {
-		++data;
-		dma_unmap_page(ddev, (dma_addr_t) be64_to_cpu(data->addr),
-			       be32_to_cpu(data->byte_count),
-			       PCI_DMA_TODEVICE);
-	}
-
 tx_drop_count:
 	ring->tx_dropped++;
 tx_drop:
...
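
To make the error-handling pattern in the new helper easier to follow outside
the kernel tree, here is a minimal standalone C sketch of the same idea:
descriptors are filled from the last fragment backwards, and on a mapping
failure the loop index walks forward again to undo only what was already
mapped. toy_map(), toy_unmap(), build_toy_wqe() and NR_FRAGS are hypothetical
stand-ins, not driver or DMA-API names.

/* Standalone sketch of the map-backwards / unwind-forwards error handling
 * used by mlx4_en_build_dma_wqe(). "descs" plays the role of the descriptor
 * entries filled from the end toward the beginning.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_FRAGS 4

static bool toy_map(int frag, long *handle)
{
	if (frag == 1)		/* pretend fragment 1 fails to map */
		return false;
	*handle = 1000 + frag;	/* fake DMA handle */
	return true;
}

static void toy_unmap(long handle)
{
	printf("unmapping handle %ld\n", handle);
}

static bool build_toy_wqe(long *descs)
{
	long handle;
	int i_frag;

	/* Fill descriptors from the last fragment toward the first. */
	for (i_frag = NR_FRAGS - 1; i_frag >= 0; i_frag--) {
		if (!toy_map(i_frag, &handle))
			goto drop_unmap;
		descs[i_frag] = handle;
	}
	return true;

drop_unmap:
	/* Walk forward over the entries already written and undo them. */
	while (++i_frag < NR_FRAGS)
		toy_unmap(descs[i_frag]);
	return false;
}

int main(void)
{
	long descs[NR_FRAGS];

	if (!build_toy_wqe(descs))
		printf("mapping failed, dropped\n");
	return 0;
}

The unwind loop reuses the same index variable, mirroring how the patch's
tx_drop_unmap label reuses i_frag and data.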