Commit 88a85f99 authored by Achiad Shochat, committed by David S. Miller

net/mlx5e: TX latency optimization to save DMA reads

A regular TX WQE execution involves two or more DMA reads -
one to fetch the WQE, and another one per WQE gather entry.

These DMA reads obviously increase the TX latency.
There are two mlx5 mechanisms to bypass these DMA reads:
1) Inline WQE
2) Blue Flame (BF)

An inline WQE contains the whole packet, and thus saves the DMA read(s)
for the gather entry(ies) of a regular WQE. Inline WQE support was added
in the previous commit.

A BF WQE is written directly to the device's I/O-mapped memory, which
saves the DMA read that fetches the WQE.

The BF WQE I/O write must be done at cache-line granularity, so it uses
the CPU write-combining mechanism.
A BF WQE I/O write also acts as a TX doorbell, notifying the device of
new TX WQEs.
A BF WQE is written to the same I/O-mapped address as the regular TX
doorbell, so this address is mapped twice - once by ioremap() and once
by io_mapping_map_wc().
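
As a rough sketch of how the two mappings fit together (condensed from
the map_bf_area() and mlx5_alloc_map_uar() hunks below; mlx5_setup_bf_wc()
is a made-up name used only for illustration):

  /* Condensed sketch: mlx5_setup_bf_wc() is a made-up helper combining
   * map_bf_area() and the new bf_map setup in mlx5_alloc_map_uar().
   */
  static int mlx5_setup_bf_wc(struct mlx5_core_dev *dev, struct mlx5_uar *uar)
  {
      /* one write-combining io_mapping covering the doorbell/BF BAR */
      dev->priv.bf_mapping = io_mapping_create_wc(pci_resource_start(dev->pdev, 0),
                                                  pci_resource_len(dev->pdev, 0));
      if (!dev->priv.bf_mapping)
          return -ENOMEM; /* BF is optional; the caller only warns */

      /* a per-UAR write-combining view, alongside the ioremap()ed
       * uar->map that keeps serving the regular doorbell
       */
      uar->bf_map = io_mapping_map_wc(dev->priv.bf_mapping,
                                      uar->index << PAGE_SHIFT);
      return 0;
  }

The TX path then posts a BF WQE with __iowrite64_copy() through
uar_bf_map, followed by a wmb() that flushes the write-combining buffer
(see mlx5e_tx_notify_hw() below); when bf_map is NULL the regular
doorbell write is used instead.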

While both mechanisms reduce the TX latency, they both consume more CPU
cycles than a regular WQE:
- A BF WQE must still be written to host memory, in addition to being
  written directly to the device I/O mapped memory.
- An inline WQE involves copying the SKB data into it.

To handle this tradeoff, we introduce a heuristic that avoids both
mechanisms while the TX queue is back-pressured by the device, and
limits their usage rate otherwise.
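
Roughly, the heuristic amounts to the following per-packet check (a
condensed sketch of the mlx5e_sq_xmit() hunks below; mlx5e_use_bf() is a
made-up helper name):

  /* Condensed sketch: mlx5e_use_bf() is a made-up helper; in the patch
   * the same logic is open-coded inside mlx5e_sq_xmit().
   */
  static bool mlx5e_use_bf(struct mlx5e_sq *sq, struct sk_buff *skb)
  {
      /* Completions arrived since the last xmit: refill the budget only
       * if the SQ has fully drained (cc == pc, i.e. no back-pressure),
       * otherwise shut BF/inline off until it does.
       */
      if (sq->cc != sq->prev_cc) {
          sq->prev_cc = sq->cc;
          sq->bf_budget = (sq->cc == sq->pc) ? MLX5E_SQ_BF_BUDGET : 0;
      }

      /* spend the budget only on linear skbs that end a xmit_more burst */
      return sq->bf_budget &&
             !skb->xmit_more &&
             !skb_shinfo(skb)->nr_frags;
  }

After each send, sq->bf_budget is set to bf ? sq->bf_budget - 1 : 0, so
at most MLX5E_SQ_BF_BUDGET (16) consecutive WQEs use BF/inline before
the SQ has to drain again.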

An inline WQE will always be "Blue Flamed" (written directly to the
device's I/O-mapped memory), while a BF WQE may not be inlined (it may
contain gather entries).
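
This shows up in mlx5e_get_inline_hdr_size() (reproduced from the hunk
below): the full linear data is inlined only when the WQE was already
selected for BF, while a BF WQE whose linear data exceeds max_inline
inlines just the minimal headers and keeps its gather entries:

  #define MLX5E_MIN_INLINE (ETH_HLEN + 2 /* vlan tag */)

  static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq,
                                              struct sk_buff *skb, bool bf)
  {
      if (bf && (skb_headlen(skb) <= sq->max_inline))
          return skb_headlen(skb); /* inline the whole (linear) packet */

      return MLX5E_MIN_INLINE; /* inline just the headers */
  }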

Preliminary testing using netperf UDP_RR shows that the latency goes down
from 17.5us to 16.9us, while the message rate (tested with pktgen) stays
the same.
Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 58d52291
@@ -60,6 +60,7 @@
 #define MLX5E_TX_CQ_POLL_BUDGET 128
 #define MLX5E_UPDATE_STATS_INTERVAL 200 /* msecs */
+#define MLX5E_SQ_BF_BUDGET 16
 
 static const char vport_strings[][ETH_GSTRING_LEN] = {
     /* vport statistics */
@@ -268,7 +269,9 @@ struct mlx5e_sq {
     /* dirtied @xmit */
     u16 pc ____cacheline_aligned_in_smp;
     u32 dma_fifo_pc;
-    u32 bf_offset;
+    u16 bf_offset;
+    u16 prev_cc;
+    u8 bf_budget;
     struct mlx5e_sq_stats stats;
 
     struct mlx5e_cq cq;
@@ -281,9 +284,10 @@ struct mlx5e_sq {
     struct mlx5_wq_cyc wq;
     u32 dma_fifo_mask;
     void __iomem *uar_map;
+    void __iomem *uar_bf_map;
     struct netdev_queue *txq;
     u32 sqn;
-    u32 bf_buf_size;
+    u16 bf_buf_size;
     u16 max_inline;
     u16 edge;
     struct device *pdev;
@@ -493,8 +497,10 @@ int mlx5e_update_priv_params(struct mlx5e_priv *priv,
                              struct mlx5e_params *new_params);
 
 static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
-                                      struct mlx5e_tx_wqe *wqe)
+                                      struct mlx5e_tx_wqe *wqe, int bf_sz)
 {
+    u16 ofst = MLX5_BF_OFFSET + sq->bf_offset;
+
     /* ensure wqe is visible to device before updating doorbell record */
     dma_wmb();
@@ -505,9 +511,15 @@ static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
      */
     wmb();
 
-    mlx5_write64((__be32 *)&wqe->ctrl,
-                 sq->uar_map + MLX5_BF_OFFSET + sq->bf_offset,
-                 NULL);
+    if (bf_sz) {
+        __iowrite64_copy(sq->uar_bf_map + ofst, &wqe->ctrl, bf_sz);
+
+        /* flush the write-combining mapped buffer */
+        wmb();
+
+    } else {
+        mlx5_write64((__be32 *)&wqe->ctrl, sq->uar_map + ofst, NULL);
+    }
 
     sq->bf_offset ^= sq->bf_buf_size;
 }
...
@@ -514,6 +514,7 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
     sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
     sq->uar_map = sq->uar.map;
+    sq->uar_bf_map = sq->uar.bf_map;
     sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2;
     sq->max_inline = param->max_inline;
@@ -529,6 +530,7 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
     sq->channel = c;
     sq->tc = tc;
     sq->edge = (sq->wq.sz_m1 + 1) - MLX5_SEND_WQE_MAX_WQEBBS;
+    sq->bf_budget = MLX5E_SQ_BF_BUDGET;
     priv->txq_to_sq_map[txq_ix] = sq;
 
     return 0;
...
@@ -57,7 +57,7 @@ void mlx5e_send_nop(struct mlx5e_sq *sq, bool notify_hw)
     if (notify_hw) {
         cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
-        mlx5e_tx_notify_hw(sq, wqe);
+        mlx5e_tx_notify_hw(sq, wqe, 0);
     }
 }
@@ -110,7 +110,7 @@ u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
 }
 
 static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq,
-                                            struct sk_buff *skb)
+                                            struct sk_buff *skb, bool bf)
 {
     /* Some NIC TX decisions, e.g loopback, are based on the packet
      * headers and occur before the data gather.
@@ -118,7 +118,7 @@ static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq,
      */
 #define MLX5E_MIN_INLINE (ETH_HLEN + 2/*vlan tag*/)
 
-    if (skb_headlen(skb) <= sq->max_inline)
+    if (bf && (skb_headlen(skb) <= sq->max_inline))
         return skb_headlen(skb);
 
     return MLX5E_MIN_INLINE;
@@ -137,6 +137,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
     u8 opcode = MLX5_OPCODE_SEND;
     dma_addr_t dma_addr = 0;
+    bool bf = false;
     u16 headlen;
     u16 ds_cnt;
     u16 ihs;
@@ -149,6 +150,11 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
     else
         sq->stats.csum_offload_none++;
 
+    if (sq->cc != sq->prev_cc) {
+        sq->prev_cc = sq->cc;
+        sq->bf_budget = (sq->cc == sq->pc) ? MLX5E_SQ_BF_BUDGET : 0;
+    }
+
     if (skb_is_gso(skb)) {
         u32 payload_len;
@@ -161,7 +167,10 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
         sq->stats.tso_packets++;
         sq->stats.tso_bytes += payload_len;
     } else {
-        ihs = mlx5e_get_inline_hdr_size(sq, skb);
+        bf = sq->bf_budget &&
+             !skb->xmit_more &&
+             !skb_shinfo(skb)->nr_frags;
+        ihs = mlx5e_get_inline_hdr_size(sq, skb, bf);
         MLX5E_TX_SKB_CB(skb)->num_bytes = max_t(unsigned int, skb->len,
                                                 ETH_ZLEN);
     }
@@ -233,14 +242,21 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
     }
 
     if (!skb->xmit_more || netif_xmit_stopped(sq->txq)) {
+        int bf_sz = 0;
+
+        if (bf && sq->uar_bf_map)
+            bf_sz = MLX5E_TX_SKB_CB(skb)->num_wqebbs << 3;
+
         cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
-        mlx5e_tx_notify_hw(sq, wqe);
+        mlx5e_tx_notify_hw(sq, wqe, bf_sz);
     }
 
     /* fill sq edge with nops to avoid wqe wrap around */
     while ((sq->pc & wq->sz_m1) > sq->edge)
         mlx5e_send_nop(sq, false);
 
+    sq->bf_budget = bf ? sq->bf_budget - 1 : 0;
+
     sq->stats.packets++;
     return NETDEV_TX_OK;
...
@@ -654,6 +654,22 @@ static int mlx5_core_set_issi(struct mlx5_core_dev *dev)
 }
 #endif
 
+static int map_bf_area(struct mlx5_core_dev *dev)
+{
+    resource_size_t bf_start = pci_resource_start(dev->pdev, 0);
+    resource_size_t bf_len = pci_resource_len(dev->pdev, 0);
+
+    dev->priv.bf_mapping = io_mapping_create_wc(bf_start, bf_len);
+
+    return dev->priv.bf_mapping ? 0 : -ENOMEM;
+}
+
+static void unmap_bf_area(struct mlx5_core_dev *dev)
+{
+    if (dev->priv.bf_mapping)
+        io_mapping_free(dev->priv.bf_mapping);
+}
+
 static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 {
     struct mlx5_priv *priv = &dev->priv;
@@ -808,10 +824,13 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
         goto err_stop_eqs;
     }
 
+    if (map_bf_area(dev))
+        dev_err(&pdev->dev, "Failed to map blue flame area\n");
+
     err = mlx5_irq_set_affinity_hints(dev);
     if (err) {
         dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
-        goto err_free_comp_eqs;
+        goto err_unmap_bf_area;
     }
 
     MLX5_INIT_DOORBELL_LOCK(&priv->cq_uar_lock);
@@ -823,7 +842,9 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 
     return 0;
 
-err_free_comp_eqs:
+err_unmap_bf_area:
+    unmap_bf_area(dev);
+
     free_comp_eqs(dev);
 
 err_stop_eqs:
@@ -881,6 +902,7 @@ static void mlx5_dev_cleanup(struct mlx5_core_dev *dev)
     mlx5_cleanup_qp_table(dev);
     mlx5_cleanup_cq_table(dev);
     mlx5_irq_clear_affinity_hints(dev);
+    unmap_bf_area(dev);
     free_comp_eqs(dev);
     mlx5_stop_eqs(dev);
     mlx5_free_uuars(dev, &priv->uuari);
...
@@ -32,6 +32,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/io-mapping.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/cmd.h>
 #include "mlx5_core.h"
@@ -246,6 +247,10 @@ int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
         goto err_free_uar;
     }
 
+    if (mdev->priv.bf_mapping)
+        uar->bf_map = io_mapping_map_wc(mdev->priv.bf_mapping,
+                                        uar->index << PAGE_SHIFT);
+
     return 0;
 
 err_free_uar:
@@ -257,6 +262,7 @@ EXPORT_SYMBOL(mlx5_alloc_map_uar);
 void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
 {
+    io_mapping_unmap(uar->bf_map);
     iounmap(uar->map);
     mlx5_cmd_free_uar(mdev, uar->index);
 }
...
@@ -380,7 +380,7 @@ struct mlx5_uar {
     u32 index;
     struct list_head bf_list;
     unsigned free_bf_bmap;
-    void __iomem *wc_map;
+    void __iomem *bf_map;
     void __iomem *map;
 };
@@ -435,6 +435,8 @@ struct mlx5_priv {
     struct mlx5_uuar_info uuari;
     MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock);
 
+    struct io_mapping *bf_mapping;
+
     /* pages stuff */
     struct workqueue_struct *pg_wq;
     struct rb_root page_root;