Commit 9ecc2d86 authored by Brenden Blanco, committed by David S. Miller

net/mlx4_en: add xdp forwarding and data write support

A user will now be able to loop packets back out of the same port using
a bpf program attached to the xdp hook. Updates to the packet contents
from the bpf program are also supported.

For the packet write feature to work, the rx buffers are now mapped as
bidirectional when the page is allocated. This occurs only when the xdp
hook is active.

When the program returns a TX action, enqueue the packet directly to a
dedicated tx ring, so as to avoid completely any locking. This requires
the tx ring to be allocated 1:1 for each rx ring, as well as the tx
completion running in the same softirq.

Upon tx completion, this dedicated tx ring recycles pages without
unmapping directly back to the original rx ring. In steady state tx/drop
workload, effectively 0 page allocs/frees will occur.

In order to separate out the paths between free and recycle, a
free_tx_desc func pointer is introduced that is optionally updated
whenever recycle_ring is activated. By default the original free
function is always initialized.
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 224e92e0
...@@ -1722,6 +1722,12 @@ static int mlx4_en_set_channels(struct net_device *dev, ...@@ -1722,6 +1722,12 @@ static int mlx4_en_set_channels(struct net_device *dev,
!channel->tx_count || !channel->rx_count) !channel->tx_count || !channel->rx_count)
return -EINVAL; return -EINVAL;
if (channel->tx_count * MLX4_EN_NUM_UP <= priv->xdp_ring_num) {
en_err(priv, "Minimum %d tx channels required with XDP on\n",
priv->xdp_ring_num / MLX4_EN_NUM_UP + 1);
return -EINVAL;
}
mutex_lock(&mdev->state_lock); mutex_lock(&mdev->state_lock);
if (priv->port_up) { if (priv->port_up) {
port_up = 1; port_up = 1;
...@@ -1740,7 +1746,8 @@ static int mlx4_en_set_channels(struct net_device *dev, ...@@ -1740,7 +1746,8 @@ static int mlx4_en_set_channels(struct net_device *dev,
goto out; goto out;
} }
netif_set_real_num_tx_queues(dev, priv->tx_ring_num); netif_set_real_num_tx_queues(dev, priv->tx_ring_num -
priv->xdp_ring_num);
netif_set_real_num_rx_queues(dev, priv->rx_ring_num); netif_set_real_num_rx_queues(dev, priv->rx_ring_num);
if (dev->num_tc) if (dev->num_tc)
......
...@@ -1522,6 +1522,24 @@ static void mlx4_en_free_affinity_hint(struct mlx4_en_priv *priv, int ring_idx) ...@@ -1522,6 +1522,24 @@ static void mlx4_en_free_affinity_hint(struct mlx4_en_priv *priv, int ring_idx)
free_cpumask_var(priv->rx_ring[ring_idx]->affinity_mask); free_cpumask_var(priv->rx_ring[ring_idx]->affinity_mask);
} }
/* Pair a TX ring with the RX ring whose pages it recycles.
 *
 * The last xdp_ring_num TX rings are dedicated 1:1 to RX rings for
 * XDP_TX forwarding; their completed descriptors hand pages straight
 * back to the paired RX ring (mlx4_en_recycle_tx_desc) instead of
 * going through the normal free path.
 */
static void mlx4_en_init_recycle_ring(struct mlx4_en_priv *priv,
				      int tx_ring_idx)
{
	struct mlx4_en_tx_ring *ring = priv->tx_ring[tx_ring_idx];
	int rx_idx = tx_ring_idx - (priv->tx_ring_num - priv->xdp_ring_num);

	if (rx_idx < 0) {
		/* Regular (non-XDP) TX ring: no page recycling. */
		ring->recycle_ring = NULL;
		return;
	}

	ring->recycle_ring = priv->rx_ring[rx_idx];
	ring->free_tx_desc = mlx4_en_recycle_tx_desc;
	en_dbg(DRV, priv,
	       "Set tx_ring[%d]->recycle_ring = rx_ring[%d]\n",
	       tx_ring_idx, rx_idx);
}
int mlx4_en_start_port(struct net_device *dev) int mlx4_en_start_port(struct net_device *dev)
{ {
struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_priv *priv = netdev_priv(dev);
...@@ -1644,6 +1662,8 @@ int mlx4_en_start_port(struct net_device *dev) ...@@ -1644,6 +1662,8 @@ int mlx4_en_start_port(struct net_device *dev)
} }
tx_ring->tx_queue = netdev_get_tx_queue(dev, i); tx_ring->tx_queue = netdev_get_tx_queue(dev, i);
mlx4_en_init_recycle_ring(priv, i);
/* Arm CQ for TX completions */ /* Arm CQ for TX completions */
mlx4_en_arm_cq(priv, cq); mlx4_en_arm_cq(priv, cq);
...@@ -2561,6 +2581,13 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) ...@@ -2561,6 +2581,13 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
if (priv->tx_ring_num < xdp_ring_num + MLX4_EN_NUM_UP) {
en_err(priv,
"Minimum %d tx channels required to run XDP\n",
(xdp_ring_num + MLX4_EN_NUM_UP) / MLX4_EN_NUM_UP);
return -EINVAL;
}
if (prog) { if (prog) {
prog = bpf_prog_add(prog, priv->rx_ring_num - 1); prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
if (IS_ERR(prog)) if (IS_ERR(prog))
...@@ -2574,6 +2601,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) ...@@ -2574,6 +2601,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
} }
priv->xdp_ring_num = xdp_ring_num; priv->xdp_ring_num = xdp_ring_num;
netif_set_real_num_tx_queues(dev, priv->tx_ring_num -
priv->xdp_ring_num);
for (i = 0; i < priv->rx_ring_num; i++) { for (i = 0; i < priv->rx_ring_num; i++) {
old_prog = xchg(&priv->rx_ring[i]->xdp_prog, prog); old_prog = xchg(&priv->rx_ring[i]->xdp_prog, prog);
......
...@@ -783,7 +783,9 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud ...@@ -783,7 +783,9 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
struct mlx4_en_rx_alloc *frags; struct mlx4_en_rx_alloc *frags;
struct mlx4_en_rx_desc *rx_desc; struct mlx4_en_rx_desc *rx_desc;
struct bpf_prog *xdp_prog; struct bpf_prog *xdp_prog;
int doorbell_pending;
struct sk_buff *skb; struct sk_buff *skb;
int tx_index;
int index; int index;
int nr; int nr;
unsigned int length; unsigned int length;
...@@ -800,6 +802,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud ...@@ -800,6 +802,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
return polled; return polled;
xdp_prog = READ_ONCE(ring->xdp_prog); xdp_prog = READ_ONCE(ring->xdp_prog);
doorbell_pending = 0;
tx_index = (priv->tx_ring_num - priv->xdp_ring_num) + cq->ring;
/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
* descriptor offset can be deduced from the CQE index instead of * descriptor offset can be deduced from the CQE index instead of
...@@ -898,6 +902,12 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud ...@@ -898,6 +902,12 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
switch (act) { switch (act) {
case XDP_PASS: case XDP_PASS:
break; break;
case XDP_TX:
if (!mlx4_en_xmit_frame(frags, dev,
length, tx_index,
&doorbell_pending))
goto consumed;
break;
default: default:
bpf_warn_invalid_xdp_action(act); bpf_warn_invalid_xdp_action(act);
case XDP_ABORTED: case XDP_ABORTED:
...@@ -1068,6 +1078,9 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud ...@@ -1068,6 +1078,9 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
} }
out: out:
if (doorbell_pending)
mlx4_en_xmit_doorbell(priv->tx_ring[tx_index]);
AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled); AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled);
mlx4_cq_set_ci(&cq->mcq); mlx4_cq_set_ci(&cq->mcq);
wmb(); /* ensure HW sees CQ consumer before we post new buffers */ wmb(); /* ensure HW sees CQ consumer before we post new buffers */
...@@ -1147,6 +1160,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev) ...@@ -1147,6 +1160,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
* This only works when num_frags == 1. * This only works when num_frags == 1.
*/ */
if (priv->xdp_ring_num) { if (priv->xdp_ring_num) {
dma_dir = PCI_DMA_BIDIRECTIONAL;
/* This will gain efficient xdp frame recycling at the expense /* This will gain efficient xdp frame recycling at the expense
* of more costly truesize accounting * of more costly truesize accounting
*/ */
......
...@@ -196,6 +196,7 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, ...@@ -196,6 +196,7 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
ring->last_nr_txbb = 1; ring->last_nr_txbb = 1;
memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info)); memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
memset(ring->buf, 0, ring->buf_size); memset(ring->buf, 0, ring->buf_size);
ring->free_tx_desc = mlx4_en_free_tx_desc;
ring->qp_state = MLX4_QP_STATE_RST; ring->qp_state = MLX4_QP_STATE_RST;
ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8); ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8);
...@@ -265,7 +266,7 @@ static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv, ...@@ -265,7 +266,7 @@ static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
} }
static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
struct mlx4_en_tx_ring *ring, struct mlx4_en_tx_ring *ring,
int index, u8 owner, u64 timestamp, int index, u8 owner, u64 timestamp,
int napi_mode) int napi_mode)
...@@ -344,6 +345,27 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, ...@@ -344,6 +345,27 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
return tx_info->nr_txbb; return tx_info->nr_txbb;
} }
/* Completion handler for XDP TX rings: instead of unmapping the page,
 * try to give it back to the paired RX ring's page cache.  Falls back
 * to unmap + put_page when the cache rejects it.  Returns the number
 * of TXBBs consumed by this descriptor, like mlx4_en_free_tx_desc.
 */
u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
			    struct mlx4_en_tx_ring *ring,
			    int index, u8 owner, u64 timestamp,
			    int napi_mode)
{
	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
	struct page *page = tx_info->page;
	dma_addr_t dma = tx_info->map0_dma;
	struct mlx4_en_rx_alloc frame = {
		.page = page,
		.dma = dma,
		.page_offset = 0,
		.page_size = PAGE_SIZE,
	};

	if (!mlx4_en_rx_recycle(ring->recycle_ring, &frame)) {
		/* RX cache is full: release the page the slow way. */
		dma_unmap_page(priv->ddev, dma, PAGE_SIZE,
			       priv->frag_info[0].dma_dir);
		put_page(page);
	}

	return tx_info->nr_txbb;
}
int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring) int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
{ {
...@@ -362,7 +384,7 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring) ...@@ -362,7 +384,7 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
} }
while (ring->cons != ring->prod) { while (ring->cons != ring->prod) {
ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring, ring->last_nr_txbb = ring->free_tx_desc(priv, ring,
ring->cons & ring->size_mask, ring->cons & ring->size_mask,
!!(ring->cons & ring->size), 0, !!(ring->cons & ring->size), 0,
0 /* Non-NAPI caller */); 0 /* Non-NAPI caller */);
...@@ -444,7 +466,7 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev, ...@@ -444,7 +466,7 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
timestamp = mlx4_en_get_cqe_ts(cqe); timestamp = mlx4_en_get_cqe_ts(cqe);
/* free next descriptor */ /* free next descriptor */
last_nr_txbb = mlx4_en_free_tx_desc( last_nr_txbb = ring->free_tx_desc(
priv, ring, ring_index, priv, ring, ring_index,
!!((ring_cons + txbbs_skipped) & !!((ring_cons + txbbs_skipped) &
ring->size), timestamp, napi_budget); ring->size), timestamp, napi_budget);
...@@ -476,6 +498,9 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev, ...@@ -476,6 +498,9 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb; ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped; ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;
if (ring->free_tx_desc == mlx4_en_recycle_tx_desc)
return done < budget;
netdev_tx_completed_queue(ring->tx_queue, packets, bytes); netdev_tx_completed_queue(ring->tx_queue, packets, bytes);
/* Wakeup Tx queue if this stopped, and ring is not full. /* Wakeup Tx queue if this stopped, and ring is not full.
...@@ -1052,3 +1077,106 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -1052,3 +1077,106 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK; return NETDEV_TX_OK;
} }
/* Transmit an XDP_TX frame on a dedicated per-RX-ring TX ring.
 *
 * Runs only from the RX completion path of the paired ring (see the
 * XDP_TX case in mlx4_en_process_rx_cq), so no TX locking is needed.
 * On success the ring takes ownership of frame->page (frame->page is
 * cleared here and the page returns via mlx4_en_recycle_tx_desc at
 * completion time); on NETDEV_TX_BUSY the caller keeps ownership.
 * @doorbell_pending counts frames queued without a doorbell; it is
 * reset to 0 whenever a doorbell is actually rung.
 */
netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_alloc *frame,
			       struct net_device *dev, unsigned int length,
			       int tx_ind, int *doorbell_pending)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	union mlx4_wqe_qpn_vlan qpn_vlan = {};
	struct mlx4_en_tx_ring *ring;
	struct mlx4_en_tx_desc *tx_desc;
	struct mlx4_wqe_data_seg *data;
	struct mlx4_en_tx_info *tx_info;
	int index, bf_index;
	bool send_doorbell;
	int nr_txbb = 1;
	bool stop_queue;
	dma_addr_t dma;
	int real_size;
	__be32 op_own;
	u32 ring_cons;
	bool bf_ok;

	/* This path assumes each frame fits in exactly one TXBB
	 * (one ctrl segment + one data segment).
	 */
	BUILD_BUG_ON_MSG(ALIGN(CTRL_SIZE + DS_SIZE, TXBB_SIZE) != TXBB_SIZE,
			 "mlx4_en_xmit_frame requires minimum size tx desc");

	ring = priv->tx_ring[tx_ind];

	if (!priv->port_up)
		goto tx_drop;

	if (mlx4_en_is_tx_ring_full(ring))
		goto tx_drop;

	/* fetch ring->cons far ahead before needing it to avoid stall */
	ring_cons = READ_ONCE(ring->cons);

	index = ring->prod & ring->size_mask;
	tx_info = &ring->tx_info[index];

	bf_ok = ring->bf_enabled;

	/* Track current inflight packets for performance analysis */
	AVG_PERF_COUNTER(priv->pstats.inflight_avg,
			 (u32)(ring->prod - ring_cons - 1));

	bf_index = ring->prod;
	tx_desc = ring->buf + index * TXBB_SIZE;
	data = &tx_desc->data;

	dma = frame->dma;

	/* Take the page from the caller; tx_info->page aliases the skb
	 * pointer (union in struct mlx4_en_tx_info) for the recycle path.
	 */
	tx_info->page = frame->page;
	frame->page = NULL;
	tx_info->map0_dma = dma;
	tx_info->map0_byte_count = length;
	tx_info->nr_txbb = nr_txbb;
	tx_info->nr_bytes = max_t(unsigned int, length, ETH_ZLEN);
	tx_info->data_offset = (void *)data - (void *)tx_desc;
	tx_info->ts_requested = 0;
	tx_info->nr_maps = 1;
	tx_info->linear = 1;
	tx_info->inl = 0;

	/* Page was mapped bidirectional at alloc time; only a sync is
	 * needed before the HW reads it.
	 */
	dma_sync_single_for_device(priv->ddev, dma, length, PCI_DMA_TODEVICE);

	data->addr = cpu_to_be64(dma);
	data->lkey = ring->mr_key;
	/* addr/lkey must be visible before byte_count validates the seg */
	dma_wmb();
	data->byte_count = cpu_to_be32(length);

	/* tx completion can avoid cache line miss for common cases */
	tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;

	op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
		 ((ring->prod & ring->size) ?
		  cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);

	ring->packets++;
	ring->bytes += tx_info->nr_bytes;
	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, length);

	ring->prod += nr_txbb;

	stop_queue = mlx4_en_is_tx_ring_full(ring);
	/* Batch doorbells: ring only after MLX4_EN_DOORBELL_BUDGET queued
	 * frames, or immediately when the ring just filled up.
	 */
	send_doorbell = stop_queue ||
			*doorbell_pending > MLX4_EN_DOORBELL_BUDGET;
	bf_ok &= send_doorbell;

	real_size = ((CTRL_SIZE + nr_txbb * DS_SIZE) / 16) & 0x3f;

	if (bf_ok)
		qpn_vlan.bf_qpn = ring->doorbell_qpn | cpu_to_be32(real_size);
	else
		qpn_vlan.fence_size = real_size;

	mlx4_en_tx_write_desc(ring, tx_desc, qpn_vlan, TXBB_SIZE, bf_index,
			      op_own, bf_ok, send_doorbell);
	*doorbell_pending = send_doorbell ? 0 : *doorbell_pending + 1;

	return NETDEV_TX_OK;

tx_drop:
	ring->tx_dropped++;
	return NETDEV_TX_BUSY;
}
...@@ -132,6 +132,7 @@ enum { ...@@ -132,6 +132,7 @@ enum {
MLX4_EN_NUM_UP) MLX4_EN_NUM_UP)
#define MLX4_EN_DEFAULT_TX_WORK 256 #define MLX4_EN_DEFAULT_TX_WORK 256
#define MLX4_EN_DOORBELL_BUDGET 8
/* Target number of packets to coalesce with interrupt moderation */ /* Target number of packets to coalesce with interrupt moderation */
#define MLX4_EN_RX_COAL_TARGET 44 #define MLX4_EN_RX_COAL_TARGET 44
...@@ -219,7 +220,10 @@ enum cq_type { ...@@ -219,7 +220,10 @@ enum cq_type {
struct mlx4_en_tx_info { struct mlx4_en_tx_info {
union {
struct sk_buff *skb; struct sk_buff *skb;
struct page *page;
};
dma_addr_t map0_dma; dma_addr_t map0_dma;
u32 map0_byte_count; u32 map0_byte_count;
u32 nr_txbb; u32 nr_txbb;
...@@ -265,6 +269,8 @@ struct mlx4_en_page_cache { ...@@ -265,6 +269,8 @@ struct mlx4_en_page_cache {
struct mlx4_en_rx_alloc buf[MLX4_EN_CACHE_SIZE]; struct mlx4_en_rx_alloc buf[MLX4_EN_CACHE_SIZE];
}; };
struct mlx4_en_priv;
struct mlx4_en_tx_ring { struct mlx4_en_tx_ring {
/* cache line used and dirtied in tx completion /* cache line used and dirtied in tx completion
* (mlx4_en_free_tx_buf()) * (mlx4_en_free_tx_buf())
...@@ -298,6 +304,11 @@ struct mlx4_en_tx_ring { ...@@ -298,6 +304,11 @@ struct mlx4_en_tx_ring {
__be32 mr_key; __be32 mr_key;
void *buf; void *buf;
struct mlx4_en_tx_info *tx_info; struct mlx4_en_tx_info *tx_info;
struct mlx4_en_rx_ring *recycle_ring;
u32 (*free_tx_desc)(struct mlx4_en_priv *priv,
struct mlx4_en_tx_ring *ring,
int index, u8 owner,
u64 timestamp, int napi_mode);
u8 *bounce_buf; u8 *bounce_buf;
struct mlx4_qp_context context; struct mlx4_qp_context context;
int qpn; int qpn;
...@@ -678,6 +689,12 @@ void mlx4_en_tx_irq(struct mlx4_cq *mcq); ...@@ -678,6 +689,12 @@ void mlx4_en_tx_irq(struct mlx4_cq *mcq);
u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
void *accel_priv, select_queue_fallback_t fallback); void *accel_priv, select_queue_fallback_t fallback);
netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev);
netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_alloc *frame,
struct net_device *dev, unsigned int length,
int tx_ind, int *doorbell_pending);
void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring);
bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring,
struct mlx4_en_rx_alloc *frame);
int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
struct mlx4_en_tx_ring **pring, struct mlx4_en_tx_ring **pring,
...@@ -706,6 +723,14 @@ int mlx4_en_process_rx_cq(struct net_device *dev, ...@@ -706,6 +723,14 @@ int mlx4_en_process_rx_cq(struct net_device *dev,
int budget); int budget);
int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget); int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget);
int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget); int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget);
u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
struct mlx4_en_tx_ring *ring,
int index, u8 owner, u64 timestamp,
int napi_mode);
u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
struct mlx4_en_tx_ring *ring,
int index, u8 owner, u64 timestamp,
int napi_mode);
void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride, void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
int is_tx, int rss, int qpn, int cqn, int user_prio, int is_tx, int rss, int qpn, int cqn, int user_prio,
struct mlx4_qp_context *context); struct mlx4_qp_context *context);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment