Commit 39c536ac authored by Daniel Borkmann

Merge branch 'xdp-ice-mbuf'

Alexander Lobakin says:

====================
The set grew from the poor performance of %BPF_F_TEST_XDP_LIVE_FRAMES
when the ice-backed device is a sender. Initially it was around
3.3 Mpps per thread, while I get 5.5 with skb-based pktgen ...

After first fixing 0005 (0004 is a prereq for it; it's strange nobody
noticed it earlier), I started catching random OOMs. This is how 0002
(and partially 0001) appeared.

0003 is a suggestion from Maciej not to waste time refactoring dead
lines. 0006 is a "cherry on top" that brings the final 6.7 Mpps.
4.5 of the 6 are fixes, but only the first three are tagged, since
backporting then starts getting tricky. I may backport the rest
manually later on.

TL;DR for the series is that shortcuts are good, but only as long as
they don't make the driver miss important things. %XDP_TX is purely
driver-local, but .ndo_xdp_xmit() is not, and assumptions that are safe
for the former can be unsafe there.
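
A minimal, self-contained sketch of that distinction (illustrative
userspace C with hypothetical names; the real handling is the
ice_tx_buf_type switch in the diff below): tag every Tx buffer with how
it was produced and release it accordingly on completion, instead of
inferring the free method from flags or from which union member is set.

	/* Illustrative sketch only: the enum, struct and printf() calls are
	 * hypothetical stand-ins for dev_kfree_skb_any(), page_frag_free()
	 * and xdp_return_frame() in the real driver.
	 */
	#include <stdio.h>

	enum tx_buf_type {
		TX_BUF_EMPTY,		/* unused slot, nothing to do */
		TX_BUF_SKB,		/* regular .ndo_start_xmit() packet */
		TX_BUF_XDP_TX,		/* driver-local page frag (XDP_TX) */
		TX_BUF_XDP_XMIT,	/* .ndo_xdp_xmit() frame, owned elsewhere */
	};

	struct tx_buf {
		enum tx_buf_type type;
		void *data;
	};

	/* Completion path: the buffer itself says how it must be freed. */
	static void complete_tx_buf(struct tx_buf *buf)
	{
		switch (buf->type) {
		case TX_BUF_SKB:
			printf("consume skb %p\n", buf->data);
			break;
		case TX_BUF_XDP_TX:
			printf("free local page frag %p\n", buf->data);
			break;
		case TX_BUF_XDP_XMIT:
			printf("return frame %p to its owner\n", buf->data);
			break;
		case TX_BUF_EMPTY:
			break;
		}
		buf->type = TX_BUF_EMPTY;	/* slot is reusable now */
	}

	int main(void)
	{
		struct tx_buf bufs[] = {
			{ .type = TX_BUF_XDP_TX,   .data = (void *)0x1 },
			{ .type = TX_BUF_XDP_XMIT, .data = (void *)0x2 },
		};

		for (unsigned int i = 0; i < 2; i++)
			complete_tx_buf(&bufs[i]);

		return 0;
	}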

With this series, plus one core code patch[0], "live frames" and
xdp-trafficgen are now safe and fast on ice (probably more to come).

  [0] https://lore.kernel.org/all/20230209172827.874728-1-alexandr.lobakin@intel.com
====================
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents 0b075724 ad07f29b
......@@ -85,7 +85,7 @@ ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc,
td_cmd = ICE_TXD_LAST_DESC_CMD | ICE_TX_DESC_CMD_DUMMY |
ICE_TX_DESC_CMD_RE;
tx_buf->tx_flags = ICE_TX_FLAGS_DUMMY_PKT;
tx_buf->type = ICE_TX_BUF_DUMMY;
tx_buf->raw_buf = raw_packet;
tx_desc->cmd_type_offset_bsz =
......@@ -112,31 +112,29 @@ ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc,
static void
ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf)
{
if (tx_buf->skb) {
if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT) {
devm_kfree(ring->dev, tx_buf->raw_buf);
} else if (ice_ring_is_xdp(ring)) {
if (ring->xsk_pool)
xsk_buff_free(tx_buf->xdp);
else
page_frag_free(tx_buf->raw_buf);
} else {
dev_kfree_skb_any(tx_buf->skb);
}
if (dma_unmap_len(tx_buf, len))
dma_unmap_single(ring->dev,
dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE);
} else if (dma_unmap_len(tx_buf, len)) {
if (dma_unmap_len(tx_buf, len))
dma_unmap_page(ring->dev,
dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE);
switch (tx_buf->type) {
case ICE_TX_BUF_DUMMY:
devm_kfree(ring->dev, tx_buf->raw_buf);
break;
case ICE_TX_BUF_SKB:
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
page_frag_free(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
break;
}
tx_buf->next_to_watch = NULL;
tx_buf->skb = NULL;
tx_buf->type = ICE_TX_BUF_EMPTY;
dma_unmap_len_set(tx_buf, len, 0);
/* tx_buf must be completely set up in the transmit path */
}
......@@ -269,7 +267,7 @@ static bool ice_clean_tx_irq(struct ice_tx_ring *tx_ring, int napi_budget)
DMA_TO_DEVICE);
/* clear tx_buf data */
tx_buf->skb = NULL;
tx_buf->type = ICE_TX_BUF_EMPTY;
dma_unmap_len_set(tx_buf, len, 0);
/* unmap remaining buffers */
......@@ -580,7 +578,7 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
case XDP_TX:
if (static_branch_unlikely(&ice_xdp_locking_key))
spin_lock(&xdp_ring->tx_lock);
ret = __ice_xmit_xdp_ring(xdp, xdp_ring);
ret = __ice_xmit_xdp_ring(xdp, xdp_ring, false);
if (static_branch_unlikely(&ice_xdp_locking_key))
spin_unlock(&xdp_ring->tx_lock);
if (ret == ICE_XDP_CONSUMED)
......@@ -607,6 +605,25 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
ice_set_rx_bufs_act(xdp, rx_ring, ret);
}
/**
* ice_xmit_xdp_ring - submit frame to XDP ring for transmission
* @xdpf: XDP frame that will be converted to XDP buff
* @xdp_ring: XDP ring for transmission
*/
static int ice_xmit_xdp_ring(const struct xdp_frame *xdpf,
struct ice_tx_ring *xdp_ring)
{
struct xdp_buff xdp;
xdp.data_hard_start = (void *)xdpf;
xdp.data = xdpf->data;
xdp.data_end = xdp.data + xdpf->len;
xdp.frame_sz = xdpf->frame_sz;
xdp.flags = xdpf->flags;
return __ice_xmit_xdp_ring(&xdp, xdp_ring, true);
}
/**
* ice_xdp_xmit - submit packets to XDP ring for transmission
* @dev: netdev
......@@ -652,7 +669,7 @@ ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
tx_buf = &xdp_ring->tx_buf[xdp_ring->next_to_use];
for (i = 0; i < n; i++) {
struct xdp_frame *xdpf = frames[i];
const struct xdp_frame *xdpf = frames[i];
int err;
err = ice_xmit_xdp_ring(xdpf, xdp_ring);
......@@ -1712,6 +1729,7 @@ ice_tx_map(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first,
DMA_TO_DEVICE);
tx_buf = &tx_ring->tx_buf[i];
tx_buf->type = ICE_TX_BUF_FRAG;
}
/* record SW timestamp if HW timestamp is not available */
......@@ -2355,6 +2373,7 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring)
/* record the location of the first descriptor for this packet */
first = &tx_ring->tx_buf[tx_ring->next_to_use];
first->skb = skb;
first->type = ICE_TX_BUF_SKB;
first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
first->gso_segs = 1;
first->tx_flags = 0;
......@@ -2527,11 +2546,11 @@ void ice_clean_ctrl_tx_irq(struct ice_tx_ring *tx_ring)
dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE);
if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT)
if (tx_buf->type == ICE_TX_BUF_DUMMY)
devm_kfree(tx_ring->dev, tx_buf->raw_buf);
/* clear next_to_watch to prevent false hangs */
tx_buf->raw_buf = NULL;
tx_buf->type = ICE_TX_BUF_EMPTY;
tx_buf->tx_flags = 0;
tx_buf->next_to_watch = NULL;
dma_unmap_len_set(tx_buf, len, 0);
......
......@@ -121,10 +121,7 @@ static inline int ice_skb_pad(void)
#define ICE_TX_FLAGS_TSO BIT(0)
#define ICE_TX_FLAGS_HW_VLAN BIT(1)
#define ICE_TX_FLAGS_SW_VLAN BIT(2)
/* ICE_TX_FLAGS_DUMMY_PKT is used to mark dummy packets that should be
* freed instead of returned like skb packets.
*/
#define ICE_TX_FLAGS_DUMMY_PKT BIT(3)
/* Free, was ICE_TX_FLAGS_DUMMY_PKT */
#define ICE_TX_FLAGS_TSYN BIT(4)
#define ICE_TX_FLAGS_IPV4 BIT(5)
#define ICE_TX_FLAGS_IPV6 BIT(6)
......@@ -149,22 +146,44 @@ static inline int ice_skb_pad(void)
#define ICE_TXD_LAST_DESC_CMD (ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS)
/**
* enum ice_tx_buf_type - type of &ice_tx_buf to act on Tx completion
* @ICE_TX_BUF_EMPTY: unused OR XSk frame, no action required
* @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
* @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
* @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
* @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
* @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
* @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
*/
enum ice_tx_buf_type {
ICE_TX_BUF_EMPTY = 0U,
ICE_TX_BUF_DUMMY,
ICE_TX_BUF_FRAG,
ICE_TX_BUF_SKB,
ICE_TX_BUF_XDP_TX,
ICE_TX_BUF_XDP_XMIT,
ICE_TX_BUF_XSK_TX,
};
struct ice_tx_buf {
union {
struct ice_tx_desc *next_to_watch;
u32 rs_idx;
};
union {
struct sk_buff *skb;
void *raw_buf; /* used for XDP */
struct xdp_buff *xdp; /* used for XDP_TX ZC */
void *raw_buf; /* used for XDP_TX and FDir rules */
struct sk_buff *skb; /* used for .ndo_start_xmit() */
struct xdp_frame *xdpf; /* used for .ndo_xdp_xmit() */
struct xdp_buff *xdp; /* used for XDP_TX ZC */
};
unsigned int bytecount;
union {
unsigned int gso_segs;
unsigned int nr_frags; /* used for mbuf XDP */
unsigned int nr_frags; /* used for mbuf XDP */
};
u32 tx_flags;
u32 type:16; /* &ice_tx_buf_type */
u32 tx_flags:16;
DEFINE_DMA_UNMAP_LEN(len);
DEFINE_DMA_UNMAP_ADDR(dma);
};
......
......@@ -222,18 +222,28 @@ ice_receive_skb(struct ice_rx_ring *rx_ring, struct sk_buff *skb, u16 vlan_tag)
/**
* ice_clean_xdp_tx_buf - Free and unmap XDP Tx buffer
* @xdp_ring: XDP Tx ring
* @dev: device for DMA mapping
* @tx_buf: Tx buffer to clean
* @bq: XDP bulk flush struct
*/
static void
ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf)
ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf *tx_buf,
struct xdp_frame_bulk *bq)
{
dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma),
dma_unmap_single(dev, dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len), DMA_TO_DEVICE);
dma_unmap_len_set(tx_buf, len, 0);
xdp_ring->xdp_tx_active--;
page_frag_free(tx_buf->raw_buf);
tx_buf->raw_buf = NULL;
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
page_frag_free(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame_bulk(tx_buf->xdpf, bq);
break;
}
tx_buf->type = ICE_TX_BUF_EMPTY;
}
/**
......@@ -243,11 +253,13 @@ ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf)
static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
{
int total_bytes = 0, total_pkts = 0;
struct device *dev = xdp_ring->dev;
u32 ntc = xdp_ring->next_to_clean;
struct ice_tx_desc *tx_desc;
u32 cnt = xdp_ring->count;
struct xdp_frame_bulk bq;
u32 frags, xdp_tx = 0;
u32 ready_frames = 0;
u32 frags;
u32 idx;
u32 ret;
......@@ -261,12 +273,16 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
ready_frames = idx + cnt - ntc + 1;
}
if (!ready_frames)
if (unlikely(!ready_frames))
return 0;
ret = ready_frames;
xdp_frame_bulk_init(&bq);
rcu_read_lock(); /* xdp_return_frame_bulk() */
while (ready_frames) {
struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc];
struct ice_tx_buf *head = tx_buf;
/* bytecount holds size of head + frags */
total_bytes += tx_buf->bytecount;
......@@ -274,11 +290,8 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
total_pkts++;
/* count head + frags */
ready_frames -= frags + 1;
xdp_tx++;
if (xdp_ring->xsk_pool)
xsk_buff_free(tx_buf->xdp);
else
ice_clean_xdp_tx_buf(xdp_ring, tx_buf);
ntc++;
if (ntc == cnt)
ntc = 0;
......@@ -286,15 +299,21 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
for (int i = 0; i < frags; i++) {
tx_buf = &xdp_ring->tx_buf[ntc];
ice_clean_xdp_tx_buf(xdp_ring, tx_buf);
ice_clean_xdp_tx_buf(dev, tx_buf, &bq);
ntc++;
if (ntc == cnt)
ntc = 0;
}
ice_clean_xdp_tx_buf(dev, head, &bq);
}
xdp_flush_frame_bulk(&bq);
rcu_read_unlock();
tx_desc->cmd_type_offset_bsz = 0;
xdp_ring->next_to_clean = ntc;
xdp_ring->xdp_tx_active -= xdp_tx;
ice_update_tx_ring_stats(xdp_ring, total_pkts, total_bytes);
return ret;
......@@ -304,8 +323,10 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
* __ice_xmit_xdp_ring - submit frame to XDP ring for transmission
* @xdp: XDP buffer to be placed onto Tx descriptors
* @xdp_ring: XDP ring for transmission
* @frame: whether this comes from .ndo_xdp_xmit()
*/
int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring)
int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring,
bool frame)
{
struct skb_shared_info *sinfo = NULL;
u32 size = xdp->data_end - xdp->data;
......@@ -321,17 +342,17 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring)
u32 frag = 0;
free_space = ICE_DESC_UNUSED(xdp_ring);
if (ICE_DESC_UNUSED(xdp_ring) < ICE_RING_QUARTER(xdp_ring))
if (free_space < ICE_RING_QUARTER(xdp_ring))
free_space += ice_clean_xdp_irq(xdp_ring);
if (unlikely(!free_space))
goto busy;
if (unlikely(xdp_buff_has_frags(xdp))) {
sinfo = xdp_get_shared_info_from_buff(xdp);
nr_frags = sinfo->nr_frags;
if (free_space < nr_frags + 1) {
xdp_ring->ring_stats->tx_stats.tx_busy++;
return ICE_XDP_CONSUMED;
}
if (free_space < nr_frags + 1)
goto busy;
}
tx_desc = ICE_TX_DESC(xdp_ring, ntu);
......@@ -349,9 +370,15 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring)
dma_unmap_len_set(tx_buf, len, size);
dma_unmap_addr_set(tx_buf, dma, dma);
if (frame) {
tx_buf->type = ICE_TX_BUF_FRAG;
} else {
tx_buf->type = ICE_TX_BUF_XDP_TX;
tx_buf->raw_buf = data;
}
tx_desc->buf_addr = cpu_to_le64(dma);
tx_desc->cmd_type_offset_bsz = ice_build_ctob(0, 0, size, 0);
tx_buf->raw_buf = data;
ntu++;
if (ntu == cnt)
......@@ -372,6 +399,11 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring)
tx_head->bytecount = xdp_get_buff_len(xdp);
tx_head->nr_frags = nr_frags;
if (frame) {
tx_head->type = ICE_TX_BUF_XDP_XMIT;
tx_head->xdpf = xdp->data_hard_start;
}
/* update last descriptor from a frame with EOP */
tx_desc->cmd_type_offset_bsz |=
cpu_to_le64(ICE_TX_DESC_CMD_EOP << ICE_TXD_QW1_CMD_S);
......@@ -395,19 +427,11 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring)
ntu--;
}
return ICE_XDP_CONSUMED;
}
/**
* ice_xmit_xdp_ring - submit frame to XDP ring for transmission
* @xdpf: XDP frame that will be converted to XDP buff
* @xdp_ring: XDP ring for transmission
*/
int ice_xmit_xdp_ring(struct xdp_frame *xdpf, struct ice_tx_ring *xdp_ring)
{
struct xdp_buff xdp;
busy:
xdp_ring->ring_stats->tx_stats.tx_busy++;
xdp_convert_frame_to_buff(xdpf, &xdp);
return __ice_xmit_xdp_ring(&xdp, xdp_ring);
return ICE_XDP_CONSUMED;
}
/**
......
......@@ -142,8 +142,8 @@ static inline u32 ice_set_rs_bit(const struct ice_tx_ring *xdp_ring)
void ice_finalize_xdp_rx(struct ice_tx_ring *xdp_ring, unsigned int xdp_res, u32 first_idx);
int ice_xmit_xdp_buff(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring);
int ice_xmit_xdp_ring(struct xdp_frame *xdpf, struct ice_tx_ring *xdp_ring);
int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring);
int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring,
bool frame);
void ice_release_rx_desc(struct ice_rx_ring *rx_ring, u16 val);
void
ice_process_skb_fields(struct ice_rx_ring *rx_ring,
......
......@@ -631,7 +631,8 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring)
for (i = 0; i < xsk_frames; i++) {
tx_buf = &xdp_ring->tx_buf[ntc];
if (tx_buf->xdp) {
if (tx_buf->type == ICE_TX_BUF_XSK_TX) {
tx_buf->type = ICE_TX_BUF_EMPTY;
xsk_buff_free(tx_buf->xdp);
xdp_ring->xdp_tx_active--;
} else {
......@@ -685,6 +686,7 @@ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp,
tx_buf = &xdp_ring->tx_buf[ntu];
tx_buf->xdp = xdp;
tx_buf->type = ICE_TX_BUF_XSK_TX;
tx_desc = ICE_TX_DESC(xdp_ring, ntu);
tx_desc->buf_addr = cpu_to_le64(dma);
tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP,
......@@ -1083,12 +1085,12 @@ void ice_xsk_clean_xdp_ring(struct ice_tx_ring *xdp_ring)
while (ntc != ntu) {
struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc];
if (tx_buf->xdp)
if (tx_buf->type == ICE_TX_BUF_XSK_TX) {
tx_buf->type = ICE_TX_BUF_EMPTY;
xsk_buff_free(tx_buf->xdp);
else
} else {
xsk_frames++;
tx_buf->raw_buf = NULL;
}
ntc++;
if (ntc >= xdp_ring->count)
......