Commit 40ea3ee2 authored by David S. Miller

Merge branch 'virtio-net-xdp-multi-buffer'

Heng Qi says:

====================
virtio-net: support multi buffer xdp

Changes since PATCH v4:
- Make netdev_warn() in [PATCH 2/10] independent from [PATCH 3/10].

Changes since PATCH v3:
- Separate fix patch [2/10] for MTU calculation of single buffer xdp.
  Note that this patch needs to be backported to the stable branch.

Changes since PATCH v2:
- Even though single buffer xdp has a hole mechanism, it causes no
  problem (the MTU is limited and GUEST GSO is turned off), so there
  is no need to backport "[PATCH 1/9]";
- Modify calculation of MTU for single buffer xdp in virtnet_xdp_set();
- Make truesize in mergeable mode return to literal meaning;
- Add some comments for legibility;

Changes since RFC:
- Using headroom instead of vi->xdp_enabled to avoid re-reading
  in add_recvbuf_mergeable();
- Disable GRO_HW and keep linearization for single buffer xdp;
- Renamed to virtnet_build_xdp_buff_mrg();
- pr_debug() to netdev_dbg();
- Adjusted the order of the patch series.

Currently, virtio net only supports xdp for single-buffer packets
or linearized multi-buffer packets. This patchset adds xdp support
for multi-buffer packets, so a larger MTU can be used when the xdp
program declares frags support (xdp.frags). Single buffer handling
is not affected.
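
As a point of reference (not part of this series), a minimal sketch
of how a program opts in: libbpf's "xdp.frags" section sets
BPF_F_XDP_HAS_FRAGS at load time, which the driver then observes as
prog->aux->xdp_has_frags. The file and function names below are made
up; the "xdp.frags" section name is the libbpf convention:

    /* xdp_frags_min.bpf.c - hypothetical example program */
    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("xdp.frags")
    int xdp_pass_mb(struct xdp_md *ctx)
    {
            /* ctx may describe a multi-buffer frame here; data/data_end
             * cover only the first (linear) buffer.
             */
            return XDP_PASS;
    }

    char _license[] SEC("license") = "GPL";

Without that flag, the driver keeps the single-buffer bound computed
in virtnet_xdp_set(): roughly PAGE_SIZE - SKB_DATA_ALIGN(VIRTIO_XDP_HEADROOM
+ sizeof(struct skb_shared_info)) - ETH_HLEN, which on a typical 4K-page
x86_64 build works out to about 4096 - 576 - 14 = 3506 bytes.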

In order to build multi-buffer xdp neatly, we integrated the code
into virtnet_build_xdp_buff_mrg(). The first buffer is used for the
prepared xdp_buff, and the rest of the buffers are added to its
skb_shared_info structure. This structure can also be conveniently
converted during XDP_PASS to get the corresponding skb.
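
Note that direct packet access (xdp->data .. xdp->data_end) still
covers only that first buffer; bytes sitting in the frags have to be
reached through helpers such as bpf_xdp_load_bytes(), which copies
across frag boundaries. A hedged sketch, continuing the hypothetical
program above:

    #include <linux/if_ether.h>
    #include <bpf/bpf_endian.h>

    SEC("xdp.frags")
    int xdp_parse_mb(struct xdp_md *ctx)
    {
            struct ethhdr eth;

            /* copy the Ethernet header out of the frame; this works
             * even when the requested range spans frag boundaries
             */
            if (bpf_xdp_load_bytes(ctx, 0, &eth, sizeof(eth)) < 0)
                    return XDP_DROP;

            if (eth.h_proto != bpf_htons(ETH_P_IP))
                    return XDP_DROP;

            return XDP_PASS;
    }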

Since virtio net uses compound pages rather than a page pool, and
bpf_xdp_frags_increase_tail() is written around the page pool
assumption,
(rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag))
is negative in most cases. So we don't set xdp_rxq->frag_size in
virtnet_open(), which disables the tail increase.
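
For context, the tail-growth path this sidesteps is
bpf_xdp_frags_increase_tail() in net/core/filter.c; sketched roughly
from the code at the time (not part of this series), the frag_size
check is what trips:

    static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
    {
            struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
            skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1];
            struct xdp_rxq_info *rxq = xdp->rxq;
            unsigned int tailroom;

            /* frag_size == 0, as virtio-net leaves it, disables growth */
            if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz)
                    return -EOPNOTSUPP;

            tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag);
            if (unlikely(offset > tailroom))
                    return -EINVAL;

            /* grow the last frag from its end, zeroing the new bytes */
            memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset);
            skb_frag_size_add(frag, offset);
            sinfo->xdp_frags_size += offset;

            return 0;
    }

So with xdp_rxq->frag_size left at zero, growing bpf_xdp_adjust_tail()
calls on a frags packet simply return -EOPNOTSUPP instead of computing
a bogus (negative) tailroom.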

====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 5129bd8e fab89baf
@@ -446,9 +446,7 @@ static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
 static struct sk_buff *page_to_skb(struct virtnet_info *vi,
                                    struct receive_queue *rq,
                                    struct page *page, unsigned int offset,
-                                   unsigned int len, unsigned int truesize,
-                                   bool hdr_valid, unsigned int metasize,
-                                   unsigned int headroom)
+                                   unsigned int len, unsigned int truesize)
 {
         struct sk_buff *skb;
         struct virtio_net_hdr_mrg_rxbuf *hdr;
@@ -466,21 +464,11 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
         else
                 hdr_padded_len = sizeof(struct padded_vnet_hdr);
 
-        /* If headroom is not 0, there is an offset between the beginning of the
-         * data and the allocated space, otherwise the data and the allocated
-         * space are aligned.
-         *
-         * Buffers with headroom use PAGE_SIZE as alloc size, see
-         * add_recvbuf_mergeable() + get_mergeable_buf_len()
-         */
-        truesize = headroom ? PAGE_SIZE : truesize;
-        tailroom = truesize - headroom;
-        buf = p - headroom;
-
+        buf = p;
         len -= hdr_len;
         offset += hdr_padded_len;
         p += hdr_padded_len;
-        tailroom -= hdr_padded_len + len;
+        tailroom = truesize - hdr_padded_len - len;
 
         shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
@@ -510,7 +498,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
         if (len <= skb_tailroom(skb))
                 copy = len;
         else
-                copy = ETH_HLEN + metasize;
+                copy = ETH_HLEN;
         skb_put_data(skb, p, copy);
 
         len -= copy;
@@ -549,19 +537,11 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
                 give_pages(rq, page);
 
 ok:
-        /* hdr_valid means no XDP, so we can copy the vnet header */
-        if (hdr_valid) {
-                hdr = skb_vnet_hdr(skb);
-                memcpy(hdr, hdr_p, hdr_len);
-        }
+        hdr = skb_vnet_hdr(skb);
+        memcpy(hdr, hdr_p, hdr_len);
 
         if (page_to_free)
                 put_page(page_to_free);
 
-        if (metasize) {
-                __skb_pull(skb, metasize);
-                skb_metadata_set(skb, metasize);
-        }
-
         return skb;
 }
@@ -570,22 +550,43 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
                                   struct xdp_frame *xdpf)
 {
         struct virtio_net_hdr_mrg_rxbuf *hdr;
-        int err;
+        struct skb_shared_info *shinfo;
+        u8 nr_frags = 0;
+        int err, i;
 
         if (unlikely(xdpf->headroom < vi->hdr_len))
                 return -EOVERFLOW;
 
-        /* Make room for virtqueue hdr (also change xdpf->headroom?) */
+        if (unlikely(xdp_frame_has_frags(xdpf))) {
+                shinfo = xdp_get_shared_info_from_frame(xdpf);
+                nr_frags = shinfo->nr_frags;
+        }
+
+        /* In wrapping function virtnet_xdp_xmit(), we need to free
+         * up the pending old buffers, where we need to calculate the
+         * position of skb_shared_info in xdp_get_frame_len() and
+         * xdp_return_frame(), which will involve to xdpf->data and
+         * xdpf->headroom. Therefore, we need to update the value of
+         * headroom synchronously here.
+         */
         xdpf->headroom -= vi->hdr_len;
         xdpf->data -= vi->hdr_len;
         /* Zero header and leave csum up to XDP layers */
         hdr = xdpf->data;
         memset(hdr, 0, vi->hdr_len);
         xdpf->len += vi->hdr_len;
 
-        sg_init_one(sq->sg, xdpf->data, xdpf->len);
+        sg_init_table(sq->sg, nr_frags + 1);
+        sg_set_buf(sq->sg, xdpf->data, xdpf->len);
+        for (i = 0; i < nr_frags; i++) {
+                skb_frag_t *frag = &shinfo->frags[i];
+
+                sg_set_page(&sq->sg[i + 1], skb_frag_page(frag),
+                            skb_frag_size(frag), skb_frag_off(frag));
+        }
 
-        err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp_to_ptr(xdpf),
-                                   GFP_ATOMIC);
+        err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
                                    xdp_to_ptr(xdpf), GFP_ATOMIC);
         if (unlikely(err))
                 return -ENOSPC; /* Caller handle free/refcnt */
@@ -665,7 +666,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
                 if (likely(is_xdp_frame(ptr))) {
                         struct xdp_frame *frame = ptr_to_xdp(ptr);
 
-                        bytes += frame->len;
+                        bytes += xdp_get_frame_len(frame);
                         xdp_return_frame(frame);
                 } else {
                         struct sk_buff *skb = ptr;
@@ -924,7 +925,7 @@ static struct sk_buff *receive_big(struct net_device *dev,
 {
         struct page *page = buf;
         struct sk_buff *skb =
-                page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);
+                page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
 
         stats->bytes += len - vi->hdr_len;
         if (unlikely(!skb))
@@ -938,6 +939,140 @@ static struct sk_buff *receive_big(struct net_device *dev,
         return NULL;
 }
 
+/* Why not use xdp_build_skb_from_frame() ?
+ * XDP core assumes that xdp frags are PAGE_SIZE in length, while in
+ * virtio-net there are 2 points that do not match its requirements:
+ * 1. The size of the prefilled buffer is not fixed before xdp is set.
+ * 2. xdp_build_skb_from_frame() does more checks that we don't need,
+ *    like eth_type_trans() (which virtio-net does in receive_buf()).
+ */
+static struct sk_buff *build_skb_from_xdp_buff(struct net_device *dev,
+                                               struct virtnet_info *vi,
+                                               struct xdp_buff *xdp,
+                                               unsigned int xdp_frags_truesz)
+{
+        struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+        unsigned int headroom, data_len;
+        struct sk_buff *skb;
+        int metasize;
+        u8 nr_frags;
+
+        if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) {
+                pr_debug("Error building skb as missing reserved tailroom for xdp");
+                return NULL;
+        }
+
+        if (unlikely(xdp_buff_has_frags(xdp)))
+                nr_frags = sinfo->nr_frags;
+
+        skb = build_skb(xdp->data_hard_start, xdp->frame_sz);
+        if (unlikely(!skb))
+                return NULL;
+
+        headroom = xdp->data - xdp->data_hard_start;
+        data_len = xdp->data_end - xdp->data;
+        skb_reserve(skb, headroom);
+        __skb_put(skb, data_len);
+
+        metasize = xdp->data - xdp->data_meta;
+        metasize = metasize > 0 ? metasize : 0;
+        if (metasize)
+                skb_metadata_set(skb, metasize);
+
+        if (unlikely(xdp_buff_has_frags(xdp)))
+                xdp_update_skb_shared_info(skb, nr_frags,
+                                           sinfo->xdp_frags_size,
+                                           xdp_frags_truesz,
+                                           xdp_buff_is_frag_pfmemalloc(xdp));
+
+        return skb;
+}
+
+/* TODO: build xdp in big mode */
+static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
+                                      struct virtnet_info *vi,
+                                      struct receive_queue *rq,
+                                      struct xdp_buff *xdp,
+                                      void *buf,
+                                      unsigned int len,
+                                      unsigned int frame_sz,
+                                      u16 *num_buf,
+                                      unsigned int *xdp_frags_truesize,
+                                      struct virtnet_rq_stats *stats)
+{
+        struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
+        unsigned int headroom, tailroom, room;
+        unsigned int truesize, cur_frag_size;
+        struct skb_shared_info *shinfo;
+        unsigned int xdp_frags_truesz = 0;
+        struct page *page;
+        skb_frag_t *frag;
+        int offset;
+        void *ctx;
+
+        xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
+        xdp_prepare_buff(xdp, buf - VIRTIO_XDP_HEADROOM,
+                         VIRTIO_XDP_HEADROOM + vi->hdr_len, len - vi->hdr_len, true);
+
+        if (*num_buf > 1) {
+                /* If we want to build multi-buffer xdp, we need
+                 * to specify that the flags of xdp_buff have the
+                 * XDP_FLAGS_HAS_FRAG bit.
+                 */
+                if (!xdp_buff_has_frags(xdp))
+                        xdp_buff_set_frags_flag(xdp);
+
+                shinfo = xdp_get_shared_info_from_buff(xdp);
+                shinfo->nr_frags = 0;
+                shinfo->xdp_frags_size = 0;
+        }
+
+        if ((*num_buf - 1) > MAX_SKB_FRAGS)
+                return -EINVAL;
+
+        while ((--*num_buf) >= 1) {
+                buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
+                if (unlikely(!buf)) {
+                        pr_debug("%s: rx error: %d buffers out of %d missing\n",
+                                 dev->name, *num_buf,
+                                 virtio16_to_cpu(vi->vdev, hdr->num_buffers));
+                        dev->stats.rx_length_errors++;
+                        return -EINVAL;
+                }
+
+                stats->bytes += len;
+                page = virt_to_head_page(buf);
+                offset = buf - page_address(page);
+
+                truesize = mergeable_ctx_to_truesize(ctx);
+                headroom = mergeable_ctx_to_headroom(ctx);
+                tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
+                room = SKB_DATA_ALIGN(headroom + tailroom);
+
+                cur_frag_size = truesize;
+                xdp_frags_truesz += cur_frag_size;
+                if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
+                        put_page(page);
+                        pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
+                                 dev->name, len, (unsigned long)(truesize - room));
+                        dev->stats.rx_length_errors++;
+                        return -EINVAL;
+                }
+
+                frag = &shinfo->frags[shinfo->nr_frags++];
+                __skb_frag_set_page(frag, page);
+                skb_frag_off_set(frag, offset);
+                skb_frag_size_set(frag, len);
+                if (page_is_pfmemalloc(page))
+                        xdp_buff_set_frag_pfmemalloc(xdp);
+
+                shinfo->xdp_frags_size += len;
+        }
+
+        *xdp_frags_truesize = xdp_frags_truesz;
+        return 0;
+}
+
 static struct sk_buff *receive_mergeable(struct net_device *dev,
                                          struct virtnet_info *vi,
                                          struct receive_queue *rq,
@@ -955,16 +1090,17 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
         struct bpf_prog *xdp_prog;
         unsigned int truesize = mergeable_ctx_to_truesize(ctx);
         unsigned int headroom = mergeable_ctx_to_headroom(ctx);
-        unsigned int metasize = 0;
-        unsigned int frame_sz;
+        unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
+        unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
+        unsigned int frame_sz, xdp_room;
         int err;
 
         head_skb = NULL;
         stats->bytes += len - vi->hdr_len;
 
-        if (unlikely(len > truesize)) {
+        if (unlikely(len > truesize - room)) {
                 pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
-                         dev->name, len, (unsigned long)ctx);
+                         dev->name, len, (unsigned long)(truesize - room));
                 dev->stats.rx_length_errors++;
                 goto err_skb;
         }
@@ -977,11 +1113,14 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
         rcu_read_lock();
         xdp_prog = rcu_dereference(rq->xdp_prog);
         if (xdp_prog) {
+                unsigned int xdp_frags_truesz = 0;
+                struct skb_shared_info *shinfo;
                 struct xdp_frame *xdpf;
                 struct page *xdp_page;
                 struct xdp_buff xdp;
                 void *data;
                 u32 act;
+                int i;
 
                 /* Transient failure which in theory could occur if
                  * in-flight packets from before XDP was enabled reach
@@ -990,19 +1129,23 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
                 if (unlikely(hdr->hdr.gso_type))
                         goto err_xdp;
 
-                /* Buffers with headroom use PAGE_SIZE as alloc size,
-                 * see add_recvbuf_mergeable() + get_mergeable_buf_len()
+                /* Now XDP core assumes frag size is PAGE_SIZE, but buffers
+                 * with headroom may add hole in truesize, which
+                 * make their length exceed PAGE_SIZE. So we disabled the
+                 * hole mechanism for xdp. See add_recvbuf_mergeable().
                  */
-                frame_sz = headroom ? PAGE_SIZE : truesize;
+                frame_sz = truesize;
 
-                /* This happens when rx buffer size is underestimated
-                 * or headroom is not enough because of the buffer
-                 * was refilled before XDP is set. This should only
-                 * happen for the first several packets, so we don't
-                 * care much about its performance.
+                /* This happens when headroom is not enough because
+                 * of the buffer was prefilled before XDP is set.
+                 * This should only happen for the first several packets.
+                 * In fact, vq reset can be used here to help us clean up
+                 * the prefilled buffers, but many existing devices do not
+                 * support it, and we don't want to bother users who are
+                 * using xdp normally.
                  */
-                if (unlikely(num_buf > 1 ||
-                             headroom < virtnet_get_headroom(vi))) {
+                if (!xdp_prog->aux->xdp_has_frags &&
+                    (num_buf > 1 || headroom < virtnet_get_headroom(vi))) {
                         /* linearize data for XDP */
                         xdp_page = xdp_linearize_page(rq, &num_buf,
                                                       page, offset,
@@ -1013,82 +1156,53 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
                         if (!xdp_page)
                                 goto err_xdp;
                         offset = VIRTIO_XDP_HEADROOM;
+                } else if (unlikely(headroom < virtnet_get_headroom(vi))) {
+                        xdp_room = SKB_DATA_ALIGN(VIRTIO_XDP_HEADROOM +
+                                                  sizeof(struct skb_shared_info));
+                        if (len + xdp_room > PAGE_SIZE)
+                                goto err_xdp;
+
+                        xdp_page = alloc_page(GFP_ATOMIC);
+                        if (!xdp_page)
+                                goto err_xdp;
+
+                        memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM,
+                               page_address(page) + offset, len);
+                        frame_sz = PAGE_SIZE;
+                        offset = VIRTIO_XDP_HEADROOM;
                 } else {
                         xdp_page = page;
                 }
 
-                /* Allow consuming headroom but reserve enough space to push
-                 * the descriptor on if we get an XDP_TX return code.
-                 */
                 data = page_address(xdp_page) + offset;
-                xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq);
-                xdp_prepare_buff(&xdp, data - VIRTIO_XDP_HEADROOM + vi->hdr_len,
-                                 VIRTIO_XDP_HEADROOM, len - vi->hdr_len, true);
+                err = virtnet_build_xdp_buff_mrg(dev, vi, rq, &xdp, data, len, frame_sz,
+                                                 &num_buf, &xdp_frags_truesz, stats);
+                if (unlikely(err))
+                        goto err_xdp_frags;
 
                 act = bpf_prog_run_xdp(xdp_prog, &xdp);
                 stats->xdp_packets++;
 
                 switch (act) {
                 case XDP_PASS:
-                        metasize = xdp.data - xdp.data_meta;
-
-                        /* recalculate offset to account for any header
-                         * adjustments and minus the metasize to copy the
-                         * metadata in page_to_skb(). Note other cases do not
-                         * build an skb and avoid using offset
-                         */
-                        offset = xdp.data - page_address(xdp_page) -
-                                 vi->hdr_len - metasize;
-
-                        /* recalculate len if xdp.data, xdp.data_end or
-                         * xdp.data_meta were adjusted
-                         */
-                        len = xdp.data_end - xdp.data + vi->hdr_len + metasize;
-                        /* recalculate headroom if xdp.data or xdp_data_meta
-                         * were adjusted, note that offset should always point
-                         * to the start of the reserved bytes for virtio_net
-                         * header which are followed by xdp.data, that means
-                         * that offset is equal to the headroom (when buf is
-                         * starting at the beginning of the page, otherwise
-                         * there is a base offset inside the page) but it's used
-                         * with a different starting point (buf start) than
-                         * xdp.data (buf start + vnet hdr size). If xdp.data or
-                         * data_meta were adjusted by the xdp prog then the
-                         * headroom size has changed and so has the offset, we
-                         * can use data_hard_start, which points at buf start +
-                         * vnet hdr size, to calculate the new headroom and use
-                         * it later to compute buf start in page_to_skb()
-                         */
-                        headroom = xdp.data - xdp.data_hard_start - metasize;
-
-                        /* We can only create skb based on xdp_page. */
-                        if (unlikely(xdp_page != page)) {
-                                rcu_read_unlock();
+                        if (unlikely(xdp_page != page))
                                 put_page(page);
-                                head_skb = page_to_skb(vi, rq, xdp_page, offset,
-                                                       len, PAGE_SIZE, false,
-                                                       metasize,
-                                                       headroom);
-                                return head_skb;
-                        }
-                        break;
+                        head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
+                        rcu_read_unlock();
+                        return head_skb;
                 case XDP_TX:
                         stats->xdp_tx++;
                         xdpf = xdp_convert_buff_to_frame(&xdp);
                         if (unlikely(!xdpf)) {
-                                if (unlikely(xdp_page != page))
-                                        put_page(xdp_page);
-                                goto err_xdp;
+                                netdev_dbg(dev, "convert buff to frame failed for xdp\n");
+                                goto err_xdp_frags;
                         }
                         err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
                         if (unlikely(!err)) {
                                 xdp_return_frame_rx_napi(xdpf);
                         } else if (unlikely(err < 0)) {
                                 trace_xdp_exception(vi->dev, xdp_prog, act);
-                                if (unlikely(xdp_page != page))
-                                        put_page(xdp_page);
-                                goto err_xdp;
+                                goto err_xdp_frags;
                         }
                         *xdp_xmit |= VIRTIO_XDP_TX;
                         if (unlikely(xdp_page != page))
@@ -1098,11 +1212,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
                 case XDP_REDIRECT:
                         stats->xdp_redirects++;
                         err = xdp_do_redirect(dev, &xdp, xdp_prog);
-                        if (err) {
-                                if (unlikely(xdp_page != page))
-                                        put_page(xdp_page);
-                                goto err_xdp;
-                        }
+                        if (err)
+                                goto err_xdp_frags;
                         *xdp_xmit |= VIRTIO_XDP_REDIR;
                         if (unlikely(xdp_page != page))
                                 put_page(page);
@@ -1115,16 +1226,26 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
                         trace_xdp_exception(vi->dev, xdp_prog, act);
                         fallthrough;
                 case XDP_DROP:
-                        if (unlikely(xdp_page != page))
-                                __free_pages(xdp_page, 0);
-                        goto err_xdp;
+                        goto err_xdp_frags;
                 }
+err_xdp_frags:
+                if (unlikely(xdp_page != page))
+                        __free_pages(xdp_page, 0);
+
+                if (xdp_buff_has_frags(&xdp)) {
+                        shinfo = xdp_get_shared_info_from_buff(&xdp);
+                        for (i = 0; i < shinfo->nr_frags; i++) {
+                                xdp_page = skb_frag_page(&shinfo->frags[i]);
+                                put_page(xdp_page);
+                        }
+                }
+
+                goto err_xdp;
         }
         rcu_read_unlock();
 
 skip_xdp:
-        head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
-                               metasize, headroom);
+        head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
         curr_skb = head_skb;
 
         if (unlikely(!curr_skb))
@@ -1146,9 +1267,12 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
                 page = virt_to_head_page(buf);
 
                 truesize = mergeable_ctx_to_truesize(ctx);
-                if (unlikely(len > truesize)) {
+                headroom = mergeable_ctx_to_headroom(ctx);
+                tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
+                room = SKB_DATA_ALIGN(headroom + tailroom);
+                if (unlikely(len > truesize - room)) {
                         pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
-                                 dev->name, len, (unsigned long)ctx);
+                                 dev->name, len, (unsigned long)(truesize - room));
                         dev->stats.rx_length_errors++;
                         goto err_skb;
                 }
@@ -1426,13 +1550,16 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
                 /* To avoid internal fragmentation, if there is very likely not
                  * enough space for another buffer, add the remaining space to
                  * the current buffer.
+                 * XDP core assumes that frame_size of xdp_buff and the length
+                 * of the frag are PAGE_SIZE, so we disable the hole mechanism.
                  */
-                len += hole;
+                if (!headroom)
+                        len += hole;
                 alloc_frag->offset += hole;
         }
 
         sg_init_one(rq->sg, buf, len);
-        ctx = mergeable_len_to_ctx(len, headroom);
+        ctx = mergeable_len_to_ctx(len + room, headroom);
         err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
         if (err < 0)
                 put_page(virt_to_head_page(buf));
@@ -1608,7 +1735,7 @@ static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
                 } else {
                         struct xdp_frame *frame = ptr_to_xdp(ptr);
 
-                        bytes += frame->len;
+                        bytes += xdp_get_frame_len(frame);
                         xdp_return_frame(frame);
                 }
                 packets++;
@@ -3078,7 +3205,9 @@ static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
 static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
                            struct netlink_ext_ack *extack)
 {
-        unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
+        unsigned int room = SKB_DATA_ALIGN(VIRTIO_XDP_HEADROOM +
+                                           sizeof(struct skb_shared_info));
+        unsigned int max_sz = PAGE_SIZE - room - ETH_HLEN;
         struct virtnet_info *vi = netdev_priv(dev);
         struct bpf_prog *old_prog;
         u16 xdp_qp = 0, curr_qp;
@@ -3101,9 +3230,9 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
                 return -EINVAL;
         }
 
-        if (dev->mtu > max_sz) {
-                NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
-                netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
+        if (prog && !prog->aux->xdp_has_frags && dev->mtu > max_sz) {
+                NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP without frags");
+                netdev_warn(dev, "single-buffer XDP requires MTU less than %u\n", max_sz);
                 return -EINVAL;
         }