Commit d944c27a authored by Jakub Kicinski

Merge branch 'virtio-net-support-af_xdp-zero-copy'

Xuan Zhuo says:

====================
virtio-net: support AF_XDP zero copy

v5: http://lore.kernel.org/all/20240611114147.31320-1-xuanzhuo@linux.alibaba.com

XDP socket (AF_XDP) is an excellent kernel-bypass networking framework. Its zero-copy
feature needs support from the driver, and the performance of zero copy is very good.
mlx5 and Intel ixgbe already support this feature; this patch set adds xsk zero-copy
support to virtio-net.

At present, we have completed some preparation:

1. vq-reset (virtio spec and kernel code)
2. virtio-core premapped dma
3. virtio-net xdp refactor

So it is time for virtio-net to complete the support for XDP socket zero copy.

Virtio-net cannot increase the number of queues at will, so the xsk queue is shared
with the kernel; as the excerpt below shows, the same receive queue serves both paths.
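
Condensed from the virtnet_receive() change in the diff below, the bound pool simply
selects the receive path on the shared queue:

        /* one rq serves both paths; the bound xsk pool selects which */
        if (rq->xsk_pool)
                packets = virtnet_receive_xsk_bufs(vi, rq, budget, xdp_xmit, &stats);
        else
                packets = virtnet_receive_packets(vi, rq, budget, xdp_xmit, &stats);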

On the other hand, virtio-net does not support generating an interrupt from the driver
manually, so waking up TX transmission requires a trick: if TX NAPI last ran on a
different CPU, an IPI is used to wake up NAPI on that remote CPU; if it last ran on the
local CPU, NAPI is woken up directly (see the condensed wakeup excerpt below).
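
For the rx part, the wakeup entry point boils down to the following (condensed from
virtnet_xsk_wakeup() in the diff; TX NAPI is only scheduled when it is not already
running):

        sq = &vi->sq[qid];

        /* TX NAPI already running: just record the missed wakeup */
        if (napi_if_scheduled_mark_missed(&sq->napi))
                return 0;

        local_bh_disable();
        virtqueue_napi_schedule(&sq->napi, sq->vq);
        local_bh_enable();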

This patch set also includes some refactoring of virtio-net to support AF_XDP.

Because there are too many commits, the work of supporting AF_XDP in virtio-net is
split into an rx part and a tx part. This patch set is the rx part.

So the flag NETDEV_XDP_ACT_XSK_ZEROCOPY is not added yet; anyone who wants to test
AF_XDP rx needs to add the flag locally, as sketched below.
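
A local, test-only change could look roughly like this (hypothetical placement, e.g.
next to the existing xdp_features assignment in virtnet_probe()):

        /* local testing only: advertise AF_XDP zero-copy to the xsk core */
        dev->xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY;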

ENV: QEMU with vhost-user (polling mode).
Host CPU: Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz

testpmd> show port stats all

 ######################## NIC statistics for port 0 ########################
 RX-packets: 19531092064 RX-missed: 0     RX-bytes: 1093741155584
 RX-errors: 0
 RX-nombuf: 0
 TX-packets: 5959955552 TX-errors: 0     TX-bytes: 371030645664

 Throughput (since last show)
 Rx-pps:   8861574     Rx-bps:  3969985208
 Tx-pps:   8861493     Tx-bps:  3969962736
 ############################################################################

testpmd> show port stats all

  ######################## NIC statistics for port 0  ########################
  RX-packets: 68152727   RX-missed: 0          RX-bytes:  3816552712
  RX-errors: 0
  RX-nombuf:  0
  TX-packets: 68114967   TX-errors: 33216      TX-bytes:  3814438152

  Throughput (since last show)
  Rx-pps:      6333196          Rx-bps:   2837272088
  Tx-pps:      6333227          Tx-bps:   2837285936
  ############################################################################

But AF_XDP consumes more CPU for the TX and RX NAPI (100% and 86%).
====================

Link: https://patch.msgid.link/20240708112537.96291-1-xuanzhuo@linux.alibaba.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents e6c29506 99c861b4
......@@ -25,6 +25,7 @@
#include <net/net_failover.h>
#include <net/netdev_rx_queue.h>
#include <net/netdev_queues.h>
#include <net/xdp_sock_drv.h>
static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);
......@@ -40,9 +41,6 @@ module_param(napi_tx, bool, 0644);
#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
#define VIRTIO_XDP_HEADROOM 256
/* Separating two types of XDP xmit */
#define VIRTIO_XDP_TX BIT(0)
#define VIRTIO_XDP_REDIR BIT(1)
......@@ -351,6 +349,13 @@ struct receive_queue {
/* Record the last dma info to free after new pages is allocated. */
struct virtnet_rq_dma *last_dma;
struct xsk_buff_pool *xsk_pool;
/* xdp rxq used by xsk */
struct xdp_rxq_info xsk_rxq_info;
struct xdp_buff **xsk_buffs;
};
/* This structure can contain rss message with maximum settings for indirection table and keysize
......@@ -493,6 +498,16 @@ struct virtio_net_common_hdr {
};
static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp,
struct net_device *dev,
unsigned int *xdp_xmit,
struct virtnet_rq_stats *stats);
static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq,
struct sk_buff *skb, u8 flags);
static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
struct sk_buff *curr_skb,
struct page *page, void *buf,
int len, int truesize);
static bool is_xdp_frame(void *ptr)
{
......@@ -973,6 +988,11 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
rq = &vi->rq[i];
if (rq->xsk_pool) {
xsk_buff_free((struct xdp_buff *)buf);
return;
}
if (!vi->big_packets || vi->mergeable_rx_bufs)
virtnet_rq_unmap(rq, buf, 0);
......@@ -1051,6 +1071,329 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
}
}
static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
{
sg->dma_address = addr;
sg->length = len;
}
static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi,
struct receive_queue *rq, void *buf, u32 len)
{
struct xdp_buff *xdp;
u32 bufsize;
xdp = (struct xdp_buff *)buf;
bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool) + vi->hdr_len;
if (unlikely(len > bufsize)) {
pr_debug("%s: rx error: len %u exceeds truesize %u\n",
vi->dev->name, len, bufsize);
DEV_STATS_INC(vi->dev, rx_length_errors);
xsk_buff_free(xdp);
return NULL;
}
xsk_buff_set_size(xdp, len);
xsk_buff_dma_sync_for_cpu(xdp);
return xdp;
}
static struct sk_buff *xsk_construct_skb(struct receive_queue *rq,
struct xdp_buff *xdp)
{
unsigned int metasize = xdp->data - xdp->data_meta;
struct sk_buff *skb;
unsigned int size;
size = xdp->data_end - xdp->data_hard_start;
skb = napi_alloc_skb(&rq->napi, size);
if (unlikely(!skb)) {
xsk_buff_free(xdp);
return NULL;
}
skb_reserve(skb, xdp->data_meta - xdp->data_hard_start);
size = xdp->data_end - xdp->data_meta;
memcpy(__skb_put(skb, size), xdp->data_meta, size);
if (metasize) {
__skb_pull(skb, metasize);
skb_metadata_set(skb, metasize);
}
xsk_buff_free(xdp);
return skb;
}
static struct sk_buff *virtnet_receive_xsk_small(struct net_device *dev, struct virtnet_info *vi,
struct receive_queue *rq, struct xdp_buff *xdp,
unsigned int *xdp_xmit,
struct virtnet_rq_stats *stats)
{
struct bpf_prog *prog;
u32 ret;
ret = XDP_PASS;
rcu_read_lock();
prog = rcu_dereference(rq->xdp_prog);
if (prog)
ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats);
rcu_read_unlock();
switch (ret) {
case XDP_PASS:
return xsk_construct_skb(rq, xdp);
case XDP_TX:
case XDP_REDIRECT:
return NULL;
default:
/* drop packet */
xsk_buff_free(xdp);
u64_stats_inc(&stats->drops);
return NULL;
}
}
static void xsk_drop_follow_bufs(struct net_device *dev,
struct receive_queue *rq,
u32 num_buf,
struct virtnet_rq_stats *stats)
{
struct xdp_buff *xdp;
u32 len;
while (num_buf-- > 1) {
xdp = virtqueue_get_buf(rq->vq, &len);
if (unlikely(!xdp)) {
pr_debug("%s: rx error: %d buffers missing\n",
dev->name, num_buf);
DEV_STATS_INC(dev, rx_length_errors);
break;
}
u64_stats_add(&stats->bytes, len);
xsk_buff_free(xdp);
}
}
static int xsk_append_merge_buffer(struct virtnet_info *vi,
struct receive_queue *rq,
struct sk_buff *head_skb,
u32 num_buf,
struct virtio_net_hdr_mrg_rxbuf *hdr,
struct virtnet_rq_stats *stats)
{
struct sk_buff *curr_skb;
struct xdp_buff *xdp;
u32 len, truesize;
struct page *page;
void *buf;
curr_skb = head_skb;
while (--num_buf) {
buf = virtqueue_get_buf(rq->vq, &len);
if (unlikely(!buf)) {
pr_debug("%s: rx error: %d buffers out of %d missing\n",
vi->dev->name, num_buf,
virtio16_to_cpu(vi->vdev,
hdr->num_buffers));
DEV_STATS_INC(vi->dev, rx_length_errors);
return -EINVAL;
}
u64_stats_add(&stats->bytes, len);
xdp = buf_to_xdp(vi, rq, buf, len);
if (!xdp)
goto err;
buf = napi_alloc_frag(len);
if (!buf) {
xsk_buff_free(xdp);
goto err;
}
memcpy(buf, xdp->data - vi->hdr_len, len);
xsk_buff_free(xdp);
page = virt_to_page(buf);
truesize = len;
curr_skb = virtnet_skb_append_frag(head_skb, curr_skb, page,
buf, len, truesize);
if (!curr_skb) {
put_page(page);
goto err;
}
}
return 0;
err:
xsk_drop_follow_bufs(vi->dev, rq, num_buf, stats);
return -EINVAL;
}
static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct virtnet_info *vi,
struct receive_queue *rq, struct xdp_buff *xdp,
unsigned int *xdp_xmit,
struct virtnet_rq_stats *stats)
{
struct virtio_net_hdr_mrg_rxbuf *hdr;
struct bpf_prog *prog;
struct sk_buff *skb;
u32 ret, num_buf;
hdr = xdp->data - vi->hdr_len;
num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
ret = XDP_PASS;
rcu_read_lock();
prog = rcu_dereference(rq->xdp_prog);
/* TODO: support multi buffer. */
if (prog && num_buf == 1)
ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats);
rcu_read_unlock();
switch (ret) {
case XDP_PASS:
skb = xsk_construct_skb(rq, xdp);
if (!skb)
goto drop_bufs;
if (xsk_append_merge_buffer(vi, rq, skb, num_buf, hdr, stats)) {
dev_kfree_skb(skb);
goto drop;
}
return skb;
case XDP_TX:
case XDP_REDIRECT:
return NULL;
default:
/* drop packet */
xsk_buff_free(xdp);
}
drop_bufs:
xsk_drop_follow_bufs(dev, rq, num_buf, stats);
drop:
u64_stats_inc(&stats->drops);
return NULL;
}
static void virtnet_receive_xsk_buf(struct virtnet_info *vi, struct receive_queue *rq,
void *buf, u32 len,
unsigned int *xdp_xmit,
struct virtnet_rq_stats *stats)
{
struct net_device *dev = vi->dev;
struct sk_buff *skb = NULL;
struct xdp_buff *xdp;
u8 flags;
len -= vi->hdr_len;
u64_stats_add(&stats->bytes, len);
xdp = buf_to_xdp(vi, rq, buf, len);
if (!xdp)
return;
if (unlikely(len < ETH_HLEN)) {
pr_debug("%s: short packet %i\n", dev->name, len);
DEV_STATS_INC(dev, rx_length_errors);
xsk_buff_free(xdp);
return;
}
flags = ((struct virtio_net_common_hdr *)(xdp->data - vi->hdr_len))->hdr.flags;
if (!vi->mergeable_rx_bufs)
skb = virtnet_receive_xsk_small(dev, vi, rq, xdp, xdp_xmit, stats);
else
skb = virtnet_receive_xsk_merge(dev, vi, rq, xdp, xdp_xmit, stats);
if (skb)
virtnet_receive_done(vi, rq, skb, flags);
}
static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue *rq,
struct xsk_buff_pool *pool, gfp_t gfp)
{
struct xdp_buff **xsk_buffs;
dma_addr_t addr;
int err = 0;
u32 len, i;
int num;
xsk_buffs = rq->xsk_buffs;
num = xsk_buff_alloc_batch(pool, xsk_buffs, rq->vq->num_free);
if (!num)
return -ENOMEM;
len = xsk_pool_get_rx_frame_size(pool) + vi->hdr_len;
for (i = 0; i < num; ++i) {
/* Use the part of XDP_PACKET_HEADROOM as the virtnet hdr space.
* We assume XDP_PACKET_HEADROOM is larger than hdr->len.
* (see function virtnet_xsk_pool_enable)
*/
addr = xsk_buff_xdp_get_dma(xsk_buffs[i]) - vi->hdr_len;
sg_init_table(rq->sg, 1);
sg_fill_dma(rq->sg, addr, len);
err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, xsk_buffs[i], gfp);
if (err)
goto err;
}
return num;
err:
for (; i < num; ++i)
xsk_buff_free(xsk_buffs[i]);
return err;
}
static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
{
struct virtnet_info *vi = netdev_priv(dev);
struct send_queue *sq;
if (!netif_running(dev))
return -ENETDOWN;
if (qid >= vi->curr_queue_pairs)
return -EINVAL;
sq = &vi->sq[qid];
if (napi_if_scheduled_mark_missed(&sq->napi))
return 0;
local_bh_disable();
virtqueue_napi_schedule(&sq->napi, sq->vq);
local_bh_enable();
return 0;
}
static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
struct send_queue *sq,
struct xdp_frame *xdpf)
......@@ -1268,7 +1611,7 @@ static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp,
static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
return vi->xdp_enabled ? VIRTIO_XDP_HEADROOM : 0;
return vi->xdp_enabled ? XDP_PACKET_HEADROOM : 0;
}
/* We copy the packet for XDP in the following cases:
......@@ -1332,7 +1675,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
}
/* Headroom does not contribute to packet length */
*len = page_off - VIRTIO_XDP_HEADROOM;
*len = page_off - XDP_PACKET_HEADROOM;
return page;
err_buf:
__free_pages(page, 0);
......@@ -1619,8 +1962,8 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
void *ctx;
xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
xdp_prepare_buff(xdp, buf - VIRTIO_XDP_HEADROOM,
VIRTIO_XDP_HEADROOM + vi->hdr_len, len - vi->hdr_len, true);
xdp_prepare_buff(xdp, buf - XDP_PACKET_HEADROOM,
XDP_PACKET_HEADROOM + vi->hdr_len, len - vi->hdr_len, true);
if (!*num_buf)
return 0;
......@@ -1737,12 +2080,12 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
/* linearize data for XDP */
xdp_page = xdp_linearize_page(rq, num_buf,
*page, offset,
VIRTIO_XDP_HEADROOM,
XDP_PACKET_HEADROOM,
len);
if (!xdp_page)
return NULL;
} else {
xdp_room = SKB_DATA_ALIGN(VIRTIO_XDP_HEADROOM +
xdp_room = SKB_DATA_ALIGN(XDP_PACKET_HEADROOM +
sizeof(struct skb_shared_info));
if (*len + xdp_room > PAGE_SIZE)
return NULL;
......@@ -1751,7 +2094,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
if (!xdp_page)
return NULL;
memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM,
memcpy(page_address(xdp_page) + XDP_PACKET_HEADROOM,
page_address(*page) + offset, *len);
}
......@@ -1761,7 +2104,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
*page = xdp_page;
return page_address(*page) + VIRTIO_XDP_HEADROOM;
return page_address(*page) + XDP_PACKET_HEADROOM;
}
static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
......@@ -1824,6 +2167,49 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
return NULL;
}
static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
struct sk_buff *curr_skb,
struct page *page, void *buf,
int len, int truesize)
{
int num_skb_frags;
int offset;
num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
if (unlikely(!nskb))
return NULL;
if (curr_skb == head_skb)
skb_shinfo(curr_skb)->frag_list = nskb;
else
curr_skb->next = nskb;
curr_skb = nskb;
head_skb->truesize += nskb->truesize;
num_skb_frags = 0;
}
if (curr_skb != head_skb) {
head_skb->data_len += len;
head_skb->len += len;
head_skb->truesize += truesize;
}
offset = buf - page_address(page);
if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
put_page(page);
skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
len, truesize);
} else {
skb_add_rx_frag(curr_skb, num_skb_frags, page,
offset, len, truesize);
}
return curr_skb;
}
static struct sk_buff *receive_mergeable(struct net_device *dev,
struct virtnet_info *vi,
struct receive_queue *rq,
......@@ -1873,8 +2259,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
if (unlikely(!curr_skb))
goto err_skb;
while (--num_buf) {
int num_skb_frags;
buf = virtnet_rq_get_buf(rq, &len, &ctx);
if (unlikely(!buf)) {
pr_debug("%s: rx error: %d buffers out of %d missing\n",
......@@ -1899,34 +2283,10 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
goto err_skb;
}
num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
if (unlikely(!nskb))
curr_skb = virtnet_skb_append_frag(head_skb, curr_skb, page,
buf, len, truesize);
if (!curr_skb)
goto err_skb;
if (curr_skb == head_skb)
skb_shinfo(curr_skb)->frag_list = nskb;
else
curr_skb->next = nskb;
curr_skb = nskb;
head_skb->truesize += nskb->truesize;
num_skb_frags = 0;
}
if (curr_skb != head_skb) {
head_skb->data_len += len;
head_skb->len += len;
head_skb->truesize += truesize;
}
offset = buf - page_address(page);
if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
put_page(page);
skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
len, truesize);
} else {
skb_add_rx_frag(curr_skb, num_skb_frags, page,
offset, len, truesize);
}
}
ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
......@@ -1971,6 +2331,40 @@ static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash,
skb_set_hash(skb, __le32_to_cpu(hdr_hash->hash_value), rss_hash_type);
}
static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq,
struct sk_buff *skb, u8 flags)
{
struct virtio_net_common_hdr *hdr;
struct net_device *dev = vi->dev;
hdr = skb_vnet_common_hdr(skb);
if (dev->features & NETIF_F_RXHASH && vi->has_rss_hash_report)
virtio_skb_set_hash(&hdr->hash_v1_hdr, skb);
if (flags & VIRTIO_NET_HDR_F_DATA_VALID)
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
virtio_is_little_endian(vi->vdev))) {
net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
dev->name, hdr->hdr.gso_type,
hdr->hdr.gso_size);
goto frame_err;
}
skb_record_rx_queue(skb, vq2rxq(rq->vq));
skb->protocol = eth_type_trans(skb, dev);
pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
ntohs(skb->protocol), skb->len, skb->pkt_type);
napi_gro_receive(&rq->napi, skb);
return;
frame_err:
DEV_STATS_INC(dev, rx_frame_errors);
dev_kfree_skb(skb);
}
static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
void *buf, unsigned int len, void **ctx,
unsigned int *xdp_xmit,
......@@ -1978,7 +2372,6 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
{
struct net_device *dev = vi->dev;
struct sk_buff *skb;
struct virtio_net_common_hdr *hdr;
u8 flags;
if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
......@@ -2008,32 +2401,7 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
if (unlikely(!skb))
return;
hdr = skb_vnet_common_hdr(skb);
if (dev->features & NETIF_F_RXHASH && vi->has_rss_hash_report)
virtio_skb_set_hash(&hdr->hash_v1_hdr, skb);
if (flags & VIRTIO_NET_HDR_F_DATA_VALID)
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
virtio_is_little_endian(vi->vdev))) {
net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
dev->name, hdr->hdr.gso_type,
hdr->hdr.gso_size);
goto frame_err;
}
skb_record_rx_queue(skb, vq2rxq(rq->vq));
skb->protocol = eth_type_trans(skb, dev);
pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
ntohs(skb->protocol), skb->len, skb->pkt_type);
napi_gro_receive(&rq->napi, skb);
return;
frame_err:
DEV_STATS_INC(dev, rx_frame_errors);
dev_kfree_skb(skb);
virtnet_receive_done(vi, rq, skb, flags);
}
/* Unlike mergeable buffers, all buffers are allocated to the
......@@ -2194,7 +2562,11 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
gfp_t gfp)
{
int err;
bool oom;
if (rq->xsk_pool) {
err = virtnet_add_recvbuf_xsk(vi, rq, rq->xsk_pool, gfp);
goto kick;
}
do {
if (vi->mergeable_rx_bufs)
......@@ -2204,10 +2576,11 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
else
err = add_recvbuf_small(vi, rq, gfp);
oom = err == -ENOMEM;
if (err)
break;
} while (rq->vq->num_free);
kick:
if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
unsigned long flags;
......@@ -2216,7 +2589,7 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
u64_stats_update_end_irqrestore(&rq->stats.syncp, flags);
}
return !oom;
return err != -ENOMEM;
}
static void skb_recv_done(struct virtqueue *rvq)
......@@ -2287,32 +2660,68 @@ static void refill_work(struct work_struct *work)
}
}
static int virtnet_receive(struct receive_queue *rq, int budget,
unsigned int *xdp_xmit)
static int virtnet_receive_xsk_bufs(struct virtnet_info *vi,
struct receive_queue *rq,
int budget,
unsigned int *xdp_xmit,
struct virtnet_rq_stats *stats)
{
unsigned int len;
int packets = 0;
void *buf;
while (packets < budget) {
buf = virtqueue_get_buf(rq->vq, &len);
if (!buf)
break;
virtnet_receive_xsk_buf(vi, rq, buf, len, xdp_xmit, stats);
packets++;
}
return packets;
}
static int virtnet_receive_packets(struct virtnet_info *vi,
struct receive_queue *rq,
int budget,
unsigned int *xdp_xmit,
struct virtnet_rq_stats *stats)
{
struct virtnet_info *vi = rq->vq->vdev->priv;
struct virtnet_rq_stats stats = {};
unsigned int len;
int packets = 0;
void *buf;
int i;
if (!vi->big_packets || vi->mergeable_rx_bufs) {
void *ctx;
while (packets < budget &&
(buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
receive_buf(vi, rq, buf, len, ctx, xdp_xmit, stats);
packets++;
}
} else {
while (packets < budget &&
(buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
receive_buf(vi, rq, buf, len, NULL, xdp_xmit, stats);
packets++;
}
}
return packets;
}
static int virtnet_receive(struct receive_queue *rq, int budget,
unsigned int *xdp_xmit)
{
struct virtnet_info *vi = rq->vq->vdev->priv;
struct virtnet_rq_stats stats = {};
int i, packets;
if (rq->xsk_pool)
packets = virtnet_receive_xsk_bufs(vi, rq, budget, xdp_xmit, &stats);
else
packets = virtnet_receive_packets(vi, rq, budget, xdp_xmit, &stats);
if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
if (!try_fill_recv(vi, rq, GFP_ATOMIC)) {
spin_lock(&vi->refill_lock);
......@@ -2668,37 +3077,49 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
static int virtnet_rx_resize(struct virtnet_info *vi,
struct receive_queue *rq, u32 ring_num)
static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq)
{
bool running = netif_running(vi->dev);
int err, qindex;
qindex = rq - vi->rq;
if (running) {
napi_disable(&rq->napi);
virtnet_cancel_dim(vi, &rq->dim);
}
}
err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf);
if (err)
netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err);
static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq)
{
bool running = netif_running(vi->dev);
if (!try_fill_recv(vi, rq, GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0);
if (running)
virtnet_napi_enable(rq->vq, &rq->napi);
}
static int virtnet_rx_resize(struct virtnet_info *vi,
struct receive_queue *rq, u32 ring_num)
{
int err, qindex;
qindex = rq - vi->rq;
virtnet_rx_pause(vi, rq);
err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf);
if (err)
netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err);
virtnet_rx_resume(vi, rq);
return err;
}
static int virtnet_tx_resize(struct virtnet_info *vi,
struct send_queue *sq, u32 ring_num)
static void virtnet_tx_pause(struct virtnet_info *vi, struct send_queue *sq)
{
bool running = netif_running(vi->dev);
struct netdev_queue *txq;
int err, qindex;
int qindex;
qindex = sq - vi->sq;
......@@ -2719,10 +3140,17 @@ static int virtnet_tx_resize(struct virtnet_info *vi,
netif_stop_subqueue(vi->dev, qindex);
__netif_tx_unlock_bh(txq);
}
err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf);
if (err)
netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err);
static void virtnet_tx_resume(struct virtnet_info *vi, struct send_queue *sq)
{
bool running = netif_running(vi->dev);
struct netdev_queue *txq;
int qindex;
qindex = sq - vi->sq;
txq = netdev_get_tx_queue(vi->dev, qindex);
__netif_tx_lock_bh(txq);
sq->reset = false;
......@@ -2731,6 +3159,23 @@ static int virtnet_tx_resize(struct virtnet_info *vi,
if (running)
virtnet_napi_tx_enable(vi, sq->vq, &sq->napi);
}
static int virtnet_tx_resize(struct virtnet_info *vi, struct send_queue *sq,
u32 ring_num)
{
int qindex, err;
qindex = sq - vi->sq;
virtnet_tx_pause(vi, sq);
err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf);
if (err)
netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err);
virtnet_tx_resume(vi, sq);
return err;
}
......@@ -4968,10 +5413,144 @@ static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
return virtnet_set_guest_offloads(vi, offloads);
}
static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queue *rq,
struct xsk_buff_pool *pool)
{
int err, qindex;
qindex = rq - vi->rq;
if (pool) {
err = xdp_rxq_info_reg(&rq->xsk_rxq_info, vi->dev, qindex, rq->napi.napi_id);
if (err < 0)
return err;
err = xdp_rxq_info_reg_mem_model(&rq->xsk_rxq_info,
MEM_TYPE_XSK_BUFF_POOL, NULL);
if (err < 0)
goto unreg;
xsk_pool_set_rxq_info(pool, &rq->xsk_rxq_info);
}
virtnet_rx_pause(vi, rq);
err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf);
if (err) {
netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err);
pool = NULL;
}
rq->xsk_pool = pool;
virtnet_rx_resume(vi, rq);
if (pool)
return 0;
unreg:
xdp_rxq_info_unreg(&rq->xsk_rxq_info);
return err;
}
static int virtnet_xsk_pool_enable(struct net_device *dev,
struct xsk_buff_pool *pool,
u16 qid)
{
struct virtnet_info *vi = netdev_priv(dev);
struct receive_queue *rq;
struct device *dma_dev;
struct send_queue *sq;
int err, size;
if (vi->hdr_len > xsk_pool_get_headroom(pool))
return -EINVAL;
/* In big_packets mode, xdp cannot work, so there is no need to
* initialize xsk of rq.
*/
if (vi->big_packets && !vi->mergeable_rx_bufs)
return -ENOENT;
if (qid >= vi->curr_queue_pairs)
return -EINVAL;
sq = &vi->sq[qid];
rq = &vi->rq[qid];
/* xsk assumes that tx and rx must have the same dma device. The af-xdp
* may use one buffer to receive from the rx and reuse this buffer to
* send by the tx. So the dma dev of sq and rq must be the same one.
*
* But vq->dma_dev allows every vq has the respective dma dev. So I
* check the dma dev of vq and sq is the same dev.
*/
if (virtqueue_dma_dev(rq->vq) != virtqueue_dma_dev(sq->vq))
return -EINVAL;
dma_dev = virtqueue_dma_dev(rq->vq);
if (!dma_dev)
return -EINVAL;
size = virtqueue_get_vring_size(rq->vq);
rq->xsk_buffs = kvcalloc(size, sizeof(*rq->xsk_buffs), GFP_KERNEL);
if (!rq->xsk_buffs)
return -ENOMEM;
err = xsk_pool_dma_map(pool, dma_dev, 0);
if (err)
goto err_xsk_map;
err = virtnet_rq_bind_xsk_pool(vi, rq, pool);
if (err)
goto err_rq;
return 0;
err_rq:
xsk_pool_dma_unmap(pool, 0);
err_xsk_map:
return err;
}
static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid)
{
struct virtnet_info *vi = netdev_priv(dev);
struct xsk_buff_pool *pool;
struct receive_queue *rq;
int err;
if (qid >= vi->curr_queue_pairs)
return -EINVAL;
rq = &vi->rq[qid];
pool = rq->xsk_pool;
err = virtnet_rq_bind_xsk_pool(vi, rq, NULL);
xsk_pool_dma_unmap(pool, 0);
kvfree(rq->xsk_buffs);
return err;
}
static int virtnet_xsk_pool_setup(struct net_device *dev, struct netdev_bpf *xdp)
{
if (xdp->xsk.pool)
return virtnet_xsk_pool_enable(dev, xdp->xsk.pool,
xdp->xsk.queue_id);
else
return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id);
}
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
struct netlink_ext_ack *extack)
{
unsigned int room = SKB_DATA_ALIGN(VIRTIO_XDP_HEADROOM +
unsigned int room = SKB_DATA_ALIGN(XDP_PACKET_HEADROOM +
sizeof(struct skb_shared_info));
unsigned int max_sz = PAGE_SIZE - room - ETH_HLEN;
struct virtnet_info *vi = netdev_priv(dev);
......@@ -5093,6 +5672,8 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
switch (xdp->command) {
case XDP_SETUP_PROG:
return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
case XDP_SETUP_XSK_POOL:
return virtnet_xsk_pool_setup(dev, xdp);
default:
return -EINVAL;
}
......@@ -5207,6 +5788,7 @@ static const struct net_device_ops virtnet_netdev = {
.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
.ndo_bpf = virtnet_xdp,
.ndo_xdp_xmit = virtnet_xdp_xmit,
.ndo_xsk_wakeup = virtnet_xsk_wakeup,
.ndo_features_check = passthru_features_check,
.ndo_get_phys_port_name = virtnet_get_phys_port_name,
.ndo_set_features = virtnet_set_features,
......