Commit e8b18af8 authored by David S. Miller

Merge branch 'XDP-transmission-for-tuntap'

Jason Wang says:

====================
XDP transmission for tuntap

This series tries to implement XDP transmission (ndo_xdp_xmit) for
tuntap. A pointer ring is used to queue both XDP buffers and
sk_buffs; the type is encoded in the lowest bit of the pointer, and
the XDP metadata is stored in the headroom of the XDP buffer (see the
sketch below the commit metadata).

Tests get 3.05 Mpps when doing xdp_redirect_map from ixgbe to a VM
(testpmd + virtio-net in the guest). This is a ~20% improvement
compared to using skbs during the redirect.

Please review.

Changes from V1:

- silence warnings
- fix typos
- add skb-mode numbers to the commit log
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents a0ce0931 fc72d1d5
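
The tagging scheme described in the cover letter amounts to stealing the lowest bit of each pointer-ring entry. Below is a minimal sketch of that idea, pieced together from the TUN_XDP_FLAG definition and the helper declarations this series adds to include/linux/if_tun.h; the real helper bodies live in the tun.c part of the series (whose diff is collapsed below), so treat this as illustrative rather than the exact implementation.

    /* Ring entries are either struct sk_buff * or tagged struct xdp_buff *.
     * Both are at least word-aligned, so the lowest pointer bit is free
     * to carry the type tag. */
    #define TUN_XDP_FLAG 0x1UL

    /* Tag an xdp_buff pointer before producing it into the ptr_ring. */
    void *tun_xdp_to_ptr(void *ptr)
    {
        return (void *)((unsigned long)ptr | TUN_XDP_FLAG);
    }

    /* Does a consumed entry carry an xdp_buff (rather than an sk_buff)? */
    bool tun_is_xdp_buff(void *ptr)
    {
        return (unsigned long)ptr & TUN_XDP_FLAG;
    }

    /* Strip the tag to recover the xdp_buff pointer. */
    void *tun_ptr_to_xdp(void *ptr)
    {
        return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
    }

Consumers check tun_is_xdp_buff() on each entry pulled from the ring; the vhost_net_buf_peek_len() helper added in the vhost diff below does exactly this to report the correct packet length for either entry type, while the per-packet metadata for the XDP case is carried in the buffer's headroom as the cover letter notes.
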
drivers/net/tap.c
@@ -330,7 +330,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 if (!q)
 return RX_HANDLER_PASS;
-if (__skb_array_full(&q->skb_array))
+if (__ptr_ring_full(&q->ring))
 goto drop;
 skb_push(skb, ETH_HLEN);
@@ -348,7 +348,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 goto drop;
 if (!segs) {
-if (skb_array_produce(&q->skb_array, skb))
+if (ptr_ring_produce(&q->ring, skb))
 goto drop;
 goto wake_up;
 }
@@ -358,7 +358,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 struct sk_buff *nskb = segs->next;
 segs->next = NULL;
-if (skb_array_produce(&q->skb_array, segs)) {
+if (ptr_ring_produce(&q->ring, segs)) {
 kfree_skb(segs);
 kfree_skb_list(nskb);
 break;
@@ -375,7 +375,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 !(features & NETIF_F_CSUM_MASK) &&
 skb_checksum_help(skb))
 goto drop;
-if (skb_array_produce(&q->skb_array, skb))
+if (ptr_ring_produce(&q->ring, skb))
 goto drop;
 }
@@ -497,7 +497,7 @@ static void tap_sock_destruct(struct sock *sk)
 {
 struct tap_queue *q = container_of(sk, struct tap_queue, sk);
-skb_array_cleanup(&q->skb_array);
+ptr_ring_cleanup(&q->ring, __skb_array_destroy_skb);
 }
 static int tap_open(struct inode *inode, struct file *file)
@@ -517,7 +517,7 @@ static int tap_open(struct inode *inode, struct file *file)
 &tap_proto, 0);
 if (!q)
 goto err;
-if (skb_array_init(&q->skb_array, tap->dev->tx_queue_len, GFP_KERNEL)) {
+if (ptr_ring_init(&q->ring, tap->dev->tx_queue_len, GFP_KERNEL)) {
 sk_free(&q->sk);
 goto err;
 }
@@ -546,7 +546,7 @@ static int tap_open(struct inode *inode, struct file *file)
 err = tap_set_queue(tap, file, q);
 if (err) {
-/* tap_sock_destruct() will take care of freeing skb_array */
+/* tap_sock_destruct() will take care of freeing ptr_ring */
 goto err_put;
 }
@@ -583,7 +583,7 @@ static unsigned int tap_poll(struct file *file, poll_table *wait)
 mask = 0;
 poll_wait(file, &q->wq.wait, wait);
-if (!skb_array_empty(&q->skb_array))
+if (!ptr_ring_empty(&q->ring))
 mask |= POLLIN | POLLRDNORM;
 if (sock_writeable(&q->sk) ||
@@ -844,7 +844,7 @@ static ssize_t tap_do_read(struct tap_queue *q,
 TASK_INTERRUPTIBLE);
 /* Read frames from the queue */
-skb = skb_array_consume(&q->skb_array);
+skb = ptr_ring_consume(&q->ring);
 if (skb)
 break;
 if (noblock) {
@@ -1176,7 +1176,7 @@ static int tap_peek_len(struct socket *sock)
 {
 struct tap_queue *q = container_of(sock, struct tap_queue,
 sock);
-return skb_array_peek_len(&q->skb_array);
+return PTR_RING_PEEK_CALL(&q->ring, __skb_array_len_with_tag);
 }
 /* Ops structure to mimic raw sockets with tun */
@@ -1202,7 +1202,7 @@ struct socket *tap_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tap_get_socket);
-struct skb_array *tap_get_skb_array(struct file *file)
+struct ptr_ring *tap_get_ptr_ring(struct file *file)
 {
 struct tap_queue *q;
@@ -1211,29 +1211,30 @@ struct skb_array *tap_get_skb_array(struct file *file)
 q = file->private_data;
 if (!q)
 return ERR_PTR(-EBADFD);
-return &q->skb_array;
+return &q->ring;
 }
-EXPORT_SYMBOL_GPL(tap_get_skb_array);
+EXPORT_SYMBOL_GPL(tap_get_ptr_ring);
 int tap_queue_resize(struct tap_dev *tap)
 {
 struct net_device *dev = tap->dev;
 struct tap_queue *q;
-struct skb_array **arrays;
+struct ptr_ring **rings;
 int n = tap->numqueues;
 int ret, i = 0;
-arrays = kmalloc_array(n, sizeof(*arrays), GFP_KERNEL);
-if (!arrays)
+rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL);
+if (!rings)
 return -ENOMEM;
 list_for_each_entry(q, &tap->queue_list, next)
-arrays[i++] = &q->skb_array;
+rings[i++] = &q->ring;
-ret = skb_array_resize_multiple(arrays, n,
-dev->tx_queue_len, GFP_KERNEL);
+ret = ptr_ring_resize_multiple(rings, n,
+dev->tx_queue_len, GFP_KERNEL,
+__skb_array_destroy_skb);
-kfree(arrays);
+kfree(rings);
 return ret;
 }
 EXPORT_SYMBOL_GPL(tap_queue_resize);
This diff is collapsed (one file's changes are not shown here).
drivers/vhost/net.c
@@ -89,7 +89,7 @@ struct vhost_net_ubuf_ref {
 #define VHOST_RX_BATCH 64
 struct vhost_net_buf {
-struct sk_buff **queue;
+void **queue;
 int tail;
 int head;
 };
@@ -108,7 +108,7 @@ struct vhost_net_virtqueue {
 /* Reference counting for outstanding ubufs.
 * Protected by vq mutex. Writers must also take device mutex. */
 struct vhost_net_ubuf_ref *ubufs;
-struct skb_array *rx_array;
+struct ptr_ring *rx_ring;
 struct vhost_net_buf rxq;
 };
@@ -158,7 +158,7 @@ static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
 struct vhost_net_buf *rxq = &nvq->rxq;
 rxq->head = 0;
-rxq->tail = skb_array_consume_batched(nvq->rx_array, rxq->queue,
+rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
 VHOST_RX_BATCH);
 return rxq->tail;
 }
@@ -167,13 +167,25 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
 {
 struct vhost_net_buf *rxq = &nvq->rxq;
-if (nvq->rx_array && !vhost_net_buf_is_empty(rxq)) {
-skb_array_unconsume(nvq->rx_array, rxq->queue + rxq->head,
-vhost_net_buf_get_size(rxq));
+if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) {
+ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head,
+vhost_net_buf_get_size(rxq),
+__skb_array_destroy_skb);
 rxq->head = rxq->tail = 0;
 }
 }
+static int vhost_net_buf_peek_len(void *ptr)
+{
+if (tun_is_xdp_buff(ptr)) {
+struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+return xdp->data_end - xdp->data;
+}
+return __skb_array_len_with_tag(ptr);
+}
 static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
 {
 struct vhost_net_buf *rxq = &nvq->rxq;
@@ -185,7 +197,7 @@ static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
 return 0;
 out:
-return __skb_array_len_with_tag(vhost_net_buf_get_ptr(rxq));
+return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq));
 }
 static void vhost_net_buf_init(struct vhost_net_buf *rxq)
@@ -583,7 +595,7 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 int len = 0;
 unsigned long flags;
-if (rvq->rx_array)
+if (rvq->rx_ring)
 return vhost_net_buf_peek(rvq);
 spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
@@ -790,7 +802,7 @@ static void handle_rx(struct vhost_net *net)
 * they refilled. */
 goto out;
 }
-if (nvq->rx_array)
+if (nvq->rx_ring)
 msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
 /* On overrun, truncate and discard */
 if (unlikely(headcount > UIO_MAXIOV)) {
@@ -896,7 +908,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 struct vhost_net *n;
 struct vhost_dev *dev;
 struct vhost_virtqueue **vqs;
-struct sk_buff **queue;
+void **queue;
 int i;
 n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
@@ -908,7 +920,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 return -ENOMEM;
 }
-queue = kmalloc_array(VHOST_RX_BATCH, sizeof(struct sk_buff *),
+queue = kmalloc_array(VHOST_RX_BATCH, sizeof(void *),
 GFP_KERNEL);
 if (!queue) {
 kfree(vqs);
@@ -1046,23 +1058,23 @@ static struct socket *get_raw_socket(int fd)
 return ERR_PTR(r);
 }
-static struct skb_array *get_tap_skb_array(int fd)
+static struct ptr_ring *get_tap_ptr_ring(int fd)
 {
-struct skb_array *array;
+struct ptr_ring *ring;
 struct file *file = fget(fd);
 if (!file)
 return NULL;
-array = tun_get_skb_array(file);
-if (!IS_ERR(array))
+ring = tun_get_tx_ring(file);
+if (!IS_ERR(ring))
 goto out;
-array = tap_get_skb_array(file);
-if (!IS_ERR(array))
+ring = tap_get_ptr_ring(file);
+if (!IS_ERR(ring))
 goto out;
-array = NULL;
+ring = NULL;
 out:
 fput(file);
-return array;
+return ring;
 }
 static struct socket *get_tap_socket(int fd)
@@ -1143,7 +1155,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 vq->private_data = sock;
 vhost_net_buf_unproduce(nvq);
 if (index == VHOST_NET_VQ_RX)
-nvq->rx_array = get_tap_skb_array(fd);
+nvq->rx_ring = get_tap_ptr_ring(fd);
 r = vhost_vq_init_access(vq);
 if (r)
 goto err_used;
include/linux/if_tap.h
@@ -4,7 +4,7 @@
 #if IS_ENABLED(CONFIG_TAP)
 struct socket *tap_get_socket(struct file *);
-struct skb_array *tap_get_skb_array(struct file *file);
+struct ptr_ring *tap_get_ptr_ring(struct file *file);
 #else
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -14,7 +14,7 @@ static inline struct socket *tap_get_socket(struct file *f)
 {
 return ERR_PTR(-EINVAL);
 }
-static inline struct skb_array *tap_get_skb_array(struct file *f)
+static inline struct ptr_ring *tap_get_ptr_ring(struct file *f)
 {
 return ERR_PTR(-EINVAL);
 }
@@ -70,7 +70,7 @@ struct tap_queue {
 u16 queue_index;
 bool enabled;
 struct list_head next;
-struct skb_array skb_array;
+struct ptr_ring ring;
 };
 rx_handler_result_t tap_handle_frame(struct sk_buff **pskb);
include/linux/if_tun.h
@@ -17,9 +17,14 @@
 #include <uapi/linux/if_tun.h>
+#define TUN_XDP_FLAG 0x1UL
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
-struct skb_array *tun_get_skb_array(struct file *file);
+struct ptr_ring *tun_get_tx_ring(struct file *file);
+bool tun_is_xdp_buff(void *ptr);
+void *tun_xdp_to_ptr(void *ptr);
+void *tun_ptr_to_xdp(void *ptr);
 #else
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -29,9 +34,21 @@ static inline struct socket *tun_get_socket(struct file *f)
 {
 return ERR_PTR(-EINVAL);
 }
-static inline struct skb_array *tun_get_skb_array(struct file *f)
+static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
 {
 return ERR_PTR(-EINVAL);
 }
+static inline bool tun_is_xdp_buff(void *ptr)
+{
+return false;
+}
+static inline void *tun_xdp_to_ptr(void *ptr)
+{
+return NULL;
+}
+static inline void *tun_ptr_to_xdp(void *ptr)
+{
+return NULL;
+}
 #endif /* CONFIG_TUN */
 #endif /* __IF_TUN_H */