Commit beb528d0 authored by David S. Miller

Merge branch 'tun-skb_array'

Jason Wang says:

====================
switch to use tx skb array in tun

This series tries to switch to use skb array in tun. This is used to
eliminate the spinlock contention between producer and consumer. The
conversion was straightforward: just introduce a tx skb array and use
it instead of sk_receive_queue.

A minor issue is to keep the tx_queue_len behaviour, since tun used to
use it for the length of sk_receive_queue. This is done through:

- add the ability to resize multiple rings at once to avoid handling
  partial resize failure for multiple rings.
- add the support for zero length ring.
- introduce a notifier which is triggered when tx_queue_len is
  changed for a netdev.
- resize all queues when tx_queue_len changes.

Tests show about 15% improvement on guest rx pps:

Before: ~1300000pps
After : ~1500000pps

Changes from V3:
- fix kbuild warnings
- call NETDEV_CHANGE_TX_QUEUE_LEN on IFLA_TXQLEN

Changes from V2:
- add multiple rings resizing support for ptr_ring/skb_array
- add zero length ring support
- introduce a NETDEV_CHANGE_TX_QUEUE_LEN
- drop new flags

Changes from V1:
- switch to use skb array instead of a customized circular buffer
- add non-blocking support
- rename .peek to .peek_len
- drop lockless peeking since tests show very minor improvement
====================
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-from-altitude: 34697 feet.
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 8dc7243a 1576d986
...@@ -71,6 +71,7 @@ ...@@ -71,6 +71,7 @@
#include <net/sock.h> #include <net/sock.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/uio.h> #include <linux/uio.h>
#include <linux/skb_array.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
...@@ -167,6 +168,7 @@ struct tun_file { ...@@ -167,6 +168,7 @@ struct tun_file {
}; };
struct list_head next; struct list_head next;
struct tun_struct *detached; struct tun_struct *detached;
struct skb_array tx_array;
}; };
struct tun_flow_entry { struct tun_flow_entry {
...@@ -515,7 +517,11 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile) ...@@ -515,7 +517,11 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
static void tun_queue_purge(struct tun_file *tfile) static void tun_queue_purge(struct tun_file *tfile)
{ {
skb_queue_purge(&tfile->sk.sk_receive_queue); struct sk_buff *skb;
while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
kfree_skb(skb);
skb_queue_purge(&tfile->sk.sk_error_queue); skb_queue_purge(&tfile->sk.sk_error_queue);
} }
...@@ -560,6 +566,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean) ...@@ -560,6 +566,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
tun->dev->reg_state == NETREG_REGISTERED) tun->dev->reg_state == NETREG_REGISTERED)
unregister_netdevice(tun->dev); unregister_netdevice(tun->dev);
} }
if (tun)
skb_array_cleanup(&tfile->tx_array);
sock_put(&tfile->sk); sock_put(&tfile->sk);
} }
} }
...@@ -613,6 +621,7 @@ static void tun_detach_all(struct net_device *dev) ...@@ -613,6 +621,7 @@ static void tun_detach_all(struct net_device *dev)
static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter) static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
{ {
struct tun_file *tfile = file->private_data; struct tun_file *tfile = file->private_data;
struct net_device *dev = tun->dev;
int err; int err;
err = security_tun_dev_attach(tfile->socket.sk, tun->security); err = security_tun_dev_attach(tfile->socket.sk, tun->security);
...@@ -642,6 +651,13 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte ...@@ -642,6 +651,13 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
if (!err) if (!err)
goto out; goto out;
} }
if (!tfile->detached &&
skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
err = -ENOMEM;
goto out;
}
tfile->queue_index = tun->numqueues; tfile->queue_index = tun->numqueues;
tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN; tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
rcu_assign_pointer(tfile->tun, tun); rcu_assign_pointer(tfile->tun, tun);
...@@ -891,8 +907,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -891,8 +907,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
nf_reset(skb); nf_reset(skb);
/* Enqueue packet */ if (skb_array_produce(&tfile->tx_array, skb))
skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb); goto drop;
/* Notify and wake up reader process */ /* Notify and wake up reader process */
if (tfile->flags & TUN_FASYNC) if (tfile->flags & TUN_FASYNC)
...@@ -1107,7 +1123,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait) ...@@ -1107,7 +1123,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
poll_wait(file, sk_sleep(sk), wait); poll_wait(file, sk_sleep(sk), wait);
if (!skb_queue_empty(&sk->sk_receive_queue)) if (!skb_array_empty(&tfile->tx_array))
mask |= POLLIN | POLLRDNORM; mask |= POLLIN | POLLRDNORM;
if (sock_writeable(sk) || if (sock_writeable(sk) ||
...@@ -1426,22 +1442,61 @@ static ssize_t tun_put_user(struct tun_struct *tun, ...@@ -1426,22 +1442,61 @@ static ssize_t tun_put_user(struct tun_struct *tun,
return total; return total;
} }
/*
 * Take the next skb off this queue's tx ring, sleeping for one if allowed.
 *
 * @tfile:   queue whose tx_array is consumed
 * @noblock: non-zero to return immediately when the ring is empty
 * @err:     written only when NULL is returned: -EAGAIN (ring empty and
 *           @noblock set), -ERESTARTSYS (signal pending) or -EFAULT
 *           (RCV_SHUTDOWN set on the socket)
 *
 * Returns the consumed skb, or NULL with *@err set.
 */
static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
				     int *err)
{
	DECLARE_WAITQUEUE(wait, current);
	struct sk_buff *skb = NULL;

	/* Fast path: something is already queued. */
	skb = skb_array_consume(&tfile->tx_array);
	if (skb)
		goto out;
	if (noblock) {
		*err = -EAGAIN;
		goto out;
	}

	add_wait_queue(&tfile->wq.wait, &wait);

	while (1) {
		/*
		 * Re-arm the sleeping state on every pass: a wakeup leaves
		 * the task TASK_RUNNING, so setting it only once before the
		 * loop would turn every later schedule() into a busy spin.
		 * It is set before the ring is checked so that a producer
		 * waking us between the check and schedule() is not lost.
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		skb = skb_array_consume(&tfile->tx_array);
		if (skb)
			break;
		if (signal_pending(current)) {
			*err = -ERESTARTSYS;
			break;
		}
		if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
			*err = -EFAULT;
			break;
		}

		schedule();
	}

	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&tfile->wq.wait, &wait);

out:
	return skb;
}
static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
struct iov_iter *to, struct iov_iter *to,
int noblock) int noblock)
{ {
struct sk_buff *skb; struct sk_buff *skb;
ssize_t ret; ssize_t ret;
int peeked, err, off = 0; int err;
tun_debug(KERN_INFO, tun, "tun_do_read\n"); tun_debug(KERN_INFO, tun, "tun_do_read\n");
if (!iov_iter_count(to)) if (!iov_iter_count(to))
return 0; return 0;
/* Read frames from queue */ /* Read frames from ring */
skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0, skb = tun_ring_recv(tfile, noblock, &err);
&peeked, &off, &err);
if (!skb) if (!skb)
return err; return err;
...@@ -1574,8 +1629,25 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, ...@@ -1574,8 +1629,25 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
return ret; return ret;
} }
/*
 * proto_ops .peek_len hook: report skb_array_peek_len() for the tx ring of
 * the tun queue backing @sock. Returns 0 when __tun_get() finds no device
 * for the queue.
 */
static int tun_peek_len(struct socket *sock)
{
	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
	struct tun_struct *tun = __tun_get(tfile);
	int len = 0;

	if (tun) {
		/* Peek while holding the reference taken by __tun_get(). */
		len = skb_array_peek_len(&tfile->tx_array);
		tun_put(tun);
	}

	return len;
}
/* Ops structure to mimic raw sockets with tun */ /* Ops structure to mimic raw sockets with tun */
static const struct proto_ops tun_socket_ops = { static const struct proto_ops tun_socket_ops = {
.peek_len = tun_peek_len,
.sendmsg = tun_sendmsg, .sendmsg = tun_sendmsg,
.recvmsg = tun_recvmsg, .recvmsg = tun_recvmsg,
}; };
...@@ -2397,6 +2469,53 @@ static const struct ethtool_ops tun_ethtool_ops = { ...@@ -2397,6 +2469,53 @@ static const struct ethtool_ops tun_ethtool_ops = {
.get_ts_info = ethtool_op_get_ts_info, .get_ts_info = ethtool_op_get_ts_info,
}; };
static int tun_queue_resize(struct tun_struct *tun)
{
struct net_device *dev = tun->dev;
struct tun_file *tfile;
struct skb_array **arrays;
int n = tun->numqueues + tun->numdisabled;
int ret, i;
arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL);
if (!arrays)
return -ENOMEM;
for (i = 0; i < tun->numqueues; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
arrays[i] = &tfile->tx_array;
}
list_for_each_entry(tfile, &tun->disabled, next)
arrays[i++] = &tfile->tx_array;
ret = skb_array_resize_multiple(arrays, n,
dev->tx_queue_len, GFP_KERNEL);
kfree(arrays);
return ret;
}
static int tun_device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct tun_struct *tun = netdev_priv(dev);
switch (event) {
case NETDEV_CHANGE_TX_QUEUE_LEN:
if (tun_queue_resize(tun))
return NOTIFY_BAD;
break;
default:
break;
}
return NOTIFY_DONE;
}
/* Notifier block that routes netdevice events to tun_device_event(). */
static struct notifier_block tun_notifier_block __read_mostly = {
.notifier_call = tun_device_event,
};
static int __init tun_init(void) static int __init tun_init(void)
{ {
...@@ -2416,6 +2535,8 @@ static int __init tun_init(void) ...@@ -2416,6 +2535,8 @@ static int __init tun_init(void)
pr_err("Can't register misc device %d\n", TUN_MINOR); pr_err("Can't register misc device %d\n", TUN_MINOR);
goto err_misc; goto err_misc;
} }
register_netdevice_notifier(&tun_notifier_block);
return 0; return 0;
err_misc: err_misc:
rtnl_link_unregister(&tun_link_ops); rtnl_link_unregister(&tun_link_ops);
...@@ -2427,6 +2548,7 @@ static void tun_cleanup(void) ...@@ -2427,6 +2548,7 @@ static void tun_cleanup(void)
{ {
misc_deregister(&tun_miscdev); misc_deregister(&tun_miscdev);
rtnl_link_unregister(&tun_link_ops); rtnl_link_unregister(&tun_link_ops);
unregister_netdevice_notifier(&tun_notifier_block);
} }
/* Get an underlying socket object from tun file. Returns error unless file is /* Get an underlying socket object from tun file. Returns error unless file is
......
...@@ -481,10 +481,14 @@ static void handle_tx(struct vhost_net *net) ...@@ -481,10 +481,14 @@ static void handle_tx(struct vhost_net *net)
static int peek_head_len(struct sock *sk) static int peek_head_len(struct sock *sk)
{ {
struct socket *sock = sk->sk_socket;
struct sk_buff *head; struct sk_buff *head;
int len = 0; int len = 0;
unsigned long flags; unsigned long flags;
if (sock->ops->peek_len)
return sock->ops->peek_len(sock);
spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
head = skb_peek(&sk->sk_receive_queue); head = skb_peek(&sk->sk_receive_queue);
if (likely(head)) { if (likely(head)) {
...@@ -497,6 +501,16 @@ static int peek_head_len(struct sock *sk) ...@@ -497,6 +501,16 @@ static int peek_head_len(struct sock *sk)
return len; return len;
} }
/*
 * Tell whether @sk has receive data pending.
 *
 * Sockets that provide a ->peek_len() op are asked through it; for all
 * others the sk_receive_queue is inspected. Returns non-zero when data is
 * available and 0 when there is none.
 */
static int sk_has_rx_data(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;

	if (sock->ops->peek_len)
		return sock->ops->peek_len(sock);

	/* "has data" is the negation of "queue is empty". */
	return !skb_queue_empty(&sk->sk_receive_queue);
}
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk) static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
{ {
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
...@@ -513,7 +527,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk) ...@@ -513,7 +527,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
endtime = busy_clock() + vq->busyloop_timeout; endtime = busy_clock() + vq->busyloop_timeout;
while (vhost_can_busy_poll(&net->dev, endtime) && while (vhost_can_busy_poll(&net->dev, endtime) &&
skb_queue_empty(&sk->sk_receive_queue) && !sk_has_rx_data(sk) &&
vhost_vq_avail_empty(&net->dev, vq)) vhost_vq_avail_empty(&net->dev, vq))
cpu_relax_lowlatency(); cpu_relax_lowlatency();
......
...@@ -185,6 +185,7 @@ struct proto_ops { ...@@ -185,6 +185,7 @@ struct proto_ops {
ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, ssize_t (*splice_read)(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len, unsigned int flags); struct pipe_inode_info *pipe, size_t len, unsigned int flags);
int (*set_peek_off)(struct sock *sk, int val); int (*set_peek_off)(struct sock *sk, int val);
int (*peek_len)(struct socket *sock);
}; };
#define DECLARE_SOCKADDR(type, dst, src) \ #define DECLARE_SOCKADDR(type, dst, src) \
......
...@@ -2237,6 +2237,7 @@ struct netdev_lag_lower_state_info { ...@@ -2237,6 +2237,7 @@ struct netdev_lag_lower_state_info {
#define NETDEV_PRECHANGEUPPER 0x001A #define NETDEV_PRECHANGEUPPER 0x001A
#define NETDEV_CHANGELOWERSTATE 0x001B #define NETDEV_CHANGELOWERSTATE 0x001B
#define NETDEV_UDP_TUNNEL_PUSH_INFO 0x001C #define NETDEV_UDP_TUNNEL_PUSH_INFO 0x001C
#define NETDEV_CHANGE_TX_QUEUE_LEN 0x001E
int register_netdevice_notifier(struct notifier_block *nb); int register_netdevice_notifier(struct notifier_block *nb);
int unregister_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb);
......
...@@ -102,7 +102,7 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r) ...@@ -102,7 +102,7 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r)
*/ */
static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
{ {
if (r->queue[r->producer]) if (unlikely(!r->size) || r->queue[r->producer])
return -ENOSPC; return -ENOSPC;
r->queue[r->producer++] = ptr; r->queue[r->producer++] = ptr;
...@@ -164,7 +164,9 @@ static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr) ...@@ -164,7 +164,9 @@ static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr)
*/ */
static inline void *__ptr_ring_peek(struct ptr_ring *r) static inline void *__ptr_ring_peek(struct ptr_ring *r)
{ {
return r->queue[r->consumer]; if (likely(r->size))
return r->queue[r->consumer];
return NULL;
} }
/* Note: callers invoking this in a loop must use a compiler barrier, /* Note: callers invoking this in a loop must use a compiler barrier,
...@@ -347,20 +349,14 @@ static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp) ...@@ -347,20 +349,14 @@ static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp)
return 0; return 0;
} }
static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp, static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
void (*destroy)(void *)) int size, gfp_t gfp,
void (*destroy)(void *))
{ {
unsigned long flags;
int producer = 0; int producer = 0;
void **queue = __ptr_ring_init_queue_alloc(size, gfp);
void **old; void **old;
void *ptr; void *ptr;
if (!queue)
return -ENOMEM;
spin_lock_irqsave(&(r)->producer_lock, flags);
while ((ptr = ptr_ring_consume(r))) while ((ptr = ptr_ring_consume(r)))
if (producer < size) if (producer < size)
queue[producer++] = ptr; queue[producer++] = ptr;
...@@ -373,6 +369,23 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp, ...@@ -373,6 +369,23 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
old = r->queue; old = r->queue;
r->queue = queue; r->queue = queue;
return old;
}
static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
void (*destroy)(void *))
{
unsigned long flags;
void **queue = __ptr_ring_init_queue_alloc(size, gfp);
void **old;
if (!queue)
return -ENOMEM;
spin_lock_irqsave(&(r)->producer_lock, flags);
old = __ptr_ring_swap_queue(r, queue, size, gfp, destroy);
spin_unlock_irqrestore(&(r)->producer_lock, flags); spin_unlock_irqrestore(&(r)->producer_lock, flags);
kfree(old); kfree(old);
...@@ -380,6 +393,48 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp, ...@@ -380,6 +393,48 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
return 0; return 0;
} }
/*
 * Resize @nrings rings to @size entries each.
 *
 * All replacement queues are allocated up front, so an allocation failure
 * returns -ENOMEM before any ring has been modified. Each ring is then
 * swapped to its new queue under its producer_lock via
 * __ptr_ring_swap_queue(), and the old queues are freed afterwards.
 *
 * Returns 0 on success or -ENOMEM.
 */
static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings,
					   int size,
					   gfp_t gfp, void (*destroy)(void *))
{
	unsigned long flags;
	void ***queues;
	int idx;

	queues = kmalloc(nrings * sizeof *queues, gfp);
	if (!queues)
		return -ENOMEM;

	for (idx = 0; idx < nrings; idx++) {
		queues[idx] = __ptr_ring_init_queue_alloc(size, gfp);
		if (!queues[idx]) {
			/* Release the queues allocated so far, newest first. */
			while (idx-- > 0)
				kfree(queues[idx]);
			kfree(queues);
			return -ENOMEM;
		}
	}

	for (idx = 0; idx < nrings; idx++) {
		spin_lock_irqsave(&(rings[idx])->producer_lock, flags);
		/* The slot now holds the ring's previous queue. */
		queues[idx] = __ptr_ring_swap_queue(rings[idx], queues[idx],
						    size, gfp, destroy);
		spin_unlock_irqrestore(&(rings[idx])->producer_lock, flags);
	}

	for (idx = 0; idx < nrings; idx++)
		kfree(queues[idx]);
	kfree(queues);

	return 0;
}
static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *)) static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *))
{ {
void *ptr; void *ptr;
......
...@@ -151,16 +151,25 @@ static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp) ...@@ -151,16 +151,25 @@ static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp)
return ptr_ring_init(&a->ring, size, gfp); return ptr_ring_init(&a->ring, size, gfp);
} }
void __skb_array_destroy_skb(void *ptr) static void __skb_array_destroy_skb(void *ptr)
{ {
kfree_skb(ptr); kfree_skb(ptr);
} }
int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
{ {
return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb); return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb);
} }
/*
 * Resize @nrings skb arrays to @size entries each by forwarding to
 * ptr_ring_resize_multiple(), with __skb_array_destroy_skb as the destroy
 * callback. Returns that function's result.
 */
static inline int skb_array_resize_multiple(struct skb_array **rings,
					    int nrings, int size, gfp_t gfp)
{
	struct ptr_ring **ptr_rings;

	/* The cast below is only valid while 'ring' is the first member. */
	BUILD_BUG_ON(offsetof(struct skb_array, ring));

	ptr_rings = (struct ptr_ring **)rings;
	return ptr_ring_resize_multiple(ptr_rings, nrings, size, gfp,
					__skb_array_destroy_skb);
}
static inline void skb_array_cleanup(struct skb_array *a) static inline void skb_array_cleanup(struct skb_array *a)
{ {
ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb); ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb);
......
...@@ -322,7 +322,20 @@ NETDEVICE_SHOW_RW(flags, fmt_hex); ...@@ -322,7 +322,20 @@ NETDEVICE_SHOW_RW(flags, fmt_hex);
static int change_tx_queue_len(struct net_device *dev, unsigned long new_len) static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{ {
dev->tx_queue_len = new_len; int res, orig_len = dev->tx_queue_len;
if (new_len != orig_len) {
dev->tx_queue_len = new_len;
res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
res = notifier_to_errno(res);
if (res) {
netdev_err(dev,
"refused to change device tx_queue_len\n");
dev->tx_queue_len = orig_len;
return -EFAULT;
}
}
return 0; return 0;
} }
......
...@@ -1927,11 +1927,19 @@ static int do_setlink(const struct sk_buff *skb, ...@@ -1927,11 +1927,19 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_TXQLEN]) { if (tb[IFLA_TXQLEN]) {
unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]); unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]);
unsigned long orig_len = dev->tx_queue_len;
if (dev->tx_queue_len ^ value)
if (dev->tx_queue_len ^ value) {
dev->tx_queue_len = value;
err = call_netdevice_notifiers(
NETDEV_CHANGE_TX_QUEUE_LEN, dev);
err = notifier_to_errno(err);
if (err) {
dev->tx_queue_len = orig_len;
goto errout;
}
status |= DO_SETLINK_NOTIFY; status |= DO_SETLINK_NOTIFY;
}
dev->tx_queue_len = value;
} }
if (tb[IFLA_OPERSTATE]) if (tb[IFLA_OPERSTATE])
......
...@@ -17,6 +17,11 @@ ...@@ -17,6 +17,11 @@
typedef pthread_spinlock_t spinlock_t; typedef pthread_spinlock_t spinlock_t;
typedef int gfp_t; typedef int gfp_t;
/*
 * Userspace stand-in for kmalloc(): returns @size bytes from memalign() with
 * 64-byte alignment. The @gfp argument is ignored.
 */
static void *kmalloc(unsigned size, gfp_t gfp)
{
	void *mem = memalign(64, size);

	return mem;
}
static void *kzalloc(unsigned size, gfp_t gfp) static void *kzalloc(unsigned size, gfp_t gfp)
{ {
void *p = memalign(64, size); void *p = memalign(64, size);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment