Commit e3e37e70 authored by David S. Miller

Merge branch 'vhost_net-batching'

Jason Wang says:

====================
vhost_net tx batching

This series tries to implement tx batching support for vhost. This is
done by using MSG_MORE as a hint for the underlying socket. The backend
(e.g. tap) can then batch the packets temporarily in a list and
submit them all once the number of batched packets exceeds a limit.
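
In other words, while the sender keeps setting MSG_MORE the tap side parks
packets on its socket write queue and flushes the whole batch once the hint
is dropped or the configured limit is hit. A simplified sketch of that
receive path (condensed from the new tun_rx_batched() in the diff below;
the sketch name is illustrative, and the real function additionally takes
queue->lock and wraps delivery in local_bh_disable()/local_bh_enable()):

  static void rx_batched_sketch(struct tun_struct *tun, struct tun_file *tfile,
                                struct sk_buff *skb, int more)
  {
          struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
          struct sk_buff *nskb;

          if (!tun->rx_batched || (!more && skb_queue_empty(queue))) {
                  /* Batching disabled, or no MSG_MORE hint and nothing
                   * pending: hand the packet to the stack right away. */
                  netif_receive_skb(skb);
                  return;
          }

          if (more && skb_queue_len(queue) < tun->rx_batched) {
                  /* Sender promised more packets and the batch still has
                   * room: just park this one on the socket write queue. */
                  __skb_queue_tail(queue, skb);
                  return;
          }

          /* Hint dropped or the rx_batched limit was reached: flush the
           * queued batch, then the current packet. */
          while ((nskb = __skb_dequeue(queue)))
                  netif_receive_skb(nskb);
          netif_receive_skb(skb);
  }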

Tests show an obvious improvement for guest pktgen over
mlx4 (noqueue) on the host:

                                     Mpps  -+%
        rx-frames = 0                0.91  +0%
        rx-frames = 4                1.00  +9.8%
        rx-frames = 8                1.00  +9.8%
        rx-frames = 16               1.01  +10.9%
        rx-frames = 32               1.07  +17.5%
        rx-frames = 48               1.07  +17.5%
        rx-frames = 64               1.08  +18.6%
        rx-frames = 64 (no MSG_MORE) 0.91  +0%
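
The last row disables the MSG_MORE hint on the vhost side, which drops the
rate back to the baseline. For reference, this is how the handle_tx() change
below decides the hint per packet (comments added here for explanation):

  total_len += len;
  if (total_len < VHOST_NET_WEIGHT &&
      !vhost_vq_avail_empty(&net->dev, vq) &&
      likely(!vhost_exceeds_maxpend(net))) {
          /* Still under the tx weight, more avail descriptors are
           * pending and zerocopy is not saturated: hint the socket
           * that another packet follows shortly. */
          msg.msg_flags |= MSG_MORE;
  } else {
          /* Last packet for now: clear the hint so the backend
           * flushes its batch. */
          msg.msg_flags &= ~MSG_MORE;
  }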

Changes from V4:
- stick to NAPI_POLL_WEIGHT for rx-frames if the user specifies a value
  greater than it.
Changes from V3:
- use ethtool instead of a module parameter to control the maximum
  number of batched packets
- avoid overhead when MSG_MORE is not set and no packets are queued
Changes from V2:
- remove the useless queue limitation check (and we don't drop any packets now)
Changes from V1:
- drop the NAPI handler since we don't use NAPI now
- fix issues that may exceed the max pending limit of zerocopy
- more improvements on available buffer detection
- move the limit on batched packets from vhost to tuntap
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 1a8b6d76 5503fcec
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -218,6 +218,7 @@ struct tun_struct {
         struct list_head disabled;
         void *security;
         u32 flow_count;
+        u32 rx_batched;
         struct tun_pcpu_stats __percpu *pcpu_stats;
 };
@@ -522,6 +523,7 @@ static void tun_queue_purge(struct tun_file *tfile)
         while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
                 kfree_skb(skb);

+        skb_queue_purge(&tfile->sk.sk_write_queue);
         skb_queue_purge(&tfile->sk.sk_error_queue);
 }
@@ -1139,10 +1141,46 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
         return skb;
 }

+static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
+                           struct sk_buff *skb, int more)
+{
+        struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
+        struct sk_buff_head process_queue;
+        u32 rx_batched = tun->rx_batched;
+        bool rcv = false;
+
+        if (!rx_batched || (!more && skb_queue_empty(queue))) {
+                local_bh_disable();
+                netif_receive_skb(skb);
+                local_bh_enable();
+                return;
+        }
+
+        spin_lock(&queue->lock);
+        if (!more || skb_queue_len(queue) == rx_batched) {
+                __skb_queue_head_init(&process_queue);
+                skb_queue_splice_tail_init(queue, &process_queue);
+                rcv = true;
+        } else {
+                __skb_queue_tail(queue, skb);
+        }
+        spin_unlock(&queue->lock);
+
+        if (rcv) {
+                struct sk_buff *nskb;
+
+                local_bh_disable();
+                while ((nskb = __skb_dequeue(&process_queue)))
+                        netif_receive_skb(nskb);
+                netif_receive_skb(skb);
+                local_bh_enable();
+        }
+}
+
 /* Get packet from user space buffer */
 static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                             void *msg_control, struct iov_iter *from,
-                            int noblock)
+                            int noblock, bool more)
 {
         struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
         struct sk_buff *skb;
@@ -1283,9 +1321,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
         rxhash = skb_get_hash(skb);
 #ifndef CONFIG_4KSTACKS
-        local_bh_disable();
-        netif_receive_skb(skb);
-        local_bh_enable();
+        tun_rx_batched(tun, tfile, skb, more);
 #else
         netif_rx_ni(skb);
 #endif
@@ -1311,7 +1347,8 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
         if (!tun)
                 return -EBADFD;

-        result = tun_get_user(tun, tfile, NULL, from, file->f_flags & O_NONBLOCK);
+        result = tun_get_user(tun, tfile, NULL, from,
+                              file->f_flags & O_NONBLOCK, false);

         tun_put(tun);
         return result;
@@ -1569,7 +1606,8 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
                 return -EBADFD;

         ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
-                           m->msg_flags & MSG_DONTWAIT);
+                           m->msg_flags & MSG_DONTWAIT,
+                           m->msg_flags & MSG_MORE);
         tun_put(tun);
         return ret;
 }
@@ -1770,6 +1808,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
                 tun->align = NET_SKB_PAD;
                 tun->filter_attached = false;
                 tun->sndbuf = tfile->socket.sk->sk_sndbuf;
+                tun->rx_batched = 0;

                 tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
                 if (!tun->pcpu_stats) {
@@ -2438,6 +2477,29 @@ static void tun_set_msglevel(struct net_device *dev, u32 value)
 #endif
 }

+static int tun_get_coalesce(struct net_device *dev,
+                            struct ethtool_coalesce *ec)
+{
+        struct tun_struct *tun = netdev_priv(dev);
+
+        ec->rx_max_coalesced_frames = tun->rx_batched;
+
+        return 0;
+}
+
+static int tun_set_coalesce(struct net_device *dev,
+                            struct ethtool_coalesce *ec)
+{
+        struct tun_struct *tun = netdev_priv(dev);
+
+        if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT)
+                tun->rx_batched = NAPI_POLL_WEIGHT;
+        else
+                tun->rx_batched = ec->rx_max_coalesced_frames;
+
+        return 0;
+}
+
 static const struct ethtool_ops tun_ethtool_ops = {
         .get_settings = tun_get_settings,
         .get_drvinfo = tun_get_drvinfo,
@@ -2445,6 +2507,8 @@ static const struct ethtool_ops tun_ethtool_ops = {
         .set_msglevel = tun_set_msglevel,
         .get_link = ethtool_op_get_link,
         .get_ts_info = ethtool_op_get_ts_info,
+        .get_coalesce = tun_get_coalesce,
+        .set_coalesce = tun_set_coalesce,
 };

 static int tun_queue_resize(struct tun_struct *tun)
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -351,6 +351,15 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
         return r;
 }

+static bool vhost_exceeds_maxpend(struct vhost_net *net)
+{
+        struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+        struct vhost_virtqueue *vq = &nvq->vq;
+
+        return (nvq->upend_idx + vq->num - VHOST_MAX_PEND) % UIO_MAXIOV
+                == nvq->done_idx;
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
@@ -394,8 +403,7 @@ static void handle_tx(struct vhost_net *net)
                 /* If more outstanding DMAs, queue the work.
                  * Handle upend_idx wrap around
                  */
-                if (unlikely((nvq->upend_idx + vq->num - VHOST_MAX_PEND)
-                              % UIO_MAXIOV == nvq->done_idx))
+                if (unlikely(vhost_exceeds_maxpend(net)))
                         break;

                 head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
@@ -454,6 +462,16 @@ static void handle_tx(struct vhost_net *net)
                         msg.msg_control = NULL;
                         ubufs = NULL;
                 }
+
+                total_len += len;
+                if (total_len < VHOST_NET_WEIGHT &&
+                    !vhost_vq_avail_empty(&net->dev, vq) &&
+                    likely(!vhost_exceeds_maxpend(net))) {
+                        msg.msg_flags |= MSG_MORE;
+                } else {
+                        msg.msg_flags &= ~MSG_MORE;
+                }
+
                 /* TODO: Check specific error and bomb out unless ENOBUFS? */
                 err = sock->ops->sendmsg(sock, &msg, len);
                 if (unlikely(err < 0)) {
@@ -472,7 +490,6 @@ static void handle_tx(struct vhost_net *net)
                         vhost_add_used_and_signal(&net->dev, vq, head, 0);
                 else
                         vhost_zerocopy_signal_used(net, vq);
-                total_len += len;
                 vhost_net_tx_packet(net);
                 if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
                         vhost_poll_queue(&vq->poll);
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2241,11 +2241,15 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
         __virtio16 avail_idx;
         int r;

+        if (vq->avail_idx != vq->last_avail_idx)
+                return false;
+
         r = vhost_get_user(vq, avail_idx, &vq->avail->idx);
-        if (r)
+        if (unlikely(r))
                 return false;
+        vq->avail_idx = vhost16_to_cpu(vq, avail_idx);

-        return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
+        return vq->avail_idx == vq->last_avail_idx;
 }
 EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);