Commit c69e6eaf authored by David S. Miller

Merge branch 'vsock-virtio-optimizations-to-increase-the-throughput'

Stefano Garzarella says:

====================
vsock/virtio: optimizations to increase the throughput

This series tries to increase the throughput of virtio-vsock with slight
changes.
While I was testing v2 of this series I discovered a huge use of memory,
so I added patch 1 to mitigate this issue. I put it in this series in order
to better track the performance trends.

v5:
- rebased all patches on net-next
- added Stefan's R-b and Michael's A-b

v4: https://patchwork.kernel.org/cover/11047717
v3: https://patchwork.kernel.org/cover/10970145
v2: https://patchwork.kernel.org/cover/10938743
v1: https://patchwork.kernel.org/cover/10885431

Below are the benchmarks step by step. I used iperf3 [1] modified with VSOCK
support. As Michael suggested in the v1, I booted host and guest with 'nosmap'.

A brief description of patches:
- Patch 1:     limit the memory usage with an extra copy for small packets
- Patches 2+3: reduce the number of credit update messages sent to the
               transmitter
- Patches 4+5: allow the host to split packets on multiple buffers and use
               VIRTIO_VSOCK_MAX_PKT_BUF_SIZE as the max packet size allowed

                    host -> guest [Gbps]
pkt_size before opt   p 1     p 2+3    p 4+5

32         0.032     0.030    0.048    0.051
64         0.061     0.059    0.108    0.117
128        0.122     0.112    0.227    0.234
256        0.244     0.241    0.418    0.415
512        0.459     0.466    0.847    0.865
1K         0.927     0.919    1.657    1.641
2K         1.884     1.813    3.262    3.269
4K         3.378     3.326    6.044    6.195
8K         5.637     5.676   10.141   11.287
16K        8.250     8.402   15.976   16.736
32K       13.327    13.204   19.013   20.515
64K       21.241    21.341   20.973   21.879
128K      21.851    22.354   21.816   23.203
256K      21.408    21.693   21.846   24.088
512K      21.600    21.899   21.921   24.106

                    guest -> host [Gbps]
pkt_size before opt   p 1     p 2+3    p 4+5

32         0.045     0.046    0.057    0.057
64         0.089     0.091    0.103    0.104
128        0.170     0.179    0.192    0.200
256        0.364     0.351    0.361    0.379
512        0.709     0.699    0.731    0.790
1K         1.399     1.407    1.395    1.427
2K         2.670     2.684    2.745    2.835
4K         5.171     5.199    5.305    5.451
8K         8.442     8.500   10.083    9.941
16K       12.305    12.259   13.519   15.385
32K       11.418    11.150   11.988   24.680
64K       10.778    10.659   11.589   35.273
128K      10.421    10.339   10.939   40.338
256K      10.300     9.719   10.508   36.562
512K       9.833     9.808   10.612   35.979

As Stefan suggested in v1, I also measured the efficiency in this way:
    efficiency = Mbps / (%CPU_Host + %CPU_Guest)

The '%CPU_Guest' is taken inside the VM. I know that it is not the best way,
but it's provided for free by iperf3 and can serve as an indication.

        host -> guest efficiency [Mbps / (%CPU_Host + %CPU_Guest)]
pkt_size before opt   p 1     p 2+3    p 4+5

32         0.35      0.45     0.79     1.02
64         0.56      0.80     1.41     1.54
128        1.11      1.52     3.03     3.12
256        2.20      2.16     5.44     5.58
512        4.17      4.18    10.96    11.46
1K         8.30      8.26    20.99    20.89
2K        16.82     16.31    39.76    39.73
4K        30.89     30.79    74.07    75.73
8K        53.74     54.49   124.24   148.91
16K       80.68     83.63   200.21   232.79
32K      132.27    132.52   260.81   357.07
64K      229.82    230.40   300.19   444.18
128K     332.60    329.78   331.51   492.28
256K     331.06    337.22   339.59   511.59
512K     335.58    328.50   331.56   504.56

        guest -> host efficiency [Mbps / (%CPU_Host + %CPU_Guest)]
pkt_size before opt   p 1     p 2+3    p 4+5

32         0.43      0.43     0.53     0.56
64         0.85      0.86     1.04     1.10
128        1.63      1.71     2.07     2.13
256        3.48      3.35     4.02     4.22
512        6.80      6.67     7.97     8.63
1K        13.32     13.31    15.72    15.94
2K        25.79     25.92    30.84    30.98
4K        50.37     50.48    58.79    59.69
8K        95.90     96.15   107.04   110.33
16K      145.80    145.43   143.97   174.70
32K      147.06    144.74   146.02   282.48
64K      145.25    143.99   141.62   406.40
128K     149.34    146.96   147.49   489.34
256K     156.35    149.81   152.21   536.37
512K     151.65    150.74   151.52   519.93

[1] https://github.com/stefano-garzarella/iperf/
====================
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents d1a55841 0038ff35
...@@ -102,7 +102,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, ...@@ -102,7 +102,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
struct iov_iter iov_iter; struct iov_iter iov_iter;
unsigned out, in; unsigned out, in;
size_t nbytes; size_t nbytes;
size_t len; size_t iov_len, payload_len;
int head; int head;
spin_lock_bh(&vsock->send_pkt_list_lock); spin_lock_bh(&vsock->send_pkt_list_lock);
...@@ -147,8 +147,24 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, ...@@ -147,8 +147,24 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
break; break;
} }
len = iov_length(&vq->iov[out], in); iov_len = iov_length(&vq->iov[out], in);
iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len); if (iov_len < sizeof(pkt->hdr)) {
virtio_transport_free_pkt(pkt);
vq_err(vq, "Buffer len [%zu] too small\n", iov_len);
break;
}
iov_iter_init(&iov_iter, READ, &vq->iov[out], in, iov_len);
payload_len = pkt->len - pkt->off;
/* If the packet is greater than the space available in the
* buffer, we split it using multiple buffers.
*/
if (payload_len > iov_len - sizeof(pkt->hdr))
payload_len = iov_len - sizeof(pkt->hdr);
/* Set the correct length in the header */
pkt->hdr.len = cpu_to_le32(payload_len);
nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter); nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
if (nbytes != sizeof(pkt->hdr)) { if (nbytes != sizeof(pkt->hdr)) {
...@@ -157,33 +173,47 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, ...@@ -157,33 +173,47 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
break; break;
} }
nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter); nbytes = copy_to_iter(pkt->buf + pkt->off, payload_len,
if (nbytes != pkt->len) { &iov_iter);
if (nbytes != payload_len) {
virtio_transport_free_pkt(pkt); virtio_transport_free_pkt(pkt);
vq_err(vq, "Faulted on copying pkt buf\n"); vq_err(vq, "Faulted on copying pkt buf\n");
break; break;
} }
vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len); vhost_add_used(vq, head, sizeof(pkt->hdr) + payload_len);
added = true; added = true;
if (pkt->reply) {
int val;
val = atomic_dec_return(&vsock->queued_replies);
/* Do we have resources to resume tx processing? */
if (val + 1 == tx_vq->num)
restart_tx = true;
}
/* Deliver to monitoring devices all correctly transmitted /* Deliver to monitoring devices all correctly transmitted
* packets. * packets.
*/ */
virtio_transport_deliver_tap_pkt(pkt); virtio_transport_deliver_tap_pkt(pkt);
total_len += pkt->len; pkt->off += payload_len;
virtio_transport_free_pkt(pkt); total_len += payload_len;
/* If we didn't send all the payload we can requeue the packet
* to send it with the next available buffer.
*/
if (pkt->off < pkt->len) {
spin_lock_bh(&vsock->send_pkt_list_lock);
list_add(&pkt->list, &vsock->send_pkt_list);
spin_unlock_bh(&vsock->send_pkt_list_lock);
} else {
if (pkt->reply) {
int val;
val = atomic_dec_return(&vsock->queued_replies);
/* Do we have resources to resume tx
* processing?
*/
if (val + 1 == tx_vq->num)
restart_tx = true;
}
virtio_transport_free_pkt(pkt);
}
} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
if (added) if (added)
vhost_signal(&vsock->dev, vq); vhost_signal(&vsock->dev, vq);
...@@ -329,6 +359,8 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq, ...@@ -329,6 +359,8 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
return NULL; return NULL;
} }
pkt->buf_len = pkt->len;
nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter); nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
if (nbytes != pkt->len) { if (nbytes != pkt->len) {
vq_err(vq, "Expected %u byte payload, got %zu bytes\n", vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
......
...@@ -35,13 +35,14 @@ struct virtio_vsock_sock { ...@@ -35,13 +35,14 @@ struct virtio_vsock_sock {
/* Protected by tx_lock */ /* Protected by tx_lock */
u32 tx_cnt; u32 tx_cnt;
u32 buf_alloc;
u32 peer_fwd_cnt; u32 peer_fwd_cnt;
u32 peer_buf_alloc; u32 peer_buf_alloc;
/* Protected by rx_lock */ /* Protected by rx_lock */
u32 fwd_cnt; u32 fwd_cnt;
u32 last_fwd_cnt;
u32 rx_bytes; u32 rx_bytes;
u32 buf_alloc;
struct list_head rx_queue; struct list_head rx_queue;
}; };
...@@ -52,6 +53,7 @@ struct virtio_vsock_pkt { ...@@ -52,6 +53,7 @@ struct virtio_vsock_pkt {
/* socket refcnt not held, only use for cancellation */ /* socket refcnt not held, only use for cancellation */
struct vsock_sock *vsk; struct vsock_sock *vsk;
void *buf; void *buf;
u32 buf_len;
u32 len; u32 len;
u32 off; u32 off;
bool reply; bool reply;
......
...@@ -307,6 +307,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) ...@@ -307,6 +307,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
break; break;
} }
pkt->buf_len = buf_len;
pkt->len = buf_len; pkt->len = buf_len;
sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
......
...@@ -26,6 +26,9 @@ ...@@ -26,6 +26,9 @@
/* How long to wait for graceful shutdown of a connection */ /* How long to wait for graceful shutdown of a connection */
#define VSOCK_CLOSE_TIMEOUT (8 * HZ) #define VSOCK_CLOSE_TIMEOUT (8 * HZ)
/* Threshold for detecting small packets to copy */
#define GOOD_COPY_LEN 128
static const struct virtio_transport *virtio_transport_get_ops(void) static const struct virtio_transport *virtio_transport_get_ops(void)
{ {
const struct vsock_transport *t = vsock_core_get_transport(); const struct vsock_transport *t = vsock_core_get_transport();
...@@ -64,6 +67,9 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, ...@@ -64,6 +67,9 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
pkt->buf = kmalloc(len, GFP_KERNEL); pkt->buf = kmalloc(len, GFP_KERNEL);
if (!pkt->buf) if (!pkt->buf)
goto out_pkt; goto out_pkt;
pkt->buf_len = len;
err = memcpy_from_msg(pkt->buf, info->msg, len); err = memcpy_from_msg(pkt->buf, info->msg, len);
if (err) if (err)
goto out; goto out;
...@@ -91,8 +97,17 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) ...@@ -91,8 +97,17 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
struct virtio_vsock_pkt *pkt = opaque; struct virtio_vsock_pkt *pkt = opaque;
struct af_vsockmon_hdr *hdr; struct af_vsockmon_hdr *hdr;
struct sk_buff *skb; struct sk_buff *skb;
size_t payload_len;
void *payload_buf;
/* A packet could be split to fit the RX buffer, so we can retrieve
* the payload length from the header and the buffer pointer taking
* care of the offset in the original packet.
*/
payload_len = le32_to_cpu(pkt->hdr.len);
payload_buf = pkt->buf + pkt->off;
skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len, skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + payload_len,
GFP_ATOMIC); GFP_ATOMIC);
if (!skb) if (!skb)
return NULL; return NULL;
...@@ -132,8 +147,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) ...@@ -132,8 +147,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr)); skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));
if (pkt->len) { if (payload_len) {
skb_put_data(skb, pkt->buf, pkt->len); skb_put_data(skb, payload_buf, payload_len);
} }
return skb; return skb;
...@@ -166,8 +181,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, ...@@ -166,8 +181,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
vvs = vsk->trans; vvs = vsk->trans;
/* we can send less than pkt_len bytes */ /* we can send less than pkt_len bytes */
if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE) if (pkt_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; pkt_len = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
/* virtio_transport_get_credit might return less than pkt_len credit */ /* virtio_transport_get_credit might return less than pkt_len credit */
pkt_len = virtio_transport_get_credit(vvs, pkt_len); pkt_len = virtio_transport_get_credit(vvs, pkt_len);
...@@ -204,10 +219,11 @@ static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, ...@@ -204,10 +219,11 @@ static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt) void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
{ {
spin_lock_bh(&vvs->tx_lock); spin_lock_bh(&vvs->rx_lock);
vvs->last_fwd_cnt = vvs->fwd_cnt;
pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt); pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt);
pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc); pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc);
spin_unlock_bh(&vvs->tx_lock); spin_unlock_bh(&vvs->rx_lock);
} }
EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);
...@@ -255,6 +271,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, ...@@ -255,6 +271,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
struct virtio_vsock_sock *vvs = vsk->trans; struct virtio_vsock_sock *vvs = vsk->trans;
struct virtio_vsock_pkt *pkt; struct virtio_vsock_pkt *pkt;
size_t bytes, total = 0; size_t bytes, total = 0;
u32 free_space;
int err = -EFAULT; int err = -EFAULT;
spin_lock_bh(&vvs->rx_lock); spin_lock_bh(&vvs->rx_lock);
...@@ -285,11 +302,19 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, ...@@ -285,11 +302,19 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
virtio_transport_free_pkt(pkt); virtio_transport_free_pkt(pkt);
} }
} }
free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
spin_unlock_bh(&vvs->rx_lock); spin_unlock_bh(&vvs->rx_lock);
/* Send a credit pkt to peer */ /* We send a credit update only when the space available seen
virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, * by the transmitter is less than VIRTIO_VSOCK_MAX_PKT_BUF_SIZE
NULL); */
if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) {
virtio_transport_send_credit_update(vsk,
VIRTIO_VSOCK_TYPE_STREAM,
NULL);
}
return total; return total;
...@@ -841,24 +866,60 @@ virtio_transport_recv_connecting(struct sock *sk, ...@@ -841,24 +866,60 @@ virtio_transport_recv_connecting(struct sock *sk,
return err; return err;
} }
/* Hand a received packet to the socket's receive side.
 *
 * The packet length is taken from the little-endian header and the read
 * offset is reset to 0. The packet is then either linked at the tail of
 * vvs->rx_queue, or, when its payload is at most GOOD_COPY_LEN bytes and
 * fits in the unused tail of the last queued packet's buffer, its payload
 * is appended to that packet and pkt itself is freed before returning.
 */
static void
virtio_transport_recv_enqueue(struct vsock_sock *vsk,
struct virtio_vsock_pkt *pkt)
{
struct virtio_vsock_sock *vvs = vsk->trans;
bool free_pkt = false;
pkt->len = le32_to_cpu(pkt->hdr.len);
pkt->off = 0;
spin_lock_bh(&vvs->rx_lock);
/* Runs before the merge decision, so it is applied to merged packets
 * as well as to packets that get their own queue entry.
 */
virtio_transport_inc_rx_pkt(vvs, pkt);
/* Try to copy small packets into the buffer of last packet queued,
 * to avoid wasting memory queueing the entire buffer with a small
 * payload.
 */
if (pkt->len <= GOOD_COPY_LEN && !list_empty(&vvs->rx_queue)) {
struct virtio_vsock_pkt *last_pkt;
last_pkt = list_last_entry(&vvs->rx_queue,
struct virtio_vsock_pkt, list);
/* If there is space in the last packet queued, we copy the
 * new packet in its buffer.
 */
if (pkt->len <= last_pkt->buf_len - last_pkt->len) {
/* Append right after the bytes last_pkt already holds. */
memcpy(last_pkt->buf + last_pkt->len, pkt->buf,
pkt->len);
last_pkt->len += pkt->len;
/* pkt was not linked into rx_queue; release it once the lock
 * has been dropped.
 */
free_pkt = true;
goto out;
}
}
list_add_tail(&pkt->list, &vvs->rx_queue);
out:
spin_unlock_bh(&vvs->rx_lock);
/* The free is done after rx_lock is released. */
if (free_pkt)
virtio_transport_free_pkt(pkt);
}
static int static int
virtio_transport_recv_connected(struct sock *sk, virtio_transport_recv_connected(struct sock *sk,
struct virtio_vsock_pkt *pkt) struct virtio_vsock_pkt *pkt)
{ {
struct vsock_sock *vsk = vsock_sk(sk); struct vsock_sock *vsk = vsock_sk(sk);
struct virtio_vsock_sock *vvs = vsk->trans;
int err = 0; int err = 0;
switch (le16_to_cpu(pkt->hdr.op)) { switch (le16_to_cpu(pkt->hdr.op)) {
case VIRTIO_VSOCK_OP_RW: case VIRTIO_VSOCK_OP_RW:
pkt->len = le32_to_cpu(pkt->hdr.len); virtio_transport_recv_enqueue(vsk, pkt);
pkt->off = 0;
spin_lock_bh(&vvs->rx_lock);
virtio_transport_inc_rx_pkt(vvs, pkt);
list_add_tail(&pkt->list, &vvs->rx_queue);
spin_unlock_bh(&vvs->rx_lock);
sk->sk_data_ready(sk); sk->sk_data_ready(sk);
return err; return err;
case VIRTIO_VSOCK_OP_CREDIT_UPDATE: case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment