Commit 3361a6ea authored by David S. Miller's avatar David S. Miller

Merge branch 'vsock-virtio' into main

Luigi Leonardi says:

====================
vsock: avoid queuing on intermediate queue if possible

This series introduces an optimization for vsock/virtio to reduce latency
and increase the throughput: When the guest sends a packet to the host,
and the intermediate queue (send_pkt_queue) is empty, if there is enough
space, the packet is put directly in the virtqueue.

v3->v4
While running experiments on fio with 64B payload, I realized that there
was a mistake in my fio configuration, so I re-ran all the experiments
and now the latency numbers are indeed lower with the patch applied.
I also noticed that I was kicking the host without the lock.

- Fixed a configuration mistake on fio and re-ran all experiments.
- Fio latency measurement using 64B payload.
- virtio_transport_send_skb_fast_path sends kick with the tx_lock acquired
- Addressed all minor style changes requested by maintainer.
- Rebased on latest net-next
- Link to v3: https://lore.kernel.org/r/20240711-pinna-v3-0-697d4164fe80@outlook.com

v2->v3
- Performed more experiments using iperf3 using multiple streams
- Handling of reply packets removed from virtio_transport_send_skb,
  as is needed just by the worker.
- Removed atomic_inc/atomic_sub when queuing directly to the vq.
- Introduced virtio_transport_send_skb_fast_path that handles the
  steps for sending on the vq.
- Fixed a missing mutex_unlock in error path.
- Changed authorship of the second commit
- Rebased on latest net-next

v1->v2
In this v2 I replaced a mutex_lock with a mutex_trylock because it was
insidea RCU critical section. I also added a check on tx_run, so if the
module is being removed the packet is not queued. I'd like to thank Stefano
for reporting the tx_run issue.

Applied all Stefano's suggestions:
    - Minor code style changes
    - Minor commit text rewrite
Performed more experiments:
     - Check if all the packets go directly to the vq (Matias' suggestion)
     - Used iperf3 to see if there is any improvement in overall throughput
      from guest to host
     - Pinned the vhost process to a pCPU.
     - Run fio using 512B payload
Rebased on latest net-next
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 5fe164fb 18ee44ce
...@@ -244,7 +244,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, ...@@ -244,7 +244,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
restart_tx = true; restart_tx = true;
} }
consume_skb(skb); virtio_transport_consume_skb_sent(skb, true);
} }
} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
if (added) if (added)
...@@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = { ...@@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = {
.notify_buffer_size = virtio_transport_notify_buffer_size, .notify_buffer_size = virtio_transport_notify_buffer_size,
.notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat,
.unsent_bytes = virtio_transport_unsent_bytes,
.read_skb = virtio_transport_read_skb, .read_skb = virtio_transport_read_skb,
}, },
......
...@@ -133,6 +133,7 @@ struct virtio_vsock_sock { ...@@ -133,6 +133,7 @@ struct virtio_vsock_sock {
u32 tx_cnt; u32 tx_cnt;
u32 peer_fwd_cnt; u32 peer_fwd_cnt;
u32 peer_buf_alloc; u32 peer_buf_alloc;
size_t bytes_unsent;
/* Protected by rx_lock */ /* Protected by rx_lock */
u32 fwd_cnt; u32 fwd_cnt;
...@@ -193,6 +194,11 @@ s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); ...@@ -193,6 +194,11 @@ s64 virtio_transport_stream_has_data(struct vsock_sock *vsk);
s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);
u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk); u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk);
ssize_t virtio_transport_unsent_bytes(struct vsock_sock *vsk);
void virtio_transport_consume_skb_sent(struct sk_buff *skb,
bool consume);
int virtio_transport_do_socket_init(struct vsock_sock *vsk, int virtio_transport_do_socket_init(struct vsock_sock *vsk,
struct vsock_sock *psk); struct vsock_sock *psk);
int int
......
...@@ -169,6 +169,9 @@ struct vsock_transport { ...@@ -169,6 +169,9 @@ struct vsock_transport {
void (*notify_buffer_size)(struct vsock_sock *, u64 *); void (*notify_buffer_size)(struct vsock_sock *, u64 *);
int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val); int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
/* SIOCOUTQ ioctl */
ssize_t (*unsent_bytes)(struct vsock_sock *vsk);
/* Shutdown. */ /* Shutdown. */
int (*shutdown)(struct vsock_sock *, int); int (*shutdown)(struct vsock_sock *, int);
......
...@@ -112,6 +112,7 @@ ...@@ -112,6 +112,7 @@
#include <net/sock.h> #include <net/sock.h>
#include <net/af_vsock.h> #include <net/af_vsock.h>
#include <uapi/linux/vm_sockets.h> #include <uapi/linux/vm_sockets.h>
#include <uapi/asm-generic/ioctls.h>
static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk); static void vsock_sk_destruct(struct sock *sk);
...@@ -1292,6 +1293,57 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, ...@@ -1292,6 +1293,57 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
} }
EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg); EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);
static int vsock_do_ioctl(struct socket *sock, unsigned int cmd,
int __user *arg)
{
struct sock *sk = sock->sk;
struct vsock_sock *vsk;
int ret;
vsk = vsock_sk(sk);
switch (cmd) {
case SIOCOUTQ: {
ssize_t n_bytes;
if (!vsk->transport || !vsk->transport->unsent_bytes) {
ret = -EOPNOTSUPP;
break;
}
if (sock_type_connectible(sk->sk_type) && sk->sk_state == TCP_LISTEN) {
ret = -EINVAL;
break;
}
n_bytes = vsk->transport->unsent_bytes(vsk);
if (n_bytes < 0) {
ret = n_bytes;
break;
}
ret = put_user(n_bytes, arg);
break;
}
default:
ret = -ENOIOCTLCMD;
}
return ret;
}
static int vsock_ioctl(struct socket *sock, unsigned int cmd,
unsigned long arg)
{
int ret;
lock_sock(sock->sk);
ret = vsock_do_ioctl(sock, cmd, (int __user *)arg);
release_sock(sock->sk);
return ret;
}
static const struct proto_ops vsock_dgram_ops = { static const struct proto_ops vsock_dgram_ops = {
.family = PF_VSOCK, .family = PF_VSOCK,
.owner = THIS_MODULE, .owner = THIS_MODULE,
...@@ -1302,7 +1354,7 @@ static const struct proto_ops vsock_dgram_ops = { ...@@ -1302,7 +1354,7 @@ static const struct proto_ops vsock_dgram_ops = {
.accept = sock_no_accept, .accept = sock_no_accept,
.getname = vsock_getname, .getname = vsock_getname,
.poll = vsock_poll, .poll = vsock_poll,
.ioctl = sock_no_ioctl, .ioctl = vsock_ioctl,
.listen = sock_no_listen, .listen = sock_no_listen,
.shutdown = vsock_shutdown, .shutdown = vsock_shutdown,
.sendmsg = vsock_dgram_sendmsg, .sendmsg = vsock_dgram_sendmsg,
...@@ -2286,7 +2338,7 @@ static const struct proto_ops vsock_stream_ops = { ...@@ -2286,7 +2338,7 @@ static const struct proto_ops vsock_stream_ops = {
.accept = vsock_accept, .accept = vsock_accept,
.getname = vsock_getname, .getname = vsock_getname,
.poll = vsock_poll, .poll = vsock_poll,
.ioctl = sock_no_ioctl, .ioctl = vsock_ioctl,
.listen = vsock_listen, .listen = vsock_listen,
.shutdown = vsock_shutdown, .shutdown = vsock_shutdown,
.setsockopt = vsock_connectible_setsockopt, .setsockopt = vsock_connectible_setsockopt,
...@@ -2308,7 +2360,7 @@ static const struct proto_ops vsock_seqpacket_ops = { ...@@ -2308,7 +2360,7 @@ static const struct proto_ops vsock_seqpacket_ops = {
.accept = vsock_accept, .accept = vsock_accept,
.getname = vsock_getname, .getname = vsock_getname,
.poll = vsock_poll, .poll = vsock_poll,
.ioctl = sock_no_ioctl, .ioctl = vsock_ioctl,
.listen = vsock_listen, .listen = vsock_listen,
.shutdown = vsock_shutdown, .shutdown = vsock_shutdown,
.setsockopt = vsock_connectible_setsockopt, .setsockopt = vsock_connectible_setsockopt,
......
...@@ -311,7 +311,7 @@ static void virtio_transport_tx_work(struct work_struct *work) ...@@ -311,7 +311,7 @@ static void virtio_transport_tx_work(struct work_struct *work)
virtqueue_disable_cb(vq); virtqueue_disable_cb(vq);
while ((skb = virtqueue_get_buf(vq, &len)) != NULL) { while ((skb = virtqueue_get_buf(vq, &len)) != NULL) {
consume_skb(skb); virtio_transport_consume_skb_sent(skb, true);
added = true; added = true;
} }
} while (!virtqueue_enable_cb(vq)); } while (!virtqueue_enable_cb(vq));
...@@ -540,6 +540,8 @@ static struct virtio_transport virtio_transport = { ...@@ -540,6 +540,8 @@ static struct virtio_transport virtio_transport = {
.notify_buffer_size = virtio_transport_notify_buffer_size, .notify_buffer_size = virtio_transport_notify_buffer_size,
.notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat,
.unsent_bytes = virtio_transport_unsent_bytes,
.read_skb = virtio_transport_read_skb, .read_skb = virtio_transport_read_skb,
}, },
......
...@@ -463,6 +463,26 @@ void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff * ...@@ -463,6 +463,26 @@ void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *
} }
EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);
void virtio_transport_consume_skb_sent(struct sk_buff *skb, bool consume)
{
struct sock *s = skb->sk;
if (s && skb->len) {
struct vsock_sock *vs = vsock_sk(s);
struct virtio_vsock_sock *vvs;
vvs = vs->trans;
spin_lock_bh(&vvs->tx_lock);
vvs->bytes_unsent -= skb->len;
spin_unlock_bh(&vvs->tx_lock);
}
if (consume)
consume_skb(skb);
}
EXPORT_SYMBOL_GPL(virtio_transport_consume_skb_sent);
u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit) u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
{ {
u32 ret; u32 ret;
...@@ -475,6 +495,7 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit) ...@@ -475,6 +495,7 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
if (ret > credit) if (ret > credit)
ret = credit; ret = credit;
vvs->tx_cnt += ret; vvs->tx_cnt += ret;
vvs->bytes_unsent += ret;
spin_unlock_bh(&vvs->tx_lock); spin_unlock_bh(&vvs->tx_lock);
return ret; return ret;
...@@ -488,6 +509,7 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit) ...@@ -488,6 +509,7 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit)
spin_lock_bh(&vvs->tx_lock); spin_lock_bh(&vvs->tx_lock);
vvs->tx_cnt -= credit; vvs->tx_cnt -= credit;
vvs->bytes_unsent -= credit;
spin_unlock_bh(&vvs->tx_lock); spin_unlock_bh(&vvs->tx_lock);
} }
EXPORT_SYMBOL_GPL(virtio_transport_put_credit); EXPORT_SYMBOL_GPL(virtio_transport_put_credit);
...@@ -1090,6 +1112,19 @@ void virtio_transport_destruct(struct vsock_sock *vsk) ...@@ -1090,6 +1112,19 @@ void virtio_transport_destruct(struct vsock_sock *vsk)
} }
EXPORT_SYMBOL_GPL(virtio_transport_destruct); EXPORT_SYMBOL_GPL(virtio_transport_destruct);
ssize_t virtio_transport_unsent_bytes(struct vsock_sock *vsk)
{
struct virtio_vsock_sock *vvs = vsk->trans;
size_t ret;
spin_lock_bh(&vvs->tx_lock);
ret = vvs->bytes_unsent;
spin_unlock_bh(&vvs->tx_lock);
return ret;
}
EXPORT_SYMBOL_GPL(virtio_transport_unsent_bytes);
static int virtio_transport_reset(struct vsock_sock *vsk, static int virtio_transport_reset(struct vsock_sock *vsk,
struct sk_buff *skb) struct sk_buff *skb)
{ {
......
...@@ -98,6 +98,8 @@ static struct virtio_transport loopback_transport = { ...@@ -98,6 +98,8 @@ static struct virtio_transport loopback_transport = {
.notify_buffer_size = virtio_transport_notify_buffer_size, .notify_buffer_size = virtio_transport_notify_buffer_size,
.notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat,
.unsent_bytes = virtio_transport_unsent_bytes,
.read_skb = virtio_transport_read_skb, .read_skb = virtio_transport_read_skb,
}, },
...@@ -123,6 +125,10 @@ static void vsock_loopback_work(struct work_struct *work) ...@@ -123,6 +125,10 @@ static void vsock_loopback_work(struct work_struct *work)
spin_unlock_bh(&vsock->pkt_queue.lock); spin_unlock_bh(&vsock->pkt_queue.lock);
while ((skb = __skb_dequeue(&pkts))) { while ((skb = __skb_dequeue(&pkts))) {
/* Decrement the bytes_unsent counter without deallocating skb
* It is freed by the receiver.
*/
virtio_transport_consume_skb_sent(skb, false);
virtio_transport_deliver_tap_pkt(skb); virtio_transport_deliver_tap_pkt(skb);
virtio_transport_recv_pkt(&loopback_transport, skb); virtio_transport_recv_pkt(&loopback_transport, skb);
} }
......
...@@ -139,7 +139,7 @@ int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_po ...@@ -139,7 +139,7 @@ int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_po
} }
/* Connect to <cid, port> and return the file descriptor. */ /* Connect to <cid, port> and return the file descriptor. */
static int vsock_connect(unsigned int cid, unsigned int port, int type) int vsock_connect(unsigned int cid, unsigned int port, int type)
{ {
union { union {
struct sockaddr sa; struct sockaddr sa;
...@@ -226,8 +226,8 @@ static int vsock_listen(unsigned int cid, unsigned int port, int type) ...@@ -226,8 +226,8 @@ static int vsock_listen(unsigned int cid, unsigned int port, int type)
/* Listen on <cid, port> and return the first incoming connection. The remote /* Listen on <cid, port> and return the first incoming connection. The remote
* address is stored to clientaddrp. clientaddrp may be NULL. * address is stored to clientaddrp. clientaddrp may be NULL.
*/ */
static int vsock_accept(unsigned int cid, unsigned int port, int vsock_accept(unsigned int cid, unsigned int port,
struct sockaddr_vm *clientaddrp, int type) struct sockaddr_vm *clientaddrp, int type)
{ {
union { union {
struct sockaddr sa; struct sockaddr sa;
......
...@@ -39,6 +39,9 @@ struct test_case { ...@@ -39,6 +39,9 @@ struct test_case {
void init_signals(void); void init_signals(void);
unsigned int parse_cid(const char *str); unsigned int parse_cid(const char *str);
unsigned int parse_port(const char *str); unsigned int parse_port(const char *str);
int vsock_connect(unsigned int cid, unsigned int port, int type);
int vsock_accept(unsigned int cid, unsigned int port,
struct sockaddr_vm *clientaddrp, int type);
int vsock_stream_connect(unsigned int cid, unsigned int port); int vsock_stream_connect(unsigned int cid, unsigned int port);
int vsock_bind_connect(unsigned int cid, unsigned int port, int vsock_bind_connect(unsigned int cid, unsigned int port,
unsigned int bind_port, int type); unsigned int bind_port, int type);
......
...@@ -20,6 +20,8 @@ ...@@ -20,6 +20,8 @@
#include <sys/mman.h> #include <sys/mman.h>
#include <poll.h> #include <poll.h>
#include <signal.h> #include <signal.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>
#include "vsock_test_zerocopy.h" #include "vsock_test_zerocopy.h"
#include "timeout.h" #include "timeout.h"
...@@ -1238,6 +1240,79 @@ static void test_double_bind_connect_client(const struct test_opts *opts) ...@@ -1238,6 +1240,79 @@ static void test_double_bind_connect_client(const struct test_opts *opts)
} }
} }
#define MSG_BUF_IOCTL_LEN 64
static void test_unsent_bytes_server(const struct test_opts *opts, int type)
{
unsigned char buf[MSG_BUF_IOCTL_LEN];
int client_fd;
client_fd = vsock_accept(VMADDR_CID_ANY, opts->peer_port, NULL, type);
if (client_fd < 0) {
perror("accept");
exit(EXIT_FAILURE);
}
recv_buf(client_fd, buf, sizeof(buf), 0, sizeof(buf));
control_writeln("RECEIVED");
close(client_fd);
}
static void test_unsent_bytes_client(const struct test_opts *opts, int type)
{
unsigned char buf[MSG_BUF_IOCTL_LEN];
int ret, fd, sock_bytes_unsent;
fd = vsock_connect(opts->peer_cid, opts->peer_port, type);
if (fd < 0) {
perror("connect");
exit(EXIT_FAILURE);
}
for (int i = 0; i < sizeof(buf); i++)
buf[i] = rand() & 0xFF;
send_buf(fd, buf, sizeof(buf), 0, sizeof(buf));
control_expectln("RECEIVED");
ret = ioctl(fd, SIOCOUTQ, &sock_bytes_unsent);
if (ret < 0) {
if (errno == EOPNOTSUPP) {
fprintf(stderr, "Test skipped, SIOCOUTQ not supported.\n");
} else {
perror("ioctl");
exit(EXIT_FAILURE);
}
} else if (ret == 0 && sock_bytes_unsent != 0) {
fprintf(stderr,
"Unexpected 'SIOCOUTQ' value, expected 0, got %i\n",
sock_bytes_unsent);
exit(EXIT_FAILURE);
}
close(fd);
}
static void test_stream_unsent_bytes_client(const struct test_opts *opts)
{
test_unsent_bytes_client(opts, SOCK_STREAM);
}
static void test_stream_unsent_bytes_server(const struct test_opts *opts)
{
test_unsent_bytes_server(opts, SOCK_STREAM);
}
static void test_seqpacket_unsent_bytes_client(const struct test_opts *opts)
{
test_unsent_bytes_client(opts, SOCK_SEQPACKET);
}
static void test_seqpacket_unsent_bytes_server(const struct test_opts *opts)
{
test_unsent_bytes_server(opts, SOCK_SEQPACKET);
}
#define RCVLOWAT_CREDIT_UPD_BUF_SIZE (1024 * 128) #define RCVLOWAT_CREDIT_UPD_BUF_SIZE (1024 * 128)
/* This define is the same as in 'include/linux/virtio_vsock.h': /* This define is the same as in 'include/linux/virtio_vsock.h':
* it is used to decide when to send credit update message during * it is used to decide when to send credit update message during
...@@ -1523,6 +1598,16 @@ static struct test_case test_cases[] = { ...@@ -1523,6 +1598,16 @@ static struct test_case test_cases[] = {
.run_client = test_stream_rcvlowat_def_cred_upd_client, .run_client = test_stream_rcvlowat_def_cred_upd_client,
.run_server = test_stream_cred_upd_on_low_rx_bytes, .run_server = test_stream_cred_upd_on_low_rx_bytes,
}, },
{
.name = "SOCK_STREAM ioctl(SIOCOUTQ) 0 unsent bytes",
.run_client = test_stream_unsent_bytes_client,
.run_server = test_stream_unsent_bytes_server,
},
{
.name = "SOCK_SEQPACKET ioctl(SIOCOUTQ) 0 unsent bytes",
.run_client = test_seqpacket_unsent_bytes_client,
.run_server = test_seqpacket_unsent_bytes_server,
},
{}, {},
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment