Commit 5d659b1d authored by David S. Miller

Merge branch 'tcp-mmap-rework-zerocopy-receive'

Eric Dumazet says:

====================
tcp: mmap: rework zerocopy receive

syzbot reported a lockdep issue caused by tcp mmap() support.

I implemented Andy Lutomirski's nice suggestions to resolve the
issue and increase scalability as well.

The first patch adds a new getsockopt() operation and changes mmap()
behavior.

The second patch updates the tcp_mmap reference program accordingly.

v4: tcp mmap() support depends on CONFIG_MMU, as the kbuild bot told us.

v3: changed TCP_ZEROCOPY_RECEIVE to be a getsockopt() option
    instead of setsockopt(), per feedback from Ka-Cheong Poon

v2: added a missing page alignment of zc->length in tcp_zerocopy_receive().
    Properly clear zc->recv_skip_hint in case the user request was completed.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 589f84fb aacb0c2e
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -122,6 +122,7 @@ enum {
 #define TCP_MD5SIG_EXT		32	/* TCP MD5 Signature with extensions */
 #define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */
 #define TCP_FASTOPEN_NO_COOKIE	34	/* Enable TFO without a TFO cookie */
+#define TCP_ZEROCOPY_RECEIVE	35
 
 struct tcp_repair_opt {
 	__u32	opt_code;
@@ -276,4 +277,11 @@ struct tcp_diag_md5sig {
 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];
 };
 
+/* getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
+struct tcp_zerocopy_receive {
+	__u64 address;		/* in: address of mapping */
+	__u32 length;		/* in/out: number of bytes to map/mapped */
+	__u32 recv_skip_hint;	/* out: number of bytes to skip */
+};
 #endif /* _UAPI_LINUX_TCP_H */
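
Taken together, the uapi additions above define a two-phase receive path: mmap() only reserves a read-only VMA, and each subsequent getsockopt(TCP_ZEROCOPY_RECEIVE) call maps received pages into that VMA, reporting in recv_skip_hint how many bytes cannot be mapped and must be read() conventionally. A minimal caller-side sketch follows; the helper name and the fallback buffer are illustrative, not part of this series:

	#include <sys/mman.h>
	#include <sys/socket.h>
	#include <unistd.h>
	#include <linux/tcp.h>	/* TCP_ZEROCOPY_RECEIVE, struct tcp_zerocopy_receive */

	/* Receive up to 'chunk' bytes from connected TCP socket 'fd'.
	 * 'map' is a page-aligned region of 'chunk' bytes obtained earlier via
	 * mmap(NULL, chunk, PROT_READ, MAP_SHARED, fd, 0).
	 */
	static ssize_t zerocopy_chunk(int fd, void *map, size_t chunk, char *fallback)
	{
		struct tcp_zerocopy_receive zc = {
			.address = (__u64)(unsigned long)map,
			.length  = chunk,	/* in: bytes we would like mapped */
		};
		socklen_t zc_len = sizeof(zc);
		ssize_t done;

		if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
			       &zc, &zc_len) == -1)
			return -1;

		done = zc.length;	/* out: bytes now readable at 'map' */

		/* Tail that did not fill whole pages must use regular read(). */
		if (zc.recv_skip_hint) {
			ssize_t lu = read(fd, fallback, zc.recv_skip_hint);

			if (lu > 0)
				done += lu;
		}
		return done;
	}
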
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -994,7 +994,9 @@ const struct proto_ops inet_stream_ops = {
 	.getsockopt	   = sock_common_getsockopt,
 	.sendmsg	   = inet_sendmsg,
 	.recvmsg	   = inet_recvmsg,
+#ifdef CONFIG_MMU
 	.mmap		   = tcp_mmap,
+#endif
 	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
 	.read_sock	   = tcp_read_sock,
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1726,73 +1726,74 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_set_rcvlowat);
 
-/* When user wants to mmap X pages, we first need to perform the mapping
- * before freeing any skbs in receive queue, otherwise user would be unable
- * to fallback to standard recvmsg(). This happens if some data in the
- * requested block is not exactly fitting in a page.
- *
- * We only support order-0 pages for the moment.
- * mmap() on TCP is very strict, there is no point
- * trying to accommodate with pathological layouts.
- */
+#ifdef CONFIG_MMU
+static const struct vm_operations_struct tcp_vm_ops = {
+};
+
 int tcp_mmap(struct file *file, struct socket *sock,
 	     struct vm_area_struct *vma)
 {
-	unsigned long size = vma->vm_end - vma->vm_start;
-	unsigned int nr_pages = size >> PAGE_SHIFT;
-	struct page **pages_array = NULL;
-	u32 seq, len, offset, nr = 0;
-	struct sock *sk = sock->sk;
-	const skb_frag_t *frags;
+	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+		return -EPERM;
+	vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+
+	/* Instruct vm_insert_page() to not down_read(mmap_sem) */
+	vma->vm_flags |= VM_MIXEDMAP;
+
+	vma->vm_ops = &tcp_vm_ops;
+	return 0;
+}
+EXPORT_SYMBOL(tcp_mmap);
+
+static int tcp_zerocopy_receive(struct sock *sk,
+				struct tcp_zerocopy_receive *zc)
+{
+	unsigned long address = (unsigned long)zc->address;
+	const skb_frag_t *frags = NULL;
+	u32 length = 0, seq, offset;
+	struct vm_area_struct *vma;
+	struct sk_buff *skb = NULL;
 	struct tcp_sock *tp;
-	struct sk_buff *skb;
 	int ret;
 
-	if (vma->vm_pgoff || !nr_pages)
+	if (address & (PAGE_SIZE - 1) || address != zc->address)
 		return -EINVAL;
-	if (vma->vm_flags & VM_WRITE)
-		return -EPERM;
-	/* TODO: Maybe the following is not needed if pages are COW */
-	vma->vm_flags &= ~VM_MAYWRITE;
-
-	lock_sock(sk);
-
-	ret = -ENOTCONN;
 	if (sk->sk_state == TCP_LISTEN)
-		goto out;
+		return -ENOTCONN;
 
 	sock_rps_record_flow(sk);
 
-	if (tcp_inq(sk) < size) {
-		ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
+	down_read(&current->mm->mmap_sem);
+
+	ret = -EINVAL;
+	vma = find_vma(current->mm, address);
+	if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
 		goto out;
-	}
+	zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
+
 	tp = tcp_sk(sk);
 	seq = tp->copied_seq;
-	/* Abort if urgent data is in the area */
-	if (unlikely(tp->urg_data)) {
-		u32 urg_offset = tp->urg_seq - seq;
-
-		ret = -EINVAL;
-		if (urg_offset < size)
-			goto out;
-	}
-	ret = -ENOMEM;
-	pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
-				     GFP_KERNEL);
-	if (!pages_array)
-		goto out;
-	skb = tcp_recv_skb(sk, seq, &offset);
-	ret = -EINVAL;
-skb_start:
-	/* We do not support anything not in page frags */
-	offset -= skb_headlen(skb);
-	if ((int)offset < 0)
-		goto out;
-	if (skb_has_frag_list(skb))
-		goto out;
-	len = skb->data_len - offset;
-	frags = skb_shinfo(skb)->frags;
-	while (offset) {
-		if (frags->size > offset)
+	zc->length = min_t(u32, zc->length, tcp_inq(sk));
+	zc->length &= ~(PAGE_SIZE - 1);
+
+	zap_page_range(vma, address, zc->length);
+
+	zc->recv_skip_hint = 0;
+	ret = 0;
+	while (length + PAGE_SIZE <= zc->length) {
+		if (zc->recv_skip_hint < PAGE_SIZE) {
+			if (skb) {
+				skb = skb->next;
+				offset = seq - TCP_SKB_CB(skb)->seq;
+			} else {
+				skb = tcp_recv_skb(sk, seq, &offset);
+			}
+
+			zc->recv_skip_hint = skb->len - offset;
+			offset -= skb_headlen(skb);
+			if ((int)offset < 0 || skb_has_frag_list(skb))
+				break;
+			frags = skb_shinfo(skb)->frags;
+			while (offset) {
+				if (frags->size > offset)
@@ -1800,44 +1801,38 @@ int tcp_mmap(struct file *file, struct socket *sock,
-			goto out;
-		offset -= frags->size;
-		frags++;
-	}
-	while (nr < nr_pages) {
-		if (len) {
-			if (len < PAGE_SIZE)
-				goto out;
-			if (frags->size != PAGE_SIZE || frags->page_offset)
-				goto out;
-			pages_array[nr++] = skb_frag_page(frags);
-			frags++;
-			len -= PAGE_SIZE;
-			seq += PAGE_SIZE;
-			continue;
+					goto out;
+				offset -= frags->size;
+				frags++;
+			}
 		}
-		skb = skb->next;
-		offset = seq - TCP_SKB_CB(skb)->seq;
-		goto skb_start;
-	}
-	/* OK, we have a full set of pages ready to be inserted into vma */
-	for (nr = 0; nr < nr_pages; nr++) {
-		ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
-				     pages_array[nr]);
+		if (frags->size != PAGE_SIZE || frags->page_offset)
+			break;
+		ret = vm_insert_page(vma, address + length,
+				     skb_frag_page(frags));
 		if (ret)
-			goto out;
+			break;
+		length += PAGE_SIZE;
+		seq += PAGE_SIZE;
+		zc->recv_skip_hint -= PAGE_SIZE;
+		frags++;
 	}
-	/* operation is complete, we can 'consume' all skbs */
-	tp->copied_seq = seq;
-	tcp_rcv_space_adjust(sk);
-
-	/* Clean up data we have read: This will do ACK frames. */
-	tcp_recv_skb(sk, seq, &offset);
-	tcp_cleanup_rbuf(sk, size);
-	ret = 0;
 out:
-	release_sock(sk);
-	kvfree(pages_array);
+	up_read(&current->mm->mmap_sem);
+	if (length) {
+		tp->copied_seq = seq;
+		tcp_rcv_space_adjust(sk);
+
+		/* Clean up data we have read: This will do ACK frames. */
+		tcp_recv_skb(sk, seq, &offset);
+		tcp_cleanup_rbuf(sk, length);
+		ret = 0;
+		if (length == zc->length)
+			zc->recv_skip_hint = 0;
+	} else {
+		if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
+			ret = -EIO;
+	}
+	zc->length = length;
 	return ret;
 }
-EXPORT_SYMBOL(tcp_mmap);
+#endif
 
 static void tcp_update_recv_tstamps(struct sk_buff *skb,
 				    struct scm_timestamping *tss)
@@ -3472,6 +3467,25 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		}
 		return 0;
 	}
+#ifdef CONFIG_MMU
+	case TCP_ZEROCOPY_RECEIVE: {
+		struct tcp_zerocopy_receive zc;
+		int err;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+		if (len != sizeof(zc))
+			return -EINVAL;
+		if (copy_from_user(&zc, optval, len))
+			return -EFAULT;
+		lock_sock(sk);
+		err = tcp_zerocopy_receive(sk, &zc);
+		release_sock(sk);
+		if (!err && copy_to_user(optval, &zc, len))
+			err = -EFAULT;
+		return err;
+	}
+#endif
 	default:
 		return -ENOPROTOOPT;
 	}
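
The reshuffle above is also what resolves the syzbot lockdep report: ->mmap() handlers run with current->mm->mmap_sem already held for write, so the old tcp_mmap() could only take the socket lock in the wrong order. After this change tcp_mmap() touches no socket state at all, and the getsockopt() path acquires the two locks in a consistent order, as this summary of the call nesting in the hunks above shows:

	/* Lock ordering in the new code:
	 *
	 *	do_tcp_getsockopt()
	 *	  lock_sock(sk);			socket lock first,
	 *	  tcp_zerocopy_receive()
	 *	    down_read(&current->mm->mmap_sem);	then mmap_sem
	 *	    ...
	 *	    up_read(&current->mm->mmap_sem);
	 *	  release_sock(sk);
	 */
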
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -578,7 +578,9 @@ const struct proto_ops inet6_stream_ops = {
 	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
 	.sendmsg	   = inet_sendmsg,		/* ok		*/
 	.recvmsg	   = inet_recvmsg,		/* ok		*/
+#ifdef CONFIG_MMU
 	.mmap		   = tcp_mmap,
+#endif
 	.sendpage	   = inet_sendpage,
 	.sendmsg_locked    = tcp_sendmsg_locked,
 	.sendpage_locked   = tcp_sendpage_locked,
--- a/tools/testing/selftests/net/tcp_mmap.c
+++ b/tools/testing/selftests/net/tcp_mmap.c
@@ -76,9 +76,10 @@
 #include <time.h>
 #include <sys/time.h>
 #include <netinet/in.h>
-#include <netinet/tcp.h>
 #include <arpa/inet.h>
 #include <poll.h>
+#include <linux/tcp.h>
+#include <assert.h>
 
 #ifndef MSG_ZEROCOPY
 #define MSG_ZEROCOPY    0x4000000
@@ -134,11 +135,12 @@ void hash_zone(void *zone, unsigned int length)
 void *child_thread(void *arg)
 {
 	unsigned long total_mmap = 0, total = 0;
+	struct tcp_zerocopy_receive zc;
 	unsigned long delta_usec;
 	int flags = MAP_SHARED;
 	struct timeval t0, t1;
 	char *buffer = NULL;
-	void *oaddr = NULL;
+	void *addr = NULL;
 	double throughput;
 	struct rusage ru;
 	int lu, fd;
@@ -153,41 +155,46 @@ void *child_thread(void *arg)
 		perror("malloc");
 		goto error;
 	}
+	if (zflg) {
+		addr = mmap(NULL, chunk_size, PROT_READ, flags, fd, 0);
+		if (addr == (void *)-1)
+			zflg = 0;
+	}
 	while (1) {
 		struct pollfd pfd = { .fd = fd, .events = POLLIN, };
 		int sub;
 
 		poll(&pfd, 1, 10000);
 		if (zflg) {
-			void *naddr;
+			socklen_t zc_len = sizeof(zc);
+			int res;
 
-			naddr = mmap(oaddr, chunk_size, PROT_READ, flags, fd, 0);
-			if (naddr == (void *)-1) {
-				if (errno == EAGAIN) {
-					/* That is if SO_RCVLOWAT is buggy */
-					usleep(1000);
-					continue;
-				}
-				if (errno == EINVAL) {
-					flags = MAP_SHARED;
-					oaddr = NULL;
-					goto fallback;
-				}
-				if (errno != EIO)
-					perror("mmap()");
+			zc.address = (__u64)addr;
+			zc.length = chunk_size;
+			zc.recv_skip_hint = 0;
+			res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
+					 &zc, &zc_len);
+			if (res == -1)
 				break;
+
+			if (zc.length) {
+				assert(zc.length <= chunk_size);
+				total_mmap += zc.length;
+				if (xflg)
+					hash_zone(addr, zc.length);
+				total += zc.length;
 			}
-			total_mmap += chunk_size;
-			if (xflg)
-				hash_zone(naddr, chunk_size);
-			total += chunk_size;
-			if (!keepflag) {
-				flags |= MAP_FIXED;
-				oaddr = naddr;
+			if (zc.recv_skip_hint) {
+				assert(zc.recv_skip_hint <= chunk_size);
+				lu = read(fd, buffer, zc.recv_skip_hint);
+				if (lu > 0) {
+					if (xflg)
+						hash_zone(buffer, lu);
+					total += lu;
+				}
 			}
 			continue;
 		}
-fallback:
 		sub = 0;
 		while (sub < chunk_size) {
 			lu = read(fd, buffer + sub, chunk_size - sub);
@@ -228,6 +235,8 @@ void *child_thread(void *arg)
 error:
 	free(buffer);
 	close(fd);
+	if (zflg)
+		munmap(addr, chunk_size);
 	pthread_exit(0);
 }
@@ -371,7 +380,8 @@ int main(int argc, char *argv[])
 	setup_sockaddr(cfg_family, host, &listenaddr);
 
 	if (mss &&
-	    setsockopt(fdlisten, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
+	    setsockopt(fdlisten, IPPROTO_TCP, TCP_MAXSEG,
+		       &mss, sizeof(mss)) == -1) {
 		perror("setsockopt TCP_MAXSEG");
 		exit(1);
 	}
@@ -402,7 +412,7 @@ int main(int argc, char *argv[])
 	setup_sockaddr(cfg_family, host, &addr);
 
 	if (mss &&
-	    setsockopt(fd, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
+	    setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
 		perror("setsockopt TCP_MAXSEG");
 		exit(1);
 	}
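
As before, the reference program pairs a sending client with a receiving server. Assuming the option letters are unchanged from the original tcp_mmap selftest (-s server, -z zerocopy, -H host), a loopback run is "./tcp_mmap -s -z &" on the receive side and "./tcp_mmap -H ::1 -z" on the send side, with the zerocopy path now exercised through getsockopt(TCP_ZEROCOPY_RECEIVE) instead of repeated mmap() calls.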