Commit f35f8219 authored by Eric Dumazet, committed by David S. Miller

tcp: defer skb freeing after socket lock is released

tcp recvmsg() (or rx zerocopy) spends a fair amount of time
freeing skbs after their payload has been consumed.

A typical ~64KB GRO packet carries roughly 45 MSS-sized frags,
so freeing it has to release ~45 page references, eventually
going to the page allocator for each of them.

Currently, this freeing is performed while the socket lock
is held, meaning that there is a high chance that the
BH handler has to queue incoming packets to the tcp socket backlog.

This can cause additional latencies, because the user
thread has to process the backlog at release_sock() time,
and while it is doing so, additional frames can be added
by the BH handler.

This patch adds logic to defer these frees until after the socket
lock is released, or to perform them directly from the BH handler
when possible.

Being able to free these skbs from the BH handler helps a lot,
because it avoids the usual alloc/free asymmetry that arises
when the BH handler and the user thread do not run on the same
CPU or NUMA node.

One CPU can now be fully utilized for the kernel->user copy,
while another CPU handles BH processing and skb/page
allocs/frees (assuming RFS is not forcing the use of a single CPU).
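
For illustration only, here is a minimal user-space sketch of the
deferral pattern (hypothetical names: struct buf, defer_free(),
defer_free_flush(); the kernel patch itself uses llist_add() and
llist_del_all() on sk->defer_list, as shown in the diff below).
Consumed buffers are pushed onto a lock-free singly linked list,
and freed in one batch once the critical section is over.

/*
 * User-space sketch of the deferred-free pattern, not the kernel code.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct buf {
	struct buf *next;
	char payload[64];
};

static _Atomic(struct buf *) defer_list;	/* analogue of sk->defer_list */

/* analogue of llist_add(): lock-free push onto the deferred list */
static void defer_free(struct buf *b)
{
	struct buf *old = atomic_load(&defer_list);

	do {
		b->next = old;
	} while (!atomic_compare_exchange_weak(&defer_list, &old, b));
}

/* analogue of __sk_defer_free_flush(): detach the whole list, then free */
static void defer_free_flush(void)
{
	struct buf *b = atomic_exchange(&defer_list, NULL);

	while (b) {
		struct buf *next = b->next;

		free(b);
		b = next;
	}
}

int main(void)
{
	/* ~45 frags of a 64KB GRO packet being "consumed" */
	for (int i = 0; i < 45; i++)
		defer_free(calloc(1, sizeof(struct buf)));

	/* critical section over (think release_sock()): flush in one batch */
	defer_free_flush();
	printf("deferred buffers flushed\n");
	return 0;
}

A single atomic exchange detaches the whole list, so neither side
ever has to hold the socket lock just to free memory.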

Tested:
 100Gbit NIC
 Max throughput for one TCP_STREAM flow, over 10 runs

MTU : 1500
Before: 55 Gbit
After:  66 Gbit

MTU : 4096+(headers)
Before: 82 Gbit
After:  95 Gbit

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 3df684c1
@@ -36,6 +36,7 @@
 #include <linux/splice.h>
 #include <linux/in6.h>
 #include <linux/if_packet.h>
+#include <linux/llist.h>
 #include <net/flow.h>
 #include <net/page_pool.h>
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -743,6 +744,7 @@ struct sk_buff {
 		};
 		struct rb_node		rbnode; /* used in netem, ip4 defrag, and tcp stack */
 		struct list_head	list;
+		struct llist_node	ll_node;
 	};

 	union {
...
@@ -63,6 +63,7 @@
 #include <linux/indirect_call_wrapper.h>
 #include <linux/atomic.h>
 #include <linux/refcount.h>
+#include <linux/llist.h>
 #include <net/dst.h>
 #include <net/checksum.h>
 #include <net/tcp_states.h>
@@ -408,6 +409,8 @@ struct sock {
 		struct sk_buff	*head;
 		struct sk_buff	*tail;
 	} sk_backlog;
+	struct llist_head	defer_list;
+
 #define sk_rmem_alloc sk_backlog.rmem_alloc

 	int			sk_forward_alloc;
...
@@ -1368,6 +1368,16 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb)
 }

 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
+
+void __sk_defer_free_flush(struct sock *sk);
+
+static inline void sk_defer_free_flush(struct sock *sk)
+{
+	if (llist_empty(&sk->defer_list))
+		return;
+	__sk_defer_free_flush(sk);
+}
+
 int tcp_filter(struct sock *sk, struct sk_buff *skb);
 void tcp_set_state(struct sock *sk, int state);
 void tcp_done(struct sock *sk);
...
@@ -1580,14 +1580,34 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
 		tcp_send_ack(sk);
 }

+void __sk_defer_free_flush(struct sock *sk)
+{
+	struct llist_node *head;
+	struct sk_buff *skb, *n;
+
+	head = llist_del_all(&sk->defer_list);
+	llist_for_each_entry_safe(skb, n, head, ll_node) {
+		prefetch(n);
+		skb_mark_not_on_list(skb);
+		__kfree_skb(skb);
+	}
+}
+EXPORT_SYMBOL(__sk_defer_free_flush);
+
 static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
 {
+	__skb_unlink(skb, &sk->sk_receive_queue);
 	if (likely(skb->destructor == sock_rfree)) {
 		sock_rfree(skb);
 		skb->destructor = NULL;
 		skb->sk = NULL;
+		if (!skb_queue_empty(&sk->sk_receive_queue) ||
+		    !llist_empty(&sk->defer_list)) {
+			llist_add(&skb->ll_node, &sk->defer_list);
+			return;
+		}
 	}
-	sk_eat_skb(sk, skb);
+	__kfree_skb(skb);
 }

 static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
@@ -2422,6 +2442,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
 			/* Do not sleep, just process backlog. */
 			__sk_flush_backlog(sk);
 		} else {
+			sk_defer_free_flush(sk);
 			sk_wait_data(sk, &timeo, last);
 		}
@@ -2540,6 +2561,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
 				 &cmsg_flags);
 	release_sock(sk);
+	sk_defer_free_flush(sk);

 	if (cmsg_flags && ret >= 0) {
 		if (cmsg_flags & TCP_CMSG_TS)
@@ -3065,7 +3087,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 		sk->sk_frag.page = NULL;
 		sk->sk_frag.offset = 0;
 	}
-
+	sk_defer_free_flush(sk);
 	sk_error_report(sk);
 	return 0;
 }
@@ -4194,6 +4216,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
 							  &zc, &len, err);
 		release_sock(sk);
+		sk_defer_free_flush(sk);
 		if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
 			goto zerocopy_rcv_cmsg;
 		switch (len) {
...
@@ -2102,6 +2102,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	sk_incoming_cpu_update(sk);

+	sk_defer_free_flush(sk);
 	bh_lock_sock_nested(sk);
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
...
@@ -1758,6 +1758,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
 	sk_incoming_cpu_update(sk);

+	sk_defer_free_flush(sk);
 	bh_lock_sock_nested(sk);
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
...