Commit 8fc8911b authored by Paolo Abeni

Merge branch 'tcp-backlog-processing-optims'

Eric Dumazet says:

====================
tcp: backlog processing optims

The first patches mostly prepare the ground for the last one.

The last patch of the series implements a form of ACK reduction,
applied only when a TCP receiver is under high stress,
which happens for high-throughput flows.

This gives us a ~20% throughput increase for a single TCP flow (100 Gbit -> 120 Gbit).
====================

Link: https://lore.kernel.org/r/20230911170531.828100-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parents cd8bae85 133c4c0d
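
The idea behind the last patch, in short: while a user thread is draining the
socket backlog (socket owned by the user), pure ACKs are not sent one by one;
a deferred-ACK bit is set instead, and a single ACK is sent from the release
callback once the whole backlog has been processed. The following is only an
illustrative sketch of that pattern, compilable on its own; every name in it
(toy_sock, toy_ack_snd_check, ...) is invented for the example, and none of it
is the kernel implementation.

/*
 * Toy model of the ACK deferral added by this series.
 * NOT kernel code: the real kernel sets TCP_ACK_DEFERRED in sk->sk_tsq_flags
 * from __tcp_ack_snd_check() and sends the ACK from tcp_release_cb().
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_sock {
	bool owned_by_user;	/* a user thread owns the socket lock */
	bool ack_deferred;	/* stands in for the TCP_ACK_DEFERRED bit */
	unsigned int acks_sent;
};

static void toy_send_ack(struct toy_sock *sk)
{
	sk->acks_sent++;
}

/* Called for every backlogged segment that would normally trigger an ACK. */
static void toy_ack_snd_check(struct toy_sock *sk)
{
	if (sk->owned_by_user) {
		sk->ack_deferred = true;	/* remember an ACK is owed */
		return;
	}
	toy_send_ack(sk);
}

/* Runs when the user thread releases the socket (think tcp_release_cb()). */
static void toy_release_cb(struct toy_sock *sk)
{
	if (sk->ack_deferred) {
		sk->ack_deferred = false;
		toy_send_ack(sk);	/* one ACK covers the whole backlog */
	}
}

int main(void)
{
	struct toy_sock sk = { .owned_by_user = true };
	int i;

	for (i = 0; i < 16; i++)	/* 16 backlogged segments */
		toy_ack_snd_check(&sk);
	toy_release_cb(&sk);

	printf("ACKs sent: %u instead of 16\n", sk.acks_sent);
	return 0;
}

In the actual patches below, the deferral only happens when
sock_owned_by_user_nocheck(sk) is true and the tcp_backlog_ack_defer sysctl is
enabled, and tcp_release_cb() additionally checks inet_csk_ack_scheduled()
before sending the ACK.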
@@ -745,6 +745,13 @@ tcp_comp_sack_nr - INTEGER
Default : 44
+ tcp_backlog_ack_defer - BOOLEAN
+ If set, user thread processing socket backlog tries sending
+ one ACK for the whole queue. This helps to avoid potential
+ long latencies at end of a TCP socket syscall.
+ Default : true
tcp_slow_start_after_idle - BOOLEAN
If set, provide RFC2861 behavior and time out the congestion
window after an idle period. An idle period is defined at
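The knob added above is registered in ipv4_net_table (see the
sysctl_net_ipv4.c hunk further down), so it is per network namespace and shows
up as /proc/sys/net/ipv4/tcp_backlog_ack_defer; "sysctl -w
net.ipv4.tcp_backlog_ack_defer=0" restores the pre-series behaviour. A small
userspace example, assuming a kernel that carries this series, that simply
reads the current value:

/* Userspace only; assumes a kernel with tcp_backlog_ack_defer. */
#include <stdio.h>

#define KNOB "/proc/sys/net/ipv4/tcp_backlog_ack_defer"

int main(void)
{
	FILE *f = fopen(KNOB, "r");
	int val;

	if (!f || fscanf(f, "%d", &val) != 1) {
		perror(KNOB);
		if (f)
			fclose(f);
		return 1;
	}
	fclose(f);
	printf("tcp_backlog_ack_defer = %d\n", val);

	/* Disabling it needs privileges in the owning netns, e.g. as root:
	 *   echo 0 > /proc/sys/net/ipv4/tcp_backlog_ack_defer
	 */
	return 0;
}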
@@ -463,15 +463,17 @@ enum tsq_enum {
TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
* tcp_v{4|6}_mtu_reduced()
*/
+ TCP_ACK_DEFERRED, /* TX pure ack is deferred */
};
enum tsq_flags {
- TSQF_THROTTLED = (1UL << TSQ_THROTTLED),
- TSQF_QUEUED = (1UL << TSQ_QUEUED),
- TCPF_TSQ_DEFERRED = (1UL << TCP_TSQ_DEFERRED),
- TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED),
- TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
- TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED),
+ TSQF_THROTTLED = BIT(TSQ_THROTTLED),
+ TSQF_QUEUED = BIT(TSQ_QUEUED),
+ TCPF_TSQ_DEFERRED = BIT(TCP_TSQ_DEFERRED),
+ TCPF_WRITE_TIMER_DEFERRED = BIT(TCP_WRITE_TIMER_DEFERRED),
+ TCPF_DELACK_TIMER_DEFERRED = BIT(TCP_DELACK_TIMER_DEFERRED),
+ TCPF_MTU_REDUCED_DEFERRED = BIT(TCP_MTU_REDUCED_DEFERRED),
+ TCPF_ACK_DEFERRED = BIT(TCP_ACK_DEFERRED),
};
#define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
@@ -132,6 +132,7 @@ struct netns_ipv4 {
u8 sysctl_tcp_syncookies;
u8 sysctl_tcp_migrate_req;
u8 sysctl_tcp_comp_sack_nr;
+ u8 sysctl_tcp_backlog_ack_defer;
int sysctl_tcp_reordering;
u8 sysctl_tcp_retries1;
u8 sysctl_tcp_retries2;
@@ -1823,12 +1823,11 @@ static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
static inline void sock_release_ownership(struct sock *sk)
{
- if (sock_owned_by_user_nocheck(sk)) {
- sk->sk_lock.owned = 0;
+ DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk));
+ sk->sk_lock.owned = 0;
- /* The sk_lock has mutex_unlock() semantics: */
- mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
- }
+ /* The sk_lock has mutex_unlock() semantics: */
+ mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
}
/* no reclassification while locks are held */
@@ -3001,6 +3001,9 @@ void __sk_flush_backlog(struct sock *sk)
{
spin_lock_bh(&sk->sk_lock.slock);
__release_sock(sk);
+ if (sk->sk_prot->release_cb)
+ sk->sk_prot->release_cb(sk);
spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);
@@ -3519,9 +3522,6 @@ void release_sock(struct sock *sk)
if (sk->sk_backlog.tail)
__release_sock(sk);
- /* Warning : release_cb() might need to release sk ownership,
- * ie call sock_release_ownership(sk) before us.
- */
if (sk->sk_prot->release_cb)
sk->sk_prot->release_cb(sk);
@@ -1366,6 +1366,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
},
+ {
+ .procname = "tcp_backlog_ack_defer",
+ .data = &init_net.ipv4.sysctl_tcp_backlog_ack_defer,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
{
.procname = "tcp_reflect_tos",
.data = &init_net.ipv4.sysctl_tcp_reflect_tos,
@@ -5553,6 +5553,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
tcp_in_quickack_mode(sk) ||
/* Protocol state mandates a one-time immediate ACK */
inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
+ /* If we are running from __release_sock() in user context,
+ * Defer the ack until tcp_release_cb().
+ */
+ if (sock_owned_by_user_nocheck(sk) &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
+ set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
+ return;
+ }
send_now:
tcp_send_ack(sk);
return;
@@ -3263,6 +3263,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
net->ipv4.sysctl_tcp_comp_sack_nr = 44;
+ net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
atomic_set(&net->ipv4.tfo_active_disable_times, 0);
@@ -1077,7 +1077,8 @@ static void tcp_tasklet_func(struct tasklet_struct *t)
#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
TCPF_WRITE_TIMER_DEFERRED | \
TCPF_DELACK_TIMER_DEFERRED | \
- TCPF_MTU_REDUCED_DEFERRED)
+ TCPF_MTU_REDUCED_DEFERRED | \
+ TCPF_ACK_DEFERRED)
/**
* tcp_release_cb - tcp release_sock() callback
* @sk: socket
@@ -1101,16 +1102,6 @@ void tcp_release_cb(struct sock *sk)
tcp_tsq_write(sk);
__sock_put(sk);
}
- /* Here begins the tricky part :
- * We are called from release_sock() with :
- * 1) BH disabled
- * 2) sk_lock.slock spinlock held
- * 3) socket owned by us (sk->sk_lock.owned == 1)
- *
- * But following code is meant to be called from BH handlers,
- * so we should keep BH disabled, but early release socket ownership
- */
- sock_release_ownership(sk);
if (flags & TCPF_WRITE_TIMER_DEFERRED) {
tcp_write_timer_handler(sk);
@@ -1124,6 +1115,8 @@ void tcp_release_cb(struct sock *sk)
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}
+ if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
+ tcp_send_ack(sk);
}
EXPORT_SYMBOL(tcp_release_cb);