Commit ccce324d authored by David Morley, committed by Paolo Abeni

tcp: make the first N SYN RTO backoffs linear

Currently the SYN RTO schedule follows an exponential backoff
scheme, which can be unnecessarily conservative in cases where
there are link failures. In such cases, it's better to
aggressively try to retransmit packets, so it takes routers
less time to find a repath with a working link.

We chose a default value for this sysctl of 4, to follow
the macOS and iOS backoff scheme of 1,1,1,1,1,2,4,8, ...
macOS and iOS have used this backoff schedule for over
a decade, since before this 2009 IETF presentation
discussed the behavior:
https://www.ietf.org/proceedings/75/slides/tcpm-1.pdf

This commit makes the SYN RTO schedule start with a number of
linear backoffs given by the following sysctl:
* tcp_syn_linear_timeouts

This changes the SYN RTO scheme to be: init_rto_val for
tcp_syn_linear_timeouts timeouts, then exponential backoff
starting at init_rto_val.

For example, if init_rto_val = 1 and tcp_syn_linear_timeouts = 2, our
backoff scheme would be: 1, 1, 1, 2, 4, 8, 16, ...

Signed-off-by: David Morley <morleyd@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Tested-by: David Morley <morleyd@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20230509180558.2541885-1-morleyd.kernel@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parent 8a690c15
...@@ -881,9 +881,10 @@ tcp_fastopen_key - list of comma separated 32-digit hexadecimal INTEGERs ...@@ -881,9 +881,10 @@ tcp_fastopen_key - list of comma separated 32-digit hexadecimal INTEGERs
tcp_syn_retries - INTEGER tcp_syn_retries - INTEGER
Number of times initial SYNs for an active TCP connection attempt Number of times initial SYNs for an active TCP connection attempt
will be retransmitted. Should not be higher than 127. Default value will be retransmitted. Should not be higher than 127. Default value
is 6, which corresponds to 63seconds till the last retransmission is 6, which corresponds to 67seconds (with tcp_syn_linear_timeouts = 4)
with the current initial RTO of 1second. With this the final timeout till the last retransmission with the current initial RTO of 1second.
for an active TCP connection attempt will happen after 127seconds. With this the final timeout for an active TCP connection attempt
will happen after 131seconds.
tcp_timestamps - INTEGER tcp_timestamps - INTEGER
Enable timestamps as defined in RFC1323. Enable timestamps as defined in RFC1323.
...@@ -946,6 +947,16 @@ tcp_pacing_ca_ratio - INTEGER ...@@ -946,6 +947,16 @@ tcp_pacing_ca_ratio - INTEGER
Default: 120 Default: 120
tcp_syn_linear_timeouts - INTEGER
The number of times for an active TCP connection to retransmit SYNs with
a linear backoff timeout before defaulting to an exponential backoff
timeout. This has no effect on SYNACK at the passive TCP side.
With an initial RTO of 1 and tcp_syn_linear_timeouts = 4 we would
expect SYN RTOs to be: 1, 1, 1, 1, 1, 2, 4, ... (4 linear timeouts,
and the first exponential backoff using 2^0 * initial_RTO).
Default: 4
tcp_tso_win_divisor - INTEGER tcp_tso_win_divisor - INTEGER
This allows control over what percentage of the congestion window This allows control over what percentage of the congestion window
can be consumed by a single TSO frame. can be consumed by a single TSO frame.
......
...@@ -194,6 +194,7 @@ struct netns_ipv4 { ...@@ -194,6 +194,7 @@ struct netns_ipv4 {
int sysctl_udp_rmem_min; int sysctl_udp_rmem_min;
u8 sysctl_fib_notify_on_flag_change; u8 sysctl_fib_notify_on_flag_change;
u8 sysctl_tcp_syn_linear_timeouts;
#ifdef CONFIG_NET_L3_MASTER_DEV #ifdef CONFIG_NET_L3_MASTER_DEV
u8 sysctl_udp_l3mdev_accept; u8 sysctl_udp_l3mdev_accept;
......
...@@ -34,6 +34,7 @@ static int ip_ttl_min = 1; ...@@ -34,6 +34,7 @@ static int ip_ttl_min = 1;
static int ip_ttl_max = 255; static int ip_ttl_max = 255;
static int tcp_syn_retries_min = 1; static int tcp_syn_retries_min = 1;
static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int tcp_syn_linear_timeouts_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
static u32 u32_max_div_HZ = UINT_MAX / HZ; static u32 u32_max_div_HZ = UINT_MAX / HZ;
...@@ -1470,6 +1471,15 @@ static struct ctl_table ipv4_net_table[] = { ...@@ -1470,6 +1471,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO, .extra1 = SYSCTL_ZERO,
.extra2 = &tcp_plb_max_cong_thresh, .extra2 = &tcp_plb_max_cong_thresh,
}, },
{
.procname = "tcp_syn_linear_timeouts",
.data = &init_net.ipv4.sysctl_tcp_syn_linear_timeouts,
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &tcp_syn_linear_timeouts_max,
},
{ } { }
}; };
......
...@@ -3275,6 +3275,7 @@ static int __net_init tcp_sk_init(struct net *net) ...@@ -3275,6 +3275,7 @@ static int __net_init tcp_sk_init(struct net *net)
else else
net->ipv4.tcp_congestion_control = &tcp_reno; net->ipv4.tcp_congestion_control = &tcp_reno;
net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
return 0; return 0;
} }
......
...@@ -234,14 +234,19 @@ static int tcp_write_timeout(struct sock *sk) ...@@ -234,14 +234,19 @@ static int tcp_write_timeout(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
bool expired = false, do_reset; bool expired = false, do_reset;
int retry_until; int retry_until, max_retransmits;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits) if (icsk->icsk_retransmits)
__dst_negative_advice(sk); __dst_negative_advice(sk);
retry_until = icsk->icsk_syn_retries ? : retry_until = icsk->icsk_syn_retries ? :
READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
expired = icsk->icsk_retransmits >= retry_until;
max_retransmits = retry_until;
if (sk->sk_state == TCP_SYN_SENT)
max_retransmits += READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts);
expired = icsk->icsk_retransmits >= max_retransmits;
} else { } else {
if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) { if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) {
/* Black hole detection */ /* Black hole detection */
...@@ -577,8 +582,12 @@ void tcp_retransmit_timer(struct sock *sk) ...@@ -577,8 +582,12 @@ void tcp_retransmit_timer(struct sock *sk)
icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
icsk->icsk_backoff = 0; icsk->icsk_backoff = 0;
icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX); icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
} else { } else if (sk->sk_state != TCP_SYN_SENT ||
/* Use normal (exponential) backoff */ icsk->icsk_backoff >
READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) {
/* Use normal (exponential) backoff unless linear timeouts are
* activated.
*/
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
} }
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment