Commit d1afdc51 authored by David S. Miller

Merge branch 'tcp-improve-setsockopt-TCP_USER_TIMEOUT-accuracy'

Jon Maxwell says:

====================
tcp: improve setsockopt() TCP_USER_TIMEOUT accuracy

The patch was becoming bigger based on feedback therefore I have
implemented a series of 3 commits instead in V4.

This series is a continuation based on V3 here and associated feedback:

https://patchwork.kernel.org/patch/10516195/

Suggestions by Neal Cardwell:

1) Fix up units mismatch regarding msec/jiffies.
2) Address possibility of time_remaining being negative.
3) Add a helper routine tcp_clamp_rto_to_user_timeout() to do the rto
calculation.
4) Move start_ts logic into helper routine tcp_retransmit_stamp() to
validate tcp_sk(sk)->retrans_stamp.
5) Some u32 declaration and return refactoring.
6) Return 0 instead of false in tcp_retransmit_stamp(), it's not a bool.

Suggestions by David Laight:

1) Don't cache rto in tcp_clamp_rto_to_user_timeout().

Suggestions by Eric Dumazet:

1) Make u32 declarations consistent.
2) Use patch series for easier review.
3) Convert icsk->icsk_user_timeout to milliseconds to avoid jiffies to
msec dance.
4) Use separate titles for each commit in the series.
5) Fix fuzzy indentation and line wrap issues.
6) Make commit titles descriptive.

Changes:

1) Call tcp_clamp_rto_to_user_timeout(sk) as an argument to
inet_csk_reset_xmit_timer() to save on rto declaration.

Every time the TCP retransmission timer fires, it checks to see if
there is a timeout before scheduling the next retransmit timer. The
retransmit interval between each retransmission increases
exponentially. The issue is that in order for the timeout to occur the
retransmit timer needs to fire again. If the user timeout check happens
after the 9th retransmit, for example, it needs to wait for the 10th
retransmit timer to fire in order to evaluate whether a timeout has
occurred or not. If the interval is large enough then the timeout will
be inaccurate.

For example with a TCP_USER_TIMEOUT of 10 seconds without patch:

1st retransmit:

22:25:18.973488 IP host1.49310 > host2.search-agent: Flags [.]

Last retransmit:

22:25:26.205499 IP host1.49310 > host2.search-agent: Flags [.]

Timeout:

send: Connection timed out
Sun Jul  1 22:25:34 EDT 2018

We can see that the last retransmit took ~7 seconds, which pushed the
total timeout to ~15 seconds instead of the expected 10 seconds. This
gets more inaccurate the larger the TCP_USER_TIMEOUT value, as the
interval increases.

Add tcp_clamp_rto_to_user_timeout() to determine if the user rto has
expired, or whether the rto interval needs to be recalculated. Use the
original interval if user rto is not set.

Test results with the patch show the expected 10 second timeout:

1st retransmit:

01:37:59.022555 IP host1.49310 > host2.search-agent: Flags [.]

Last retransmit:

01:38:06.486558 IP host1.49310 > host2.search-agent: Flags [.]

Timeout:

send: Connection timed out
Mon Jul  2 01:38:09 EDT 2018
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 975cd350 b701a99e
...@@ -2989,7 +2989,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, ...@@ -2989,7 +2989,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
if (val < 0) if (val < 0)
err = -EINVAL; err = -EINVAL;
else else
icsk->icsk_user_timeout = msecs_to_jiffies(val); icsk->icsk_user_timeout = val;
break; break;
case TCP_FASTOPEN: case TCP_FASTOPEN:
...@@ -3445,7 +3445,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, ...@@ -3445,7 +3445,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
break; break;
case TCP_USER_TIMEOUT: case TCP_USER_TIMEOUT:
val = jiffies_to_msecs(icsk->icsk_user_timeout); val = icsk->icsk_user_timeout;
break; break;
case TCP_FASTOPEN: case TCP_FASTOPEN:
......
...@@ -22,6 +22,35 @@ ...@@ -22,6 +22,35 @@
#include <linux/gfp.h> #include <linux/gfp.h>
#include <net/tcp.h> #include <net/tcp.h>
/* Return the timestamp that retransmission timeouts are measured from.
 *
 * The socket's retrans_stamp is used when it is non-zero. Otherwise the
 * timestamp of the skb returned by tcp_rtx_queue_head() is used, and 0 is
 * returned when that queue has no head.
 */
u32 tcp_retransmit_stamp(const struct sock *sk)
{
	u32 stamp = tcp_sk(sk)->retrans_stamp;

	if (unlikely(!stamp)) {
		struct sk_buff *first = tcp_rtx_queue_head(sk);

		/* No recorded stamp: fall back to the queue head, if any. */
		stamp = first ? tcp_skb_timestamp(first) : 0;
	}

	return stamp;
}
/* Compute the interval to arm the retransmit timer with.
 *
 * Returns icsk_rto unchanged when no user timeout is configured or when
 * tcp_retransmit_stamp() yields 0. Returns 1 once the time elapsed since
 * that stamp has reached icsk_user_timeout. Otherwise returns the smaller
 * of icsk_rto and the remaining user timeout converted with
 * msecs_to_jiffies().
 */
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 since = tcp_retransmit_stamp(sk);
	u32 waited;

	if (!icsk->icsk_user_timeout || !since)
		return icsk->icsk_rto;

	waited = tcp_time_stamp(tcp_sk(sk)) - since;
	if (waited >= icsk->icsk_user_timeout)
		return 1; /* user timeout has passed; fire ASAP */

	/* Never sleep past the point where the user timeout expires. */
	return min_t(u32, icsk->icsk_rto,
		     msecs_to_jiffies(icsk->icsk_user_timeout - waited));
}
/** /**
* tcp_write_err() - close socket and save error info * tcp_write_err() - close socket and save error info
* @sk: The socket the error has appeared on. * @sk: The socket the error has appeared on.
...@@ -166,14 +195,9 @@ static bool retransmits_timed_out(struct sock *sk, ...@@ -166,14 +195,9 @@ static bool retransmits_timed_out(struct sock *sk,
if (!inet_csk(sk)->icsk_retransmits) if (!inet_csk(sk)->icsk_retransmits)
return false; return false;
start_ts = tcp_sk(sk)->retrans_stamp; start_ts = tcp_retransmit_stamp(sk);
if (unlikely(!start_ts)) { if (!start_ts)
struct sk_buff *head = tcp_rtx_queue_head(sk);
if (!head)
return false; return false;
start_ts = tcp_skb_timestamp(head);
}
if (likely(timeout == 0)) { if (likely(timeout == 0)) {
linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
...@@ -183,8 +207,9 @@ static bool retransmits_timed_out(struct sock *sk, ...@@ -183,8 +207,9 @@ static bool retransmits_timed_out(struct sock *sk,
else else
timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
(boundary - linear_backoff_thresh) * TCP_RTO_MAX; (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
timeout = jiffies_to_msecs(timeout);
} }
return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= jiffies_to_msecs(timeout); return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout;
} }
/* A write timeout has occurred. Process the after effects. */ /* A write timeout has occurred. Process the after effects. */
...@@ -337,8 +362,7 @@ static void tcp_probe_timer(struct sock *sk) ...@@ -337,8 +362,7 @@ static void tcp_probe_timer(struct sock *sk)
if (!start_ts) if (!start_ts)
skb->skb_mstamp = tp->tcp_mstamp; skb->skb_mstamp = tp->tcp_mstamp;
else if (icsk->icsk_user_timeout && else if (icsk->icsk_user_timeout &&
(s32)(tcp_time_stamp(tp) - start_ts) > (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
jiffies_to_msecs(icsk->icsk_user_timeout))
goto abort; goto abort;
max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
...@@ -535,7 +559,8 @@ void tcp_retransmit_timer(struct sock *sk) ...@@ -535,7 +559,8 @@ void tcp_retransmit_timer(struct sock *sk)
/* Use normal (exponential) backoff */ /* Use normal (exponential) backoff */
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
} }
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX);
if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
__sk_dst_reset(sk); __sk_dst_reset(sk);
...@@ -672,7 +697,7 @@ static void tcp_keepalive_timer (struct timer_list *t) ...@@ -672,7 +697,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
* to determine when to timeout instead. * to determine when to timeout instead.
*/ */
if ((icsk->icsk_user_timeout != 0 && if ((icsk->icsk_user_timeout != 0 &&
elapsed >= icsk->icsk_user_timeout && elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) &&
icsk->icsk_probes_out > 0) || icsk->icsk_probes_out > 0) ||
(icsk->icsk_user_timeout == 0 && (icsk->icsk_user_timeout == 0 &&
icsk->icsk_probes_out >= keepalive_probes(tp))) { icsk->icsk_probes_out >= keepalive_probes(tp))) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment