Commit 05cbc0db authored by Fan Du, committed by David S. Miller

ipv4: Create probe timer for tcp PMTU as per RFC4821

As per RFC4821 7.3.  Selecting Probe Size, a probe timer should
be armed once probing has converged. Once this timer expires,
probing starts again to take advantage of any path PMTU change. The
recommended probing interval is 10 minutes per RFC1981. The probing
interval can be tuned via sysctl_tcp_probe_interval.

Eric Dumazet suggested implementing a pseudo timer based on the 32-bit
jiffies tcp_time_stamp instead of using a classic timer for such a
rare event.
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 6b58e0a5
...@@ -126,6 +126,8 @@ struct inet_connection_sock { ...@@ -126,6 +126,8 @@ struct inet_connection_sock {
/* Information on the current probe. */ /* Information on the current probe. */
int probe_size; int probe_size;
u32 probe_timestamp;
} icsk_mtup; } icsk_mtup;
u32 icsk_ca_priv[16]; u32 icsk_ca_priv[16];
u32 icsk_user_timeout; u32 icsk_user_timeout;
......
...@@ -88,6 +88,7 @@ struct netns_ipv4 { ...@@ -88,6 +88,7 @@ struct netns_ipv4 {
int sysctl_tcp_mtu_probing; int sysctl_tcp_mtu_probing;
int sysctl_tcp_base_mss; int sysctl_tcp_base_mss;
int sysctl_tcp_probe_threshold; int sysctl_tcp_probe_threshold;
u32 sysctl_tcp_probe_interval;
struct ping_group_range ping_group_range; struct ping_group_range ping_group_range;
......
...@@ -67,6 +67,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); ...@@ -67,6 +67,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
/* The least MTU to use for probing */ /* The least MTU to use for probing */
#define TCP_BASE_MSS 1024 #define TCP_BASE_MSS 1024
/* Default path MTU probing interval, in seconds: 10 minutes as per
 * RFC4821. Used as the initial value of sysctl_tcp_probe_interval,
 * which is scaled by HZ before being compared against a jiffies delta.
 */
#define TCP_PROBE_INTERVAL 600
/* Specify interval when tcp mtu probing will stop */ /* Specify interval when tcp mtu probing will stop */
#define TCP_PROBE_THRESHOLD 8 #define TCP_PROBE_THRESHOLD 8
......
...@@ -890,6 +890,13 @@ static struct ctl_table ipv4_net_table[] = { ...@@ -890,6 +890,13 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec, .proc_handler = proc_dointvec,
}, },
{
.procname = "tcp_probe_interval",
.data = &init_net.ipv4.sysctl_tcp_probe_interval,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{ } { }
}; };
......
...@@ -2461,6 +2461,7 @@ static int __net_init tcp_sk_init(struct net *net) ...@@ -2461,6 +2461,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_ecn = 2; net->ipv4.sysctl_tcp_ecn = 2;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
return 0; return 0;
fail: fail:
......
...@@ -1354,6 +1354,8 @@ void tcp_mtup_init(struct sock *sk) ...@@ -1354,6 +1354,8 @@ void tcp_mtup_init(struct sock *sk)
icsk->icsk_af_ops->net_header_len; icsk->icsk_af_ops->net_header_len;
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
icsk->icsk_mtup.probe_size = 0; icsk->icsk_mtup.probe_size = 0;
if (icsk->icsk_mtup.enabled)
icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
} }
EXPORT_SYMBOL(tcp_mtup_init); EXPORT_SYMBOL(tcp_mtup_init);
...@@ -1828,6 +1830,31 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, ...@@ -1828,6 +1830,31 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
return false; return false;
} }
/* Restart path MTU probing once the configured reprobe interval has passed.
 *
 * Invoked by tcp_mtu_probe() when it decides not to send a further probe.
 * The time since icsk_mtup.probe_timestamp is compared against
 * sysctl_tcp_probe_interval (in seconds). When the interval has elapsed,
 * probe_size is cleared, the search range is rebuilt from the peer's MSS
 * clamp (upper bound) and the current MSS (lower bound), and the probe
 * timestamp is refreshed.
 */
static inline void tcp_mtu_check_reprobe(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	u32 interval;
	u32 delta;

	interval = net->ipv4.sysctl_tcp_probe_interval;
	/* Unsigned subtraction keeps the elapsed time meaningful across a
	 * wrap of the 32-bit tcp_time_stamp.
	 */
	delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;

	/* delta / HZ >= interval is arithmetically the same test as
	 * delta >= interval * HZ, but it cannot overflow: the 32-bit product
	 * wraps for large sysctl values and would turn a very long interval
	 * into a short one.
	 */
	if (unlikely(delta / HZ >= interval)) {
		int mss = tcp_current_mss(sk);

		/* Update current search range */
		icsk->icsk_mtup.probe_size = 0;
		icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
					      sizeof(struct tcphdr) +
					      icsk->icsk_af_ops->net_header_len;
		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);

		/* Update probe time stamp */
		icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
	}
}
/* Create a new MTU probe if we are ready. /* Create a new MTU probe if we are ready.
* MTU probe is regularly attempting to increase the path MTU by * MTU probe is regularly attempting to increase the path MTU by
* deliberately sending larger packets. This discovers routing * deliberately sending larger packets. This discovers routing
...@@ -1870,9 +1897,16 @@ static int tcp_mtu_probe(struct sock *sk) ...@@ -1870,9 +1897,16 @@ static int tcp_mtu_probe(struct sock *sk)
icsk->icsk_mtup.search_low) >> 1); icsk->icsk_mtup.search_low) >> 1);
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low; interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
/* If the reprobe timer expires while we are still actively
* probing, stick with the current probing process and do not
* reset the search range to its original values.
*/
if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) || if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
interval < max(1, net->ipv4.sysctl_tcp_probe_threshold)) { interval < net->ipv4.sysctl_tcp_probe_threshold) {
/* TODO: set timer for probe_converge_event */ /* Check whether enough time has elapsed for
* another round of probing.
*/
tcp_mtu_check_reprobe(sk);
return -1; return -1;
} }
......
...@@ -107,6 +107,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) ...@@ -107,6 +107,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
if (net->ipv4.sysctl_tcp_mtu_probing) { if (net->ipv4.sysctl_tcp_mtu_probing) {
if (!icsk->icsk_mtup.enabled) { if (!icsk->icsk_mtup.enabled) {
icsk->icsk_mtup.enabled = 1; icsk->icsk_mtup.enabled = 1;
icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
} else { } else {
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment