Commit 6b58e0a5, authored by Fan Du, committed by David S. Miller

ipv4: Use binary search to choose tcp PMTU probe_size

Currently probe_size is chosen by doubling mss_cache, so
the probing process ends quickly with a sub-optimal
mss size and the link mtu is not taken full
advantage of. As a result, users have to tweak
tcp_base_mss with care.

Use binary search to choose probe_size in a
fine-grained manner, so that an optimal mss is found
and performance is boosted to its maximum.

In addition, introduce a sysctl_tcp_probe_threshold
to control when probing will stop with respect to
the width of the search range.

Test env:
Docker instance with vxlan encapsulation (82599EB)
iperf -c 10.0.0.24  -t 60

before this patch:
1.26 Gbits/sec

After this patch: increase 26%
1.59 Gbits/sec
Signed-off-by: Fan Du <fan.du@intel.com>
Acked-by: John Heffner <johnwheffner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent dcd8fb85
...@@ -87,6 +87,7 @@ struct netns_ipv4 { ...@@ -87,6 +87,7 @@ struct netns_ipv4 {
int sysctl_tcp_fwmark_accept; int sysctl_tcp_fwmark_accept;
int sysctl_tcp_mtu_probing; int sysctl_tcp_mtu_probing;
int sysctl_tcp_base_mss; int sysctl_tcp_base_mss;
int sysctl_tcp_probe_threshold;
struct ping_group_range ping_group_range; struct ping_group_range ping_group_range;
......
...@@ -67,6 +67,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); ...@@ -67,6 +67,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
/* The least MTU to use for probing */ /* The least MTU to use for probing */
#define TCP_BASE_MSS 1024 #define TCP_BASE_MSS 1024
/* Specify interval when tcp mtu probing will stop */
#define TCP_PROBE_THRESHOLD 8
/* After receiving this amount of duplicate ACKs fast retransmit starts. */ /* After receiving this amount of duplicate ACKs fast retransmit starts. */
#define TCP_FASTRETRANS_THRESH 3 #define TCP_FASTRETRANS_THRESH 3
......
...@@ -883,6 +883,13 @@ static struct ctl_table ipv4_net_table[] = { ...@@ -883,6 +883,13 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec, .proc_handler = proc_dointvec,
}, },
{
.procname = "tcp_probe_threshold",
.data = &init_net.ipv4.sysctl_tcp_probe_threshold,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{ } { }
}; };
......
...@@ -2460,6 +2460,7 @@ static int __net_init tcp_sk_init(struct net *net) ...@@ -2460,6 +2460,7 @@ static int __net_init tcp_sk_init(struct net *net)
} }
net->ipv4.sysctl_tcp_ecn = 2; net->ipv4.sysctl_tcp_ecn = 2;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
return 0; return 0;
fail: fail:
......
...@@ -1842,11 +1842,13 @@ static int tcp_mtu_probe(struct sock *sk) ...@@ -1842,11 +1842,13 @@ static int tcp_mtu_probe(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb, *nskb, *next; struct sk_buff *skb, *nskb, *next;
struct net *net = sock_net(sk);
int len; int len;
int probe_size; int probe_size;
int size_needed; int size_needed;
int copy; int copy;
int mss_now; int mss_now;
int interval;
/* Not currently probing/verifying, /* Not currently probing/verifying,
* not in recovery, * not in recovery,
...@@ -1859,11 +1861,17 @@ static int tcp_mtu_probe(struct sock *sk) ...@@ -1859,11 +1861,17 @@ static int tcp_mtu_probe(struct sock *sk)
tp->rx_opt.num_sacks || tp->rx_opt.dsack) tp->rx_opt.num_sacks || tp->rx_opt.dsack)
return -1; return -1;
/* Very simple search strategy: just double the MSS. */ /* Use binary search for probe_size between tcp_mss_base,
* and current mss_clamp. if (search_high - search_low)
* smaller than a threshold, backoff from probing.
*/
mss_now = tcp_current_mss(sk); mss_now = tcp_current_mss(sk);
probe_size = 2 * tp->mss_cache; probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
icsk->icsk_mtup.search_low) >> 1);
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
interval < max(1, net->ipv4.sysctl_tcp_probe_threshold)) {
/* TODO: set timer for probe_converge_event */ /* TODO: set timer for probe_converge_event */
return -1; return -1;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment